├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── custom-issue.md │ └── feature_request.md ├── dependabot.yml └── workflows │ ├── build_package.yml │ ├── codeql.yml │ ├── lint.yml │ ├── publish_release.yml │ └── unit_test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── Makefile ├── README.md ├── TOS.pdf ├── examples ├── extract │ ├── automotive_sector_analysis.ipynb │ ├── data │ │ ├── automotive_sector_analysis │ │ │ ├── modeling_assumptions.txt │ │ │ └── workflow_img.png │ │ ├── insider_transactions │ │ │ ├── .gitignore │ │ │ ├── cik_mapping.json │ │ │ └── workflow-diag.png │ │ ├── lm317_structured_extraction │ │ │ └── lm317_extraction.png │ │ ├── resumes │ │ │ ├── ai_researcher.pdf │ │ │ ├── ml_engineer.pdf │ │ │ └── software_architect.pdf │ │ └── sec_filings │ │ │ ├── nvda_10k.pdf │ │ │ ├── nvda_10k_page_40.png │ │ │ ├── nvda_10k_page_41.png │ │ │ └── web_ui.png │ ├── extract_data_with_citations.ipynb │ ├── insider_buy_sell.ipynb │ ├── lm317_structured_extraction.ipynb │ ├── resume_screening.ipynb │ ├── sec_10k_filing.ipynb │ └── solar_panel_e2e_comparison.ipynb ├── parse │ ├── advanced_rag │ │ ├── dynamic_section_retrieval.ipynb │ │ └── dynamic_section_retrieval_img.png │ ├── agents │ │ └── demo_simple_openai_agent.ipynb │ ├── caltrain │ │ ├── caltrain_schedule_weekend.pdf │ │ └── caltrain_text_mode.ipynb │ ├── data │ │ ├── BP_Excel.xlsx │ │ └── nvidia_quarterly_revenue_trend_by_market.xlsx │ ├── demo_advanced.ipynb │ ├── demo_api.ipynb │ ├── demo_basic.ipynb │ ├── demo_excel.ipynb │ ├── demo_get_charts.ipynb │ ├── demo_insurance.ipynb │ ├── demo_json.ipynb │ ├── demo_json_tour.ipynb │ ├── demo_languages.ipynb │ ├── demo_mongodb.ipynb │ ├── demo_starter_multimodal.ipynb │ ├── demo_starter_parse_selected_pages.ipynb │ ├── excel │ │ ├── dcf_rag.ipynb │ │ ├── o1_excel_rag.ipynb │ │ └── references │ │ │ ├── query1.png │ │ │ ├── query2.png │ │ │ ├── query3.png │ │ │ ├── query4.png │ │ │ ├── query5.png │ │ │ └── recursive_retrieval.png │ ├── 
json_tour_screenshots │ │ ├── 32778fb0-9e83-4b00-aebe-0d7f59ff0b5f-img_p0_1.png │ │ ├── 32778fb0-9e83-4b00-aebe-0d7f59ff0b5f-page_1.jpg │ │ ├── img_p0_1.png │ │ ├── links_page.png │ │ ├── page_1.png │ │ └── page_35.png │ ├── knowledge_graphs │ │ ├── kg_agent.ipynb │ │ └── sf2023_budget_kg_screenshot.png │ ├── multimodal │ │ ├── XC9500_CPLD_Family_p3.png │ │ ├── claude_parse.ipynb │ │ ├── gemini2_flash.ipynb │ │ ├── gpt4o_mini.ipynb │ │ ├── insurance_rag.ipynb │ │ ├── legal_rag.ipynb │ │ ├── llama2-p33.png │ │ ├── llama3.1-p5.png │ │ ├── multimodal_contextual_retrieval_rag.ipynb │ │ ├── multimodal_contextual_retrieval_rag_img.png │ │ ├── multimodal_rag_slide_deck.ipynb │ │ ├── multimodal_rag_slide_deck_img.png │ │ ├── multimodal_report_generation.ipynb │ │ ├── multimodal_report_generation_agent.ipynb │ │ ├── multimodal_report_generation_agent_img.png │ │ └── product_manual_rag.ipynb │ ├── other_files │ │ ├── demo_ppt_basic.ipynb │ │ └── demo_ppt_financial.ipynb │ ├── parsing_instructions │ │ ├── expense_report_document.pdf │ │ ├── expense_report_document.png │ │ ├── mcdonalds_receipt.png │ │ ├── parsing_instructions.ipynb │ │ ├── purchase_order_document.pdf │ │ └── purchase_order_document.png │ ├── parsing_modes │ │ ├── demo_auto_mode.ipynb │ │ ├── demo_layout_agent_mode_visual_citations.ipynb │ │ ├── diagram.jpg │ │ ├── layout_agent_citation_engine.png │ │ ├── layout_agent_moe.png │ │ ├── layout_agent_parse_explainer.png │ │ ├── mermaid_render.png │ │ ├── page_1.png │ │ ├── page_11.png │ │ ├── page_14.png │ │ └── page_3.png │ ├── report_generation │ │ └── rfp_response │ │ │ ├── generate_rfp.ipynb │ │ │ └── generate_rfp_img.png │ └── test_tesla_impact_report │ │ ├── 2019-tesla-impact-report-short.pdf │ │ └── test_gpt4o.ipynb └── report │ └── basic_report.ipynb ├── extract.md ├── llama_cloud_services ├── __init__.py ├── constants.py ├── extract │ ├── __init__.py │ ├── extract.py │ └── utils.py ├── parse │ ├── __init__.py │ ├── base.py │ ├── cli │ │ ├── __init__.py │ 
│ └── main.py │ ├── types.py │ └── utils.py └── report │ ├── __init__.py │ ├── base.py │ └── report.py ├── llama_parse ├── README.md ├── llama_parse │ ├── __init__.py │ ├── base.py │ ├── cli │ │ ├── __init__.py │ │ └── main.py │ └── utils.py ├── poetry.lock └── pyproject.toml ├── parse.md ├── poetry.lock ├── pyproject.toml ├── report.md └── tests ├── __init__.py ├── extract ├── __init__.py ├── data │ ├── receipt │ │ ├── noisebridge_receipt.pdf │ │ ├── noisebridge_receipt.test.json │ │ └── schema.json │ ├── resume │ │ ├── schema.json │ │ ├── software_architect_resume.html │ │ └── software_architect_resume.test.json │ └── slide │ │ ├── saas_slide.pdf │ │ ├── saas_slide.test.json │ │ └── schema.json ├── test_benchmark.py ├── test_extract_api.py ├── test_extract_e2e.py └── util.py ├── parse ├── __init__.py ├── test_llama_parse.py └── test_llama_parse_result.py ├── report ├── __init__.py └── test_llama_report.py └── test_files ├── attention_is_all_you_need.pdf ├── attention_is_all_you_need_chart.pdf ├── images └── 67b428c6-9edb-4550-83d9-5e35165846ca-page_1.jpg ├── paper.md ├── resume ├── receipt │ ├── noisebridge_receipt.pdf │ ├── noisebridge_receipt.test.json │ └── schema.json ├── schema.json ├── software_architect_resume.html └── software_architect_resume.test.json └── slide ├── saas_slide.pdf ├── saas_slide.test.json └── schema.json /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | Write a concise description of what the bug is. 12 | 13 | **Files** 14 | If possible, please provide the PDF file causing the issue. 15 | 16 | **Job ID** 17 | If you have it, please provide the ID of the job you ran. 18 | You can find it here: https://cloud.llamaindex.ai/parse in the "History" tab. 
19 | 20 | **Client:** 21 | Please remove untested options: 22 | - Python Library 23 | - API 24 | - Frontend (cloud.llamaindex.ai) 25 | - Typescript Library 26 | - Notebook 27 | 28 | **Additional context** 29 | Add any additional context about the problem here. 30 | What options did you use? Premium mode, multimodal, fast mode, parsing instructions, etc. 31 | Screenshots, code snippets, etc. 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/custom-issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Custom issue 3 | about: Not a bug nor a feature request 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Please see the documentation for all configuration options: 2 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 3 | # and 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "github-actions" 9 | directory: "/" 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /.github/workflows/build_package.yml: -------------------------------------------------------------------------------- 1 | name: Build Package 2 | 3 | # Build 
package on its own without additional pip install 4 | 5 | on: 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | 11 | env: 12 | POETRY_VERSION: "1.6.1" 13 | 14 | jobs: 15 | build: 16 | runs-on: ${{ matrix.os }} 17 | strategy: 18 | # You can use PyPy versions in python-version. 19 | # For example, pypy-2.7 and pypy-3.8 20 | matrix: 21 | os: [ubuntu-latest, windows-latest] 22 | python-version: ["3.9"] 23 | steps: 24 | - uses: actions/checkout@v4 25 | - name: Set up python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install Poetry 30 | uses: snok/install-poetry@v1 31 | with: 32 | version: ${{ env.POETRY_VERSION }} 33 | - name: Install deps 34 | shell: bash 35 | run: poetry install 36 | - name: Ensure lock works 37 | shell: bash 38 | run: poetry lock 39 | - name: Build 40 | shell: bash 41 | run: poetry build 42 | - name: Test installing built package 43 | shell: bash 44 | run: python -m pip install . 45 | - name: Test import 46 | shell: bash 47 | working-directory: ${{ vars.RUNNER_TEMP }} 48 | run: python -c "import llama_cloud_services" 49 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | pull_request: 7 | # The branches below must be a subset of the branches above 8 | branches: ["main"] 9 | schedule: 10 | - cron: "30 16 * * 4" 11 | 12 | jobs: 13 | analyze: 14 | name: Analyze 15 | # Runner size impacts CodeQL analysis time. To learn more, please see: 16 | # - https://gh.io/recommended-hardware-resources-for-running-codeql 17 | # - https://gh.io/supported-runners-and-hardware-resources 18 | # - https://gh.io/using-larger-runners 19 | # Consider using larger runners for possible analysis time improvements. 
20 | runs-on: "ubuntu-latest" 21 | timeout-minutes: 360 22 | permissions: 23 | actions: read 24 | contents: read 25 | security-events: write 26 | 27 | steps: 28 | - name: Checkout repository 29 | uses: actions/checkout@v4 30 | 31 | # Initializes the CodeQL tools for scanning. 32 | - name: Initialize CodeQL 33 | uses: github/codeql-action/init@v3 34 | with: 35 | languages: python 36 | dependency-caching: true 37 | 38 | - name: Perform CodeQL Analysis 39 | uses: github/codeql-action/analyze@v3 40 | with: 41 | category: "/language:python" 42 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Linting 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | env: 10 | POETRY_VERSION: "1.6.1" 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | # You can use PyPy versions in python-version. 17 | # For example, pypy-2.7 and pypy-3.8 18 | matrix: 19 | python-version: ["3.9"] 20 | steps: 21 | - uses: actions/checkout@v4 22 | with: 23 | fetch-depth: ${{ github.event_name == 'pull_request' && 2 || 0 }} 24 | - name: Set up python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install Poetry 29 | uses: snok/install-poetry@v1 30 | with: 31 | version: ${{ env.POETRY_VERSION }} 32 | - name: Install pre-commit 33 | shell: bash 34 | run: poetry run pip install pre-commit 35 | - name: Run linter 36 | shell: bash 37 | run: poetry run make lint 38 | -------------------------------------------------------------------------------- /.github/workflows/publish_release.yml: -------------------------------------------------------------------------------- 1 | name: Publish llama-parse to PyPI / GitHub 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v*" 7 | 8 | workflow_dispatch: 9 | 10 | env: 11 | POETRY_VERSION: "1.6.1" 12 | 
PYTHON_VERSION: "3.9" 13 | 14 | jobs: 15 | build-n-publish: 16 | name: Build and publish to PyPI 17 | if: github.repository == 'run-llama/llama_cloud_services' 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v4 22 | - name: Set up python ${{ env.PYTHON_VERSION }} 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: ${{ env.PYTHON_VERSION }} 26 | 27 | - name: Install Poetry 28 | uses: snok/install-poetry@v1 29 | with: 30 | version: ${{ env.POETRY_VERSION }} 31 | 32 | - name: Install deps 33 | shell: bash 34 | run: pip install -e . 35 | 36 | - name: Build and publish llama-cloud-services 37 | uses: JRubics/poetry-publish@v2.1 38 | with: 39 | pypi_token: ${{ secrets.LLAMA_PARSE_PYPI_TOKEN }} 40 | poetry_install_options: "--without dev" 41 | 42 | - name: Wait for PyPI to update 43 | run: | 44 | sleep 60 45 | 46 | - name: Update llama-parse lock file 47 | run: | 48 | cd llama_parse && poetry lock 49 | 50 | - name: Build and publish llama-parse 51 | uses: JRubics/poetry-publish@v2.1 52 | with: 53 | package_directory: "./llama_parse" 54 | pypi_token: ${{ secrets.LLAMA_PARSE_PYPI_TOKEN }} 55 | poetry_install_options: "--without dev" 56 | 57 | - name: Create GitHub Release 58 | id: create_release 59 | uses: actions/create-release@v1 60 | env: 61 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token 62 | with: 63 | tag_name: ${{ github.ref }} 64 | release_name: ${{ github.ref }} 65 | draft: false 66 | prerelease: false 67 | 68 | - name: Get Asset name 69 | run: | 70 | export PKG=$(ls dist/ | grep tar) 71 | set -- $PKG 72 | echo "name=$1" >> $GITHUB_ENV 73 | 74 | - name: Upload Release Asset (sdist) to GitHub 75 | id: upload-release-asset 76 | uses: actions/upload-release-asset@v1 77 | env: 78 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 79 | with: 80 | upload_url: ${{ steps.create_release.outputs.upload_url }} 81 | asset_path: dist/${{ env.name }} 82 | asset_name: ${{ 
env.name }} 83 | asset_content_type: application/zip 84 | -------------------------------------------------------------------------------- /.github/workflows/unit_test.yml: -------------------------------------------------------------------------------- 1 | name: Unit Testing 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | env: 10 | POETRY_VERSION: "1.6.1" 11 | LLAMA_CLOUD_API_KEY: ${{ secrets.LLAMA_CLOUD_API_KEY }} 12 | 13 | jobs: 14 | test: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | # You can use PyPy versions in python-version. 18 | # For example, pypy-2.7 and pypy-3.8 19 | matrix: 20 | python-version: ["3.9", "3.10", "3.11", "3.12"] 21 | steps: 22 | - uses: actions/checkout@v4 23 | with: 24 | fetch-depth: 0 25 | - name: Set up python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install Poetry 30 | uses: snok/install-poetry@v1 31 | with: 32 | version: ${{ env.POETRY_VERSION }} 33 | - name: Install deps 34 | shell: bash 35 | run: poetry install --with dev 36 | - name: Run testing 37 | env: 38 | CI: true 39 | shell: bash 40 | run: poetry run pytest tests 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .git 2 | __pycache__/ 3 | *.pyc 4 | .DS_Store 5 | .idea 6 | .env* 7 | .ipynb_checkpoints* 8 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | default_language_version: 3 | python: python3 4 | 5 | repos: 6 | - repo: https://github.com/pre-commit/pre-commit-hooks 7 | rev: v4.5.0 8 | hooks: 9 | - id: check-byte-order-marker 10 | - id: check-merge-conflict 11 | - id: check-symlinks 12 | - id: check-toml 13 | - id: check-yaml 14 | - id: detect-private-key 15 | - id: 
end-of-file-fixer 16 | - id: mixed-line-ending 17 | - id: trailing-whitespace 18 | - repo: https://github.com/charliermarsh/ruff-pre-commit 19 | rev: v0.1.5 20 | 21 | hooks: 22 | - id: ruff 23 | args: [--fix, --exit-non-zero-on-fix] 24 | exclude: ".*poetry.lock" 25 | - repo: https://github.com/psf/black-pre-commit-mirror 26 | rev: 23.10.1 27 | hooks: 28 | - id: black-jupyter 29 | name: black-src 30 | alias: black 31 | exclude: ".*poetry.lock" 32 | - repo: https://github.com/pre-commit/mirrors-mypy 33 | rev: v1.0.1 34 | hooks: 35 | - id: mypy 36 | exclude: ^tests/ 37 | additional_dependencies: 38 | [ 39 | "types-requests", 40 | "types-Deprecated", 41 | "types-redis", 42 | "types-setuptools", 43 | "types-PyYAML", 44 | "types-protobuf==4.24.0.4", 45 | ] 46 | args: 47 | [ 48 | --disallow-untyped-defs, 49 | --ignore-missing-imports, 50 | --python-version=3.10, 51 | ] 52 | - repo: https://github.com/adamchainz/blacken-docs 53 | rev: 1.16.0 54 | hooks: 55 | - id: blacken-docs 56 | name: black-docs-text 57 | alias: black 58 | types_or: [rst, markdown, tex] 59 | additional_dependencies: [black==23.10.1] 60 | # Using PEP 8's line length in docs prevents excess left/right scrolling 61 | args: [--line-length=79] 62 | - repo: https://github.com/pre-commit/mirrors-prettier 63 | rev: v3.0.3 64 | hooks: 65 | - id: prettier 66 | exclude: poetry.lock 67 | - repo: https://github.com/codespell-project/codespell 68 | rev: v2.2.6 69 | hooks: 70 | - id: codespell 71 | additional_dependencies: [tomli] 72 | exclude: ^(poetry.lock|examples) 73 | args: 74 | [ 75 | "--ignore-words-list", 76 | "astroid,gallary,momento,narl,ot,rouge,nin,gere,te,inh,vor", 77 | ] 78 | - repo: https://github.com/srstevenson/nb-clean 79 | rev: 3.1.0 80 | hooks: 81 | - id: nb-clean 82 | args: [--preserve-cell-outputs, --remove-empty-cells] 83 | - repo: https://github.com/pappasam/toml-sort 84 | rev: v0.23.1 85 | hooks: 86 | - id: toml-sort-fix 87 | exclude: ".*poetry.lock" 88 | 89 | exclude: .github/ISSUE_TEMPLATE 
90 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 LlamaIndex 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | GIT_ROOT ?= $(shell git rev-parse --show-toplevel) 2 | 3 | help: ## Show all Makefile targets. 4 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' 5 | 6 | format: ## Run code autoformatters (black). 
7 | pre-commit install 8 | git ls-files | xargs pre-commit run black --files 9 | 10 | lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy 11 | pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files 12 | 13 | test: ## Run tests via pytest 14 | pytest tests 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cloud-services)](https://pypi.org/project/llama-cloud-services/) 2 | [![GitHub contributors](https://img.shields.io/github/contributors/run-llama/llama_cloud_services)](https://github.com/run-llama/llama_cloud_services/graphs/contributors) 3 | [![Discord](https://img.shields.io/discord/1059199217496772688)](https://discord.gg/dGcwcsnxhU) 4 | 5 | # Llama Cloud Services 6 | 7 | This repository contains the code for hand-written SDKs and clients for interacting with LlamaCloud. 8 | 9 | This includes: 10 | 11 | - [LlamaParse](./parse.md) - A GenAI-native document parser that can parse complex document data for any downstream LLM use case (Agents, RAG, data processing, etc.). 12 | - [LlamaReport (beta/invite-only)](./report.md) - A prebuilt agentic report builder that can be used to build reports from a variety of data sources. 13 | - [LlamaExtract](./extract.md) - A prebuilt agentic data extractor that can be used to transform data into a structured JSON representation. 14 | 15 | ## Getting Started 16 | 17 | Install the package: 18 | 19 | ```bash 20 | pip install llama-cloud-services 21 | ``` 22 | 23 | Then, get your API key from [LlamaCloud](https://cloud.llamaindex.ai/). 
24 | 25 | Then, you can use the services in your code: 26 | 27 | ```python 28 | from llama_cloud_services import LlamaParse, LlamaReport, LlamaExtract 29 | 30 | parser = LlamaParse(api_key="YOUR_API_KEY") 31 | report = LlamaReport(api_key="YOUR_API_KEY") 32 | extract = LlamaExtract(api_key="YOUR_API_KEY") 33 | ``` 34 | 35 | See the quickstart guides for each service for more information: 36 | 37 | - [LlamaParse](./parse.md) 38 | - [LlamaReport (beta/invite-only)](./report.md) 39 | - [LlamaExtract](./extract.md) 40 | 41 | ## Switch to EU SaaS 🇪🇺 42 | 43 | If you are interested in using LlamaCloud services in the EU, you can adjust your base URL to `https://api.cloud.eu.llamaindex.ai`. 44 | 45 | You can also create your API key in the EU region [here](https://cloud.eu.llamaindex.ai). 46 | 47 | ```python 48 | from llama_cloud_services import ( 49 | LlamaParse, 50 | LlamaReport, 51 | LlamaExtract, 52 | EU_BASE_URL, 53 | ) 54 | 55 | parser = LlamaParse(api_key="YOUR_API_KEY", base_url=EU_BASE_URL) 56 | report = LlamaReport(api_key="YOUR_API_KEY", base_url=EU_BASE_URL) 57 | extract = LlamaExtract(api_key="YOUR_API_KEY", base_url=EU_BASE_URL) 58 | ``` 59 | 60 | ## Documentation 61 | 62 | You can see complete SDK and API documentation for each service on [our official docs](https://docs.cloud.llamaindex.ai/). 63 | 64 | ## Terms of Service 65 | 66 | See the [Terms of Service Here](./TOS.pdf). 67 | 68 | ## Get in Touch (LlamaCloud) 69 | 70 | You can get in touch with us by following our [contact link](https://www.llamaindex.ai/contact). 
71 | -------------------------------------------------------------------------------- /TOS.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/TOS.pdf -------------------------------------------------------------------------------- /examples/extract/data/automotive_sector_analysis/modeling_assumptions.txt: -------------------------------------------------------------------------------- 1 | # Financial Modeling Assumptions 2 | Discount Rate: 8% 3 | Terminal Growth Rate: 2% 4 | Tax Rate: 25% 5 | Revenue Growth (Years 1-5): 10% per annum 6 | Revenue Growth (Years 6-10): 5% per annum 7 | Capital Expenditures as % of Revenue: 7% 8 | Working Capital Assumption: 3% of Revenue 9 | Depreciation Rate: 10% per annum 10 | Cost of Capital Assumption: 8% 11 | -------------------------------------------------------------------------------- /examples/extract/data/automotive_sector_analysis/workflow_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/extract/data/automotive_sector_analysis/workflow_img.png -------------------------------------------------------------------------------- /examples/extract/data/insider_transactions/.gitignore: -------------------------------------------------------------------------------- 1 | sec_form_4_dump.json 2 | -------------------------------------------------------------------------------- /examples/extract/data/insider_transactions/workflow-diag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/extract/data/insider_transactions/workflow-diag.png 
-------------------------------------------------------------------------------- /examples/extract/data/lm317_structured_extraction/lm317_extraction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/extract/data/lm317_structured_extraction/lm317_extraction.png -------------------------------------------------------------------------------- /examples/extract/data/resumes/ai_researcher.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/extract/data/resumes/ai_researcher.pdf -------------------------------------------------------------------------------- /examples/extract/data/resumes/ml_engineer.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/extract/data/resumes/ml_engineer.pdf -------------------------------------------------------------------------------- /examples/extract/data/resumes/software_architect.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/extract/data/resumes/software_architect.pdf -------------------------------------------------------------------------------- /examples/extract/data/sec_filings/nvda_10k.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/extract/data/sec_filings/nvda_10k.pdf -------------------------------------------------------------------------------- 
/examples/extract/data/sec_filings/nvda_10k_page_40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/extract/data/sec_filings/nvda_10k_page_40.png -------------------------------------------------------------------------------- /examples/extract/data/sec_filings/nvda_10k_page_41.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/extract/data/sec_filings/nvda_10k_page_41.png -------------------------------------------------------------------------------- /examples/extract/data/sec_filings/web_ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/extract/data/sec_filings/web_ui.png -------------------------------------------------------------------------------- /examples/parse/advanced_rag/dynamic_section_retrieval_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/advanced_rag/dynamic_section_retrieval_img.png -------------------------------------------------------------------------------- /examples/parse/agents/demo_simple_openai_agent.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# LlamaParse Agent\n", 8 | "\n", 9 | "This demo walks through using an OpenAI Agent with [LlamaParse](https://cloud.llamaindex.ai)." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Setup" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "!pip install llama-cloud-services llama-index llama-index-postprocessor-sbert-rerank" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import os\n", 35 | "\n", 36 | "os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"llx-...\"\n", 37 | "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "from llama_index.core import Settings\n", 47 | "from llama_index.embeddings.openai import OpenAIEmbedding\n", 48 | "from llama_index.llms.openai import OpenAI\n", 49 | "\n", 50 | "Settings.embed_model = OpenAIEmbedding(model=\"text-embedding-3-small\")\n", 51 | "Settings.llm = OpenAI(model=\"gpt-3.5-turbo\", temperature=0.2)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## Parsing \n", 59 | "\n", 60 | "For parsing, lets use a [recent paper](https://huggingface.co/papers/2403.09611) on Multi-Modal pretraining" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "!wget https://arxiv.org/pdf/2403.09611.pdf -O paper.pdf" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "Below, we can tell the parser to skip content we don't want. In this case, the references section will just add noise to a RAG system." 
77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "from llama_cloud_services import LlamaParse\n", 86 | "\n", 87 | "parser = LlamaParse(\n", 88 | " result_type=\"markdown\",\n", 89 | ")" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "Started parsing the file under job_id 81251f39-01be-434e-99e8-1c1b83b82098\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "documents = await parser.aload_data(\"paper.pdf\")" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "Embeddings have been explicitly disabled. Using MockEmbedding.\n" 119 | ] 120 | }, 121 | { 122 | "name": "stderr", 123 | "output_type": "stream", 124 | "text": [ 125 | "41it [00:00, 26765.21it/s]\n", 126 | "100%|██████████| 41/41 [00:13<00:00, 2.98it/s]\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "import nest_asyncio\n", 132 | "\n", 133 | "nest_asyncio.apply()\n", 134 | "\n", 135 | "from llama_index.core.node_parser import (\n", 136 | " MarkdownElementNodeParser,\n", 137 | " SentenceSplitter,\n", 138 | ")\n", 139 | "\n", 140 | "# explicitly extract tables with the MarkdownElementNodeParser\n", 141 | "node_parser = MarkdownElementNodeParser(num_workers=8)\n", 142 | "nodes = node_parser.get_nodes_from_documents(documents)\n", 143 | "nodes, objects = node_parser.get_nodes_and_objects(nodes)\n", 144 | "\n", 145 | "# Chain splitters to ensure chunk size requirements are met\n", 146 | "nodes = SentenceSplitter(chunk_size=512, chunk_overlap=20).get_nodes_from_documents(\n", 147 | " nodes\n", 148 | ")" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "## Chat 
over the paper, lets find out what it is about!" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "from llama_index.core import VectorStoreIndex, SummaryIndex\n", 165 | "\n", 166 | "vector_index = VectorStoreIndex(nodes=nodes)\n", 167 | "summary_index = SummaryIndex(nodes=nodes)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "from llama_index.agent.openai import OpenAIAgent\n", 177 | "from llama_index.core.tools import QueryEngineTool, ToolMetadata\n", 178 | "from llama_index.postprocessor.colbert_rerank import ColbertRerank\n", 179 | "\n", 180 | "tools = [\n", 181 | " QueryEngineTool(\n", 182 | " vector_index.as_query_engine(\n", 183 | " similarity_top_k=8, node_postprocessors=[ColbertRerank(top_n=3)]\n", 184 | " ),\n", 185 | " metadata=ToolMetadata(\n", 186 | " name=\"search\",\n", 187 | " description=\"Search the document, pass the entire user message in the query\",\n", 188 | " ),\n", 189 | " ),\n", 190 | " QueryEngineTool(\n", 191 | " summary_index.as_query_engine(),\n", 192 | " metadata=ToolMetadata(\n", 193 | " name=\"summarize\",\n", 194 | " description=\"Summarize the document using the user message\",\n", 195 | " ),\n", 196 | " ),\n", 197 | "]\n", 198 | "\n", 199 | "agent = OpenAIAgent.from_tools(tools=tools, verbose=True)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "name": "stdout", 209 | "output_type": "stream", 210 | "text": [ 211 | "Added user message to memory: What is the summary of the paper?\n", 212 | "=== Calling Function ===\n", 213 | "Calling function: summarize with args: {\"input\":\"summary\"}\n", 214 | "Got output: The research focuses on developing Multimodal Large Language Models (MLLMs) by incorporating image-caption, interleaved image-text, 
and text-only data for pre-training. It highlights the importance of factors like the image encoder, resolution, and token count, while downplaying the design of the vision-language connector. With models scaling up to 30B parameters, the MM1 family demonstrates impressive performance in pre-training metrics and competitive outcomes on diverse multimodal benchmarks. It demonstrates abilities such as in-context learning and multi-image reasoning, aiming to provide valuable insights for creating MLLMs that benefit the research community.\n", 215 | "========================\n", 216 | "\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "# note -- this will take a while with local LLMs, its sending every node in the document to the LLM\n", 222 | "resp = agent.chat(\"What is the summary of the paper?\")" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "The summary of the paper highlights the development of Multimodal Large Language Models (MLLMs) by incorporating image-caption, interleaved image-text, and text-only data for pre-training. The research emphasizes factors like the image encoder, resolution, and token count, while de-emphasizing the design of the vision-language connector. The MM1 family of models, scaling up to 30B parameters, shows impressive performance in pre-training metrics and competitive outcomes on various multimodal benchmarks. 
These models demonstrate capabilities such as in-context learning and multi-image reasoning, aiming to provide valuable insights for creating MLLMs that benefit the research community.\n" 235 | ] 236 | } 237 | ], 238 | "source": [ 239 | "print(str(resp))" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "Added user message to memory: How do the authors evaluate their work?\n", 252 | "=== Calling Function ===\n", 253 | "Calling function: search with args: {\"input\":\"evaluation methods\"}\n", 254 | "Got output: The evaluation methods involve synthesizing all benchmark results into a single meta-average number to simplify comparisons. This is achieved by normalizing the evaluation metrics with respect to a baseline configuration, standardizing the results for each task, adjusting every metric by dividing it by its respective baseline, and then averaging across all metrics.\n", 255 | "========================\n", 256 | "\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "resp = agent.chat(\"How do the authors evaluate their work?\")" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "name": "stdout", 271 | "output_type": "stream", 272 | "text": [ 273 | "The authors evaluate their work by synthesizing all benchmark results into a single meta-average number to simplify comparisons. 
They normalize the evaluation metrics with respect to a baseline configuration, standardize the results for each task, adjust every metric by dividing it by its respective baseline, and then average across all metrics for evaluation.\n" 274 | ] 275 | } 276 | ], 277 | "source": [ 278 | "print(str(resp))" 279 | ] 280 | } 281 | ], 282 | "metadata": { 283 | "kernelspec": { 284 | "display_name": "llama-parse-aNC435Vv-py3.10", 285 | "language": "python", 286 | "name": "python3" 287 | }, 288 | "language_info": { 289 | "codemirror_mode": { 290 | "name": "ipython", 291 | "version": 3 292 | }, 293 | "file_extension": ".py", 294 | "mimetype": "text/x-python", 295 | "name": "python", 296 | "nbconvert_exporter": "python", 297 | "pygments_lexer": "ipython3" 298 | } 299 | }, 300 | "nbformat": 4, 301 | "nbformat_minor": 2 302 | } 303 | -------------------------------------------------------------------------------- /examples/parse/caltrain/caltrain_schedule_weekend.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/caltrain/caltrain_schedule_weekend.pdf -------------------------------------------------------------------------------- /examples/parse/data/BP_Excel.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/data/BP_Excel.xlsx -------------------------------------------------------------------------------- /examples/parse/data/nvidia_quarterly_revenue_trend_by_market.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/data/nvidia_quarterly_revenue_trend_by_market.xlsx 
-------------------------------------------------------------------------------- /examples/parse/demo_api.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Using the Raw API\n", 8 | "\n", 9 | "This notebook walks through how to use the raw API and how" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "--2024-02-02 11:11:39-- https://arxiv.org/pdf/1706.03762.pdf\n", 22 | "Resolving arxiv.org (arxiv.org)... 151.101.131.42, 151.101.3.42, 151.101.67.42, ...\n", 23 | "Connecting to arxiv.org (arxiv.org)|151.101.131.42|:443... connected.\n", 24 | "HTTP request sent, awaiting response... 200 OK\n", 25 | "Length: 2215244 (2.1M) [application/pdf]\n", 26 | "Saving to: ‘./attention.pdf’\n", 27 | "\n", 28 | "./attention.pdf 100%[===================>] 2.11M --.-KB/s in 0.08s \n", 29 | "\n", 30 | "2024-02-02 11:11:39 (27.3 MB/s) - ‘./attention.pdf’ saved [2215244/2215244]\n", 31 | "\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "!wget \"https://arxiv.org/pdf/1706.03762.pdf\" -O \"./attention.pdf\"" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "api_key = \"llx-...\"" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "import mimetypes\n", 55 | "import requests\n", 56 | "import time\n", 57 | "\n", 58 | "headers = {\"Authorization\": f\"Bearer {api_key}\"}\n", 59 | "file_path = \"./attention.pdf\"\n", 60 | "base_url = \"https://api.cloud.llamaindex.ai/api/parsing\"\n", 61 | "\n", 62 | "with open(file_path, \"rb\") as f:\n", 63 | " mime_type = mimetypes.guess_type(file_path)[0]\n", 64 | " files = {\"file\": (f.name, f, 
mime_type)}\n", 65 | "\n", 66 | " # send the request, upload the file\n", 67 | " url = f\"{base_url}/upload\"\n", 68 | " response = requests.post(url, headers=headers, files=files)\n", 69 | "\n", 70 | "response.raise_for_status()\n", 71 | "# get the job id for the result_url\n", 72 | "job_id = response.json()[\"id\"]\n", 73 | "result_type = \"text\" # or \"markdown\"\n", 74 | "result_url = f\"{base_url}/job/{job_id}/result/{result_type}\"\n", 75 | "\n", 76 | "# check for the result until its ready\n", 77 | "while True:\n", 78 | " response = requests.get(result_url, headers=headers)\n", 79 | " if response.status_code == 200:\n", 80 | " break\n", 81 | "\n", 82 | " time.sleep(2)\n", 83 | "\n", 84 | "# download the result\n", 85 | "result = response.json()\n", 86 | "output = result[result_type]" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | " Provided proper attribution is provided, Google hereby grants permission to\n", 99 | " reproduce the tables and figures in this paper solely for use in journalistic or\n", 100 | " scholarly works.\n", 101 | " Attention Is All You Need\n", 102 | "arXiv:1706.03762v7 [cs.CL] 2 Aug 2023\n", 103 | " Ashish Vaswani∗ Noam Shazeer∗ Niki Parmar∗ Jakob Uszkoreit∗\n", 104 | " Google Brain Google Brain Google Research Google Research\n", 105 | " avaswani@google.com noam@google.com nikip@google.com usz@google.com\n", 106 | " Llion Jones∗ Aidan N. 
Gomez∗ † Łukasz Kaiser∗\n", 107 | " Google Research University of Toronto \n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "print(output[:1000])" 113 | ] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "llama-parse-aNC435Vv-py3.11", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3" 132 | } 133 | }, 134 | "nbformat": 4, 135 | "nbformat_minor": 2 136 | } 137 | -------------------------------------------------------------------------------- /examples/parse/demo_starter_parse_selected_pages.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\"Open" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Parse Selected Pages \n", 15 | "\n", 16 | "In this notebook we will demonstrate how to parse selected pages in a document using LlamaParse." 
17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Installation\n", 24 | "\n", 25 | "Here we install `llama-parse` used for parsing the document" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "!pip install llama-cloud-services" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "### Set API Key" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "import os\n", 51 | "\n", 52 | "# API access to llama-cloud\n", 53 | "os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"\"" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "### Download Data\n", 61 | "\n", 62 | "Here we download Uber 2021 10K SEC filings data for the demonstration." 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/uber_2021.pdf' -O './uber_2021.pdf'" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "### Parse the PDF file in selected pages\n", 79 | "\n", 80 | "Here we will parse the PDF file in selected pages and get the text in `markdown` format." 
81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "Started parsing the file under job_id ad1087c1-b085-4dc7-9aa8-d13cdd440f2b\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "from llama_cloud_services import LlamaParse\n", 98 | "\n", 99 | "parser = LlamaParse(target_pages=\"0,1,2\")\n", 100 | "\n", 101 | "results = await parser.aparse(\"./uber_2021.pdf\")\n", 102 | "documents = results.get_text_documents(split_by_page=True)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "data": { 112 | "text/plain": [ 113 | "[Document(id_='d0b34f4a-27ef-48e2-a92a-386e5e265f4c', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\\n', text='# UNITED STATES SECURITIES AND EXCHANGE COMMISSION\\n\\n# Washington, D.C. 20549\\n\\n# FORM 10-K\\n\\n(Mark One)\\n\\n☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\\n\\nFor the fiscal year ended December 31, 2021\\n\\nOR\\n\\n☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\\n\\nFor the transition period from _____ to _____\\n\\nCommission File Number: 001-38902\\n\\n# UBER TECHNOLOGIES, INC.\\n\\n(Exact name of registrant as specified in its charter)\\n\\nDelaware\\n\\n45-2647441\\n\\n(State or other jurisdiction of incorporation or organization) (I.R.S. 
Employer Identification No.)\\n\\n1515 3rd Street\\n\\nSan Francisco, California 94158\\n\\n(Address of principal executive offices, including zip code)\\n\\n(415) 612-8582\\n\\n(Registrant’s telephone number, including area code)\\n\\n# Securities registered pursuant to Section 12(b) of the Act:\\n\\n|Title of each class|Trading Symbol(s)|Name of each exchange on which registered|\\n|---|---|---|\\n|Common Stock, par value $0.00001 per share|UBER|New York Stock Exchange|\\n\\nSecurities registered pursuant to Section 12(g) of the Act: None\\n\\nIndicate by check mark whether the registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act. Yes ☒ No ☐\\n\\nIndicate by check mark whether the registrant is not required to file reports pursuant to Section 13 or Section 15(d) of the Act. Yes ☐ No ☒\\n\\nIndicate by check mark whether the registrant (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 during the preceding 12 months (or for such shorter period that the registrant was required to file such reports), and (2) has been subject to such filing requirements for the past 90 days. Yes ☒ No ☐\\n\\nIndicate by check mark whether the registrant has submitted electronically every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation S-T (§232.405 of this chapter) during the preceding 12 months (or for such shorter period that the registrant was required to submit such files). Yes ☒ No ☐\\n\\nIndicate by check mark whether the registrant is a large accelerated filer, an accelerated filer, a non-accelerated filer, a smaller reporting company, or an emerging growth company. 
See the definitions of “large accelerated filer,” “accelerated filer,” “smaller reporting company,” and “emerging growth company” in Rule 12b-2 of the Exchange Act.', mimetype='text/plain', start_char_idx=None, end_char_idx=None, metadata_seperator='\\n', text_template='{metadata_str}\\n\\n{content}'),\n", 114 | " Document(id_='253b1141-a260-466e-b164-b39df67ef799', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\\n', text=\"# Large accelerated filer\\n\\n☒\\n\\n# Accelerated filer\\n\\n☐\\n\\n# Non-accelerated filer\\n\\n☐\\n\\n# Smaller reporting company\\n\\n☐\\n\\n# Emerging growth company\\n\\n☐\\n\\nIf an emerging growth company, indicate by check mark if the registrant has elected not to use the extended transition period for complying with any new or revised financial accounting standards provided pursuant to Section 13(a) of the Exchange Act.\\n\\n☐\\n\\nIndicate by check mark whether the registrant has filed a report on and attestation to its management’s assessment of the effectiveness of its internal control over financial reporting under Section 404(b) of the Sarbanes-Oxley Act (15 U.S.C. 7262(b)) by the registered public accounting firm that prepared or issued\\n\\n☒\\n\\nIndicate by check mark whether the registrant is a shell company (as defined in Rule 12b-2 of the Exchange Act). 
Yes\\n\\n☐\\n\\nNo\\n\\n☒\\n\\nThe aggregate market value of the voting and non-voting common equity held by non-affiliates of the registrant as of June 30, 2021, the last business day of the registrant's most recently completed second fiscal quarter, was approximately $90.5 billion based upon the closing price reported for such date on the New York Stock Exchange.\\n\\nThe number of shares of the registrant's common stock outstanding as of February 22, 2022 was 1,954,464,088.\\n\\n# DOCUMENTS INCORPORATED BY REFERENCE\\n\\nPortions of the registrant’s Definitive Proxy Statement relating to the Annual Meeting of Stockholders are incorporated by reference into Part III of this Annual Report on Form 10-K where indicated. Such Definitive Proxy Statement will be filed with the Securities and Exchange Commission within 120 days after the end of the registrant’s fiscal year ended December 31, 2021.\", mimetype='text/plain', start_char_idx=None, end_char_idx=None, metadata_seperator='\\n', text_template='{metadata_str}\\n\\n{content}'),\n", 115 | " Document(id_='ad988239-3ab5-498d-85ba-a29241db24d4', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\\n', text='# UBER TECHNOLOGIES, INC.\\n\\n# TABLE OF CONTENTS\\n\\n|Special Note Regarding Forward-Looking Statements|2|\\n|---|---|\\n|PART I|PART I|\\n|Item 1. Business|4|\\n|Item 1A. Risk Factors|11|\\n|Item 1B. Unresolved Staff Comments|46|\\n|Item 2. Properties|46|\\n|Item 3. Legal Proceedings|46|\\n|Item 4. Mine Safety Disclosures|47|\\n|PART II|PART II|\\n|Item 5. Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities|47|\\n|Item 6. [Reserved]|48|\\n|Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations|48|\\n|Item 7A. Quantitative and Qualitative Disclosures About Market Risk|69|\\n|Item 8. 
Financial Statements and Supplementary Data|70|\\n|Item 9. Changes in and Disagreements with Accountants on Accounting and Financial Disclosure|146|\\n|Item 9A. Controls and Procedures|147|\\n|Item 9B. Other Information|147|\\n|Item 9C. Disclosure Regarding Foreign Jurisdictions that Prevent Inspections|147|\\n|PART III|PART III|\\n|Item 10. Directors, Executive Officers and Corporate Governance|147|\\n|Item 11. Executive Compensation|147|\\n|Item 12. Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters|148|\\n|Item 13. Certain Relationships and Related Transactions, and Director Independence|148|\\n|Item 14. Principal Accounting Fees and Services|148|\\n|PART IV|PART IV|\\n|Item 15. Exhibits, Financial Statement Schedules|148|\\n|Item 16. Form 10-K Summary|148|\\n|Exhibit Index|149|\\n|Signatures|152|', mimetype='text/plain', start_char_idx=None, end_char_idx=None, metadata_seperator='\\n', text_template='{metadata_str}\\n\\n{content}')]" 116 | ] 117 | }, 118 | "execution_count": null, 119 | "metadata": {}, 120 | "output_type": "execute_result" 121 | } 122 | ], 123 | "source": [ 124 | "documents" 125 | ] 126 | } 127 | ], 128 | "metadata": { 129 | "kernelspec": { 130 | "display_name": "llamacloud", 131 | "language": "python", 132 | "name": "llamacloud" 133 | }, 134 | "language_info": { 135 | "codemirror_mode": { 136 | "name": "ipython", 137 | "version": 3 138 | }, 139 | "file_extension": ".py", 140 | "mimetype": "text/x-python", 141 | "name": "python", 142 | "nbconvert_exporter": "python", 143 | "pygments_lexer": "ipython3" 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 2 148 | } 149 | -------------------------------------------------------------------------------- /examples/parse/excel/references/query1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/excel/references/query1.png -------------------------------------------------------------------------------- /examples/parse/excel/references/query2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/excel/references/query2.png -------------------------------------------------------------------------------- /examples/parse/excel/references/query3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/excel/references/query3.png -------------------------------------------------------------------------------- /examples/parse/excel/references/query4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/excel/references/query4.png -------------------------------------------------------------------------------- /examples/parse/excel/references/query5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/excel/references/query5.png -------------------------------------------------------------------------------- /examples/parse/excel/references/recursive_retrieval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/excel/references/recursive_retrieval.png 
-------------------------------------------------------------------------------- /examples/parse/json_tour_screenshots/32778fb0-9e83-4b00-aebe-0d7f59ff0b5f-img_p0_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/json_tour_screenshots/32778fb0-9e83-4b00-aebe-0d7f59ff0b5f-img_p0_1.png -------------------------------------------------------------------------------- /examples/parse/json_tour_screenshots/32778fb0-9e83-4b00-aebe-0d7f59ff0b5f-page_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/json_tour_screenshots/32778fb0-9e83-4b00-aebe-0d7f59ff0b5f-page_1.jpg -------------------------------------------------------------------------------- /examples/parse/json_tour_screenshots/img_p0_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/json_tour_screenshots/img_p0_1.png -------------------------------------------------------------------------------- /examples/parse/json_tour_screenshots/links_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/json_tour_screenshots/links_page.png -------------------------------------------------------------------------------- /examples/parse/json_tour_screenshots/page_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/json_tour_screenshots/page_1.png 
-------------------------------------------------------------------------------- /examples/parse/json_tour_screenshots/page_35.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/json_tour_screenshots/page_35.png -------------------------------------------------------------------------------- /examples/parse/knowledge_graphs/sf2023_budget_kg_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/knowledge_graphs/sf2023_budget_kg_screenshot.png -------------------------------------------------------------------------------- /examples/parse/multimodal/XC9500_CPLD_Family_p3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/multimodal/XC9500_CPLD_Family_p3.png -------------------------------------------------------------------------------- /examples/parse/multimodal/llama2-p33.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/multimodal/llama2-p33.png -------------------------------------------------------------------------------- /examples/parse/multimodal/llama3.1-p5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/multimodal/llama3.1-p5.png -------------------------------------------------------------------------------- /examples/parse/multimodal/multimodal_contextual_retrieval_rag_img.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/multimodal/multimodal_contextual_retrieval_rag_img.png -------------------------------------------------------------------------------- /examples/parse/multimodal/multimodal_rag_slide_deck_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/multimodal/multimodal_rag_slide_deck_img.png -------------------------------------------------------------------------------- /examples/parse/multimodal/multimodal_report_generation_agent_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/multimodal/multimodal_report_generation_agent_img.png -------------------------------------------------------------------------------- /examples/parse/other_files/demo_ppt_financial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# LlamaParse - Parsing Financial Powerpoints 📊\n", 8 | "\n", 9 | "In this cookbook we show you how to use LlamaParse to parse a financial powerpoint." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Installation\n", 17 | "\n", 18 | "Parsing instruction are part of the LlamaParse API. 
They can be access by directly specifying the parsing_instruction parameter in the API or by using LlamaParse python module (which we will use for this tutorial).\n", 19 | "\n", 20 | "To install llama-parse, just get it from `pip`:" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "%pip install llama-index\n", 30 | "%pip install llama-cloud-services\n", 31 | "%pip install torch transformers python-pptx Pillow" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "## API Key\n", 39 | "\n", 40 | "The use of LlamaParse requires an API key which you can get here: https://cloud.llamaindex.ai/parse" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "import os\n", 50 | "\n", 51 | "os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"llx-...\"\n", 52 | "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "**NOTE**: Since LlamaParse is natively async, running the sync code in a notebook requires the use of nest_asyncio.\n" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "import nest_asyncio\n", 69 | "\n", 70 | "nest_asyncio.apply()" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## Importing the package\n", 78 | "\n", 79 | "To import llama_parse simply do:" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "from llama_cloud_services import LlamaParse" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "## Using LlamaParse to Parse Presentations\n", 96 | "\n", 97 | "Like Powerpoints, presentations are often 
hard to extract for RAG. With LlamaParse we can now parse them and unclock their content of presentations for RAG.\n", 98 | "\n", 99 | "Let's download a financial report from the World Meteorological Association." 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "! mkdir data; wget \"https://meetings.wmo.int/Cg-19/PublishingImages/SitePages/FINAC-43/7%20-%20EC-77-Doc%205%20Financial%20Statements%20for%202022%20(FINAC).pptx\" -O data/presentation.pptx" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "### Parsing the presentation\n", 116 | "\n", 117 | "Now let's parse it into Markdown with LlamaParse and the default LlamaIndex parser.\n", 118 | "\n", 119 | "\n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "#### Llama Index default" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "from llama_index.core import SimpleDirectoryReader\n", 136 | "\n", 137 | "vanilla_documents = SimpleDirectoryReader(\"./data/\").load_data()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "#### Llama Parse" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "Started parsing the file under job_id 56724c0d-e45a-4e30-ae8c-e416173c608a\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "llama_parse_documents = LlamaParse(result_type=\"markdown\").load_data(\n", 162 | " \"./data/presentation.pptx\"\n", 163 | ")" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "Let's take a look at the parsed output from an example slide (see 
image below).\n", 171 | "\n", 172 | "As we can see the table is faithfully extracted!" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "ation and mitigation\n", 185 | "---\n", 186 | "|Item|31 Dec 2022|31 Dec 2021|Change|\n", 187 | "|---|---|---|---|\n", 188 | "|Payables and accruals|4,685|4,066|619|\n", 189 | "|Employee benefits|127,215|84,676|42,539|\n", 190 | "|Contributions received in advance|6,975|10,192|(3,217)|\n", 191 | "|Unearned revenue from exchange transactions|20|651|(631)|\n", 192 | "|Deferred Revenue|71,301|55,737|15,564|\n", 193 | "|Borrowings|28,229|29,002|(773)|\n", 194 | "|Funds held in trust|30,373|29,014|1,359|\n", 195 | "|Provisions|1,706|1,910|(204)|\n", 196 | "|Total Liabilities|270,504|215,248|55,256|\n", 197 | "---\n", 198 | "## Liabilities\n", 199 | "\n", 200 | "Employee Ben\n" 201 | ] 202 | } 203 | ], 204 | "source": [ 205 | "print(llama_parse_documents[0].get_content()[-2800:-2300])" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "Compared against the original slide image.\n", 213 | "![Demo](demo_ppt_financial_1.png)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "## Comparing the two for RAG\n", 221 | "\n", 222 | "The main difference between LlamaParse and the previous directory reader approach, is that LlamaParse will extract the document in a structured format, allowing better RAG." 
223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "### Query Engine on SimpleDirectoryReader results" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n", 239 | "\n", 240 | "vanilla_index = VectorStoreIndex.from_documents(vanilla_documents)\n", 241 | "vanilla_query_engine = vanilla_index.as_query_engine()" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "### Query Engine on LlamaParse Results\n" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "llama_parse_index = VectorStoreIndex.from_documents(llama_parse_documents)\n", 258 | "llama_parse_query_engine = llama_parse_index.as_query_engine()" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "### Liability provision\n", 266 | "What was the liability provision as of Dec 31 2021?\n", 267 | "\n", 268 | "" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "name": "stdout", 278 | "output_type": "stream", 279 | "text": [ 280 | "The liability provision as of December 31, 2021, included Employee Benefit Liabilities, Contributions received in advance (assessed contributions), and Deferred revenue.\n" 281 | ] 282 | } 283 | ], 284 | "source": [ 285 | "vanilla_response = vanilla_query_engine.query(\n", 286 | " \"What was the liability provision as of Dec 31 2021?\"\n", 287 | ")\n", 288 | "print(vanilla_response)" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "name": "stdout", 298 | "output_type": "stream", 299 | "text": [ 300 | 
"The liability provision as of December 31, 2021, was 1,910 CHF.\n" 301 | ] 302 | } 303 | ], 304 | "source": [ 305 | "llama_parse_response = llama_parse_query_engine.query(\n", 306 | " \"What was the liability provision as of Dec 31 2021?\"\n", 307 | ")\n", 308 | "print(llama_parse_response)" 309 | ] 310 | } 311 | ], 312 | "metadata": { 313 | "colab": { 314 | "provenance": [] 315 | }, 316 | "kernelspec": { 317 | "display_name": "llama_parse", 318 | "language": "python", 319 | "name": "llama_parse" 320 | }, 321 | "language_info": { 322 | "codemirror_mode": { 323 | "name": "ipython", 324 | "version": 3 325 | }, 326 | "file_extension": ".py", 327 | "mimetype": "text/x-python", 328 | "name": "python", 329 | "nbconvert_exporter": "python", 330 | "pygments_lexer": "ipython3" 331 | } 332 | }, 333 | "nbformat": 4, 334 | "nbformat_minor": 4 335 | } 336 | -------------------------------------------------------------------------------- /examples/parse/parsing_instructions/expense_report_document.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_instructions/expense_report_document.pdf -------------------------------------------------------------------------------- /examples/parse/parsing_instructions/expense_report_document.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_instructions/expense_report_document.png -------------------------------------------------------------------------------- /examples/parse/parsing_instructions/mcdonalds_receipt.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_instructions/mcdonalds_receipt.png -------------------------------------------------------------------------------- /examples/parse/parsing_instructions/purchase_order_document.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_instructions/purchase_order_document.pdf -------------------------------------------------------------------------------- /examples/parse/parsing_instructions/purchase_order_document.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_instructions/purchase_order_document.png -------------------------------------------------------------------------------- /examples/parse/parsing_modes/diagram.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_modes/diagram.jpg -------------------------------------------------------------------------------- /examples/parse/parsing_modes/layout_agent_citation_engine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_modes/layout_agent_citation_engine.png -------------------------------------------------------------------------------- /examples/parse/parsing_modes/layout_agent_moe.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_modes/layout_agent_moe.png -------------------------------------------------------------------------------- /examples/parse/parsing_modes/layout_agent_parse_explainer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_modes/layout_agent_parse_explainer.png -------------------------------------------------------------------------------- /examples/parse/parsing_modes/mermaid_render.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_modes/mermaid_render.png -------------------------------------------------------------------------------- /examples/parse/parsing_modes/page_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_modes/page_1.png -------------------------------------------------------------------------------- /examples/parse/parsing_modes/page_11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_modes/page_11.png -------------------------------------------------------------------------------- /examples/parse/parsing_modes/page_14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_modes/page_14.png 
-------------------------------------------------------------------------------- /examples/parse/parsing_modes/page_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_modes/page_3.png -------------------------------------------------------------------------------- /examples/parse/report_generation/rfp_response/generate_rfp_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/report_generation/rfp_response/generate_rfp_img.png -------------------------------------------------------------------------------- /examples/parse/test_tesla_impact_report/2019-tesla-impact-report-short.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/test_tesla_impact_report/2019-tesla-impact-report-short.pdf -------------------------------------------------------------------------------- /extract.md: -------------------------------------------------------------------------------- 1 | # LlamaExtract 2 | 3 | LlamaExtract provides a simple API for extracting structured data from unstructured documents like PDFs, text files and images. 
4 | 5 | ## Quick Start 6 | 7 | ```python 8 | from llama_cloud_services import LlamaExtract 9 | from pydantic import BaseModel, Field 10 | 11 | # Initialize client 12 | extractor = LlamaExtract() 13 | 14 | 15 | # Define schema using Pydantic 16 | class Resume(BaseModel): 17 | name: str = Field(description="Full name of candidate") 18 | email: str = Field(description="Email address") 19 | skills: list[str] = Field(description="Technical skills and technologies") 20 | 21 | 22 | # Create extraction agent 23 | agent = extractor.create_agent(name="resume-parser", data_schema=Resume) 24 | 25 | # Extract data from document 26 | result = agent.extract("resume.pdf") 27 | print(result.data) 28 | ``` 29 | 30 | ## Core Concepts 31 | 32 | - **Extraction Agents**: Reusable extractors configured with a specific schema and extraction settings. 33 | - **Data Schema**: Structure definition for the data you want to extract in the form of a JSON schema or a Pydantic model. 34 | - **Extraction Jobs**: Asynchronous extraction tasks that can be monitored. 
35 | 36 | ## Defining Schemas 37 | 38 | Schemas can be defined using either Pydantic models or JSON Schema: 39 | 40 | ### Using Pydantic (Recommended) 41 | 42 | ```python 43 | from pydantic import BaseModel, Field 44 | from typing import List, Optional 45 | 46 | 47 | class Experience(BaseModel): 48 | company: str = Field(description="Company name") 49 | title: str = Field(description="Job title") 50 | start_date: Optional[str] = Field(description="Start date of employment") 51 | end_date: Optional[str] = Field(description="End date of employment") 52 | 53 | 54 | class Resume(BaseModel): 55 | name: str = Field(description="Candidate name") 56 | experience: List[Experience] = Field(description="Work history") 57 | ``` 58 | 59 | ### Using JSON Schema 60 | 61 | ```python 62 | schema = { 63 | "type": "object", 64 | "properties": { 65 | "name": {"type": "string", "description": "Candidate name"}, 66 | "experience": { 67 | "type": "array", 68 | "description": "Work history", 69 | "items": { 70 | "type": "object", 71 | "properties": { 72 | "company": { 73 | "type": "string", 74 | "description": "Company name", 75 | }, 76 | "title": {"type": "string", "description": "Job title"}, 77 | "start_date": { 78 | "anyOf": [{"type": "string"}, {"type": "null"}], 79 | "description": "Start date of employment", 80 | }, 81 | "end_date": { 82 | "anyOf": [{"type": "string"}, {"type": "null"}], 83 | "description": "End date of employment", 84 | }, 85 | }, 86 | }, 87 | }, 88 | }, 89 | } 90 | 91 | agent = extractor.create_agent(name="resume-parser", data_schema=schema) 92 | ``` 93 | 94 | ### Important restrictions on JSON/Pydantic Schema 95 | 96 | _LlamaExtract only supports a subset of the JSON Schema specification._ While limited, it should 97 | be sufficient for a wide variety of use-cases. 98 | 99 | - All fields are required by default. Nullable fields must be explicitly marked as such, 100 | using `anyOf` with a `null` type. See `"start_date"` field above. 
101 | - Root node must be of type `object`. 102 | - Schema nesting must be limited to within 5 levels. 103 | - The important fields are key names/titles, type and description. Fields for 104 | formatting, default values, etc. are **not supported**. If you need these, you can add the 105 | restrictions to your field description and/or use a post-processing step. e.g. default values can be supported by making a field optional and then setting `"null"` values from the extraction result to the default value. 106 | - There are other restrictions on number of keys, size of the schema, etc. that you may 107 | hit for complex extraction use cases. In such cases, it is worth thinking how to restructure 108 | your extraction workflow to fit within these constraints, e.g. by extracting subset of fields 109 | and later merging them together. 110 | 111 | ## Other Extraction APIs 112 | 113 | ### Extraction over bytes or text 114 | 115 | You can use the `SourceText` class to extract from bytes or text directly without using a file. If passing the file bytes, 116 | you will need to pass the filename to the `SourceText` class. 
117 | 118 | ```python 119 | with open("resume.pdf", "rb") as f: 120 | file_bytes = f.read() 121 | result = agent.extract(SourceText(file=file_bytes, filename="resume.pdf")) 122 | ``` 123 | 124 | ```python 125 | result = agent.extract( 126 | SourceText(text_content="Candidate Name: Jane Doe") 127 | ) 128 | ``` 129 | 130 | ### Batch Processing 131 | 132 | Process multiple files asynchronously: 133 | 134 | ```python 135 | # Queue multiple files for extraction 136 | jobs = await agent.queue_extraction(["resume1.pdf", "resume2.pdf"]) 137 | 138 | # Check job status 139 | for job in jobs: 140 | status = agent.get_extraction_job(job.id).status 141 | print(f"Job {job.id}: {status}") 142 | 143 | # Get results when complete 144 | results = [agent.get_extraction_run_for_job(job.id) for job in jobs] 145 | ``` 146 | 147 | ### Updating Schemas 148 | 149 | Schemas can be modified and updated after creation: 150 | 151 | ```python 152 | # Update schema 153 | agent.data_schema = new_schema 154 | 155 | # Save changes 156 | agent.save() 157 | ``` 158 | 159 | ### Managing Agents 160 | 161 | ```python 162 | # List all agents 163 | agents = extractor.list_agents() 164 | 165 | # Get specific agent 166 | agent = extractor.get_agent(name="resume-parser") 167 | 168 | # Delete agent 169 | extractor.delete_agent(agent.id) 170 | ``` 171 | 172 | ## Installation 173 | 174 | ```bash 175 | pip install llama-extract==0.1.0 176 | ``` 177 | 178 | ## Tips & Best Practices 179 | 180 | At the core of LlamaExtract is the schema, which defines the structure of the data you want to extract from your documents. 181 | 182 | 1. **Schema Design**: 183 | 184 | - Try to limit schema nesting to 3-4 levels. 185 | - Make fields optional when data might not always be present. Having required fields may force the model 186 | to hallucinate when these fields are not present in the documents. 187 | - When you want to extract a variable number of entities, use an `array` type. 
However, note that you cannot use 188 | an `array` type for the root node. 189 | - Use descriptive field names and detailed descriptions. Use descriptions to pass formatting 190 | instructions or few-shot examples. 191 | - Above all, start simple and iteratively build your schema to incorporate requirements. 192 | 193 | 2. **Running Extractions**: 194 | - Note that resetting `agent.data_schema` will not save the schema to the database, 195 | until you call `agent.save`, but it will be used for running extractions. 196 | - Check job status prior to accessing results. Any extraction error should be available as 197 | part of `job.error` or `extraction_run.error` fields for debugging. 198 | - Consider async operations (`queue_extraction`) for large-scale extraction once you have finalized your schema. 199 | 200 | ### Hitting "The response was too long to be processed" Error 201 | 202 | This implies that the extraction response is hitting output token limits of the LLM. In such cases, it is worth rethinking the design of your schema to enable a more efficient/scalable extraction. e.g. 203 | 204 | - Instead of one field that extracts a complex object, you can use multiple fields to distribute the extraction logic. 205 | - You can also use multiple schemas to extract different subsets of fields from the same document and merge them later. 206 | 207 | Another option (orthogonal to the above) is to break the document into smaller sections and extract from each section individually, when possible. LlamaExtract will in most cases be able to handle both document and schema chunking automatically, but there are cases where you may need to do this manually. 
208 | 209 | ## Additional Resources 210 | 211 | - [Example Notebook](examples/resume_screening.ipynb) - Detailed walkthrough of resume parsing 212 | - [Discord Community](https://discord.com/invite/eN6D2HQ4aX) - Get help and share feedback 213 | -------------------------------------------------------------------------------- /llama_cloud_services/__init__.py: -------------------------------------------------------------------------------- 1 | from llama_cloud_services.parse import LlamaParse 2 | from llama_cloud_services.report import ReportClient, LlamaReport 3 | from llama_cloud_services.extract import LlamaExtract, ExtractionAgent 4 | from llama_cloud_services.constants import EU_BASE_URL 5 | 6 | __all__ = [ 7 | "LlamaParse", 8 | "ReportClient", 9 | "LlamaReport", 10 | "LlamaExtract", 11 | "ExtractionAgent", 12 | "EU_BASE_URL", 13 | ] 14 | -------------------------------------------------------------------------------- /llama_cloud_services/constants.py: -------------------------------------------------------------------------------- 1 | EU_BASE_URL = "https://api.cloud.eu.llamaindex.ai" 2 | -------------------------------------------------------------------------------- /llama_cloud_services/extract/__init__.py: -------------------------------------------------------------------------------- 1 | from llama_cloud_services.extract.extract import ( 2 | LlamaExtract, 3 | ExtractConfig, 4 | ExtractionAgent, 5 | SourceText, 6 | ExtractTarget, 7 | ExtractMode, 8 | ) 9 | 10 | __all__ = [ 11 | "LlamaExtract", 12 | "ExtractionAgent", 13 | "SourceText", 14 | "ExtractConfig", 15 | "ExtractTarget", 16 | "ExtractMode", 17 | ] 18 | -------------------------------------------------------------------------------- /llama_cloud_services/extract/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Union, Generator 2 | from contextlib import contextmanager 3 | 4 | # Asyncio error messages 5 | 
nest_asyncio_err = "cannot be called from a running event loop" 6 | nest_asyncio_msg = ( 7 | "The event loop is already running. " 8 | "Add `import nest_asyncio; nest_asyncio.apply()` to your code to fix this issue." 9 | ) 10 | 11 | 12 | def is_jupyter() -> bool: 13 | """Check if we're running in a Jupyter environment.""" 14 | try: 15 | from IPython import get_ipython 16 | 17 | return get_ipython().__class__.__name__ == "ZMQInteractiveShell" 18 | except (ImportError, AttributeError): 19 | return False 20 | 21 | 22 | @contextmanager 23 | def augment_async_errors() -> Generator[None, None, None]: 24 | """Context manager to add helpful information for errors due to nested event loops.""" 25 | try: 26 | yield 27 | except RuntimeError as e: 28 | if nest_asyncio_err in str(e): 29 | raise RuntimeError(nest_asyncio_msg) 30 | raise 31 | 32 | 33 | JSONType = Union[Dict[str, Any], List[Any], str, int, float, bool, None] 34 | JSONObjectType = Dict[str, JSONType] 35 | 36 | 37 | class ExperimentalWarning(Warning): 38 | """Warning for experimental features.""" 39 | 40 | pass 41 | -------------------------------------------------------------------------------- /llama_cloud_services/parse/__init__.py: -------------------------------------------------------------------------------- 1 | from llama_cloud_services.parse.base import ( 2 | LlamaParse, 3 | ResultType, 4 | ParsingMode, 5 | FailedPageMode, 6 | ) 7 | 8 | __all__ = ["LlamaParse", "ResultType", "ParsingMode", "FailedPageMode"] 9 | -------------------------------------------------------------------------------- /llama_cloud_services/parse/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/llama_cloud_services/parse/cli/__init__.py -------------------------------------------------------------------------------- /llama_cloud_services/parse/cli/main.py: 
-------------------------------------------------------------------------------- 1 | import click 2 | import json 3 | from enum import Enum 4 | from pathlib import Path 5 | from pydantic.fields import FieldInfo 6 | from typing import Any, Callable, List 7 | 8 | from llama_cloud_services.parse.base import LlamaParse 9 | 10 | 11 | def pydantic_field_to_click_option(name: str, field: FieldInfo) -> click.Option: 12 | """Convert a Pydantic field to a Click option.""" 13 | kwargs = { 14 | "default": field.default if field.default else None, 15 | "help": field.description, 16 | } 17 | 18 | if isinstance(kwargs["default"], Enum): 19 | kwargs["default"] = kwargs["default"].value 20 | 21 | if field.annotation is bool: 22 | kwargs["is_flag"] = True 23 | if field.default and field.default is True: 24 | name = f"no-{name}" 25 | return click.option(f'--{name.replace("_", "-")}', **kwargs) 26 | 27 | 28 | def add_options(options: List[click.Option]) -> Callable: 29 | def _add_options(func: Callable) -> Callable: 30 | for option in reversed(options): 31 | func = option(func) 32 | return func 33 | 34 | return _add_options 35 | 36 | 37 | @click.command() 38 | @click.argument("file_paths", nargs=-1, type=click.Path(exists=True, path_type=Path)) 39 | @click.option( 40 | "--output-file", type=click.Path(path_type=Path), help="Path to save the output" 41 | ) 42 | @click.option("--output-raw-json", is_flag=True, help="Output the raw JSON result") 43 | @add_options( 44 | [ 45 | pydantic_field_to_click_option(name, field) 46 | for name, field in LlamaParse.model_fields.items() 47 | if name not in ["custom_client"] 48 | ] 49 | ) 50 | def parse(**kwargs: Any) -> None: 51 | """Parse files using LlamaParse and output the results.""" 52 | file_paths = kwargs.pop("file_paths") 53 | output_file = kwargs.pop("output_file") 54 | output_raw_json = kwargs.pop("output_raw_json") 55 | 56 | # Remove None values to use LlamaParse defaults 57 | kwargs = {k: v for k, v in kwargs.items() if v is not None} 58 
| 59 | # Remove no- prefix for boolean flags 60 | kwargs = {k.replace("no_", ""): v for k, v in kwargs.items()} 61 | 62 | parser = LlamaParse(**kwargs) 63 | if output_raw_json: 64 | results = parser.get_json_result(list(file_paths)) 65 | 66 | if output_file: 67 | with output_file.open("w") as f: 68 | json.dump(results, f) 69 | click.echo(f"Results saved to {output_file}") 70 | else: 71 | click.echo(results) 72 | else: 73 | results = parser.load_data(list(file_paths)) 74 | 75 | if output_file: 76 | with output_file.open("w") as f: 77 | for i, doc in enumerate(results): 78 | f.write(f"File: {doc.metadata.get('file_path', 'Unknown')}\n") # type: ignore 79 | f.write(doc.text) # type: ignore 80 | if i < len(results) - 1: 81 | f.write("\n\n---\n\n") 82 | click.echo(f"Results saved to {output_file}") 83 | else: 84 | for i, doc in enumerate(results): 85 | click.echo(f"File: {doc.metadata.get('file_path', 'Unknown')}") # type: ignore 86 | click.echo(doc.text) # type: ignore 87 | if i < len(results) - 1: 88 | click.echo("\n---\n") 89 | 90 | 91 | if __name__ == "__main__": 92 | parse() 93 | -------------------------------------------------------------------------------- /llama_cloud_services/parse/utils.py: -------------------------------------------------------------------------------- 1 | import httpx 2 | import logging 3 | from enum import Enum 4 | from tenacity import ( 5 | retry, 6 | stop_after_attempt, 7 | wait_exponential, 8 | retry_if_exception, 9 | before_sleep_log, 10 | ) 11 | from typing import Any 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | # Asyncio error messages 16 | nest_asyncio_err = "cannot be called from a running event loop" 17 | nest_asyncio_msg = "The event loop is already running. Add `import nest_asyncio; nest_asyncio.apply()` to your code to fix this issue." 
18 | 19 | 20 | class ResultType(str, Enum): 21 | """The result type for the parser.""" 22 | 23 | TXT = "text" 24 | MD = "markdown" 25 | JSON = "json" 26 | STRUCTURED = "structured" 27 | 28 | 29 | class ParsingMode(str, Enum): 30 | """The parsing mode for the parser.""" 31 | 32 | parse_page_without_llm = "parse_page_without_llm" 33 | parse_page_with_llm = "parse_page_with_llm" 34 | parse_page_with_lvm = "parse_page_with_lvm" 35 | parse_page_with_agent = "parse_page_with_agent" 36 | parse_document_with_llm = "parse_document_with_llm" 37 | parse_document_with_agent = "parse_document_with_agent" 38 | 39 | 40 | class FailedPageMode(str, Enum): 41 | """ 42 | Enum for representing the different available page error handling modes 43 | """ 44 | 45 | raw_text = "raw_text" 46 | blank_page = "blank_page" 47 | error_message = "error_message" 48 | 49 | 50 | class Language(str, Enum): 51 | BAZA = "abq" 52 | ADYGHE = "ady" 53 | AFRIKAANS = "af" 54 | ANGIKA = "ang" 55 | ARABIC = "ar" 56 | ASSAMESE = "as" 57 | AVAR = "ava" 58 | AZERBAIJANI = "az" 59 | BELARUSIAN = "be" 60 | BULGARIAN = "bg" 61 | BIHARI = "bh" 62 | BHOJPURI = "bho" 63 | BENGALI = "bn" 64 | BOSNIAN = "bs" 65 | SIMPLIFIED_CHINESE = "ch_sim" 66 | TRADITIONAL_CHINESE = "ch_tra" 67 | CHECHEN = "che" 68 | CZECH = "cs" 69 | WELSH = "cy" 70 | DANISH = "da" 71 | DARGWA = "dar" 72 | GERMAN = "de" 73 | ENGLISH = "en" 74 | SPANISH = "es" 75 | ESTONIAN = "et" 76 | PERSIAN_FARSI = "fa" 77 | FRENCH = "fr" 78 | IRISH = "ga" 79 | GOAN_KONKANI = "gom" 80 | HINDI = "hi" 81 | CROATIAN = "hr" 82 | HUNGARIAN = "hu" 83 | INDONESIAN = "id" 84 | INGUSH = "inh" 85 | ICELANDIC = "is" 86 | ITALIAN = "it" 87 | JAPANESE = "ja" 88 | KABARDIAN = "kbd" 89 | KANNADA = "kn" 90 | KOREAN = "ko" 91 | KURDISH = "ku" 92 | LATIN = "la" 93 | LAK = "lbe" 94 | LEZGHIAN = "lez" 95 | LITHUANIAN = "lt" 96 | LATVIAN = "lv" 97 | MAGAHI = "mah" 98 | MAITHILI = "mai" 99 | MAORI = "mi" 100 | MONGOLIAN = "mn" 101 | MARATHI = "mr" 102 | MALAY = "ms" 103 | MALTESE = 
"mt" 104 | NEPALI = "ne" 105 | NEWARI = "new" 106 | DUTCH = "nl" 107 | NORWEGIAN = "no" 108 | OCCITAN = "oc" 109 | PALI = "pi" 110 | POLISH = "pl" 111 | PORTUGUESE = "pt" 112 | ROMANIAN = "ro" 113 | RUSSIAN = "ru" 114 | SERBIAN_CYRILLIC = "rs_cyrillic" 115 | SERBIAN_LATIN = "rs_latin" 116 | NAGPURI = "sck" 117 | SLOVAK = "sk" 118 | SLOVENIAN = "sl" 119 | ALBANIAN = "sq" 120 | SWEDISH = "sv" 121 | SWAHILI = "sw" 122 | TAMIL = "ta" 123 | TABASSARAN = "tab" 124 | TELUGU = "te" 125 | THAI = "th" 126 | TAJIK = "tjk" 127 | TAGALOG = "tl" 128 | TURKISH = "tr" 129 | UYGHUR = "ug" 130 | UKRAINIAN = "uk" 131 | URDU = "ur" 132 | UZBEK = "uz" 133 | VIETNAMESE = "vi" 134 | 135 | 136 | SUPPORTED_FILE_TYPES = [ 137 | ".pdf", 138 | # document and presentations 139 | ".602", 140 | ".abw", 141 | ".cgm", 142 | ".cwk", 143 | ".doc", 144 | ".docx", 145 | ".docm", 146 | ".dot", 147 | ".dotm", 148 | ".hwp", 149 | ".key", 150 | ".lwp", 151 | ".mw", 152 | ".mcw", 153 | ".pages", 154 | ".pbd", 155 | ".ppt", 156 | ".pptm", 157 | ".pptx", 158 | ".pot", 159 | ".potm", 160 | ".potx", 161 | ".rtf", 162 | ".sda", 163 | ".sdd", 164 | ".sdp", 165 | ".sdw", 166 | ".sgl", 167 | ".sti", 168 | ".sxi", 169 | ".sxw", 170 | ".stw", 171 | ".sxg", 172 | ".txt", 173 | ".uof", 174 | ".uop", 175 | ".uot", 176 | ".vor", 177 | ".wpd", 178 | ".wps", 179 | ".xml", 180 | ".zabw", 181 | ".epub", 182 | # images 183 | ".jpg", 184 | ".jpeg", 185 | ".png", 186 | ".gif", 187 | ".bmp", 188 | ".svg", 189 | ".tiff", 190 | ".webp", 191 | # web 192 | ".htm", 193 | ".html", 194 | # spreadsheets 195 | ".xlsx", 196 | ".xls", 197 | ".xlsm", 198 | ".xlsb", 199 | ".xlw", 200 | ".csv", 201 | ".dif", 202 | ".sylk", 203 | ".slk", 204 | ".prn", 205 | ".numbers", 206 | ".et", 207 | ".ods", 208 | ".fods", 209 | ".uos1", 210 | ".uos2", 211 | ".dbf", 212 | ".wk1", 213 | ".wk2", 214 | ".wk3", 215 | ".wk4", 216 | ".wks", 217 | ".123", 218 | ".wq1", 219 | ".wq2", 220 | ".wb1", 221 | ".wb2", 222 | ".wb3", 223 | ".qpw", 224 | ".xlr", 225 | 
".eth", 226 | ".tsv", 227 | ".mp3", 228 | ".mp4", 229 | ".mpeg", 230 | ".mpga", 231 | ".m4a", 232 | ".wav", 233 | ".webm", 234 | ] 235 | 236 | 237 | def should_retry(exception: Exception) -> bool: 238 | """Check if the exception should be retried. 239 | 240 | Args: 241 | exception: The exception to check. 242 | """ 243 | # Retry on connection errors (network issues) 244 | if isinstance( 245 | exception, 246 | ( 247 | httpx.ConnectError, 248 | httpx.ConnectTimeout, 249 | httpx.ReadTimeout, 250 | httpx.WriteTimeout, 251 | httpx.RemoteProtocolError, 252 | ), 253 | ): 254 | return True 255 | 256 | # Retry on specific HTTP status codes 257 | if isinstance(exception, httpx.HTTPStatusError): 258 | status_code = exception.response.status_code 259 | # Retry on rate limiting or temporary server errors 260 | return status_code in (429, 500, 502, 503, 504) 261 | 262 | return False 263 | 264 | 265 | async def make_api_request( 266 | client: httpx.AsyncClient, 267 | method: str, 268 | url: str, 269 | timeout: float = 60.0, 270 | max_retries: int = 5, 271 | **httpx_kwargs: Any, 272 | ) -> httpx.Response: 273 | """Make a retrying API request to the LlamaParse API. 274 | 275 | Args: 276 | client: The httpx.AsyncClient to use for the request. 277 | method: The HTTP method to use ("GET" or "POST"). 278 | url: The URL to request. 279 | timeout: The timeout for the request. 280 | max_retries: The maximum number of retries for the request. 
281 | """ 282 | 283 | @retry( 284 | stop=stop_after_attempt(max_retries), 285 | wait=wait_exponential(multiplier=1, min=4, max=timeout), 286 | retry=retry_if_exception(should_retry), 287 | before_sleep=before_sleep_log(logger, logging.WARNING), 288 | ) 289 | async def _make_request(url: str, **httpx_kwargs: Any) -> httpx.Response: 290 | if method == "GET": 291 | response = await client.get(url, **httpx_kwargs) 292 | elif method == "POST": 293 | response = await client.post(url, **httpx_kwargs) 294 | else: 295 | raise ValueError(f"Invalid method: {method}") 296 | response.raise_for_status() 297 | return response 298 | 299 | return await _make_request(url, **httpx_kwargs) 300 | -------------------------------------------------------------------------------- /llama_cloud_services/report/__init__.py: -------------------------------------------------------------------------------- 1 | from llama_cloud_services.report.report import ReportClient 2 | from llama_cloud_services.report.base import LlamaReport 3 | 4 | __all__ = ["ReportClient", "LlamaReport"] 5 | -------------------------------------------------------------------------------- /llama_cloud_services/report/base.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import httpx 3 | import os 4 | import io 5 | from concurrent.futures import ThreadPoolExecutor 6 | from typing import Optional, List, Union, Any, Coroutine, TypeVar 7 | from urllib.parse import urljoin 8 | 9 | from llama_cloud.types import ReportMetadata 10 | from llama_cloud_services.report.report import ReportClient 11 | 12 | T = TypeVar("T") 13 | 14 | 15 | class LlamaReport: 16 | """Client for managing reports and general report operations.""" 17 | 18 | def __init__( 19 | self, 20 | api_key: Optional[str] = None, 21 | project_id: Optional[str] = None, 22 | organization_id: Optional[str] = None, 23 | base_url: Optional[str] = None, 24 | timeout: Optional[int] = None, 25 | async_httpx_client: 
Optional[httpx.AsyncClient] = None, 26 | ): 27 | self.api_key = api_key or os.getenv("LLAMA_CLOUD_API_KEY", None) 28 | if not self.api_key: 29 | raise ValueError("No API key provided.") 30 | 31 | self.base_url = base_url or os.getenv( 32 | "LLAMA_CLOUD_BASE_URL", "https://api.cloud.llamaindex.ai" 33 | ) 34 | self.timeout = timeout or 60 35 | 36 | # Initialize HTTP clients 37 | self._aclient = async_httpx_client or httpx.AsyncClient(timeout=self.timeout) 38 | 39 | # Set auth headers 40 | self.headers = { 41 | "Authorization": f"Bearer {self.api_key}", 42 | } 43 | 44 | self.organization_id = organization_id 45 | self.project_id = project_id 46 | self._client_params = { 47 | "timeout": self._aclient.timeout, 48 | "headers": self._aclient.headers, 49 | "base_url": self._aclient.base_url, 50 | "auth": self._aclient.auth, 51 | "event_hooks": self._aclient.event_hooks, 52 | "cookies": self._aclient.cookies, 53 | "max_redirects": self._aclient.max_redirects, 54 | "params": self._aclient.params, 55 | "trust_env": self._aclient.trust_env, 56 | } 57 | self._thread_pool = ThreadPoolExecutor( 58 | max_workers=min(10, (os.cpu_count() or 1) + 4) 59 | ) 60 | 61 | @property 62 | def aclient(self) -> httpx.AsyncClient: 63 | if self._aclient is None: 64 | self._aclient = httpx.AsyncClient(**self._client_params) 65 | return self._aclient 66 | 67 | def _run_sync(self, coro: Coroutine[Any, Any, T]) -> T: 68 | """Run coroutine in a separate thread to avoid event loop issues""" 69 | 70 | # force a new client for this thread/event loop 71 | original_client = self._aclient 72 | self._aclient = None 73 | 74 | def run_coro() -> T: 75 | async def wrapped_coro() -> T: 76 | return await coro 77 | 78 | return asyncio.run(wrapped_coro()) 79 | 80 | result = self._thread_pool.submit(run_coro).result() 81 | 82 | # restore the original client 83 | self._aclient = original_client 84 | 85 | return result 86 | 87 | async def _get_default_project(self) -> str: 88 | response = await self.aclient.get( 89 | 
urljoin(str(self.base_url), "/api/v1/projects"), headers=self.headers 90 | ) 91 | response.raise_for_status() 92 | projects = response.json() 93 | default_project = [p for p in projects if p.get("is_default")] 94 | return default_project[0]["id"] 95 | 96 | async def _build_url( 97 | self, endpoint: str, extra_params: Optional[List[str]] = None 98 | ) -> str: 99 | """Helper method to build URLs with common query parameters.""" 100 | url = urljoin(str(self.base_url), endpoint) 101 | 102 | if not self.project_id: 103 | self.project_id = await self._get_default_project() 104 | 105 | query_params = [] 106 | if self.organization_id: 107 | query_params.append(f"organization_id={self.organization_id}") 108 | if self.project_id: 109 | query_params.append(f"project_id={self.project_id}") 110 | if extra_params: 111 | query_params.extend([p for p in extra_params if p is not None]) 112 | 113 | if query_params: 114 | url += "?" + "&".join(query_params) 115 | 116 | return url 117 | 118 | async def acreate_report( 119 | self, 120 | name: str, 121 | template_instructions: Optional[str] = None, 122 | template_text: Optional[str] = None, 123 | template_file: Optional[Union[str, tuple[str, bytes]]] = None, 124 | input_files: Optional[List[Union[str, tuple[str, bytes]]]] = None, 125 | existing_retriever_id: Optional[str] = None, 126 | ) -> ReportClient: 127 | """Create a new report asynchronously.""" 128 | url = await self._build_url("/api/v1/reports/") 129 | open_files: List[io.BufferedReader] = [] 130 | 131 | data = {"name": name} 132 | if template_instructions: 133 | data["template_instructions"] = template_instructions 134 | if template_text: 135 | data["template_text"] = template_text 136 | if existing_retriever_id: 137 | data["existing_retriever_id"] = str(existing_retriever_id) 138 | 139 | files: List[tuple[str, io.BufferedReader | bytes]] = [] 140 | if template_file: 141 | if isinstance(template_file, str): 142 | open_files.append(open(template_file, "rb")) 143 | 
files.append(("template_file", open_files[-1])) 144 | else: 145 | files.append(("template_file", template_file[1])) 146 | 147 | if input_files: 148 | for f in input_files: 149 | if isinstance(f, str): 150 | open_files.append(open(f, "rb")) 151 | files.append(("files", open_files[-1])) 152 | else: 153 | files.append(("files", f[1])) 154 | 155 | response = await self.aclient.post( 156 | url, headers=self.headers, data=data, files=files 157 | ) 158 | try: 159 | response.raise_for_status() 160 | report_id = response.json()["id"] 161 | return ReportClient(report_id, name, self) 162 | except httpx.HTTPStatusError as e: 163 | raise ValueError( 164 | f"Failed to create report: {e.response.text}\nError Code: {e.response.status_code}" 165 | ) 166 | finally: 167 | for open_file in open_files: 168 | open_file.close() 169 | 170 | def create_report( 171 | self, 172 | name: str, 173 | template_instructions: Optional[str] = None, 174 | template_text: Optional[str] = None, 175 | template_file: Optional[Union[str, tuple[str, bytes]]] = None, 176 | input_files: Optional[List[Union[str, tuple[str, bytes]]]] = None, 177 | existing_retriever_id: Optional[str] = None, 178 | ) -> ReportClient: 179 | """Create a new report.""" 180 | return self._run_sync( 181 | self.acreate_report( 182 | name=name, 183 | template_instructions=template_instructions, 184 | template_text=template_text, 185 | template_file=template_file, 186 | input_files=input_files, 187 | existing_retriever_id=existing_retriever_id, 188 | ) 189 | ) 190 | 191 | async def alist_reports( 192 | self, state: Optional[str] = None, limit: int = 100, offset: int = 0 193 | ) -> List[ReportClient]: 194 | """List all reports asynchronously.""" 195 | params = [] 196 | if state: 197 | params.append(f"state={state}") 198 | if limit: 199 | params.append(f"limit={limit}") 200 | if offset: 201 | params.append(f"offset={offset}") 202 | 203 | url = await self._build_url( 204 | "/api/v1/reports/list", 205 | extra_params=params, 206 | ) 207 | 
208 | response = await self.aclient.get(url, headers=self.headers) 209 | response.raise_for_status() 210 | data = response.json() 211 | 212 | return [ 213 | ReportClient(r["report_id"], r["name"], self) 214 | for r in data["report_responses"] 215 | ] 216 | 217 | def list_reports( 218 | self, state: Optional[str] = None, limit: int = 100, offset: int = 0 219 | ) -> List[ReportClient]: 220 | """Synchronous wrapper for listing reports.""" 221 | return self._run_sync(self.alist_reports(state, limit, offset)) 222 | 223 | async def aget_report(self, report_id: str) -> ReportClient: 224 | """Get a Report instance for working with a specific report.""" 225 | url = await self._build_url(f"/api/v1/reports/{report_id}") 226 | 227 | response = await self.aclient.get(url, headers=self.headers) 228 | response.raise_for_status() 229 | data = response.json() 230 | 231 | return ReportClient(data["report_id"], data["name"], self) 232 | 233 | def get_report(self, report_id: str) -> ReportClient: 234 | """Synchronous wrapper for getting a report.""" 235 | return self._run_sync(self.aget_report(report_id)) 236 | 237 | async def aget_report_metadata(self, report_id: str) -> ReportMetadata: 238 | """Get metadata for a specific report asynchronously. 
239 | 240 | Returns: 241 | dict containing: 242 | - id: Report ID 243 | - name: Report name 244 | - state: Current report state 245 | - report_metadata: Additional metadata 246 | - template_file: Name of template file if used 247 | - template_instructions: Template instructions if provided 248 | - input_files: List of input file names 249 | """ 250 | url = await self._build_url(f"/api/v1/reports/{report_id}/metadata") 251 | 252 | response = await self.aclient.get(url, headers=self.headers) 253 | response.raise_for_status() 254 | return ReportMetadata(**response.json()) 255 | 256 | def get_report_metadata(self, report_id: str) -> ReportMetadata: 257 | """Synchronous wrapper for getting report metadata.""" 258 | return self._run_sync(self.aget_report_metadata(report_id)) 259 | 260 | async def adelete_report(self, report_id: str) -> None: 261 | """Delete a specific report asynchronously.""" 262 | url = await self._build_url(f"/api/v1/reports/{report_id}") 263 | 264 | response = await self.aclient.delete(url, headers=self.headers) 265 | response.raise_for_status() 266 | 267 | def delete_report(self, report_id: str) -> None: 268 | """Synchronous wrapper for deleting a report.""" 269 | return self._run_sync(self.adelete_report(report_id)) 270 | -------------------------------------------------------------------------------- /llama_parse/README.md: -------------------------------------------------------------------------------- 1 | # LlamaParse 2 | 3 | [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-parse)](https://pypi.org/project/llama-parse/) 4 | [![GitHub contributors](https://img.shields.io/github/contributors/run-llama/llama_parse)](https://github.com/run-llama/llama_parse/graphs/contributors) 5 | [![Discord](https://img.shields.io/discord/1059199217496772688)](https://discord.gg/dGcwcsnxhU) 6 | 7 | LlamaParse is a **GenAI-native document parser** that can parse complex document data for any downstream LLM use case (RAG, agents). 
8 | 9 | It is really good at the following: 10 | 11 | - ✅ **Broad file type support**: Parsing a variety of unstructured file types (.pdf, .pptx, .docx, .xlsx, .html) with text, tables, visual elements, weird layouts, and more. 12 | - ✅ **Table recognition**: Parsing embedded tables accurately into text and semi-structured representations. 13 | - ✅ **Multimodal parsing and chunking**: Extracting visual elements (images/diagrams) into structured formats and return image chunks using the latest multimodal models. 14 | - ✅ **Custom parsing**: Input custom prompt instructions to customize the output the way you want it. 15 | 16 | LlamaParse directly integrates with [LlamaIndex](https://github.com/run-llama/llama_index). 17 | 18 | The free plan is up to 1000 pages a day. Paid plan is free 7k pages per week + 0.3c per additional page by default. There is a sandbox available to test the API [**https://cloud.llamaindex.ai/parse ↗**](https://cloud.llamaindex.ai/parse). 19 | 20 | Read below for some quickstart information, or see the [full documentation](https://docs.cloud.llamaindex.ai/). 21 | 22 | If you're a company interested in enterprise RAG solutions, and/or high volume/on-prem usage of LlamaParse, come [talk to us](https://www.llamaindex.ai/contact). 23 | 24 | ## Getting Started 25 | 26 | First, login and get an api-key from [**https://cloud.llamaindex.ai/api-key ↗**](https://cloud.llamaindex.ai/api-key). 27 | 28 | Then, make sure you have the latest LlamaIndex version installed. 29 | 30 | **NOTE:** If you are upgrading from v0.9.X, we recommend following our [migration guide](https://pretty-sodium-5e0.notion.site/v0-10-0-Migration-Guide-6ede431dcb8841b09ea171e7f133bd77), as well as uninstalling your previous version first. 
31 | 32 | ``` 33 | pip uninstall llama-index # run this if upgrading from v0.9.x or older 34 | pip install -U llama-index --upgrade --no-cache-dir --force-reinstall 35 | ``` 36 | 37 | Lastly, install the package: 38 | 39 | `pip install llama-parse` 40 | 41 | Now you can parse your first PDF file using the command line interface. Use the command `llama-parse [file_paths]`. See the help text with `llama-parse --help`. 42 | 43 | ```bash 44 | export LLAMA_CLOUD_API_KEY='llx-...' 45 | 46 | # output as text 47 | llama-parse my_file.pdf --result-type text --output-file output.txt 48 | 49 | # output as markdown 50 | llama-parse my_file.pdf --result-type markdown --output-file output.md 51 | 52 | # output as raw json 53 | llama-parse my_file.pdf --output-raw-json --output-file output.json 54 | ``` 55 | 56 | You can also create simple scripts: 57 | 58 | ```python 59 | import nest_asyncio 60 | 61 | nest_asyncio.apply() 62 | 63 | from llama_parse import LlamaParse 64 | 65 | parser = LlamaParse( 66 | api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY 67 | result_type="markdown", # "markdown" and "text" are available 68 | num_workers=4, # if multiple files passed, split in `num_workers` API calls 69 | verbose=True, 70 | language="en", # Optionally you can define a language, default=en 71 | ) 72 | 73 | # sync 74 | documents = parser.load_data("./my_file.pdf") 75 | 76 | # sync batch 77 | documents = parser.load_data(["./my_file1.pdf", "./my_file2.pdf"]) 78 | 79 | # async 80 | documents = await parser.aload_data("./my_file.pdf") 81 | 82 | # async batch 83 | documents = await parser.aload_data(["./my_file1.pdf", "./my_file2.pdf"]) 84 | ``` 85 | 86 | ## Using with file object 87 | 88 | You can parse a file object directly: 89 | 90 | ```python 91 | import nest_asyncio 92 | 93 | nest_asyncio.apply() 94 | 95 | from llama_parse import LlamaParse 96 | 97 | parser = LlamaParse( 98 | api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY 99 | 
result_type="markdown", # "markdown" and "text" are available 100 | num_workers=4, # if multiple files passed, split in `num_workers` API calls 101 | verbose=True, 102 | language="en", # Optionally you can define a language, default=en 103 | ) 104 | 105 | file_name = "my_file1.pdf" 106 | extra_info = {"file_name": file_name} 107 | 108 | with open(f"./{file_name}", "rb") as f: 109 | # must provide extra_info with file_name key with passing file object 110 | documents = parser.load_data(f, extra_info=extra_info) 111 | 112 | # you can also pass file bytes directly 113 | with open(f"./{file_name}", "rb") as f: 114 | file_bytes = f.read() 115 | # must provide extra_info with file_name key with passing file bytes 116 | documents = parser.load_data(file_bytes, extra_info=extra_info) 117 | ``` 118 | 119 | ## Using with `SimpleDirectoryReader` 120 | 121 | You can also integrate the parser as the default PDF loader in `SimpleDirectoryReader`: 122 | 123 | ```python 124 | import nest_asyncio 125 | 126 | nest_asyncio.apply() 127 | 128 | from llama_parse import LlamaParse 129 | from llama_index.core import SimpleDirectoryReader 130 | 131 | parser = LlamaParse( 132 | api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY 133 | result_type="markdown", # "markdown" and "text" are available 134 | verbose=True, 135 | ) 136 | 137 | file_extractor = {".pdf": parser} 138 | documents = SimpleDirectoryReader( 139 | "./data", file_extractor=file_extractor 140 | ).load_data() 141 | ``` 142 | 143 | Full documentation for `SimpleDirectoryReader` can be found on the [LlamaIndex Documentation](https://docs.llamaindex.ai/en/stable/module_guides/loading/simpledirectoryreader.html). 
144 | 145 | ## Examples 146 | 147 | Several end-to-end indexing examples can be found in the examples folder 148 | 149 | - [Getting Started](/examples/parse/demo_basic.ipynb) 150 | - [Advanced RAG Example](/examples/parse/demo_advanced.ipynb) 151 | - [Raw API Usage](/examples/parse/demo_api.ipynb) 152 | 153 | ## Documentation 154 | 155 | [https://docs.cloud.llamaindex.ai/](https://docs.cloud.llamaindex.ai/) 156 | 157 | ## Terms of Service 158 | 159 | See the [Terms of Service Here](./TOS.pdf). 160 | 161 | ## Get in Touch (LlamaCloud) 162 | 163 | LlamaParse is part of LlamaCloud, our e2e enterprise RAG platform that provides out-of-the-box, production-ready connectors, indexing, and retrieval over your complex data sources. We offer SaaS and VPC options. 164 | 165 | LlamaCloud is currently available via waitlist (join by [creating an account](https://cloud.llamaindex.ai/)). If you're interested in state-of-the-art quality and in centralizing your RAG efforts, come [get in touch with us](https://www.llamaindex.ai/contact). 
166 | -------------------------------------------------------------------------------- /llama_parse/llama_parse/__init__.py: -------------------------------------------------------------------------------- 1 | from llama_cloud_services.parse import ( 2 | LlamaParse, 3 | ResultType, 4 | ParsingMode, 5 | FailedPageMode, 6 | ) 7 | 8 | __all__ = ["LlamaParse", "ResultType", "ParsingMode", "FailedPageMode"] 9 | -------------------------------------------------------------------------------- /llama_parse/llama_parse/base.py: -------------------------------------------------------------------------------- 1 | from llama_cloud_services.parse.base import ( 2 | LlamaParse, 3 | ResultType, 4 | ParsingMode, 5 | FailedPageMode, 6 | FileInput, 7 | _DEFAULT_SEPARATOR, 8 | JOB_RESULT_URL, 9 | JOB_STATUS_ROUTE, 10 | JOB_UPLOAD_ROUTE, 11 | ) 12 | 13 | __all__ = [ 14 | "LlamaParse", 15 | "ResultType", 16 | "FileInput", 17 | "ParsingMode", 18 | "FailedPageMode", 19 | "_DEFAULT_SEPARATOR", 20 | "JOB_RESULT_URL", 21 | "JOB_STATUS_ROUTE", 22 | "JOB_UPLOAD_ROUTE", 23 | ] 24 | -------------------------------------------------------------------------------- /llama_parse/llama_parse/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/llama_parse/llama_parse/cli/__init__.py -------------------------------------------------------------------------------- /llama_parse/llama_parse/cli/main.py: -------------------------------------------------------------------------------- 1 | from llama_cloud_services.parse.cli.main import parse 2 | 3 | if __name__ == "__main__": 4 | parse() 5 | -------------------------------------------------------------------------------- /llama_parse/llama_parse/utils.py: -------------------------------------------------------------------------------- 1 | from llama_cloud_services.parse.utils import ( 2 | 
SUPPORTED_FILE_TYPES, 3 | Language, 4 | ResultType, 5 | ParsingMode, 6 | FailedPageMode, 7 | ) 8 | 9 | __all__ = [ 10 | "SUPPORTED_FILE_TYPES", 11 | "Language", 12 | "ResultType", 13 | "ParsingMode", 14 | "FailedPageMode", 15 | ] 16 | -------------------------------------------------------------------------------- /llama_parse/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["poetry-core"] 3 | build-backend = "poetry.core.masonry.api" 4 | 5 | [tool.poetry] 6 | name = "llama-parse" 7 | version = "0.6.25" 8 | description = "Parse files into RAG-Optimized formats." 9 | authors = ["Logan Markewich "] 10 | license = "MIT" 11 | readme = "README.md" 12 | packages = [{include = "llama_parse"}] 13 | 14 | [tool.poetry.dependencies] 15 | python = ">=3.9,<4.0" 16 | llama-cloud-services = ">=0.6.24" 17 | 18 | [tool.poetry.group.dev.dependencies] 19 | pytest = "^8.0.0" 20 | pytest-asyncio = "*" 21 | ipykernel = "^6.29.0" 22 | 23 | [tool.poetry.scripts] 24 | llama-parse = "llama_parse.cli.main:parse" 25 | -------------------------------------------------------------------------------- /parse.md: -------------------------------------------------------------------------------- 1 | # LlamaParse 2 | 3 | LlamaParse is a **GenAI-native document parser** that can parse complex document data for any downstream LLM use case (RAG, agents). 4 | 5 | It is really good at the following: 6 | 7 | - ✅ **Broad file type support**: Parsing a variety of unstructured file types (.pdf, .pptx, .docx, .xlsx, .html) with text, tables, visual elements, weird layouts, and more. 8 | - ✅ **Table recognition**: Parsing embedded tables accurately into text and semi-structured representations. 9 | - ✅ **Multimodal parsing and chunking**: Extracting visual elements (images/diagrams) into structured formats and return image chunks using the latest multimodal models. 
10 | - ✅ **Custom parsing**: Input custom prompt instructions to customize the output the way you want it. 11 | 12 | LlamaParse directly integrates with [LlamaIndex](https://github.com/run-llama/llama_index). 13 | 14 | The free plan is up to 1000 pages a day. Paid plan is free 7k pages per week + 0.3c per additional page by default. There is a sandbox available to test the API [**https://cloud.llamaindex.ai/parse ↗**](https://cloud.llamaindex.ai/parse). 15 | 16 | Read below for some quickstart information, or see the [full documentation](https://docs.cloud.llamaindex.ai/). 17 | 18 | If you're a company interested in enterprise RAG solutions, and/or high volume/on-prem usage of LlamaParse, come [talk to us](https://www.llamaindex.ai/contact). 19 | 20 | ## Getting Started 21 | 22 | First, login and get an api-key from [**https://cloud.llamaindex.ai/api-key ↗**](https://cloud.llamaindex.ai/api-key). 23 | 24 | Then, install the package: 25 | 26 | `pip install llama-cloud-services` 27 | 28 | ## CLI Usage 29 | 30 | Now you can parse your first PDF file using the command line interface. Use the command `llama-parse [file_paths]`. See the help text with `llama-parse --help`. 31 | 32 | ```bash 33 | export LLAMA_CLOUD_API_KEY='llx-...' 
34 | 35 | # output as text 36 | llama-parse my_file.pdf --result-type text --output-file output.txt 37 | 38 | # output as markdown 39 | llama-parse my_file.pdf --result-type markdown --output-file output.md 40 | 41 | # output as raw json 42 | llama-parse my_file.pdf --output-raw-json --output-file output.json 43 | ``` 44 | 45 | ## Python Usage 46 | 47 | You can also create simple scripts: 48 | 49 | ```python 50 | from llama_cloud_services import LlamaParse 51 | 52 | parser = LlamaParse( 53 | api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY 54 | num_workers=4, # if multiple files passed, split in `num_workers` API calls 55 | verbose=True, 56 | language="en", # Optionally you can define a language, default=en 57 | ) 58 | 59 | # sync 60 | result = parser.parse("./my_file.pdf") 61 | 62 | # sync batch 63 | results = parser.parse(["./my_file1.pdf", "./my_file2.pdf"]) 64 | 65 | # async 66 | result = await parser.aparse("./my_file.pdf") 67 | 68 | # async batch 69 | results = await parser.aparse(["./my_file1.pdf", "./my_file2.pdf"]) 70 | ``` 71 | 72 | The result object is a fully typed `JobResult` object, and you can interact with it to parse and transform various parts of the result: 73 | 74 | ```python 75 | # get the llama-index markdown documents 76 | markdown_documents = result.get_markdown_documents(split_by_page=True) 77 | 78 | # get the llama-index text documents 79 | text_documents = result.get_text_documents(split_by_page=False) 80 | 81 | # get the image documents 82 | image_documents = result.get_image_documents( 83 | include_screenshot_images=True, 84 | include_object_images=False, 85 | # Optional: download the images to a directory 86 | # (default is to return the image bytes in ImageDocument objects) 87 | image_download_dir="./images", 88 | ) 89 | 90 | # access the raw job result 91 | # Items will vary based on the parser configuration 92 | for page in result.pages: 93 | print(page.text) 94 | print(page.md) 95 | print(page.images) 96 | 
print(page.layout) 97 | print(page.structuredData) 98 | ``` 99 | 100 | See more details about the result object in the [example notebook](./examples/parse/demo_json_tour.ipynb). 101 | 102 | ### Using with file object / bytes 103 | 104 | You can parse a file object directly: 105 | 106 | ```python 107 | from llama_cloud_services import LlamaParse 108 | 109 | parser = LlamaParse( 110 | api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY 111 | num_workers=4, # if multiple files passed, split in `num_workers` API calls 112 | verbose=True, 113 | language="en", # Optionally you can define a language, default=en 114 | ) 115 | 116 | file_name = "my_file1.pdf" 117 | extra_info = {"file_name": file_name} 118 | 119 | with open(f"./{file_name}", "rb") as f: 120 | # must provide extra_info with file_name key with passing file object 121 | result = parser.parse(f, extra_info=extra_info) 122 | 123 | # you can also pass file bytes directly 124 | with open(f"./{file_name}", "rb") as f: 125 | file_bytes = f.read() 126 | # must provide extra_info with file_name key with passing file bytes 127 | result = parser.parse(file_bytes, extra_info=extra_info) 128 | ``` 129 | 130 | ### Using with `SimpleDirectoryReader` 131 | 132 | You can also integrate the parser as the default PDF loader in `SimpleDirectoryReader`: 133 | 134 | ```python 135 | from llama_cloud_services import LlamaParse 136 | from llama_index.core import SimpleDirectoryReader 137 | 138 | parser = LlamaParse( 139 | api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY 140 | result_type="markdown", # "markdown" and "text" are available 141 | verbose=True, 142 | ) 143 | 144 | file_extractor = {".pdf": parser} 145 | documents = SimpleDirectoryReader( 146 | "./data", file_extractor=file_extractor 147 | ).load_data() 148 | ``` 149 | 150 | Full documentation for `SimpleDirectoryReader` can be found on the [LlamaIndex 
Documentation](https://docs.llamaindex.ai/en/stable/module_guides/loading/simpledirectoryreader.html). 151 | 152 | ## Examples 153 | 154 | Several end-to-end indexing examples can be found in the examples folder 155 | 156 | - [Getting Started](examples/parse/demo_basic.ipynb) 157 | - [Advanced RAG Example](examples/parse/demo_advanced.ipynb) 158 | - [Raw API Usage](examples/parse/demo_api.ipynb) 159 | - [Result Object Tour](examples/parse/demo_json_tour.ipynb) 160 | 161 | ## Documentation 162 | 163 | [https://docs.cloud.llamaindex.ai/](https://docs.cloud.llamaindex.ai/) 164 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["poetry-core"] 3 | build-backend = "poetry.core.masonry.api" 4 | 5 | [tool.mypy] 6 | files = ["llama_cloud_services"] 7 | python_version = "3.10" 8 | 9 | [tool.poetry] 10 | name = "llama-cloud-services" 11 | version = "0.6.25" 12 | description = "Tailored SDK clients for LlamaCloud services." 
13 | authors = ["Logan Markewich "] 14 | license = "MIT" 15 | readme = "README.md" 16 | packages = [{include = "llama_cloud_services"}] 17 | 18 | [tool.poetry.dependencies] 19 | python = ">=3.9,<4.0" 20 | llama-index-core = ">=0.12.0" 21 | llama-cloud = "==0.1.23" 22 | pydantic = ">=2.8,!=2.10" 23 | click = "^8.1.7" 24 | python-dotenv = "^1.0.1" 25 | eval-type-backport = {python = "<3.10", version = "^0.2.0"} 26 | platformdirs = "^4.3.7" 27 | 28 | [tool.poetry.group.dev.dependencies] 29 | pytest = "^8.0.0" 30 | pytest-asyncio = "*" 31 | ipykernel = "^6.29.0" 32 | pre-commit = "3.2.0" 33 | autoevals = "^0.0.114" 34 | deepdiff = "^8.1.1" 35 | ipython = "^8.12.3" 36 | jupyter = "^1.1.1" 37 | mypy = "^1.14.1" 38 | 39 | [tool.poetry.scripts] 40 | llama-parse = "llama_cloud_services.parse.cli.main:parse" 41 | -------------------------------------------------------------------------------- /report.md: -------------------------------------------------------------------------------- 1 | # LlamaReport (beta/invite-only) 2 | 3 | LlamaReport is a prebuilt agentic report builder that can be used to build reports from a variety of data sources. 4 | 5 | The python SDK for interacting with the LlamaReport API. The SDK provides two main classes: 6 | 7 | - `LlamaReport`: For managing reports (create, list, delete) 8 | - `ReportClient`: For working with a specific report (editing, approving, etc.) 
9 | 10 | ## Quickstart 11 | 12 | ```bash 13 | pip install llama-cloud-services 14 | ``` 15 | 16 | ```python 17 | from llama_cloud_services import LlamaReport 18 | 19 | # Initialize the client 20 | client = LlamaReport( 21 | api_key="your-api-key", 22 | # Optional: Specify project_id, organization_id, async_httpx_client 23 | ) 24 | 25 | # Create a new report 26 | report = client.create_report( 27 | "My Report", 28 | # must have one of template_text or template_instructions 29 | template_text="Your template text", 30 | template_instructions="Instructions for the template", 31 | # must have one of input_files or existing_retriever_id 32 | input_files=["data1.pdf", "data2.pdf"], 33 | existing_retriever_id="retriever-id", 34 | ) 35 | ``` 36 | 37 | ## Working with Reports 38 | 39 | The typical workflow for a report involves: 40 | 41 | 1. Creating the report 42 | 2. Waiting for and approving the plan 43 | 3. Waiting for report generation 44 | 4. Making edits to the report 45 | 46 | Here's a complete example: 47 | 48 | ```python 49 | # Create a report 50 | report = client.create_report( 51 | "Quarterly Analysis", input_files=["q1_data.pdf", "q2_data.pdf"] 52 | ) 53 | 54 | # Wait for the plan to be ready 55 | plan = report.wait_for_plan() 56 | 57 | # Option 1: Directly approve the plan 58 | report.update_plan(action="approve") 59 | 60 | # Option 2: Suggest and review edits to the plan 61 | suggestions = report.suggest_edits( 62 | "Can you add a section about market trends?" 63 | ) 64 | for suggestion in suggestions: 65 | print(suggestion) 66 | 67 | # Accept or reject the suggestion 68 | if input("Accept? (y/n): ").lower() == "y": 69 | report.accept_edit(suggestion) 70 | else: 71 | report.reject_edit(suggestion) 72 | 73 | # Wait for the report to complete 74 | report = report.wait_for_completion() 75 | 76 | # Make edits to the final report 77 | suggestions = report.suggest_edits("Make the executive summary more concise") 78 | 79 | # Review and accept/reject suggestions as above 80 | ... 
81 | ``` 82 | 83 | ### Getting the Final Report 84 | 85 | Once you are satisfied with the report, you can get the final report object and use the content as you see fit. 86 | 87 | Here's an example of printing out the final report: 88 | 89 | ```python 90 | report = report.get() 91 | report_text = "\n\n".join([block.template for block in report.blocks]) 92 | 93 | print(report_text) 94 | ``` 95 | 96 | ## Additional Features 97 | 98 | - **Async Support**: All methods have async counterparts: `create_report` -> `acreate_report`, `wait_for_plan` -> `await_for_plan`, etc. 99 | - **Automatic Chat History**: The SDK automatically keeps track of chat history for each suggestion, unless you specify `auto_history=False` in `suggest_edits`. 100 | - **Custom HTTP Client**: You can provide your own `httpx.AsyncClient` to the `LlamaReport` class. 101 | - **Project and Organization IDs**: You can specify `project_id` and `organization_id` to use a specific project or organization. 102 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/__init__.py -------------------------------------------------------------------------------- /tests/extract/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/extract/__init__.py -------------------------------------------------------------------------------- /tests/extract/data/receipt/noisebridge_receipt.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/extract/data/receipt/noisebridge_receipt.pdf 
-------------------------------------------------------------------------------- /tests/extract/data/receipt/noisebridge_receipt.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "receiptNumber": "27215058", 3 | "invoiceNumber": "87B37C90152", 4 | "datePaid": "2024-07-19", 5 | "paymentMethod": { 6 | "type": "visa", 7 | "lastFourDigits": "7267" 8 | }, 9 | "merchant": { 10 | "name": "Noisebridge", 11 | "address": { 12 | "street": "272 Capp St", 13 | "city": "San Francisco", 14 | "state": "California", 15 | "postalCode": "94110", 16 | "country": "United States" 17 | }, 18 | "phone": "1 6507017829", 19 | "email": "treasurer+stripe@noisebridge.net" 20 | }, 21 | "billTo": "noisebridge@seldo.com", 22 | "items": [ 23 | { 24 | "description": "$10 / month", 25 | "quantity": 1, 26 | "unitPrice": 10.0, 27 | "amount": 10.0, 28 | "period": { 29 | "start": "2024-07-19", 30 | "end": "2024-08-19" 31 | } 32 | } 33 | ], 34 | "subtotal": 10.0, 35 | "total": 10.0, 36 | "amountPaid": 10.0 37 | } 38 | -------------------------------------------------------------------------------- /tests/extract/data/receipt/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "type": "object", 4 | "required": ["receiptNumber", "datePaid", "total", "items"], 5 | "properties": { 6 | "receiptNumber": { 7 | "type": "string" 8 | }, 9 | "invoiceNumber": { 10 | "type": "string" 11 | }, 12 | "datePaid": { 13 | "type": "string", 14 | "format": "date" 15 | }, 16 | "paymentMethod": { 17 | "type": "object", 18 | "properties": { 19 | "type": { 20 | "type": "string", 21 | "enum": ["visa", "mastercard", "amex", "cash", "other"] 22 | }, 23 | "lastFourDigits": { 24 | "type": "string", 25 | "pattern": "^[0-9]{4}$" 26 | } 27 | } 28 | }, 29 | "merchant": { 30 | "type": "object", 31 | "properties": { 32 | "name": { 33 | "type": "string" 34 | }, 35 | "address": { 36 
| "type": "object", 37 | "properties": { 38 | "street": { 39 | "type": "string" 40 | }, 41 | "city": { 42 | "type": "string" 43 | }, 44 | "state": { 45 | "type": "string" 46 | }, 47 | "postalCode": { 48 | "type": "string" 49 | }, 50 | "country": { 51 | "type": "string" 52 | } 53 | } 54 | }, 55 | "phone": { 56 | "type": "string" 57 | }, 58 | "email": { 59 | "type": "string", 60 | "format": "email" 61 | } 62 | } 63 | }, 64 | "billTo": { 65 | "type": "string", 66 | "format": "email" 67 | }, 68 | "items": { 69 | "type": "array", 70 | "items": { 71 | "type": "object", 72 | "required": [ 73 | "description", 74 | "quantity", 75 | "unitPrice", 76 | "amount", 77 | "period" 78 | ], 79 | "properties": { 80 | "description": { 81 | "type": "string" 82 | }, 83 | "quantity": { 84 | "type": "integer", 85 | "minimum": 1 86 | }, 87 | "unitPrice": { 88 | "type": "number", 89 | "minimum": 0 90 | }, 91 | "amount": { 92 | "type": "number", 93 | "minimum": 0 94 | }, 95 | "period": { 96 | "type": "object", 97 | "properties": { 98 | "start": { 99 | "type": "string", 100 | "format": "date" 101 | }, 102 | "end": { 103 | "type": "string", 104 | "format": "date" 105 | } 106 | } 107 | } 108 | } 109 | } 110 | }, 111 | "subtotal": { 112 | "type": "number", 113 | "minimum": 0 114 | }, 115 | "total": { 116 | "type": "number", 117 | "minimum": 0 118 | }, 119 | "amountPaid": { 120 | "type": "number", 121 | "minimum": 0 122 | } 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /tests/extract/data/resume/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "title": "Resume Schema", 4 | "type": "object", 5 | "required": ["basics", "skills", "experience"], 6 | "properties": { 7 | "basics": { 8 | "type": "object", 9 | "required": ["name", "email"], 10 | "properties": { 11 | "name": { 12 | "type": "string" 13 | }, 14 | "email": { 15 | "type": 
"string", 16 | "format": "email" 17 | }, 18 | "phone": { 19 | "type": "string" 20 | }, 21 | "location": { 22 | "type": "object", 23 | "properties": { 24 | "city": { 25 | "type": "string" 26 | }, 27 | "region": { 28 | "type": "string" 29 | }, 30 | "country": { 31 | "type": "string" 32 | } 33 | } 34 | }, 35 | "profiles": { 36 | "type": "array", 37 | "items": { 38 | "type": "object", 39 | "properties": { 40 | "network": { 41 | "type": "string" 42 | }, 43 | "url": { 44 | "type": "string", 45 | "format": "uri" 46 | } 47 | } 48 | } 49 | }, 50 | "summary": { 51 | "type": "string" 52 | } 53 | } 54 | }, 55 | "skills": { 56 | "type": "array", 57 | "items": { 58 | "type": "object", 59 | "properties": { 60 | "category": { 61 | "type": "string" 62 | }, 63 | "keywords": { 64 | "type": "array", 65 | "items": { 66 | "type": "string" 67 | } 68 | }, 69 | "level": { 70 | "type": "string", 71 | "enum": ["beginner", "intermediate", "advanced", "expert"] 72 | } 73 | } 74 | } 75 | }, 76 | "experience": { 77 | "type": "array", 78 | "items": { 79 | "type": "object", 80 | "required": ["company", "position", "startDate"], 81 | "properties": { 82 | "company": { 83 | "type": "string" 84 | }, 85 | "position": { 86 | "type": "string" 87 | }, 88 | "startDate": { 89 | "type": "string", 90 | "format": "date" 91 | }, 92 | "endDate": { 93 | "type": "string", 94 | "format": "date" 95 | }, 96 | "highlights": { 97 | "type": "array", 98 | "items": { 99 | "type": "string" 100 | } 101 | }, 102 | "technologies": { 103 | "type": "array", 104 | "items": { 105 | "type": "string" 106 | } 107 | } 108 | } 109 | } 110 | }, 111 | "education": { 112 | "type": "array", 113 | "items": { 114 | "type": "object", 115 | "required": ["institution", "degree"], 116 | "properties": { 117 | "institution": { 118 | "type": "string" 119 | }, 120 | "degree": { 121 | "type": "string" 122 | }, 123 | "field": { 124 | "type": "string" 125 | }, 126 | "graduationDate": { 127 | "type": "string", 128 | "format": "date" 129 | }, 130 | 
"gpa": { 131 | "type": "number" 132 | } 133 | } 134 | } 135 | }, 136 | "certifications": { 137 | "type": "array", 138 | "items": { 139 | "type": "object", 140 | "properties": { 141 | "name": { 142 | "type": "string" 143 | }, 144 | "issuer": { 145 | "type": "string" 146 | }, 147 | "date": { 148 | "type": "string", 149 | "format": "date" 150 | }, 151 | "validUntil": { 152 | "type": "string", 153 | "format": "date" 154 | } 155 | } 156 | } 157 | }, 158 | "publications": { 159 | "type": "array", 160 | "items": { 161 | "type": "object", 162 | "properties": { 163 | "title": { 164 | "type": "string" 165 | }, 166 | "publisher": { 167 | "type": "string" 168 | }, 169 | "date": { 170 | "type": "string", 171 | "format": "date" 172 | }, 173 | "url": { 174 | "type": "string", 175 | "format": "uri" 176 | } 177 | } 178 | } 179 | } 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /tests/extract/data/resume/software_architect_resume.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 116 | 117 | 118 | 119 |
120 | 171 | 172 |
173 |

Sarah Chen

174 |
Senior Software Architect
175 | 176 |
177 |

Professional Summary

178 |

179 | Innovative Software Architect with over 12 years of experience 180 | designing and implementing large-scale distributed systems. Proven 181 | track record of leading technical teams and delivering robust 182 | enterprise solutions. Expert in cloud architecture, microservices, 183 | and emerging technologies with a focus on scalable, maintainable 184 | systems. 185 |

186 |
187 | 188 |
189 |

Professional Experience

190 | 191 |
192 |
TechCorp Solutions
193 |
Senior Software Architect
194 |
2020 - Present
195 |
    196 |
  • 197 | Led architectural design and implementation of a cloud-native 198 | platform serving 2M+ users 199 |
  • 200 |
  • 201 | Established architectural guidelines and best practices adopted 202 | across 12 development teams 203 |
  • 204 |
  • 205 | Reduced system latency by 40% through implementation of 206 | event-driven architecture 207 |
  • 208 |
  • 209 | Mentored 15+ senior developers in cloud-native development 210 | practices 211 |
  • 212 |
213 |
214 | 215 |
216 |
DataFlow Systems
217 |
Lead Software Engineer
218 |
2016 - 2020
219 |
    220 |
  • 221 | Architected and led development of distributed data processing 222 | platform handling 5TB daily 223 |
  • 224 |
  • 225 | Designed microservices architecture reducing deployment time by 226 | 65% 227 |
  • 228 |
  • 229 | Led migration of legacy monolith to cloud-native architecture 230 |
  • 231 |
  • 232 | Managed team of 8 engineers across 3 international locations 233 |
  • 234 |
235 |
236 | 237 |
238 |
InnovateTech
239 |
Senior Software Engineer
240 |
2013 - 2016
241 |
    242 |
  • 243 | Developed high-performance trading platform processing 100K 244 | transactions per second 245 |
  • 246 |
  • 247 | Implemented real-time analytics engine reducing processing 248 | latency by 75% 249 |
  • 250 |
  • 251 | Led adoption of container orchestration reducing deployment 252 | costs by 35% 253 |
  • 254 |
255 |
256 |
257 | 258 |
259 |

Education

260 | 261 |
262 |
Stanford University
263 |
Master of Science in Computer Science
264 |
2013
265 |

Focus: Distributed Systems and Machine Learning

266 |
267 | 268 |
269 |
University of California, Berkeley
270 |
271 | Bachelor of Science in Computer Engineering 272 |
273 |
2011
274 |

Magna Cum Laude

275 |
276 |
277 | 278 |
279 |

Patents & Speaking

280 |
    281 |
  • 282 | Co-inventor on three patents for distributed systems architecture 283 |
  • 284 |
  • 285 | Published paper on "Scalable Microservices Architecture" at IEEE 286 | Cloud Computing Conference 2022 287 |
  • 288 |
  • 289 | Keynote Speaker, CloudCon 2023: "Future of Cloud-Native 290 | Architecture" 291 |
  • 292 |
  • Regular presenter at local tech meetups and conferences
  • 293 |
294 |
295 |
296 |
297 | 298 | 299 | -------------------------------------------------------------------------------- /tests/extract/data/resume/software_architect_resume.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "basics": { 3 | "name": "Sarah Chen", 4 | "email": "san.francisco@email.com", 5 | "phone": "(555) 123-4567", 6 | "location": { 7 | "city": "San Francisco", 8 | "region": "CA", 9 | "country": "USA" 10 | } 11 | }, 12 | "skills": [ 13 | { 14 | "category": "Architecture & Design", 15 | "keywords": [ 16 | "Microservices", 17 | "Event-Driven Architecture", 18 | "Domain-Driven Design", 19 | "REST APIs" 20 | ] 21 | }, 22 | { 23 | "category": "Cloud Platforms", 24 | "keywords": ["AWS", "Azure", "Google Cloud Platform"] 25 | }, 26 | { 27 | "category": "Programming Languages", 28 | "keywords": ["Java", "Python", "Go", "JavaScript", "TypeScript"] 29 | } 30 | ], 31 | "experience": [ 32 | { 33 | "company": "TechCorp Solutions", 34 | "position": "Senior Software Architect", 35 | "startDate": "2020-01-01", 36 | "endDate": "2024-01-10" 37 | }, 38 | { 39 | "company": "DataFlow Systems", 40 | "position": "Lead Software Engineer", 41 | "startDate": "2016-01-01", 42 | "endDate": "2019-12-31", 43 | "technologies": [ 44 | "Distributed Systems", 45 | "Microservices", 46 | "Cloud Migration" 47 | ] 48 | }, 49 | { 50 | "company": "InnovateTech", 51 | "position": "Senior Software Engineer", 52 | "startDate": "2013-01-01", 53 | "endDate": "2015-12-31", 54 | "technologies": [ 55 | "High-performance Computing", 56 | "Real-time Analytics", 57 | "Container Orchestration" 58 | ] 59 | } 60 | ], 61 | "education": [ 62 | { 63 | "institution": "Stanford University", 64 | "degree": "Master of Science", 65 | "field": "Computer Science", 66 | "graduationDate": "2013-01-01", 67 | "specialization": "Distributed Systems and Machine Learning" 68 | }, 69 | { 70 | "institution": "University of California, Berkeley", 71 | "degree": "Bachelor of Science", 72 | 
"field": "Computer Engineering", 73 | "graduationDate": "2011-01-01" 74 | } 75 | ], 76 | "certifications": [ 77 | { 78 | "name": "AWS Solutions Architect - Professional" 79 | }, 80 | { 81 | "name": "Google Cloud Architect" 82 | }, 83 | { 84 | "name": "Certified Kubernetes Administrator" 85 | } 86 | ], 87 | "publications": [ 88 | { 89 | "title": "Scalable Microservices Architecture", 90 | "publisher": "IEEE Cloud Computing Conference", 91 | "date": "2022-01-01" 92 | } 93 | ] 94 | } 95 | -------------------------------------------------------------------------------- /tests/extract/data/slide/saas_slide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/extract/data/slide/saas_slide.pdf -------------------------------------------------------------------------------- /tests/extract/data/slide/saas_slide.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "companyInfo": { 3 | "name": "CloudFlow Analytics", 4 | "fundingStage": "Series A", 5 | "foundedYear": null, 6 | "industry": null, 7 | "location": null 8 | }, 9 | "financialMetrics": { 10 | "mrr": { 11 | "value": 580000, 12 | "currency": "USD", 13 | "growthRate": 27 14 | }, 15 | "grossMargin": 88 16 | }, 17 | "growthMetrics": { 18 | "customers": { 19 | "total": 1247, 20 | "growth": 142, 21 | "enterprisePercent": null 22 | }, 23 | "nrr": 147 24 | }, 25 | "marketMetrics": { 26 | "tam": 50000000000, 27 | "sam": null, 28 | "marketShare": null, 29 | "competitors": null 30 | }, 31 | "differentiators": [ 32 | { 33 | "claim": "Processing Speed", 34 | "metric": "5x faster", 35 | "comparisonTarget": "competitors" 36 | }, 37 | { 38 | "claim": "ML Accuracy", 39 | "metric": "99.9%", 40 | "comparisonTarget": null 41 | }, 42 | { 43 | "claim": "Market Potential", 44 | "metric": "80%", 45 | "comparisonTarget": "Fortune 500" 46 | } 47 | ] 48 
| } 49 | -------------------------------------------------------------------------------- /tests/extract/data/slide/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "type": "object", 4 | "required": ["companyInfo", "financialMetrics", "growthMetrics"], 5 | "properties": { 6 | "companyInfo": { 7 | "type": "object", 8 | "required": ["name", "fundingStage"], 9 | "properties": { 10 | "name": { 11 | "type": "string" 12 | }, 13 | "fundingStage": { 14 | "type": "string", 15 | "enum": ["Pre-seed", "Seed", "Series A", "Series B", "Series C+"] 16 | }, 17 | "foundedYear": { 18 | "anyOf": [ 19 | { 20 | "type": "integer" 21 | }, 22 | { 23 | "type": "null" 24 | } 25 | ] 26 | }, 27 | "industry": { 28 | "anyOf": [ 29 | { 30 | "type": "string" 31 | }, 32 | { 33 | "type": "null" 34 | } 35 | ] 36 | }, 37 | "location": { 38 | "anyOf": [ 39 | { 40 | "type": "string" 41 | }, 42 | { 43 | "type": "null" 44 | } 45 | ] 46 | } 47 | } 48 | }, 49 | "financialMetrics": { 50 | "type": "object", 51 | "required": ["mrr", "growthRate"], 52 | "properties": { 53 | "mrr": { 54 | "type": "object", 55 | "description": "Monthly Recurring Revenue", 56 | "required": ["value", "currency", "growthRate"], 57 | "properties": { 58 | "value": { 59 | "type": "number" 60 | }, 61 | "currency": { 62 | "type": "string" 63 | }, 64 | "growthRate": { 65 | "type": "number" 66 | } 67 | } 68 | }, 69 | "grossMargin": { 70 | "type": "number" 71 | } 72 | } 73 | }, 74 | "growthMetrics": { 75 | "type": "object", 76 | "required": ["customers", "nrr"], 77 | "properties": { 78 | "customers": { 79 | "type": "object", 80 | "required": ["total", "growth"], 81 | "properties": { 82 | "total": { 83 | "type": "integer" 84 | }, 85 | "growth": { 86 | "type": "number" 87 | } 88 | } 89 | }, 90 | "nrr": { 91 | "description": "Net Revenue Retention", 92 | "type": "number" 93 | } 94 | } 95 | }, 96 | "differentiators": { 97 | "type": 
"array", 98 | "items": { 99 | "type": "object", 100 | "required": ["claim", "metric"], 101 | "properties": { 102 | "claim": { 103 | "type": "string" 104 | }, 105 | "metric": { 106 | "type": "string" 107 | }, 108 | "comparisonTarget": { 109 | "anyOf": [ 110 | { 111 | "type": "string" 112 | }, 113 | { 114 | "type": "null" 115 | } 116 | ] 117 | } 118 | } 119 | } 120 | } 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /tests/extract/test_benchmark.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from llama_cloud_services.extract import LlamaExtract, ExtractionAgent 5 | from time import perf_counter 6 | from collections import namedtuple 7 | import json 8 | import uuid 9 | from llama_cloud.types import ( 10 | ExtractConfig, 11 | ExtractMode, 12 | LlamaParseParameters, 13 | LlamaExtractSettings, 14 | ) 15 | from tests.extract.util import load_test_dotenv 16 | 17 | 18 | load_test_dotenv() 19 | 20 | TEST_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") 21 | # Get configuration from environment 22 | LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY") 23 | LLAMA_CLOUD_BASE_URL = os.getenv("LLAMA_CLOUD_BASE_URL") 24 | LLAMA_CLOUD_PROJECT_ID = os.getenv("LLAMA_CLOUD_PROJECT_ID") 25 | 26 | TestCase = namedtuple( 27 | "TestCase", ["name", "schema_path", "config", "input_file", "expected_output"] 28 | ) 29 | 30 | 31 | def get_test_cases(): 32 | """Get all test cases from TEST_DIR. 
@pytest.fixture
def extraction_agent(test_case: TestCase, extractor: LlamaExtract):
    """Create an extraction agent for a single test case and delete it afterwards.

    Args:
        test_case: The TestCase whose schema and config drive agent creation.
        extractor: Session-scoped LlamaExtract client.

    Yields:
        The freshly created extraction agent.
    """
    # Create unique name with random UUID (important for CI to avoid conflicts)
    unique_id = uuid.uuid4().hex[:8]
    agent_name = f"{test_case.name}_{unique_id}"

    with open(test_case.schema_path, "r") as f:
        schema = json.load(f)

    # Clean up any existing agents with this name (best-effort; a failure
    # here should not abort the test)
    try:
        agents = extractor.list_agents()
        for agent in agents:
            if agent.name == agent_name:
                extractor.delete_agent(agent.id)
    except Exception as e:
        print(f"Warning: Failed to cleanup existing agent: {str(e)}")

    # Create new agent
    agent = extractor.create_agent(agent_name, schema, config=test_case.config)
    yield agent

    # Cleanup after test. Mirrors the identical fixture in
    # tests/extract/test_extract_e2e.py; the previous version had no
    # teardown and leaked the agent on the server after every benchmark run.
    try:
        extractor.delete_agent(agent.id)
    except Exception as e:
        print(f"Warning: Failed to delete agent {agent.id}: {str(e)}")
os.getenv("LLAMA_CLOUD_API_KEY") 13 | LLAMA_CLOUD_BASE_URL = os.getenv("LLAMA_CLOUD_BASE_URL") 14 | LLAMA_CLOUD_PROJECT_ID = os.getenv("LLAMA_CLOUD_PROJECT_ID") 15 | 16 | # Skip all tests if API key is not set 17 | pytestmark = pytest.mark.skipif( 18 | not LLAMA_CLOUD_API_KEY, reason="LLAMA_CLOUD_API_KEY not set" 19 | ) 20 | 21 | 22 | # Test data 23 | class TestSchema(BaseModel): 24 | title: str 25 | summary: str 26 | 27 | 28 | # Test data paths 29 | TEST_DIR = Path(__file__).parent / "data" 30 | TEST_PDF = TEST_DIR / "slide" / "saas_slide.pdf" 31 | 32 | 33 | @pytest.fixture 34 | def llama_extract(): 35 | return LlamaExtract( 36 | api_key=LLAMA_CLOUD_API_KEY, 37 | base_url=LLAMA_CLOUD_BASE_URL, 38 | project_id=LLAMA_CLOUD_PROJECT_ID, 39 | verbose=True, 40 | ) 41 | 42 | 43 | @pytest.fixture 44 | def test_agent_name(): 45 | return "test-api-agent" 46 | 47 | 48 | @pytest.fixture 49 | def test_schema_dict(): 50 | return { 51 | "type": "object", 52 | "properties": { 53 | "title": {"type": "string"}, 54 | "summary": {"type": "string"}, 55 | }, 56 | } 57 | 58 | 59 | @pytest.fixture 60 | def test_agent(llama_extract, test_agent_name, test_schema_dict, request): 61 | """Creates a test agent and cleans it up after the test""" 62 | test_id = request.node.nodeid 63 | test_hash = hex(hash(test_id))[-8:] 64 | base_name = test_agent_name 65 | 66 | base_name = next( 67 | (marker.args[0] for marker in request.node.iter_markers("agent_name")), 68 | base_name, 69 | ) 70 | name = f"{base_name}_{test_hash}" 71 | 72 | schema = next( 73 | ( 74 | marker.args[0][0] if isinstance(marker.args[0], tuple) else marker.args[0] 75 | for marker in request.node.iter_markers("agent_schema") 76 | ), 77 | test_schema_dict, 78 | ) 79 | 80 | # Cleanup existing agent 81 | try: 82 | for agent in llama_extract.list_agents(): 83 | if agent.name == name: 84 | llama_extract.delete_agent(agent.id) 85 | except Exception as e: 86 | print(f"Warning: Failed to cleanup existing agent: {e}") 87 | 88 | agent = 
llama_extract.create_agent(name=name, data_schema=schema) 89 | yield agent 90 | 91 | # Cleanup after test 92 | try: 93 | llama_extract.delete_agent(agent.id) 94 | except Exception as e: 95 | print(f"Warning: Failed to delete agent {agent.id}: {e}") 96 | 97 | 98 | class TestLlamaExtract: 99 | def test_init_without_api_key(self): 100 | env_backup = os.getenv("LLAMA_CLOUD_API_KEY") 101 | del os.environ["LLAMA_CLOUD_API_KEY"] 102 | with pytest.raises(ValueError, match="The API key is required"): 103 | LlamaExtract(api_key=None, base_url=LLAMA_CLOUD_BASE_URL) 104 | os.environ["LLAMA_CLOUD_API_KEY"] = env_backup 105 | 106 | @pytest.mark.agent_name("test-dict-schema-agent") 107 | def test_create_agent_with_dict_schema(self, test_agent): 108 | assert isinstance(test_agent, ExtractionAgent) 109 | 110 | @pytest.mark.agent_name("test-pydantic-schema-agent") 111 | @pytest.mark.agent_schema((TestSchema,)) 112 | def test_create_agent_with_pydantic_schema(self, test_agent): 113 | assert isinstance(test_agent, ExtractionAgent) 114 | 115 | def test_get_agent_by_name(self, llama_extract, test_agent): 116 | agent = llama_extract.get_agent(name=test_agent.name) 117 | assert isinstance(agent, ExtractionAgent) 118 | assert agent.name == test_agent.name 119 | assert agent.id == test_agent.id 120 | assert agent.data_schema == test_agent.data_schema 121 | 122 | def test_get_agent_by_id(self, llama_extract, test_agent): 123 | agent = llama_extract.get_agent(id=test_agent.id) 124 | assert isinstance(agent, ExtractionAgent) 125 | assert agent.id == test_agent.id 126 | assert agent.name == test_agent.name 127 | assert agent.data_schema == test_agent.data_schema 128 | 129 | def test_list_agents(self, llama_extract, test_agent): 130 | agents = llama_extract.list_agents() 131 | assert isinstance(agents, list) 132 | assert any(a.id == test_agent.id for a in agents) 133 | 134 | 135 | class TestExtractionAgent: 136 | @pytest.mark.asyncio 137 | async def test_extract_single_file(self, test_agent): 
138 | result = await test_agent.aextract(TEST_PDF) 139 | assert result.status == "SUCCESS" 140 | assert result.data is not None 141 | assert isinstance(result.data, dict) 142 | assert "title" in result.data 143 | assert "summary" in result.data 144 | 145 | def test_sync_extract_single_file(self, test_agent): 146 | result = test_agent.extract(TEST_PDF) 147 | assert result.status == "SUCCESS" 148 | assert result.data is not None 149 | assert isinstance(result.data, dict) 150 | assert "title" in result.data 151 | assert "summary" in result.data 152 | 153 | def test_extract_file_from_buffered_io(self, test_agent): 154 | result = test_agent.extract(SourceText(file=open(TEST_PDF, "rb"))) 155 | assert result.status == "SUCCESS" 156 | assert result.data is not None 157 | assert isinstance(result.data, dict) 158 | assert "title" in result.data 159 | assert "summary" in result.data 160 | 161 | def test_extract_file_from_bytes(self, test_agent): 162 | with open(TEST_PDF, "rb") as f: 163 | file_bytes = f.read() 164 | result = test_agent.extract(SourceText(file=file_bytes, filename=TEST_PDF.name)) 165 | assert result.status == "SUCCESS" 166 | assert result.data is not None 167 | assert isinstance(result.data, dict) 168 | assert "title" in result.data 169 | assert "summary" in result.data 170 | 171 | def test_extract_from_text_content(self, test_agent): 172 | TEST_TEXT = """ 173 | # Llamas 174 | Llamas are social animals and live with others as a herd. Their wool is soft and 175 | contains only a small amount of lanolin.[2] Llamas can learn simple tasks after a 176 | few repetitions. When using a pack, they can carry about 25 to 30% of their body 177 | weight for 8 to 13 km (5–8 miles).[3] The name llama (also historically spelled 178 | "glama") was adopted by European settlers from native Peruvians. 
179 | """ 180 | result = test_agent.extract(SourceText(text_content=TEST_TEXT)) 181 | assert result.status == "SUCCESS" 182 | assert result.data is not None 183 | assert isinstance(result.data, dict) 184 | assert "title" in result.data 185 | assert "summary" in result.data 186 | 187 | @pytest.mark.asyncio 188 | async def test_extract_multiple_files(self, test_agent): 189 | files = [TEST_PDF, TEST_PDF] # Using same file twice for testing 190 | response = await test_agent.aextract(files) 191 | 192 | assert len(response) == 2 193 | for result in response: 194 | assert result.status == "SUCCESS" 195 | assert result.data is not None 196 | assert isinstance(result.data, dict) 197 | assert "title" in result.data 198 | assert "summary" in result.data 199 | 200 | def test_save_agent_updates( 201 | self, test_agent: ExtractionAgent, llama_extract: LlamaExtract 202 | ): 203 | new_schema = { 204 | "type": "object", 205 | "properties": { 206 | "new_field": {"type": "string"}, 207 | "title": {"type": "string"}, 208 | "summary": {"type": "string"}, 209 | }, 210 | } 211 | test_agent.data_schema = new_schema 212 | test_agent.save() 213 | 214 | # Verify the update by getting a fresh instance 215 | updated_agent = llama_extract.get_agent(name=test_agent.name) 216 | assert "new_field" in updated_agent.data_schema["properties"] 217 | 218 | def test_list_extraction_runs(self, test_agent: ExtractionAgent): 219 | assert test_agent.list_extraction_runs().total == 0 220 | test_agent.extract(TEST_PDF) 221 | runs = test_agent.list_extraction_runs() 222 | assert runs.total > 0 223 | 224 | def test_delete_extraction_run(self, test_agent: ExtractionAgent): 225 | assert test_agent.list_extraction_runs().total == 0 226 | run = test_agent.extract(TEST_PDF) 227 | test_agent.delete_extraction_run(run.id) 228 | runs = test_agent.list_extraction_runs() 229 | assert runs.total == 0 230 | -------------------------------------------------------------------------------- /tests/extract/test_extract_e2e.py: 
def get_test_cases():
    """Collect extraction test cases from TEST_DIR.

    A data-type directory contributes cases when it contains a
    ``schema.json`` plus at least one input file that has a matching
    ``<name>.test.json`` expected-output file. Every qualifying input is
    paired with both the FAST and BALANCED extraction configs.

    Returns:
        List[TestCase]: List of test cases
    """
    cases = []

    for entry in os.listdir(TEST_DIR):
        type_dir = os.path.join(TEST_DIR, entry)
        if not os.path.isdir(type_dir):
            continue

        schema_file = os.path.join(type_dir, "schema.json")
        if not os.path.exists(schema_file):
            continue

        # Inputs are every regular file except the schema and the
        # expected-output fixtures.
        candidates = []
        for file_name in os.listdir(type_dir):
            candidate = os.path.join(type_dir, file_name)
            is_input = (
                os.path.isfile(candidate)
                and file_name != "schema.json"
                and not file_name.endswith(".test.json")
            )
            if is_input:
                candidates.append(candidate)

        modes = [
            ExtractConfig(extraction_mode=ExtractMode.FAST),
            ExtractConfig(extraction_mode=ExtractMode.BALANCED),
        ]

        for candidate in sorted(candidates):
            stem = os.path.splitext(os.path.basename(candidate))[0]
            expected = os.path.join(type_dir, f"{stem}.test.json")
            if not os.path.exists(expected):
                continue

            label = f"{entry}/{os.path.basename(candidate)}"
            cases.extend(
                TestCase(
                    name=label,
                    schema_path=schema_file,
                    input_file=candidate,
                    config=mode,
                    expected_output=expected,
                )
                for mode in modes
            )

    return cases
@pytest.mark.parametrize("test_case", get_test_cases(), ids=lambda x: x.name) 132 | def test_extraction(test_case: TestCase, extraction_agent: ExtractionAgent) -> None: 133 | result = extraction_agent.extract(test_case.input_file).data 134 | with open(test_case.expected_output, "r") as f: 135 | expected = json.load(f) 136 | # TODO: fix the saas_slide test 137 | assert json_subset_match_score(expected, result) > 0.3, DeepDiff( 138 | expected, result, ignore_order=True 139 | ) 140 | -------------------------------------------------------------------------------- /tests/extract/util.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from autoevals.string import Levenshtein 4 | from autoevals.number import NumericDiff 5 | from dotenv import load_dotenv 6 | from pathlib import Path 7 | 8 | 9 | def load_test_dotenv(): 10 | load_dotenv(Path(__file__).parent.parent.parent / ".env.dev", override=True) 11 | 12 | 13 | def json_subset_match_score(expected: Any, actual: Any) -> float: 14 | """ 15 | Adapted from autoevals.JsonDiff to only test on the subset of keys within the expected json. 
16 | """ 17 | string_scorer = Levenshtein() 18 | number_scorer = NumericDiff() 19 | if isinstance(expected, dict) and isinstance(actual, dict): 20 | if len(expected) == 0 and len(actual) == 0: 21 | return 1 22 | keys = set(expected.keys()) 23 | scores = [json_subset_match_score(expected.get(k), actual.get(k)) for k in keys] 24 | scores = [s for s in scores if s is not None] 25 | return sum(scores) / len(scores) 26 | elif isinstance(expected, list) and isinstance(actual, list): 27 | if len(expected) == 0 and len(actual) == 0: 28 | return 1 29 | scores = [json_subset_match_score(e1, e2) for (e1, e2) in zip(expected, actual)] 30 | scores = [s for s in scores if s is not None] 31 | return sum(scores) / max(len(expected), len(actual)) 32 | elif isinstance(expected, str) and isinstance(actual, str): 33 | return string_scorer.eval(expected, actual).score 34 | elif (isinstance(expected, int) or isinstance(expected, float)) and ( 35 | isinstance(actual, int) or isinstance(actual, float) 36 | ): 37 | return number_scorer.eval(expected, actual).score 38 | elif expected is None and actual is None: 39 | return 1 40 | elif expected is None or actual is None: 41 | return 0 42 | else: 43 | return 0 44 | -------------------------------------------------------------------------------- /tests/parse/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/parse/__init__.py -------------------------------------------------------------------------------- /tests/parse/test_llama_parse.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import shutil 4 | from fsspec.implementations.local import LocalFileSystem 5 | from httpx import AsyncClient 6 | 7 | from llama_cloud_services.parse import LlamaParse 8 | 9 | 10 | @pytest.mark.skipif( 11 | 
os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 12 | reason="LLAMA_CLOUD_API_KEY not set", 13 | ) 14 | def test_simple_page_text() -> None: 15 | parser = LlamaParse(result_type="text") 16 | 17 | filepath = "tests/test_files/attention_is_all_you_need.pdf" 18 | result = parser.load_data(filepath) 19 | assert len(result) == 1 20 | assert len(result[0].text) > 0 21 | 22 | 23 | @pytest.fixture 24 | def markdown_parser() -> LlamaParse: 25 | if os.environ.get("LLAMA_CLOUD_API_KEY", "") == "": 26 | pytest.skip("LLAMA_CLOUD_API_KEY not set") 27 | return LlamaParse(result_type="markdown", ignore_errors=False) 28 | 29 | 30 | def test_simple_page_markdown(markdown_parser: LlamaParse) -> None: 31 | filepath = "tests/test_files/attention_is_all_you_need.pdf" 32 | result = markdown_parser.load_data(filepath) 33 | assert len(result) == 1 34 | assert len(result[0].text) > 0 35 | 36 | 37 | def test_simple_page_markdown_bytes(markdown_parser: LlamaParse) -> None: 38 | markdown_parser = LlamaParse(result_type="markdown", ignore_errors=False) 39 | 40 | filepath = "tests/test_files/attention_is_all_you_need.pdf" 41 | with open(filepath, "rb") as f: 42 | file_bytes = f.read() 43 | # client must provide extra_info with file_name 44 | with pytest.raises(ValueError): 45 | result = markdown_parser.load_data(file_bytes) 46 | result = markdown_parser.load_data( 47 | file_bytes, extra_info={"file_name": "attention_is_all_you_need.pdf"} 48 | ) 49 | assert len(result) == 1 50 | assert len(result[0].text) > 0 51 | 52 | 53 | def test_simple_page_markdown_buffer(markdown_parser: LlamaParse) -> None: 54 | markdown_parser = LlamaParse(result_type="markdown", ignore_errors=False) 55 | 56 | filepath = "tests/test_files/attention_is_all_you_need.pdf" 57 | with open(filepath, "rb") as f: 58 | # client must provide extra_info with file_name 59 | with pytest.raises(ValueError): 60 | result = markdown_parser.load_data(f) 61 | result = markdown_parser.load_data( 62 | f, extra_info={"file_name": 
"attention_is_all_you_need.pdf"} 63 | ) 64 | assert len(result) == 1 65 | assert len(result[0].text) > 0 66 | 67 | 68 | @pytest.mark.skipif( 69 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 70 | reason="LLAMA_CLOUD_API_KEY not set", 71 | ) 72 | @pytest.mark.asyncio 73 | async def test_simple_page_with_custom_fs() -> None: 74 | parser = LlamaParse(result_type="markdown") 75 | fs = LocalFileSystem() 76 | filepath = "tests/test_files/attention_is_all_you_need.pdf" 77 | result = await parser.aload_data(filepath, fs=fs) 78 | assert len(result) == 1 79 | 80 | 81 | @pytest.mark.skipif( 82 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 83 | reason="LLAMA_CLOUD_API_KEY not set", 84 | ) 85 | @pytest.mark.asyncio 86 | async def test_simple_page_progress_workers() -> None: 87 | parser = LlamaParse(result_type="markdown", show_progress=True, verbose=True) 88 | 89 | filepath = "tests/test_files/attention_is_all_you_need.pdf" 90 | result = await parser.aload_data([filepath, filepath]) 91 | assert len(result) == 2 92 | assert len(result[0].text) > 0 93 | 94 | parser = LlamaParse( 95 | result_type="markdown", show_progress=True, num_workers=2, verbose=True 96 | ) 97 | 98 | filepath = "tests/test_files/attention_is_all_you_need.pdf" 99 | result = await parser.aload_data([filepath, filepath]) 100 | assert len(result) == 2 101 | assert len(result[0].text) > 0 102 | 103 | 104 | @pytest.mark.skipif( 105 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 106 | reason="LLAMA_CLOUD_API_KEY not set", 107 | ) 108 | @pytest.mark.asyncio 109 | async def test_custom_client() -> None: 110 | custom_client = AsyncClient(verify=False, timeout=10) 111 | parser = LlamaParse(result_type="markdown", custom_client=custom_client) 112 | filepath = "tests/test_files/attention_is_all_you_need.pdf" 113 | result = await parser.aload_data(filepath) 114 | assert len(result) == 1 115 | assert len(result[0].text) > 0 116 | 117 | 118 | @pytest.mark.skipif( 119 | os.environ.get("LLAMA_CLOUD_API_KEY", "") 
== "", 120 | reason="LLAMA_CLOUD_API_KEY not set", 121 | ) 122 | @pytest.mark.asyncio 123 | async def test_input_url() -> None: 124 | parser = LlamaParse(result_type="markdown") 125 | 126 | # links to a resume example 127 | input_url = "https://cdn-blog.novoresume.com/articles/google-docs-resume-templates/basic-google-docs-resume.png" 128 | result = await parser.aload_data(input_url) 129 | assert len(result) == 1 130 | assert "your name" in result[0].text.lower() 131 | 132 | 133 | @pytest.mark.skipif( 134 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 135 | reason="LLAMA_CLOUD_API_KEY not set", 136 | ) 137 | @pytest.mark.asyncio 138 | async def test_input_url_with_website_input() -> None: 139 | parser = LlamaParse(result_type="markdown") 140 | input_url = "https://www.example.com" 141 | result = await parser.aload_data(input_url) 142 | assert len(result) == 1 143 | assert "example" in result[0].text.lower() 144 | 145 | 146 | @pytest.mark.skipif( 147 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 148 | reason="LLAMA_CLOUD_API_KEY not set", 149 | ) 150 | @pytest.mark.asyncio 151 | async def test_mixing_input_types() -> None: 152 | parser = LlamaParse(result_type="markdown") 153 | filepath = "tests/test_files/attention_is_all_you_need.pdf" 154 | input_url = "https://cdn-blog.novoresume.com/articles/google-docs-resume-templates/basic-google-docs-resume.png" 155 | result = await parser.aload_data([filepath, input_url]) 156 | 157 | assert len(result) == 2 158 | 159 | 160 | @pytest.mark.skipif( 161 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 162 | reason="LLAMA_CLOUD_API_KEY not set", 163 | ) 164 | @pytest.mark.asyncio 165 | async def test_download_images() -> None: 166 | parser = LlamaParse(result_type="markdown", take_screenshot=True) 167 | filepath = "tests/test_files/attention_is_all_you_need.pdf" 168 | json_result = await parser.aget_json([filepath]) 169 | 170 | assert len(json_result) == 1 171 | assert len(json_result[0]["pages"][0]["images"]) > 0 172 
| 173 | download_path = "tests/test_files/images" 174 | shutil.rmtree(download_path, ignore_errors=True) 175 | 176 | await parser.aget_images(json_result, download_path) 177 | assert len(os.listdir(download_path)) == len(json_result[0]["pages"][0]["images"]) 178 | -------------------------------------------------------------------------------- /tests/parse/test_llama_parse_result.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import os 3 | import pytest 4 | from llama_cloud_services import LlamaParse 5 | from llama_cloud_services.parse.types import JobResult 6 | 7 | 8 | @pytest.fixture 9 | def file_path() -> str: 10 | return "tests/test_files/attention_is_all_you_need.pdf" 11 | 12 | 13 | @pytest.fixture 14 | def chart_file_path() -> str: 15 | return "tests/test_files/attention_is_all_you_need_chart.pdf" 16 | 17 | 18 | @pytest.mark.asyncio 19 | @pytest.mark.skipif( 20 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 21 | reason="LLAMA_CLOUD_API_KEY not set", 22 | ) 23 | async def test_basic_parse_result(file_path: str): 24 | parser = LlamaParse( 25 | take_screenshot=True, 26 | auto_mode=True, 27 | fast_mode=False, 28 | ) 29 | result = await parser.aparse(file_path) 30 | 31 | assert isinstance(result, JobResult) 32 | assert result.job_id is not None 33 | assert result.file_name == file_path 34 | assert len(result.pages) > 0 35 | 36 | assert result.pages[0].text is not None 37 | assert len(result.pages[0].text) > 0 38 | 39 | assert result.pages[0].md is not None 40 | assert len(result.pages[0].md) > 0 41 | 42 | assert result.pages[0].md != result.pages[0].text 43 | 44 | assert len(result.pages[0].images) > 0 45 | assert result.pages[0].images[0].name is not None 46 | 47 | with tempfile.TemporaryDirectory() as temp_dir: 48 | file_names = await result.asave_all_images(temp_dir) 49 | assert len(file_names) > 0 50 | for file_name in file_names: 51 | assert os.path.exists(file_name) 52 | assert 
os.path.getsize(file_name) > 0 53 | 54 | assert result.job_metadata is not None 55 | 56 | text_documents = result.get_text_documents( 57 | split_by_page=True, 58 | ) 59 | assert len(text_documents) > 0 60 | assert text_documents[0].text is not None 61 | assert len(text_documents[0].text) > 0 62 | 63 | markdown_documents = result.get_markdown_documents( 64 | split_by_page=True, 65 | ) 66 | assert len(markdown_documents) > 0 67 | assert markdown_documents[0].text is not None 68 | assert len(markdown_documents[0].text) > 0 69 | 70 | image_documents = await result.aget_image_documents( 71 | include_screenshot_images=True, 72 | include_object_images=False, 73 | ) 74 | assert len(image_documents) > 0 75 | assert image_documents[0].image is not None 76 | assert len(image_documents[0].resolve_image().getvalue()) > 0 77 | 78 | 79 | @pytest.mark.asyncio 80 | @pytest.mark.skip( 81 | reason="TODO: I don't actually know how to trigger links in the output." 82 | ) 83 | async def test_link_parse_result(file_path: str): 84 | parser = LlamaParse( 85 | annotate_links=True, 86 | ) 87 | result = await parser.aparse(file_path) 88 | 89 | assert isinstance(result, JobResult) 90 | assert len(result.pages) > 0 91 | assert len(result.pages[0].links) > 0 92 | 93 | 94 | @pytest.mark.asyncio 95 | @pytest.mark.skipif( 96 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 97 | reason="LLAMA_CLOUD_API_KEY not set", 98 | ) 99 | async def test_parse_structured_output(file_path: str): 100 | parser = LlamaParse( 101 | structured_output=True, 102 | structured_output_json_schema_name="imFeelingLucky", 103 | ) 104 | result = await parser.aparse(file_path) 105 | assert isinstance(result, JobResult) 106 | assert len(result.pages) > 0 107 | assert len(result.pages[0].structuredData) > 0 108 | 109 | 110 | @pytest.mark.asyncio 111 | @pytest.mark.skipif( 112 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 113 | reason="LLAMA_CLOUD_API_KEY not set", 114 | ) 115 | async def test_parse_charts(chart_file_path: 
str): 116 | parser = LlamaParse( 117 | extract_charts=True, 118 | ) 119 | result = await parser.aparse(chart_file_path) 120 | assert isinstance(result, JobResult) 121 | assert len(result.pages) > 0 122 | assert len(result.pages[0].charts) > 0 123 | 124 | 125 | @pytest.mark.asyncio 126 | @pytest.mark.skipif( 127 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 128 | reason="LLAMA_CLOUD_API_KEY not set", 129 | ) 130 | async def test_parse_layout(file_path: str): 131 | parser = LlamaParse( 132 | extract_layout=True, 133 | ) 134 | result = await parser.aparse(file_path) 135 | 136 | assert isinstance(result, JobResult) 137 | assert len(result.pages) > 0 138 | assert len(result.pages[0].layout) > 0 139 | 140 | 141 | @pytest.mark.skipif( 142 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 143 | reason="LLAMA_CLOUD_API_KEY not set", 144 | ) 145 | def test_parse_multiple_files(file_path: str, chart_file_path: str): 146 | parser = LlamaParse() 147 | result = parser.parse([file_path, chart_file_path]) 148 | 149 | assert isinstance(result, list) 150 | assert len(result) == 2 151 | assert isinstance(result[0], JobResult) 152 | assert isinstance(result[1], JobResult) 153 | assert result[0].file_name == file_path 154 | assert result[1].file_name == chart_file_path 155 | -------------------------------------------------------------------------------- /tests/report/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/report/__init__.py -------------------------------------------------------------------------------- /tests/report/test_llama_report.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import uuid 4 | from typing import AsyncGenerator 5 | from pytest_asyncio import fixture as async_fixture 6 | from llama_cloud_services.report import LlamaReport, 
ReportClient 7 | 8 | # Skip tests if no API key is set 9 | pytestmark = pytest.mark.skipif( 10 | not os.getenv("LLAMA_CLOUD_API_KEY") or os.getenv("CI") == "true", 11 | reason="No API key provided", 12 | ) 13 | 14 | 15 | @async_fixture(scope="function") 16 | async def client() -> AsyncGenerator[LlamaReport, None]: 17 | """Create a LlamaReport client.""" 18 | client = LlamaReport() 19 | reports_before = await client.alist_reports() 20 | reports_before_ids = [r.report_id for r in reports_before] 21 | try: 22 | yield client 23 | finally: 24 | # clean up reports 25 | try: 26 | reports_after = await client.alist_reports() 27 | reports_after_ids = [r.report_id for r in reports_after] 28 | for report_id in reports_before_ids: 29 | if report_id not in reports_after_ids: 30 | await client.adelete_report(report_id) 31 | except Exception: 32 | pass 33 | finally: 34 | await client.aclient.aclose() 35 | 36 | 37 | @pytest.fixture(scope="function") 38 | def unique_name() -> str: 39 | """Generate a unique report name.""" 40 | return f"test-report-{uuid.uuid4()}" 41 | 42 | 43 | @async_fixture(scope="function") 44 | async def report( 45 | client: LlamaReport, unique_name: str 46 | ) -> AsyncGenerator[ReportClient, None]: 47 | """Create a report.""" 48 | report = await client.acreate_report( 49 | name=unique_name, 50 | template_text=( 51 | "# [Some title]\n\n" 52 | " ## TLDR\n" 53 | "A quick summary of the paper.\n\n" 54 | "## Details\n" 55 | "More details about the paper, possible more than one section here.\n" 56 | ), 57 | input_files=["tests/test_files/paper.md"], 58 | ) 59 | try: 60 | yield report 61 | finally: 62 | await report.adelete() 63 | 64 | 65 | @pytest.mark.asyncio 66 | @pytest.mark.xfail( 67 | condition=lambda: os.getenv("CI"), 68 | reason="Backend db issues; needs to be fixed.", 69 | ) 70 | async def test_create_and_delete_report( 71 | client: LlamaReport, report: ReportClient 72 | ) -> None: 73 | """Test basic report creation and deletion.""" 74 | # Verify the report 
exists 75 | metadata = await report.aget_metadata() 76 | assert metadata.name == report.name 77 | 78 | # Test listing reports 79 | reports = await client.alist_reports() 80 | assert any(r.report_id == report.report_id for r in reports) 81 | 82 | # Test getting report by ID 83 | fetched_report = await client.aget_report(report.report_id) 84 | assert fetched_report.report_id == report.report_id 85 | assert fetched_report.name == report.name 86 | 87 | 88 | @pytest.mark.asyncio 89 | @pytest.mark.xfail( 90 | condition=lambda: os.getenv("CI"), 91 | reason="Report plan sometimes times out", 92 | raises=TimeoutError, 93 | ) 94 | async def test_report_plan_workflow(report: ReportClient) -> None: 95 | """Test the report planning workflow.""" 96 | # Wait for the plan 97 | plan = await report.await_for_plan() 98 | assert plan is not None 99 | 100 | # Approve the plan 101 | response = await report.aupdate_plan(action="approve") 102 | assert response is not None 103 | 104 | # Wait for completion 105 | completed_report = await report.await_completion() 106 | assert len(completed_report.blocks) > 0 107 | 108 | # Get edit suggestions 109 | suggestions = await report.asuggest_edits( 110 | "TLDR section header more formal.", auto_history=True 111 | ) 112 | assert len(suggestions) > 0 113 | 114 | # Test accepting an edit 115 | await report.aaccept_edit(suggestions[0]) 116 | 117 | # Get more suggestions and test rejecting 118 | more_suggestions = await report.asuggest_edits( 119 | "Add a section about machine learning.", auto_history=True 120 | ) 121 | assert len(more_suggestions) > 0 122 | await report.areject_edit(more_suggestions[0]) 123 | 124 | # Verify chat history is maintained 125 | assert len(report.chat_history) >= 4 # 2 user messages + 2 assistant responses 126 | 127 | # get events 128 | events = await report.aget_events() 129 | assert len(events) > 0 130 | -------------------------------------------------------------------------------- 
/tests/test_files/attention_is_all_you_need.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/test_files/attention_is_all_you_need.pdf -------------------------------------------------------------------------------- /tests/test_files/attention_is_all_you_need_chart.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/test_files/attention_is_all_you_need_chart.pdf -------------------------------------------------------------------------------- /tests/test_files/images/67b428c6-9edb-4550-83d9-5e35165846ca-page_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/test_files/images/67b428c6-9edb-4550-83d9-5e35165846ca-page_1.jpg -------------------------------------------------------------------------------- /tests/test_files/resume/receipt/noisebridge_receipt.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/test_files/resume/receipt/noisebridge_receipt.pdf -------------------------------------------------------------------------------- /tests/test_files/resume/receipt/noisebridge_receipt.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "receiptNumber": "27215058", 3 | "invoiceNumber": "87B37C90152", 4 | "datePaid": "2024-07-19", 5 | "paymentMethod": { 6 | "type": "visa", 7 | "lastFourDigits": "7267" 8 | }, 9 | "merchant": { 10 | "name": "Noisebridge", 11 | "address": { 12 | "street": "272 Capp St", 13 | "city": "San Francisco", 14 | 
"state": "California", 15 | "postalCode": "94110", 16 | "country": "United States" 17 | }, 18 | "phone": "1 6507017829", 19 | "email": "treasurer+stripe@noisebridge.net" 20 | }, 21 | "billTo": "noisebridge@seldo.com", 22 | "items": [ 23 | { 24 | "description": "$10 / month", 25 | "quantity": 1, 26 | "unitPrice": 10.0, 27 | "amount": 10.0, 28 | "period": { 29 | "start": "2024-07-19", 30 | "end": "2024-08-19" 31 | } 32 | } 33 | ], 34 | "subtotal": 10.0, 35 | "total": 10.0, 36 | "amountPaid": 10.0 37 | } 38 | -------------------------------------------------------------------------------- /tests/test_files/resume/receipt/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "type": "object", 4 | "required": ["receiptNumber", "datePaid", "total", "items"], 5 | "properties": { 6 | "receiptNumber": { 7 | "type": "string" 8 | }, 9 | "invoiceNumber": { 10 | "type": "string" 11 | }, 12 | "datePaid": { 13 | "type": "string", 14 | "format": "date" 15 | }, 16 | "paymentMethod": { 17 | "type": "object", 18 | "properties": { 19 | "type": { 20 | "type": "string", 21 | "enum": ["visa", "mastercard", "amex", "cash", "other"] 22 | }, 23 | "lastFourDigits": { 24 | "type": "string", 25 | "pattern": "^[0-9]{4}$" 26 | } 27 | } 28 | }, 29 | "merchant": { 30 | "type": "object", 31 | "properties": { 32 | "name": { 33 | "type": "string" 34 | }, 35 | "address": { 36 | "type": "object", 37 | "properties": { 38 | "street": { 39 | "type": "string" 40 | }, 41 | "city": { 42 | "type": "string" 43 | }, 44 | "state": { 45 | "type": "string" 46 | }, 47 | "postalCode": { 48 | "type": "string" 49 | }, 50 | "country": { 51 | "type": "string" 52 | } 53 | } 54 | }, 55 | "phone": { 56 | "type": "string" 57 | }, 58 | "email": { 59 | "type": "string", 60 | "format": "email" 61 | } 62 | } 63 | }, 64 | "billTo": { 65 | "type": "string", 66 | "format": "email" 67 | }, 68 | "items": { 69 | "type": 
"array", 70 | "items": { 71 | "type": "object", 72 | "required": [ 73 | "description", 74 | "quantity", 75 | "unitPrice", 76 | "amount", 77 | "period" 78 | ], 79 | "properties": { 80 | "description": { 81 | "type": "string" 82 | }, 83 | "quantity": { 84 | "type": "integer", 85 | "minimum": 1 86 | }, 87 | "unitPrice": { 88 | "type": "number", 89 | "minimum": 0 90 | }, 91 | "amount": { 92 | "type": "number", 93 | "minimum": 0 94 | }, 95 | "period": { 96 | "type": "object", 97 | "properties": { 98 | "start": { 99 | "type": "string", 100 | "format": "date" 101 | }, 102 | "end": { 103 | "type": "string", 104 | "format": "date" 105 | } 106 | } 107 | } 108 | } 109 | } 110 | }, 111 | "subtotal": { 112 | "type": "number", 113 | "minimum": 0 114 | }, 115 | "total": { 116 | "type": "number", 117 | "minimum": 0 118 | }, 119 | "amountPaid": { 120 | "type": "number", 121 | "minimum": 0 122 | } 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /tests/test_files/resume/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "title": "Resume Schema", 4 | "type": "object", 5 | "required": ["basics", "skills", "experience"], 6 | "properties": { 7 | "basics": { 8 | "type": "object", 9 | "required": ["name", "email"], 10 | "properties": { 11 | "name": { 12 | "type": "string" 13 | }, 14 | "email": { 15 | "type": "string", 16 | "format": "email" 17 | }, 18 | "phone": { 19 | "type": "string" 20 | }, 21 | "location": { 22 | "type": "object", 23 | "properties": { 24 | "city": { 25 | "type": "string" 26 | }, 27 | "region": { 28 | "type": "string" 29 | }, 30 | "country": { 31 | "type": "string" 32 | } 33 | } 34 | }, 35 | "profiles": { 36 | "type": "array", 37 | "items": { 38 | "type": "object", 39 | "properties": { 40 | "network": { 41 | "type": "string" 42 | }, 43 | "url": { 44 | "type": "string", 45 | "format": "uri" 46 | } 47 | } 48 
| } 49 | }, 50 | "summary": { 51 | "type": "string" 52 | } 53 | } 54 | }, 55 | "skills": { 56 | "type": "array", 57 | "items": { 58 | "type": "object", 59 | "properties": { 60 | "category": { 61 | "type": "string" 62 | }, 63 | "keywords": { 64 | "type": "array", 65 | "items": { 66 | "type": "string" 67 | } 68 | }, 69 | "level": { 70 | "type": "string", 71 | "enum": ["beginner", "intermediate", "advanced", "expert"] 72 | } 73 | } 74 | } 75 | }, 76 | "experience": { 77 | "type": "array", 78 | "items": { 79 | "type": "object", 80 | "required": ["company", "position", "startDate"], 81 | "properties": { 82 | "company": { 83 | "type": "string" 84 | }, 85 | "position": { 86 | "type": "string" 87 | }, 88 | "startDate": { 89 | "type": "string", 90 | "format": "date" 91 | }, 92 | "endDate": { 93 | "type": "string", 94 | "format": "date" 95 | }, 96 | "highlights": { 97 | "type": "array", 98 | "items": { 99 | "type": "string" 100 | } 101 | }, 102 | "technologies": { 103 | "type": "array", 104 | "items": { 105 | "type": "string" 106 | } 107 | } 108 | } 109 | } 110 | }, 111 | "education": { 112 | "type": "array", 113 | "items": { 114 | "type": "object", 115 | "required": ["institution", "degree"], 116 | "properties": { 117 | "institution": { 118 | "type": "string" 119 | }, 120 | "degree": { 121 | "type": "string" 122 | }, 123 | "field": { 124 | "type": "string" 125 | }, 126 | "graduationDate": { 127 | "type": "string", 128 | "format": "date" 129 | }, 130 | "gpa": { 131 | "type": "number" 132 | } 133 | } 134 | } 135 | }, 136 | "certifications": { 137 | "type": "array", 138 | "items": { 139 | "type": "object", 140 | "properties": { 141 | "name": { 142 | "type": "string" 143 | }, 144 | "issuer": { 145 | "type": "string" 146 | }, 147 | "date": { 148 | "type": "string", 149 | "format": "date" 150 | }, 151 | "validUntil": { 152 | "type": "string", 153 | "format": "date" 154 | } 155 | } 156 | } 157 | }, 158 | "publications": { 159 | "type": "array", 160 | "items": { 161 | "type": 
"object", 162 | "properties": { 163 | "title": { 164 | "type": "string" 165 | }, 166 | "publisher": { 167 | "type": "string" 168 | }, 169 | "date": { 170 | "type": "string", 171 | "format": "date" 172 | }, 173 | "url": { 174 | "type": "string", 175 | "format": "uri" 176 | } 177 | } 178 | } 179 | } 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /tests/test_files/resume/software_architect_resume.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 116 | 117 | 118 | 119 |
120 | 171 | 172 |
173 |

Sarah Chen

174 |
Senior Software Architect
175 | 176 |
177 |

Professional Summary

178 |

179 | Innovative Software Architect with over 12 years of experience 180 | designing and implementing large-scale distributed systems. Proven 181 | track record of leading technical teams and delivering robust 182 | enterprise solutions. Expert in cloud architecture, microservices, 183 | and emerging technologies with a focus on scalable, maintainable 184 | systems. 185 |

186 |
187 | 188 |
189 |

Professional Experience

190 | 191 |
192 |
TechCorp Solutions
193 |
Senior Software Architect
194 |
2020 - Present
195 |
    196 |
  • 197 | Led architectural design and implementation of a cloud-native 198 | platform serving 2M+ users 199 |
  • 200 |
  • 201 | Established architectural guidelines and best practices adopted 202 | across 12 development teams 203 |
  • 204 |
  • 205 | Reduced system latency by 40% through implementation of 206 | event-driven architecture 207 |
  • 208 |
  • 209 | Mentored 15+ senior developers in cloud-native development 210 | practices 211 |
  • 212 |
213 |
214 | 215 |
216 |
DataFlow Systems
217 |
Lead Software Engineer
218 |
2016 - 2020
219 |
    220 |
  • 221 | Architected and led development of distributed data processing 222 | platform handling 5TB daily 223 |
  • 224 |
  • 225 | Designed microservices architecture reducing deployment time by 226 | 65% 227 |
  • 228 |
  • 229 | Led migration of legacy monolith to cloud-native architecture 230 |
  • 231 |
  • 232 | Managed team of 8 engineers across 3 international locations 233 |
  • 234 |
235 |
236 | 237 |
238 |
InnovateTech
239 |
Senior Software Engineer
240 |
2013 - 2016
241 |
    242 |
  • 243 | Developed high-performance trading platform processing 100K 244 | transactions per second 245 |
  • 246 |
  • 247 | Implemented real-time analytics engine reducing processing 248 | latency by 75% 249 |
  • 250 |
  • 251 | Led adoption of container orchestration reducing deployment 252 | costs by 35% 253 |
  • 254 |
255 |
256 |
257 | 258 |
259 |

Education

260 | 261 |
262 |
Stanford University
263 |
Master of Science in Computer Science
264 |
2013
265 |

Focus: Distributed Systems and Machine Learning

266 |
267 | 268 |
269 |
University of California, Berkeley
270 |
271 | Bachelor of Science in Computer Engineering 272 |
273 |
2011
274 |

Magna Cum Laude

275 |
276 |
277 | 278 |
279 |

Patents & Speaking

280 |
    281 |
  • 282 | Co-inventor on three patents for distributed systems architecture 283 |
  • 284 |
  • 285 | Published paper on "Scalable Microservices Architecture" at IEEE 286 | Cloud Computing Conference 2022 287 |
  • 288 |
  • 289 | Keynote Speaker, CloudCon 2023: "Future of Cloud-Native 290 | Architecture" 291 |
  • 292 |
  • Regular presenter at local tech meetups and conferences
  • 293 |
294 |
295 |
296 |
297 | 298 | 299 | -------------------------------------------------------------------------------- /tests/test_files/resume/software_architect_resume.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "basics": { 3 | "name": "Sarah Chen", 4 | "email": "san.francisco@email.com", 5 | "phone": "(555) 123-4567", 6 | "location": { 7 | "city": "San Francisco", 8 | "region": "CA", 9 | "country": "USA" 10 | } 11 | }, 12 | "skills": [ 13 | { 14 | "category": "Architecture & Design", 15 | "keywords": [ 16 | "Microservices", 17 | "Event-Driven Architecture", 18 | "Domain-Driven Design", 19 | "REST APIs" 20 | ] 21 | }, 22 | { 23 | "category": "Cloud Platforms", 24 | "keywords": ["AWS", "Azure", "Google Cloud Platform"] 25 | }, 26 | { 27 | "category": "Programming Languages", 28 | "keywords": ["Java", "Python", "Go", "JavaScript", "TypeScript"] 29 | } 30 | ], 31 | "experience": [ 32 | { 33 | "company": "TechCorp Solutions", 34 | "position": "Senior Software Architect", 35 | "startDate": "2020-01-01", 36 | "endDate": "2024-01-10" 37 | }, 38 | { 39 | "company": "DataFlow Systems", 40 | "position": "Lead Software Engineer", 41 | "startDate": "2016-01-01", 42 | "endDate": "2019-12-31", 43 | "technologies": [ 44 | "Distributed Systems", 45 | "Microservices", 46 | "Cloud Migration" 47 | ] 48 | }, 49 | { 50 | "company": "InnovateTech", 51 | "position": "Senior Software Engineer", 52 | "startDate": "2013-01-01", 53 | "endDate": "2015-12-31", 54 | "technologies": [ 55 | "High-performance Computing", 56 | "Real-time Analytics", 57 | "Container Orchestration" 58 | ] 59 | } 60 | ], 61 | "education": [ 62 | { 63 | "institution": "Stanford University", 64 | "degree": "Master of Science", 65 | "field": "Computer Science", 66 | "graduationDate": "2013-01-01", 67 | "specialization": "Distributed Systems and Machine Learning" 68 | }, 69 | { 70 | "institution": "University of California, Berkeley", 71 | "degree": "Bachelor of Science", 72 | 
"field": "Computer Engineering", 73 | "graduationDate": "2011-01-01" 74 | } 75 | ], 76 | "certifications": [ 77 | { 78 | "name": "AWS Solutions Architect - Professional" 79 | }, 80 | { 81 | "name": "Google Cloud Architect" 82 | }, 83 | { 84 | "name": "Certified Kubernetes Administrator" 85 | } 86 | ], 87 | "publications": [ 88 | { 89 | "title": "Scalable Microservices Architecture", 90 | "publisher": "IEEE Cloud Computing Conference", 91 | "date": "2022-01-01" 92 | } 93 | ] 94 | } 95 | -------------------------------------------------------------------------------- /tests/test_files/slide/saas_slide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/test_files/slide/saas_slide.pdf -------------------------------------------------------------------------------- /tests/test_files/slide/saas_slide.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "companyInfo": { 3 | "name": "CloudFlow Analytics", 4 | "fundingStage": "Series A", 5 | "foundedYear": null, 6 | "industry": null, 7 | "location": null 8 | }, 9 | "financialMetrics": { 10 | "mrr": { 11 | "value": 580000, 12 | "currency": "USD", 13 | "growthRate": 27 14 | }, 15 | "grossMargin": 88 16 | }, 17 | "growthMetrics": { 18 | "customers": { 19 | "total": 1247, 20 | "growth": 142, 21 | "enterprisePercent": null 22 | }, 23 | "nrr": 147 24 | }, 25 | "marketMetrics": { 26 | "tam": 50000000000, 27 | "sam": null, 28 | "marketShare": null, 29 | "competitors": null 30 | }, 31 | "differentiators": [ 32 | { 33 | "claim": "Processing Speed", 34 | "metric": "5x faster", 35 | "comparisonTarget": "competitors" 36 | }, 37 | { 38 | "claim": "ML Accuracy", 39 | "metric": "99.9%", 40 | "comparisonTarget": null 41 | }, 42 | { 43 | "claim": "Market Potential", 44 | "metric": "80%", 45 | "comparisonTarget": "Fortune 500" 46 | } 47 | ] 48 | } 49 
| -------------------------------------------------------------------------------- /tests/test_files/slide/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "type": "object", 4 | "required": ["companyInfo", "financialMetrics", "growthMetrics"], 5 | "properties": { 6 | "companyInfo": { 7 | "type": "object", 8 | "required": ["name", "fundingStage"], 9 | "properties": { 10 | "name": { 11 | "type": "string" 12 | }, 13 | "fundingStage": { 14 | "type": "string", 15 | "enum": ["Pre-seed", "Seed", "Series A", "Series B", "Series C+"] 16 | }, 17 | "foundedYear": { 18 | "anyOf": [ 19 | { 20 | "type": "integer" 21 | }, 22 | { 23 | "type": "null" 24 | } 25 | ] 26 | }, 27 | "industry": { 28 | "anyOf": [ 29 | { 30 | "type": "string" 31 | }, 32 | { 33 | "type": "null" 34 | } 35 | ] 36 | }, 37 | "location": { 38 | "anyOf": [ 39 | { 40 | "type": "string" 41 | }, 42 | { 43 | "type": "null" 44 | } 45 | ] 46 | } 47 | } 48 | }, 49 | "financialMetrics": { 50 | "type": "object", 51 | "required": ["mrr", "grossMargin"], 52 | "properties": { 53 | "mrr": { 54 | "type": "object", 55 | "description": "Monthly Recurring Revenue", 56 | "required": ["value", "currency", "growthRate"], 57 | "properties": { 58 | "value": { 59 | "type": "number" 60 | }, 61 | "currency": { 62 | "type": "string" 63 | }, 64 | "growthRate": { 65 | "type": "number" 66 | } 67 | } 68 | }, 69 | "grossMargin": { 70 | "type": "number" 71 | } 72 | } 73 | }, 74 | "growthMetrics": { 75 | "type": "object", 76 | "required": ["customers", "nrr"], 77 | "properties": { 78 | "customers": { 79 | "type": "object", 80 | "required": ["total", "growth"], 81 | "properties": { 82 | "total": { 83 | "type": "integer" 84 | }, 85 | "growth": { 86 | "type": "number" 87 | } 88 | } 89 | }, 90 | "nrr": { 91 | "description": "Net Revenue Retention", 92 | "type": "number" 93 | } 94 | } 95 | }, 96 | "differentiators": { 97 | "type": "array",
98 | "items": { 99 | "type": "object", 100 | "required": ["claim", "metric"], 101 | "properties": { 102 | "claim": { 103 | "type": "string" 104 | }, 105 | "metric": { 106 | "type": "string" 107 | }, 108 | "comparisonTarget": { 109 | "anyOf": [ 110 | { 111 | "type": "string" 112 | }, 113 | { 114 | "type": "null" 115 | } 116 | ] 117 | } 118 | } 119 | } 120 | } 121 | } 122 | } 123 | --------------------------------------------------------------------------------