├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── custom-issue.md │ └── feature_request.md ├── dependabot.yml └── workflows │ ├── build_package.yml │ ├── codeql.yml │ ├── lint.yml │ ├── publish_release.yml │ └── unit_test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── Makefile ├── README.md ├── TOS.pdf ├── examples ├── extract │ ├── automotive_sector_analysis.ipynb │ ├── data │ │ ├── automotive_sector_analysis │ │ │ ├── modeling_assumptions.txt │ │ │ └── workflow_img.png │ │ ├── insider_transactions │ │ │ ├── .gitignore │ │ │ ├── cik_mapping.json │ │ │ └── workflow-diag.png │ │ ├── lm317_structured_extraction │ │ │ └── lm317_extraction.png │ │ ├── resumes │ │ │ ├── ai_researcher.pdf │ │ │ ├── ml_engineer.pdf │ │ │ └── software_architect.pdf │ │ └── sec_filings │ │ │ ├── nvda_10k.pdf │ │ │ ├── nvda_10k_page_40.png │ │ │ ├── nvda_10k_page_41.png │ │ │ └── web_ui.png │ ├── extract_data_with_citations.ipynb │ ├── insider_buy_sell.ipynb │ ├── lm317_structured_extraction.ipynb │ ├── resume_screening.ipynb │ ├── sec_10k_filing.ipynb │ └── solar_panel_e2e_comparison.ipynb ├── parse │ ├── advanced_rag │ │ ├── dynamic_section_retrieval.ipynb │ │ └── dynamic_section_retrieval_img.png │ ├── agents │ │ └── demo_simple_openai_agent.ipynb │ ├── caltrain │ │ ├── caltrain_schedule_weekend.pdf │ │ └── caltrain_text_mode.ipynb │ ├── data │ │ ├── BP_Excel.xlsx │ │ └── nvidia_quarterly_revenue_trend_by_market.xlsx │ ├── demo_advanced.ipynb │ ├── demo_api.ipynb │ ├── demo_basic.ipynb │ ├── demo_excel.ipynb │ ├── demo_get_charts.ipynb │ ├── demo_insurance.ipynb │ ├── demo_json.ipynb │ ├── demo_json_tour.ipynb │ ├── demo_languages.ipynb │ ├── demo_mongodb.ipynb │ ├── demo_starter_multimodal.ipynb │ ├── demo_starter_parse_selected_pages.ipynb │ ├── excel │ │ ├── dcf_rag.ipynb │ │ ├── o1_excel_rag.ipynb │ │ └── references │ │ │ ├── query1.png │ │ │ ├── query2.png │ │ │ ├── query3.png │ │ │ ├── query4.png │ │ │ ├── query5.png │ │ │ └── recursive_retrieval.png │ ├── 
json_tour_screenshots │ │ ├── 32778fb0-9e83-4b00-aebe-0d7f59ff0b5f-img_p0_1.png │ │ ├── 32778fb0-9e83-4b00-aebe-0d7f59ff0b5f-page_1.jpg │ │ ├── img_p0_1.png │ │ ├── links_page.png │ │ ├── page_1.png │ │ └── page_35.png │ ├── knowledge_graphs │ │ ├── kg_agent.ipynb │ │ └── sf2023_budget_kg_screenshot.png │ ├── multimodal │ │ ├── XC9500_CPLD_Family_p3.png │ │ ├── claude_parse.ipynb │ │ ├── gemini2_flash.ipynb │ │ ├── gpt4o_mini.ipynb │ │ ├── insurance_rag.ipynb │ │ ├── legal_rag.ipynb │ │ ├── llama2-p33.png │ │ ├── llama3.1-p5.png │ │ ├── multimodal_contextual_retrieval_rag.ipynb │ │ ├── multimodal_contextual_retrieval_rag_img.png │ │ ├── multimodal_rag_slide_deck.ipynb │ │ ├── multimodal_rag_slide_deck_img.png │ │ ├── multimodal_report_generation.ipynb │ │ ├── multimodal_report_generation_agent.ipynb │ │ ├── multimodal_report_generation_agent_img.png │ │ └── product_manual_rag.ipynb │ ├── other_files │ │ ├── demo_ppt_basic.ipynb │ │ └── demo_ppt_financial.ipynb │ ├── parsing_instructions │ │ ├── expense_report_document.pdf │ │ ├── expense_report_document.png │ │ ├── mcdonalds_receipt.png │ │ ├── parsing_instructions.ipynb │ │ ├── purchase_order_document.pdf │ │ └── purchase_order_document.png │ ├── parsing_modes │ │ ├── demo_auto_mode.ipynb │ │ ├── demo_layout_agent_mode_visual_citations.ipynb │ │ ├── diagram.jpg │ │ ├── layout_agent_citation_engine.png │ │ ├── layout_agent_moe.png │ │ ├── layout_agent_parse_explainer.png │ │ ├── mermaid_render.png │ │ ├── page_1.png │ │ ├── page_11.png │ │ ├── page_14.png │ │ └── page_3.png │ ├── report_generation │ │ └── rfp_response │ │ │ ├── generate_rfp.ipynb │ │ │ └── generate_rfp_img.png │ └── test_tesla_impact_report │ │ ├── 2019-tesla-impact-report-short.pdf │ │ └── test_gpt4o.ipynb └── report │ └── basic_report.ipynb ├── extract.md ├── llama_cloud_services ├── __init__.py ├── constants.py ├── extract │ ├── __init__.py │ ├── extract.py │ └── utils.py ├── parse │ ├── __init__.py │ ├── base.py │ ├── cli │ │ ├── __init__.py │ 
│ └── main.py │ ├── types.py │ └── utils.py └── report │ ├── __init__.py │ ├── base.py │ └── report.py ├── llama_parse ├── README.md ├── llama_parse │ ├── __init__.py │ ├── base.py │ ├── cli │ │ ├── __init__.py │ │ └── main.py │ └── utils.py ├── poetry.lock └── pyproject.toml ├── parse.md ├── poetry.lock ├── pyproject.toml ├── report.md └── tests ├── __init__.py ├── extract ├── __init__.py ├── data │ ├── receipt │ │ ├── noisebridge_receipt.pdf │ │ ├── noisebridge_receipt.test.json │ │ └── schema.json │ ├── resume │ │ ├── schema.json │ │ ├── software_architect_resume.html │ │ └── software_architect_resume.test.json │ └── slide │ │ ├── saas_slide.pdf │ │ ├── saas_slide.test.json │ │ └── schema.json ├── test_benchmark.py ├── test_extract_api.py ├── test_extract_e2e.py └── util.py ├── parse ├── __init__.py ├── test_llama_parse.py └── test_llama_parse_result.py ├── report ├── __init__.py └── test_llama_report.py └── test_files ├── attention_is_all_you_need.pdf ├── attention_is_all_you_need_chart.pdf ├── images └── 67b428c6-9edb-4550-83d9-5e35165846ca-page_1.jpg ├── paper.md ├── resume ├── receipt │ ├── noisebridge_receipt.pdf │ ├── noisebridge_receipt.test.json │ └── schema.json ├── schema.json ├── software_architect_resume.html └── software_architect_resume.test.json └── slide ├── saas_slide.pdf ├── saas_slide.test.json └── schema.json /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | Write a concise description of what the bug is. 12 | 13 | **Files** 14 | If possible, please provide the PDF file causing the issue. 15 | 16 | **Job ID** 17 | If you have it, please provide the ID of the job you ran. 18 | You can find it here: https://cloud.llamaindex.ai/parse in the "History" tab. 
19 | 20 | **Client:** 21 | Please remove untested options: 22 | - Python Library 23 | - API 24 | - Frontend (cloud.llamaindex.ai) 25 | - Typescript Library 26 | - Notebook 27 | 28 | **Additional context** 29 | Add any additional context about the problem here. 30 | What options did you use? Premium mode, multimodal, fast mode, parsing instructions, etc. 31 | Screenshots, code snippets, etc. 32 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/custom-issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Custom issue 3 | about: Not a bug nor a feature request 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Please see the documentation for all configuration options: 2 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 3 | # and 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "github-actions" 9 | directory: "/" 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /.github/workflows/build_package.yml: -------------------------------------------------------------------------------- 1 | name: Build Package 2 | 3 | # Build 
package on its own without additional pip install 4 | 5 | on: 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | 11 | env: 12 | POETRY_VERSION: "1.6.1" 13 | 14 | jobs: 15 | build: 16 | runs-on: ${{ matrix.os }} 17 | strategy: 18 | # You can use PyPy versions in python-version. 19 | # For example, pypy-2.7 and pypy-3.8 20 | matrix: 21 | os: [ubuntu-latest, windows-latest] 22 | python-version: ["3.9"] 23 | steps: 24 | - uses: actions/checkout@v4 25 | - name: Set up python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install Poetry 30 | uses: snok/install-poetry@v1 31 | with: 32 | version: ${{ env.POETRY_VERSION }} 33 | - name: Install deps 34 | shell: bash 35 | run: poetry install 36 | - name: Ensure lock works 37 | shell: bash 38 | run: poetry lock 39 | - name: Build 40 | shell: bash 41 | run: poetry build 42 | - name: Test installing built package 43 | shell: bash 44 | run: python -m pip install . 45 | - name: Test import 46 | shell: bash 47 | working-directory: ${{ vars.RUNNER_TEMP }} 48 | run: python -c "import llama_cloud_services" 49 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | pull_request: 7 | # The branches below must be a subset of the branches above 8 | branches: ["main"] 9 | schedule: 10 | - cron: "30 16 * * 4" 11 | 12 | jobs: 13 | analyze: 14 | name: Analyze 15 | # Runner size impacts CodeQL analysis time. To learn more, please see: 16 | # - https://gh.io/recommended-hardware-resources-for-running-codeql 17 | # - https://gh.io/supported-runners-and-hardware-resources 18 | # - https://gh.io/using-larger-runners 19 | # Consider using larger runners for possible analysis time improvements. 
20 | runs-on: "ubuntu-latest" 21 | timeout-minutes: 360 22 | permissions: 23 | actions: read 24 | contents: read 25 | security-events: write 26 | 27 | steps: 28 | - name: Checkout repository 29 | uses: actions/checkout@v4 30 | 31 | # Initializes the CodeQL tools for scanning. 32 | - name: Initialize CodeQL 33 | uses: github/codeql-action/init@v3 34 | with: 35 | languages: python 36 | dependency-caching: true 37 | 38 | - name: Perform CodeQL Analysis 39 | uses: github/codeql-action/analyze@v3 40 | with: 41 | category: "/language:python" 42 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Linting 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | env: 10 | POETRY_VERSION: "1.6.1" 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | # You can use PyPy versions in python-version. 17 | # For example, pypy-2.7 and pypy-3.8 18 | matrix: 19 | python-version: ["3.9"] 20 | steps: 21 | - uses: actions/checkout@v4 22 | with: 23 | fetch-depth: ${{ github.event_name == 'pull_request' && 2 || 0 }} 24 | - name: Set up python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install Poetry 29 | uses: snok/install-poetry@v1 30 | with: 31 | version: ${{ env.POETRY_VERSION }} 32 | - name: Install pre-commit 33 | shell: bash 34 | run: poetry run pip install pre-commit 35 | - name: Run linter 36 | shell: bash 37 | run: poetry run make lint 38 | -------------------------------------------------------------------------------- /.github/workflows/publish_release.yml: -------------------------------------------------------------------------------- 1 | name: Publish llama-parse to PyPI / GitHub 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v*" 7 | 8 | workflow_dispatch: 9 | 10 | env: 11 | POETRY_VERSION: "1.6.1" 12 | 
PYTHON_VERSION: "3.9" 13 | 14 | jobs: 15 | build-n-publish: 16 | name: Build and publish to PyPI 17 | if: github.repository == 'run-llama/llama_cloud_services' 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v4 22 | - name: Set up python ${{ env.PYTHON_VERSION }} 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: ${{ env.PYTHON_VERSION }} 26 | 27 | - name: Install Poetry 28 | uses: snok/install-poetry@v1 29 | with: 30 | version: ${{ env.POETRY_VERSION }} 31 | 32 | - name: Install deps 33 | shell: bash 34 | run: pip install -e . 35 | 36 | - name: Build and publish llama-cloud-services 37 | uses: JRubics/poetry-publish@v2.1 38 | with: 39 | pypi_token: ${{ secrets.LLAMA_PARSE_PYPI_TOKEN }} 40 | poetry_install_options: "--without dev" 41 | 42 | - name: Wait for PyPI to update 43 | run: | 44 | sleep 60 45 | 46 | - name: Update llama-parse lock file 47 | run: | 48 | cd llama_parse && poetry lock 49 | 50 | - name: Build and publish llama-parse 51 | uses: JRubics/poetry-publish@v2.1 52 | with: 53 | package_directory: "./llama_parse" 54 | pypi_token: ${{ secrets.LLAMA_PARSE_PYPI_TOKEN }} 55 | poetry_install_options: "--without dev" 56 | 57 | - name: Create GitHub Release 58 | id: create_release 59 | uses: actions/create-release@v1 60 | env: 61 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token 62 | with: 63 | tag_name: ${{ github.ref }} 64 | release_name: ${{ github.ref }} 65 | draft: false 66 | prerelease: false 67 | 68 | - name: Get Asset name 69 | run: | 70 | export PKG=$(ls dist/ | grep tar) 71 | set -- $PKG 72 | echo "name=$1" >> $GITHUB_ENV 73 | 74 | - name: Upload Release Asset (sdist) to GitHub 75 | id: upload-release-asset 76 | uses: actions/upload-release-asset@v1 77 | env: 78 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 79 | with: 80 | upload_url: ${{ steps.create_release.outputs.upload_url }} 81 | asset_path: dist/${{ env.name }} 82 | asset_name: ${{ 
env.name }} 83 | asset_content_type: application/zip 84 | -------------------------------------------------------------------------------- /.github/workflows/unit_test.yml: -------------------------------------------------------------------------------- 1 | name: Unit Testing 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | env: 10 | POETRY_VERSION: "1.6.1" 11 | LLAMA_CLOUD_API_KEY: ${{ secrets.LLAMA_CLOUD_API_KEY }} 12 | 13 | jobs: 14 | test: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | # You can use PyPy versions in python-version. 18 | # For example, pypy-2.7 and pypy-3.8 19 | matrix: 20 | python-version: ["3.9", "3.10", "3.11", "3.12"] 21 | steps: 22 | - uses: actions/checkout@v4 23 | with: 24 | fetch-depth: 0 25 | - name: Set up python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install Poetry 30 | uses: snok/install-poetry@v1 31 | with: 32 | version: ${{ env.POETRY_VERSION }} 33 | - name: Install deps 34 | shell: bash 35 | run: poetry install --with dev 36 | - name: Run testing 37 | env: 38 | CI: true 39 | shell: bash 40 | run: poetry run pytest tests 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .git 2 | __pycache__/ 3 | *.pyc 4 | .DS_Store 5 | .idea 6 | .env* 7 | .ipynb_checkpoints* 8 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | default_language_version: 3 | python: python3 4 | 5 | repos: 6 | - repo: https://github.com/pre-commit/pre-commit-hooks 7 | rev: v4.5.0 8 | hooks: 9 | - id: check-byte-order-marker 10 | - id: check-merge-conflict 11 | - id: check-symlinks 12 | - id: check-toml 13 | - id: check-yaml 14 | - id: detect-private-key 15 | - id: 
end-of-file-fixer 16 | - id: mixed-line-ending 17 | - id: trailing-whitespace 18 | - repo: https://github.com/charliermarsh/ruff-pre-commit 19 | rev: v0.1.5 20 | 21 | hooks: 22 | - id: ruff 23 | args: [--fix, --exit-non-zero-on-fix] 24 | exclude: ".*poetry.lock" 25 | - repo: https://github.com/psf/black-pre-commit-mirror 26 | rev: 23.10.1 27 | hooks: 28 | - id: black-jupyter 29 | name: black-src 30 | alias: black 31 | exclude: ".*poetry.lock" 32 | - repo: https://github.com/pre-commit/mirrors-mypy 33 | rev: v1.0.1 34 | hooks: 35 | - id: mypy 36 | exclude: ^tests/ 37 | additional_dependencies: 38 | [ 39 | "types-requests", 40 | "types-Deprecated", 41 | "types-redis", 42 | "types-setuptools", 43 | "types-PyYAML", 44 | "types-protobuf==4.24.0.4", 45 | ] 46 | args: 47 | [ 48 | --disallow-untyped-defs, 49 | --ignore-missing-imports, 50 | --python-version=3.10, 51 | ] 52 | - repo: https://github.com/adamchainz/blacken-docs 53 | rev: 1.16.0 54 | hooks: 55 | - id: blacken-docs 56 | name: black-docs-text 57 | alias: black 58 | types_or: [rst, markdown, tex] 59 | additional_dependencies: [black==23.10.1] 60 | # Using PEP 8's line length in docs prevents excess left/right scrolling 61 | args: [--line-length=79] 62 | - repo: https://github.com/pre-commit/mirrors-prettier 63 | rev: v3.0.3 64 | hooks: 65 | - id: prettier 66 | exclude: poetry.lock 67 | - repo: https://github.com/codespell-project/codespell 68 | rev: v2.2.6 69 | hooks: 70 | - id: codespell 71 | additional_dependencies: [tomli] 72 | exclude: ^(poetry.lock|examples) 73 | args: 74 | [ 75 | "--ignore-words-list", 76 | "astroid,gallary,momento,narl,ot,rouge,nin,gere,te,inh,vor", 77 | ] 78 | - repo: https://github.com/srstevenson/nb-clean 79 | rev: 3.1.0 80 | hooks: 81 | - id: nb-clean 82 | args: [--preserve-cell-outputs, --remove-empty-cells] 83 | - repo: https://github.com/pappasam/toml-sort 84 | rev: v0.23.1 85 | hooks: 86 | - id: toml-sort-fix 87 | exclude: ".*poetry.lock" 88 | 89 | exclude: .github/ISSUE_TEMPLATE 
90 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 LlamaIndex 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | GIT_ROOT ?= $(shell git rev-parse --show-toplevel) 2 | 3 | help: ## Show all Makefile targets. 4 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' 5 | 6 | format: ## Run code autoformatters (black). 
7 | pre-commit install 8 | git ls-files | xargs pre-commit run black --files 9 | 10 | lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy 11 | pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files 12 | 13 | test: ## Run tests via pytest 14 | pytest tests 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cloud-services)](https://pypi.org/project/llama-cloud-services/) 2 | [![GitHub contributors](https://img.shields.io/github/contributors/run-llama/llama_cloud_services)](https://github.com/run-llama/llama_cloud_services/graphs/contributors) 3 | [![Discord](https://img.shields.io/discord/1059199217496772688)](https://discord.gg/dGcwcsnxhU) 4 | 5 | # Llama Cloud Services 6 | 7 | This repository contains the code for hand-written SDKs and clients for interacting with LlamaCloud. 8 | 9 | This includes: 10 | 11 | - [LlamaParse](./parse.md) - A GenAI-native document parser that can parse complex document data for any downstream LLM use case (Agents, RAG, data processing, etc.). 12 | - [LlamaReport (beta/invite-only)](./report.md) - A prebuilt agentic report builder that can be used to build reports from a variety of data sources. 13 | - [LlamaExtract](./extract.md) - A prebuilt agentic data extractor that can be used to transform data into a structured JSON representation. 14 | 15 | ## Getting Started 16 | 17 | Install the package: 18 | 19 | ```bash 20 | pip install llama-cloud-services 21 | ``` 22 | 23 | Then, get your API key from [LlamaCloud](https://cloud.llamaindex.ai/). 
24 | 25 | Then, you can use the services in your code: 26 | 27 | ```python 28 | from llama_cloud_services import LlamaParse, LlamaReport, LlamaExtract 29 | 30 | parser = LlamaParse(api_key="YOUR_API_KEY") 31 | report = LlamaReport(api_key="YOUR_API_KEY") 32 | extract = LlamaExtract(api_key="YOUR_API_KEY") 33 | ``` 34 | 35 | See the quickstart guides for each service for more information: 36 | 37 | - [LlamaParse](./parse.md) 38 | - [LlamaReport (beta/invite-only)](./report.md) 39 | - [LlamaExtract](./extract.md) 40 | 41 | ## Switch to EU SaaS 🇪🇺 42 | 43 | If you are interested in using LlamaCloud services in the EU, you can adjust your base URL to `https://api.cloud.eu.llamaindex.ai`. 44 | 45 | You can also create your API key in the EU region [here](https://cloud.eu.llamaindex.ai). 46 | 47 | ```python 48 | from llama_cloud_services import ( 49 | LlamaParse, 50 | LlamaReport, 51 | LlamaExtract, 52 | EU_BASE_URL, 53 | ) 54 | 55 | parser = LlamaParse(api_key="YOUR_API_KEY", base_url=EU_BASE_URL) 56 | report = LlamaReport(api_key="YOUR_API_KEY", base_url=EU_BASE_URL) 57 | extract = LlamaExtract(api_key="YOUR_API_KEY", base_url=EU_BASE_URL) 58 | ``` 59 | 60 | ## Documentation 61 | 62 | You can see complete SDK and API documentation for each service on [our official docs](https://docs.cloud.llamaindex.ai/). 63 | 64 | ## Terms of Service 65 | 66 | See the [Terms of Service Here](./TOS.pdf). 67 | 68 | ## Get in Touch (LlamaCloud) 69 | 70 | You can get in touch with us by following our [contact link](https://www.llamaindex.ai/contact). 
71 | -------------------------------------------------------------------------------- /TOS.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/TOS.pdf -------------------------------------------------------------------------------- /examples/extract/data/automotive_sector_analysis/modeling_assumptions.txt: -------------------------------------------------------------------------------- 1 | # Financial Modeling Assumptions 2 | Discount Rate: 8% 3 | Terminal Growth Rate: 2% 4 | Tax Rate: 25% 5 | Revenue Growth (Years 1-5): 10% per annum 6 | Revenue Growth (Years 6-10): 5% per annum 7 | Capital Expenditures as % of Revenue: 7% 8 | Working Capital Assumption: 3% of Revenue 9 | Depreciation Rate: 10% per annum 10 | Cost of Capital Assumption: 8% 11 | -------------------------------------------------------------------------------- /examples/extract/data/automotive_sector_analysis/workflow_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/extract/data/automotive_sector_analysis/workflow_img.png -------------------------------------------------------------------------------- /examples/extract/data/insider_transactions/.gitignore: -------------------------------------------------------------------------------- 1 | sec_form_4_dump.json 2 | -------------------------------------------------------------------------------- /examples/extract/data/insider_transactions/workflow-diag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/extract/data/insider_transactions/workflow-diag.png 
-------------------------------------------------------------------------------- /examples/extract/data/lm317_structured_extraction/lm317_extraction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/extract/data/lm317_structured_extraction/lm317_extraction.png -------------------------------------------------------------------------------- /examples/extract/data/resumes/ai_researcher.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/extract/data/resumes/ai_researcher.pdf -------------------------------------------------------------------------------- /examples/extract/data/resumes/ml_engineer.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/extract/data/resumes/ml_engineer.pdf -------------------------------------------------------------------------------- /examples/extract/data/resumes/software_architect.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/extract/data/resumes/software_architect.pdf -------------------------------------------------------------------------------- /examples/extract/data/sec_filings/nvda_10k.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/extract/data/sec_filings/nvda_10k.pdf -------------------------------------------------------------------------------- 
/examples/extract/data/sec_filings/nvda_10k_page_40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/extract/data/sec_filings/nvda_10k_page_40.png -------------------------------------------------------------------------------- /examples/extract/data/sec_filings/nvda_10k_page_41.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/extract/data/sec_filings/nvda_10k_page_41.png -------------------------------------------------------------------------------- /examples/extract/data/sec_filings/web_ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/extract/data/sec_filings/web_ui.png -------------------------------------------------------------------------------- /examples/parse/advanced_rag/dynamic_section_retrieval_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/advanced_rag/dynamic_section_retrieval_img.png -------------------------------------------------------------------------------- /examples/parse/agents/demo_simple_openai_agent.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# LlamaParse Agent\n", 8 | "\n", 9 | "This demo walks through using an OpenAI Agent with [LlamaParse](https://cloud.llamaindex.ai)." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Setup" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "!pip install llama-cloud-services llama-index llama-index-postprocessor-sbert-rerank" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import os\n", 35 | "\n", 36 | "os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"llx-...\"\n", 37 | "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "from llama_index.core import Settings\n", 47 | "from llama_index.embeddings.openai import OpenAIEmbedding\n", 48 | "from llama_index.llms.openai import OpenAI\n", 49 | "\n", 50 | "Settings.embed_model = OpenAIEmbedding(model=\"text-embedding-3-small\")\n", 51 | "Settings.llm = OpenAI(model=\"gpt-3.5-turbo\", temperature=0.2)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## Parsing \n", 59 | "\n", 60 | "For parsing, lets use a [recent paper](https://huggingface.co/papers/2403.09611) on Multi-Modal pretraining" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "!wget https://arxiv.org/pdf/2403.09611.pdf -O paper.pdf" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "Below, we can tell the parser to skip content we don't want. In this case, the references section will just add noise to a RAG system." 
77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "from llama_cloud_services import LlamaParse\n", 86 | "\n", 87 | "parser = LlamaParse(\n", 88 | " result_type=\"markdown\",\n", 89 | ")" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "Started parsing the file under job_id 81251f39-01be-434e-99e8-1c1b83b82098\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "documents = await parser.aload_data(\"paper.pdf\")" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "Embeddings have been explicitly disabled. Using MockEmbedding.\n" 119 | ] 120 | }, 121 | { 122 | "name": "stderr", 123 | "output_type": "stream", 124 | "text": [ 125 | "41it [00:00, 26765.21it/s]\n", 126 | "100%|██████████| 41/41 [00:13<00:00, 2.98it/s]\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "import nest_asyncio\n", 132 | "\n", 133 | "nest_asyncio.apply()\n", 134 | "\n", 135 | "from llama_index.core.node_parser import (\n", 136 | " MarkdownElementNodeParser,\n", 137 | " SentenceSplitter,\n", 138 | ")\n", 139 | "\n", 140 | "# explicitly extract tables with the MarkdownElementNodeParser\n", 141 | "node_parser = MarkdownElementNodeParser(num_workers=8)\n", 142 | "nodes = node_parser.get_nodes_from_documents(documents)\n", 143 | "nodes, objects = node_parser.get_nodes_and_objects(nodes)\n", 144 | "\n", 145 | "# Chain splitters to ensure chunk size requirements are met\n", 146 | "nodes = SentenceSplitter(chunk_size=512, chunk_overlap=20).get_nodes_from_documents(\n", 147 | " nodes\n", 148 | ")" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "## Chat 
over the paper, lets find out what it is about!" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "from llama_index.core import VectorStoreIndex, SummaryIndex\n", 165 | "\n", 166 | "vector_index = VectorStoreIndex(nodes=nodes)\n", 167 | "summary_index = SummaryIndex(nodes=nodes)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "from llama_index.agent.openai import OpenAIAgent\n", 177 | "from llama_index.core.tools import QueryEngineTool, ToolMetadata\n", 178 | "from llama_index.postprocessor.colbert_rerank import ColbertRerank\n", 179 | "\n", 180 | "tools = [\n", 181 | " QueryEngineTool(\n", 182 | " vector_index.as_query_engine(\n", 183 | " similarity_top_k=8, node_postprocessors=[ColbertRerank(top_n=3)]\n", 184 | " ),\n", 185 | " metadata=ToolMetadata(\n", 186 | " name=\"search\",\n", 187 | " description=\"Search the document, pass the entire user message in the query\",\n", 188 | " ),\n", 189 | " ),\n", 190 | " QueryEngineTool(\n", 191 | " summary_index.as_query_engine(),\n", 192 | " metadata=ToolMetadata(\n", 193 | " name=\"summarize\",\n", 194 | " description=\"Summarize the document using the user message\",\n", 195 | " ),\n", 196 | " ),\n", 197 | "]\n", 198 | "\n", 199 | "agent = OpenAIAgent.from_tools(tools=tools, verbose=True)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "name": "stdout", 209 | "output_type": "stream", 210 | "text": [ 211 | "Added user message to memory: What is the summary of the paper?\n", 212 | "=== Calling Function ===\n", 213 | "Calling function: summarize with args: {\"input\":\"summary\"}\n", 214 | "Got output: The research focuses on developing Multimodal Large Language Models (MLLMs) by incorporating image-caption, interleaved image-text, 
and text-only data for pre-training. It highlights the importance of factors like the image encoder, resolution, and token count, while downplaying the design of the vision-language connector. With models scaling up to 30B parameters, the MM1 family demonstrates impressive performance in pre-training metrics and competitive outcomes on diverse multimodal benchmarks. It demonstrates abilities such as in-context learning and multi-image reasoning, aiming to provide valuable insights for creating MLLMs that benefit the research community.\n", 215 | "========================\n", 216 | "\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "# note -- this will take a while with local LLMs, its sending every node in the document to the LLM\n", 222 | "resp = agent.chat(\"What is the summary of the paper?\")" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "The summary of the paper highlights the development of Multimodal Large Language Models (MLLMs) by incorporating image-caption, interleaved image-text, and text-only data for pre-training. The research emphasizes factors like the image encoder, resolution, and token count, while de-emphasizing the design of the vision-language connector. The MM1 family of models, scaling up to 30B parameters, shows impressive performance in pre-training metrics and competitive outcomes on various multimodal benchmarks. 
These models demonstrate capabilities such as in-context learning and multi-image reasoning, aiming to provide valuable insights for creating MLLMs that benefit the research community.\n" 235 | ] 236 | } 237 | ], 238 | "source": [ 239 | "print(str(resp))" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "Added user message to memory: How do the authors evaluate their work?\n", 252 | "=== Calling Function ===\n", 253 | "Calling function: search with args: {\"input\":\"evaluation methods\"}\n", 254 | "Got output: The evaluation methods involve synthesizing all benchmark results into a single meta-average number to simplify comparisons. This is achieved by normalizing the evaluation metrics with respect to a baseline configuration, standardizing the results for each task, adjusting every metric by dividing it by its respective baseline, and then averaging across all metrics.\n", 255 | "========================\n", 256 | "\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "resp = agent.chat(\"How do the authors evaluate their work?\")" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "name": "stdout", 271 | "output_type": "stream", 272 | "text": [ 273 | "The authors evaluate their work by synthesizing all benchmark results into a single meta-average number to simplify comparisons. 
They normalize the evaluation metrics with respect to a baseline configuration, standardize the results for each task, adjust every metric by dividing it by its respective baseline, and then average across all metrics for evaluation.\n" 274 | ] 275 | } 276 | ], 277 | "source": [ 278 | "print(str(resp))" 279 | ] 280 | } 281 | ], 282 | "metadata": { 283 | "kernelspec": { 284 | "display_name": "llama-parse-aNC435Vv-py3.10", 285 | "language": "python", 286 | "name": "python3" 287 | }, 288 | "language_info": { 289 | "codemirror_mode": { 290 | "name": "ipython", 291 | "version": 3 292 | }, 293 | "file_extension": ".py", 294 | "mimetype": "text/x-python", 295 | "name": "python", 296 | "nbconvert_exporter": "python", 297 | "pygments_lexer": "ipython3" 298 | } 299 | }, 300 | "nbformat": 4, 301 | "nbformat_minor": 2 302 | } 303 | -------------------------------------------------------------------------------- /examples/parse/caltrain/caltrain_schedule_weekend.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/caltrain/caltrain_schedule_weekend.pdf -------------------------------------------------------------------------------- /examples/parse/data/BP_Excel.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/data/BP_Excel.xlsx -------------------------------------------------------------------------------- /examples/parse/data/nvidia_quarterly_revenue_trend_by_market.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/data/nvidia_quarterly_revenue_trend_by_market.xlsx 
-------------------------------------------------------------------------------- /examples/parse/demo_api.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Using the Raw API\n", 8 | "\n", 9 | "This notebook walks through how to use the raw API and how" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "--2024-02-02 11:11:39-- https://arxiv.org/pdf/1706.03762.pdf\n", 22 | "Resolving arxiv.org (arxiv.org)... 151.101.131.42, 151.101.3.42, 151.101.67.42, ...\n", 23 | "Connecting to arxiv.org (arxiv.org)|151.101.131.42|:443... connected.\n", 24 | "HTTP request sent, awaiting response... 200 OK\n", 25 | "Length: 2215244 (2.1M) [application/pdf]\n", 26 | "Saving to: ‘./attention.pdf’\n", 27 | "\n", 28 | "./attention.pdf 100%[===================>] 2.11M --.-KB/s in 0.08s \n", 29 | "\n", 30 | "2024-02-02 11:11:39 (27.3 MB/s) - ‘./attention.pdf’ saved [2215244/2215244]\n", 31 | "\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "!wget \"https://arxiv.org/pdf/1706.03762.pdf\" -O \"./attention.pdf\"" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "api_key = \"llx-...\"" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "import mimetypes\n", 55 | "import requests\n", 56 | "import time\n", 57 | "\n", 58 | "headers = {\"Authorization\": f\"Bearer {api_key}\"}\n", 59 | "file_path = \"./attention.pdf\"\n", 60 | "base_url = \"https://api.cloud.llamaindex.ai/api/parsing\"\n", 61 | "\n", 62 | "with open(file_path, \"rb\") as f:\n", 63 | " mime_type = mimetypes.guess_type(file_path)[0]\n", 64 | " files = {\"file\": (f.name, f, 
mime_type)}\n", 65 | "\n", 66 | " # send the request, upload the file\n", 67 | " url = f\"{base_url}/upload\"\n", 68 | " response = requests.post(url, headers=headers, files=files)\n", 69 | "\n", 70 | "response.raise_for_status()\n", 71 | "# get the job id for the result_url\n", 72 | "job_id = response.json()[\"id\"]\n", 73 | "result_type = \"text\" # or \"markdown\"\n", 74 | "result_url = f\"{base_url}/job/{job_id}/result/{result_type}\"\n", 75 | "\n", 76 | "# check for the result until its ready\n", 77 | "while True:\n", 78 | " response = requests.get(result_url, headers=headers)\n", 79 | " if response.status_code == 200:\n", 80 | " break\n", 81 | "\n", 82 | " time.sleep(2)\n", 83 | "\n", 84 | "# download the result\n", 85 | "result = response.json()\n", 86 | "output = result[result_type]" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | " Provided proper attribution is provided, Google hereby grants permission to\n", 99 | " reproduce the tables and figures in this paper solely for use in journalistic or\n", 100 | " scholarly works.\n", 101 | " Attention Is All You Need\n", 102 | "arXiv:1706.03762v7 [cs.CL] 2 Aug 2023\n", 103 | " Ashish Vaswani∗ Noam Shazeer∗ Niki Parmar∗ Jakob Uszkoreit∗\n", 104 | " Google Brain Google Brain Google Research Google Research\n", 105 | " avaswani@google.com noam@google.com nikip@google.com usz@google.com\n", 106 | " Llion Jones∗ Aidan N. 
Gomez∗ † Łukasz Kaiser∗\n", 107 | " Google Research University of Toronto \n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "print(output[:1000])" 113 | ] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "llama-parse-aNC435Vv-py3.11", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3" 132 | } 133 | }, 134 | "nbformat": 4, 135 | "nbformat_minor": 2 136 | } 137 | -------------------------------------------------------------------------------- /examples/parse/demo_starter_parse_selected_pages.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\"Open" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Parse Selected Pages \n", 15 | "\n", 16 | "In this notebook we will demonstrate how to parse selected pages in a document using LlamaParse." 
17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Installation\n", 24 | "\n", 25 | "Here we install `llama-parse` used for parsing the document" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "!pip install llama-cloud-services" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "### Set API Key" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "import os\n", 51 | "\n", 52 | "# API access to llama-cloud\n", 53 | "os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"\"" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "### Download Data\n", 61 | "\n", 62 | "Here we download Uber 2021 10K SEC filings data for the demonstration." 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/uber_2021.pdf' -O './uber_2021.pdf'" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "### Parse the PDF file in selected pages\n", 79 | "\n", 80 | "Here we will parse the PDF file in selected pages and get the text in `markdown` format." 
81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "Started parsing the file under job_id ad1087c1-b085-4dc7-9aa8-d13cdd440f2b\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "from llama_cloud_services import LlamaParse\n", 98 | "\n", 99 | "parser = LlamaParse(target_pages=\"0,1,2\")\n", 100 | "\n", 101 | "results = await parser.aparse(\"./uber_2021.pdf\")\n", 102 | "documents = results.get_text_documents(split_by_page=True)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "data": { 112 | "text/plain": [ 113 | "[Document(id_='d0b34f4a-27ef-48e2-a92a-386e5e265f4c', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\\n', text='# UNITED STATES SECURITIES AND EXCHANGE COMMISSION\\n\\n# Washington, D.C. 20549\\n\\n# FORM 10-K\\n\\n(Mark One)\\n\\n☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\\n\\nFor the fiscal year ended December 31, 2021\\n\\nOR\\n\\n☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\\n\\nFor the transition period from _____ to _____\\n\\nCommission File Number: 001-38902\\n\\n# UBER TECHNOLOGIES, INC.\\n\\n(Exact name of registrant as specified in its charter)\\n\\nDelaware\\n\\n45-2647441\\n\\n(State or other jurisdiction of incorporation or organization) (I.R.S. 
Employer Identification No.)\\n\\n1515 3rd Street\\n\\nSan Francisco, California 94158\\n\\n(Address of principal executive offices, including zip code)\\n\\n(415) 612-8582\\n\\n(Registrant’s telephone number, including area code)\\n\\n# Securities registered pursuant to Section 12(b) of the Act:\\n\\n|Title of each class|Trading Symbol(s)|Name of each exchange on which registered|\\n|---|---|---|\\n|Common Stock, par value $0.00001 per share|UBER|New York Stock Exchange|\\n\\nSecurities registered pursuant to Section 12(g) of the Act: None\\n\\nIndicate by check mark whether the registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act. Yes ☒ No ☐\\n\\nIndicate by check mark whether the registrant is not required to file reports pursuant to Section 13 or Section 15(d) of the Act. Yes ☐ No ☒\\n\\nIndicate by check mark whether the registrant (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 during the preceding 12 months (or for such shorter period that the registrant was required to file such reports), and (2) has been subject to such filing requirements for the past 90 days. Yes ☒ No ☐\\n\\nIndicate by check mark whether the registrant has submitted electronically every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation S-T (§232.405 of this chapter) during the preceding 12 months (or for such shorter period that the registrant was required to submit such files). Yes ☒ No ☐\\n\\nIndicate by check mark whether the registrant is a large accelerated filer, an accelerated filer, a non-accelerated filer, a smaller reporting company, or an emerging growth company. 
See the definitions of “large accelerated filer,” “accelerated filer,” “smaller reporting company,” and “emerging growth company” in Rule 12b-2 of the Exchange Act.', mimetype='text/plain', start_char_idx=None, end_char_idx=None, metadata_seperator='\\n', text_template='{metadata_str}\\n\\n{content}'),\n", 114 | " Document(id_='253b1141-a260-466e-b164-b39df67ef799', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\\n', text=\"# Large accelerated filer\\n\\n☒\\n\\n# Accelerated filer\\n\\n☐\\n\\n# Non-accelerated filer\\n\\n☐\\n\\n# Smaller reporting company\\n\\n☐\\n\\n# Emerging growth company\\n\\n☐\\n\\nIf an emerging growth company, indicate by check mark if the registrant has elected not to use the extended transition period for complying with any new or revised financial accounting standards provided pursuant to Section 13(a) of the Exchange Act.\\n\\n☐\\n\\nIndicate by check mark whether the registrant has filed a report on and attestation to its management’s assessment of the effectiveness of its internal control over financial reporting under Section 404(b) of the Sarbanes-Oxley Act (15 U.S.C. 7262(b)) by the registered public accounting firm that prepared or issued\\n\\n☒\\n\\nIndicate by check mark whether the registrant is a shell company (as defined in Rule 12b-2 of the Exchange Act). 
Yes\\n\\n☐\\n\\nNo\\n\\n☒\\n\\nThe aggregate market value of the voting and non-voting common equity held by non-affiliates of the registrant as of June 30, 2021, the last business day of the registrant's most recently completed second fiscal quarter, was approximately $90.5 billion based upon the closing price reported for such date on the New York Stock Exchange.\\n\\nThe number of shares of the registrant's common stock outstanding as of February 22, 2022 was 1,954,464,088.\\n\\n# DOCUMENTS INCORPORATED BY REFERENCE\\n\\nPortions of the registrant’s Definitive Proxy Statement relating to the Annual Meeting of Stockholders are incorporated by reference into Part III of this Annual Report on Form 10-K where indicated. Such Definitive Proxy Statement will be filed with the Securities and Exchange Commission within 120 days after the end of the registrant’s fiscal year ended December 31, 2021.\", mimetype='text/plain', start_char_idx=None, end_char_idx=None, metadata_seperator='\\n', text_template='{metadata_str}\\n\\n{content}'),\n", 115 | " Document(id_='ad988239-3ab5-498d-85ba-a29241db24d4', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\\n', text='# UBER TECHNOLOGIES, INC.\\n\\n# TABLE OF CONTENTS\\n\\n|Special Note Regarding Forward-Looking Statements|2|\\n|---|---|\\n|PART I|PART I|\\n|Item 1. Business|4|\\n|Item 1A. Risk Factors|11|\\n|Item 1B. Unresolved Staff Comments|46|\\n|Item 2. Properties|46|\\n|Item 3. Legal Proceedings|46|\\n|Item 4. Mine Safety Disclosures|47|\\n|PART II|PART II|\\n|Item 5. Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities|47|\\n|Item 6. [Reserved]|48|\\n|Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations|48|\\n|Item 7A. Quantitative and Qualitative Disclosures About Market Risk|69|\\n|Item 8. 
Financial Statements and Supplementary Data|70|\\n|Item 9. Changes in and Disagreements with Accountants on Accounting and Financial Disclosure|146|\\n|Item 9A. Controls and Procedures|147|\\n|Item 9B. Other Information|147|\\n|Item 9C. Disclosure Regarding Foreign Jurisdictions that Prevent Inspections|147|\\n|PART III|PART III|\\n|Item 10. Directors, Executive Officers and Corporate Governance|147|\\n|Item 11. Executive Compensation|147|\\n|Item 12. Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters|148|\\n|Item 13. Certain Relationships and Related Transactions, and Director Independence|148|\\n|Item 14. Principal Accounting Fees and Services|148|\\n|PART IV|PART IV|\\n|Item 15. Exhibits, Financial Statement Schedules|148|\\n|Item 16. Form 10-K Summary|148|\\n|Exhibit Index|149|\\n|Signatures|152|', mimetype='text/plain', start_char_idx=None, end_char_idx=None, metadata_seperator='\\n', text_template='{metadata_str}\\n\\n{content}')]" 116 | ] 117 | }, 118 | "execution_count": null, 119 | "metadata": {}, 120 | "output_type": "execute_result" 121 | } 122 | ], 123 | "source": [ 124 | "documents" 125 | ] 126 | } 127 | ], 128 | "metadata": { 129 | "kernelspec": { 130 | "display_name": "llamacloud", 131 | "language": "python", 132 | "name": "llamacloud" 133 | }, 134 | "language_info": { 135 | "codemirror_mode": { 136 | "name": "ipython", 137 | "version": 3 138 | }, 139 | "file_extension": ".py", 140 | "mimetype": "text/x-python", 141 | "name": "python", 142 | "nbconvert_exporter": "python", 143 | "pygments_lexer": "ipython3" 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 2 148 | } 149 | -------------------------------------------------------------------------------- /examples/parse/excel/references/query1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/excel/references/query1.png -------------------------------------------------------------------------------- /examples/parse/excel/references/query2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/excel/references/query2.png -------------------------------------------------------------------------------- /examples/parse/excel/references/query3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/excel/references/query3.png -------------------------------------------------------------------------------- /examples/parse/excel/references/query4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/excel/references/query4.png -------------------------------------------------------------------------------- /examples/parse/excel/references/query5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/excel/references/query5.png -------------------------------------------------------------------------------- /examples/parse/excel/references/recursive_retrieval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/excel/references/recursive_retrieval.png 
-------------------------------------------------------------------------------- /examples/parse/json_tour_screenshots/32778fb0-9e83-4b00-aebe-0d7f59ff0b5f-img_p0_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/json_tour_screenshots/32778fb0-9e83-4b00-aebe-0d7f59ff0b5f-img_p0_1.png -------------------------------------------------------------------------------- /examples/parse/json_tour_screenshots/32778fb0-9e83-4b00-aebe-0d7f59ff0b5f-page_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/json_tour_screenshots/32778fb0-9e83-4b00-aebe-0d7f59ff0b5f-page_1.jpg -------------------------------------------------------------------------------- /examples/parse/json_tour_screenshots/img_p0_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/json_tour_screenshots/img_p0_1.png -------------------------------------------------------------------------------- /examples/parse/json_tour_screenshots/links_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/json_tour_screenshots/links_page.png -------------------------------------------------------------------------------- /examples/parse/json_tour_screenshots/page_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/json_tour_screenshots/page_1.png 
-------------------------------------------------------------------------------- /examples/parse/json_tour_screenshots/page_35.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/json_tour_screenshots/page_35.png -------------------------------------------------------------------------------- /examples/parse/knowledge_graphs/sf2023_budget_kg_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/knowledge_graphs/sf2023_budget_kg_screenshot.png -------------------------------------------------------------------------------- /examples/parse/multimodal/XC9500_CPLD_Family_p3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/multimodal/XC9500_CPLD_Family_p3.png -------------------------------------------------------------------------------- /examples/parse/multimodal/llama2-p33.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/multimodal/llama2-p33.png -------------------------------------------------------------------------------- /examples/parse/multimodal/llama3.1-p5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/multimodal/llama3.1-p5.png -------------------------------------------------------------------------------- /examples/parse/multimodal/multimodal_contextual_retrieval_rag_img.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/multimodal/multimodal_contextual_retrieval_rag_img.png -------------------------------------------------------------------------------- /examples/parse/multimodal/multimodal_rag_slide_deck_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/multimodal/multimodal_rag_slide_deck_img.png -------------------------------------------------------------------------------- /examples/parse/multimodal/multimodal_report_generation_agent_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/multimodal/multimodal_report_generation_agent_img.png -------------------------------------------------------------------------------- /examples/parse/other_files/demo_ppt_financial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# LlamaParse - Parsing Financial Powerpoints 📊\n", 8 | "\n", 9 | "In this cookbook we show you how to use LlamaParse to parse a financial powerpoint." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Installation\n", 17 | "\n", 18 | "Parsing instruction are part of the LlamaParse API. 
They can be access by directly specifying the parsing_instruction parameter in the API or by using LlamaParse python module (which we will use for this tutorial).\n", 19 | "\n", 20 | "To install llama-parse, just get it from `pip`:" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "%pip install llama-index\n", 30 | "%pip install llama-cloud-services\n", 31 | "%pip install torch transformers python-pptx Pillow" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "## API Key\n", 39 | "\n", 40 | "The use of LlamaParse requires an API key which you can get here: https://cloud.llamaindex.ai/parse" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "import os\n", 50 | "\n", 51 | "os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"llx-...\"\n", 52 | "os.environ[\"OPENAI_API_KEY\"] = \"sk-...\"" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "**NOTE**: Since LlamaParse is natively async, running the sync code in a notebook requires the use of nest_asyncio.\n" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "import nest_asyncio\n", 69 | "\n", 70 | "nest_asyncio.apply()" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## Importing the package\n", 78 | "\n", 79 | "To import llama_parse simply do:" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "from llama_cloud_services import LlamaParse" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "## Using LlamaParse to Parse Presentations\n", 96 | "\n", 97 | "Like Powerpoints, presentations are often 
hard to extract for RAG. With LlamaParse we can now parse them and unclock their content of presentations for RAG.\n", 98 | "\n", 99 | "Let's download a financial report from the World Meteorological Association." 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "! mkdir data; wget \"https://meetings.wmo.int/Cg-19/PublishingImages/SitePages/FINAC-43/7%20-%20EC-77-Doc%205%20Financial%20Statements%20for%202022%20(FINAC).pptx\" -O data/presentation.pptx" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "### Parsing the presentation\n", 116 | "\n", 117 | "Now let's parse it into Markdown with LlamaParse and the default LlamaIndex parser.\n", 118 | "\n", 119 | "\n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "#### Llama Index default" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "from llama_index.core import SimpleDirectoryReader\n", 136 | "\n", 137 | "vanilla_documents = SimpleDirectoryReader(\"./data/\").load_data()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "#### Llama Parse" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "Started parsing the file under job_id 56724c0d-e45a-4e30-ae8c-e416173c608a\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "llama_parse_documents = LlamaParse(result_type=\"markdown\").load_data(\n", 162 | " \"./data/presentation.pptx\"\n", 163 | ")" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "Let's take a look at the parsed output from an example slide (see 
image below).\n", 171 | "\n", 172 | "As we can see the table is faithfully extracted!" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "ation and mitigation\n", 185 | "---\n", 186 | "|Item|31 Dec 2022|31 Dec 2021|Change|\n", 187 | "|---|---|---|---|\n", 188 | "|Payables and accruals|4,685|4,066|619|\n", 189 | "|Employee benefits|127,215|84,676|42,539|\n", 190 | "|Contributions received in advance|6,975|10,192|(3,217)|\n", 191 | "|Unearned revenue from exchange transactions|20|651|(631)|\n", 192 | "|Deferred Revenue|71,301|55,737|15,564|\n", 193 | "|Borrowings|28,229|29,002|(773)|\n", 194 | "|Funds held in trust|30,373|29,014|1,359|\n", 195 | "|Provisions|1,706|1,910|(204)|\n", 196 | "|Total Liabilities|270,504|215,248|55,256|\n", 197 | "---\n", 198 | "## Liabilities\n", 199 | "\n", 200 | "Employee Ben\n" 201 | ] 202 | } 203 | ], 204 | "source": [ 205 | "print(llama_parse_documents[0].get_content()[-2800:-2300])" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "Compared against the original slide image.\n", 213 | "![Demo](demo_ppt_financial_1.png)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "## Comparing the two for RAG\n", 221 | "\n", 222 | "The main difference between LlamaParse and the previous directory reader approach, is that LlamaParse will extract the document in a structured format, allowing better RAG." 
223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "### Query Engine on SimpleDirectoryReader results" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n", 239 | "\n", 240 | "vanilla_index = VectorStoreIndex.from_documents(vanilla_documents)\n", 241 | "vanilla_query_engine = vanilla_index.as_query_engine()" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "### Query Engine on LlamaParse Results\n" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "llama_parse_index = VectorStoreIndex.from_documents(llama_parse_documents)\n", 258 | "llama_parse_query_engine = llama_parse_index.as_query_engine()" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "### Liability provision\n", 266 | "What was the liability provision as of Dec 31 2021?\n", 267 | "\n", 268 | "" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "name": "stdout", 278 | "output_type": "stream", 279 | "text": [ 280 | "The liability provision as of December 31, 2021, included Employee Benefit Liabilities, Contributions received in advance (assessed contributions), and Deferred revenue.\n" 281 | ] 282 | } 283 | ], 284 | "source": [ 285 | "vanilla_response = vanilla_query_engine.query(\n", 286 | " \"What was the liability provision as of Dec 31 2021?\"\n", 287 | ")\n", 288 | "print(vanilla_response)" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "name": "stdout", 298 | "output_type": "stream", 299 | "text": [ 300 | 
"The liability provision as of December 31, 2021, was 1,910 CHF.\n" 301 | ] 302 | } 303 | ], 304 | "source": [ 305 | "llama_parse_response = llama_parse_query_engine.query(\n", 306 | " \"What was the liability provision as of Dec 31 2021?\"\n", 307 | ")\n", 308 | "print(llama_parse_response)" 309 | ] 310 | } 311 | ], 312 | "metadata": { 313 | "colab": { 314 | "provenance": [] 315 | }, 316 | "kernelspec": { 317 | "display_name": "llama_parse", 318 | "language": "python", 319 | "name": "llama_parse" 320 | }, 321 | "language_info": { 322 | "codemirror_mode": { 323 | "name": "ipython", 324 | "version": 3 325 | }, 326 | "file_extension": ".py", 327 | "mimetype": "text/x-python", 328 | "name": "python", 329 | "nbconvert_exporter": "python", 330 | "pygments_lexer": "ipython3" 331 | } 332 | }, 333 | "nbformat": 4, 334 | "nbformat_minor": 4 335 | } 336 | -------------------------------------------------------------------------------- /examples/parse/parsing_instructions/expense_report_document.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_instructions/expense_report_document.pdf -------------------------------------------------------------------------------- /examples/parse/parsing_instructions/expense_report_document.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_instructions/expense_report_document.png -------------------------------------------------------------------------------- /examples/parse/parsing_instructions/mcdonalds_receipt.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_instructions/mcdonalds_receipt.png -------------------------------------------------------------------------------- /examples/parse/parsing_instructions/purchase_order_document.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_instructions/purchase_order_document.pdf -------------------------------------------------------------------------------- /examples/parse/parsing_instructions/purchase_order_document.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_instructions/purchase_order_document.png -------------------------------------------------------------------------------- /examples/parse/parsing_modes/diagram.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_modes/diagram.jpg -------------------------------------------------------------------------------- /examples/parse/parsing_modes/layout_agent_citation_engine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_modes/layout_agent_citation_engine.png -------------------------------------------------------------------------------- /examples/parse/parsing_modes/layout_agent_moe.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_modes/layout_agent_moe.png -------------------------------------------------------------------------------- /examples/parse/parsing_modes/layout_agent_parse_explainer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_modes/layout_agent_parse_explainer.png -------------------------------------------------------------------------------- /examples/parse/parsing_modes/mermaid_render.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_modes/mermaid_render.png -------------------------------------------------------------------------------- /examples/parse/parsing_modes/page_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_modes/page_1.png -------------------------------------------------------------------------------- /examples/parse/parsing_modes/page_11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_modes/page_11.png -------------------------------------------------------------------------------- /examples/parse/parsing_modes/page_14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_modes/page_14.png 
-------------------------------------------------------------------------------- /examples/parse/parsing_modes/page_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/parsing_modes/page_3.png -------------------------------------------------------------------------------- /examples/parse/report_generation/rfp_response/generate_rfp_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/report_generation/rfp_response/generate_rfp_img.png -------------------------------------------------------------------------------- /examples/parse/test_tesla_impact_report/2019-tesla-impact-report-short.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/examples/parse/test_tesla_impact_report/2019-tesla-impact-report-short.pdf -------------------------------------------------------------------------------- /extract.md: -------------------------------------------------------------------------------- 1 | # LlamaExtract 2 | 3 | LlamaExtract provides a simple API for extracting structured data from unstructured documents like PDFs, text files and images. 
4 | 5 | ## Quick Start 6 | 7 | ```python 8 | from llama_cloud_services import LlamaExtract 9 | from pydantic import BaseModel, Field 10 | 11 | # Initialize client 12 | extractor = LlamaExtract() 13 | 14 | 15 | # Define schema using Pydantic 16 | class Resume(BaseModel): 17 | name: str = Field(description="Full name of candidate") 18 | email: str = Field(description="Email address") 19 | skills: list[str] = Field(description="Technical skills and technologies") 20 | 21 | 22 | # Create extraction agent 23 | agent = extractor.create_agent(name="resume-parser", data_schema=Resume) 24 | 25 | # Extract data from document 26 | result = agent.extract("resume.pdf") 27 | print(result.data) 28 | ``` 29 | 30 | ## Core Concepts 31 | 32 | - **Extraction Agents**: Reusable extractors configured with a specific schema and extraction settings. 33 | - **Data Schema**: Structure definition for the data you want to extract in the form of a JSON schema or a Pydantic model. 34 | - **Extraction Jobs**: Asynchronous extraction tasks that can be monitored. 
35 | 36 | ## Defining Schemas 37 | 38 | Schemas can be defined using either Pydantic models or JSON Schema: 39 | 40 | ### Using Pydantic (Recommended) 41 | 42 | ```python 43 | from pydantic import BaseModel, Field 44 | from typing import List, Optional 45 | 46 | 47 | class Experience(BaseModel): 48 | company: str = Field(description="Company name") 49 | title: str = Field(description="Job title") 50 | start_date: Optional[str] = Field(description="Start date of employment") 51 | end_date: Optional[str] = Field(description="End date of employment") 52 | 53 | 54 | class Resume(BaseModel): 55 | name: str = Field(description="Candidate name") 56 | experience: List[Experience] = Field(description="Work history") 57 | ``` 58 | 59 | ### Using JSON Schema 60 | 61 | ```python 62 | schema = { 63 | "type": "object", 64 | "properties": { 65 | "name": {"type": "string", "description": "Candidate name"}, 66 | "experience": { 67 | "type": "array", 68 | "description": "Work history", 69 | "items": { 70 | "type": "object", 71 | "properties": { 72 | "company": { 73 | "type": "string", 74 | "description": "Company name", 75 | }, 76 | "title": {"type": "string", "description": "Job title"}, 77 | "start_date": { 78 | "anyOf": [{"type": "string"}, {"type": "null"}], 79 | "description": "Start date of employment", 80 | }, 81 | "end_date": { 82 | "anyOf": [{"type": "string"}, {"type": "null"}], 83 | "description": "End date of employment", 84 | }, 85 | }, 86 | }, 87 | }, 88 | }, 89 | } 90 | 91 | agent = extractor.create_agent(name="resume-parser", data_schema=schema) 92 | ``` 93 | 94 | ### Important restrictions on JSON/Pydantic Schema 95 | 96 | _LlamaExtract only supports a subset of the JSON Schema specification._ While limited, it should 97 | be sufficient for a wide variety of use-cases. 98 | 99 | - All fields are required by default. Nullable fields must be explicitly marked as such, 100 | using `anyOf` with a `null` type. See `"start_date"` field above. 
101 | - Root node must be of type `object`. 102 | - Schema nesting must be limited to within 5 levels. 103 | - The important fields are key names/titles, type and description. Fields for 104 | formatting, default values, etc. are **not supported**. If you need these, you can add the 105 | restrictions to your field description and/or use a post-processing step. e.g. default values can be supported by making a field optional and then setting `"null"` values from the extraction result to the default value. 106 | - There are other restrictions on number of keys, size of the schema, etc. that you may 107 | hit for complex extraction use cases. In such cases, it is worth thinking how to restructure 108 | your extraction workflow to fit within these constraints, e.g. by extracting subset of fields 109 | and later merging them together. 110 | 111 | ## Other Extraction APIs 112 | 113 | ### Extraction over bytes or text 114 | 115 | You can use the `SourceText` class to extract from bytes or text directly without using a file. If passing the file bytes, 116 | you will need to pass the filename to the `SourceText` class. 
117 | 118 | ```python 119 | with open("resume.pdf", "rb") as f: 120 | file_bytes = f.read() 121 | result = agent.extract(SourceText(file=file_bytes, filename="resume.pdf")) 122 | ``` 123 | 124 | ```python 125 | result = agent.extract( 126 | SourceText(text_content="Candidate Name: Jane Doe") 127 | ) 128 | ``` 129 | 130 | ### Batch Processing 131 | 132 | Process multiple files asynchronously: 133 | 134 | ```python 135 | # Queue multiple files for extraction 136 | jobs = await agent.queue_extraction(["resume1.pdf", "resume2.pdf"]) 137 | 138 | # Check job status 139 | for job in jobs: 140 | status = agent.get_extraction_job(job.id).status 141 | print(f"Job {job.id}: {status}") 142 | 143 | # Get results when complete 144 | results = [agent.get_extraction_run_for_job(job.id) for job in jobs] 145 | ``` 146 | 147 | ### Updating Schemas 148 | 149 | Schemas can be modified and updated after creation: 150 | 151 | ```python 152 | # Update schema 153 | agent.data_schema = new_schema 154 | 155 | # Save changes 156 | agent.save() 157 | ``` 158 | 159 | ### Managing Agents 160 | 161 | ```python 162 | # List all agents 163 | agents = extractor.list_agents() 164 | 165 | # Get specific agent 166 | agent = extractor.get_agent(name="resume-parser") 167 | 168 | # Delete agent 169 | extractor.delete_agent(agent.id) 170 | ``` 171 | 172 | ## Installation 173 | 174 | ```bash 175 | pip install llama-extract==0.1.0 176 | ``` 177 | 178 | ## Tips & Best Practices 179 | 180 | At the core of LlamaExtract is the schema, which defines the structure of the data you want to extract from your documents. 181 | 182 | 1. **Schema Design**: 183 | 184 | - Try to limit schema nesting to 3-4 levels. 185 | - Make fields optional when data might not always be present. Having required fields may force the model 186 | to hallucinate when these fields are not present in the documents. 187 | - When you want to extract a variable number of entities, use an `array` type. 
However, note that you cannot use 188 | an `array` type for the root node. 189 | - Use descriptive field names and detailed descriptions. Use descriptions to pass formatting 190 | instructions or few-shot examples. 191 | - Above all, start simple and iteratively build your schema to incorporate requirements. 192 | 193 | 2. **Running Extractions**: 194 | - Note that resetting `agent.data_schema` will not save the schema to the database, 195 | until you call `agent.save`, but it will be used for running extractions. 196 | - Check job status prior to accessing results. Any extraction error should be available as 197 | part of `job.error` or `extraction_run.error` fields for debugging. 198 | - Consider async operations (`queue_extraction`) for large-scale extraction once you have finalized your schema. 199 | 200 | ### Hitting "The response was too long to be processed" Error 201 | 202 | This implies that the extraction response is hitting output token limits of the LLM. In such cases, it is worth rethinking the design of your schema to enable a more efficient/scalable extraction. e.g. 203 | 204 | - Instead of one field that extracts a complex object, you can use multiple fields to distribute the extraction logic. 205 | - You can also use multiple schemas to extract different subsets of fields from the same document and merge them later. 206 | 207 | Another option (orthogonal to the above) is to break the document into smaller sections and extract from each section individually, when possible. LlamaExtract will in most cases be able to handle both document and schema chunking automatically, but there are cases where you may need to do this manually. 
208 | 209 | ## Additional Resources 210 | 211 | - [Example Notebook](examples/resume_screening.ipynb) - Detailed walkthrough of resume parsing 212 | - [Discord Community](https://discord.com/invite/eN6D2HQ4aX) - Get help and share feedback 213 | -------------------------------------------------------------------------------- /llama_cloud_services/__init__.py: -------------------------------------------------------------------------------- 1 | from llama_cloud_services.parse import LlamaParse 2 | from llama_cloud_services.report import ReportClient, LlamaReport 3 | from llama_cloud_services.extract import LlamaExtract, ExtractionAgent 4 | from llama_cloud_services.constants import EU_BASE_URL 5 | 6 | __all__ = [ 7 | "LlamaParse", 8 | "ReportClient", 9 | "LlamaReport", 10 | "LlamaExtract", 11 | "ExtractionAgent", 12 | "EU_BASE_URL", 13 | ] 14 | -------------------------------------------------------------------------------- /llama_cloud_services/constants.py: -------------------------------------------------------------------------------- 1 | EU_BASE_URL = "https://api.cloud.eu.llamaindex.ai" 2 | -------------------------------------------------------------------------------- /llama_cloud_services/extract/__init__.py: -------------------------------------------------------------------------------- 1 | from llama_cloud_services.extract.extract import ( 2 | LlamaExtract, 3 | ExtractConfig, 4 | ExtractionAgent, 5 | SourceText, 6 | ExtractTarget, 7 | ExtractMode, 8 | ) 9 | 10 | __all__ = [ 11 | "LlamaExtract", 12 | "ExtractionAgent", 13 | "SourceText", 14 | "ExtractConfig", 15 | "ExtractTarget", 16 | "ExtractMode", 17 | ] 18 | -------------------------------------------------------------------------------- /llama_cloud_services/extract/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Union, Generator 2 | from contextlib import contextmanager 3 | 4 | # Asyncio error messages 5 | 
nest_asyncio_err = "cannot be called from a running event loop" 6 | nest_asyncio_msg = ( 7 | "The event loop is already running. " 8 | "Add `import nest_asyncio; nest_asyncio.apply()` to your code to fix this issue." 9 | ) 10 | 11 | 12 | def is_jupyter() -> bool: 13 | """Check if we're running in a Jupyter environment.""" 14 | try: 15 | from IPython import get_ipython 16 | 17 | return get_ipython().__class__.__name__ == "ZMQInteractiveShell" 18 | except (ImportError, AttributeError): 19 | return False 20 | 21 | 22 | @contextmanager 23 | def augment_async_errors() -> Generator[None, None, None]: 24 | """Context manager to add helpful information for errors due to nested event loops.""" 25 | try: 26 | yield 27 | except RuntimeError as e: 28 | if nest_asyncio_err in str(e): 29 | raise RuntimeError(nest_asyncio_msg) 30 | raise 31 | 32 | 33 | JSONType = Union[Dict[str, Any], List[Any], str, int, float, bool, None] 34 | JSONObjectType = Dict[str, JSONType] 35 | 36 | 37 | class ExperimentalWarning(Warning): 38 | """Warning for experimental features.""" 39 | 40 | pass 41 | -------------------------------------------------------------------------------- /llama_cloud_services/parse/__init__.py: -------------------------------------------------------------------------------- 1 | from llama_cloud_services.parse.base import ( 2 | LlamaParse, 3 | ResultType, 4 | ParsingMode, 5 | FailedPageMode, 6 | ) 7 | 8 | __all__ = ["LlamaParse", "ResultType", "ParsingMode", "FailedPageMode"] 9 | -------------------------------------------------------------------------------- /llama_cloud_services/parse/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/llama_cloud_services/parse/cli/__init__.py -------------------------------------------------------------------------------- /llama_cloud_services/parse/cli/main.py: 
-------------------------------------------------------------------------------- 1 | import click 2 | import json 3 | from enum import Enum 4 | from pathlib import Path 5 | from pydantic.fields import FieldInfo 6 | from typing import Any, Callable, List 7 | 8 | from llama_cloud_services.parse.base import LlamaParse 9 | 10 | 11 | def pydantic_field_to_click_option(name: str, field: FieldInfo) -> click.Option: 12 | """Convert a Pydantic field to a Click option.""" 13 | kwargs = { 14 | "default": field.default if field.default else None, 15 | "help": field.description, 16 | } 17 | 18 | if isinstance(kwargs["default"], Enum): 19 | kwargs["default"] = kwargs["default"].value 20 | 21 | if field.annotation is bool: 22 | kwargs["is_flag"] = True 23 | if field.default and field.default is True: 24 | name = f"no-{name}" 25 | return click.option(f'--{name.replace("_", "-")}', **kwargs) 26 | 27 | 28 | def add_options(options: List[click.Option]) -> Callable: 29 | def _add_options(func: Callable) -> Callable: 30 | for option in reversed(options): 31 | func = option(func) 32 | return func 33 | 34 | return _add_options 35 | 36 | 37 | @click.command() 38 | @click.argument("file_paths", nargs=-1, type=click.Path(exists=True, path_type=Path)) 39 | @click.option( 40 | "--output-file", type=click.Path(path_type=Path), help="Path to save the output" 41 | ) 42 | @click.option("--output-raw-json", is_flag=True, help="Output the raw JSON result") 43 | @add_options( 44 | [ 45 | pydantic_field_to_click_option(name, field) 46 | for name, field in LlamaParse.model_fields.items() 47 | if name not in ["custom_client"] 48 | ] 49 | ) 50 | def parse(**kwargs: Any) -> None: 51 | """Parse files using LlamaParse and output the results.""" 52 | file_paths = kwargs.pop("file_paths") 53 | output_file = kwargs.pop("output_file") 54 | output_raw_json = kwargs.pop("output_raw_json") 55 | 56 | # Remove None values to use LlamaParse defaults 57 | kwargs = {k: v for k, v in kwargs.items() if v is not None} 58 
| 59 | # Remove no- prefix for boolean flags 60 | kwargs = {k.replace("no_", ""): v for k, v in kwargs.items()} 61 | 62 | parser = LlamaParse(**kwargs) 63 | if output_raw_json: 64 | results = parser.get_json_result(list(file_paths)) 65 | 66 | if output_file: 67 | with output_file.open("w") as f: 68 | json.dump(results, f) 69 | click.echo(f"Results saved to {output_file}") 70 | else: 71 | click.echo(results) 72 | else: 73 | results = parser.load_data(list(file_paths)) 74 | 75 | if output_file: 76 | with output_file.open("w") as f: 77 | for i, doc in enumerate(results): 78 | f.write(f"File: {doc.metadata.get('file_path', 'Unknown')}\n") # type: ignore 79 | f.write(doc.text) # type: ignore 80 | if i < len(results) - 1: 81 | f.write("\n\n---\n\n") 82 | click.echo(f"Results saved to {output_file}") 83 | else: 84 | for i, doc in enumerate(results): 85 | click.echo(f"File: {doc.metadata.get('file_path', 'Unknown')}") # type: ignore 86 | click.echo(doc.text) # type: ignore 87 | if i < len(results) - 1: 88 | click.echo("\n---\n") 89 | 90 | 91 | if __name__ == "__main__": 92 | parse() 93 | -------------------------------------------------------------------------------- /llama_cloud_services/parse/utils.py: -------------------------------------------------------------------------------- 1 | import httpx 2 | import logging 3 | from enum import Enum 4 | from tenacity import ( 5 | retry, 6 | stop_after_attempt, 7 | wait_exponential, 8 | retry_if_exception, 9 | before_sleep_log, 10 | ) 11 | from typing import Any 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | # Asyncio error messages 16 | nest_asyncio_err = "cannot be called from a running event loop" 17 | nest_asyncio_msg = "The event loop is already running. Add `import nest_asyncio; nest_asyncio.apply()` to your code to fix this issue." 
18 | 19 | 20 | class ResultType(str, Enum): 21 | """The result type for the parser.""" 22 | 23 | TXT = "text" 24 | MD = "markdown" 25 | JSON = "json" 26 | STRUCTURED = "structured" 27 | 28 | 29 | class ParsingMode(str, Enum): 30 | """The parsing mode for the parser.""" 31 | 32 | parse_page_without_llm = "parse_page_without_llm" 33 | parse_page_with_llm = "parse_page_with_llm" 34 | parse_page_with_lvm = "parse_page_with_lvm" 35 | parse_page_with_agent = "parse_page_with_agent" 36 | parse_document_with_llm = "parse_document_with_llm" 37 | parse_document_with_agent = "parse_document_with_agent" 38 | 39 | 40 | class FailedPageMode(str, Enum): 41 | """ 42 | Enum for representing the different available page error handling modes 43 | """ 44 | 45 | raw_text = "raw_text" 46 | blank_page = "blank_page" 47 | error_message = "error_message" 48 | 49 | 50 | class Language(str, Enum): 51 | BAZA = "abq" 52 | ADYGHE = "ady" 53 | AFRIKAANS = "af" 54 | ANGIKA = "ang" 55 | ARABIC = "ar" 56 | ASSAMESE = "as" 57 | AVAR = "ava" 58 | AZERBAIJANI = "az" 59 | BELARUSIAN = "be" 60 | BULGARIAN = "bg" 61 | BIHARI = "bh" 62 | BHOJPURI = "bho" 63 | BENGALI = "bn" 64 | BOSNIAN = "bs" 65 | SIMPLIFIED_CHINESE = "ch_sim" 66 | TRADITIONAL_CHINESE = "ch_tra" 67 | CHECHEN = "che" 68 | CZECH = "cs" 69 | WELSH = "cy" 70 | DANISH = "da" 71 | DARGWA = "dar" 72 | GERMAN = "de" 73 | ENGLISH = "en" 74 | SPANISH = "es" 75 | ESTONIAN = "et" 76 | PERSIAN_FARSI = "fa" 77 | FRENCH = "fr" 78 | IRISH = "ga" 79 | GOAN_KONKANI = "gom" 80 | HINDI = "hi" 81 | CROATIAN = "hr" 82 | HUNGARIAN = "hu" 83 | INDONESIAN = "id" 84 | INGUSH = "inh" 85 | ICELANDIC = "is" 86 | ITALIAN = "it" 87 | JAPANESE = "ja" 88 | KABARDIAN = "kbd" 89 | KANNADA = "kn" 90 | KOREAN = "ko" 91 | KURDISH = "ku" 92 | LATIN = "la" 93 | LAK = "lbe" 94 | LEZGHIAN = "lez" 95 | LITHUANIAN = "lt" 96 | LATVIAN = "lv" 97 | MAGAHI = "mah" 98 | MAITHILI = "mai" 99 | MAORI = "mi" 100 | MONGOLIAN = "mn" 101 | MARATHI = "mr" 102 | MALAY = "ms" 103 | MALTESE = 
"mt" 104 | NEPALI = "ne" 105 | NEWARI = "new" 106 | DUTCH = "nl" 107 | NORWEGIAN = "no" 108 | OCCITAN = "oc" 109 | PALI = "pi" 110 | POLISH = "pl" 111 | PORTUGUESE = "pt" 112 | ROMANIAN = "ro" 113 | RUSSIAN = "ru" 114 | SERBIAN_CYRILLIC = "rs_cyrillic" 115 | SERBIAN_LATIN = "rs_latin" 116 | NAGPURI = "sck" 117 | SLOVAK = "sk" 118 | SLOVENIAN = "sl" 119 | ALBANIAN = "sq" 120 | SWEDISH = "sv" 121 | SWAHILI = "sw" 122 | TAMIL = "ta" 123 | TABASSARAN = "tab" 124 | TELUGU = "te" 125 | THAI = "th" 126 | TAJIK = "tjk" 127 | TAGALOG = "tl" 128 | TURKISH = "tr" 129 | UYGHUR = "ug" 130 | UKRAINIAN = "uk" 131 | URDU = "ur" 132 | UZBEK = "uz" 133 | VIETNAMESE = "vi" 134 | 135 | 136 | SUPPORTED_FILE_TYPES = [ 137 | ".pdf", 138 | # document and presentations 139 | ".602", 140 | ".abw", 141 | ".cgm", 142 | ".cwk", 143 | ".doc", 144 | ".docx", 145 | ".docm", 146 | ".dot", 147 | ".dotm", 148 | ".hwp", 149 | ".key", 150 | ".lwp", 151 | ".mw", 152 | ".mcw", 153 | ".pages", 154 | ".pbd", 155 | ".ppt", 156 | ".pptm", 157 | ".pptx", 158 | ".pot", 159 | ".potm", 160 | ".potx", 161 | ".rtf", 162 | ".sda", 163 | ".sdd", 164 | ".sdp", 165 | ".sdw", 166 | ".sgl", 167 | ".sti", 168 | ".sxi", 169 | ".sxw", 170 | ".stw", 171 | ".sxg", 172 | ".txt", 173 | ".uof", 174 | ".uop", 175 | ".uot", 176 | ".vor", 177 | ".wpd", 178 | ".wps", 179 | ".xml", 180 | ".zabw", 181 | ".epub", 182 | # images 183 | ".jpg", 184 | ".jpeg", 185 | ".png", 186 | ".gif", 187 | ".bmp", 188 | ".svg", 189 | ".tiff", 190 | ".webp", 191 | # web 192 | ".htm", 193 | ".html", 194 | # spreadsheets 195 | ".xlsx", 196 | ".xls", 197 | ".xlsm", 198 | ".xlsb", 199 | ".xlw", 200 | ".csv", 201 | ".dif", 202 | ".sylk", 203 | ".slk", 204 | ".prn", 205 | ".numbers", 206 | ".et", 207 | ".ods", 208 | ".fods", 209 | ".uos1", 210 | ".uos2", 211 | ".dbf", 212 | ".wk1", 213 | ".wk2", 214 | ".wk3", 215 | ".wk4", 216 | ".wks", 217 | ".123", 218 | ".wq1", 219 | ".wq2", 220 | ".wb1", 221 | ".wb2", 222 | ".wb3", 223 | ".qpw", 224 | ".xlr", 225 | 
".eth", 226 | ".tsv", 227 | ".mp3", 228 | ".mp4", 229 | ".mpeg", 230 | ".mpga", 231 | ".m4a", 232 | ".wav", 233 | ".webm", 234 | ] 235 | 236 | 237 | def should_retry(exception: Exception) -> bool: 238 | """Check if the exception should be retried. 239 | 240 | Args: 241 | exception: The exception to check. 242 | """ 243 | # Retry on connection errors (network issues) 244 | if isinstance( 245 | exception, 246 | ( 247 | httpx.ConnectError, 248 | httpx.ConnectTimeout, 249 | httpx.ReadTimeout, 250 | httpx.WriteTimeout, 251 | httpx.RemoteProtocolError, 252 | ), 253 | ): 254 | return True 255 | 256 | # Retry on specific HTTP status codes 257 | if isinstance(exception, httpx.HTTPStatusError): 258 | status_code = exception.response.status_code 259 | # Retry on rate limiting or temporary server errors 260 | return status_code in (429, 500, 502, 503, 504) 261 | 262 | return False 263 | 264 | 265 | async def make_api_request( 266 | client: httpx.AsyncClient, 267 | method: str, 268 | url: str, 269 | timeout: float = 60.0, 270 | max_retries: int = 5, 271 | **httpx_kwargs: Any, 272 | ) -> httpx.Response: 273 | """Make a retrying API request to the LlamaParse API. 274 | 275 | Args: 276 | client: The httpx.AsyncClient to use for the request. 277 | method: The HTTP method to use ("GET" or "POST"). 278 | url: The URL to request. 279 | timeout: The timeout for the request. 280 | max_retries: The maximum number of retries for the request. 
281 | """ 282 | 283 | @retry( 284 | stop=stop_after_attempt(max_retries), 285 | wait=wait_exponential(multiplier=1, min=4, max=timeout), 286 | retry=retry_if_exception(should_retry), 287 | before_sleep=before_sleep_log(logger, logging.WARNING), 288 | ) 289 | async def _make_request(url: str, **httpx_kwargs: Any) -> httpx.Response: 290 | if method == "GET": 291 | response = await client.get(url, **httpx_kwargs) 292 | elif method == "POST": 293 | response = await client.post(url, **httpx_kwargs) 294 | else: 295 | raise ValueError(f"Invalid method: {method}") 296 | response.raise_for_status() 297 | return response 298 | 299 | return await _make_request(url, **httpx_kwargs) 300 | -------------------------------------------------------------------------------- /llama_cloud_services/report/__init__.py: -------------------------------------------------------------------------------- 1 | from llama_cloud_services.report.report import ReportClient 2 | from llama_cloud_services.report.base import LlamaReport 3 | 4 | __all__ = ["ReportClient", "LlamaReport"] 5 | -------------------------------------------------------------------------------- /llama_cloud_services/report/base.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import httpx 3 | import os 4 | import io 5 | from concurrent.futures import ThreadPoolExecutor 6 | from typing import Optional, List, Union, Any, Coroutine, TypeVar 7 | from urllib.parse import urljoin 8 | 9 | from llama_cloud.types import ReportMetadata 10 | from llama_cloud_services.report.report import ReportClient 11 | 12 | T = TypeVar("T") 13 | 14 | 15 | class LlamaReport: 16 | """Client for managing reports and general report operations.""" 17 | 18 | def __init__( 19 | self, 20 | api_key: Optional[str] = None, 21 | project_id: Optional[str] = None, 22 | organization_id: Optional[str] = None, 23 | base_url: Optional[str] = None, 24 | timeout: Optional[int] = None, 25 | async_httpx_client: 
Optional[httpx.AsyncClient] = None, 26 | ): 27 | self.api_key = api_key or os.getenv("LLAMA_CLOUD_API_KEY", None) 28 | if not self.api_key: 29 | raise ValueError("No API key provided.") 30 | 31 | self.base_url = base_url or os.getenv( 32 | "LLAMA_CLOUD_BASE_URL", "https://api.cloud.llamaindex.ai" 33 | ) 34 | self.timeout = timeout or 60 35 | 36 | # Initialize HTTP clients 37 | self._aclient = async_httpx_client or httpx.AsyncClient(timeout=self.timeout) 38 | 39 | # Set auth headers 40 | self.headers = { 41 | "Authorization": f"Bearer {self.api_key}", 42 | } 43 | 44 | self.organization_id = organization_id 45 | self.project_id = project_id 46 | self._client_params = { 47 | "timeout": self._aclient.timeout, 48 | "headers": self._aclient.headers, 49 | "base_url": self._aclient.base_url, 50 | "auth": self._aclient.auth, 51 | "event_hooks": self._aclient.event_hooks, 52 | "cookies": self._aclient.cookies, 53 | "max_redirects": self._aclient.max_redirects, 54 | "params": self._aclient.params, 55 | "trust_env": self._aclient.trust_env, 56 | } 57 | self._thread_pool = ThreadPoolExecutor( 58 | max_workers=min(10, (os.cpu_count() or 1) + 4) 59 | ) 60 | 61 | @property 62 | def aclient(self) -> httpx.AsyncClient: 63 | if self._aclient is None: 64 | self._aclient = httpx.AsyncClient(**self._client_params) 65 | return self._aclient 66 | 67 | def _run_sync(self, coro: Coroutine[Any, Any, T]) -> T: 68 | """Run coroutine in a separate thread to avoid event loop issues""" 69 | 70 | # force a new client for this thread/event loop 71 | original_client = self._aclient 72 | self._aclient = None 73 | 74 | def run_coro() -> T: 75 | async def wrapped_coro() -> T: 76 | return await coro 77 | 78 | return asyncio.run(wrapped_coro()) 79 | 80 | result = self._thread_pool.submit(run_coro).result() 81 | 82 | # restore the original client 83 | self._aclient = original_client 84 | 85 | return result 86 | 87 | async def _get_default_project(self) -> str: 88 | response = await self.aclient.get( 89 | 
urljoin(str(self.base_url), "/api/v1/projects"), headers=self.headers 90 | ) 91 | response.raise_for_status() 92 | projects = response.json() 93 | default_project = [p for p in projects if p.get("is_default")] 94 | return default_project[0]["id"] 95 | 96 | async def _build_url( 97 | self, endpoint: str, extra_params: Optional[List[str]] = None 98 | ) -> str: 99 | """Helper method to build URLs with common query parameters.""" 100 | url = urljoin(str(self.base_url), endpoint) 101 | 102 | if not self.project_id: 103 | self.project_id = await self._get_default_project() 104 | 105 | query_params = [] 106 | if self.organization_id: 107 | query_params.append(f"organization_id={self.organization_id}") 108 | if self.project_id: 109 | query_params.append(f"project_id={self.project_id}") 110 | if extra_params: 111 | query_params.extend([p for p in extra_params if p is not None]) 112 | 113 | if query_params: 114 | url += "?" + "&".join(query_params) 115 | 116 | return url 117 | 118 | async def acreate_report( 119 | self, 120 | name: str, 121 | template_instructions: Optional[str] = None, 122 | template_text: Optional[str] = None, 123 | template_file: Optional[Union[str, tuple[str, bytes]]] = None, 124 | input_files: Optional[List[Union[str, tuple[str, bytes]]]] = None, 125 | existing_retriever_id: Optional[str] = None, 126 | ) -> ReportClient: 127 | """Create a new report asynchronously.""" 128 | url = await self._build_url("/api/v1/reports/") 129 | open_files: List[io.BufferedReader] = [] 130 | 131 | data = {"name": name} 132 | if template_instructions: 133 | data["template_instructions"] = template_instructions 134 | if template_text: 135 | data["template_text"] = template_text 136 | if existing_retriever_id: 137 | data["existing_retriever_id"] = str(existing_retriever_id) 138 | 139 | files: List[tuple[str, io.BufferedReader | bytes]] = [] 140 | if template_file: 141 | if isinstance(template_file, str): 142 | open_files.append(open(template_file, "rb")) 143 | 
files.append(("template_file", open_files[-1])) 144 | else: 145 | files.append(("template_file", template_file[1])) 146 | 147 | if input_files: 148 | for f in input_files: 149 | if isinstance(f, str): 150 | open_files.append(open(f, "rb")) 151 | files.append(("files", open_files[-1])) 152 | else: 153 | files.append(("files", f[1])) 154 | 155 | response = await self.aclient.post( 156 | url, headers=self.headers, data=data, files=files 157 | ) 158 | try: 159 | response.raise_for_status() 160 | report_id = response.json()["id"] 161 | return ReportClient(report_id, name, self) 162 | except httpx.HTTPStatusError as e: 163 | raise ValueError( 164 | f"Failed to create report: {e.response.text}\nError Code: {e.response.status_code}" 165 | ) 166 | finally: 167 | for open_file in open_files: 168 | open_file.close() 169 | 170 | def create_report( 171 | self, 172 | name: str, 173 | template_instructions: Optional[str] = None, 174 | template_text: Optional[str] = None, 175 | template_file: Optional[Union[str, tuple[str, bytes]]] = None, 176 | input_files: Optional[List[Union[str, tuple[str, bytes]]]] = None, 177 | existing_retriever_id: Optional[str] = None, 178 | ) -> ReportClient: 179 | """Create a new report.""" 180 | return self._run_sync( 181 | self.acreate_report( 182 | name=name, 183 | template_instructions=template_instructions, 184 | template_text=template_text, 185 | template_file=template_file, 186 | input_files=input_files, 187 | existing_retriever_id=existing_retriever_id, 188 | ) 189 | ) 190 | 191 | async def alist_reports( 192 | self, state: Optional[str] = None, limit: int = 100, offset: int = 0 193 | ) -> List[ReportClient]: 194 | """List all reports asynchronously.""" 195 | params = [] 196 | if state: 197 | params.append(f"state={state}") 198 | if limit: 199 | params.append(f"limit={limit}") 200 | if offset: 201 | params.append(f"offset={offset}") 202 | 203 | url = await self._build_url( 204 | "/api/v1/reports/list", 205 | extra_params=params, 206 | ) 207 | 
208 | response = await self.aclient.get(url, headers=self.headers) 209 | response.raise_for_status() 210 | data = response.json() 211 | 212 | return [ 213 | ReportClient(r["report_id"], r["name"], self) 214 | for r in data["report_responses"] 215 | ] 216 | 217 | def list_reports( 218 | self, state: Optional[str] = None, limit: int = 100, offset: int = 0 219 | ) -> List[ReportClient]: 220 | """Synchronous wrapper for listing reports.""" 221 | return self._run_sync(self.alist_reports(state, limit, offset)) 222 | 223 | async def aget_report(self, report_id: str) -> ReportClient: 224 | """Get a Report instance for working with a specific report.""" 225 | url = await self._build_url(f"/api/v1/reports/{report_id}") 226 | 227 | response = await self.aclient.get(url, headers=self.headers) 228 | response.raise_for_status() 229 | data = response.json() 230 | 231 | return ReportClient(data["report_id"], data["name"], self) 232 | 233 | def get_report(self, report_id: str) -> ReportClient: 234 | """Synchronous wrapper for getting a report.""" 235 | return self._run_sync(self.aget_report(report_id)) 236 | 237 | async def aget_report_metadata(self, report_id: str) -> ReportMetadata: 238 | """Get metadata for a specific report asynchronously. 
239 | 240 | Returns: 241 | dict containing: 242 | - id: Report ID 243 | - name: Report name 244 | - state: Current report state 245 | - report_metadata: Additional metadata 246 | - template_file: Name of template file if used 247 | - template_instructions: Template instructions if provided 248 | - input_files: List of input file names 249 | """ 250 | url = await self._build_url(f"/api/v1/reports/{report_id}/metadata") 251 | 252 | response = await self.aclient.get(url, headers=self.headers) 253 | response.raise_for_status() 254 | return ReportMetadata(**response.json()) 255 | 256 | def get_report_metadata(self, report_id: str) -> ReportMetadata: 257 | """Synchronous wrapper for getting report metadata.""" 258 | return self._run_sync(self.aget_report_metadata(report_id)) 259 | 260 | async def adelete_report(self, report_id: str) -> None: 261 | """Delete a specific report asynchronously.""" 262 | url = await self._build_url(f"/api/v1/reports/{report_id}") 263 | 264 | response = await self.aclient.delete(url, headers=self.headers) 265 | response.raise_for_status() 266 | 267 | def delete_report(self, report_id: str) -> None: 268 | """Synchronous wrapper for deleting a report.""" 269 | return self._run_sync(self.adelete_report(report_id)) 270 | -------------------------------------------------------------------------------- /llama_parse/README.md: -------------------------------------------------------------------------------- 1 | # LlamaParse 2 | 3 | [![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-parse)](https://pypi.org/project/llama-parse/) 4 | [![GitHub contributors](https://img.shields.io/github/contributors/run-llama/llama_parse)](https://github.com/run-llama/llama_parse/graphs/contributors) 5 | [![Discord](https://img.shields.io/discord/1059199217496772688)](https://discord.gg/dGcwcsnxhU) 6 | 7 | LlamaParse is a **GenAI-native document parser** that can parse complex document data for any downstream LLM use case (RAG, agents). 
8 | 9 | It is really good at the following: 10 | 11 | - ✅ **Broad file type support**: Parsing a variety of unstructured file types (.pdf, .pptx, .docx, .xlsx, .html) with text, tables, visual elements, weird layouts, and more. 12 | - ✅ **Table recognition**: Parsing embedded tables accurately into text and semi-structured representations. 13 | - ✅ **Multimodal parsing and chunking**: Extracting visual elements (images/diagrams) into structured formats and return image chunks using the latest multimodal models. 14 | - ✅ **Custom parsing**: Input custom prompt instructions to customize the output the way you want it. 15 | 16 | LlamaParse directly integrates with [LlamaIndex](https://github.com/run-llama/llama_index). 17 | 18 | The free plan is up to 1000 pages a day. Paid plan is free 7k pages per week + 0.3c per additional page by default. There is a sandbox available to test the API [**https://cloud.llamaindex.ai/parse ↗**](https://cloud.llamaindex.ai/parse). 19 | 20 | Read below for some quickstart information, or see the [full documentation](https://docs.cloud.llamaindex.ai/). 21 | 22 | If you're a company interested in enterprise RAG solutions, and/or high volume/on-prem usage of LlamaParse, come [talk to us](https://www.llamaindex.ai/contact). 23 | 24 | ## Getting Started 25 | 26 | First, login and get an api-key from [**https://cloud.llamaindex.ai/api-key ↗**](https://cloud.llamaindex.ai/api-key). 27 | 28 | Then, make sure you have the latest LlamaIndex version installed. 29 | 30 | **NOTE:** If you are upgrading from v0.9.X, we recommend following our [migration guide](https://pretty-sodium-5e0.notion.site/v0-10-0-Migration-Guide-6ede431dcb8841b09ea171e7f133bd77), as well as uninstalling your previous version first. 
31 | 32 | ``` 33 | pip uninstall llama-index # run this if upgrading from v0.9.x or older 34 | pip install -U llama-index --upgrade --no-cache-dir --force-reinstall 35 | ``` 36 | 37 | Lastly, install the package: 38 | 39 | `pip install llama-parse` 40 | 41 | Now you can parse your first PDF file using the command line interface. Use the command `llama-parse [file_paths]`. See the help text with `llama-parse --help`. 42 | 43 | ```bash 44 | export LLAMA_CLOUD_API_KEY='llx-...' 45 | 46 | # output as text 47 | llama-parse my_file.pdf --result-type text --output-file output.txt 48 | 49 | # output as markdown 50 | llama-parse my_file.pdf --result-type markdown --output-file output.md 51 | 52 | # output as raw json 53 | llama-parse my_file.pdf --output-raw-json --output-file output.json 54 | ``` 55 | 56 | You can also create simple scripts: 57 | 58 | ```python 59 | import nest_asyncio 60 | 61 | nest_asyncio.apply() 62 | 63 | from llama_parse import LlamaParse 64 | 65 | parser = LlamaParse( 66 | api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY 67 | result_type="markdown", # "markdown" and "text" are available 68 | num_workers=4, # if multiple files passed, split in `num_workers` API calls 69 | verbose=True, 70 | language="en", # Optionally you can define a language, default=en 71 | ) 72 | 73 | # sync 74 | documents = parser.load_data("./my_file.pdf") 75 | 76 | # sync batch 77 | documents = parser.load_data(["./my_file1.pdf", "./my_file2.pdf"]) 78 | 79 | # async 80 | documents = await parser.aload_data("./my_file.pdf") 81 | 82 | # async batch 83 | documents = await parser.aload_data(["./my_file1.pdf", "./my_file2.pdf"]) 84 | ``` 85 | 86 | ## Using with file object 87 | 88 | You can parse a file object directly: 89 | 90 | ```python 91 | import nest_asyncio 92 | 93 | nest_asyncio.apply() 94 | 95 | from llama_parse import LlamaParse 96 | 97 | parser = LlamaParse( 98 | api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY 99 | 
result_type="markdown", # "markdown" and "text" are available 100 | num_workers=4, # if multiple files passed, split in `num_workers` API calls 101 | verbose=True, 102 | language="en", # Optionally you can define a language, default=en 103 | ) 104 | 105 | file_name = "my_file1.pdf" 106 | extra_info = {"file_name": file_name} 107 | 108 | with open(f"./{file_name}", "rb") as f: 109 | # must provide extra_info with file_name key with passing file object 110 | documents = parser.load_data(f, extra_info=extra_info) 111 | 112 | # you can also pass file bytes directly 113 | with open(f"./{file_name}", "rb") as f: 114 | file_bytes = f.read() 115 | # must provide extra_info with file_name key with passing file bytes 116 | documents = parser.load_data(file_bytes, extra_info=extra_info) 117 | ``` 118 | 119 | ## Using with `SimpleDirectoryReader` 120 | 121 | You can also integrate the parser as the default PDF loader in `SimpleDirectoryReader`: 122 | 123 | ```python 124 | import nest_asyncio 125 | 126 | nest_asyncio.apply() 127 | 128 | from llama_parse import LlamaParse 129 | from llama_index.core import SimpleDirectoryReader 130 | 131 | parser = LlamaParse( 132 | api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY 133 | result_type="markdown", # "markdown" and "text" are available 134 | verbose=True, 135 | ) 136 | 137 | file_extractor = {".pdf": parser} 138 | documents = SimpleDirectoryReader( 139 | "./data", file_extractor=file_extractor 140 | ).load_data() 141 | ``` 142 | 143 | Full documentation for `SimpleDirectoryReader` can be found on the [LlamaIndex Documentation](https://docs.llamaindex.ai/en/stable/module_guides/loading/simpledirectoryreader.html). 
144 | 145 | ## Examples 146 | 147 | Several end-to-end indexing examples can be found in the examples folder 148 | 149 | - [Getting Started](/examples/parse/demo_basic.ipynb) 150 | - [Advanced RAG Example](/examples/parse/demo_advanced.ipynb) 151 | - [Raw API Usage](/examples/parse/demo_api.ipynb) 152 | 153 | ## Documentation 154 | 155 | [https://docs.cloud.llamaindex.ai/](https://docs.cloud.llamaindex.ai/) 156 | 157 | ## Terms of Service 158 | 159 | See the [Terms of Service Here](./TOS.pdf). 160 | 161 | ## Get in Touch (LlamaCloud) 162 | 163 | LlamaParse is part of LlamaCloud, our e2e enterprise RAG platform that provides out-of-the-box, production-ready connectors, indexing, and retrieval over your complex data sources. We offer SaaS and VPC options. 164 | 165 | LlamaCloud is currently available via waitlist (join by [creating an account](https://cloud.llamaindex.ai/)). If you're interested in state-of-the-art quality and in centralizing your RAG efforts, come [get in touch with us](https://www.llamaindex.ai/contact). 
166 | -------------------------------------------------------------------------------- /llama_parse/llama_parse/__init__.py: -------------------------------------------------------------------------------- 1 | from llama_cloud_services.parse import ( 2 | LlamaParse, 3 | ResultType, 4 | ParsingMode, 5 | FailedPageMode, 6 | ) 7 | 8 | __all__ = ["LlamaParse", "ResultType", "ParsingMode", "FailedPageMode"] 9 | -------------------------------------------------------------------------------- /llama_parse/llama_parse/base.py: -------------------------------------------------------------------------------- 1 | from llama_cloud_services.parse.base import ( 2 | LlamaParse, 3 | ResultType, 4 | ParsingMode, 5 | FailedPageMode, 6 | FileInput, 7 | _DEFAULT_SEPARATOR, 8 | JOB_RESULT_URL, 9 | JOB_STATUS_ROUTE, 10 | JOB_UPLOAD_ROUTE, 11 | ) 12 | 13 | __all__ = [ 14 | "LlamaParse", 15 | "ResultType", 16 | "FileInput", 17 | "ParsingMode", 18 | "FailedPageMode", 19 | "_DEFAULT_SEPARATOR", 20 | "JOB_RESULT_URL", 21 | "JOB_STATUS_ROUTE", 22 | "JOB_UPLOAD_ROUTE", 23 | ] 24 | -------------------------------------------------------------------------------- /llama_parse/llama_parse/cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/llama_parse/llama_parse/cli/__init__.py -------------------------------------------------------------------------------- /llama_parse/llama_parse/cli/main.py: -------------------------------------------------------------------------------- 1 | from llama_cloud_services.parse.cli.main import parse 2 | 3 | if __name__ == "__main__": 4 | parse() 5 | -------------------------------------------------------------------------------- /llama_parse/llama_parse/utils.py: -------------------------------------------------------------------------------- 1 | from llama_cloud_services.parse.utils import ( 2 | 
SUPPORTED_FILE_TYPES, 3 | Language, 4 | ResultType, 5 | ParsingMode, 6 | FailedPageMode, 7 | ) 8 | 9 | __all__ = [ 10 | "SUPPORTED_FILE_TYPES", 11 | "Language", 12 | "ResultType", 13 | "ParsingMode", 14 | "FailedPageMode", 15 | ] 16 | -------------------------------------------------------------------------------- /llama_parse/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["poetry-core"] 3 | build-backend = "poetry.core.masonry.api" 4 | 5 | [tool.poetry] 6 | name = "llama-parse" 7 | version = "0.6.25" 8 | description = "Parse files into RAG-Optimized formats." 9 | authors = ["Logan Markewich "] 10 | license = "MIT" 11 | readme = "README.md" 12 | packages = [{include = "llama_parse"}] 13 | 14 | [tool.poetry.dependencies] 15 | python = ">=3.9,<4.0" 16 | llama-cloud-services = ">=0.6.24" 17 | 18 | [tool.poetry.group.dev.dependencies] 19 | pytest = "^8.0.0" 20 | pytest-asyncio = "*" 21 | ipykernel = "^6.29.0" 22 | 23 | [tool.poetry.scripts] 24 | llama-parse = "llama_parse.cli.main:parse" 25 | -------------------------------------------------------------------------------- /parse.md: -------------------------------------------------------------------------------- 1 | # LlamaParse 2 | 3 | LlamaParse is a **GenAI-native document parser** that can parse complex document data for any downstream LLM use case (RAG, agents). 4 | 5 | It is really good at the following: 6 | 7 | - ✅ **Broad file type support**: Parsing a variety of unstructured file types (.pdf, .pptx, .docx, .xlsx, .html) with text, tables, visual elements, weird layouts, and more. 8 | - ✅ **Table recognition**: Parsing embedded tables accurately into text and semi-structured representations. 9 | - ✅ **Multimodal parsing and chunking**: Extracting visual elements (images/diagrams) into structured formats and return image chunks using the latest multimodal models. 
10 | - ✅ **Custom parsing**: Input custom prompt instructions to customize the output the way you want it. 11 | 12 | LlamaParse directly integrates with [LlamaIndex](https://github.com/run-llama/llama_index). 13 | 14 | The free plan is up to 1000 pages a day. Paid plan is free 7k pages per week + 0.3c per additional page by default. There is a sandbox available to test the API [**https://cloud.llamaindex.ai/parse ↗**](https://cloud.llamaindex.ai/parse). 15 | 16 | Read below for some quickstart information, or see the [full documentation](https://docs.cloud.llamaindex.ai/). 17 | 18 | If you're a company interested in enterprise RAG solutions, and/or high volume/on-prem usage of LlamaParse, come [talk to us](https://www.llamaindex.ai/contact). 19 | 20 | ## Getting Started 21 | 22 | First, login and get an api-key from [**https://cloud.llamaindex.ai/api-key ↗**](https://cloud.llamaindex.ai/api-key). 23 | 24 | Then, install the package: 25 | 26 | `pip install llama-cloud-services` 27 | 28 | ## CLI Usage 29 | 30 | Now you can parse your first PDF file using the command line interface. Use the command `llama-parse [file_paths]`. See the help text with `llama-parse --help`. 31 | 32 | ```bash 33 | export LLAMA_CLOUD_API_KEY='llx-...' 
34 | 35 | # output as text 36 | llama-parse my_file.pdf --result-type text --output-file output.txt 37 | 38 | # output as markdown 39 | llama-parse my_file.pdf --result-type markdown --output-file output.md 40 | 41 | # output as raw json 42 | llama-parse my_file.pdf --output-raw-json --output-file output.json 43 | ``` 44 | 45 | ## Python Usage 46 | 47 | You can also create simple scripts: 48 | 49 | ```python 50 | from llama_cloud_services import LlamaParse 51 | 52 | parser = LlamaParse( 53 | api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY 54 | num_workers=4, # if multiple files passed, split in `num_workers` API calls 55 | verbose=True, 56 | language="en", # Optionally you can define a language, default=en 57 | ) 58 | 59 | # sync 60 | result = parser.parse("./my_file.pdf") 61 | 62 | # sync batch 63 | results = parser.parse(["./my_file1.pdf", "./my_file2.pdf"]) 64 | 65 | # async 66 | result = await parser.aparse("./my_file.pdf") 67 | 68 | # async batch 69 | results = await parser.aparse(["./my_file1.pdf", "./my_file2.pdf"]) 70 | ``` 71 | 72 | The result object is a fully typed `JobResult` object, and you can interact with it to parse and transform various parts of the result: 73 | 74 | ```python 75 | # get the llama-index markdown documents 76 | markdown_documents = result.get_markdown_documents(split_by_page=True) 77 | 78 | # get the llama-index text documents 79 | text_documents = result.get_text_documents(split_by_page=False) 80 | 81 | # get the image documents 82 | image_documents = result.get_image_documents( 83 | include_screenshot_images=True, 84 | include_object_images=False, 85 | # Optional: download the images to a directory 86 | # (default is to return the image bytes in ImageDocument objects) 87 | image_download_dir="./images", 88 | ) 89 | 90 | # access the raw job result 91 | # Items will vary based on the parser configuration 92 | for page in result.pages: 93 | print(page.text) 94 | print(page.md) 95 | print(page.images) 96 | 
print(page.layout) 97 | print(page.structuredData) 98 | ``` 99 | 100 | See more details about the result object in the [example notebook](./examples/parse/demo_json_tour.ipynb). 101 | 102 | ### Using with file object / bytes 103 | 104 | You can parse a file object directly: 105 | 106 | ```python 107 | from llama_cloud_services import LlamaParse 108 | 109 | parser = LlamaParse( 110 | api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY 111 | num_workers=4, # if multiple files passed, split in `num_workers` API calls 112 | verbose=True, 113 | language="en", # Optionally you can define a language, default=en 114 | ) 115 | 116 | file_name = "my_file1.pdf" 117 | extra_info = {"file_name": file_name} 118 | 119 | with open(f"./{file_name}", "rb") as f: 120 | # must provide extra_info with file_name key with passing file object 121 | result = parser.parse(f, extra_info=extra_info) 122 | 123 | # you can also pass file bytes directly 124 | with open(f"./{file_name}", "rb") as f: 125 | file_bytes = f.read() 126 | # must provide extra_info with file_name key with passing file bytes 127 | result = parser.parse(file_bytes, extra_info=extra_info) 128 | ``` 129 | 130 | ### Using with `SimpleDirectoryReader` 131 | 132 | You can also integrate the parser as the default PDF loader in `SimpleDirectoryReader`: 133 | 134 | ```python 135 | from llama_cloud_services import LlamaParse 136 | from llama_index.core import SimpleDirectoryReader 137 | 138 | parser = LlamaParse( 139 | api_key="llx-...", # can also be set in your env as LLAMA_CLOUD_API_KEY 140 | result_type="markdown", # "markdown" and "text" are available 141 | verbose=True, 142 | ) 143 | 144 | file_extractor = {".pdf": parser} 145 | documents = SimpleDirectoryReader( 146 | "./data", file_extractor=file_extractor 147 | ).load_data() 148 | ``` 149 | 150 | Full documentation for `SimpleDirectoryReader` can be found on the [LlamaIndex 
Documentation](https://docs.llamaindex.ai/en/stable/module_guides/loading/simpledirectoryreader.html). 151 | 152 | ## Examples 153 | 154 | Several end-to-end indexing examples can be found in the examples folder 155 | 156 | - [Getting Started](examples/parse/demo_basic.ipynb) 157 | - [Advanced RAG Example](examples/parse/demo_advanced.ipynb) 158 | - [Raw API Usage](examples/parse/demo_api.ipynb) 159 | - [Result Object Tour](examples/parse/demo_json_tour.ipynb) 160 | 161 | ## Documentation 162 | 163 | [https://docs.cloud.llamaindex.ai/](https://docs.cloud.llamaindex.ai/) 164 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["poetry-core"] 3 | build-backend = "poetry.core.masonry.api" 4 | 5 | [tool.mypy] 6 | files = ["llama_cloud_services"] 7 | python_version = "3.10" 8 | 9 | [tool.poetry] 10 | name = "llama-cloud-services" 11 | version = "0.6.25" 12 | description = "Tailored SDK clients for LlamaCloud services." 
13 | authors = ["Logan Markewich "] 14 | license = "MIT" 15 | readme = "README.md" 16 | packages = [{include = "llama_cloud_services"}] 17 | 18 | [tool.poetry.dependencies] 19 | python = ">=3.9,<4.0" 20 | llama-index-core = ">=0.12.0" 21 | llama-cloud = "==0.1.23" 22 | pydantic = ">=2.8,!=2.10" 23 | click = "^8.1.7" 24 | python-dotenv = "^1.0.1" 25 | eval-type-backport = {python = "<3.10", version = "^0.2.0"} 26 | platformdirs = "^4.3.7" 27 | 28 | [tool.poetry.group.dev.dependencies] 29 | pytest = "^8.0.0" 30 | pytest-asyncio = "*" 31 | ipykernel = "^6.29.0" 32 | pre-commit = "3.2.0" 33 | autoevals = "^0.0.114" 34 | deepdiff = "^8.1.1" 35 | ipython = "^8.12.3" 36 | jupyter = "^1.1.1" 37 | mypy = "^1.14.1" 38 | 39 | [tool.poetry.scripts] 40 | llama-parse = "llama_cloud_services.parse.cli.main:parse" 41 | -------------------------------------------------------------------------------- /report.md: -------------------------------------------------------------------------------- 1 | # LlamaReport (beta/invite-only) 2 | 3 | LlamaReport is a prebuilt agentic report builder that can be used to build reports from a variety of data sources. 4 | 5 | The python SDK for interacting with the LlamaReport API. The SDK provides two main classes: 6 | 7 | - `LlamaReport`: For managing reports (create, list, delete) 8 | - `ReportClient`: For working with a specific report (editing, approving, etc.) 
9 | 10 | ## Quickstart 11 | 12 | ```bash 13 | pip install llama-cloud-services 14 | ``` 15 | 16 | ```python 17 | from llama_cloud_services import LlamaReport 18 | 19 | # Initialize the client 20 | client = LlamaReport( 21 | api_key="your-api-key", 22 | # Optional: Specify project_id, organization_id, async_httpx_client 23 | ) 24 | 25 | # Create a new report 26 | report = client.create_report( 27 | "My Report", 28 | # must have one of template_text or template_instructions 29 | template_text="Your template text", 30 | template_instructions="Instructions for the template", 31 | # must have one of input_files or existing_retriever_id 32 | input_files=["data1.pdf", "data2.pdf"], 33 | existing_retriever_id="retriever-id", 34 | ) 35 | ``` 36 | 37 | ## Working with Reports 38 | 39 | The typical workflow for a report involves: 40 | 41 | 1. Creating the report 42 | 2. Waiting for and approving the plan 43 | 3. Waiting for report generation 44 | 4. Making edits to the report 45 | 46 | Here's a complete example: 47 | 48 | ```python 49 | # Create a report 50 | report = client.create_report( 51 | "Quarterly Analysis", input_files=["q1_data.pdf", "q2_data.pdf"] 52 | ) 53 | 54 | # Wait for the plan to be ready 55 | plan = report.wait_for_plan() 56 | 57 | # Option 1: Directly approve the plan 58 | report.update_plan(action="approve") 59 | 60 | # Option 2: Suggest and review edits to the plan 61 | suggestions = report.suggest_edits( 62 | "Can you add a section about market trends?" 63 | ) 64 | for suggestion in suggestions: 65 | print(suggestion) 66 | 67 | # Accept or reject the suggestion 68 | if input("Accept? (y/n): ").lower() == "y": 69 | report.accept_edit(suggestion) 70 | else: 71 | report.reject_edit(suggestion) 72 | 73 | # Wait for the report to complete 74 | report = report.wait_for_completion() 75 | 76 | # Make edits to the final report 77 | suggestions = report.suggest_edits("Make the executive summary more concise") 78 | 79 | # Review and accept/reject suggestions as above 80 | ... 
81 | ``` 82 | 83 | ### Getting the Final Report 84 | 85 | Once you are satisfied with the report, you can get the final report object and use the content as you see fit. 86 | 87 | Here's an example of printing out the final report: 88 | 89 | ```python 90 | report = report.get() 91 | report_text = "\n\n".join([block.template for block in report.blocks]) 92 | 93 | print(report_text) 94 | ``` 95 | 96 | ## Additional Features 97 | 98 | - **Async Support**: All methods have async counterparts: `create_report` -> `acreate_report`, `wait_for_plan` -> `await_for_plan`, etc. 99 | - **Automatic Chat History**: The SDK automatically keeps track of chat history for each suggestion, unless you specify `auto_history=False` in `suggest_edits`. 100 | - **Custom HTTP Client**: You can provide your own `httpx.AsyncClient` to the `LlamaReport` class. 101 | - **Project and Organization IDs**: You can specify `project_id` and `organization_id` to use a specific project or organization. 102 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/__init__.py -------------------------------------------------------------------------------- /tests/extract/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/extract/__init__.py -------------------------------------------------------------------------------- /tests/extract/data/receipt/noisebridge_receipt.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/extract/data/receipt/noisebridge_receipt.pdf 
-------------------------------------------------------------------------------- /tests/extract/data/receipt/noisebridge_receipt.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "receiptNumber": "27215058", 3 | "invoiceNumber": "87B37C90152", 4 | "datePaid": "2024-07-19", 5 | "paymentMethod": { 6 | "type": "visa", 7 | "lastFourDigits": "7267" 8 | }, 9 | "merchant": { 10 | "name": "Noisebridge", 11 | "address": { 12 | "street": "272 Capp St", 13 | "city": "San Francisco", 14 | "state": "California", 15 | "postalCode": "94110", 16 | "country": "United States" 17 | }, 18 | "phone": "1 6507017829", 19 | "email": "treasurer+stripe@noisebridge.net" 20 | }, 21 | "billTo": "noisebridge@seldo.com", 22 | "items": [ 23 | { 24 | "description": "$10 / month", 25 | "quantity": 1, 26 | "unitPrice": 10.0, 27 | "amount": 10.0, 28 | "period": { 29 | "start": "2024-07-19", 30 | "end": "2024-08-19" 31 | } 32 | } 33 | ], 34 | "subtotal": 10.0, 35 | "total": 10.0, 36 | "amountPaid": 10.0 37 | } 38 | -------------------------------------------------------------------------------- /tests/extract/data/receipt/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "type": "object", 4 | "required": ["receiptNumber", "datePaid", "total", "items"], 5 | "properties": { 6 | "receiptNumber": { 7 | "type": "string" 8 | }, 9 | "invoiceNumber": { 10 | "type": "string" 11 | }, 12 | "datePaid": { 13 | "type": "string", 14 | "format": "date" 15 | }, 16 | "paymentMethod": { 17 | "type": "object", 18 | "properties": { 19 | "type": { 20 | "type": "string", 21 | "enum": ["visa", "mastercard", "amex", "cash", "other"] 22 | }, 23 | "lastFourDigits": { 24 | "type": "string", 25 | "pattern": "^[0-9]{4}$" 26 | } 27 | } 28 | }, 29 | "merchant": { 30 | "type": "object", 31 | "properties": { 32 | "name": { 33 | "type": "string" 34 | }, 35 | "address": { 36 
| "type": "object", 37 | "properties": { 38 | "street": { 39 | "type": "string" 40 | }, 41 | "city": { 42 | "type": "string" 43 | }, 44 | "state": { 45 | "type": "string" 46 | }, 47 | "postalCode": { 48 | "type": "string" 49 | }, 50 | "country": { 51 | "type": "string" 52 | } 53 | } 54 | }, 55 | "phone": { 56 | "type": "string" 57 | }, 58 | "email": { 59 | "type": "string", 60 | "format": "email" 61 | } 62 | } 63 | }, 64 | "billTo": { 65 | "type": "string", 66 | "format": "email" 67 | }, 68 | "items": { 69 | "type": "array", 70 | "items": { 71 | "type": "object", 72 | "required": [ 73 | "description", 74 | "quantity", 75 | "unitPrice", 76 | "amount", 77 | "period" 78 | ], 79 | "properties": { 80 | "description": { 81 | "type": "string" 82 | }, 83 | "quantity": { 84 | "type": "integer", 85 | "minimum": 1 86 | }, 87 | "unitPrice": { 88 | "type": "number", 89 | "minimum": 0 90 | }, 91 | "amount": { 92 | "type": "number", 93 | "minimum": 0 94 | }, 95 | "period": { 96 | "type": "object", 97 | "properties": { 98 | "start": { 99 | "type": "string", 100 | "format": "date" 101 | }, 102 | "end": { 103 | "type": "string", 104 | "format": "date" 105 | } 106 | } 107 | } 108 | } 109 | } 110 | }, 111 | "subtotal": { 112 | "type": "number", 113 | "minimum": 0 114 | }, 115 | "total": { 116 | "type": "number", 117 | "minimum": 0 118 | }, 119 | "amountPaid": { 120 | "type": "number", 121 | "minimum": 0 122 | } 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /tests/extract/data/resume/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "title": "Resume Schema", 4 | "type": "object", 5 | "required": ["basics", "skills", "experience"], 6 | "properties": { 7 | "basics": { 8 | "type": "object", 9 | "required": ["name", "email"], 10 | "properties": { 11 | "name": { 12 | "type": "string" 13 | }, 14 | "email": { 15 | "type": 
"string", 16 | "format": "email" 17 | }, 18 | "phone": { 19 | "type": "string" 20 | }, 21 | "location": { 22 | "type": "object", 23 | "properties": { 24 | "city": { 25 | "type": "string" 26 | }, 27 | "region": { 28 | "type": "string" 29 | }, 30 | "country": { 31 | "type": "string" 32 | } 33 | } 34 | }, 35 | "profiles": { 36 | "type": "array", 37 | "items": { 38 | "type": "object", 39 | "properties": { 40 | "network": { 41 | "type": "string" 42 | }, 43 | "url": { 44 | "type": "string", 45 | "format": "uri" 46 | } 47 | } 48 | } 49 | }, 50 | "summary": { 51 | "type": "string" 52 | } 53 | } 54 | }, 55 | "skills": { 56 | "type": "array", 57 | "items": { 58 | "type": "object", 59 | "properties": { 60 | "category": { 61 | "type": "string" 62 | }, 63 | "keywords": { 64 | "type": "array", 65 | "items": { 66 | "type": "string" 67 | } 68 | }, 69 | "level": { 70 | "type": "string", 71 | "enum": ["beginner", "intermediate", "advanced", "expert"] 72 | } 73 | } 74 | } 75 | }, 76 | "experience": { 77 | "type": "array", 78 | "items": { 79 | "type": "object", 80 | "required": ["company", "position", "startDate"], 81 | "properties": { 82 | "company": { 83 | "type": "string" 84 | }, 85 | "position": { 86 | "type": "string" 87 | }, 88 | "startDate": { 89 | "type": "string", 90 | "format": "date" 91 | }, 92 | "endDate": { 93 | "type": "string", 94 | "format": "date" 95 | }, 96 | "highlights": { 97 | "type": "array", 98 | "items": { 99 | "type": "string" 100 | } 101 | }, 102 | "technologies": { 103 | "type": "array", 104 | "items": { 105 | "type": "string" 106 | } 107 | } 108 | } 109 | } 110 | }, 111 | "education": { 112 | "type": "array", 113 | "items": { 114 | "type": "object", 115 | "required": ["institution", "degree"], 116 | "properties": { 117 | "institution": { 118 | "type": "string" 119 | }, 120 | "degree": { 121 | "type": "string" 122 | }, 123 | "field": { 124 | "type": "string" 125 | }, 126 | "graduationDate": { 127 | "type": "string", 128 | "format": "date" 129 | }, 130 | 
"gpa": { 131 | "type": "number" 132 | } 133 | } 134 | } 135 | }, 136 | "certifications": { 137 | "type": "array", 138 | "items": { 139 | "type": "object", 140 | "properties": { 141 | "name": { 142 | "type": "string" 143 | }, 144 | "issuer": { 145 | "type": "string" 146 | }, 147 | "date": { 148 | "type": "string", 149 | "format": "date" 150 | }, 151 | "validUntil": { 152 | "type": "string", 153 | "format": "date" 154 | } 155 | } 156 | } 157 | }, 158 | "publications": { 159 | "type": "array", 160 | "items": { 161 | "type": "object", 162 | "properties": { 163 | "title": { 164 | "type": "string" 165 | }, 166 | "publisher": { 167 | "type": "string" 168 | }, 169 | "date": { 170 | "type": "string", 171 | "format": "date" 172 | }, 173 | "url": { 174 | "type": "string", 175 | "format": "uri" 176 | } 177 | } 178 | } 179 | } 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /tests/extract/data/resume/software_architect_resume.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 116 | 117 | 118 | 119 |
120 | 171 | 172 |
173 |

Sarah Chen

174 |
Senior Software Architect
175 | 176 |
177 |

Professional Summary

178 |

179 | Innovative Software Architect with over 12 years of experience 180 | designing and implementing large-scale distributed systems. Proven 181 | track record of leading technical teams and delivering robust 182 | enterprise solutions. Expert in cloud architecture, microservices, 183 | and emerging technologies with a focus on scalable, maintainable 184 | systems. 185 |

186 |
187 | 188 |
189 |

Professional Experience

190 | 191 |
192 |
TechCorp Solutions
193 |
Senior Software Architect
194 |
2020 - Present
195 |
    196 |
  • 197 | Led architectural design and implementation of a cloud-native 198 | platform serving 2M+ users 199 |
  • 200 |
  • 201 | Established architectural guidelines and best practices adopted 202 | across 12 development teams 203 |
  • 204 |
  • 205 | Reduced system latency by 40% through implementation of 206 | event-driven architecture 207 |
  • 208 |
  • 209 | Mentored 15+ senior developers in cloud-native development 210 | practices 211 |
  • 212 |
213 |
214 | 215 |
216 |
DataFlow Systems
217 |
Lead Software Engineer
218 |
2016 - 2020
219 |
    220 |
  • 221 | Architected and led development of distributed data processing 222 | platform handling 5TB daily 223 |
  • 224 |
  • 225 | Designed microservices architecture reducing deployment time by 226 | 65% 227 |
  • 228 |
  • 229 | Led migration of legacy monolith to cloud-native architecture 230 |
  • 231 |
  • 232 | Managed team of 8 engineers across 3 international locations 233 |
  • 234 |
235 |
236 | 237 |
238 |
InnovateTech
239 |
Senior Software Engineer
240 |
2013 - 2016
241 |
    242 |
  • 243 | Developed high-performance trading platform processing 100K 244 | transactions per second 245 |
  • 246 |
  • 247 | Implemented real-time analytics engine reducing processing 248 | latency by 75% 249 |
  • 250 |
  • 251 | Led adoption of container orchestration reducing deployment 252 | costs by 35% 253 |
  • 254 |
255 |
256 |
257 | 258 |
259 |

Education

260 | 261 |
262 |
Stanford University
263 |
Master of Science in Computer Science
264 |
2013
265 |

Focus: Distributed Systems and Machine Learning

266 |
267 | 268 |
269 |
University of California, Berkeley
270 |
271 | Bachelor of Science in Computer Engineering 272 |
273 |
2011
274 |

Magna Cum Laude

275 |
276 |
277 | 278 |
279 |

Patents & Speaking

280 |
    281 |
  • 282 | Co-inventor on three patents for distributed systems architecture 283 |
  • 284 |
  • 285 | Published paper on "Scalable Microservices Architecture" at IEEE 286 | Cloud Computing Conference 2022 287 |
  • 288 |
  • 289 | Keynote Speaker, CloudCon 2023: "Future of Cloud-Native 290 | Architecture" 291 |
  • 292 |
  • Regular presenter at local tech meetups and conferences
  • 293 |
294 |
295 |
296 |
297 | 298 | 299 | -------------------------------------------------------------------------------- /tests/extract/data/resume/software_architect_resume.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "basics": { 3 | "name": "Sarah Chen", 4 | "email": "san.francisco@email.com", 5 | "phone": "(555) 123-4567", 6 | "location": { 7 | "city": "San Francisco", 8 | "region": "CA", 9 | "country": "USA" 10 | } 11 | }, 12 | "skills": [ 13 | { 14 | "category": "Architecture & Design", 15 | "keywords": [ 16 | "Microservices", 17 | "Event-Driven Architecture", 18 | "Domain-Driven Design", 19 | "REST APIs" 20 | ] 21 | }, 22 | { 23 | "category": "Cloud Platforms", 24 | "keywords": ["AWS", "Azure", "Google Cloud Platform"] 25 | }, 26 | { 27 | "category": "Programming Languages", 28 | "keywords": ["Java", "Python", "Go", "JavaScript", "TypeScript"] 29 | } 30 | ], 31 | "experience": [ 32 | { 33 | "company": "TechCorp Solutions", 34 | "position": "Senior Software Architect", 35 | "startDate": "2020-01-01", 36 | "endDate": "2024-01-10" 37 | }, 38 | { 39 | "company": "DataFlow Systems", 40 | "position": "Lead Software Engineer", 41 | "startDate": "2016-01-01", 42 | "endDate": "2019-12-31", 43 | "technologies": [ 44 | "Distributed Systems", 45 | "Microservices", 46 | "Cloud Migration" 47 | ] 48 | }, 49 | { 50 | "company": "InnovateTech", 51 | "position": "Senior Software Engineer", 52 | "startDate": "2013-01-01", 53 | "endDate": "2015-12-31", 54 | "technologies": [ 55 | "High-performance Computing", 56 | "Real-time Analytics", 57 | "Container Orchestration" 58 | ] 59 | } 60 | ], 61 | "education": [ 62 | { 63 | "institution": "Stanford University", 64 | "degree": "Master of Science", 65 | "field": "Computer Science", 66 | "graduationDate": "2013-01-01", 67 | "specialization": "Distributed Systems and Machine Learning" 68 | }, 69 | { 70 | "institution": "University of California, Berkeley", 71 | "degree": "Bachelor of Science", 72 | 
"field": "Computer Engineering", 73 | "graduationDate": "2011-01-01" 74 | } 75 | ], 76 | "certifications": [ 77 | { 78 | "name": "AWS Solutions Architect - Professional" 79 | }, 80 | { 81 | "name": "Google Cloud Architect" 82 | }, 83 | { 84 | "name": "Certified Kubernetes Administrator" 85 | } 86 | ], 87 | "publications": [ 88 | { 89 | "title": "Scalable Microservices Architecture", 90 | "publisher": "IEEE Cloud Computing Conference", 91 | "date": "2022-01-01" 92 | } 93 | ] 94 | } 95 | -------------------------------------------------------------------------------- /tests/extract/data/slide/saas_slide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/extract/data/slide/saas_slide.pdf -------------------------------------------------------------------------------- /tests/extract/data/slide/saas_slide.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "companyInfo": { 3 | "name": "CloudFlow Analytics", 4 | "fundingStage": "Series A", 5 | "foundedYear": null, 6 | "industry": null, 7 | "location": null 8 | }, 9 | "financialMetrics": { 10 | "mrr": { 11 | "value": 580000, 12 | "currency": "USD", 13 | "growthRate": 27 14 | }, 15 | "grossMargin": 88 16 | }, 17 | "growthMetrics": { 18 | "customers": { 19 | "total": 1247, 20 | "growth": 142, 21 | "enterprisePercent": null 22 | }, 23 | "nrr": 147 24 | }, 25 | "marketMetrics": { 26 | "tam": 50000000000, 27 | "sam": null, 28 | "marketShare": null, 29 | "competitors": null 30 | }, 31 | "differentiators": [ 32 | { 33 | "claim": "Processing Speed", 34 | "metric": "5x faster", 35 | "comparisonTarget": "competitors" 36 | }, 37 | { 38 | "claim": "ML Accuracy", 39 | "metric": "99.9%", 40 | "comparisonTarget": null 41 | }, 42 | { 43 | "claim": "Market Potential", 44 | "metric": "80%", 45 | "comparisonTarget": "Fortune 500" 46 | } 47 | ] 48 
| } 49 | -------------------------------------------------------------------------------- /tests/extract/data/slide/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "type": "object", 4 | "required": ["companyInfo", "financialMetrics", "growthMetrics"], 5 | "properties": { 6 | "companyInfo": { 7 | "type": "object", 8 | "required": ["name", "fundingStage"], 9 | "properties": { 10 | "name": { 11 | "type": "string" 12 | }, 13 | "fundingStage": { 14 | "type": "string", 15 | "enum": ["Pre-seed", "Seed", "Series A", "Series B", "Series C+"] 16 | }, 17 | "foundedYear": { 18 | "anyOf": [ 19 | { 20 | "type": "integer" 21 | }, 22 | { 23 | "type": "null" 24 | } 25 | ] 26 | }, 27 | "industry": { 28 | "anyOf": [ 29 | { 30 | "type": "string" 31 | }, 32 | { 33 | "type": "null" 34 | } 35 | ] 36 | }, 37 | "location": { 38 | "anyOf": [ 39 | { 40 | "type": "string" 41 | }, 42 | { 43 | "type": "null" 44 | } 45 | ] 46 | } 47 | } 48 | }, 49 | "financialMetrics": { 50 | "type": "object", 51 | "required": ["mrr", "growthRate"], 52 | "properties": { 53 | "mrr": { 54 | "type": "object", 55 | "description": "Monthly Recurring Revenue", 56 | "required": ["value", "currency", "growthRate"], 57 | "properties": { 58 | "value": { 59 | "type": "number" 60 | }, 61 | "currency": { 62 | "type": "string" 63 | }, 64 | "growthRate": { 65 | "type": "number" 66 | } 67 | } 68 | }, 69 | "grossMargin": { 70 | "type": "number" 71 | } 72 | } 73 | }, 74 | "growthMetrics": { 75 | "type": "object", 76 | "required": ["customers", "nrr"], 77 | "properties": { 78 | "customers": { 79 | "type": "object", 80 | "required": ["total", "growth"], 81 | "properties": { 82 | "total": { 83 | "type": "integer" 84 | }, 85 | "growth": { 86 | "type": "number" 87 | } 88 | } 89 | }, 90 | "nrr": { 91 | "description": "Net Revenue Retention", 92 | "type": "number" 93 | } 94 | } 95 | }, 96 | "differentiators": { 97 | "type": 
"array", 98 | "items": { 99 | "type": "object", 100 | "required": ["claim", "metric"], 101 | "properties": { 102 | "claim": { 103 | "type": "string" 104 | }, 105 | "metric": { 106 | "type": "string" 107 | }, 108 | "comparisonTarget": { 109 | "anyOf": [ 110 | { 111 | "type": "string" 112 | }, 113 | { 114 | "type": "null" 115 | } 116 | ] 117 | } 118 | } 119 | } 120 | } 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /tests/extract/test_benchmark.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | from llama_cloud_services.extract import LlamaExtract, ExtractionAgent 5 | from time import perf_counter 6 | from collections import namedtuple 7 | import json 8 | import uuid 9 | from llama_cloud.types import ( 10 | ExtractConfig, 11 | ExtractMode, 12 | LlamaParseParameters, 13 | LlamaExtractSettings, 14 | ) 15 | from tests.extract.util import load_test_dotenv 16 | 17 | 18 | load_test_dotenv() 19 | 20 | TEST_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") 21 | # Get configuration from environment 22 | LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY") 23 | LLAMA_CLOUD_BASE_URL = os.getenv("LLAMA_CLOUD_BASE_URL") 24 | LLAMA_CLOUD_PROJECT_ID = os.getenv("LLAMA_CLOUD_PROJECT_ID") 25 | 26 | TestCase = namedtuple( 27 | "TestCase", ["name", "schema_path", "config", "input_file", "expected_output"] 28 | ) 29 | 30 | 31 | def get_test_cases(): 32 | """Get all test cases from TEST_DIR. 
@pytest.fixture
def extraction_agent(test_case: TestCase, extractor: LlamaExtract):
    """Create an extraction agent for a single test case and delete it afterwards.

    Args:
        test_case: The TestCase whose schema and config drive agent creation.
        extractor: Session-scoped LlamaExtract client.

    Yields:
        The freshly created extraction agent.
    """
    # Create unique name with random UUID (important for CI to avoid conflicts)
    unique_id = uuid.uuid4().hex[:8]
    agent_name = f"{test_case.name}_{unique_id}"

    with open(test_case.schema_path, "r") as f:
        schema = json.load(f)

    # Clean up any existing agents with this name (best-effort; a failure
    # here should not abort the test)
    try:
        agents = extractor.list_agents()
        for agent in agents:
            if agent.name == agent_name:
                extractor.delete_agent(agent.id)
    except Exception as e:
        print(f"Warning: Failed to cleanup existing agent: {str(e)}")

    # Create new agent
    agent = extractor.create_agent(agent_name, schema, config=test_case.config)
    yield agent

    # Cleanup after test. Mirrors the identical fixture in
    # tests/extract/test_extract_e2e.py; the previous version had no
    # teardown and leaked the agent on the server after every benchmark run.
    try:
        extractor.delete_agent(agent.id)
    except Exception as e:
        print(f"Warning: Failed to delete agent {agent.id}: {str(e)}")
os.getenv("LLAMA_CLOUD_API_KEY") 13 | LLAMA_CLOUD_BASE_URL = os.getenv("LLAMA_CLOUD_BASE_URL") 14 | LLAMA_CLOUD_PROJECT_ID = os.getenv("LLAMA_CLOUD_PROJECT_ID") 15 | 16 | # Skip all tests if API key is not set 17 | pytestmark = pytest.mark.skipif( 18 | not LLAMA_CLOUD_API_KEY, reason="LLAMA_CLOUD_API_KEY not set" 19 | ) 20 | 21 | 22 | # Test data 23 | class TestSchema(BaseModel): 24 | title: str 25 | summary: str 26 | 27 | 28 | # Test data paths 29 | TEST_DIR = Path(__file__).parent / "data" 30 | TEST_PDF = TEST_DIR / "slide" / "saas_slide.pdf" 31 | 32 | 33 | @pytest.fixture 34 | def llama_extract(): 35 | return LlamaExtract( 36 | api_key=LLAMA_CLOUD_API_KEY, 37 | base_url=LLAMA_CLOUD_BASE_URL, 38 | project_id=LLAMA_CLOUD_PROJECT_ID, 39 | verbose=True, 40 | ) 41 | 42 | 43 | @pytest.fixture 44 | def test_agent_name(): 45 | return "test-api-agent" 46 | 47 | 48 | @pytest.fixture 49 | def test_schema_dict(): 50 | return { 51 | "type": "object", 52 | "properties": { 53 | "title": {"type": "string"}, 54 | "summary": {"type": "string"}, 55 | }, 56 | } 57 | 58 | 59 | @pytest.fixture 60 | def test_agent(llama_extract, test_agent_name, test_schema_dict, request): 61 | """Creates a test agent and cleans it up after the test""" 62 | test_id = request.node.nodeid 63 | test_hash = hex(hash(test_id))[-8:] 64 | base_name = test_agent_name 65 | 66 | base_name = next( 67 | (marker.args[0] for marker in request.node.iter_markers("agent_name")), 68 | base_name, 69 | ) 70 | name = f"{base_name}_{test_hash}" 71 | 72 | schema = next( 73 | ( 74 | marker.args[0][0] if isinstance(marker.args[0], tuple) else marker.args[0] 75 | for marker in request.node.iter_markers("agent_schema") 76 | ), 77 | test_schema_dict, 78 | ) 79 | 80 | # Cleanup existing agent 81 | try: 82 | for agent in llama_extract.list_agents(): 83 | if agent.name == name: 84 | llama_extract.delete_agent(agent.id) 85 | except Exception as e: 86 | print(f"Warning: Failed to cleanup existing agent: {e}") 87 | 88 | agent = 
llama_extract.create_agent(name=name, data_schema=schema) 89 | yield agent 90 | 91 | # Cleanup after test 92 | try: 93 | llama_extract.delete_agent(agent.id) 94 | except Exception as e: 95 | print(f"Warning: Failed to delete agent {agent.id}: {e}") 96 | 97 | 98 | class TestLlamaExtract: 99 | def test_init_without_api_key(self): 100 | env_backup = os.getenv("LLAMA_CLOUD_API_KEY") 101 | del os.environ["LLAMA_CLOUD_API_KEY"] 102 | with pytest.raises(ValueError, match="The API key is required"): 103 | LlamaExtract(api_key=None, base_url=LLAMA_CLOUD_BASE_URL) 104 | os.environ["LLAMA_CLOUD_API_KEY"] = env_backup 105 | 106 | @pytest.mark.agent_name("test-dict-schema-agent") 107 | def test_create_agent_with_dict_schema(self, test_agent): 108 | assert isinstance(test_agent, ExtractionAgent) 109 | 110 | @pytest.mark.agent_name("test-pydantic-schema-agent") 111 | @pytest.mark.agent_schema((TestSchema,)) 112 | def test_create_agent_with_pydantic_schema(self, test_agent): 113 | assert isinstance(test_agent, ExtractionAgent) 114 | 115 | def test_get_agent_by_name(self, llama_extract, test_agent): 116 | agent = llama_extract.get_agent(name=test_agent.name) 117 | assert isinstance(agent, ExtractionAgent) 118 | assert agent.name == test_agent.name 119 | assert agent.id == test_agent.id 120 | assert agent.data_schema == test_agent.data_schema 121 | 122 | def test_get_agent_by_id(self, llama_extract, test_agent): 123 | agent = llama_extract.get_agent(id=test_agent.id) 124 | assert isinstance(agent, ExtractionAgent) 125 | assert agent.id == test_agent.id 126 | assert agent.name == test_agent.name 127 | assert agent.data_schema == test_agent.data_schema 128 | 129 | def test_list_agents(self, llama_extract, test_agent): 130 | agents = llama_extract.list_agents() 131 | assert isinstance(agents, list) 132 | assert any(a.id == test_agent.id for a in agents) 133 | 134 | 135 | class TestExtractionAgent: 136 | @pytest.mark.asyncio 137 | async def test_extract_single_file(self, test_agent): 
138 | result = await test_agent.aextract(TEST_PDF) 139 | assert result.status == "SUCCESS" 140 | assert result.data is not None 141 | assert isinstance(result.data, dict) 142 | assert "title" in result.data 143 | assert "summary" in result.data 144 | 145 | def test_sync_extract_single_file(self, test_agent): 146 | result = test_agent.extract(TEST_PDF) 147 | assert result.status == "SUCCESS" 148 | assert result.data is not None 149 | assert isinstance(result.data, dict) 150 | assert "title" in result.data 151 | assert "summary" in result.data 152 | 153 | def test_extract_file_from_buffered_io(self, test_agent): 154 | result = test_agent.extract(SourceText(file=open(TEST_PDF, "rb"))) 155 | assert result.status == "SUCCESS" 156 | assert result.data is not None 157 | assert isinstance(result.data, dict) 158 | assert "title" in result.data 159 | assert "summary" in result.data 160 | 161 | def test_extract_file_from_bytes(self, test_agent): 162 | with open(TEST_PDF, "rb") as f: 163 | file_bytes = f.read() 164 | result = test_agent.extract(SourceText(file=file_bytes, filename=TEST_PDF.name)) 165 | assert result.status == "SUCCESS" 166 | assert result.data is not None 167 | assert isinstance(result.data, dict) 168 | assert "title" in result.data 169 | assert "summary" in result.data 170 | 171 | def test_extract_from_text_content(self, test_agent): 172 | TEST_TEXT = """ 173 | # Llamas 174 | Llamas are social animals and live with others as a herd. Their wool is soft and 175 | contains only a small amount of lanolin.[2] Llamas can learn simple tasks after a 176 | few repetitions. When using a pack, they can carry about 25 to 30% of their body 177 | weight for 8 to 13 km (5–8 miles).[3] The name llama (also historically spelled 178 | "glama") was adopted by European settlers from native Peruvians. 
179 | """ 180 | result = test_agent.extract(SourceText(text_content=TEST_TEXT)) 181 | assert result.status == "SUCCESS" 182 | assert result.data is not None 183 | assert isinstance(result.data, dict) 184 | assert "title" in result.data 185 | assert "summary" in result.data 186 | 187 | @pytest.mark.asyncio 188 | async def test_extract_multiple_files(self, test_agent): 189 | files = [TEST_PDF, TEST_PDF] # Using same file twice for testing 190 | response = await test_agent.aextract(files) 191 | 192 | assert len(response) == 2 193 | for result in response: 194 | assert result.status == "SUCCESS" 195 | assert result.data is not None 196 | assert isinstance(result.data, dict) 197 | assert "title" in result.data 198 | assert "summary" in result.data 199 | 200 | def test_save_agent_updates( 201 | self, test_agent: ExtractionAgent, llama_extract: LlamaExtract 202 | ): 203 | new_schema = { 204 | "type": "object", 205 | "properties": { 206 | "new_field": {"type": "string"}, 207 | "title": {"type": "string"}, 208 | "summary": {"type": "string"}, 209 | }, 210 | } 211 | test_agent.data_schema = new_schema 212 | test_agent.save() 213 | 214 | # Verify the update by getting a fresh instance 215 | updated_agent = llama_extract.get_agent(name=test_agent.name) 216 | assert "new_field" in updated_agent.data_schema["properties"] 217 | 218 | def test_list_extraction_runs(self, test_agent: ExtractionAgent): 219 | assert test_agent.list_extraction_runs().total == 0 220 | test_agent.extract(TEST_PDF) 221 | runs = test_agent.list_extraction_runs() 222 | assert runs.total > 0 223 | 224 | def test_delete_extraction_run(self, test_agent: ExtractionAgent): 225 | assert test_agent.list_extraction_runs().total == 0 226 | run = test_agent.extract(TEST_PDF) 227 | test_agent.delete_extraction_run(run.id) 228 | runs = test_agent.list_extraction_runs() 229 | assert runs.total == 0 230 | -------------------------------------------------------------------------------- /tests/extract/test_extract_e2e.py: 
def get_test_cases():
    """Collect extraction test cases from TEST_DIR.

    A data-type directory contributes cases when it contains a
    ``schema.json`` plus at least one input file that has a matching
    ``<name>.test.json`` expected-output file. Every qualifying input is
    paired with both the FAST and BALANCED extraction configs.

    Returns:
        List[TestCase]: List of test cases
    """
    cases = []

    for entry in os.listdir(TEST_DIR):
        type_dir = os.path.join(TEST_DIR, entry)
        if not os.path.isdir(type_dir):
            continue

        schema_file = os.path.join(type_dir, "schema.json")
        if not os.path.exists(schema_file):
            continue

        # Inputs are every regular file except the schema and the
        # expected-output fixtures.
        candidates = []
        for file_name in os.listdir(type_dir):
            candidate = os.path.join(type_dir, file_name)
            is_input = (
                os.path.isfile(candidate)
                and file_name != "schema.json"
                and not file_name.endswith(".test.json")
            )
            if is_input:
                candidates.append(candidate)

        modes = [
            ExtractConfig(extraction_mode=ExtractMode.FAST),
            ExtractConfig(extraction_mode=ExtractMode.BALANCED),
        ]

        for candidate in sorted(candidates):
            stem = os.path.splitext(os.path.basename(candidate))[0]
            expected = os.path.join(type_dir, f"{stem}.test.json")
            if not os.path.exists(expected):
                continue

            label = f"{entry}/{os.path.basename(candidate)}"
            cases.extend(
                TestCase(
                    name=label,
                    schema_path=schema_file,
                    input_file=candidate,
                    config=mode,
                    expected_output=expected,
                )
                for mode in modes
            )

    return cases
@pytest.mark.parametrize("test_case", get_test_cases(), ids=lambda x: x.name) 132 | def test_extraction(test_case: TestCase, extraction_agent: ExtractionAgent) -> None: 133 | result = extraction_agent.extract(test_case.input_file).data 134 | with open(test_case.expected_output, "r") as f: 135 | expected = json.load(f) 136 | # TODO: fix the saas_slide test 137 | assert json_subset_match_score(expected, result) > 0.3, DeepDiff( 138 | expected, result, ignore_order=True 139 | ) 140 | -------------------------------------------------------------------------------- /tests/extract/util.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from autoevals.string import Levenshtein 4 | from autoevals.number import NumericDiff 5 | from dotenv import load_dotenv 6 | from pathlib import Path 7 | 8 | 9 | def load_test_dotenv(): 10 | load_dotenv(Path(__file__).parent.parent.parent / ".env.dev", override=True) 11 | 12 | 13 | def json_subset_match_score(expected: Any, actual: Any) -> float: 14 | """ 15 | Adapted from autoevals.JsonDiff to only test on the subset of keys within the expected json. 
16 | """ 17 | string_scorer = Levenshtein() 18 | number_scorer = NumericDiff() 19 | if isinstance(expected, dict) and isinstance(actual, dict): 20 | if len(expected) == 0 and len(actual) == 0: 21 | return 1 22 | keys = set(expected.keys()) 23 | scores = [json_subset_match_score(expected.get(k), actual.get(k)) for k in keys] 24 | scores = [s for s in scores if s is not None] 25 | return sum(scores) / len(scores) 26 | elif isinstance(expected, list) and isinstance(actual, list): 27 | if len(expected) == 0 and len(actual) == 0: 28 | return 1 29 | scores = [json_subset_match_score(e1, e2) for (e1, e2) in zip(expected, actual)] 30 | scores = [s for s in scores if s is not None] 31 | return sum(scores) / max(len(expected), len(actual)) 32 | elif isinstance(expected, str) and isinstance(actual, str): 33 | return string_scorer.eval(expected, actual).score 34 | elif (isinstance(expected, int) or isinstance(expected, float)) and ( 35 | isinstance(actual, int) or isinstance(actual, float) 36 | ): 37 | return number_scorer.eval(expected, actual).score 38 | elif expected is None and actual is None: 39 | return 1 40 | elif expected is None or actual is None: 41 | return 0 42 | else: 43 | return 0 44 | -------------------------------------------------------------------------------- /tests/parse/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/parse/__init__.py -------------------------------------------------------------------------------- /tests/parse/test_llama_parse.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import shutil 4 | from fsspec.implementations.local import LocalFileSystem 5 | from httpx import AsyncClient 6 | 7 | from llama_cloud_services.parse import LlamaParse 8 | 9 | 10 | @pytest.mark.skipif( 11 | 
os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 12 | reason="LLAMA_CLOUD_API_KEY not set", 13 | ) 14 | def test_simple_page_text() -> None: 15 | parser = LlamaParse(result_type="text") 16 | 17 | filepath = "tests/test_files/attention_is_all_you_need.pdf" 18 | result = parser.load_data(filepath) 19 | assert len(result) == 1 20 | assert len(result[0].text) > 0 21 | 22 | 23 | @pytest.fixture 24 | def markdown_parser() -> LlamaParse: 25 | if os.environ.get("LLAMA_CLOUD_API_KEY", "") == "": 26 | pytest.skip("LLAMA_CLOUD_API_KEY not set") 27 | return LlamaParse(result_type="markdown", ignore_errors=False) 28 | 29 | 30 | def test_simple_page_markdown(markdown_parser: LlamaParse) -> None: 31 | filepath = "tests/test_files/attention_is_all_you_need.pdf" 32 | result = markdown_parser.load_data(filepath) 33 | assert len(result) == 1 34 | assert len(result[0].text) > 0 35 | 36 | 37 | def test_simple_page_markdown_bytes(markdown_parser: LlamaParse) -> None: 38 | markdown_parser = LlamaParse(result_type="markdown", ignore_errors=False) 39 | 40 | filepath = "tests/test_files/attention_is_all_you_need.pdf" 41 | with open(filepath, "rb") as f: 42 | file_bytes = f.read() 43 | # client must provide extra_info with file_name 44 | with pytest.raises(ValueError): 45 | result = markdown_parser.load_data(file_bytes) 46 | result = markdown_parser.load_data( 47 | file_bytes, extra_info={"file_name": "attention_is_all_you_need.pdf"} 48 | ) 49 | assert len(result) == 1 50 | assert len(result[0].text) > 0 51 | 52 | 53 | def test_simple_page_markdown_buffer(markdown_parser: LlamaParse) -> None: 54 | markdown_parser = LlamaParse(result_type="markdown", ignore_errors=False) 55 | 56 | filepath = "tests/test_files/attention_is_all_you_need.pdf" 57 | with open(filepath, "rb") as f: 58 | # client must provide extra_info with file_name 59 | with pytest.raises(ValueError): 60 | result = markdown_parser.load_data(f) 61 | result = markdown_parser.load_data( 62 | f, extra_info={"file_name": 
"attention_is_all_you_need.pdf"} 63 | ) 64 | assert len(result) == 1 65 | assert len(result[0].text) > 0 66 | 67 | 68 | @pytest.mark.skipif( 69 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 70 | reason="LLAMA_CLOUD_API_KEY not set", 71 | ) 72 | @pytest.mark.asyncio 73 | async def test_simple_page_with_custom_fs() -> None: 74 | parser = LlamaParse(result_type="markdown") 75 | fs = LocalFileSystem() 76 | filepath = "tests/test_files/attention_is_all_you_need.pdf" 77 | result = await parser.aload_data(filepath, fs=fs) 78 | assert len(result) == 1 79 | 80 | 81 | @pytest.mark.skipif( 82 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 83 | reason="LLAMA_CLOUD_API_KEY not set", 84 | ) 85 | @pytest.mark.asyncio 86 | async def test_simple_page_progress_workers() -> None: 87 | parser = LlamaParse(result_type="markdown", show_progress=True, verbose=True) 88 | 89 | filepath = "tests/test_files/attention_is_all_you_need.pdf" 90 | result = await parser.aload_data([filepath, filepath]) 91 | assert len(result) == 2 92 | assert len(result[0].text) > 0 93 | 94 | parser = LlamaParse( 95 | result_type="markdown", show_progress=True, num_workers=2, verbose=True 96 | ) 97 | 98 | filepath = "tests/test_files/attention_is_all_you_need.pdf" 99 | result = await parser.aload_data([filepath, filepath]) 100 | assert len(result) == 2 101 | assert len(result[0].text) > 0 102 | 103 | 104 | @pytest.mark.skipif( 105 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 106 | reason="LLAMA_CLOUD_API_KEY not set", 107 | ) 108 | @pytest.mark.asyncio 109 | async def test_custom_client() -> None: 110 | custom_client = AsyncClient(verify=False, timeout=10) 111 | parser = LlamaParse(result_type="markdown", custom_client=custom_client) 112 | filepath = "tests/test_files/attention_is_all_you_need.pdf" 113 | result = await parser.aload_data(filepath) 114 | assert len(result) == 1 115 | assert len(result[0].text) > 0 116 | 117 | 118 | @pytest.mark.skipif( 119 | os.environ.get("LLAMA_CLOUD_API_KEY", "") 
== "", 120 | reason="LLAMA_CLOUD_API_KEY not set", 121 | ) 122 | @pytest.mark.asyncio 123 | async def test_input_url() -> None: 124 | parser = LlamaParse(result_type="markdown") 125 | 126 | # links to a resume example 127 | input_url = "https://cdn-blog.novoresume.com/articles/google-docs-resume-templates/basic-google-docs-resume.png" 128 | result = await parser.aload_data(input_url) 129 | assert len(result) == 1 130 | assert "your name" in result[0].text.lower() 131 | 132 | 133 | @pytest.mark.skipif( 134 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 135 | reason="LLAMA_CLOUD_API_KEY not set", 136 | ) 137 | @pytest.mark.asyncio 138 | async def test_input_url_with_website_input() -> None: 139 | parser = LlamaParse(result_type="markdown") 140 | input_url = "https://www.example.com" 141 | result = await parser.aload_data(input_url) 142 | assert len(result) == 1 143 | assert "example" in result[0].text.lower() 144 | 145 | 146 | @pytest.mark.skipif( 147 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 148 | reason="LLAMA_CLOUD_API_KEY not set", 149 | ) 150 | @pytest.mark.asyncio 151 | async def test_mixing_input_types() -> None: 152 | parser = LlamaParse(result_type="markdown") 153 | filepath = "tests/test_files/attention_is_all_you_need.pdf" 154 | input_url = "https://cdn-blog.novoresume.com/articles/google-docs-resume-templates/basic-google-docs-resume.png" 155 | result = await parser.aload_data([filepath, input_url]) 156 | 157 | assert len(result) == 2 158 | 159 | 160 | @pytest.mark.skipif( 161 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 162 | reason="LLAMA_CLOUD_API_KEY not set", 163 | ) 164 | @pytest.mark.asyncio 165 | async def test_download_images() -> None: 166 | parser = LlamaParse(result_type="markdown", take_screenshot=True) 167 | filepath = "tests/test_files/attention_is_all_you_need.pdf" 168 | json_result = await parser.aget_json([filepath]) 169 | 170 | assert len(json_result) == 1 171 | assert len(json_result[0]["pages"][0]["images"]) > 0 172 
| 173 | download_path = "tests/test_files/images" 174 | shutil.rmtree(download_path, ignore_errors=True) 175 | 176 | await parser.aget_images(json_result, download_path) 177 | assert len(os.listdir(download_path)) == len(json_result[0]["pages"][0]["images"]) 178 | -------------------------------------------------------------------------------- /tests/parse/test_llama_parse_result.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import os 3 | import pytest 4 | from llama_cloud_services import LlamaParse 5 | from llama_cloud_services.parse.types import JobResult 6 | 7 | 8 | @pytest.fixture 9 | def file_path() -> str: 10 | return "tests/test_files/attention_is_all_you_need.pdf" 11 | 12 | 13 | @pytest.fixture 14 | def chart_file_path() -> str: 15 | return "tests/test_files/attention_is_all_you_need_chart.pdf" 16 | 17 | 18 | @pytest.mark.asyncio 19 | @pytest.mark.skipif( 20 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 21 | reason="LLAMA_CLOUD_API_KEY not set", 22 | ) 23 | async def test_basic_parse_result(file_path: str): 24 | parser = LlamaParse( 25 | take_screenshot=True, 26 | auto_mode=True, 27 | fast_mode=False, 28 | ) 29 | result = await parser.aparse(file_path) 30 | 31 | assert isinstance(result, JobResult) 32 | assert result.job_id is not None 33 | assert result.file_name == file_path 34 | assert len(result.pages) > 0 35 | 36 | assert result.pages[0].text is not None 37 | assert len(result.pages[0].text) > 0 38 | 39 | assert result.pages[0].md is not None 40 | assert len(result.pages[0].md) > 0 41 | 42 | assert result.pages[0].md != result.pages[0].text 43 | 44 | assert len(result.pages[0].images) > 0 45 | assert result.pages[0].images[0].name is not None 46 | 47 | with tempfile.TemporaryDirectory() as temp_dir: 48 | file_names = await result.asave_all_images(temp_dir) 49 | assert len(file_names) > 0 50 | for file_name in file_names: 51 | assert os.path.exists(file_name) 52 | assert 
os.path.getsize(file_name) > 0 53 | 54 | assert result.job_metadata is not None 55 | 56 | text_documents = result.get_text_documents( 57 | split_by_page=True, 58 | ) 59 | assert len(text_documents) > 0 60 | assert text_documents[0].text is not None 61 | assert len(text_documents[0].text) > 0 62 | 63 | markdown_documents = result.get_markdown_documents( 64 | split_by_page=True, 65 | ) 66 | assert len(markdown_documents) > 0 67 | assert markdown_documents[0].text is not None 68 | assert len(markdown_documents[0].text) > 0 69 | 70 | image_documents = await result.aget_image_documents( 71 | include_screenshot_images=True, 72 | include_object_images=False, 73 | ) 74 | assert len(image_documents) > 0 75 | assert image_documents[0].image is not None 76 | assert len(image_documents[0].resolve_image().getvalue()) > 0 77 | 78 | 79 | @pytest.mark.asyncio 80 | @pytest.mark.skip( 81 | reason="TODO: I don't actually know how to trigger links in the output." 82 | ) 83 | async def test_link_parse_result(file_path: str): 84 | parser = LlamaParse( 85 | annotate_links=True, 86 | ) 87 | result = await parser.aparse(file_path) 88 | 89 | assert isinstance(result, JobResult) 90 | assert len(result.pages) > 0 91 | assert len(result.pages[0].links) > 0 92 | 93 | 94 | @pytest.mark.asyncio 95 | @pytest.mark.skipif( 96 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 97 | reason="LLAMA_CLOUD_API_KEY not set", 98 | ) 99 | async def test_parse_structured_output(file_path: str): 100 | parser = LlamaParse( 101 | structured_output=True, 102 | structured_output_json_schema_name="imFeelingLucky", 103 | ) 104 | result = await parser.aparse(file_path) 105 | assert isinstance(result, JobResult) 106 | assert len(result.pages) > 0 107 | assert len(result.pages[0].structuredData) > 0 108 | 109 | 110 | @pytest.mark.asyncio 111 | @pytest.mark.skipif( 112 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 113 | reason="LLAMA_CLOUD_API_KEY not set", 114 | ) 115 | async def test_parse_charts(chart_file_path: 
str): 116 | parser = LlamaParse( 117 | extract_charts=True, 118 | ) 119 | result = await parser.aparse(chart_file_path) 120 | assert isinstance(result, JobResult) 121 | assert len(result.pages) > 0 122 | assert len(result.pages[0].charts) > 0 123 | 124 | 125 | @pytest.mark.asyncio 126 | @pytest.mark.skipif( 127 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 128 | reason="LLAMA_CLOUD_API_KEY not set", 129 | ) 130 | async def test_parse_layout(file_path: str): 131 | parser = LlamaParse( 132 | extract_layout=True, 133 | ) 134 | result = await parser.aparse(file_path) 135 | 136 | assert isinstance(result, JobResult) 137 | assert len(result.pages) > 0 138 | assert len(result.pages[0].layout) > 0 139 | 140 | 141 | @pytest.mark.skipif( 142 | os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", 143 | reason="LLAMA_CLOUD_API_KEY not set", 144 | ) 145 | def test_parse_multiple_files(file_path: str, chart_file_path: str): 146 | parser = LlamaParse() 147 | result = parser.parse([file_path, chart_file_path]) 148 | 149 | assert isinstance(result, list) 150 | assert len(result) == 2 151 | assert isinstance(result[0], JobResult) 152 | assert isinstance(result[1], JobResult) 153 | assert result[0].file_name == file_path 154 | assert result[1].file_name == chart_file_path 155 | -------------------------------------------------------------------------------- /tests/report/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/report/__init__.py -------------------------------------------------------------------------------- /tests/report/test_llama_report.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import uuid 4 | from typing import AsyncGenerator 5 | from pytest_asyncio import fixture as async_fixture 6 | from llama_cloud_services.report import LlamaReport, 
ReportClient 7 | 8 | # Skip tests if no API key is set 9 | pytestmark = pytest.mark.skipif( 10 | not os.getenv("LLAMA_CLOUD_API_KEY") or os.getenv("CI") == "true", 11 | reason="No API key provided", 12 | ) 13 | 14 | 15 | @async_fixture(scope="function") 16 | async def client() -> AsyncGenerator[LlamaReport, None]: 17 | """Create a LlamaReport client.""" 18 | client = LlamaReport() 19 | reports_before = await client.alist_reports() 20 | reports_before_ids = [r.report_id for r in reports_before] 21 | try: 22 | yield client 23 | finally: 24 | # clean up reports 25 | try: 26 | reports_after = await client.alist_reports() 27 | reports_after_ids = [r.report_id for r in reports_after] 28 | for report_id in reports_before_ids: 29 | if report_id not in reports_after_ids: 30 | await client.adelete_report(report_id) 31 | except Exception: 32 | pass 33 | finally: 34 | await client.aclient.aclose() 35 | 36 | 37 | @pytest.fixture(scope="function") 38 | def unique_name() -> str: 39 | """Generate a unique report name.""" 40 | return f"test-report-{uuid.uuid4()}" 41 | 42 | 43 | @async_fixture(scope="function") 44 | async def report( 45 | client: LlamaReport, unique_name: str 46 | ) -> AsyncGenerator[ReportClient, None]: 47 | """Create a report.""" 48 | report = await client.acreate_report( 49 | name=unique_name, 50 | template_text=( 51 | "# [Some title]\n\n" 52 | " ## TLDR\n" 53 | "A quick summary of the paper.\n\n" 54 | "## Details\n" 55 | "More details about the paper, possible more than one section here.\n" 56 | ), 57 | input_files=["tests/test_files/paper.md"], 58 | ) 59 | try: 60 | yield report 61 | finally: 62 | await report.adelete() 63 | 64 | 65 | @pytest.mark.asyncio 66 | @pytest.mark.xfail( 67 | condition=lambda: os.getenv("CI"), 68 | reason="Backend db issues; needs to be fixed.", 69 | ) 70 | async def test_create_and_delete_report( 71 | client: LlamaReport, report: ReportClient 72 | ) -> None: 73 | """Test basic report creation and deletion.""" 74 | # Verify the report 
exists 75 | metadata = await report.aget_metadata() 76 | assert metadata.name == report.name 77 | 78 | # Test listing reports 79 | reports = await client.alist_reports() 80 | assert any(r.report_id == report.report_id for r in reports) 81 | 82 | # Test getting report by ID 83 | fetched_report = await client.aget_report(report.report_id) 84 | assert fetched_report.report_id == report.report_id 85 | assert fetched_report.name == report.name 86 | 87 | 88 | @pytest.mark.asyncio 89 | @pytest.mark.xfail( 90 | condition=lambda: os.getenv("CI"), 91 | reason="Report plan sometimes times out", 92 | raises=TimeoutError, 93 | ) 94 | async def test_report_plan_workflow(report: ReportClient) -> None: 95 | """Test the report planning workflow.""" 96 | # Wait for the plan 97 | plan = await report.await_for_plan() 98 | assert plan is not None 99 | 100 | # Approve the plan 101 | response = await report.aupdate_plan(action="approve") 102 | assert response is not None 103 | 104 | # Wait for completion 105 | completed_report = await report.await_completion() 106 | assert len(completed_report.blocks) > 0 107 | 108 | # Get edit suggestions 109 | suggestions = await report.asuggest_edits( 110 | "TLDR section header more formal.", auto_history=True 111 | ) 112 | assert len(suggestions) > 0 113 | 114 | # Test accepting an edit 115 | await report.aaccept_edit(suggestions[0]) 116 | 117 | # Get more suggestions and test rejecting 118 | more_suggestions = await report.asuggest_edits( 119 | "Add a section about machine learning.", auto_history=True 120 | ) 121 | assert len(more_suggestions) > 0 122 | await report.areject_edit(more_suggestions[0]) 123 | 124 | # Verify chat history is maintained 125 | assert len(report.chat_history) >= 4 # 2 user messages + 2 assistant responses 126 | 127 | # get events 128 | events = await report.aget_events() 129 | assert len(events) > 0 130 | -------------------------------------------------------------------------------- 
/tests/test_files/attention_is_all_you_need.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/test_files/attention_is_all_you_need.pdf -------------------------------------------------------------------------------- /tests/test_files/attention_is_all_you_need_chart.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/test_files/attention_is_all_you_need_chart.pdf -------------------------------------------------------------------------------- /tests/test_files/images/67b428c6-9edb-4550-83d9-5e35165846ca-page_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/test_files/images/67b428c6-9edb-4550-83d9-5e35165846ca-page_1.jpg -------------------------------------------------------------------------------- /tests/test_files/resume/receipt/noisebridge_receipt.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/test_files/resume/receipt/noisebridge_receipt.pdf -------------------------------------------------------------------------------- /tests/test_files/resume/receipt/noisebridge_receipt.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "receiptNumber": "27215058", 3 | "invoiceNumber": "87B37C90152", 4 | "datePaid": "2024-07-19", 5 | "paymentMethod": { 6 | "type": "visa", 7 | "lastFourDigits": "7267" 8 | }, 9 | "merchant": { 10 | "name": "Noisebridge", 11 | "address": { 12 | "street": "272 Capp St", 13 | "city": "San Francisco", 14 | 
"state": "California", 15 | "postalCode": "94110", 16 | "country": "United States" 17 | }, 18 | "phone": "1 6507017829", 19 | "email": "treasurer+stripe@noisebridge.net" 20 | }, 21 | "billTo": "noisebridge@seldo.com", 22 | "items": [ 23 | { 24 | "description": "$10 / month", 25 | "quantity": 1, 26 | "unitPrice": 10.0, 27 | "amount": 10.0, 28 | "period": { 29 | "start": "2024-07-19", 30 | "end": "2024-08-19" 31 | } 32 | } 33 | ], 34 | "subtotal": 10.0, 35 | "total": 10.0, 36 | "amountPaid": 10.0 37 | } 38 | -------------------------------------------------------------------------------- /tests/test_files/resume/receipt/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "type": "object", 4 | "required": ["receiptNumber", "datePaid", "total", "items"], 5 | "properties": { 6 | "receiptNumber": { 7 | "type": "string" 8 | }, 9 | "invoiceNumber": { 10 | "type": "string" 11 | }, 12 | "datePaid": { 13 | "type": "string", 14 | "format": "date" 15 | }, 16 | "paymentMethod": { 17 | "type": "object", 18 | "properties": { 19 | "type": { 20 | "type": "string", 21 | "enum": ["visa", "mastercard", "amex", "cash", "other"] 22 | }, 23 | "lastFourDigits": { 24 | "type": "string", 25 | "pattern": "^[0-9]{4}$" 26 | } 27 | } 28 | }, 29 | "merchant": { 30 | "type": "object", 31 | "properties": { 32 | "name": { 33 | "type": "string" 34 | }, 35 | "address": { 36 | "type": "object", 37 | "properties": { 38 | "street": { 39 | "type": "string" 40 | }, 41 | "city": { 42 | "type": "string" 43 | }, 44 | "state": { 45 | "type": "string" 46 | }, 47 | "postalCode": { 48 | "type": "string" 49 | }, 50 | "country": { 51 | "type": "string" 52 | } 53 | } 54 | }, 55 | "phone": { 56 | "type": "string" 57 | }, 58 | "email": { 59 | "type": "string", 60 | "format": "email" 61 | } 62 | } 63 | }, 64 | "billTo": { 65 | "type": "string", 66 | "format": "email" 67 | }, 68 | "items": { 69 | "type": 
"array", 70 | "items": { 71 | "type": "object", 72 | "required": [ 73 | "description", 74 | "quantity", 75 | "unitPrice", 76 | "amount", 77 | "period" 78 | ], 79 | "properties": { 80 | "description": { 81 | "type": "string" 82 | }, 83 | "quantity": { 84 | "type": "integer", 85 | "minimum": 1 86 | }, 87 | "unitPrice": { 88 | "type": "number", 89 | "minimum": 0 90 | }, 91 | "amount": { 92 | "type": "number", 93 | "minimum": 0 94 | }, 95 | "period": { 96 | "type": "object", 97 | "properties": { 98 | "start": { 99 | "type": "string", 100 | "format": "date" 101 | }, 102 | "end": { 103 | "type": "string", 104 | "format": "date" 105 | } 106 | } 107 | } 108 | } 109 | } 110 | }, 111 | "subtotal": { 112 | "type": "number", 113 | "minimum": 0 114 | }, 115 | "total": { 116 | "type": "number", 117 | "minimum": 0 118 | }, 119 | "amountPaid": { 120 | "type": "number", 121 | "minimum": 0 122 | } 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /tests/test_files/resume/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "title": "Resume Schema", 4 | "type": "object", 5 | "required": ["basics", "skills", "experience"], 6 | "properties": { 7 | "basics": { 8 | "type": "object", 9 | "required": ["name", "email"], 10 | "properties": { 11 | "name": { 12 | "type": "string" 13 | }, 14 | "email": { 15 | "type": "string", 16 | "format": "email" 17 | }, 18 | "phone": { 19 | "type": "string" 20 | }, 21 | "location": { 22 | "type": "object", 23 | "properties": { 24 | "city": { 25 | "type": "string" 26 | }, 27 | "region": { 28 | "type": "string" 29 | }, 30 | "country": { 31 | "type": "string" 32 | } 33 | } 34 | }, 35 | "profiles": { 36 | "type": "array", 37 | "items": { 38 | "type": "object", 39 | "properties": { 40 | "network": { 41 | "type": "string" 42 | }, 43 | "url": { 44 | "type": "string", 45 | "format": "uri" 46 | } 47 | } 48 
| } 49 | }, 50 | "summary": { 51 | "type": "string" 52 | } 53 | } 54 | }, 55 | "skills": { 56 | "type": "array", 57 | "items": { 58 | "type": "object", 59 | "properties": { 60 | "category": { 61 | "type": "string" 62 | }, 63 | "keywords": { 64 | "type": "array", 65 | "items": { 66 | "type": "string" 67 | } 68 | }, 69 | "level": { 70 | "type": "string", 71 | "enum": ["beginner", "intermediate", "advanced", "expert"] 72 | } 73 | } 74 | } 75 | }, 76 | "experience": { 77 | "type": "array", 78 | "items": { 79 | "type": "object", 80 | "required": ["company", "position", "startDate"], 81 | "properties": { 82 | "company": { 83 | "type": "string" 84 | }, 85 | "position": { 86 | "type": "string" 87 | }, 88 | "startDate": { 89 | "type": "string", 90 | "format": "date" 91 | }, 92 | "endDate": { 93 | "type": "string", 94 | "format": "date" 95 | }, 96 | "highlights": { 97 | "type": "array", 98 | "items": { 99 | "type": "string" 100 | } 101 | }, 102 | "technologies": { 103 | "type": "array", 104 | "items": { 105 | "type": "string" 106 | } 107 | } 108 | } 109 | } 110 | }, 111 | "education": { 112 | "type": "array", 113 | "items": { 114 | "type": "object", 115 | "required": ["institution", "degree"], 116 | "properties": { 117 | "institution": { 118 | "type": "string" 119 | }, 120 | "degree": { 121 | "type": "string" 122 | }, 123 | "field": { 124 | "type": "string" 125 | }, 126 | "graduationDate": { 127 | "type": "string", 128 | "format": "date" 129 | }, 130 | "gpa": { 131 | "type": "number" 132 | } 133 | } 134 | } 135 | }, 136 | "certifications": { 137 | "type": "array", 138 | "items": { 139 | "type": "object", 140 | "properties": { 141 | "name": { 142 | "type": "string" 143 | }, 144 | "issuer": { 145 | "type": "string" 146 | }, 147 | "date": { 148 | "type": "string", 149 | "format": "date" 150 | }, 151 | "validUntil": { 152 | "type": "string", 153 | "format": "date" 154 | } 155 | } 156 | } 157 | }, 158 | "publications": { 159 | "type": "array", 160 | "items": { 161 | "type": 
"object", 162 | "properties": { 163 | "title": { 164 | "type": "string" 165 | }, 166 | "publisher": { 167 | "type": "string" 168 | }, 169 | "date": { 170 | "type": "string", 171 | "format": "date" 172 | }, 173 | "url": { 174 | "type": "string", 175 | "format": "uri" 176 | } 177 | } 178 | } 179 | } 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /tests/test_files/resume/software_architect_resume.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 116 | 117 | 118 | 119 |
120 | 171 | 172 |
173 |

Sarah Chen

174 |
Senior Software Architect
175 | 176 |
177 |

Professional Summary

178 |

179 | Innovative Software Architect with over 12 years of experience 180 | designing and implementing large-scale distributed systems. Proven 181 | track record of leading technical teams and delivering robust 182 | enterprise solutions. Expert in cloud architecture, microservices, 183 | and emerging technologies with a focus on scalable, maintainable 184 | systems. 185 |

186 |
187 | 188 |
189 |

Professional Experience

190 | 191 |
192 |
TechCorp Solutions
193 |
Senior Software Architect
194 |
2020 - Present
195 |
    196 |
  • 197 | Led architectural design and implementation of a cloud-native 198 | platform serving 2M+ users 199 |
  • 200 |
  • 201 | Established architectural guidelines and best practices adopted 202 | across 12 development teams 203 |
  • 204 |
  • 205 | Reduced system latency by 40% through implementation of 206 | event-driven architecture 207 |
  • 208 |
  • 209 | Mentored 15+ senior developers in cloud-native development 210 | practices 211 |
  • 212 |
213 |
214 | 215 |
216 |
DataFlow Systems
217 |
Lead Software Engineer
218 |
2016 - 2020
219 |
    220 |
  • 221 | Architected and led development of distributed data processing 222 | platform handling 5TB daily 223 |
  • 224 |
  • 225 | Designed microservices architecture reducing deployment time by 226 | 65% 227 |
  • 228 |
  • 229 | Led migration of legacy monolith to cloud-native architecture 230 |
  • 231 |
  • 232 | Managed team of 8 engineers across 3 international locations 233 |
  • 234 |
235 |
236 | 237 |
238 |
InnovateTech
239 |
Senior Software Engineer
240 |
2013 - 2016
241 |
    242 |
  • 243 | Developed high-performance trading platform processing 100K 244 | transactions per second 245 |
  • 246 |
  • 247 | Implemented real-time analytics engine reducing processing 248 | latency by 75% 249 |
  • 250 |
  • 251 | Led adoption of container orchestration reducing deployment 252 | costs by 35% 253 |
  • 254 |
255 |
256 |
257 | 258 |
259 |

Education

260 | 261 |
262 |
Stanford University
263 |
Master of Science in Computer Science
264 |
2013
265 |

Focus: Distributed Systems and Machine Learning

266 |
267 | 268 |
269 |
University of California, Berkeley
270 |
271 | Bachelor of Science in Computer Engineering 272 |
273 |
2011
274 |

Magna Cum Laude

275 |
276 |
277 | 278 |
279 |

Patents & Speaking

280 |
    281 |
  • 282 | Co-inventor on three patents for distributed systems architecture 283 |
  • 284 |
  • 285 | Published paper on "Scalable Microservices Architecture" at IEEE 286 | Cloud Computing Conference 2022 287 |
  • 288 |
  • 289 | Keynote Speaker, CloudCon 2023: "Future of Cloud-Native 290 | Architecture" 291 |
  • 292 |
  • Regular presenter at local tech meetups and conferences
  • 293 |
294 |
295 |
296 |
297 | 298 | 299 | -------------------------------------------------------------------------------- /tests/test_files/resume/software_architect_resume.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "basics": { 3 | "name": "Sarah Chen", 4 | "email": "san.francisco@email.com", 5 | "phone": "(555) 123-4567", 6 | "location": { 7 | "city": "San Francisco", 8 | "region": "CA", 9 | "country": "USA" 10 | } 11 | }, 12 | "skills": [ 13 | { 14 | "category": "Architecture & Design", 15 | "keywords": [ 16 | "Microservices", 17 | "Event-Driven Architecture", 18 | "Domain-Driven Design", 19 | "REST APIs" 20 | ] 21 | }, 22 | { 23 | "category": "Cloud Platforms", 24 | "keywords": ["AWS", "Azure", "Google Cloud Platform"] 25 | }, 26 | { 27 | "category": "Programming Languages", 28 | "keywords": ["Java", "Python", "Go", "JavaScript", "TypeScript"] 29 | } 30 | ], 31 | "experience": [ 32 | { 33 | "company": "TechCorp Solutions", 34 | "position": "Senior Software Architect", 35 | "startDate": "2020-01-01", 36 | "endDate": "2024-01-10" 37 | }, 38 | { 39 | "company": "DataFlow Systems", 40 | "position": "Lead Software Engineer", 41 | "startDate": "2016-01-01", 42 | "endDate": "2019-12-31", 43 | "technologies": [ 44 | "Distributed Systems", 45 | "Microservices", 46 | "Cloud Migration" 47 | ] 48 | }, 49 | { 50 | "company": "InnovateTech", 51 | "position": "Senior Software Engineer", 52 | "startDate": "2013-01-01", 53 | "endDate": "2015-12-31", 54 | "technologies": [ 55 | "High-performance Computing", 56 | "Real-time Analytics", 57 | "Container Orchestration" 58 | ] 59 | } 60 | ], 61 | "education": [ 62 | { 63 | "institution": "Stanford University", 64 | "degree": "Master of Science", 65 | "field": "Computer Science", 66 | "graduationDate": "2013-01-01", 67 | "specialization": "Distributed Systems and Machine Learning" 68 | }, 69 | { 70 | "institution": "University of California, Berkeley", 71 | "degree": "Bachelor of Science", 72 | 
"field": "Computer Engineering", 73 | "graduationDate": "2011-01-01" 74 | } 75 | ], 76 | "certifications": [ 77 | { 78 | "name": "AWS Solutions Architect - Professional" 79 | }, 80 | { 81 | "name": "Google Cloud Architect" 82 | }, 83 | { 84 | "name": "Certified Kubernetes Administrator" 85 | } 86 | ], 87 | "publications": [ 88 | { 89 | "title": "Scalable Microservices Architecture", 90 | "publisher": "IEEE Cloud Computing Conference", 91 | "date": "2022-01-01" 92 | } 93 | ] 94 | } 95 | -------------------------------------------------------------------------------- /tests/test_files/slide/saas_slide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/run-llama/llama_cloud_services/90431090e9e2989af765b4fd095f7ae981efab75/tests/test_files/slide/saas_slide.pdf -------------------------------------------------------------------------------- /tests/test_files/slide/saas_slide.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "companyInfo": { 3 | "name": "CloudFlow Analytics", 4 | "fundingStage": "Series A", 5 | "foundedYear": null, 6 | "industry": null, 7 | "location": null 8 | }, 9 | "financialMetrics": { 10 | "mrr": { 11 | "value": 580000, 12 | "currency": "USD", 13 | "growthRate": 27 14 | }, 15 | "grossMargin": 88 16 | }, 17 | "growthMetrics": { 18 | "customers": { 19 | "total": 1247, 20 | "growth": 142, 21 | "enterprisePercent": null 22 | }, 23 | "nrr": 147 24 | }, 25 | "marketMetrics": { 26 | "tam": 50000000000, 27 | "sam": null, 28 | "marketShare": null, 29 | "competitors": null 30 | }, 31 | "differentiators": [ 32 | { 33 | "claim": "Processing Speed", 34 | "metric": "5x faster", 35 | "comparisonTarget": "competitors" 36 | }, 37 | { 38 | "claim": "ML Accuracy", 39 | "metric": "99.9%", 40 | "comparisonTarget": null 41 | }, 42 | { 43 | "claim": "Market Potential", 44 | "metric": "80%", 45 | "comparisonTarget": "Fortune 500" 46 | } 47 | ] 48 | } 49 
| -------------------------------------------------------------------------------- /tests/test_files/slide/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema#", 3 | "type": "object", 4 | "required": ["companyInfo", "financialMetrics", "growthMetrics"], 5 | "properties": { 6 | "companyInfo": { 7 | "type": "object", 8 | "required": ["name", "fundingStage"], 9 | "properties": { 10 | "name": { 11 | "type": "string" 12 | }, 13 | "fundingStage": { 14 | "type": "string", 15 | "enum": ["Pre-seed", "Seed", "Series A", "Series B", "Series C+"] 16 | }, 17 | "foundedYear": { 18 | "anyOf": [ 19 | { 20 | "type": "integer" 21 | }, 22 | { 23 | "type": "null" 24 | } 25 | ] 26 | }, 27 | "industry": { 28 | "anyOf": [ 29 | { 30 | "type": "string" 31 | }, 32 | { 33 | "type": "null" 34 | } 35 | ] 36 | }, 37 | "location": { 38 | "anyOf": [ 39 | { 40 | "type": "string" 41 | }, 42 | { 43 | "type": "null" 44 | } 45 | ] 46 | } 47 | } 48 | }, 49 | "financialMetrics": { 50 | "type": "object", 51 | "required": ["mrr", "grossMargin"], 52 | "properties": { 53 | "mrr": { 54 | "type": "object", 55 | "description": "Monthly Recurring Revenue", 56 | "required": ["value", "currency", "growthRate"], 57 | "properties": { 58 | "value": { 59 | "type": "number" 60 | }, 61 | "currency": { 62 | "type": "string" 63 | }, 64 | "growthRate": { 65 | "type": "number" 66 | } 67 | } 68 | }, 69 | "grossMargin": { 70 | "type": "number" 71 | } 72 | } 73 | }, 74 | "growthMetrics": { 75 | "type": "object", 76 | "required": ["customers", "nrr"], 77 | "properties": { 78 | "customers": { 79 | "type": "object", 80 | "required": ["total", "growth"], 81 | "properties": { 82 | "total": { 83 | "type": "integer" 84 | }, 85 | "growth": { 86 | "type": "number" 87 | } 88 | } 89 | }, 90 | "nrr": { 91 | "description": "Net Revenue Retention", 92 | "type": "number" 93 | } 94 | } 95 | }, 96 | "differentiators": { 97 | "type": "array",
98 | "items": { 99 | "type": "object", 100 | "required": ["claim", "metric"], 101 | "properties": { 102 | "claim": { 103 | "type": "string" 104 | }, 105 | "metric": { 106 | "type": "string" 107 | }, 108 | "comparisonTarget": { 109 | "anyOf": [ 110 | { 111 | "type": "string" 112 | }, 113 | { 114 | "type": "null" 115 | } 116 | ] 117 | } 118 | } 119 | } 120 | } 121 | } 122 | } 123 | --------------------------------------------------------------------------------