├── .github
│   ├── CODEOWNERS
│   ├── ISSUE_TEMPLATE
│   │   ├── bug-report.yml
│   │   ├── config.yml
│   │   ├── documentation.yml
│   │   └── feature-request.yml
│   ├── pull_request_template.md
│   └── workflows
│       ├── pr_agent.yml
│       └── python-app.yml
├── .gitignore
├── .pre-commit-config.yaml
├── README.md
├── any_parser
│   ├── __init__.py
│   ├── any_parser.py
│   ├── async_parser.py
│   ├── base_parser.py
│   ├── batch_parser.py
│   ├── constants.py
│   ├── sync_parser.py
│   └── utils.py
├── examples
│   ├── async_extract_key_value_img.ipynb
│   ├── async_extract_key_value_pdf.ipynb
│   ├── async_extract_pii.ipynb
│   ├── async_extract_resume_key_value.ipynb
│   ├── async_extract_tables.ipynb
│   ├── async_parse_pdf.ipynb
│   ├── async_parse_pdf2.ipynb
│   ├── async_parse_with_layout.ipynb
│   ├── async_parse_with_ocr.ipynb
│   ├── extract_key_value_img.ipynb
│   ├── extract_key_value_pdf.ipynb
│   ├── extract_pii.ipynb
│   ├── extract_resume_key_value.ipynb
│   ├── extract_tables.ipynb
│   ├── parse_batch_api.ipynb
│   ├── parse_docx.ipynb
│   ├── parse_img.ipynb
│   ├── parse_pdf.ipynb
│   ├── parse_pdf2.ipynb
│   └── sample_data
│       ├── Earnings-Presentation-Q2-2024.pdf
│       ├── cambioml_logo_large.png
│       ├── resume_1.pdf
│       ├── resume_1.png
│       ├── sample.pdf
│       ├── stoxx_index_guide_0003.pdf
│       ├── test1.pdf
│       ├── test2.pdf
│       ├── test3.pdf
│       ├── test3.png
│       ├── test_1figure_1table.png
│       ├── test_invoice.pdf
│       ├── test_medical_report.jpeg
│       ├── test_odf.docx
│       ├── test_odf.pptx
│       ├── test_w2.docx
│       ├── test_w2.png
│       └── test_w2.pptx
├── pyproject.toml
├── run_tests.sh
└── tests
    ├── README.md
    ├── __init__.py
    ├── outputs
    │   ├── correct_docx_output.txt
    │   ├── correct_pdf_output.txt
    │   ├── correct_png_output.txt
    │   └── correct_pptx_output.txt
    ├── test.py
    ├── test_batch_api.py
    └── test_data.py
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # Default codeowners/reviewers for all code changes
2 | * @CambioML @Sdddell @goldmermaid @lingjiekong @boqiny
3 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug-report.yml:
--------------------------------------------------------------------------------
1 | name: 🐛 Bug Report
2 | description: Create a report to help us reproduce and fix the bug
3 |
4 | body:
5 | - type: markdown
6 | attributes:
7 | value: >
8 | #### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/CambioML/any-parser/issues?q=is%3Aissue+sort%3Acreated-desc+).
9 | - type: textarea
10 | attributes:
11 | label: 🐛 Describe the bug
12 | description: |
13 | Please provide a clear and concise description of what the bug is.
14 |
15 | If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example:
16 |
17 | ```python
18 | ...
19 | ```
20 |
21 | If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
22 |
23 | Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
24 | placeholder: |
25 | A clear and concise description of what the bug is.
26 |
27 | ```python
28 | # Sample code to reproduce the problem
29 | ```
30 |
31 | ```
32 | The error message you got, with the full traceback.
33 | ```
34 | validations:
35 | required: true
36 | - type: textarea
37 | attributes:
38 | label: Versions
39 | description: |
40 | Please run the following and paste the output below.
41 | ```sh
42 | wget https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py
43 | # For security purposes, please check the contents of collect_env.py before running it.
44 | python collect_env.py
45 | ```
46 | validations:
47 | required: true
48 | - type: markdown
49 | attributes:
50 | value: >
51 | Thanks for contributing 🎉!
52 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: true
2 | contact_links:
3 | - name: Questions
4 | url: https://cambiomlworkspace.slack.com/join/shared_invite/zt-1zes33rmt-20Rag043uvExUaUdvt5_xQ#/shared-invite/email
5 | about: Ask questions and discuss with other CambioML community members
6 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/documentation.yml:
--------------------------------------------------------------------------------
1 | name: 📚 Documentation
2 | description: Report an issue related to https://www.cambioml.com/docs/any-parser/index.html
3 |
4 | body:
5 | - type: textarea
6 | attributes:
7 | label: 📚 The doc issue
8 | description: >
9 | A clear and concise description of what content in https://www.cambioml.com/docs/any-parser/index.html is an issue.
10 | validations:
11 | required: true
12 | - type: textarea
13 | attributes:
14 | label: Suggest a potential alternative/fix
15 | description: >
16 | Tell us how we could improve the documentation in this regard.
17 | - type: markdown
18 | attributes:
19 | value: >
20 | Thanks for contributing 🎉!
21 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.yml:
--------------------------------------------------------------------------------
1 | name: 🚀 Feature request
2 | description: Submit a proposal/request for a new any-parser feature
3 |
4 | body:
5 | - type: textarea
6 | attributes:
7 | label: 🚀 The feature, motivation and pitch
8 | description: >
9 | A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too.
10 | validations:
11 | required: true
12 | - type: textarea
13 | attributes:
14 | label: Alternatives
15 | description: >
16 | A description of any alternative solutions or features you've considered, if any.
17 | - type: textarea
18 | attributes:
19 | label: Additional context
20 | description: >
21 | Add any other context or screenshots about the feature request.
22 | - type: markdown
23 | attributes:
24 | value: >
25 | Thanks for contributing 🎉!
26 |
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | ## Description
2 |
3 |
4 | ## Related Issue
5 |
6 |
7 | ## Type of Change
8 |
9 |
10 | - [ ] Bug fix (non-breaking change which fixes an issue)
11 | - [ ] New feature (non-breaking change which adds functionality)
12 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
13 | - [ ] Documentation update
14 | - [ ] Code refactoring
15 | - [ ] Performance improvement
16 |
17 | ## How Has This Been Tested?
18 |
19 |
20 | ## Screenshots (if applicable)
21 |
22 |
23 | ## Checklist
24 |
25 |
26 | - [ ] My code follows the project's style guidelines
27 | - [ ] I have performed a self-review of my own code
28 | - [ ] I have commented my code, particularly in hard-to-understand areas
29 | - [ ] I have made corresponding changes to the documentation
30 | - [ ] My changes generate no new warnings
31 | - [ ] I have added tests that prove my fix is effective or that my feature works
32 | - [ ] New and existing unit tests pass locally with my changes
33 |
34 | ## Additional Notes
35 |
36 |
--------------------------------------------------------------------------------
/.github/workflows/pr_agent.yml:
--------------------------------------------------------------------------------
1 | on:
2 | pull_request:
3 | types: [opened, reopened, ready_for_review]
4 | issue_comment:
5 | jobs:
6 | pr_agent_job:
7 | if: ${{ github.event.sender.type != 'Bot' }}
8 | runs-on: ubuntu-latest
9 | permissions:
10 | issues: write
11 | pull-requests: write
12 | contents: write
13 | name: Run pr agent on every pull request, respond to user comments
14 | steps:
15 | - name: PR Agent action step
16 | id: pragent
17 | uses: Codium-ai/pr-agent@main
18 | env:
19 | OPENAI_KEY: ${{ secrets.OPENAI_API_KEY }}
20 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
21 |
--------------------------------------------------------------------------------
/.github/workflows/python-app.yml:
--------------------------------------------------------------------------------
1 | name: Python application
2 |
3 | on:
4 | push:
5 | branches: [ "main" ]
6 | pull_request:
7 | branches: [ "main" ]
8 |
9 | permissions:
10 | contents: read
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ubuntu-latest
16 | strategy:
17 | matrix:
18 | python-version: ["3.10"]
19 | max-parallel: 1 # Ensures the tests run sequentially
20 |
21 | steps:
22 | - uses: actions/checkout@v3
23 | - name: Set up Python
24 | uses: actions/setup-python@v3
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 | - name: Install dependencies
28 | run: |
29 | python -m pip install --upgrade pip
30 | pip install flake8
31 | pip install black
32 | pip install isort
33 | python -m pip install poetry
34 | poetry install --no-root # This will install the project dependencies defined in pyproject.toml
35 | - name: Lint with flake8
36 | run: |
37 | # stop the build if there are Python syntax errors or undefined names
38 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
39 | # exit-zero treats all errors as warnings.
40 | flake8 . --count --exit-zero --max-complexity=10 --statistics
41 | - name: Format code with Black
42 | run: |
43 | black . --exclude="" --check --verbose
44 | - name: Sort imports with isort
45 | run: |
46 | isort . --profile=black --check-only --verbose
47 | - name: Test with unittest
48 | env:
49 | CAMBIO_API_KEY: ${{ secrets.CAMBIO_API_KEY }}
50 | run: |
51 | poetry run python -m unittest discover -v tests
52 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
162 | # mac
163 | .DS_Store
164 |
165 | # vscode
166 | .vscode/
167 |
168 | # data/
169 | *.xlsx
170 | *.csv
171 | *.jsonl
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/psf/black
3 | rev: 24.8.0
4 | hooks:
5 | - id: black
6 | args: [--exclude=""]
7 |
8 | # this is not technically always safe but usually is
9 | # use comments `# isort: off` and `# isort: on` to disable/re-enable isort
10 | - repo: https://github.com/pycqa/isort
11 | rev: 5.13.2
12 | hooks:
13 | - id: isort
14 | args: [--profile=black]
15 |
16 | # this is slightly dangerous because python imports have side effects
17 | # and this tool removes unused imports, which may be providing
18 | # necessary side effects for the code to run
19 | - repo: https://github.com/PyCQA/autoflake
20 | rev: v2.3.1
21 | hooks:
22 | - id: autoflake
23 | args:
24 | - "--in-place"
25 | - "--expand-star-imports"
26 | - "--remove-duplicate-keys"
27 | - "--remove-unused-variables"
28 | - "--remove-all-unused-imports"
29 | exclude: "any_parser/__init__.py"
30 |
31 | # run all unittests
32 | - repo: local
33 | hooks:
34 | - id: unittests
35 | name: unittests
36 | entry: ./run_tests.sh
37 | language: script
38 | pass_filenames: false
39 | # Optional: Specify types of files that trigger this hook
40 | # types: [python]
41 | # Optional: Specify files or directories to exclude
42 | # exclude: '^docs/'
43 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 🌊 AnyParser
2 |
3 |
4 |
5 |
6 |
7 |
8 | **AnyParser** provides an API to accurately extract unstructured data (e.g., PDFs, images, charts) into a structured format.
9 |
10 | ## :seedling: Set up your AnyParser API key
11 |
12 | To get started, generate your API key from the [Sandbox Account Page](https://www.cambioml.com/account). Each account comes with **100 free pages**.
13 |
14 | > ⚠️ **Note:** The free API is limited to 10 pages/call.
15 |
16 | For more information or to inquire about larger usage plans, feel free to contact us at info@cambioml.com.
17 |
18 | To set up your API key (`CAMBIO_API_KEY`), follow these steps:
19 | 1. Create a `.env` file in the root directory of your project.
20 | 2. Add the following line to the `.env` file:
21 | ```
22 | CAMBIO_API_KEY=0cam************************
23 | ```
24 |
25 |
26 | ## :computer: Installation
27 | ### 1. Set Up a New Conda Environment and Install AnyParser
28 | First, create and activate a new Conda environment, then install AnyParser:
29 | ```bash
30 | conda create -n any-parse python=3.10 -y
31 | conda activate any-parse
32 | pip3 install any-parser
33 | ```
34 | ### 2. Create an AnyParser Instance Using Your API Key
35 | Use your API key to create an instance of AnyParser. Make sure you’ve set up your .env file to store your API key securely:
36 | ```python
37 | import os
38 | from dotenv import load_dotenv
39 | from any_parser import AnyParser
40 |
41 | # Load environment variables
42 | load_dotenv(override=True)
43 |
44 | # Get the API key from the environment
45 | example_apikey = os.getenv("CAMBIO_API_KEY")
46 |
47 | # Create an AnyParser instance
48 | ap = AnyParser(api_key=example_apikey)
49 | ```
50 |
51 | ### 3. Run Synchronous Extraction
52 | To extract data synchronously and receive immediate results:
53 | ```python
54 | # Extract content from the file and get the markdown output along with processing time
55 | markdown, total_time = ap.parse(file_path="./data/test.pdf")
56 | ```
57 |
58 | ### 4. Run Asynchronous Extraction
59 | For asynchronous extraction, send the file for processing and fetch results later:
60 | ```python
61 | # Send the file to begin asynchronous extraction
62 | file_id = ap.async_parse(file_path="./data/test.pdf")
63 |
64 | # Fetch the extracted content using the file ID
65 | markdown = ap.async_fetch(file_id=file_id)
66 | ```
67 |
68 | ### 5. Run Batch Extraction (Beta)
69 | For batch extraction, send the file to begin processing and fetch results later:
70 | ```python
71 | # Send the file to begin batch extraction
72 | response = ap.batches.create(file_path="./data/test.pdf")
73 | request_id = response.requestId
74 |
75 | # Fetch the extracted content using the request ID
76 | markdown = ap.batches.retrieve(request_id)
77 | ```
78 |
79 | Batch API for folder input:
80 | ```python
81 | # Send the folder to begin batch extraction
82 | WORKING_FOLDER = "./sample_data"
83 | # This will generate a jsonl with filename and requestID
84 | response = ap.batches.create(WORKING_FOLDER)
85 | ```
86 |
87 | Each response in the JSONL file contains:
88 | - The filename
89 | - A unique request ID
90 | - Additional processing metadata
91 |
92 | You can later use these request IDs to retrieve the extracted content for each file:
93 |
94 | ```python
95 | # Fetch the extracted content using the request ID from the jsonl file
96 | markdown = ap.batches.retrieve(request_id)
97 | ```
98 | For more details about the code implementation of the batch API, refer to
99 | [examples/parse_batch_upload.py](examples/parse_batch_upload.py) and [examples/parse_batch_fetch.py](examples/parse_batch_fetch.py)
100 |
101 | > ⚠️ **Note:** Batch extraction is currently in beta testing. Processing may take up to 12 hours to complete.
102 | >
103 | > ⚠️ **Important:** API keys generated from cambioml.com do not automatically have batch processing permissions. Please contact info@cambioml.com to request batch processing access for your API key.
104 |
105 | ## :scroll: Examples
106 | Check out these examples to see how you can utilize **AnyParser** to extract text, numbers, and symbols in fewer than 10 lines of code!
107 |
108 | ### [Extract all text and layout from PDF into Markdown Format](https://github.com/CambioML/any-parser/blob/rt-migration/examples/pdf_to_markdown.ipynb)
109 | Are you an AI engineer looking to **accurately** extract both the text and layout (e.g., table of contents or Markdown headers hierarchy) from a PDF? Check out this [3-minute notebook demo](https://github.com/CambioML/any-parser/blob/rt-migration/examples/pdf_to_markdown.ipynb).
110 |
111 | ### [Extract a Table from an Image into Markdown Format](https://github.com/CambioML/any-parser/blob/rt-migration/examples/image_to_markdown.ipynb)
112 | Are you a financial analyst needing to **accurately** extract numbers from a table within an image? Explore this [3-minute notebook example](https://github.com/CambioML/any-parser/blob/rt-migration/examples/image_to_markdown.ipynb).
113 |
--------------------------------------------------------------------------------
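
The batch workflow described in the README above returns request IDs immediately while processing can take hours, so results usually have to be polled. Below is a minimal polling sketch, assuming the interfaces shown in this repo (`batches.create` returning an object with `requestId`, `batches.retrieve` returning a status object with `requestStatus` and `result`); the file path, sleep interval, and the exact completed-status string are illustrative assumptions.

```python
import os
import time

from dotenv import load_dotenv

from any_parser import AnyParser

load_dotenv(override=True)
ap = AnyParser(api_key=os.getenv("CAMBIO_API_KEY"))

# Submit a single file for batch processing (illustrative path).
response = ap.batches.create(file_path="./data/test.pdf")
request_id = response.requestId

# Poll until the batch job finishes. Batch jobs can take hours, so a real
# script would use a much longer timeout or run retrieval separately.
while True:
    status = ap.batches.retrieve(request_id)
    # NOTE: the exact status value ("COMPLETED" here) is an assumption;
    # check the API documentation for the real set of status strings.
    if status.requestStatus == "COMPLETED":
        markdown = "\n".join(status.result or [])
        break
    time.sleep(60)  # wait a minute between status checks

print(markdown)
```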
/any_parser/__init__.py:
--------------------------------------------------------------------------------
1 | """AnyParser module for parsing data."""
2 |
3 | from any_parser.any_parser import AnyParser
4 |
5 | __all__ = ["AnyParser"]
6 |
7 | __version__ = "0.0.24"
8 |
--------------------------------------------------------------------------------
/any_parser/any_parser.py:
--------------------------------------------------------------------------------
1 | """AnyParser RT: Real-time parser for any data format."""
2 |
3 | import base64
4 | import json
5 | import time
6 | import uuid
7 | from collections.abc import Iterable
8 | from io import StringIO
9 | from pathlib import Path
10 |
11 | import requests
12 |
13 | from any_parser.async_parser import AsyncParser
14 | from any_parser.batch_parser import BatchParser
15 | from any_parser.constants import ProcessType
16 | from any_parser.sync_parser import (
17 | ExtractKeyValueSyncParser,
18 | ExtractPIISyncParser,
19 | ExtractResumeKeyValueSyncParser,
20 | ExtractTablesSyncParser,
21 | ParseSyncParser,
22 | )
23 | from any_parser.utils import validate_file_inputs
24 |
25 | PUBLIC_SHARED_BASE_URL = "https://public-api.cambioml.com"
26 | PUBLIC_BATCH_BASE_URL = "http://batch-api.cambioml.com"
27 | TIMEOUT = 180
28 |
29 |
30 | def handle_file_processing(func):
31 | """
32 | Decorator to handle common file processing logic for parsing
33 | and extraction operations.
34 |
35 | This decorator manages file input validation and processing, supporting
36 | either direct file content or file path inputs. It performs base64 encoding
37 | of file contents when a file path is provided.
38 |
39 | Args:
40 | func: The decorated function that performs the actual parsing or
41 | extraction.
42 |
43 | Parameters for decorated functions:
44 | file_path (str, optional): Path to the file to be processed. If
45 | provided, the file will be read and encoded in base64.
46 | file_content (str, optional): Base64-encoded content of the file. If
47 | provided, file_path will be ignored.
48 | file_type (str, optional): The file extension/type (e.g., 'pdf').
49 | If not provided and file_path is given, it will be inferred from
50 | the file extension.
51 | *args, **kwargs: Additional arguments passed to the decorated function.
52 |
53 | Returns:
54 | tuple: A tuple containing (error_message, result), where:
55 | - error_message (str): Error message if processing fails, empty
56 | string on success
57 | - result (str): Empty string if error occurs, otherwise the
58 | processed result from func
59 |
60 | Usage:
61 | @handle_file_processing
62 | def parse(self, file_path=None, file_content=None, file_type=None):
63 | # Implementation
64 | pass
65 |
66 | Note:
67 | Either file_path or file_content must be provided, but not both.
68 | If file_path is provided, the file content will be read and encoded in
69 | base64, and file_type will be inferred from the file extension.
70 | If file_content is provided, file_type will be validated, and a
71 | temporary file path will be generated for creating a presigned URL
72 | (for async parsing and extraction).
73 | """
74 |
75 | def wrapper(
76 | self,
77 | file_path=None,
78 | file_content=None,
79 | file_type=None,
80 | *args,
81 | **kwargs,
82 | ):
83 | # pylint: disable=too-many-arguments
84 | # Validate inputs
85 | is_valid, error_message = validate_file_inputs(
86 | file_path=file_path,
87 | file_content=file_content,
88 | file_type=file_type,
89 | )
90 |
91 | if not is_valid:
92 | return error_message, ""
93 |
94 | # Encode the file content in base64 if file_path is provided
95 | if file_path:
96 | try:
97 | with open(file_path, "rb") as file:
98 | file_content = base64.b64encode(file.read()).decode("utf-8")
99 | file_type = Path(file_path).suffix.lower().lstrip(".")
100 | except Exception as e:
101 | return f"Error: {e}", ""
102 | else:
103 | # generate a random file path for generating a presigned URL
104 | file_path = f"/tmp/{uuid.uuid4()}.{file_type}"
105 |
106 | return func(
107 | self,
108 | file_path=file_path,
109 | file_content=file_content,
110 | file_type=file_type,
111 | *args,
112 | **kwargs,
113 | )
114 |
115 | return wrapper
116 |
117 |
118 | class AnyParser:
119 | """Real-time parser for processing various data formats.
120 |
121 | Provides both synchronous and asynchronous methods for parsing and
122 | extracting information from different types of files.
123 | """
124 |
125 | def __init__(
126 | self,
127 | api_key: str,
128 | base_url: str = PUBLIC_SHARED_BASE_URL,
129 | batch_url: str = PUBLIC_BATCH_BASE_URL,
130 | ) -> None:
131 | """Initialize AnyParser with API credentials.
132 |
133 | Args:
134 | api_key: Authentication key for API access
135 | base_url: API endpoint URL, defaults to public endpoint
136 | """
137 | self._async_parser = AsyncParser(api_key, base_url)
138 | self._sync_parse = ParseSyncParser(api_key, base_url)
139 | self._sync_extract_key_value = ExtractKeyValueSyncParser(api_key, base_url)
140 | self._sync_extract_resume_key_value = ExtractResumeKeyValueSyncParser(
141 | api_key, base_url
142 | )
143 | self._sync_extract_pii = ExtractPIISyncParser(api_key, base_url)
144 | self._sync_extract_tables = ExtractTablesSyncParser(api_key, base_url)
145 | self.batches = BatchParser(api_key, batch_url)
146 |
147 | @handle_file_processing
148 | def parse(
149 | self,
150 | file_path=None,
151 | file_content=None,
152 | file_type=None,
153 | extract_args=None,
154 | ):
155 | """Extract full content from a file synchronously.
156 |
157 | Args:
158 | file_path: Path to input file
159 | file_content: Base64 encoded file content
160 | file_type: File format extension
161 | extract_args: Additional extraction parameters
162 |
163 | Returns:
164 | tuple: (result, timing_info) or (error_message, "")
165 | """
166 | return self._sync_parse.parse(
167 | file_path=file_path,
168 | file_content=file_content,
169 | file_type=file_type,
170 | extract_args=extract_args,
171 | )
172 |
173 | @handle_file_processing
174 | def extract_pii(
175 | self,
176 | file_path=None,
177 | file_content=None,
178 | file_type=None,
179 | ):
180 | """
181 | Extract PII data from a file synchronously.
182 | """
183 | return self._sync_extract_pii.extract(
184 | file_path=file_path,
185 | file_content=file_content,
186 | file_type=file_type,
187 | )
188 |
189 | @staticmethod
190 | def flatten_to_string(item):
191 | """
192 | Flatten any iterable object to a string.
193 | """
194 |
195 | if isinstance(item, str):
196 | return item
197 |
198 | # if item is a dict, flatten all keys and values
199 | if isinstance(item, dict):
200 | parts = []
201 | for k, v in item.items():
202 | parts.append(AnyParser.flatten_to_string(k))
203 | parts.append(AnyParser.flatten_to_string(v))
204 | return "".join(parts)
205 |
206 | # item is other iterable objects
207 | if isinstance(item, Iterable):
208 | parts = []
209 | for sub_item in item:
210 | parts.append(AnyParser.flatten_to_string(sub_item))
211 | return "".join(parts)
212 |
213 | # item is not iterable objects
214 | return str(item)
215 |
216 | @handle_file_processing
217 | def extract_tables(
218 | self,
219 | file_path=None,
220 | file_content=None,
221 | file_type=None,
222 | return_type="html",
223 | ):
224 | """Extract tables from a file in real-time.
225 |
226 | Args:
227 | file_path (str): The path to the file to be parsed.
228 | return_type (str): 'html' or 'csv'
229 | Returns:
230 | tuple(str, str)
231 | """
232 | extracted_html, time_elapsed = self._sync_extract_tables.extract(
233 | file_path=file_path,
234 | file_content=file_content,
235 | file_type=file_type,
236 | )
237 |
238 | if isinstance(extracted_html, list):
239 | extracted_html = AnyParser.flatten_to_string(extracted_html)
240 |
241 | if return_type.lower() == "csv":
242 | try:
243 | import pandas as pd
244 | except ImportError:
245 | raise ImportError("Please install pandas to use CSV return_type")
246 |
247 | if isinstance(extracted_html, list):
248 | extracted_html = "".join(str(item) for item in extracted_html)
249 |
250 | df_list = pd.read_html(StringIO(extracted_html))
251 | combined_df = pd.concat(df_list, ignore_index=True)
252 | csv_output = combined_df.to_csv(index=False)
253 |
254 | return csv_output, time_elapsed
255 |
256 | return extracted_html, time_elapsed
257 |
258 | @handle_file_processing
259 | def extract_key_value(
260 | self,
261 | file_path=None,
262 | file_content=None,
263 | file_type=None,
264 | extract_instruction=None,
265 | ):
266 | """Extract key-value pairs from a file in real-time.
267 |
268 | Args:
269 | file_path (str): The path to the file to be parsed.
270 | extract_instruction (Dict): A dictionary containing the keys to be
271 | extracted, with their values as the description of those keys.
272 | Returns:
273 | tuple(str, str): The extracted data and the time taken.
274 | """
275 | return self._sync_extract_key_value.extract(
276 | file_path=file_path,
277 | file_content=file_content,
278 | file_type=file_type,
279 | extract_args={"extract_instruction": extract_instruction},
280 | )
281 |
282 | @handle_file_processing
283 | def extract_resume_key_value(
284 | self, file_path=None, file_content=None, file_type=None
285 | ):
286 | """Extract resume in real-time.
287 |
288 | Args:
289 | file_path (str): The path to the file to be parsed.
290 | Returns:
291 | tuple(str, str): The extracted data and the time taken.
292 | extracted data includes:
293 | - "education": Education
294 | - "work_experience": Work Experience
295 | - "personal_info": Personal Information
296 | - "skills": Skills
297 | - "certifications": Certifications
298 | - "projects": Projects
299 | - "pii": Personally Identifiable Information - includes
300 | only name, email, and phone
301 | """
302 | return self._sync_extract_resume_key_value.extract(
303 | file_path=file_path,
304 | file_content=file_content,
305 | file_type=file_type,
306 | )
307 |
308 | # Example of decorated methods:
309 | @handle_file_processing
310 | def async_parse(
311 | self,
312 | file_path=None,
313 | file_content=None,
314 | file_type=None,
315 | extract_args=None,
316 | ):
317 | """Extract full content from a file asynchronously."""
318 | return self._async_parser.send_async_request(
319 | process_type=ProcessType.PARSE,
320 | file_path=file_path, # type: ignore
321 | file_content=file_content, # type: ignore
322 | extract_args=extract_args,
323 | )
324 |
325 | @handle_file_processing
326 | def async_parse_with_layout(
327 | self, file_path=None, file_content=None, file_type=None
328 | ):
329 | """Extract content from a file asynchronously with layout analysis."""
330 | return self._async_parser.send_async_request(
331 | process_type=ProcessType.PARSE_WITH_LAYOUT,
332 | file_path=file_path, # type: ignore
333 | file_content=file_content, # type: ignore
334 | )
335 |
336 | @handle_file_processing
337 | def async_parse_with_ocr(self, file_path=None, file_content=None, file_type=None):
338 | """Extract full content from a file asynchronously with OCR."""
339 | return self._async_parser.send_async_request(
340 | process_type=ProcessType.PARSE_WITH_OCR,
341 | file_path=file_path, # type: ignore
342 | file_content=file_content, # type: ignore
343 | )
344 |
345 | @handle_file_processing
346 | def async_extract_pii(
347 | self,
348 | file_path=None,
349 | file_content=None,
350 | file_type=None,
351 | extract_args=None,
352 | ):
353 | """Extract PII from a file asynchronously."""
354 | return self._async_parser.send_async_request(
355 | process_type=ProcessType.EXTRACT_PII,
356 | file_path=file_path, # type: ignore
357 | file_content=file_content, # type: ignore
358 | extract_args=extract_args,
359 | )
360 |
361 | @handle_file_processing
362 | def async_extract_tables(self, file_path=None, file_content=None, file_type=None):
363 | """Extract tables from a file asynchronously."""
364 | return self._async_parser.send_async_request(
365 | process_type=ProcessType.EXTRACT_TABLES,
366 | file_path=file_path, # type: ignore
367 | file_content=file_content, # type: ignore
368 | )
369 |
370 | @handle_file_processing
371 | def async_extract_key_value(
372 | self,
373 | file_path=None,
374 | file_content=None,
375 | file_type=None,
376 | extract_instruction=None,
377 | ):
378 | """Extract key-value pairs from a file asynchronously."""
379 | return self._async_parser.send_async_request(
380 | process_type=ProcessType.EXTRACT_KEY_VALUE,
381 | file_path=file_path, # type: ignore
382 | file_content=file_content, # type: ignore
383 | extract_args={"extract_instruction": extract_instruction},
384 | )
385 |
386 | @handle_file_processing
387 | def async_extract_resume_key_value(
388 | self, file_path=None, file_content=None, file_type=None
389 | ):
390 | """Extract resume key-value pairs from a file asynchronously."""
391 | return self._async_parser.send_async_request(
392 | process_type=ProcessType.EXTRACT_RESUME_KEY_VALUE,
393 | file_path=file_path, # type: ignore
394 | file_content=file_content, # type: ignore
395 | extract_args=None,
396 | )
397 |
398 | def async_fetch(
399 | self,
400 | file_id: str,
401 | sync: bool = True,
402 | sync_timeout: int = 180,
403 | sync_interval: int = 5,
404 | ) -> str:
405 | """Fetches extraction results asynchronously.
406 |
407 | Args:
408 | file_id (str): The ID of the file to fetch results for.
409 | sync (bool, optional): Whether to wait for the results
410 | synchronously.
411 | sync_timeout (int, optional): Maximum time to wait for results in
412 | seconds. Defaults to 180.
413 | sync_interval (int, optional): Time interval between polling
414 | attempts in seconds. Defaults to 5.
415 |
416 | Returns:
417 | str: The extracted results as a markdown string.
418 | None: If the extraction is still in progress (when sync is False).
419 | """
420 |
421 | response = None
422 | # Create the JSON payload
423 | payload = {"file_id": file_id}
424 | if sync:
425 | start_time = time.time()
426 | while time.time() < start_time + sync_timeout:
427 | response = requests.post(
428 | self._async_parser._async_fetch_url,
429 | headers=self._async_parser._headers,
430 | data=json.dumps(payload),
431 | timeout=TIMEOUT,
432 | )
433 | if response.status_code == 202:
434 | print("Waiting for response...")
435 | time.sleep(sync_interval)
436 | continue
437 | break
438 | else:
439 | response = requests.post(
440 | self._async_parser._async_fetch_url,
441 | headers=self._async_parser._headers,
442 | data=json.dumps(payload),
443 | timeout=TIMEOUT,
444 | )
445 |
446 | return self._async_parser.handle_async_response(response)
447 |
--------------------------------------------------------------------------------
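
A short usage sketch for the synchronous methods defined in any_parser/any_parser.py above, assuming a valid `CAMBIO_API_KEY` in the environment; the file paths and the instruction dictionary are illustrative.

```python
import os

from any_parser import AnyParser

ap = AnyParser(api_key=os.environ["CAMBIO_API_KEY"])

# Full-content parse: returns (markdown, timing info) on success,
# or (error message, "") if validation or the request fails.
markdown, timing = ap.parse(file_path="./sample_data/test_invoice.pdf")

# Key-value extraction: keys to extract, each with a description of its value.
instruction = {"invoice_number": "The unique number identifying this invoice"}
result, timing = ap.extract_key_value(
    file_path="./sample_data/test_invoice.pdf",
    extract_instruction=instruction,
)

# Table extraction returned as CSV (needs pandas for the HTML -> CSV step).
csv_text, timing = ap.extract_tables(
    file_path="./sample_data/test_1figure_1table.png",
    return_type="csv",
)
```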
/any_parser/async_parser.py:
--------------------------------------------------------------------------------
1 | """Asynchronous parser implementation."""
2 |
3 | import json
4 | from pathlib import Path
5 | from typing import Dict, Optional
6 |
7 | import requests
8 |
9 | from any_parser.base_parser import BaseParser
10 | from any_parser.constants import ProcessType
11 | from any_parser.utils import upload_file_to_presigned_url
12 |
13 | TIMEOUT = 60
14 |
15 |
16 | class BasePostProcessor:
17 | def __init__(self, successor=None) -> None:
18 | self.successor = successor
19 |
20 | def process(self, json_response: Dict) -> str:
21 | if self.successor:
22 | return self.successor.process(json_response)
23 | return f"Error: Invalid JSON response: {json_response}"
24 |
25 |
26 | class ParsePostProcessor(BasePostProcessor):
27 | def process(self, json_response: Dict) -> str:
28 | if "markdown" in json_response:
29 | return json_response["markdown"]
30 | if "result" in json_response:
31 | return json_response["result"]
32 | return super().process(json_response)
33 |
34 |
35 | class KeyValuePostProcessor(BasePostProcessor):
36 | def process(self, json_response: Dict) -> str:
37 | if "json" in json_response:
38 | return json_response["json"]
39 | return super().process(json_response)
40 |
41 |
42 | class ExtractPIIPostProcessor(BasePostProcessor):
43 | def process(self, json_response: Dict) -> str:
44 | if "pii_extraction" in json_response:
45 | return json_response["pii_extraction"]
46 | return super().process(json_response)
47 |
48 |
49 | class ExtractResumeKeyValuePostProcessor(BasePostProcessor):
50 |
51 | def process(self, json_response: Dict) -> str:
52 | if "resume_extraction" in json_response:
53 | return json_response["resume_extraction"]
54 | return super().process(json_response)
55 |
56 |
57 | class AsyncParser(BaseParser):
58 | def __init__(self, api_key: str, base_url: str) -> None:
59 | super().__init__(api_key, base_url)
60 | self._async_upload_url = f"{self._base_url}/async/upload"
61 | self._async_fetch_url = f"{self._base_url}/async/fetch"
62 |
63 | def send_async_request(
64 | self,
65 | process_type: ProcessType,
66 | file_path: str,
67 | file_content: str,
68 | extract_args: Optional[Dict] = None,
69 | ) -> str:
70 | """Extract full content from a file asynchronously.
71 |
72 | Args:
73 | process_type (ProcessType): The type of processing to be done.
74 | file_path (str): The path to the file to be parsed.
75 | file_content (str): The content of the file to be parsed.
76 | extract_args (Optional[Dict]): Additional extraction arguments.
77 |
78 | Returns:
79 | str: The file id of the uploaded file.
80 | """
81 |
82 | file_name = Path(file_path).name
83 |
84 | # Create the JSON payload
85 | payload = {
86 | "file_name": file_name,
87 | "process_type": process_type.value,
88 | }
89 |
90 | if extract_args is not None and isinstance(extract_args, dict):
91 | payload["extract_args"] = extract_args # type: ignore
92 |
93 | # Send the POST request
94 | response = requests.post(
95 | self._async_upload_url,
96 | headers=self._headers,
97 | data=json.dumps(payload),
98 | timeout=TIMEOUT,
99 | )
100 |
101 | # If response successful, upload the file
102 | return upload_file_to_presigned_url(file_content, response)
103 |
104 | def handle_async_response(self, response) -> str:
105 | if response is None:
106 | return "Error: timeout, no response received"
107 | if response.status_code == 202:
108 | return ""
109 | if response.status_code == 200:
110 | extract_resume_processor = ExtractResumeKeyValuePostProcessor()
111 | key_value_processor = KeyValuePostProcessor(extract_resume_processor)
112 | extract_pii_processor = ExtractPIIPostProcessor(key_value_processor)
113 | handler = ParsePostProcessor(extract_pii_processor)
114 | try:
115 | return handler.process(response.json())
116 | except json.JSONDecodeError:
117 | return f"Error: Invalid JSON response: {response.text}"
118 |
119 | return f"Error: {response.status_code} {response.text}"
120 |
--------------------------------------------------------------------------------
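
The post-processors in any_parser/async_parser.py form a small chain of responsibility: each handler returns its own field if present and otherwise delegates to its successor. A minimal sketch of the same chain that `handle_async_response` builds, using made-up response dictionaries purely for illustration.

```python
from any_parser.async_parser import (
    ExtractPIIPostProcessor,
    ExtractResumeKeyValuePostProcessor,
    KeyValuePostProcessor,
    ParsePostProcessor,
)

# Same chain order as handle_async_response:
# parse -> pii -> key-value -> resume key-value.
chain = ParsePostProcessor(
    ExtractPIIPostProcessor(
        KeyValuePostProcessor(ExtractResumeKeyValuePostProcessor())
    )
)

# Each fake response is resolved by the handler that recognizes its field.
print(chain.process({"markdown": "# Parsed document"}))         # ParsePostProcessor
print(chain.process({"pii_extraction": {"name": "Jane Doe"}}))  # ExtractPIIPostProcessor
print(chain.process({"json": {"total": "42"}}))                 # KeyValuePostProcessor
print(chain.process({"unexpected": 1}))                         # falls through to an error string
```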
/any_parser/base_parser.py:
--------------------------------------------------------------------------------
1 | """Base parser implementation."""
2 |
3 |
4 | class BaseParser:
5 | def __init__(self, api_key: str, base_url: str) -> None:
6 | self._api_key = api_key
7 | self._base_url = base_url
8 | self._headers = {
9 | "Content-Type": "application/json",
10 | "x-api-key": self._api_key,
11 | }
12 |
--------------------------------------------------------------------------------
/any_parser/batch_parser.py:
--------------------------------------------------------------------------------
1 | """Batch parser implementation."""
2 |
3 | import logging
4 | import os
5 | from concurrent.futures import ThreadPoolExecutor, as_completed
6 | from pathlib import Path
7 | from typing import List, Optional, Union
8 |
9 | import requests
10 | from pydantic import BaseModel, Field
11 |
12 | from any_parser.base_parser import BaseParser
13 |
14 | TIMEOUT = 60
15 | MAX_WORKERS = 10
16 |
17 | logger = logging.getLogger(__name__)
18 |
19 |
20 | class UploadResponse(BaseModel):
21 | """
22 | Response from the batch upload endpoint.
23 | """
24 |
25 | fileName: str
26 | requestId: str
27 | requestStatus: str
28 |
29 |
30 | class UsageResponse(BaseModel):
31 | """
32 | Response from the batch usage endpoint.
33 | """
34 |
35 | pageLimit: int
36 | pageRemaining: int
37 |
38 |
39 | class FileStatusResponse(BaseModel):
40 | """
41 | Response from the batch file status endpoint.
42 | """
43 |
44 | fileName: str
45 | fileType: str
46 | requestId: str
47 | requestStatus: str
48 | uploadTime: str
49 | completionTime: Optional[str] = None
50 | result: Optional[List[str]] = Field(default_factory=list)
51 | error: Optional[List[str]] = Field(default_factory=list)
52 |
53 |
54 | class BatchParser(BaseParser):
55 | def __init__(self, api_key: str, base_url: str) -> None:
56 | super().__init__(api_key, base_url)
57 | self._file_upload_url = f"{self._base_url}/files/"
58 | self._processing_status_url = f"{self._base_url}/files/" + "{request_id}"
59 | self._usage_url = f"{self._base_url}/users/current/usage"
60 |
61 | # remove "Content-Type" from headers
62 | self._headers.pop("Content-Type")
63 |
64 | def create(self, file_path: str) -> Union[UploadResponse, List[UploadResponse]]:
65 | """Upload a single file or folder for batch processing.
66 |
67 | Args:
68 | file_path: Path to the file or folder to upload
69 |
70 | Returns:
71 | If file: Single UploadResponse object containing upload details
72 | If folder: List of UploadResponse objects for each file
73 | """
74 | path = Path(file_path)
75 | if path.is_file():
76 | return self._upload_single_file(path)
77 | elif path.is_dir():
78 | return self._upload_folder(path)
79 | else:
80 | raise ValueError(f"Path {file_path} does not exist")
81 |
82 | def _upload_single_file(self, file_path: Path) -> UploadResponse:
83 | """Upload a single file for batch processing."""
84 | if not os.path.isfile(file_path):
85 | raise FileNotFoundError(f"The file path '{file_path}' does not exist.")
86 |
87 | with open(file_path, "rb") as f:
88 | files = {"file": f}
89 | response = requests.post(
90 | self._file_upload_url,
91 | headers=self._headers,
92 | files=files,
93 | timeout=TIMEOUT,
94 | )
95 |
96 | if response.status_code != 200:
97 | raise Exception(f"Upload failed: {response.text}")
98 |
99 | data = response.json()
100 | return UploadResponse(
101 | fileName=data["fileName"],
102 | requestId=data["requestId"],
103 | requestStatus=data["requestStatus"],
104 | )
105 |
106 | def _upload_folder(self, folder_path: Path) -> List[UploadResponse]:
107 | """Upload all files in a folder for batch processing.
108 |
109 | Args:
110 | folder_path: Path to the folder containing files to upload
111 |
112 | Returns:
113 | List of UploadResponse objects for each uploaded file
114 | """
115 | # Get all files in folder and subfolders
116 | files = []
117 | for root, _, filenames in os.walk(folder_path):
118 | for filename in filenames:
119 | files.append(Path(root) / filename)
120 |
121 | # Upload files concurrently using thread pool
122 | responses = []
123 | with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
124 | future_to_file = {
125 | executor.submit(self._upload_single_file, file_path): file_path
126 | for file_path in files
127 | }
128 |
129 | for future in as_completed(future_to_file):
130 | file_path = future_to_file[future]
131 | try:
132 | response = future.result()
133 | responses.append(response)
134 | except Exception as e:
135 | logger.error(f"Failed to upload {file_path}: {str(e)}")
136 |
137 | return responses
138 |
139 | def retrieve(self, request_id: str) -> FileStatusResponse:
140 | """Get the processing status of a file.
141 |
142 | Args:
143 | request_id: The ID of the file processing request
144 |
145 | Returns:
146 | FileStatusResponse object containing status details
147 | """
148 | response = requests.get(
149 | self._processing_status_url.format(request_id=request_id),
150 | headers=self._headers,
151 | timeout=TIMEOUT,
152 | )
153 |
154 | if response.status_code != 200:
155 | raise Exception(f"Status check failed: {response.text}")
156 |
157 | data = response.json()
158 | return FileStatusResponse(**data)
159 |
160 | def get_usage(self) -> UsageResponse:
161 | """Get current usage information.
162 |
163 | Returns:
164 | UsageResponse object containing usage details
165 | """
166 | response = requests.get(
167 | self._usage_url,
168 | headers=self._headers,
169 | timeout=TIMEOUT,
170 | )
171 |
172 | if response.status_code != 200:
173 | raise Exception(f"Usage check failed: {response.text}")
174 |
175 | data = response.json()
176 | return UsageResponse(
177 | pageLimit=data["pageLimit"], pageRemaining=data["pageRemaining"]
178 | )
179 |
--------------------------------------------------------------------------------
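
A minimal sketch of the folder-upload flow described in the README and implemented by `BatchParser.create` above, writing one JSON line per uploaded file so the request IDs can be retrieved later; the folder path and output filename are illustrative.

```python
import json
import os

from any_parser import AnyParser

ap = AnyParser(api_key=os.environ["CAMBIO_API_KEY"])

# Uploading a folder returns one UploadResponse per file.
responses = ap.batches.create("./sample_data")

# Persist filename / request ID pairs so results can be fetched later.
with open("batch_requests.jsonl", "w", encoding="utf-8") as f:
    for r in responses:
        record = {
            "fileName": r.fileName,
            "requestId": r.requestId,
            "requestStatus": r.requestStatus,
        }
        f.write(json.dumps(record) + "\n")

# Later: read the JSONL back and check the status of each request.
with open("batch_requests.jsonl", encoding="utf-8") as f:
    for line in f:
        entry = json.loads(line)
        status = ap.batches.retrieve(entry["requestId"])
        print(entry["fileName"], status.requestStatus)
```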
/any_parser/constants.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 |
3 |
4 | class ProcessType(Enum):
5 | EXTRACT_PII = "extract_pii"
6 | EXTRACT_TABLES = "extract_tables"
7 | EXTRACT_KEY_VALUE = "extract_key_value"
8 | EXTRACT_RESUME_KEY_VALUE = "extract_resume_key_value"
9 | PARSE = "parse"
10 | PARSE_WITH_OCR = "parse_with_ocr"
11 | PARSE_WITH_LAYOUT = "parse_with_layout"
12 |
--------------------------------------------------------------------------------
/any_parser/sync_parser.py:
--------------------------------------------------------------------------------
1 | """Synchronous parser implementation."""
2 |
3 | import json
4 | import time
5 | from typing import Any, Dict, Optional, Tuple
6 |
7 | import requests
8 |
9 | from any_parser.base_parser import BaseParser
10 |
11 | TIMEOUT = 60
12 |
13 |
14 | class BaseSyncParser(BaseParser):
15 |
16 | def get_sync_response(
17 | self,
18 | url_endpoint: str,
19 | file_content: str,
20 | file_type: str,
21 | extract_args: Optional[Dict[str, Any]] = None,
22 | ) -> Tuple[Optional[requests.Response], str]:
23 | payload = {
24 | "file_content": file_content,
25 | "file_type": file_type,
26 | }
27 | if extract_args:
28 | payload["extract_args"] = extract_args # type: ignore
29 |
30 | start_time = time.time()
31 | response = requests.post(
32 | url_endpoint,
33 | headers=self._headers,
34 | data=json.dumps(payload),
35 | timeout=TIMEOUT,
36 | )
37 | end_time = time.time()
38 |
39 | if response.status_code != 200:
40 | return None, f"Error: {response.status_code} {response.text}"
41 |
42 | return response, f"{end_time - start_time:.2f} seconds"
43 |
44 | def parse(
45 | self,
46 | file_path=None,
47 | file_content=None,
48 | file_type=None,
49 | extract_args=None,
50 | ):
51 | """Converts the given file to markdown."""
52 | raise NotImplementedError
53 |
54 | def extract(
55 | self,
56 | file_path=None,
57 | file_content=None,
58 | file_type=None,
59 | extract_args=None,
60 | ):
61 | """Extracts information from the given file."""
62 | raise NotImplementedError
63 |
64 |
65 | class ParseSyncParser(BaseSyncParser):
66 | """Parse parser implementation."""
67 |
68 | def parse(
69 | self,
70 | file_path=None,
71 | file_content=None,
72 | file_type=None,
73 | extract_args=None,
74 | ):
75 | response, info = self.get_sync_response(
76 | f"{self._base_url}/parse",
77 | file_content=file_content, # type: ignore
78 | file_type=file_type, # type: ignore
79 | extract_args=extract_args,
80 | )
81 |
82 | if response is None:
83 | return info, ""
84 |
85 | try:
86 | response_data = response.json()
87 | result = response_data["markdown"]
88 | return result, f"Time Elapsed: {info}"
89 | except json.JSONDecodeError:
90 | return f"Error: Invalid JSON response: {response.text}", ""
91 |
92 |
93 | class ExtractPIISyncParser(BaseSyncParser):
94 | """Extract PII parser implementation."""
95 |
96 | def extract(
97 | self,
98 | file_path=None,
99 | file_content=None,
100 | file_type=None,
101 | extract_args=None,
102 | ):
103 | response, info = self.get_sync_response(
104 | f"{self._base_url}/extract_pii",
105 | file_content=file_content, # type: ignore
106 | file_type=file_type, # type: ignore
107 | extract_args=None,
108 | )
109 |
110 | if response is None:
111 | return info, ""
112 |
113 | try:
114 | response_data = response.json()
115 | result = response_data["pii_extraction"]
116 | return result, f"Time Elapsed: {info}"
117 | except json.JSONDecodeError:
118 | return f"Error: Invalid JSON response: {response.text}", ""
119 |
120 |
121 | class ExtractTablesSyncParser(BaseSyncParser):
122 | """Extract tables parser implementation."""
123 |
124 | def extract(
125 | self,
126 | file_path=None,
127 | file_content=None,
128 | file_type=None,
129 | extract_args=None,
130 | ):
131 | response, info = self.get_sync_response(
132 | f"{self._base_url}/extract_tables",
133 | file_content=file_content, # type: ignore
134 | file_type=file_type, # type: ignore
135 | extract_args=None,
136 | )
137 |
138 | if response is None:
139 | return info, ""
140 |
141 | try:
142 | response_data = response.json()
143 | result = response_data["markdown"]
144 | return result, f"Time Elapsed: {info}"
145 | except json.JSONDecodeError:
146 | return f"Error: Invalid JSON response: {response.text}", ""
147 |
148 |
149 | class ExtractKeyValueSyncParser(BaseSyncParser):
150 | """Extract key-value parser implementation."""
151 |
152 | def extract(
153 | self,
154 | file_path=None,
155 | file_content=None,
156 | file_type=None,
157 | extract_args=None,
158 | ):
159 | response, info = self.get_sync_response(
160 | f"{self._base_url}/extract_key_value",
161 | file_content=file_content, # type: ignore
162 | file_type=file_type, # type: ignore
163 | extract_args={"extract_instruction": extract_args},
164 | )
165 |
166 | if response is None:
167 | return info, ""
168 |
169 | try:
170 | response_data = response.json()
171 | result = response_data["json"]
172 | return result, f"Time Elapsed: {info}"
173 | except json.JSONDecodeError:
174 | return f"Error: Invalid JSON response: {response.text}", ""
175 |
176 |
177 | class ExtractResumeKeyValueSyncParser(BaseSyncParser):
178 | """Extract resume key-value parser implementation."""
179 |
180 | def extract(
181 | self,
182 | file_path=None,
183 | file_content=None,
184 | file_type=None,
185 | extract_args=None,
186 | ):
187 | response, info = self.get_sync_response(
188 | f"{self._base_url}/extract_resume_key_value",
189 | file_content=file_content, # type: ignore
190 | file_type=file_type, # type: ignore
191 | extract_args=None,
192 | )
193 |
194 | if response is None:
195 | return info, ""
196 |
197 | try:
198 | response_data = response.json()
199 | result = response_data["extraction_result"]
200 | return result, f"Time Elapsed: {info}"
201 | except json.JSONDecodeError:
202 | return f"Error: Invalid JSON response: {response.text}", ""
203 |
--------------------------------------------------------------------------------
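
The sync parsers above all post a JSON payload of base64 `file_content` plus `file_type` to their endpoint, which means the top-level API can also be called with pre-encoded content instead of a path. A small sketch under that assumption, using the `AnyParser.parse` signature shown earlier in this dump; the sample file is illustrative.

```python
import base64
import os

from any_parser import AnyParser

ap = AnyParser(api_key=os.environ["CAMBIO_API_KEY"])

# When passing raw content instead of a path, file_type is required
# because it cannot be inferred from a file extension.
with open("./sample_data/test_odf.docx", "rb") as f:
    encoded = base64.b64encode(f.read()).decode("utf-8")

markdown, timing = ap.parse(file_content=encoded, file_type="docx")
print(timing)
print(markdown)
```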
/any_parser/utils.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import io
3 | import json
4 | from enum import Enum
5 | from pathlib import Path
6 | from typing import Optional, Tuple
7 |
8 | import requests
9 |
10 | SUPPORTED_FILE_EXTENSIONS = [
11 | "pdf",
12 | "doc",
13 | "docx",
14 | "ppt",
15 | "pptx",
16 | "jpg",
17 | "jpeg",
18 | "png",
19 | "gif",
20 | ]
21 |
22 |
23 | class ValidationError(Enum):
24 | MISSING_INPUTS = "Either file_content or file_path must be provided"
25 | MISSING_FILE_TYPE = "file_type must be provided when using file_content"
26 | NOT_FOUND = "File does not exist: {}"
27 | UNSUPPORTED_FILE_TYPE = "Unsupported file type: {}. Supported file types: {}"
28 | FILE_EMPTY = "File is empty: {}"
29 | FILE_TOO_LARGE = "File size exceeds maximum limit of {} MB: {}"
30 | OTHER = "{}"
31 |
32 |
33 | def validate_file_inputs(
34 | file_path: Optional[str],
35 | file_content: Optional[str],
36 | file_type: Optional[str],
37 | ) -> Tuple[bool, str]:
38 | """Validate inputs for the parser or extractor.
39 |
40 | Args:
41 | file_content (Optional[str]): Base64 encoded file content
42 | file_path (Optional[str]): Path to the file
43 | file_type (Optional[str]): File extension/type
44 |
45 | Returns:
46 | Tuple[bool, str]: (is_valid, error_message)
47 | - is_valid: True if validation passes, False otherwise
48 | - error_message: "" if validation passes, error if validation fails
49 | """
50 | # Check if at least one input method is provided
51 | if file_content is None and file_path is None:
52 | return False, ValidationError.MISSING_INPUTS.value
53 |
54 | # Validate file_content path
55 | if file_content is not None and file_type is None:
56 | return False, ValidationError.MISSING_FILE_TYPE.value
57 |
58 | # Validate file path if provided
59 | if file_path is not None:
60 | path = Path(file_path)
61 |
62 | # Check if file exists
63 | if not path.is_file():
64 | return False, ValidationError.NOT_FOUND.value.format(file_path)
65 |
66 | # Check if file is empty
67 | if path.stat().st_size == 0:
68 | return False, ValidationError.FILE_EMPTY.value.format(file_path)
69 |
70 | # If file_type not provided, extract it from file_path
71 | if file_type is None:
72 | file_type = path.suffix.lower().lstrip(".")
73 |
74 | # Validate file type
75 | if file_type not in SUPPORTED_FILE_EXTENSIONS:
76 | supported_types = ", ".join(sorted(SUPPORTED_FILE_EXTENSIONS))
77 | return False, ValidationError.UNSUPPORTED_FILE_TYPE.value.format(
78 | file_type, supported_types
79 | )
80 |
81 | return True, ""
82 |
83 |
84 | def upload_file_to_presigned_url(
85 | file_content: str, response: requests.Response, timeout: int = 10
86 | ) -> str:
87 | if response.status_code == 200:
88 | try:
89 | file_id = response.json().get("fileId")
90 | presigned_url = response.json().get("presignedUrl")
91 |
92 | # Decode base64 content
93 | decoded_content = base64.b64decode(file_content)
94 |
95 | # Create file-like object from decoded content
96 | files = {"file": ("file", io.BytesIO(decoded_content))}
97 |
98 | upload_resp = requests.post(
99 | presigned_url["url"],
100 | data=presigned_url["fields"],
101 | files=files,
102 | timeout=timeout,
103 | )
104 | if upload_resp.status_code != 204:
105 | return f"Error: {upload_resp.status_code} {upload_resp.text}"
106 | return file_id
107 | except json.JSONDecodeError:
108 | return "Error: Invalid JSON response"
109 | else:
110 | return f"Error: {response.status_code} {response.text}"
111 |
--------------------------------------------------------------------------------
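
`validate_file_inputs` above is the same check that the decorator in any_parser.py runs before any request is sent, so it can also be used to pre-screen files cheaply. A minimal sketch; the paths are illustrative.

```python
from any_parser.utils import validate_file_inputs

candidates = [
    "./sample_data/test_invoice.pdf",  # supported type, should pass if present
    "./sample_data/notes.txt",         # illustrative; fails validation (missing or unsupported)
]

for path in candidates:
    is_valid, error = validate_file_inputs(
        file_path=path, file_content=None, file_type=None
    )
    if is_valid:
        print(f"OK to submit: {path}")
    else:
        print(f"Skipping {path}: {error}")
```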
/examples/async_extract_pii.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
10 | "# !pip3 install --upgrade ipython\n",
11 | "# !pip3 install --upgrade any-parser"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from any_parser import AnyParser"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "ap = AnyParser(api_key=\"...\")"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "file_path = \"./sample_data/resume_1.pdf\"\n",
39 | "file_id = ap.async_extract_pii(file_path)"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 6,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "pii_info = ap.async_fetch(file_id=file_id)"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 7,
54 | "metadata": {},
55 | "outputs": [
56 | {
57 | "data": {
58 | "text/plain": [
59 | "{'name': 'Gary Jiang',\n",
60 | " 'phone_number': '+1-213-725-7637',\n",
61 | " 'address': None,\n",
62 | " 'email_address': 'jiangzhehuan0105@gmail.com',\n",
63 | " 'linkedin_url': 'https://linkedin.com/in/gary-jiang',\n",
64 | " 'github_url': None,\n",
65 | " 'summary': 'Full-stack Software Engineer'}"
66 | ]
67 | },
68 | "execution_count": 7,
69 | "metadata": {},
70 | "output_type": "execute_result"
71 | }
72 | ],
73 | "source": [
74 | "pii_info"
75 | ]
76 | }
77 | ],
78 | "metadata": {
79 | "kernelspec": {
80 | "display_name": "any",
81 | "language": "python",
82 | "name": "python3"
83 | },
84 | "language_info": {
85 | "codemirror_mode": {
86 | "name": "ipython",
87 | "version": 3
88 | },
89 | "file_extension": ".py",
90 | "mimetype": "text/x-python",
91 | "name": "python",
92 | "nbconvert_exporter": "python",
93 | "pygments_lexer": "ipython3",
94 | "version": "-1.-1.-1"
95 | }
96 | },
97 | "nbformat": 4,
98 | "nbformat_minor": 2
99 | }
100 |
--------------------------------------------------------------------------------
/examples/async_extract_resume_key_value.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
10 | "# !pip3 install --upgrade ipython\n",
11 | "# !pip3 install --upgrade any-parser"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from IPython.display import display\n",
21 | "from any_parser import AnyParser"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "ap = AnyParser(api_key=\"...\")"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "file_path = \"./sample_data/resume_1.pdf\"\n",
40 | "file_id = ap.async_extract_resume_key_value(file_path)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 4,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "name": "stdout",
50 | "output_type": "stream",
51 | "text": [
52 | "Waiting for response...\n",
53 | "Waiting for response...\n",
54 | "Waiting for response...\n"
55 | ]
56 | }
57 | ],
58 | "source": [
59 | "json_result = ap.async_fetch(file_id=file_id)"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 5,
65 | "metadata": {},
66 | "outputs": [
67 | {
68 | "data": {
69 | "text/plain": [
70 | "{'pii': {'full_name': 'GARY JIANG',\n",
71 | " 'email': 'jiangzhehuan0105@gmail.com',\n",
72 | " 'phone': '+1 (213) 725-7637'},\n",
73 | " 'education': [{'organization': 'Shenyang University of Technology',\n",
74 | " 'degree': \"Bachelor's Degree\",\n",
75 | " 'major': 'Computer Science',\n",
76 | " 'start_date': '2008-01-01',\n",
77 | " 'end_date': '2012-12-31',\n",
78 | " 'courses': None,\n",
79 | " 'achievements': None}],\n",
80 | " 'work_experience': [{'job_title': 'Full Stack Developer',\n",
81 | " 'company_name': 'VIMMERSE',\n",
82 | " 'location': None,\n",
83 | " 'start_date': '2023-06-01',\n",
84 | " 'end_date': 'present',\n",
85 | " 'job_type': None,\n",
86 | " 'summary': 'Developed an AI-powered editor web application that brings photos to life by animating images in 3D.',\n",
87 | " 'bullet_points': ['Developed robust back-end services utilizing Python (Flask/FastAPI) and Node.js (AWS Lambda) for efficient and scalable web applications',\n",
88 | " 'Built user-friendly and interactive websites using Next.js, ensuring a seamless user experience',\n",
89 | " 'Deployed and managed AWS infrastructure, including EC2 instances, S3 storage buckets, DynamoDB for NoSQL data management, and Cognito user pools for secure user authentication',\n",
90 | " 'Experienced Agile and Scrum methodologies within a fast-paced startup environment to ensure efficient project delivery and continuous improvement',\n",
91 | " 'Collaborated effectively with cross-functional teams (design, product management) to deliver projects on time, fostering a positive and collaborative work environment']},\n",
92 | " {'job_title': 'Full Stack Developer',\n",
93 | " 'company_name': 'VIKING SASQUATCH',\n",
94 | " 'location': None,\n",
95 | " 'start_date': '2023-01-01',\n",
96 | " 'end_date': '2023-06-01',\n",
97 | " 'job_type': None,\n",
98 | " 'summary': 'Developed APIs and Integrations for all of the parties that work on Real Estate Transactions.',\n",
99 | " 'bullet_points': ['Connecting Mortgage, Title, and Real Estate to solve pain points and improve automation and create efficiencies',\n",
100 | " 'Implemented a user-friendly front-end interface using Nuxt.js, ensuring a seamless user experience',\n",
101 | " 'Built backend APIs utilizing Node.js serverless functions for optimal performance',\n",
102 | " 'Managed data storage and security by implementing a MySQL database',\n",
103 | " 'Collaborated effectively within a team using Agile methodologies like sprint planning, daily standups, retrospectives to ensure project delivery and continuous improvement']},\n",
104 | " {'job_title': 'Full Stack Developer',\n",
105 | " 'company_name': 'ROX PAY SRL',\n",
106 | " 'location': None,\n",
107 | " 'start_date': '2021-12-01',\n",
108 | " 'end_date': '2022-12-31',\n",
109 | " 'job_type': None,\n",
110 | " 'summary': 'Built Fintech Software House that aims to optimize B2B payments by offering a systemic solution that gives value-added services in collection of payments, financial information and corporate liquidity by essentially creating a Commission Free, Open Loop, Payment Gateway system.',\n",
111 | " 'bullet_points': ['Developed front-end by using React.js and Redux, Javascript/Typescript',\n",
112 | " 'Contributed developing backend utilizing Django/Python']},\n",
113 | " {'job_title': 'Freelancer',\n",
114 | " 'company_name': 'FREELANCE',\n",
115 | " 'location': None,\n",
116 | " 'start_date': '2017-09-01',\n",
117 | " 'end_date': '2021-10-31',\n",
118 | " 'job_type': None,\n",
119 | " 'summary': 'Developed and managed many web and mobile applications while working as freelancer at Internet Dzyns LLC company.',\n",
120 | " 'bullet_points': ['Developed multiple web applications, participating in the whole process of their development: product design and estimation, code design and development, DevOps, UI/UX design, product launch and maintenance',\n",
121 | " 'Developed cross-platform mobile application using Flutter and Ionic/Angular',\n",
122 | " 'Developed NFT marketplace websites and wrote smart contracts']},\n",
123 | " {'job_title': 'Server Administrator, Java Developer',\n",
124 | " 'company_name': 'NEUSOFT',\n",
125 | " 'location': None,\n",
126 | " 'start_date': '2014-06-01',\n",
127 | " 'end_date': '2017-08-31',\n",
128 | " 'job_type': None,\n",
129 | " 'summary': 'Worked as intern and software developer after graduated university.',\n",
130 | " 'bullet_points': ['Correct analytical and reasoning skills to troubleshoot and repair server issues',\n",
131 | " 'Operating Systems & Security Software',\n",
132 | " 'Java / Spring Boot / Hibernate']}],\n",
133 | " 'personal_info': {'name': 'GARY JIANG',\n",
134 | " 'phone_number': '+1-213-725-7637',\n",
135 | " 'address': None,\n",
136 | " 'email_address': 'jiangzhehuan0105@gmail.com',\n",
137 | " 'linkedin_url': 'linkedin.com/in/gary-jiang',\n",
138 | " 'github_url': None,\n",
139 | " 'summary': None},\n",
140 | " 'skills': {'Programming Languages': ['Python',\n",
141 | " 'PHP',\n",
142 | " 'Javascript',\n",
143 | " 'Typescript',\n",
144 | " 'HTML',\n",
145 | " 'CSS'],\n",
146 | " 'Tools': ['Flask',\n",
147 | " 'Django',\n",
148 | " 'FastAPI',\n",
149 | " 'Laravel',\n",
150 | " 'Node.js',\n",
151 | " 'SQL databases',\n",
152 | " 'Next.js',\n",
153 | " 'React',\n",
154 | " 'Redux',\n",
155 | " 'Nuxt.js',\n",
156 | " 'Vue',\n",
157 | " 'AWS Lambda',\n",
158 | " 'Cognito',\n",
159 | " 'EC2',\n",
160 | " 'S3',\n",
161 | " 'DynamoDB',\n",
162 | " 'API Gateway',\n",
163 | " 'Flutter',\n",
164 | " 'Ionic',\n",
165 | " 'Angular',\n",
166 | " 'Git',\n",
167 | " 'Version Control',\n",
168 | " 'DevOps',\n",
169 | " 'CI/CD'],\n",
170 | " 'Other': ['Startup Experience',\n",
171 | " 'Adaptable',\n",
172 | " 'Resourceful',\n",
173 | " 'Prioritization',\n",
174 | " 'Hybrid Mobile App Development',\n",
175 | " 'AGILE',\n",
176 | " 'SCRUM']},\n",
177 | " 'certifications': [],\n",
178 | " 'projects': []}"
179 | ]
180 | },
181 | "metadata": {},
182 | "output_type": "display_data"
183 | }
184 | ],
185 | "source": [
186 | "display(json_result)"
187 | ]
188 | }
189 | ],
190 | "metadata": {
191 | "kernelspec": {
192 | "display_name": "any",
193 | "language": "python",
194 | "name": "python3"
195 | },
196 | "language_info": {
197 | "codemirror_mode": {
198 | "name": "ipython",
199 | "version": 3
200 | },
201 | "file_extension": ".py",
202 | "mimetype": "text/x-python",
203 | "name": "python",
204 | "nbconvert_exporter": "python",
205 | "pygments_lexer": "ipython3",
206 | "version": "3.11.10"
207 | }
208 | },
209 | "nbformat": 4,
210 | "nbformat_minor": 2
211 | }
212 |
--------------------------------------------------------------------------------
/examples/async_extract_tables.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
10 | "# !pip3 install --upgrade ipython\n",
11 | "# !pip3 install --upgrade any-parser"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from IPython.display import display, Markdown\n",
21 | "from any_parser import AnyParser\n",
22 | "import os"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 3,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "ap = AnyParser(api_key=os.getenv(\"CAMBIO_API_KEY\"))"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 4,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "file_path = \"./sample_data/sample.pdf\"\n",
41 | "file_id = ap.async_extract_tables(file_path)"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 5,
47 | "metadata": {},
48 | "outputs": [
49 | {
50 | "name": "stdout",
51 | "output_type": "stream",
52 | "text": [
53 | "Waiting for response...\n",
54 | "Waiting for response...\n"
55 | ]
56 | }
57 | ],
58 | "source": [
59 | "markdown_output = ap.async_fetch(file_id=file_id)"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 6,
65 | "metadata": {},
66 | "outputs": [
67 | {
68 | "data": {
69 | "text/plain": [
70 | "['\\n\\n1 Overview 3 | Technical information |
\\n2 Key requirements 4 | Ordering information |
\\n3 Planned availability date 5 | Terms and conditions |
\\n3 Program number 8 | Prices |
\\n3 Publications 8 | Announcement countries |
\\n\\n
\\n\\n',\n",
71 | " '',\n",
72 | " '\\n\\nProgram number | VRM | Program name |
\\n5737-L70 | 2.8.0 | IBM InfoSphere Optim Data Privacy for Unstructured Data |
\\n\\n
\\n\\n',\n",
73 | " '\\n\\nPart number description | Part number |
\\nIBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte License + SW Subscription & Support 12 Months | D2604LL |
\\n\\n
\\n\\n',\n",
74 | " '\\n\\nPart number description | Part number |
\\nIBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte SW Subscription & Support Reinstatement 12 Months | D2605LL |
\\nIBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte Annual SW Subscription & Support Renewal 12 Months | E0QGMLL |
\\nIBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte Monthly License | D2608LL |
\\n\\n
\\n\\n\\n\\nPart number description | Part number |
\\nIBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems(R) Annual Terabyte License + SW Subscription & Support 12 Months | D2606LL |
\\nIBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems Annual Terabyte SW Subscription & Support Reinstatement 12 Months | D2607LL |
\\nIBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems Annual Terabyte Annual SW Subscription & Support Renewal 12 Months | E0QGNLL |
\\nIBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems Annual Terabyte Monthly License | D2609LL |
\\n\\n
\\n\\n\\n\\nProgram identifier | License Information document title | License Information document number |
\\n5737-L70 | IBM InfoSphere Optim Data Privacy for Unstructured Data | L-JERN-BFQ3KR |
\\n\\n
\\n\\n',\n",
75 | " '\\n\\nProgram identifier | License Information document title | License Information document number |
\\n5737-L70 | IBM InfoSphere Optim Data Privacy for Unstructured Data | L-JERN-BFQ3KR |
\\n\\n
\\n\\n',\n",
76 | " '',\n",
77 | " '',\n",
78 | " '']"
79 | ]
80 | },
81 | "execution_count": 6,
82 | "metadata": {},
83 | "output_type": "execute_result"
84 | }
85 | ],
86 | "source": [
87 | "markdown_output"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 9,
93 | "metadata": {},
94 | "outputs": [
95 | {
96 | "data": {
97 | "text/markdown": [
98 | "\n",
99 | "\n",
100 | "1 Overview 3 | Technical information |
\n",
101 | "2 Key requirements 4 | Ordering information |
\n",
102 | "3 Planned availability date 5 | Terms and conditions |
\n",
103 | "3 Program number 8 | Prices |
\n",
104 | "3 Publications 8 | Announcement countries |
\n",
105 | "\n",
106 | "
\n",
107 | "\n"
108 | ],
109 | "text/plain": [
110 | ""
111 | ]
112 | },
113 | "metadata": {},
114 | "output_type": "display_data"
115 | }
116 | ],
117 | "source": [
118 | "display(Markdown(markdown_output[0]))"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 14,
124 | "metadata": {},
125 | "outputs": [
126 | {
127 | "data": {
128 | "text/markdown": [
129 | "\n",
130 | "\n",
131 | "Part number description | Part number |
\n",
132 | "IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte SW Subscription & Support Reinstatement 12 Months | D2605LL |
\n",
133 | "IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte Annual SW Subscription & Support Renewal 12 Months | E0QGMLL |
\n",
134 | "IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte Monthly License | D2608LL |
\n",
135 | "\n",
136 | "
\n",
137 | "\n",
138 | "\n",
139 | "\n",
140 | "Part number description | Part number |
\n",
141 | "IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems(R) Annual Terabyte License + SW Subscription & Support 12 Months | D2606LL |
\n",
142 | "IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems Annual Terabyte SW Subscription & Support Reinstatement 12 Months | D2607LL |
\n",
143 | "IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems Annual Terabyte Annual SW Subscription & Support Renewal 12 Months | E0QGNLL |
\n",
144 | "IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems Annual Terabyte Monthly License | D2609LL |
\n",
145 | "\n",
146 | "
\n",
147 | "\n",
148 | "\n",
149 | "\n",
150 | "Program identifier | License Information document title | License Information document number |
\n",
151 | "5737-L70 | IBM InfoSphere Optim Data Privacy for Unstructured Data | L-JERN-BFQ3KR |
\n",
152 | "\n",
153 | "
\n",
154 | "\n"
155 | ],
156 | "text/plain": [
157 | ""
158 | ]
159 | },
160 | "metadata": {},
161 | "output_type": "display_data"
162 | }
163 | ],
164 | "source": [
165 | "display(Markdown(markdown_output[4]))"
166 | ]
167 | }
168 | ],
169 | "metadata": {
170 | "kernelspec": {
171 | "display_name": "any",
172 | "language": "python",
173 | "name": "python3"
174 | },
175 | "language_info": {
176 | "codemirror_mode": {
177 | "name": "ipython",
178 | "version": 3
179 | },
180 | "file_extension": ".py",
181 | "mimetype": "text/x-python",
182 | "name": "python",
183 | "nbconvert_exporter": "python",
184 | "pygments_lexer": "ipython3",
185 | "version": "-1.-1.-1"
186 | }
187 | },
188 | "nbformat": 4,
189 | "nbformat_minor": 2
190 | }
191 |
--------------------------------------------------------------------------------
/examples/async_parse_pdf2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
10 | "# !pip3 install --upgrade ipython\n",
11 | "# !pip3 install --upgrade any-parser"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from IPython.display import display, Markdown\n",
21 | "from any_parser import AnyParser\n",
22 | "import os\n",
23 | "from dotenv import load_dotenv"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 3,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "load_dotenv(override=True)\n",
33 | "example_apikey = os.getenv(\"CAMBIO_API_KEY\")"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 4,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "ap = AnyParser(example_apikey)\n",
43 | "\n",
44 | "# Define extract_args as a dictionary with your desired parameters\n",
45 | "extract_args = {\n",
46 | " \"vqa_figures_flag\": True,\n",
47 | " \"vqa_charts_flag\": True\n",
48 | "}\n",
49 | "\n",
50 | "file_id = ap.async_parse(file_path=\"./sample_data/Earnings-Presentation-Q2-2024.pdf\", extract_args=extract_args)"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 5,
56 | "metadata": {},
57 | "outputs": [
58 | {
59 | "name": "stdout",
60 | "output_type": "stream",
61 | "text": [
62 | "Waiting for response...\n",
63 | "Waiting for response...\n",
64 | "Waiting for response...\n",
65 | "Waiting for response...\n",
66 | "Waiting for response...\n"
67 | ]
68 | }
69 | ],
70 | "source": [
71 | "markdown_output = ap.async_fetch(file_id=file_id)"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 7,
77 | "metadata": {},
78 | "outputs": [
79 | {
80 | "data": {
81 | "text/markdown": [
82 | "Meta Earnings Presentation Q2 2024 \n",
83 | "\n",
84 | "investor.fb.com\n",
85 | "\n",
86 | " Meta logo, consisting of an infinity symbol followed by the text \"Meta\"\n",
87 | "\n",
88 | "Revenue by User Geography Meta logo \n",
89 | "\n",
90 | "In Millions\n",
91 | "\n",
92 | " \n",
93 | "| Quarter | US & Canada | Europe | Asia-Pacific | Rest of World | Total |\n",
94 | "|---|---|---|---|---|---|\n",
95 | "| Q2'24 | 16,847 | 9,300 | 7,888 | 5,036 | 39,071 |\n",
96 | "| Q1'24 | 15,824 | 8,483 | 7,481 | 4,667 | 36,455 |\n",
97 | "| Q4'23 | 18,585 | 9,441 | 7,512 | 4,573 | 40,111 |\n",
98 | "| Q3'23 | 15,190 | 7,777 | 6,928 | 4,251 | 34,146 |\n",
99 | "| Q2'23 | 14,422 | 7,323 | 6,515 | 3,739 | 31,999 |\n",
100 | "| Q1'23 | 13,048 | 6,345 | 5,960 | 3,292 | 28,645 |\n",
101 | "| Q4'22 | 15,636 | 7,050 | 6,050 | 3,429 | 32,165 |\n",
102 | "| Q3'22 | 13,035 | 5,797 | 5,782 | 3,100 | 27,714 |\n",
103 | "| Q2'22 | 13,249 | 6,452 | 5,797 | 3,213 | 28,822 |\n",
104 | "\n",
105 | "This stacked bar chart shows the revenue by user geography for Meta from Q2'22 to Q2'24. The revenue is divided into four categories: US & Canada, Europe, Asia-Pacific, and Rest of World. The total revenue for each quarter is shown at the top of each bar.\n",
106 | " \n",
107 | "\n",
108 | "Our revenue by user geography is geographically apportioned based on our estimation of the geographic location of our users when they perform a revenue-generating activity. This allocation differs from our revenue disaggregated by geography disclosure in our condensed consolidated financial statements where revenue is geographically apportioned based on the addresses of our customers.\n",
109 | "\n",
110 | " 3\n",
111 | "\n",
112 | "Segment Results Meta logo \n",
113 | "\n",
114 | "In Millions\n",
115 | "\n",
116 | " \n",
117 | "| | Q2'22 | Q3'22 | Q4'22 | Q1'23 | Q2'23 | Q3'23 | Q4'23 | Q1'24 | Q2'24 |\n",
118 | "|---|---|---|---|---|---|---|---|---|---|\n",
119 | "| Advertising | $ 28,152 | $ 27,237 | $ 31,254 | $ 28,101 | $ 31,498 | $ 33,643 | $ 38,706 | $ 35,635 | $ 38,329 |\n",
120 | "| Other | 218 | 192 | 184 | 205 | 225 | 293 | 334 | 380 | 389 |\n",
121 | "| Family of Apps Revenue | 28,370 | 27,429 | 31,438 | 28,306 | 31,723 | 33,936 | 39,040 | 36,015 | 38,718 |\n",
122 | "| Reality Labs Revenue | 452 | 285 | 727 | 339 | 276 | 210 | 1,071 | 440 | 353 |\n",
123 | "| Total Revenue | $ 28,822 | $ 27,714 | $ 32,165 | $ 28,645 | $ 31,999 | $ 34,146 | $ 40,111 | $ 36,455 | $ 39,071 |\n",
124 | "| Family of Apps Operating Income | $ 11,164 | $ 9,336 | $ 10,678 | $ 11,219 | $ 13,131 | $ 17,490 | $ 21,030 | $ 17,664 | $ 19,335 |\n",
125 | "| Reality Labs Operating (Loss) | (2,806) | (3,672) | (4,279) | (3,992) | (3,739) | (3,742) | (4,646) | (3,846) | (4,488) |\n",
126 | "| Total Income from Operations | $ 8,358 | $ 5,664 | $ 6,399 | $ 7,227 | $ 9,392 | $ 13,748 | $ 16,384 | $ 13,818 | $ 14,847 |\n",
127 | "| Operating Margin | 29% | 20% | 20% | 25% | 29% | 40% | 41% | 38% | 38% |\n",
128 | " \n",
129 | "\n",
130 | "We report our financial results based on two reportable segments: Family of Apps (FoA) and Reality Labs (RL). FoA includes Facebook, Instagram, Messenger, WhatsApp, and other services. RL includes our virtual, augmented, and mixed reality related consumer hardware, software, and content.\n",
131 | "\n",
132 | " 4\n",
133 | "\n",
134 | "Net Income Meta logo \n",
135 | "\n",
136 | "In Millions\n",
137 | "\n",
138 | " \n",
139 | "| Quarter | Net Income |\n",
140 | "|---|---|\n",
141 | "| Q2'22 | $6,687 |\n",
142 | "| Q3'22 | $4,395 |\n",
143 | "| Q4'22 | $4,652 |\n",
144 | "| Q1'23 | $5,709 |\n",
145 | "| Q2'23 | $7,788 |\n",
146 | "| Q3'23 | $11,583 |\n",
147 | "| Q4'23 | $14,017 |\n",
148 | "| Q1'24 | $12,369 |\n",
149 | "| Q2'24 | $13,465 |\n",
150 | "\n",
151 | "This bar chart shows the Net Income in millions for Meta from Q2'22 to Q2'24. The y-axis ranges from $0 to $14,017 million, with increments of $1,000 million. The highest net income was $14,017 million in Q4'23, while the lowest was $4,395 million in Q3'22.\n",
152 | " \n",
153 | "\n",
154 | " 7\n",
155 | "\n",
156 | "Diluted Earnings Per Share Meta logo \n",
157 | "\n",
158 | " \n",
159 | "| Quarter | Earnings Per Share |\n",
160 | "|---|---|\n",
161 | "| Q2'22 | $2.46 |\n",
162 | "| Q3'22 | $1.64 |\n",
163 | "| Q4'22 | $1.76 |\n",
164 | "| Q1'23 | $2.20 |\n",
165 | "| Q2'23 | $2.98 |\n",
166 | "| Q3'23 | $4.39 |\n",
167 | "| Q4'23 | $5.33 |\n",
168 | "| Q1'24 | $4.71 |\n",
169 | "| Q2'24 | $5.16 |\n",
170 | "\n",
171 | "This bar chart shows the Diluted Earnings Per Share for Meta from Q2'22 to Q2'24. The y-axis ranges from $1.64 to $5.33, with increments of $0.02. The chart demonstrates an overall increasing trend in earnings per share over the period, with the highest point in Q4'23 at $5.33 and the lowest in Q3'22 at $1.64.\n",
172 | " \n",
173 | "\n",
174 | " 8\n",
175 | "\n",
176 | "Limitations of Key Metrics and Other Data Meta logo \n",
177 | "\n",
178 | "To calculate our estimates of DAP, we currently use a series of machine learning models that are developed based on internal reviews of limited samples of user accounts and calibrated against user survey data. We apply significant judgment in designing these models and calculating these estimates. For example, to match user accounts within individual products and across multiple products, we use data signals such as similar device information, IP addresses, and user names. We also calibrate our models against data from periodic user surveys of varying sizes and frequency across our products, which survey questions are based on monthly usage, and which are inherently subject to error. The timing and results of such user surveys have in the past contributed, and may in the future contribute, to changes in our reported Family metrics from period to period. In addition, our data limitations may affect our understanding of certain details of our business and increase the risk of error for our Family metrics estimates. Our techniques and models rely on a variety of data signals from different products, and we rely on more limited data signals for some products compared to others. For example, as a result of limited visibility into encrypted products, we have fewer data signals from WhatsApp user accounts and primarily rely on phone numbers and device information to match WhatsApp user accounts with accounts on our other products. Any loss of access to data signals we use in our process for calculating Family metrics, whether as a result of our own product decisions, actions by third-party browser or mobile platforms, regulatory or legislative requirements, or other factors, also may impact the stability or accuracy of our reported Family metrics, as well as our ability to report these metrics at all. Our estimates of Family metrics also may change as our methodologies evolve, including through the application of new data signals or technologies, product changes, or other improvements in our user surveys, algorithms, or machine learning that may improve our ability to match accounts within and across our products or otherwise evaluate the broad population of our users. In addition, such evolution may allow us to identify previously undetected violating accounts (as defined below).\n",
179 | "\n",
180 | "We regularly evaluate our Family metrics to estimate the percentage of our DAP consisting solely of \"violating\" accounts. We define \"violating\" accounts as accounts which we believe are intended to be used for purposes that violate our terms of service, including bots and spam. In the first quarter of 2024, we estimated that less than 3% of our worldwide DAP consisted solely of violating accounts. Such estimation is based on an internal review of a limited sample of accounts, and we apply significant judgment in making this determination. For example, we look for account information and behaviors associated with Facebook and Instagram accounts that appear to be inauthentic to the reviewers, but we have limited visibility into WhatsApp user activity due to encryption. In addition, if we believe an individual person has one or more violating accounts, we do not include such person in our violating accounts estimation as long as we believe they have one account that does not constitute a violating account. From time to time, we disable certain user accounts, make product changes, or take other actions to reduce the number of violating accounts among our users, which may also reduce our DAP estimates in a particular period. We intend to disclose our estimates of the percentage of our DAP consisting solely of violating accounts on an annual basis. Violating accounts are very difficult to measure at our scale, and it is possible that the actual number of violating accounts may vary significantly from our estimates.\n",
181 | "\n",
182 | "## User Geography\n",
183 | "\n",
184 | "Our estimates for revenue by user location, as well as year-over-year percentage changes in ad impressions delivered and the average price per ad by user location, are also affected by data limitations and other challenges in measuring user geography. Our data regarding the geographic location of our users is estimated based on a number of factors, such as the user's IP address and self-disclosed location. These factors may not always accurately reflect the user's actual location. For example, a user may appear to be accessing our products from the location of the proxy server that the user connects to rather than from the user's actual location. The methodologies used to measure our metrics are also susceptible to algorithm or other technical errors.\n",
185 | "\n",
186 | " 17"
187 | ],
188 | "text/plain": [
189 | ""
190 | ]
191 | },
192 | "metadata": {},
193 | "output_type": "display_data"
194 | }
195 | ],
196 | "source": [
197 | "# Join the list elements with newlines to create a single string\n",
198 | "markdown_text = '\\n\\n'.join(markdown_output)\n",
199 | "display(Markdown(markdown_text))"
200 | ]
201 | }
202 | ],
203 | "metadata": {
204 | "kernelspec": {
205 | "display_name": "any",
206 | "language": "python",
207 | "name": "python3"
208 | },
209 | "language_info": {
210 | "codemirror_mode": {
211 | "name": "ipython",
212 | "version": 3
213 | },
214 | "file_extension": ".py",
215 | "mimetype": "text/x-python",
216 | "name": "python",
217 | "nbconvert_exporter": "python",
218 | "pygments_lexer": "ipython3",
219 | "version": "3.10.15"
220 | }
221 | },
222 | "nbformat": 4,
223 | "nbformat_minor": 2
224 | }
225 |
--------------------------------------------------------------------------------
/examples/async_parse_with_layout.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
10 | "# !pip3 install --upgrade ipython\n",
11 | "# !pip3 install --upgrade any-parser"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from IPython.display import display, Markdown\n",
21 | "from any_parser import AnyParser"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "ap = AnyParser(api_key=\"...\")"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "file_path = \"./sample_data/test_1figure_1table.png\"\n",
40 | "file_id = ap.async_parse_with_layout(file_path)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 4,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "name": "stdout",
50 | "output_type": "stream",
51 | "text": [
52 | "Waiting for response...\n",
53 | "Waiting for response...\n",
54 | "Waiting for response...\n",
55 | "Waiting for response...\n",
56 | "Waiting for response...\n",
57 | "Waiting for response...\n",
58 | "Waiting for response...\n",
59 | "Waiting for response...\n"
60 | ]
61 | }
62 | ],
63 | "source": [
64 | "markdown_output = ap.async_fetch(file_id=file_id)"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 5,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "data": {
74 | "text/markdown": [
75 | "\n",
76 | "\n",
77 | "\n",
78 | " | latency | (ms) |
\n",
79 | "participants | mean | 99th percentile |
\n",
80 | "1 | 17.0 +1.4 | 75.0 34.9 |
\n",
81 | "2 | 24.5 +2.5 | 87.6 +35.9 |
\n",
82 | "5 | 31.5 +6.2 | 104.5 52.2 |
\n",
83 | "10 | 30.0 +3.7 | 95.6 +25.4 |
\n",
84 | "25 | 35.5 +5.6 | 100.4 42.7 |
\n",
85 | "50 | 42.7 4.1 | 93.7 22.9 |
\n",
86 | "100 | 71.4 7.6 | 131.2 +17.6 |
\n",
87 | "200 | 150.5 +11.0 | 320.3 35.1 |
\n",
88 | "\n",
89 | "
\n",
90 | "\n",
91 | "\n",
92 | "\n",
93 | "Table 4: Two-phase commit scalability. Mean and standard deviations over 10 runs.\n",
94 | "\n",
95 | "CPUs. Snapshot reads can execute at any up-to-date replicas, so their throughput increases almost linearly with the number of replicas. Single-read read-only transactions only execute at leaders because timestamp assignment must happen at leaders. Read-only-transaction throughput increases with the number of replicas because the number of effective spanservers increases: in the experimental setup, the number of spanservers equaled the number of replicas, and leaders were randomly distributed among the zones. Write throughput benefits from the same experimental artifact (which explains the increase in throughput from 3 to 5 replicas), but that benefit is outweighed by the linear increase in the amount of work performed per write, as the number of replicas increases.\n",
96 | "\n",
97 | "Table 4 demonstrates that two-phase commit can scale to a reasonable number of participants: it summarizes a set of experiments run across 3 zones, each with 25 spanservers. Scaling up to 50 participants is reasonable in both mean and 99th-percentile, and latencies start to rise noticeably at 100 participants.\n",
98 | "\n",
99 | "5.2 Availability\n",
100 | "\n",
101 | "Figure 5 illustrates the availability benefits of running Spanner in multiple datacenters. It shows the results of three experiments on throughput in the presence of datacenter failure, all of which are overlaid onto the same time scale. The test universe consisted of 5 zones Zi, each of which had 25 spanservers. The test database was sharded into 1250 Paxos groups, and 100 test clients constantly issued non-snapshot reads at an aggregate rate of 50K reads/second. All of the leaders were explicitly placed in Z1. Five seconds into each test, all of the servers in one zone were killed: non-leader kills Z2; leader-hard kills Z1; leader-soft kills Z1, but it gives notifications to all of the servers that they should handoff leadership first.\n",
102 | "\n",
103 | "Killing Z2 has no effect on read throughput. Killing Z1 while giving the leaders time to handoff leadership to a different zone has a minor effect: the throughput drop is not visible in the graph, but is around 3-4%. On the other hand, killing Z1 with no warning has a severe effect: the rate of completion drops almost to 0. As leaders get re-elected, though, the throughput of the system rises to approximately 100K reads/second because of two artifacts of our experiment: there is extra capacity in the system, and operations are queued while the leader is unavailable. As a result, the throughput of the system rises before leveling off again at its steady-state rate.\n",
104 | "\n",
105 | "We can also see the effect of the fact that Paxos leader leases are set to 10 seconds. When we kill the zone, the leader-lease expiration times for the groups should be evenly distributed over the next 10 seconds. Soon after each lease from a dead leader expires, a new leader is elected. Approximately 10 seconds after the kill time, all of the groups have leaders and throughput has recovered. Shorter lease times would reduce the effect of server deaths on availability, but would require greater amounts of lease-renewal network traffic. We are in the process of designing and implementing a mechanism that will cause slaves to release Paxos leader leases upon leader failure.\n",
106 | "\n",
107 | "5.3 TrueTime\n",
108 | "\n",
109 | "Two questions must be answered with respect to TrueTime: is ε truly a bound on clock uncertainty, and how bad does ε get? For the former, the most serious problem would be if a local clock’s drift were greater than 200usec/sec: that would break assumptions made by TrueTime. Our machine statistics show that bad CPUs are 6 times more likely than bad clocks. That is, clock issues are extremely infrequent, relative to much more serious hardware problems. As a result, we believe that TrueTime’s implementation is as trustworthy as any other piece of software upon which Spanner depends.\n",
110 | "\n",
111 | "(timeout=1h)](https://anyparser-realtime-test-j-assetsconstructfilebucke-2wg0ln280yvz.s3.amazonaws.com/result_parse_with_layout/async_S4iyw7RAEE8CTGkVgHYeI8nsTmSALI1U2HXvAN6j/2024/11/14/test_1figure_1table_f00964f1-abcc-4e62-b2af-46249d9c70d4.png/%3C%40mask_p0_e1_figure_s3%3E.png?AWSAccessKeyId=ASIAXM24X76XLDJJDDZX&Signature=Ef8urOX4Oj%2Bdxx%2F1IOh0OqgJ0%2B4%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEFoaCXVzLXdlc3QtMiJGMEQCIBJHF0qjs7xZL9IBZf0a7YooU6WJP1EeclCbGaKCaLFPAiB%2BFjaYEyzmBWPFVh%2FRSUVhrEEdc%2FlQdUaLSTP%2FgclPaSrcAwjj%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAAaDDUwODYxMTI2NDQzMCIMGyjwrhVEC7fYAvneKrADV3HpyrnA8A6QUdLRnfZZM74MpeETlq%2BvlIjpQ5CPxB%2BTWpNRlq4c3eo%2BzKRX87bl9kpFmBaFXJPc9ot%2BN3L3Vcp%2FzvnI0iB4gqlN4jGexU5wVpTclORB1TAK%2FcO6AFfGACTLrUg0GzgcbwYR%2FGIvhxSGj1Ule9MDXL%2FG2YGMnqFDndKirbwufY4dlBYehDzqNii3kB3v5nGFsYKmAdVEocKdeIK6cv%2Fybj3w58l9vDyRMgr0%2FLWposZ160WIEvPMWMseKe6Q87%2BbEL8hcyl5i0aFxeGf4xv1Foiz74tcJcPL7RuwpQYCb3BztfD11Vo8334cla8p5LlEfkj1OEDHVXW15FJpw29pZN1q0IBIQNeBHtajkpu7BPzURXYZIUnvnWnpCPHTThM8z2Az1mhtou69uKWjO6iVeOe%2BrbqGMXbKEJxuKraEh%2BXVukZWmzlxwaiyJ2gomNXTQmO0gaLpiU934WqlJu9mGl0mw686KPwwdVOudV4RUgXAZhpT7j%2FzydhxVNK0sHX%2F02lTm1v6%2BRpsUN1Xvd%2FXMuj1%2FM8q5B86wkwUj1YjgFoQ9qcljZu8MPik1bkGOp8BvunCWNInmGehKh0yaRGfQn0y%2FgecCbOQoOqRUuLahI8ZBrixkIBUOkyinWTmsdLG6ItJXkiKFBOAHU0tq97U0Fbb0mq0v6L%2Bfr1INT52vqWsaXTwxiLSJeGJTEve1SCCRttFsIpkZF5MEmB3V0irDz3lVQbyV1Z2lWSe%2Br13a5DSeH4REoiwqEKtKN%2FCV4WPDhK5G%2FUm%2B8LmNrgUGm77&Expires=1731551406)\n",
112 | "\n",
113 | "Figure 5: Effect of killing servers on throughput.\n"
114 | ],
115 | "text/plain": [
116 | ""
117 | ]
118 | },
119 | "metadata": {},
120 | "output_type": "display_data"
121 | }
122 | ],
123 | "source": [
124 | "display(Markdown(markdown_output))"
125 | ]
126 | }
127 | ],
128 | "metadata": {
129 | "kernelspec": {
130 | "display_name": "any",
131 | "language": "python",
132 | "name": "python3"
133 | },
134 | "language_info": {
135 | "codemirror_mode": {
136 | "name": "ipython",
137 | "version": 3
138 | },
139 | "file_extension": ".py",
140 | "mimetype": "text/x-python",
141 | "name": "python",
142 | "nbconvert_exporter": "python",
143 | "pygments_lexer": "ipython3",
144 | "version": "-1.-1.-1"
145 | }
146 | },
147 | "nbformat": 4,
148 | "nbformat_minor": 2
149 | }
150 |
--------------------------------------------------------------------------------
/examples/async_parse_with_ocr.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
10 | "# !pip3 install --upgrade ipython\n",
11 | "# !pip3 install --upgrade any-parser"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from IPython.display import display, Markdown\n",
21 | "from any_parser import AnyParser"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "ap = AnyParser(api_key=\"...\")"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "file_path = \"./sample_data/test_1figure_1table.png\"\n",
40 | "file_id = ap.async_parse_with_ocr(file_path)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 4,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "name": "stdout",
50 | "output_type": "stream",
51 | "text": [
52 | "Waiting for response...\n",
53 | "Waiting for response...\n",
54 | "Waiting for response...\n",
55 | "Waiting for response...\n",
56 | "Waiting for response...\n"
57 | ]
58 | }
59 | ],
60 | "source": [
61 | "markdown_output = ap.async_fetch(file_id=file_id)"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 6,
67 | "metadata": {},
68 | "outputs": [
69 | {
70 | "data": {
71 | "text/markdown": [
72 | "## Table 4: Two-phase commit scalability. Mean and standard deviations over 10 runs.\n",
73 | "\n",
74 | "| participants | mean | 99th percentile |\n",
75 | "|--------------|-------------|-----------------|\n",
76 | "| 1 | 17.0 ±1.4 | 75.0 ±34.9 |\n",
77 | "| 2 | 24.5 ±2.5 | 87.6 ±35.9 |\n",
78 | "| 5 | 31.5 ±6.2 | 104.5 ±52.2 |\n",
79 | "| 10 | 30.0 ±3.7 | 95.6 ±25.4 |\n",
80 | "| 25 | 35.5 ±5.6 | 100.4 ±42.7 |\n",
81 | "| 50 | 42.7 ±4.1 | 93.7 ±22.9 |\n",
82 | "| 100 | 71.4 ±7.6 | 131.2 ±17.6 |\n",
83 | "| 200 | 150.5 ±11.0 | 320.3 ±35.1 |\n",
84 | "\n",
85 | "## 5.2 Availability\n",
86 | "\n",
87 | "Figure 5 illustrates the availability benefits of running Spanner in multiple datacenters. It shows the results of three experiments on throughput in the presence of datacenter failure, all of which are overlaid onto the same time scale. The test universe consisted of 5 zones Z1-Z5, each of which had 25 spanservers. The test database was sharded into 1250 Paxos groups, and 100 test clients constantly issued non-snapshot reads at an aggregate rate of 50K reads/second. All of the leaders were explicitly placed in Z1. Five seconds into each test, all of the servers in one zone were killed: non-leader kills Z2; leader-hard kills Z1; leader-soft kills Z1, but it gives notifications to all of the servers that they should handoff leadership first.\n",
88 | "\n",
89 | "Killing Z2 has no effect on read throughput. Killing Z1 while giving the leaders time to handoff leadership to a different zone has a minor effect: the throughput drop is not visible in the graph, but is around 3-4%. On the other hand, killing Z1 with no warning has a severe effect: the rate of completion drops almost to 0. As leaders get re-elected, though, the throughput of the system rises to approximately 100K reads/second because of two artifacts of our experiment: there is extra capacity in the system, and operations are queued while the leader is unavailable. As a result, the throughput of the system rises before leveling off again at its steady-state rate.\n",
90 | "\n",
91 | "We can also see the effect of the fact that Paxos leader leases are set to 10 seconds. When we kill the zone, the leader-lease expiration times for the groups should be evenly distributed over the next 10 seconds. Soon after each lease from a dead leader expires, a new leader is elected. Approximately 10 seconds after the kill time, all of the groups have leaders and throughput has recovered. Shorter lease times would reduce the effect of server deaths on availability, but would require greater amounts of lease-renewal network traffic. We are in the process of designing and implementing a mechanism that will cause slaves to release Paxos leader leases upon leader failure.\n",
92 | "\n",
93 | "## 5.3 TrueTime\n",
94 | "\n",
95 | "Two questions must be answered with respect to TrueTime: is ε truly a bound on clock uncertainty, and how bad does ε get? For the former, the most serious problem would be if a local clock's drift were greater than 200us/sec: that would break assumptions made by TrueTime. Our machine statistics show that bad CPUs are 6 times more likely than bad clocks. That is, clock issues are extremely infrequent, relative to much more serious hardware problems. As a result, we believe that TrueTime's implementation is as trustworthy as any other piece of software upon which Spanner depends.\n",
96 | "\n",
97 | "Figure 6 presents TrueTime data taken at several thousand spanserver machines across datacenters up to 2200"
98 | ],
99 | "text/plain": [
100 | ""
101 | ]
102 | },
103 | "metadata": {},
104 | "output_type": "display_data"
105 | }
106 | ],
107 | "source": [
108 | "display(Markdown(markdown_output))"
109 | ]
110 | }
111 | ],
112 | "metadata": {
113 | "kernelspec": {
114 | "display_name": "any",
115 | "language": "python",
116 | "name": "python3"
117 | },
118 | "language_info": {
119 | "codemirror_mode": {
120 | "name": "ipython",
121 | "version": 3
122 | },
123 | "file_extension": ".py",
124 | "mimetype": "text/x-python",
125 | "name": "python",
126 | "nbconvert_exporter": "python",
127 | "pygments_lexer": "ipython3",
128 | "version": "-1.-1.-1"
129 | }
130 | },
131 | "nbformat": 4,
132 | "nbformat_minor": 2
133 | }
134 |
--------------------------------------------------------------------------------
/examples/extract_pii.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
10 | "# !pip3 install --upgrade ipython\n",
11 | "# !pip3 install --upgrade any-parser"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from any_parser import AnyParser"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "ap = AnyParser(api_key=\"...\")"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "file_path = \"./sample_data/resume_1.pdf\"\n",
39 | "pii_info, time = ap.extract_pii(file_path)"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 5,
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "data": {
49 | "text/plain": [
50 | "'Time Elapsed: 8.02 seconds'"
51 | ]
52 | },
53 | "execution_count": 5,
54 | "metadata": {},
55 | "output_type": "execute_result"
56 | }
57 | ],
58 | "source": [
59 | "time"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 6,
65 | "metadata": {},
66 | "outputs": [
67 | {
68 | "data": {
69 | "text/plain": [
70 | "{'name': 'Gary Jiang',\n",
71 | " 'phone_number': '+1-213-725-7637',\n",
72 | " 'address': None,\n",
73 | " 'email_address': 'jiangzhehuan0105@gmail.com',\n",
74 | " 'linkedin_url': 'https://linkedin.com/in/gary-jiang',\n",
75 | " 'github_url': None,\n",
76 | " 'summary': 'Full-stack Software Engineer'}"
77 | ]
78 | },
79 | "execution_count": 6,
80 | "metadata": {},
81 | "output_type": "execute_result"
82 | }
83 | ],
84 | "source": [
85 | "pii_info"
86 | ]
87 | }
88 | ],
89 | "metadata": {
90 | "kernelspec": {
91 | "display_name": "any",
92 | "language": "python",
93 | "name": "python3"
94 | },
95 | "language_info": {
96 | "codemirror_mode": {
97 | "name": "ipython",
98 | "version": 3
99 | },
100 | "file_extension": ".py",
101 | "mimetype": "text/x-python",
102 | "name": "python",
103 | "nbconvert_exporter": "python",
104 | "pygments_lexer": "ipython3",
105 | "version": "-1.-1.-1"
106 | }
107 | },
108 | "nbformat": 4,
109 | "nbformat_minor": 2
110 | }
111 |
--------------------------------------------------------------------------------
/examples/extract_resume_key_value.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
10 | "# !pip3 install --upgrade ipython\n",
11 | "# !pip3 install --upgrade any-parser"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from IPython.display import display\n",
21 | "from any_parser import AnyParser"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "ap = AnyParser(api_key=\"...\")"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 5,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "file_path = \"./sample_data/resume_1.pdf\"\n",
40 | "json_result = ap.extract_resume_key_value(file_path)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 6,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "data": {
50 | "text/plain": [
51 | "({'pii': {'full_name': 'GARY JIANG',\n",
52 | " 'email': 'jiangzhehuan0105@gmail.com',\n",
53 | " 'phone': '+1 (213) 725-7637'},\n",
54 | " 'education': [{'organization': 'Shenyang University of Technology',\n",
55 | " 'degree': \"Bachelor's Degree\",\n",
56 | " 'major': 'Computer Science',\n",
57 | " 'start_date': '2008-01-01',\n",
58 | " 'end_date': '2012-12-31',\n",
59 | " 'courses': None,\n",
60 | " 'achievements': None}],\n",
61 | " 'work_experience': [{'job_title': 'Full Stack Developer',\n",
62 | " 'company_name': 'VIMMERSE',\n",
63 | " 'location': None,\n",
64 | " 'start_date': '2023-06-01',\n",
65 | " 'end_date': 'present',\n",
66 | " 'job_type': None,\n",
67 | " 'summary': 'Developed an AI-powered editor web application that brings photos to life by animating images in 3D.',\n",
68 | " 'bullet_points': ['Developed robust back-end services utilizing Python (Flask/FastAPI) and Node.js (AWS Lambda) for efficient and scalable web applications',\n",
69 | " 'Built user-friendly and interactive websites using Next.js, ensuring a seamless user experience',\n",
70 | " 'Deployed and managed AWS infrastructure, including EC2 instances, S3 storage buckets, DynamoDB for NoSQL data management, and Cognito user pools for secure user authentication',\n",
71 | " 'Experienced Agile and Scrum methodologies within a fast-paced startup environment to ensure efficient project delivery and continuous improvement',\n",
72 | " 'Collaborated effectively with cross-functional teams (design, product management) to deliver projects on time, fostering a positive and collaborative work environment']},\n",
73 | " {'job_title': 'Full Stack Developer',\n",
74 | " 'company_name': 'VIKING SASQUATCH',\n",
75 | " 'location': None,\n",
76 | " 'start_date': '2023-01-01',\n",
77 | " 'end_date': '2023-06-30',\n",
78 | " 'job_type': None,\n",
79 | " 'summary': 'Developed APIs and Integrations for all of the parties that work on Real Estate Transactions.',\n",
80 | " 'bullet_points': ['Connecting Mortgage, Title, and Real Estate to solve pain points and improve automation and create efficiencies',\n",
81 | " 'Implemented a user-friendly front-end interface using Nuxt.js, ensuring a seamless user experience',\n",
82 | " 'Built backend APIs utilizing Node.js serverless functions for optimal performance',\n",
83 | " 'Managed data storage and security by implementing a MySQL database',\n",
84 | " 'Collaborated effectively within a team using Agile methodologies like sprint planning, daily standups, retrospectives to ensure project delivery and continuous improvement']},\n",
85 | " {'job_title': 'Full Stack Developer',\n",
86 | " 'company_name': 'ROX PAY SRL',\n",
87 | " 'location': None,\n",
88 | " 'start_date': '2021-12-01',\n",
89 | " 'end_date': '2022-12-31',\n",
90 | " 'job_type': None,\n",
91 | " 'summary': 'Built Fintech Software House that aims to optimize B2B payments by offering a systemic solution that gives value-added services in collection of payments, financial information and corporate liquidity by essentially creating a Commission Free, Open Loop, Payment Gateway system.',\n",
92 | " 'bullet_points': ['Developed front-end by using React.js and Redux, Javascript/Typescript',\n",
93 | " 'Contributed developing backend utilizing Django/Python']},\n",
94 | " {'job_title': 'Freelancer',\n",
95 | " 'company_name': 'FREELANCE',\n",
96 | " 'location': None,\n",
97 | " 'start_date': '2017-09-01',\n",
98 | " 'end_date': '2021-10-31',\n",
99 | " 'job_type': None,\n",
100 | " 'summary': 'Developed and managed many web and mobile applications while working as freelancer at Internet Dzyns LLC company.',\n",
101 | " 'bullet_points': ['Developed multiple web applications, participating in the whole process of their development: product design and estimation, code design and development, DevOps, UI/UX design, product launch and maintenance',\n",
102 | " 'Developed cross-platform mobile application using Flutter and Ionic/Angular',\n",
103 | " 'Developed NFT marketplace websites and wrote smart contracts']},\n",
104 | " {'job_title': 'Server Administrator, Java Developer',\n",
105 | " 'company_name': 'NEUSOFT',\n",
106 | " 'location': None,\n",
107 | " 'start_date': '2014-06-01',\n",
108 | " 'end_date': '2017-08-31',\n",
109 | " 'job_type': None,\n",
110 | " 'summary': 'Worked as intern and software developer after graduated university.',\n",
111 | " 'bullet_points': ['Correct analytical and reasoning skills to troubleshoot and repair server issues',\n",
112 | " 'Operating Systems & Security Software',\n",
113 | " 'Java / Spring Boot / Hibernate']}],\n",
114 | " 'personal_info': {'name': 'GARY JIANG',\n",
115 | " 'phone_number': '+1-213-725-7637',\n",
116 | " 'address': None,\n",
117 | " 'email_address': 'jiangzhehuan0105@gmail.com',\n",
118 | " 'linkedin_url': 'linkedin.com/in/gary-jiang',\n",
119 | " 'github_url': None,\n",
120 | " 'summary': None},\n",
121 | " 'skills': {'Programming Languages': ['Python',\n",
122 | " 'PHP',\n",
123 | " 'Javascript',\n",
124 | " 'Typescript',\n",
125 | " 'HTML',\n",
126 | " 'CSS'],\n",
127 | " 'Tools': ['Flask',\n",
128 | " 'Django',\n",
129 | " 'FastAPI',\n",
130 | " 'Laravel',\n",
131 | " 'Node.js',\n",
132 | " 'SQL databases',\n",
133 | " 'Next.js',\n",
134 | " 'React',\n",
135 | " 'Redux',\n",
136 | " 'Nuxt.js',\n",
137 | " 'Vue',\n",
138 | " 'AWS Lambda',\n",
139 | " 'Cognito',\n",
140 | " 'EC2',\n",
141 | " 'S3',\n",
142 | " 'DynamoDB',\n",
143 | " 'API gateway',\n",
144 | " 'Git',\n",
145 | " 'Version Control',\n",
146 | " 'DevOps',\n",
147 | " 'CI/CD'],\n",
148 | " 'Other': ['Startup Experience',\n",
149 | " 'Adaptable',\n",
150 | " 'Resourceful',\n",
151 | " 'Prioritization',\n",
152 | " 'Hybrid Mobile App Development',\n",
153 | " 'Flutter',\n",
154 | " 'Ionic',\n",
155 | " 'Angular',\n",
156 | " 'AGILE',\n",
157 | " 'SCRUM']},\n",
158 | " 'certifications': [],\n",
159 | " 'projects': []},\n",
160 | " 'Time Elapsed: 27.27 seconds')"
161 | ]
162 | },
163 | "metadata": {},
164 | "output_type": "display_data"
165 | }
166 | ],
167 | "source": [
168 | "display(json_result)"
169 | ]
170 | }
171 | ],
172 | "metadata": {
173 | "kernelspec": {
174 | "display_name": "any",
175 | "language": "python",
176 | "name": "python3"
177 | },
178 | "language_info": {
179 | "codemirror_mode": {
180 | "name": "ipython",
181 | "version": 3
182 | },
183 | "file_extension": ".py",
184 | "mimetype": "text/x-python",
185 | "name": "python",
186 | "nbconvert_exporter": "python",
187 | "pygments_lexer": "ipython3",
188 | "version": "-1.-1.-1"
189 | }
190 | },
191 | "nbformat": 4,
192 | "nbformat_minor": 2
193 | }
194 |
--------------------------------------------------------------------------------
/examples/extract_tables.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
10 | "# !pip3 install --upgrade ipython\n",
11 | "# !pip3 install --upgrade any-parser\n",
12 | "# !pip3 install pandas lxml html5lib bs4\n"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "from IPython.display import display, Markdown\n",
22 | "from any_parser import AnyParser"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 3,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "ap = AnyParser(api_key=\"...\")"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 7,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "csv_output, time_info = ap.extract_tables(\n",
41 | " file_path=\"./sample_data/test_1figure_1table.png\", return_type=\"csv\"\n",
42 | ")\n",
43 | "\n",
44 | "html_output, time_info = ap.extract_tables(\n",
45 | " file_path=\"./sample_data/test_1figure_1table.png\", return_type=\"html\"\n",
46 | ")"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 8,
52 | "metadata": {},
53 | "outputs": [
54 | {
55 | "name": "stdout",
56 | "output_type": "stream",
57 | "text": [
58 | "CPU times: user 3 μs, sys: 1 μs, total: 4 μs\n",
59 | "Wall time: 5.96 μs\n"
60 | ]
61 | }
62 | ],
63 | "source": [
64 | "time"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 10,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "data": {
74 | "text/markdown": [
75 | "0,1,2\n",
76 | ",latency,(ms)\n",
77 | "participants,mean,99th percentile\n",
78 | "1,17.0 +1.4,75.0 34.9\n",
79 | "2,24.5 +2.5,87.6 35.9\n",
80 | "5,31.5 +6.2,104.5 52.2\n",
81 | "10,30.0 +3.7,95.6 25.4\n",
82 | "25,35.5 +5.6,100.4 42.7\n",
83 | "50,42.7 +4.1,93.7 22.9\n",
84 | "100,71.4 +7.6,131.2 +17.6\n",
85 | "200,150.5 +11.0,320.3 35.1\n"
86 | ],
87 | "text/plain": [
88 | ""
89 | ]
90 | },
91 | "metadata": {},
92 | "output_type": "display_data"
93 | },
94 | {
95 | "data": {
96 | "text/markdown": [
97 | "\n",
98 | "\n",
99 | " | latency | (ms) |
\n",
100 | "participants | mean | 99th percentile |
\n",
101 | "1 | 17.0 +1.4 | 75.0 34.9 |
\n",
102 | "2 | 24.5 +2.5 | 87.6 35.9 |
\n",
103 | "5 | 31.5 +6.2 | 104.5 52.2 |
\n",
104 | "10 | 30.0 +3.7 | 95.6 25.4 |
\n",
105 | "25 | 35.5 +5.6 | 100.4 42.7 |
\n",
106 | "50 | 42.7 +4.1 | 93.7 22.9 |
\n",
107 | "100 | 71.4 +7.6 | 131.2 +17.6 |
\n",
108 | "200 | 150.5 +11.0 | 320.3 35.1 |
\n",
109 | "\n",
110 | "
\n",
111 | "\n"
112 | ],
113 | "text/plain": [
114 | ""
115 | ]
116 | },
117 | "metadata": {},
118 | "output_type": "display_data"
119 | }
120 | ],
121 | "source": [
122 | "if isinstance(csv_output, list):\n",
123 | " csv_output_str = \"\\n\".join(csv_output)\n",
124 | "else:\n",
125 | " csv_output_str = csv_output\n",
126 | "\n",
127 | "display(Markdown(csv_output_str))\n",
128 | "display(Markdown(html_output))"
129 | ]
130 | }
131 | ],
132 | "metadata": {
133 | "kernelspec": {
134 | "display_name": "any",
135 | "language": "python",
136 | "name": "python3"
137 | },
138 | "language_info": {
139 | "codemirror_mode": {
140 | "name": "ipython",
141 | "version": 3
142 | },
143 | "file_extension": ".py",
144 | "mimetype": "text/x-python",
145 | "name": "python",
146 | "nbconvert_exporter": "python",
147 | "pygments_lexer": "ipython3",
148 | "version": "3.10.16"
149 | }
150 | },
151 | "nbformat": 4,
152 | "nbformat_minor": 2
153 | }
154 |
--------------------------------------------------------------------------------
/examples/parse_batch_api.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Anyparser Batch API Example"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
17 | "# !pip3 install --upgrade ipython\n",
18 | "# !pip3 install --upgrade any-parser"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "### Step1: Batch API Folder Processing Upload"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 1,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "import json\n",
35 | "import os\n",
36 | "from datetime import datetime\n",
37 | "\n",
38 | "from dotenv import load_dotenv\n",
39 | "\n",
40 | "from any_parser import AnyParser"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 2,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "# Load environment variables\n",
50 | "load_dotenv(override=True)\n",
51 | "\n",
52 | "# Get API key and create parser\n",
53 | "api_key = os.environ.get(\"CAMBIO_API_KEY\")\n",
54 | "if not api_key:\n",
55 | " raise ValueError(\"CAMBIO_API_KEY is not set\")\n",
56 | "ap = AnyParser(api_key)"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "Create Batch Request"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 3,
69 | "metadata": {},
70 | "outputs": [
71 | {
72 | "name": "stdout",
73 | "output_type": "stream",
74 | "text": [
75 | "Upload responses saved to: ./sample_data_20250103003352.jsonl\n"
76 | ]
77 | }
78 | ],
79 | "source": [
80 | "# Upload folder for batch processing\n",
81 | "WORKING_FOLDER = \"./sample_data\"\n",
82 | "responses = ap.batches.create(WORKING_FOLDER)\n",
83 | "\n",
84 | "# Save responses to JSONL file with timestamp\n",
85 | "timestamp = datetime.now().strftime(\"%Y%m%d%H%M%S\")\n",
86 | "output_file = f\"./sample_data_{timestamp}.jsonl\"\n",
87 | "\n",
88 | "with open(output_file, \"w\") as f:\n",
89 | " for response in responses:\n",
90 | " f.write(json.dumps(response.model_dump()) + \"\\n\")\n",
91 | "\n",
92 | "print(f\"Upload responses saved to: {output_file}\")"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "Check the first element status in the jsonl using the requestId"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 4,
105 | "metadata": {},
106 | "outputs": [
107 | {
108 | "name": "stdout",
109 | "output_type": "stream",
110 | "text": [
111 | "Checking status for file: Earnings-Presentation-Q2-2024.pdf\n",
112 | "Content not yet available\n"
113 | ]
114 | }
115 | ],
116 | "source": [
117 | "# Get first response from the JSONL file\n",
118 | "with open(output_file, \"r\") as f:\n",
119 | " first_response = json.loads(f.readline())\n",
120 | "\n",
121 | "request_id = first_response[\"requestId\"]\n",
122 | "print(f\"Checking status for file: {first_response['fileName']}\")\n",
123 | "\n",
124 | "# Retrieve status using request ID\n",
125 | "markdown = ap.batches.retrieve(request_id)\n",
126 | "if markdown and markdown.result:\n",
127 | " print(\"Content retrieved successfully\")\n",
128 | "else:\n",
129 | " print(\"Content not yet available\")"
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "metadata": {},
135 | "source": [
136 | "Note: Batch extraction is currently in beta testing. Processing time may take up to 2 hours to complete."
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {},
142 | "source": [
143 | "After 2 hours, you can check the content of the first file in the folder again"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 6,
149 | "metadata": {},
150 | "outputs": [
151 | {
152 | "name": "stdout",
153 | "output_type": "stream",
154 | "text": [
155 | "Content retrieved successfully\n"
156 | ]
157 | }
158 | ],
159 | "source": [
160 | "# Retrieve status using request ID\n",
161 | "markdown = ap.batches.retrieve(request_id)\n",
162 | "if markdown and markdown.result:\n",
163 | " print(\"Content retrieved successfully\")\n",
164 | "else:\n",
165 | " print(\"Content not yet available\")"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "metadata": {},
171 | "source": [
172 | "### Step2: Batch API folder fetch response\n"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 16,
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "import json\n",
182 | "import logging\n",
183 | "import os\n",
184 | "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
185 | "\n",
186 | "from dotenv import load_dotenv\n",
187 | "\n",
188 | "from any_parser import AnyParser\n",
189 | "\n",
190 | "# Configure logging\n",
191 | "logging.basicConfig(level=logging.INFO)\n",
192 | "logger = logging.getLogger(__name__)\n",
193 | "\n",
194 | "# Load environment variables\n",
195 | "load_dotenv(override=True)\n",
196 | "\n",
197 | "MAX_WORKER = 10"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": 17,
203 | "metadata": {},
204 | "outputs": [],
205 | "source": [
206 | "# Get API key and create parser\n",
207 | "api_key = os.environ.get(\"CAMBIO_API_KEY\")\n",
208 | "if not api_key:\n",
209 | " raise ValueError(\"CAMBIO_API_KEY is not set\")\n",
210 | "ap = AnyParser(api_key)"
211 | ]
212 | },
213 | {
214 | "cell_type": "markdown",
215 | "metadata": {},
216 | "source": [
217 | "Read responses from JSONL file"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 18,
223 | "metadata": {},
224 | "outputs": [],
225 | "source": [
226 | "# Change to your real output json from parse_batch_upload.py\n",
227 | "response_file = \"./sample_data_20250102103047.jsonl\"\n",
228 | "with open(response_file, \"r\") as f:\n",
229 | " responses = [json.loads(line) for line in f]"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": 19,
235 | "metadata": {},
236 | "outputs": [],
237 | "source": [
238 | "# Process responses concurrently\n",
239 | "def process_response(response):\n",
240 | " \"\"\"Process a single response by retrieving markdown content\"\"\"\n",
241 | " request_id = response[\"requestId\"]\n",
242 | " try:\n",
243 | " markdown = ap.batches.retrieve(request_id)\n",
244 | " if markdown and markdown.result:\n",
245 | " response[\"result\"] = [markdown.result[0] if markdown.result else \"\"]\n",
246 | " response[\"requestStatus\"] = \"COMPLETED\"\n",
247 | " response[\"completionTime\"] = markdown.completionTime\n",
248 | " except Exception as e:\n",
249 | " logger.error(f\"Error processing {request_id}: {str(e)}\")\n",
250 | " response[\"error\"] = [str(e)]\n",
251 | " return response"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": 20,
257 | "metadata": {},
258 | "outputs": [
259 | {
260 | "name": "stdout",
261 | "output_type": "stream",
262 | "text": [
263 | "Updated all responses in ./sample_data_20250102103047.jsonl with markdown content\n"
264 | ]
265 | }
266 | ],
267 | "source": [
268 | "# Process responses concurrently\n",
269 | "with ThreadPoolExecutor(max_workers=MAX_WORKER) as executor:\n",
270 | " future_to_response = {\n",
271 | " executor.submit(process_response, response): response\n",
272 | " for response in responses\n",
273 | " }\n",
274 | "\n",
275 | " updated_responses = []\n",
276 | " for future in as_completed(future_to_response):\n",
277 | " updated_response = future.result()\n",
278 | " updated_responses.append(updated_response)\n",
279 | "\n",
280 | "# Write all updated responses back to file\n",
281 | "with open(response_file, \"w\") as f:\n",
282 | " for response in updated_responses:\n",
283 | " f.write(json.dumps(response) + \"\\n\")\n",
284 | "\n",
285 | "print(f\"Updated all responses in {response_file} with markdown content\")"
286 | ]
287 | },
288 | {
289 | "cell_type": "markdown",
290 | "metadata": {},
291 | "source": [
292 | "Print out the first row from the updated file"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": 21,
298 | "metadata": {},
299 | "outputs": [
300 | {
301 | "name": "stdout",
302 | "output_type": "stream",
303 | "text": [
304 | "First row from updated file:\n",
305 | "{\n",
306 | " \"fileName\": \"Earnings-Presentation-Q2-2024.pdf\",\n",
307 | " \"requestId\": \"cfb556cb-e5f9-4b6c-a2f7-6ba982858a92\",\n",
308 | " \"requestStatus\": \"COMPLETED\",\n",
309 | " \"result\": [\n",
310 | " \"## Meta Earnings Presentation\\n## Q2 2024\\n\\ninvestor.fb.com Meta logo, consisting of a stylized infinity symbol next to the text \\\"Meta\\\"\"\n",
311 | " ],\n",
312 | " \"completionTime\": \"2025-01-02T04:34:56.494827+00:00\"\n",
313 | "}\n"
314 | ]
315 | }
316 | ],
317 | "source": [
318 | "# Read and print first row from the updated file\n",
319 | "with open(response_file, \"r\") as f:\n",
320 | " first_row = json.loads(f.readline())\n",
321 | " print(\"First row from updated file:\")\n",
322 | " print(json.dumps(first_row, indent=2))"
323 | ]
324 | },
325 | {
326 | "cell_type": "markdown",
327 | "metadata": {},
328 | "source": [
329 | "## End of the notebook\n",
330 | "\n",
331 | "Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n",
332 | "\n",
333 | "\n",
334 | "
\n",
335 | ""
336 | ]
337 | }
338 | ],
339 | "metadata": {
340 | "kernelspec": {
341 | "display_name": "any-parse",
342 | "language": "python",
343 | "name": "python3"
344 | },
345 | "language_info": {
346 | "codemirror_mode": {
347 | "name": "ipython",
348 | "version": 3
349 | },
350 | "file_extension": ".py",
351 | "mimetype": "text/x-python",
352 | "name": "python",
353 | "nbconvert_exporter": "python",
354 | "pygments_lexer": "ipython3",
355 | "version": "3.10.15"
356 | }
357 | },
358 | "nbformat": 4,
359 | "nbformat_minor": 2
360 | }
361 |
--------------------------------------------------------------------------------
/examples/parse_pdf2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 15,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
10 | "# !pip3 install --upgrade ipython\n",
11 | "# !pip3 install --upgrade any-parser"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 16,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from IPython.display import display, Markdown\n",
21 | "from any_parser import AnyParser\n",
22 | "import os\n",
23 | "from dotenv import load_dotenv"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 17,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "load_dotenv(override=True)\n",
33 | "example_apikey = os.getenv(\"CAMBIO_API_KEY\")"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 18,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "ap = AnyParser(example_apikey)\n",
43 | "\n",
44 | "# Define extract_args as a dictionary with your desired parameters\n",
45 | "extract_args = {\n",
46 | " \"vqa_figures_flag\": True,\n",
47 | " \"vqa_charts_flag\": True\n",
48 | "}\n",
49 | "\n",
50 | "# Pass extract_args to the parse method\n",
51 | "markdown_output, time = ap.parse(\n",
52 | " file_path=\"./sample_data/Earnings-Presentation-Q2-2024.pdf\",\n",
53 | " extract_args=extract_args\n",
54 | ")"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 19,
60 | "metadata": {},
61 | "outputs": [
62 | {
63 | "data": {
64 | "text/plain": [
65 | "'Time Elapsed: 23.25 seconds'"
66 | ]
67 | },
68 | "execution_count": 19,
69 | "metadata": {},
70 | "output_type": "execute_result"
71 | }
72 | ],
73 | "source": [
74 | "time"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 20,
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "data": {
84 | "text/markdown": [
85 | "Meta Earnings Presentation Q2 2024 \n",
86 | "\n",
87 | "investor.fb.com\n",
88 | "\n",
89 | " Meta logo, consisting of an infinity symbol followed by the text \"Meta\"\n",
90 | "\n",
91 | "Revenue by User Geography Meta logo \n",
92 | "\n",
93 | "In Millions\n",
94 | "\n",
95 | " \n",
96 | "| Quarter | US & Canada | Europe | Asia-Pacific | Rest of World | Total |\n",
97 | "|---|---|---|---|---|---|\n",
98 | "| Q2'24 | 16,847 | 9,300 | 7,888 | 5,036 | 39,071 |\n",
99 | "| Q1'24 | 15,824 | 8,483 | 7,481 | 4,667 | 36,455 |\n",
100 | "| Q4'23 | 18,585 | 9,441 | 7,512 | 4,573 | 40,111 |\n",
101 | "| Q3'23 | 15,190 | 7,777 | 6,928 | 4,251 | 34,146 |\n",
102 | "| Q2'23 | 14,422 | 7,323 | 6,515 | 3,739 | 31,999 |\n",
103 | "| Q1'23 | 13,048 | 6,345 | 5,960 | 3,292 | 28,645 |\n",
104 | "| Q4'22 | 15,636 | 7,050 | 6,050 | 3,429 | 32,165 |\n",
105 | "| Q3'22 | 13,035 | 5,797 | 5,782 | 3,100 | 27,714 |\n",
106 | "| Q2'22 | 13,249 | 6,452 | 5,797 | 3,213 | 28,822 |\n",
107 | "\n",
108 | "This stacked bar chart shows the revenue by user geography for Meta from Q2'22 to Q2'24. The revenue is divided into four categories: US & Canada, Europe, Asia-Pacific, and Rest of World. The total revenue for each quarter is shown at the top of each bar.\n",
109 | " \n",
110 | "\n",
111 | "Our revenue by user geography is geographically apportioned based on our estimation of the geographic location of our users when they perform a revenue-generating activity. This allocation differs from our revenue disaggregated by geography disclosure in our condensed consolidated financial statements where revenue is geographically apportioned based on the addresses of our customers.\n",
112 | "\n",
113 | " 3\n",
114 | "\n",
115 | "Segment Results Meta logo \n",
116 | "\n",
117 | "In Millions\n",
118 | "\n",
119 | " \n",
120 | "| | Q2'22 | Q3'22 | Q4'22 | Q1'23 | Q2'23 | Q3'23 | Q4'23 | Q1'24 | Q2'24 |\n",
121 | "|---|---|---|---|---|---|---|---|---|---|\n",
122 | "| Advertising | $ 28,152 | $ 27,237 | $ 31,254 | $ 28,101 | $ 31,498 | $ 33,643 | $ 38,706 | $ 35,635 | $ 38,329 |\n",
123 | "| Other | 218 | 192 | 184 | 205 | 225 | 293 | 334 | 380 | 389 |\n",
124 | "| Family of Apps Revenue | 28,370 | 27,429 | 31,438 | 28,306 | 31,723 | 33,936 | 39,040 | 36,015 | 38,718 |\n",
125 | "| Reality Labs Revenue | 452 | 285 | 727 | 339 | 276 | 210 | 1,071 | 440 | 353 |\n",
126 | "| Total Revenue | $ 28,822 | $ 27,714 | $ 32,165 | $ 28,645 | $ 31,999 | $ 34,146 | $ 40,111 | $ 36,455 | $ 39,071 |\n",
127 | "| Family of Apps Operating Income | $ 11,164 | $ 9,336 | $ 10,678 | $ 11,219 | $ 13,131 | $ 17,490 | $ 21,030 | $ 17,664 | $ 19,335 |\n",
128 | "| Reality Labs Operating (Loss) | (2,806) | (3,672) | (4,279) | (3,992) | (3,739) | (3,742) | (4,646) | (3,846) | (4,488) |\n",
129 | "| Total Income from Operations | $ 8,358 | $ 5,664 | $ 6,399 | $ 7,227 | $ 9,392 | $ 13,748 | $ 16,384 | $ 13,818 | $ 14,847 |\n",
130 | "| Operating Margin | 29% | 20% | 20% | 25% | 29% | 40% | 41% | 38% | 38% |\n",
131 | " \n",
132 | "\n",
133 | "We report our financial results based on two reportable segments: Family of Apps (FoA) and Reality Labs (RL). FoA includes Facebook, Instagram, Messenger, WhatsApp, and other services. RL includes our virtual, augmented, and mixed reality related consumer hardware, software, and content.\n",
134 | "\n",
135 | " 4\n",
136 | "\n",
137 | "Net Income Meta logo \n",
138 | "\n",
139 | "In Millions\n",
140 | "\n",
141 | " \n",
142 | "| Quarter | Net Income |\n",
143 | "|---|---|\n",
144 | "| Q2'22 | $6,687 |\n",
145 | "| Q3'22 | $4,395 |\n",
146 | "| Q4'22 | $4,652 |\n",
147 | "| Q1'23 | $5,709 |\n",
148 | "| Q2'23 | $7,788 |\n",
149 | "| Q3'23 | $11,583 |\n",
150 | "| Q4'23 | $14,017 |\n",
151 | "| Q1'24 | $12,369 |\n",
152 | "| Q2'24 | $13,465 |\n",
153 | "\n",
154 | "This bar chart shows the Net Income in millions for Meta from Q2'22 to Q2'24. The y-axis ranges from $0 to $14,017 million, with increments of $1,000 million. The highest net income was $14,017 million in Q4'23, while the lowest was $4,395 million in Q3'22.\n",
155 | " \n",
156 | "\n",
157 | " 7\n",
158 | "\n",
159 | "Diluted Earnings Per Share Meta logo \n",
160 | "\n",
161 | " \n",
162 | "| Quarter | Earnings Per Share |\n",
163 | "|---|---|\n",
164 | "| Q2'22 | $2.46 |\n",
165 | "| Q3'22 | $1.64 |\n",
166 | "| Q4'22 | $1.76 |\n",
167 | "| Q1'23 | $2.20 |\n",
168 | "| Q2'23 | $2.98 |\n",
169 | "| Q3'23 | $4.39 |\n",
170 | "| Q4'23 | $5.33 |\n",
171 | "| Q1'24 | $4.71 |\n",
172 | "| Q2'24 | $5.16 |\n",
173 | "\n",
174 | "This bar chart shows the Diluted Earnings Per Share for Meta from Q2'22 to Q2'24. The y-axis ranges from $1.64 to $5.33, with increments of $0.02. The chart demonstrates an overall increasing trend in earnings per share over the period, with the highest point in Q4'23 at $5.33 and the lowest in Q3'22 at $1.64.\n",
175 | " \n",
176 | "\n",
177 | " 8\n",
178 | "\n",
179 | "Limitations of Key Metrics and Other Data Meta logo \n",
180 | "\n",
181 | "To calculate our estimates of DAP, we currently use a series of machine learning models that are developed based on internal reviews of limited samples of user accounts and calibrated against user survey data. We apply significant judgment in designing these models and calculating these estimates. For example, to match user accounts within individual products and across multiple products, we use data signals such as similar device information, IP addresses, and user names. We also calibrate our models against data from periodic user surveys of varying sizes and frequency across our products, which survey questions are based on monthly usage, and which are inherently subject to error. The timing and results of such user surveys have in the past contributed, and may in the future contribute, to changes in our reported Family metrics from period to period. In addition, our data limitations may affect our understanding of certain details of our business and increase the risk of error for our Family metrics estimates. Our techniques and models rely on a variety of data signals from different products, and we rely on more limited data signals for some products compared to others. For example, as a result of limited visibility into encrypted products, we have fewer data signals from WhatsApp user accounts and primarily rely on phone numbers and device information to match WhatsApp user accounts with accounts on our other products. Any loss of access to data signals we use in our process for calculating Family metrics, whether as a result of our own product decisions, actions by third-party browser or mobile platforms, regulatory or legislative requirements, or other factors, also may impact the stability or accuracy of our reported Family metrics, as well as our ability to report these metrics at all. Our estimates of Family metrics also may change as our methodologies evolve, including through the application of new data signals or technologies, product changes, or other improvements in our user surveys, algorithms, or machine learning that may improve our ability to match accounts within and across our products or otherwise evaluate the broad population of our users. In addition, such evolution may allow us to identify previously undetected violating accounts (as defined below).\n",
182 | "\n",
183 | "We regularly evaluate our Family metrics to estimate the percentage of our DAP consisting solely of \"violating\" accounts. We define \"violating\" accounts as accounts which we believe are intended to be used for purposes that violate our terms of service, including bots and spam. In the first quarter of 2024, we estimated that less than 3% of our worldwide DAP consisted solely of violating accounts. Such estimation is based on an internal review of a limited sample of accounts, and we apply significant judgment in making this determination. For example, we look for account information and behaviors associated with Facebook and Instagram accounts that appear to be inauthentic to the reviewers, but we have limited visibility into WhatsApp user activity due to encryption. In addition, if we believe an individual person has one or more violating accounts, we do not include such person in our violating accounts estimation as long as we believe they have one account that does not constitute a violating account. From time to time, we disable certain user accounts, make product changes, or take other actions to reduce the number of violating accounts among our users, which may also reduce our DAP estimates in a particular period. We intend to disclose our estimates of the percentage of our DAP consisting solely of violating accounts on an annual basis. Violating accounts are very difficult to measure at our scale, and it is possible that the actual number of violating accounts may vary significantly from our estimates.\n",
184 | "\n",
185 | "## User Geography\n",
186 | "\n",
187 | "Our estimates for revenue by user location, as well as year-over-year percentage changes in ad impressions delivered and the average price per ad by user location, are also affected by data limitations and other challenges in measuring user geography. Our data regarding the geographic location of our users is estimated based on a number of factors, such as the user's IP address and self-disclosed location. These factors may not always accurately reflect the user's actual location. For example, a user may appear to be accessing our products from the location of the proxy server that the user connects to rather than from the user's actual location. The methodologies used to measure our metrics are also susceptible to algorithm or other technical errors.\n",
188 | "\n",
189 | " 17"
190 | ],
191 | "text/plain": [
192 | ""
193 | ]
194 | },
195 | "metadata": {},
196 | "output_type": "display_data"
197 | }
198 | ],
199 | "source": [
200 | "# Join the list elements with newlines to create a single string\n",
201 | "markdown_text = '\\n\\n'.join(markdown_output)\n",
202 | "display(Markdown(markdown_text))"
203 | ]
204 | }
205 | ],
206 | "metadata": {
207 | "kernelspec": {
208 | "display_name": "any",
209 | "language": "python",
210 | "name": "python3"
211 | },
212 | "language_info": {
213 | "codemirror_mode": {
214 | "name": "ipython",
215 | "version": 3
216 | },
217 | "file_extension": ".py",
218 | "mimetype": "text/x-python",
219 | "name": "python",
220 | "nbconvert_exporter": "python",
221 | "pygments_lexer": "ipython3",
222 | "version": "3.10.15"
223 | }
224 | },
225 | "nbformat": 4,
226 | "nbformat_minor": 2
227 | }
228 |
--------------------------------------------------------------------------------
/examples/sample_data/Earnings-Presentation-Q2-2024.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/Earnings-Presentation-Q2-2024.pdf
--------------------------------------------------------------------------------
/examples/sample_data/cambioml_logo_large.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/cambioml_logo_large.png
--------------------------------------------------------------------------------
/examples/sample_data/resume_1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/resume_1.pdf
--------------------------------------------------------------------------------
/examples/sample_data/resume_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/resume_1.png
--------------------------------------------------------------------------------
/examples/sample_data/sample.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/sample.pdf
--------------------------------------------------------------------------------
/examples/sample_data/stoxx_index_guide_0003.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/stoxx_index_guide_0003.pdf
--------------------------------------------------------------------------------
/examples/sample_data/test1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test1.pdf
--------------------------------------------------------------------------------
/examples/sample_data/test2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test2.pdf
--------------------------------------------------------------------------------
/examples/sample_data/test3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test3.pdf
--------------------------------------------------------------------------------
/examples/sample_data/test3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test3.png
--------------------------------------------------------------------------------
/examples/sample_data/test_1figure_1table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_1figure_1table.png
--------------------------------------------------------------------------------
/examples/sample_data/test_invoice.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_invoice.pdf
--------------------------------------------------------------------------------
/examples/sample_data/test_medical_report.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_medical_report.jpeg
--------------------------------------------------------------------------------
/examples/sample_data/test_odf.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_odf.docx
--------------------------------------------------------------------------------
/examples/sample_data/test_odf.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_odf.pptx
--------------------------------------------------------------------------------
/examples/sample_data/test_w2.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_w2.docx
--------------------------------------------------------------------------------
/examples/sample_data/test_w2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_w2.png
--------------------------------------------------------------------------------
/examples/sample_data/test_w2.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_w2.pptx
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "any-parser"
3 | version = "0.0.24"
4 | description = "Parser for all."
5 | authors = ["CambioML "]
6 | maintainers = ["Rachel Hu "]
7 | readme = "README.md"
8 |
9 | [tool.poetry.dependencies]
10 | python = ">=3.9,<3.13"
11 | requests = "^2.25.0"
12 | python-dotenv = "^1.0.0"
13 | pydantic = "^2.10.3"
14 |
15 | [tool.poetry.group.dev.dependencies]
16 | black = "^24.8.0"
17 | isort = "^5.13.2"
18 | autoflake = "^2.3.1"
19 | pytest = "^8.3.3"
20 | pre-commit = "^4.0.1"
21 |
22 | [tool.poetry.group.optional.dependencies]
23 | Levenshtein = [
24 | { version = "0.25.1", python = "<3.9" },
25 | { version = "0.26.0", python = ">=3.9" }
26 | ]
27 |
28 | [build-system]
29 | requires = ["poetry-core"]
30 | build-backend = "poetry.core.masonry.api"
31 |
--------------------------------------------------------------------------------
/run_tests.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | python -m unittest discover tests -v
3 |
--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
1 | # Testing
2 | Overview of running tests for the AnyParser SDK. These tests should be run before submitting any pull request.
3 |
4 | These tests are written using the unittest framework in Python. The tests are located in the `tests/test.py` file. Test data is located in the `tests/test_data.py` file.
5 |
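Each entry in `EXTRACT_JSON_TEST_DATA` (defined in `tests/test_data.py`) pairs a sample file with an extraction instruction and the expected key/value result. A condensed sketch of one entry's shape, trimmed down from that file:

```python
# One entry from EXTRACT_JSON_TEST_DATA (condensed; the real entries list more fields)
{
    "working_file": "./examples/sample_data/test1.pdf",  # input document
    "extract_instruction": {
        "first_name": "the first name of the employee",  # field -> natural-language instruction
    },
    "correct_output": {
        "first_name": ["Jesan"],  # expected extracted value(s)
    },
}
```
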
6 | ## Setup
7 | 1. Install the required packages by running the following command:
8 | ```bash
9 | poetry install
10 | ```
11 | The development packages that get installed are listed under the `[tool.poetry.group.dev.dependencies]` section of `pyproject.toml`.
12 |
13 | 2. Add a `.env` file in the `tests` folder with the following content:
14 | ```bash
15 | CAMBIO_API_KEY=*************
16 | ```
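
The test suite reads this key through `python-dotenv` (see `tests/test.py`, which calls `load_dotenv(override=True)` at import time). A minimal sanity check you can run from the repo root to confirm the key is picked up; the explicit `tests/.env` path below is only for this standalone snippet, since the tests locate the file automatically:

```python
import os

from dotenv import load_dotenv

# Load the key from the .env file created above.
load_dotenv("tests/.env", override=True)

if not os.environ.get("CAMBIO_API_KEY"):
    raise ValueError("CAMBIO_API_KEY is not set")
print("CAMBIO_API_KEY loaded")
```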
17 |
18 | ## Pre-commit
19 | This project uses pre-commit to run checks before committing code. To initialize `pre-commit` for this repo, run the following command:
20 | ```bash
21 | pre-commit install
22 | ```
23 |
24 | Now, with every commit, the checks will run automatically on the files added to the commit. The checks include:
25 | - `black` for code formatting
26 | - `flake8` for linting
27 | - `isort` for import sorting
28 | - running the unit tests in `tests/test.py`
29 |
30 | If you want to run the checks manually, you can run the following command:
31 | ```bash
32 | pre-commit run --all-files
33 | ```
34 |
35 | ## Running Tests Manually
36 | 1. Make sure you are in the project root folder.
37 | 2. Run the following command:
38 | ```bash
39 | ./run_tests.sh
40 | ```
41 |
42 | If you just want to run an individual test within the test.py file, you can run the following command:
43 | ```bash
44 | python -m unittest -k <test_name>
45 | ```
46 |
47 | For example, if you want to run `test_pdf_sync_extract`, you can run the following command:
48 | ```bash
49 | python -m unittest -k test_pdf_sync_extract
50 | ```
51 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/tests/__init__.py
--------------------------------------------------------------------------------
/tests/outputs/correct_docx_output.txt:
--------------------------------------------------------------------------------
1 | ## Test document
2 |
3 | Here is an example chart:
4 |
5 |
6 | | Investor Metrics | FY23 Q1 | FY23 Q2 | FY23 Q3 | FY23 Q4 | FY24 Q1 |
7 | |---|---|---|---|---|---|
8 | | Office Commercial products and cloud services revenue growth (y/y) | 7% / 13% | 7% / 14% | 13% / 17% | 12% / 14% | 15% / 14% |
9 | | Office Consumer products and cloud services revenue growth (y/y) | 7% / 11% | (2)% / 3% | 1% / 4% | 3% / 6% | 3% / 4% |
10 | | Office 365 Commercial seat growth (a/o) | 14% | 12% | 11% | 11% | 10% |
11 | | Microsoft 365 Consumer subscribers (in millions) | 65.1 | 67.7 | 70.8 | 74.9 | 76.7 |
12 | | Dynamics products and cloud services revenue growth (y/y) | 15% / 22% | 13% / 20% | 17% / 21% | 19% / 21% | 22% / 21% |
13 | | LinkedIn revenue growth (y/y) | 17% / 21% | 10% / 14% | 8% / 11% | 6% / 8% | 8% |
14 |
15 |
16 | Growth rates include non-GAAP CC growth (GAAP % / CC %)
--------------------------------------------------------------------------------
/tests/outputs/correct_pdf_output.txt:
--------------------------------------------------------------------------------
1 | STOXX INDEX METHODOLOGY GUIDE
2 |
3 | ## CONTENTS
4 |
5 | 6.5.1. OVERVIEW 49
6 | 6.5.2. INDEX REVIEW 49
7 | 6.5.3. ONGOING MAINTENANCE 51
8 |
9 | 7. STOXX BENCHMARK INDICES (BMI) 52
10 |
11 | 7.1. STOXX GLOBAL INDICES 52
12 | 7.1.1. OVERVIEW 52
13 | 7.1.2. INDEX REVIEW 53
14 | 7.1.3. ONGOING MAINTENANCE 55
15 |
16 | 7.2. STOXX GLOBAL 1800 AND DERIVED INDICES 56
17 | 7.2.1. OVERVIEW 56
18 | 7.2.2. INDEX REVIEW 56
19 | 7.2.3. ONGOING MAINTENANCE 58
20 |
21 | 7.3. SIZE INDICES BASED ON THE STOXX GLOBAL INDICES 60
22 | 7.3.1. OVERVIEW 60
23 | 7.3.2. INDEX REVIEW 60
24 | 7.3.3. ONGOING MAINTENANCE 62
25 |
26 | 7.4. SECTOR INDICES BASED ON THE STOXX GLOBAL INDICES 63
27 | 7.4.1. OVERVIEW 63
28 | 7.4.2. INDEX REVIEW 63
29 | 7.4.3. ONGOING MAINTENANCE 64
30 |
31 | 7.5. STOXX EUROPE 600 AND EURO STOXX SUPERSECTOR INDICES: 30% / 15% CAPS 65
32 | 7.5.1. OVERVIEW 65
33 | 7.5.2. INDEX REVIEW 65
34 | 7.5.3. ONGOING MAINTENANCE 66
35 |
36 | 7.6. STOXX REGIONAL REAL ESTATE INDICES: 20% CAPS67
37 | 7.6.1. OVERVIEW 67
38 | 7.6.2. INDEX REVIEW 67
39 | 7.6.3. ONGOING MAINTENANCE 67
40 |
41 | 7.7. STOXX EMERGING MARKETS 800 LO 68
42 | 7.7.1. OVERVIEW 68
43 | 7.7.2. INDEX REVIEW 68
44 | 7.7.3. ONGOING MAINTENANCE 68
45 |
46 | 7.8. STOXX INDUSTRY AND SUPERSECTOR LEGACY INDICES 70
47 | 7.8.1. OVERVIEW 70
48 | 7.8.2. INDEX REVIEW 71
49 | 7.8.3. ONGOING MAINTENANCE 71
50 |
51 | 7.9. EURO STOXX SUPERSECTOR 5/10/40 INDICES 72
52 | 7.9.1. OVERVIEW 72
53 | 7.9.2. INDEX REVIEW 72
54 | 7.9.3. ONGOING MAINTENANCE 73
55 |
56 | 7.10. STOXX EUROPE 600 INDUSTRY 30-15 INDICES 74
57 | 7.10.1. OVERVIEW 74
58 | 7.10.2. INDEX REVIEW 74
59 | 7.10.3. ONGOING MAINTENANCE 75
60 |
61 | 7.11. STOXX SEMICONDUCTOR 30 INDEX 76
62 | 7.11.1. OVERVIEW 76
63 | 7.11.2. INDEX REVIEW 76
64 | 7.11.3. ONGOING MAINTENANCE 77
65 |
66 | ## 8. STOXX EQUAL WEIGHT INDICES 78
67 |
68 | 8.1. STOXX EQUAL WEIGHT INDICES 78
69 | 8.1.1. OVERVIEW 78
70 | 8.1.2. INDEX REVIEW 78
71 | 8.1.3. ONGOING MAINTENANCE 78
72 |
73 | ## 9. STOXX BLUE-CHIP INDICES 80
74 |
75 | 9.1. STOXX GLOBAL AND COUNTRY BLUE-CHIP INDICES 80
76 | 9.1.1. OVERVIEW 80
77 | 9.1.2. INDEX REVIEW 81
78 | 9.1.3. ONGOING MAINTENANCE 84
79 |
80 | 9.2. EURO STOXX 50 85
81 | 9.2.1. OVERVIEW 85
82 | 9.2.2. INDEX REVIEW 85
83 | 9.2.3. ONGOING MAINTENANCE 86
84 |
85 | 9.3. STOXX REGIONAL BLUE-CHIP INDICES 88
86 | 9.3.1. OVERVIEW 88
87 | 9.3.2. INDEX REVIEW 88
88 | 9.3.3. ONGOING MAINTENANCE 89
89 |
90 | 9.4. STOXX GLOBAL 150 91
91 | 9.4.1. OVERVIEW 91
92 | 9.4.2. INDEX REVIEW 91
93 | 9.4.3. ONGOING MAINTENANCE 91
94 |
95 | 9.5. STOXX BALKAN 50 EQUAL WEIGHT 92
96 | 9.5.1. OVERVIEW 92
97 | 9.5.2. INDEX REVIEW 92
98 | 9.5.3. ONGOING MAINTENANCE 93
99 |
100 | 9.6. STOXX CANADA 60 94
101 | 9.6.1. OVERVIEW 94
102 | 9.6.2. INDEX REVIEW 94
103 | 9.6.3. ONGOING MAINTENANCE 95
104 |
105 | ## 10. STOXX DIVIDEND INDICES 96
106 |
107 | 10.1. STOXX SELECT DIVIDEND INDICES 96
108 | 10.1.1. OVERVIEW 96
109 | 10.1.2. INDEX REVIEW 96
110 | 10.1.3. STOXX SELECT DIVIDEND INDICES 99
111 | 10.1.4. ONGOING MAINTENANCE 101
112 |
113 | 10.2. STOXX ASEAN-FIVE SELECT DIVIDEND 50 104
114 | 10.2.1. OVERVIEW 104
115 | 10.2.2. INDEX REVIEW 104
116 | 10.2.3. ONGOING MAINTENANCE 105
117 |
118 | 10.3. STOXX ASEAN SELECT DIVIDEND 30 106
119 |
120 | 3/529
121 |
122 | Part of DEUTSCHE BÖRSE GROUP
--------------------------------------------------------------------------------
/tests/outputs/correct_png_output.txt:
--------------------------------------------------------------------------------
1 | | Investor Metrics | FY23 Q1 | FY23 Q2 | FY23 Q3 | FY23 Q4 | FY24 Q1 |
2 | |---|---|---|---|---|---|
3 | | Office Commercial products and cloud services revenue growth (y/y) | 7% / 13% | 7% / 14% | 13% / 17% | 12% / 14% | 15% / 14% |
4 | | Office Consumer products and cloud services revenue growth (y/y) | 7% / 11% | (2)% / 3% | 1% / 4% | 3% / 6% | 3% / 4% |
5 | | Office 365 Commercial seat growth (y/y) | 14% | 12% | 11% | 11% | 10% |
6 | | Microsoft 365 Consumer subscribers (in millions) | 65.1 | 67.7 | 70.8 | 74.9 | 76.7 |
7 | | Dynamics products and cloud services revenue growth (y/y) | 15% / 22% | 13% / 20% | 17% / 21% | 19% / 21% | 22% / 21% |
8 | | LinkedIn revenue growth (y/y) | 17% / 21% | 10% / 14% | 8% / 11% | 6% / 8% | 8% |
--------------------------------------------------------------------------------
/tests/outputs/correct_pptx_output.txt:
--------------------------------------------------------------------------------
1 | ## Test finical report
2 | ## Title
3 |
4 | • Chart 1 example
5 |
6 |
7 | | Investor Metrics | FY23 Q1 | FY23 Q2 | FY23 Q3 | FY23 Q4 | FY24 Q1 |
8 | |---|---|---|---|---|---|
9 | | Office Commercial products and cloud services revenue growth (y/y) | 7% / 13% | 7% / 14% | 13% / 17% | 12% / 14% | 15% / 14% |
10 | | Office Consumer products and cloud services revenue growth (y/y) | 7% / 11% | (2)% / 3% | 1% / 4% | 3% / 6% | 3% / 4% |
11 | | Office 365 Commercial seat growth (y/y) | 14% | 12% | 11% | 11% | 10% |
12 | | Microsoft 365 Consumer subscribers (in millions) | 65.1 | 67.7 | 70.8 | 74.9 | 76.7 |
13 | | Dynamics products and cloud services revenue growth (y/y) | 15% / 22% | 13% / 20% | 17% / 21% | 19% / 21% | 22% / 21% |
14 | | LinkedIn revenue growth (y/y) | 17% / 21% | 10% / 14% | 8% / 11% | 6% / 8% | 8% |
15 |
16 |
17 | Growth rates include non-GAAP CC growth (GAAP % / CC %).
18 | Thanks
--------------------------------------------------------------------------------
/tests/test.py:
--------------------------------------------------------------------------------
1 | """Testing Synchronous and Asynchronous Extraction"""
2 |
3 | import base64
4 | import os
5 | import sys
6 | import time
7 | import unittest
8 | from pathlib import Path
9 |
10 | import Levenshtein
11 | from dotenv import load_dotenv
12 |
13 | from tests.test_data import EXTRACT_JSON_TEST_DATA
14 |
15 | sys.path.append(".")
16 | load_dotenv(override=True)
17 | from any_parser import AnyParser # noqa: E402
18 |
19 |
20 | def get_ground_truth(file_path: str) -> str:
21 | """Get the ground truth from the file."""
22 | with open(file_path, "r", encoding="utf-8") as file:
23 | return file.read()
24 |
25 |
26 | def _preprocess_markdown_text(text: str) -> str:
27 | """Clean the markdown text."""
28 | return text.replace("#", "").replace("\n", "")
29 |
30 |
31 | def compare_markdown(generated_output: str, correct_output: str) -> float:
32 | """
33 | Compare the generated markdown to the correct markdown using
34 | Levenshtein Distance.
35 | """
36 | # Preprocess both outputs to clean markdown text
37 | generated_output = _preprocess_markdown_text(generated_output)
38 | correct_output = _preprocess_markdown_text(correct_output)
39 |
40 | distance = Levenshtein.distance(generated_output, correct_output)
41 |
42 | max_len = max(len(generated_output), len(correct_output))
43 | similarity_percentage = ((max_len - distance) / max_len) * 100
44 |
45 | return similarity_percentage
46 |
47 |
48 | class TestAnyParser(unittest.TestCase):
49 | """Testing Any Parser"""
50 |
51 | def setUp(self):
52 | self.api_key = os.environ.get("CAMBIO_API_KEY")
53 | if not self.api_key:
54 | raise ValueError("CAMBIO_API_KEY is not set")
55 | self.ap = AnyParser(self.api_key)
56 |
57 | def test_pdf_sync_parse(self):
58 | """Synchronous PDF Parse"""
59 | working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf"
60 | correct_output_file = "./tests/outputs/correct_pdf_output.txt"
61 |
62 | # extract
63 | markdown_list, elapsed_time = self.ap.parse(file_path=working_file)
64 | markdown = "\n".join(markdown_list)
65 | self.assertFalse(markdown.startswith("Error:"), markdown)
66 | correct_output = get_ground_truth(correct_output_file)
67 | percentage = compare_markdown(markdown, correct_output)
68 |
69 | self.assertGreaterEqual(
70 | percentage, 90, f"Output similarity too low: {percentage:.2f}%"
71 | )
72 | self.assertIn("Time Elapsed", elapsed_time)
73 |
74 | def test_pdf_sync_parse_with_file_content(self):
75 | """Synchronous PDF Parse with file content"""
76 | working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf"
77 | correct_output_file = "./tests/outputs/correct_pdf_output.txt"
78 |
79 | with open(working_file, "rb") as file:
80 | file_content = base64.b64encode(file.read()).decode("utf-8")
81 | file_type = Path(working_file).suffix.lower().lstrip(".")
82 |
83 | # extract
84 | markdown_list, elapsed_time = self.ap.parse(
85 | file_content=file_content, file_type=file_type
86 | )
87 | markdown = "\n".join(markdown_list)
88 |
89 | self.assertFalse(markdown.startswith("Error:"), markdown)
90 | correct_output = get_ground_truth(correct_output_file)
91 | percentage = compare_markdown(markdown, correct_output)
92 |
93 | self.assertGreaterEqual(
94 | percentage, 90, f"Output similarity too low: {percentage:.2f}%"
95 | )
96 | self.assertIn("Time Elapsed", elapsed_time)
97 |
98 | def test_pdf_async_parse_and_fetch(self):
99 | """Asynchronous PDF Parse and Fetch"""
100 | working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf"
101 | correct_output_file = "./tests/outputs/correct_pdf_output.txt"
102 |
103 | # extract
104 | file_id = self.ap.async_parse(file_path=working_file)
105 | self.assertFalse(file_id.startswith("Error:"), file_id)
106 | # fetch
107 | markdown_list = self.ap.async_fetch(file_id=file_id)
108 | markdown = "\n".join(markdown_list)
109 | self.assertFalse(markdown.startswith("Error:"), markdown)
110 | correct_output = get_ground_truth(correct_output_file)
111 | percentage = compare_markdown(markdown, correct_output)
112 |
113 | self.assertGreaterEqual(
114 | percentage, 90, f"Output similarity too low: {percentage:.2f}%"
115 | )
116 |
117 | def test_pdf_async_parse_and_fetch_with_file_content(self):
118 | """Asynchronous PDF Parse and Fetch with file content"""
119 | working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf"
120 | correct_output_file = "./tests/outputs/correct_pdf_output.txt"
121 |
122 | with open(working_file, "rb") as file:
123 | file_content = base64.b64encode(file.read()).decode("utf-8")
124 | file_type = Path(working_file).suffix.lower().lstrip(".")
125 |
126 | # extract
127 | file_id = self.ap.async_parse(file_content=file_content, file_type=file_type)
128 | self.assertFalse(file_id.startswith("Error:"), file_id)
129 | # fetch
130 | markdown_list = self.ap.async_fetch(file_id=file_id)
131 | markdown = "\n".join(markdown_list)
132 | self.assertFalse(markdown.startswith("Error:"), markdown)
133 | correct_output = get_ground_truth(correct_output_file)
134 | percentage = compare_markdown(markdown, correct_output)
135 |
136 | self.assertGreaterEqual(
137 | percentage, 90, f"Output similarity too low: {percentage:.2f}%"
138 | )
139 |
140 | def test_docx_sync_extract(self):
141 | """Synchronous Word Extraction"""
142 | working_file = "./examples/sample_data/test_odf.docx"
143 | correct_output_file = "./tests/outputs/correct_docx_output.txt"
144 |
145 | # extract
146 | markdown_list, elapsed_time = self.ap.parse(file_path=working_file)
147 | markdown = "\n".join(markdown_list)
148 | self.assertFalse(markdown.startswith("Error:"), markdown)
149 | correct_output = get_ground_truth(correct_output_file)
150 | percentage = compare_markdown(markdown, correct_output)
151 |
152 | self.assertGreaterEqual(
153 | percentage, 90, f"Output similarity too low: {percentage:.2f}%"
154 | )
155 | self.assertIn("Time Elapsed", elapsed_time)
156 |
157 | def test_docx_async_parse_and_fetch(self):
158 | """Asynchronous Word Parse and Fetch"""
159 | working_file = "./examples/sample_data/test_odf.docx"
160 | correct_output_file = "./tests/outputs/correct_docx_output.txt"
161 |
162 | # extract
163 | file_id = self.ap.async_parse(file_path=working_file)
164 | self.assertFalse(file_id.startswith("Error:"), file_id)
165 | # fetch
166 | markdown_list = self.ap.async_fetch(file_id=file_id)
167 | markdown = "\n".join(markdown_list)
168 | self.assertFalse(markdown.startswith("Error:"), markdown)
169 | correct_output = get_ground_truth(correct_output_file)
170 | percentage = compare_markdown(markdown, correct_output)
171 |
172 | self.assertGreaterEqual(
173 | percentage, 90, f"Output similarity too low: {percentage:.2f}%"
174 | )
175 |
176 | def test_pptx_sync_extract(self):
177 | """Synchronous Powerpoint Extraction"""
178 | working_file = "./examples/sample_data/test_odf.pptx"
179 | correct_output_file = "./tests/outputs/correct_pptx_output.txt"
180 |
181 | # extract
182 | markdown_list, elapsed_time = self.ap.parse(file_path=working_file)
183 | markdown = "\n".join(markdown_list)
184 | self.assertFalse(markdown.startswith("Error:"), markdown)
185 | correct_output = get_ground_truth(correct_output_file)
186 | percentage = compare_markdown(markdown, correct_output)
187 |
188 | self.assertGreaterEqual(
189 | percentage, 90, f"Output similarity too low: {percentage:.2f}%"
190 | )
191 | self.assertIn("Time Elapsed", elapsed_time)
192 |
193 | def test_pptx_async_parse_and_fetch(self):
194 | """Asynchronous Powerpoint Parse and Fetch"""
195 | working_file = "./examples/sample_data/test_odf.pptx"
196 | correct_output_file = "./tests/outputs/correct_pptx_output.txt"
197 |
198 | # extract
199 | file_id = self.ap.async_parse(file_path=working_file)
200 | self.assertFalse(file_id.startswith("Error:"), file_id)
201 | # fetch
202 | markdown_list = self.ap.async_fetch(file_id=file_id)
203 | markdown = "\n".join(markdown_list)
204 | self.assertFalse(markdown.startswith("Error:"), markdown)
205 | correct_output = get_ground_truth(correct_output_file)
206 | percentage = compare_markdown(markdown, correct_output)
207 |
208 | self.assertGreaterEqual(
209 | percentage, 90, f"Output similarity too low: {percentage:.2f}%"
210 | )
211 |
212 | def test_image_sync_extract(self):
213 | """Synchronous Image Extraction"""
214 | working_file = "./examples/sample_data/test3.png"
215 | correct_output_file = "./tests/outputs/correct_png_output.txt"
216 |
217 | # extract
218 | markdown_list, elapsed_time = self.ap.parse(file_path=working_file)
219 | markdown = "\n".join(markdown_list)
220 | self.assertFalse(markdown.startswith("Error:"), markdown)
221 | correct_output = get_ground_truth(correct_output_file)
222 | percentage = compare_markdown(markdown, correct_output)
223 |
224 | self.assertGreaterEqual(
225 | percentage, 90, f"Output similarity too low: {percentage:.2f}%"
226 | )
227 | self.assertIn("Time Elapsed", elapsed_time)
228 |
229 | def test_image_async_parse_and_fetch(self):
230 | """Asynchronous Image Parse and Fetch"""
231 | working_file = "./examples/sample_data/test3.png"
232 | correct_output_file = "./tests/outputs/correct_png_output.txt"
233 |
234 | # extract
235 | file_id = self.ap.async_parse(file_path=working_file)
236 | self.assertFalse(file_id.startswith("Error:"), file_id)
237 | # fetch
238 | markdown_list = self.ap.async_fetch(file_id=file_id)
239 | markdown = "\n".join(markdown_list)
240 | self.assertFalse(markdown.startswith("Error:"), markdown)
241 | correct_output = get_ground_truth(correct_output_file)
242 | percentage = compare_markdown(markdown, correct_output)
243 |
244 | self.assertGreaterEqual(
245 | percentage, 90, f"Output similarity too low: {percentage:.2f}%"
246 | )
247 |
248 | def test_sync_extract_key_value(self):
249 | """
250 | Synchronous JSON Extraction with subtests for different file formats
251 | """
252 | for data in EXTRACT_JSON_TEST_DATA:
253 | with self.subTest(working_file=data["working_file"]):
254 | # extract
255 | key_value_result, elapsed_time = self.ap.extract_key_value(
256 | file_path=data["working_file"],
257 | extract_instruction=data["extract_instruction"],
258 | )
259 |
260 | # assertions
261 | self.assertEqual(key_value_result, data["correct_output"])
262 | self.assertIn("Time Elapsed", elapsed_time)
263 |
264 | def test_async_extract_key_value_and_fetch(self):
265 | """
266 | Asynchronous JSON Extraction with subtests for different file formats
267 | """
268 | for data in EXTRACT_JSON_TEST_DATA:
269 | with self.subTest(working_file=data["working_file"]):
270 | # extract
271 | file_id = self.ap.async_extract_key_value(
272 | file_path=data["working_file"],
273 | extract_instruction=data["extract_instruction"],
274 | )
275 | self.assertFalse(file_id.startswith("Error:"), file_id)
276 | # fetch
277 | key_value_result = self.ap.async_fetch(file_id=file_id)
278 | # assertions
279 | self.assertEqual(key_value_result, data["correct_output"])
280 | # wait 1 s between requests
281 | time.sleep(1)
282 |
283 |
284 | if __name__ == "__main__":
285 | unittest.main(verbosity=2)
286 |
--------------------------------------------------------------------------------
/tests/test_batch_api.py:
--------------------------------------------------------------------------------
1 | """Testing Batch API Extraction"""
2 |
3 | import os
4 | import sys
5 | import unittest
6 |
7 | from dotenv import load_dotenv
8 |
9 | sys.path.append(".")
10 | load_dotenv(override=True)
11 | from any_parser import AnyParser # noqa: E402
12 |
13 |
14 | class TestAnyParserBatchAPI(unittest.TestCase):
15 | """Testing Any Parser Batch API"""
16 |
17 | def setUp(self):
18 | self.api_key = os.environ.get("CAMBIO_API_KEY")
19 | if not self.api_key:
20 | raise ValueError("CAMBIO_API_KEY is not set")
21 | self.ap = AnyParser(self.api_key)
22 |
23 | def test_batch_api_create(self):
24 | """Batch API Create"""
25 | working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf"
26 |
27 | response = self.ap.batches.create(working_file)
28 |
29 | self.assertIsNotNone(response)
30 | self.assertEqual(response.requestStatus, "UPLOADED")
31 |
32 | request_id = response.requestId
33 | status = self.ap.batches.retrieve(request_id)
34 | self.assertEqual(status.requestStatus, "UPLOADED")
35 |
36 | quota = self.ap.batches.get_usage()
37 | self.assertGreaterEqual(quota.pageRemaining, 0)
38 |
--------------------------------------------------------------------------------
/tests/test_data.py:
--------------------------------------------------------------------------------
1 | EXTRACT_JSON_TEST_DATA = [
2 | {
3 | "working_file": "./examples/sample_data/test1.pdf",
4 | "extract_instruction": {
5 | "social_security_number": "the social security number of the employee",
6 | "ein": "the employer identification number",
7 | "first_name": "the first name of the employee",
8 | "last_name": "the last name of the employee",
9 | },
10 | "correct_output": {
11 | "social_security_number": ["758-58-5787"],
12 | "ein": ["78-8778788"],
13 | "first_name": ["Jesan"],
14 | "last_name": ["Rahaman"],
15 | },
16 | },
17 | # {
18 | # "working_file": "./examples/sample_data/test_w2.pptx",
19 | # "extract_instruction": {
20 | # "social_security_number": "the social security number of the employee",
21 | # "ein": "the employer identification number",
22 | # "first_name": "the first name of the employee",
23 | # "last_name": "the last name of the employee",
24 | # },
25 | # "correct_output": [
26 | # {
27 | # "social_security_number": ["758-58-5787"],
28 | # "ein": ["78-8778788"],
29 | # "first_name": ["Jesan"],
30 | # "last_name": ["Rahaman"],
31 | # }
32 | # ],
33 | # },
34 | # {
35 | # "working_file": "./examples/sample_data/test_w2.docx",
36 | # "extract_instruction": {
37 | # "social_security_number": "the social security number of the employee",
38 | # "ein": "the employer identification number",
39 | # "first_name": "the first name of the employee",
40 | # "last_name": "the last name of the employee",
41 | # },
42 | # "correct_output": [
43 | # {
44 | # "social_security_number": ["758-58-5787"],
45 | # "ein": ["78-8778788"],
46 | # "first_name": ["Jesan"],
47 | # "last_name": ["Rahaman"],
48 | # }
49 | # ],
50 | # },
51 | {
52 | "working_file": "./examples/sample_data/test_w2.png",
53 | "extract_instruction": {
54 | "social_security_number": "the social security number of the employee",
55 | "ein": "the employer identification number",
56 | "first_name": "the first name of the employee",
57 | "last_name": "the last name of the employee",
58 | },
59 | "correct_output": {
60 | "social_security_number": ["758-58-5787"],
61 | "ein": ["78-8778788"],
62 | "first_name": ["Jesan"],
63 | "last_name": ["Rahaman"],
64 | },
65 | },
66 | ]
67 |
--------------------------------------------------------------------------------