├── .github
│   ├── CODEOWNERS
│   ├── ISSUE_TEMPLATE
│   │   ├── bug-report.yml
│   │   ├── config.yml
│   │   ├── documentation.yml
│   │   └── feature-request.yml
│   ├── pull_request_template.md
│   └── workflows
│       ├── pr_agent.yml
│       └── python-app.yml
├── .gitignore
├── .pre-commit-config.yaml
├── README.md
├── any_parser
│   ├── __init__.py
│   ├── any_parser.py
│   ├── async_parser.py
│   ├── base_parser.py
│   ├── batch_parser.py
│   ├── constants.py
│   ├── sync_parser.py
│   └── utils.py
├── examples
│   ├── async_extract_key_value_img.ipynb
│   ├── async_extract_key_value_pdf.ipynb
│   ├── async_extract_pii.ipynb
│   ├── async_extract_resume_key_value.ipynb
│   ├── async_extract_tables.ipynb
│   ├── async_parse_pdf.ipynb
│   ├── async_parse_pdf2.ipynb
│   ├── async_parse_with_layout.ipynb
│   ├── async_parse_with_ocr.ipynb
│   ├── extract_key_value_img.ipynb
│   ├── extract_key_value_pdf.ipynb
│   ├── extract_pii.ipynb
│   ├── extract_resume_key_value.ipynb
│   ├── extract_tables.ipynb
│   ├── parse_batch_api.ipynb
│   ├── parse_docx.ipynb
│   ├── parse_img.ipynb
│   ├── parse_pdf.ipynb
│   ├── parse_pdf2.ipynb
│   └── sample_data
│       ├── Earnings-Presentation-Q2-2024.pdf
│       ├── cambioml_logo_large.png
│       ├── resume_1.pdf
│       ├── resume_1.png
│       ├── sample.pdf
│       ├── stoxx_index_guide_0003.pdf
│       ├── test1.pdf
│       ├── test2.pdf
│       ├── test3.pdf
│       ├── test3.png
│       ├── test_1figure_1table.png
│       ├── test_invoice.pdf
│       ├── test_medical_report.jpeg
│       ├── test_odf.docx
│       ├── test_odf.pptx
│       ├── test_w2.docx
│       ├── test_w2.png
│       └── test_w2.pptx
├── pyproject.toml
├── run_tests.sh
└── tests
    ├── README.md
    ├── __init__.py
    ├── outputs
    │   ├── correct_docx_output.txt
    │   ├── correct_pdf_output.txt
    │   ├── correct_png_output.txt
    │   └── correct_pptx_output.txt
    ├── test.py
    ├── test_batch_api.py
    └── test_data.py
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # Default codeowners/reviewers for all code changes
2 | * @CambioML @Sdddell @goldmermaid @lingjiekong @boqiny
3 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug-report.yml:
--------------------------------------------------------------------------------
1 | name: 🐛 Bug Report
2 | description: Create a report to help us reproduce and fix the bug
3 |
4 | body:
5 | - type: markdown
6 | attributes:
7 | value: >
8 | #### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/CambioML/any-parser/issues?q=is%3Aissue+sort%3Acreated-desc+).
9 | - type: textarea
10 | attributes:
11 | label: 🐛 Describe the bug
12 | description: |
13 | Please provide a clear and concise description of what the bug is.
14 |
15 | If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example:
16 |
17 | ```python
18 | ...
19 | ```
20 |
21 | If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
22 |
23 | Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
24 | placeholder: |
25 | A clear and concise description of what the bug is.
26 |
27 | ```python
28 | # Sample code to reproduce the problem
29 | ```
30 |
31 | ```
32 | The error message you got, with the full traceback.
33 | ```
34 | validations:
35 | required: true
36 | - type: textarea
37 | attributes:
38 | label: Versions
39 | description: |
40 | Please run the following and paste the output below.
41 | ```sh
42 | wget https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py
43 | # For security purposes, please check the contents of collect_env.py before running it.
44 | python collect_env.py
45 | ```
46 | validations:
47 | required: true
48 | - type: markdown
49 | attributes:
50 | value: >
51 | Thanks for contributing 🎉!
52 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: true
2 | contact_links:
3 | - name: Questions
4 | url: https://cambiomlworkspace.slack.com/join/shared_invite/zt-1zes33rmt-20Rag043uvExUaUdvt5_xQ#/shared-invite/email
5 | about: Ask questions and discuss with other CambioML community members
6 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/documentation.yml:
--------------------------------------------------------------------------------
1 | name: 📚 Documentation
2 | description: Report an issue related to https://www.cambioml.com/docs/any-parser/index.html
3 |
4 | body:
5 | - type: textarea
6 | attributes:
7 | label: 📚 The doc issue
8 | description: >
9 | A clear and concise description of what content in https://www.cambioml.com/docs/any-parser/index.html is an issue.
10 | validations:
11 | required: true
12 | - type: textarea
13 | attributes:
14 | label: Suggest a potential alternative/fix
15 | description: >
16 | Tell us how we could improve the documentation in this regard.
17 | - type: markdown
18 | attributes:
19 | value: >
20 | Thanks for contributing 🎉!
21 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.yml:
--------------------------------------------------------------------------------
1 | name: 🚀 Feature request
2 | description: Submit a proposal/request for a new any-parser feature
3 |
4 | body:
5 | - type: textarea
6 | attributes:
7 | label: 🚀 The feature, motivation and pitch
8 | description: >
9 | A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too.
10 | validations:
11 | required: true
12 | - type: textarea
13 | attributes:
14 | label: Alternatives
15 | description: >
16 | A description of any alternative solutions or features you've considered, if any.
17 | - type: textarea
18 | attributes:
19 | label: Additional context
20 | description: >
21 | Add any other context or screenshots about the feature request.
22 | - type: markdown
23 | attributes:
24 | value: >
25 | Thanks for contributing 🎉!
26 |
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | ## Description
2 |
3 |
4 | ## Related Issue
5 |
6 |
7 | ## Type of Change
8 |
9 |
10 | - [ ] Bug fix (non-breaking change which fixes an issue)
11 | - [ ] New feature (non-breaking change which adds functionality)
12 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
13 | - [ ] Documentation update
14 | - [ ] Code refactoring
15 | - [ ] Performance improvement
16 |
17 | ## How Has This Been Tested?
18 |
19 |
20 | ## Screenshots (if applicable)
21 |
22 |
23 | ## Checklist
24 |
25 |
26 | - [ ] My code follows the project's style guidelines
27 | - [ ] I have performed a self-review of my own code
28 | - [ ] I have commented my code, particularly in hard-to-understand areas
29 | - [ ] I have made corresponding changes to the documentation
30 | - [ ] My changes generate no new warnings
31 | - [ ] I have added tests that prove my fix is effective or that my feature works
32 | - [ ] New and existing unit tests pass locally with my changes
33 |
34 | ## Additional Notes
35 |
36 |
--------------------------------------------------------------------------------
/.github/workflows/pr_agent.yml:
--------------------------------------------------------------------------------
1 | on:
2 | pull_request:
3 | types: [opened, reopened, ready_for_review]
4 | issue_comment:
5 | jobs:
6 | pr_agent_job:
7 | if: ${{ github.event.sender.type != 'Bot' }}
8 | runs-on: ubuntu-latest
9 | permissions:
10 | issues: write
11 | pull-requests: write
12 | contents: write
13 | name: Run pr agent on every pull request, respond to user comments
14 | steps:
15 | - name: PR Agent action step
16 | id: pragent
17 | uses: Codium-ai/pr-agent@main
18 | env:
19 | OPENAI_KEY: ${{ secrets.OPENAI_API_KEY }}
20 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
21 |
--------------------------------------------------------------------------------
/.github/workflows/python-app.yml:
--------------------------------------------------------------------------------
1 | name: Python application
2 |
3 | on:
4 | push:
5 | branches: [ "main" ]
6 | pull_request:
7 | branches: [ "main" ]
8 |
9 | permissions:
10 | contents: read
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ubuntu-latest
16 | strategy:
17 | matrix:
18 | python-version: ["3.10"]
19 | max-parallel: 1 # Ensures the tests run sequentially
20 |
21 | steps:
22 | - uses: actions/checkout@v3
23 | - name: Set up Python
24 | uses: actions/setup-python@v3
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 | - name: Install dependencies
28 | run: |
29 | python -m pip install --upgrade pip
30 | pip install flake8
31 | pip install black
32 | pip install isort
33 | python -m pip install poetry
34 | poetry install --no-root # This will install the project dependencies defined in pyproject.toml
35 | - name: Lint with flake8
36 | run: |
37 | # stop the build if there are Python syntax errors or undefined names
38 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
39 | # exit-zero treats all errors as warnings.
40 | flake8 . --count --exit-zero --max-complexity=10 --statistics
41 | - name: Format code with Black
42 | run: |
43 | black . --exclude="" --check --verbose
44 | - name: Sort imports with isort
45 | run: |
46 | isort . --profile=black --check-only --verbose
47 | - name: Test with unittest
48 | env:
49 | CAMBIO_API_KEY: ${{ secrets.CAMBIO_API_KEY }}
50 | run: |
51 | poetry run python -m unittest discover -v tests
52 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
162 | # mac
163 | .DS_Store
164 |
165 | # vscode
166 | .vscode/
167 |
168 | # data/
169 | *.xlsx
170 | *.csv
171 | *.jsonl
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/psf/black
3 | rev: 24.8.0
4 | hooks:
5 | - id: black
6 | args: [--exclude=""]
7 |
8 | # this is not technically always safe but usually is
9 | # use comments `# isort: off` and `# isort: on` to disable/re-enable isort
10 | - repo: https://github.com/pycqa/isort
11 | rev: 5.13.2
12 | hooks:
13 | - id: isort
14 | args: [--profile=black]
15 |
16 | # this is slightly dangerous because python imports have side effects
17 | # and this tool removes unused imports, which may be providing
18 | # necessary side effects for the code to run
19 | - repo: https://github.com/PyCQA/autoflake
20 | rev: v2.3.1
21 | hooks:
22 | - id: autoflake
23 | args:
24 | - "--in-place"
25 | - "--expand-star-imports"
26 | - "--remove-duplicate-keys"
27 | - "--remove-unused-variables"
28 | - "--remove-all-unused-imports"
29 | exclude: "any_parser/__init__.py"
30 |
31 | # run all unittests
32 | - repo: local
33 | hooks:
34 | - id: unittests
35 | name: unittests
36 | entry: ./run_tests.sh
37 | language: script
38 | pass_filenames: false
39 | # Optional: Specify types of files that trigger this hook
40 | # types: [python]
41 | # Optional: Specify files or directories to exclude
42 | # exclude: '^docs/'
43 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 🌊 AnyParser
2 |
3 |
4 |
5 |
6 |
7 |
8 | **AnyParser** provides an API to accurately extract unstructured data (e.g., PDFs, images, charts) into a structured format.
9 |
10 | ## :seedling: Set up your AnyParser API key
11 |
12 | To get started, generate your API key from the [Sandbox Account Page](https://www.cambioml.com/account). Each account comes with **100 free pages**.
13 |
14 | > ⚠️ **Note:** The free API is limited to 10 pages/call.
15 |
16 | For more information or to inquire about larger usage plans, feel free to contact us at info@cambioml.com.
17 |
18 | To set up your API key (`CAMBIO_API_KEY`), follow these steps:
19 | 1. Create a `.env` file in the root directory of your project.
20 | 2. Add the following line to the `.env` file:
21 | ```
22 | CAMBIO_API_KEY=0cam************************
23 | ```
24 |
25 |
26 | ## :computer: Installation
27 | ### 1. Set Up a New Conda Environment and Install AnyParser
28 | First, create and activate a new Conda environment, then install AnyParser:
29 | ```bash
30 | conda create -n any-parse python=3.10 -y
31 | conda activate any-parse
32 | pip3 install any-parser
33 | ```
34 | ### 2. Create an AnyParser Instance Using Your API Key
35 | Use your API key to create an instance of AnyParser. Make sure you’ve set up your .env file to store your API key securely:
36 | ```python
37 | import os
38 | from dotenv import load_dotenv
39 | from any_parser import AnyParser
40 |
41 | # Load environment variables
42 | load_dotenv(override=True)
43 |
44 | # Get the API key from the environment
45 | example_apikey = os.getenv("CAMBIO_API_KEY")
46 |
47 | # Create an AnyParser instance
48 | ap = AnyParser(api_key=example_apikey)
49 | ```
50 |
51 | ### 3. Run Synchronous Extraction
52 | To extract data synchronously and receive immediate results:
53 | ```python
54 | # Extract content from the file and get the markdown output along with processing time
55 | markdown, total_time = ap.parse(file_path="./data/test.pdf")
56 | ```
57 |
58 | ### 4. Run Asynchronous Extraction
59 | For asynchronous extraction, send the file for processing and fetch results later:
60 | ```python
61 | # Send the file to begin asynchronous extraction
62 | file_id = ap.async_parse(file_path="./data/test.pdf")
63 |
64 | # Fetch the extracted content using the file ID
65 | markdown = ap.async_fetch(file_id=file_id)
66 | ```
67 |
68 | ### 5. Run Batch Extraction (Beta)
69 | For batch extraction, send the file to begin processing and fetch results later:
70 | ```python
71 | # Send the file to begin batch extraction
72 | response = ap.batches.create(file_path="./data/test.pdf")
73 | request_id = response.requestId
74 |
75 | # Fetch the extracted content using the request ID
76 | markdown = ap.batches.retrieve(request_id)
77 | ```
78 |
79 | Batch API for folder input:
80 | ```python
81 | # Send the folder to begin batch extraction
82 | WORKING_FOLDER = "./sample_data"
83 | # This will generate a jsonl with filename and requestID
84 | response = ap.batches.create(WORKING_FOLDER)
85 | ```
86 |
87 | Each response in the JSONL file contains:
88 | - The filename
89 | - A unique request ID
90 | - Additional processing metadata
91 |
92 | You can later use these request IDs to retrieve the extracted content for each file:
93 |
94 | ```python
95 | # Fetch the extracted content using the request ID from the jsonl file
96 | markdown = ap.batches.retrieve(request_id)
97 | ```
98 | For more details about the code implementation of the batch API, refer to
99 | [examples/parse_batch_upload.py](examples/parse_batch_upload.py) and [examples/parse_batch_fetch.py](examples/parse_batch_fetch.py)
100 |
101 | > ⚠️ **Note:** Batch extraction is currently in beta testing. Processing may take up to 12 hours to complete.
102 | >
103 | > ⚠️ **Important:** API keys generated from cambioml.com do not automatically have batch processing permissions. Please contact info@cambioml.com to request batch processing access for your API key.
104 |
105 | ## :scroll: Examples
106 | Check out these examples to see how you can utilize **AnyParser** to extract text, numbers, and symbols in fewer than 10 lines of code!
107 |
108 | ### [Extract all text and layout from PDF into Markdown Format](https://github.com/CambioML/any-parser/blob/rt-migration/examples/pdf_to_markdown.ipynb)
109 | Are you an AI engineer looking to **accurately** extract both the text and layout (e.g., table of contents or Markdown headers hierarchy) from a PDF? Check out this [3-minute notebook demo](https://github.com/CambioML/any-parser/blob/rt-migration/examples/pdf_to_markdown.ipynb).
110 |
111 | ### [Extract a Table from an Image into Markdown Format](https://github.com/CambioML/any-parser/blob/rt-migration/examples/image_to_markdown.ipynb)
112 | Are you a financial analyst needing to **accurately** extract numbers from a table within an image? Explore this [3-minute notebook example](https://github.com/CambioML/any-parser/blob/rt-migration/examples/image_to_markdown.ipynb).
113 |
--------------------------------------------------------------------------------
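
The batch workflow described in the README above returns request IDs immediately while processing can take hours, so results usually have to be polled. Below is a minimal polling sketch, assuming the interfaces shown in this repo (`batches.create` returning an object with `requestId`, `batches.retrieve` returning a status object with `requestStatus` and `result`); the file path, sleep interval, and the exact completed-status string are illustrative assumptions.

```python
import os
import time

from dotenv import load_dotenv

from any_parser import AnyParser

load_dotenv(override=True)
ap = AnyParser(api_key=os.getenv("CAMBIO_API_KEY"))

# Submit a single file for batch processing (illustrative path).
response = ap.batches.create(file_path="./data/test.pdf")
request_id = response.requestId

# Poll until the batch job finishes. Batch jobs can take hours, so a real
# script would use a much longer timeout or run retrieval separately.
while True:
    status = ap.batches.retrieve(request_id)
    # NOTE: the exact status value ("COMPLETED" here) is an assumption;
    # check the API documentation for the real set of status strings.
    if status.requestStatus == "COMPLETED":
        markdown = "\n".join(status.result or [])
        break
    time.sleep(60)  # wait a minute between status checks

print(markdown)
```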
/any_parser/__init__.py:
--------------------------------------------------------------------------------
1 | """AnyParser module for parsing data."""
2 |
3 | from any_parser.any_parser import AnyParser
4 |
5 | __all__ = ["AnyParser"]
6 |
7 | __version__ = "0.0.24"
8 |
--------------------------------------------------------------------------------
/any_parser/any_parser.py:
--------------------------------------------------------------------------------
1 | """AnyParser RT: Real-time parser for any data format."""
2 |
3 | import base64
4 | import json
5 | import time
6 | import uuid
7 | from collections.abc import Iterable
8 | from io import StringIO
9 | from pathlib import Path
10 |
11 | import requests
12 |
13 | from any_parser.async_parser import AsyncParser
14 | from any_parser.batch_parser import BatchParser
15 | from any_parser.constants import ProcessType
16 | from any_parser.sync_parser import (
17 | ExtractKeyValueSyncParser,
18 | ExtractPIISyncParser,
19 | ExtractResumeKeyValueSyncParser,
20 | ExtractTablesSyncParser,
21 | ParseSyncParser,
22 | )
23 | from any_parser.utils import validate_file_inputs
24 |
25 | PUBLIC_SHARED_BASE_URL = "https://public-api.cambioml.com"
26 | PUBLIC_BATCH_BASE_URL = "http://batch-api.cambioml.com"
27 | TIMEOUT = 180
28 |
29 |
30 | def handle_file_processing(func):
31 | """
32 | Decorator to handle common file processing logic for parsing
33 | and extraction operations.
34 |
35 | This decorator manages file input validation and processing, supporting
36 | either direct file content or file path inputs. It performs base64 encoding
37 | of file contents when a file path is provided.
38 |
39 | Args:
40 | func: The decorated function that performs the actual parsing or
41 | extraction.
42 |
43 | Parameters for decorated functions:
44 | file_path (str, optional): Path to the file to be processed. If
45 | provided, the file will be read and encoded in base64.
46 | file_content (str, optional): Base64-encoded content of the file. If
47 | provided, file_path will be ignored.
48 | file_type (str, optional): The file extension/type (e.g., 'pdf').
49 | If not provided and file_path is given, it will be inferred from
50 | the file extension.
51 | *args, **kwargs: Additional arguments passed to the decorated function.
52 |
53 | Returns:
54 | tuple: A tuple containing (error_message, result), where:
55 | - error_message (str): Error message if processing fails, empty
56 | string on success
57 | - result (str): Empty string if error occurs, otherwise the
58 | processed result from func
59 |
60 | Usage:
61 | @handle_file_processing
62 | def parse(self, file_path=None, file_content=None, file_type=None):
63 | # Implementation
64 | pass
65 |
66 | Note:
67 | Either file_path or file_content must be provided, but not both.
68 | If file_path is provided, the file content will be read and encoded in
69 | base64, and file_type will be inferred from the file extension.
70 | If file_content is provided, file_type will be validated, and a
71 | temporary file path will be generated for creating a presigned URL
72 | (for async parsing and extraction).
73 | """
74 |
75 | def wrapper(
76 | self,
77 | file_path=None,
78 | file_content=None,
79 | file_type=None,
80 | *args,
81 | **kwargs,
82 | ):
83 | # pylint: disable=too-many-arguments
84 | # Validate inputs
85 | is_valid, error_message = validate_file_inputs(
86 | file_path=file_path,
87 | file_content=file_content,
88 | file_type=file_type,
89 | )
90 |
91 | if not is_valid:
92 | return error_message, ""
93 |
94 | # Encode the file content in base64 if file_path is provided
95 | if file_path:
96 | try:
97 | with open(file_path, "rb") as file:
98 | file_content = base64.b64encode(file.read()).decode("utf-8")
99 | file_type = Path(file_path).suffix.lower().lstrip(".")
100 | except Exception as e:
101 | return f"Error: {e}", ""
102 | else:
103 | # generate a random file path for generating a presigned URL
104 | file_path = f"/tmp/{uuid.uuid4()}.{file_type}"
105 |
106 | return func(
107 | self,
108 | file_path=file_path,
109 | file_content=file_content,
110 | file_type=file_type,
111 | *args,
112 | **kwargs,
113 | )
114 |
115 | return wrapper
116 |
117 |
118 | class AnyParser:
119 | """Real-time parser for processing various data formats.
120 |
121 | Provides both synchronous and asynchronous methods for parsing and
122 | extracting information from different types of files.
123 | """
124 |
125 | def __init__(
126 | self,
127 | api_key: str,
128 | base_url: str = PUBLIC_SHARED_BASE_URL,
129 | batch_url: str = PUBLIC_BATCH_BASE_URL,
130 | ) -> None:
131 | """Initialize AnyParser with API credentials.
132 |
133 | Args:
134 | api_key: Authentication key for API access
135 | base_url: API endpoint URL, defaults to public endpoint
136 | """
137 | self._async_parser = AsyncParser(api_key, base_url)
138 | self._sync_parse = ParseSyncParser(api_key, base_url)
139 | self._sync_extract_key_value = ExtractKeyValueSyncParser(api_key, base_url)
140 | self._sync_extract_resume_key_value = ExtractResumeKeyValueSyncParser(
141 | api_key, base_url
142 | )
143 | self._sync_extract_pii = ExtractPIISyncParser(api_key, base_url)
144 | self._sync_extract_tables = ExtractTablesSyncParser(api_key, base_url)
145 | self.batches = BatchParser(api_key, batch_url)
146 |
147 | @handle_file_processing
148 | def parse(
149 | self,
150 | file_path=None,
151 | file_content=None,
152 | file_type=None,
153 | extract_args=None,
154 | ):
155 | """Extract full content from a file synchronously.
156 |
157 | Args:
158 | file_path: Path to input file
159 | file_content: Base64 encoded file content
160 | file_type: File format extension
161 | extract_args: Additional extraction parameters
162 |
163 | Returns:
164 | tuple: (result, timing_info) or (error_message, "")
165 | """
166 | return self._sync_parse.parse(
167 | file_path=file_path,
168 | file_content=file_content,
169 | file_type=file_type,
170 | extract_args=extract_args,
171 | )
172 |
173 | @handle_file_processing
174 | def extract_pii(
175 | self,
176 | file_path=None,
177 | file_content=None,
178 | file_type=None,
179 | ):
180 | """
181 | Extract PII data from a file synchronously.
182 | """
183 | return self._sync_extract_pii.extract(
184 | file_path=file_path,
185 | file_content=file_content,
186 | file_type=file_type,
187 | )
188 |
189 | @staticmethod
190 | def flatten_to_string(item):
191 | """
192 | Flatten any iterable object to a string.
193 | """
194 |
195 | if isinstance(item, str):
196 | return item
197 |
198 | # if item is a dict, flatten all keys and values
199 | if isinstance(item, dict):
200 | parts = []
201 | for k, v in item.items():
202 | parts.append(AnyParser.flatten_to_string(k))
203 | parts.append(AnyParser.flatten_to_string(v))
204 | return "".join(parts)
205 |
206 | # item is other iterable objects
207 | if isinstance(item, Iterable):
208 | parts = []
209 | for sub_item in item:
210 | parts.append(AnyParser.flatten_to_string(sub_item))
211 | return "".join(parts)
212 |
213 | # item is not iterable objects
214 | return str(item)
215 |
216 | @handle_file_processing
217 | def extract_tables(
218 | self,
219 | file_path=None,
220 | file_content=None,
221 | file_type=None,
222 | return_type="html",
223 | ):
224 | """Extract tables from a file in real-time.
225 |
226 | Args:
227 | file_path (str): The path to the file to be parsed.
228 | return_type (str): 'html' or 'csv'
229 | Returns:
230 | tuple(str, str)
231 | """
232 | extracted_html, time_elapsed = self._sync_extract_tables.extract(
233 | file_path=file_path,
234 | file_content=file_content,
235 | file_type=file_type,
236 | )
237 |
238 | if isinstance(extracted_html, list):
239 | extracted_html = AnyParser.flatten_to_string(extracted_html)
240 |
241 | if return_type.lower() == "csv":
242 | try:
243 | import pandas as pd
244 | except ImportError:
245 | raise ImportError("Please install pandas to use CSV return_type")
246 |
247 | if isinstance(extracted_html, list):
248 | extracted_html = "".join(str(item) for item in extracted_html)
249 |
250 | df_list = pd.read_html(StringIO(extracted_html))
251 | combined_df = pd.concat(df_list, ignore_index=True)
252 | csv_output = combined_df.to_csv(index=False)
253 |
254 | return csv_output, time_elapsed
255 |
256 | return extracted_html, time_elapsed
257 |
258 | @handle_file_processing
259 | def extract_key_value(
260 | self,
261 | file_path=None,
262 | file_content=None,
263 | file_type=None,
264 | extract_instruction=None,
265 | ):
266 | """Extract key-value pairs from a file in real-time.
267 |
268 | Args:
269 | file_path (str): The path to the file to be parsed.
270 | extract_instruction (Dict): A dictionary containing the keys to be
271 | extracted, with their values as the description of those keys.
272 | Returns:
273 | tuple(str, str): The extracted data and the time taken.
274 | """
275 | return self._sync_extract_key_value.extract(
276 | file_path=file_path,
277 | file_content=file_content,
278 | file_type=file_type,
279 | extract_args={"extract_instruction": extract_instruction},
280 | )
281 |
282 | @handle_file_processing
283 | def extract_resume_key_value(
284 | self, file_path=None, file_content=None, file_type=None
285 | ):
286 | """Extract resume in real-time.
287 |
288 | Args:
289 | file_path (str): The path to the file to be parsed.
290 | Returns:
291 | tuple(str, str): The extracted data and the time taken.
292 | extracted data includes:
293 | - "education": Education
294 | - "work_experience": Work Experience
295 | - "personal_info": Personal Information
296 | - "skills": Skills
297 | - "certifications": Certifications
298 | - "projects": Projects
299 | - "pii": Personally Identifiable Information - includes
300 | only name, email, and phone
301 | """
302 | return self._sync_extract_resume_key_value.extract(
303 | file_path=file_path,
304 | file_content=file_content,
305 | file_type=file_type,
306 | )
307 |
308 | # Example of decorated methods:
309 | @handle_file_processing
310 | def async_parse(
311 | self,
312 | file_path=None,
313 | file_content=None,
314 | file_type=None,
315 | extract_args=None,
316 | ):
317 | """Extract full content from a file asynchronously."""
318 | return self._async_parser.send_async_request(
319 | process_type=ProcessType.PARSE,
320 | file_path=file_path, # type: ignore
321 | file_content=file_content, # type: ignore
322 | extract_args=extract_args,
323 | )
324 |
325 | @handle_file_processing
326 | def async_parse_with_layout(
327 | self, file_path=None, file_content=None, file_type=None
328 | ):
329 | """Extract content from a file asynchronously with layout analysis."""
330 | return self._async_parser.send_async_request(
331 | process_type=ProcessType.PARSE_WITH_LAYOUT,
332 | file_path=file_path, # type: ignore
333 | file_content=file_content, # type: ignore
334 | )
335 |
336 | @handle_file_processing
337 | def async_parse_with_ocr(self, file_path=None, file_content=None, file_type=None):
338 | """Extract full content from a file asynchronously with OCR."""
339 | return self._async_parser.send_async_request(
340 | process_type=ProcessType.PARSE_WITH_OCR,
341 | file_path=file_path, # type: ignore
342 | file_content=file_content, # type: ignore
343 | )
344 |
345 | @handle_file_processing
346 | def async_extract_pii(
347 | self,
348 | file_path=None,
349 | file_content=None,
350 | file_type=None,
351 | extract_args=None,
352 | ):
353 | """Extract PII from a file asynchronously."""
354 | return self._async_parser.send_async_request(
355 | process_type=ProcessType.EXTRACT_PII,
356 | file_path=file_path, # type: ignore
357 | file_content=file_content, # type: ignore
358 | extract_args=extract_args,
359 | )
360 |
361 | @handle_file_processing
362 | def async_extract_tables(self, file_path=None, file_content=None, file_type=None):
363 | """Extract tables from a file asynchronously."""
364 | return self._async_parser.send_async_request(
365 | process_type=ProcessType.EXTRACT_TABLES,
366 | file_path=file_path, # type: ignore
367 | file_content=file_content, # type: ignore
368 | )
369 |
370 | @handle_file_processing
371 | def async_extract_key_value(
372 | self,
373 | file_path=None,
374 | file_content=None,
375 | file_type=None,
376 | extract_instruction=None,
377 | ):
378 | """Extract key-value pairs from a file asynchronously."""
379 | return self._async_parser.send_async_request(
380 | process_type=ProcessType.EXTRACT_KEY_VALUE,
381 | file_path=file_path, # type: ignore
382 | file_content=file_content, # type: ignore
383 | extract_args={"extract_instruction": extract_instruction},
384 | )
385 |
386 | @handle_file_processing
387 | def async_extract_resume_key_value(
388 | self, file_path=None, file_content=None, file_type=None
389 | ):
390 | """Extract resume key-value pairs from a file asynchronously."""
391 | return self._async_parser.send_async_request(
392 | process_type=ProcessType.EXTRACT_RESUME_KEY_VALUE,
393 | file_path=file_path, # type: ignore
394 | file_content=file_content, # type: ignore
395 | extract_args=None,
396 | )
397 |
398 | def async_fetch(
399 | self,
400 | file_id: str,
401 | sync: bool = True,
402 | sync_timeout: int = 180,
403 | sync_interval: int = 5,
404 | ) -> str:
405 | """Fetches extraction results asynchronously.
406 |
407 | Args:
408 | file_id (str): The ID of the file to fetch results for.
409 | sync (bool, optional): Whether to wait for the results
410 | synchronously.
411 | sync_timeout (int, optional): Maximum time to wait for results in
412 | seconds. Defaults to 180.
413 | sync_interval (int, optional): Time interval between polling
414 | attempts in seconds. Defaults to 5.
415 |
416 | Returns:
417 | str: The extracted results as a markdown string.
418 | None: If the extraction is still in progress (when sync is False).
419 | """
420 |
421 | response = None
422 | # Create the JSON payload
423 | payload = {"file_id": file_id}
424 | if sync:
425 | start_time = time.time()
426 | while time.time() < start_time + sync_timeout:
427 | response = requests.post(
428 | self._async_parser._async_fetch_url,
429 | headers=self._async_parser._headers,
430 | data=json.dumps(payload),
431 | timeout=TIMEOUT,
432 | )
433 | if response.status_code == 202:
434 | print("Waiting for response...")
435 | time.sleep(sync_interval)
436 | continue
437 | break
438 | else:
439 | response = requests.post(
440 | self._async_parser._async_fetch_url,
441 | headers=self._async_parser._headers,
442 | data=json.dumps(payload),
443 | timeout=TIMEOUT,
444 | )
445 |
446 | return self._async_parser.handle_async_response(response)
447 |
--------------------------------------------------------------------------------
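
A short usage sketch for the synchronous methods defined in any_parser/any_parser.py above, assuming a valid `CAMBIO_API_KEY` in the environment; the file paths and the instruction dictionary are illustrative.

```python
import os

from any_parser import AnyParser

ap = AnyParser(api_key=os.environ["CAMBIO_API_KEY"])

# Full-content parse: returns (markdown, timing info) on success,
# or (error message, "") if validation or the request fails.
markdown, timing = ap.parse(file_path="./sample_data/test_invoice.pdf")

# Key-value extraction: keys to extract, each with a description of its value.
instruction = {"invoice_number": "The unique number identifying this invoice"}
result, timing = ap.extract_key_value(
    file_path="./sample_data/test_invoice.pdf",
    extract_instruction=instruction,
)

# Table extraction returned as CSV (needs pandas for the HTML -> CSV step).
csv_text, timing = ap.extract_tables(
    file_path="./sample_data/test_1figure_1table.png",
    return_type="csv",
)
```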
/any_parser/async_parser.py:
--------------------------------------------------------------------------------
1 | """Asynchronous parser implementation."""
2 |
3 | import json
4 | from pathlib import Path
5 | from typing import Dict, Optional
6 |
7 | import requests
8 |
9 | from any_parser.base_parser import BaseParser
10 | from any_parser.constants import ProcessType
11 | from any_parser.utils import upload_file_to_presigned_url
12 |
13 | TIMEOUT = 60
14 |
15 |
16 | class BasePostProcessor:
17 | def __init__(self, successor=None) -> None:
18 | self.successor = successor
19 |
20 | def process(self, json_response: Dict) -> str:
21 | if self.successor:
22 | return self.successor.process(json_response)
23 | return f"Error: Invalid JSON response: {json_response}"
24 |
25 |
26 | class ParsePostProcessor(BasePostProcessor):
27 | def process(self, json_response: Dict) -> str:
28 | if "markdown" in json_response:
29 | return json_response["markdown"]
30 | if "result" in json_response:
31 | return json_response["result"]
32 | return super().process(json_response)
33 |
34 |
35 | class KeyValuePostProcessor(BasePostProcessor):
36 | def process(self, json_response: Dict) -> str:
37 | if "json" in json_response:
38 | return json_response["json"]
39 | return super().process(json_response)
40 |
41 |
42 | class ExtractPIIPostProcessor(BasePostProcessor):
43 | def process(self, json_response: Dict) -> str:
44 | if "pii_extraction" in json_response:
45 | return json_response["pii_extraction"]
46 | return super().process(json_response)
47 |
48 |
49 | class ExtractResumeKeyValuePostProcessor(BasePostProcessor):
50 |
51 | def process(self, json_response: Dict) -> str:
52 | if "resume_extraction" in json_response:
53 | return json_response["resume_extraction"]
54 | return super().process(json_response)
55 |
56 |
57 | class AsyncParser(BaseParser):
58 | def __init__(self, api_key: str, base_url: str) -> None:
59 | super().__init__(api_key, base_url)
60 | self._async_upload_url = f"{self._base_url}/async/upload"
61 | self._async_fetch_url = f"{self._base_url}/async/fetch"
62 |
63 | def send_async_request(
64 | self,
65 | process_type: ProcessType,
66 | file_path: str,
67 | file_content: str,
68 | extract_args: Optional[Dict] = None,
69 | ) -> str:
70 | """Extract full content from a file asynchronously.
71 |
72 | Args:
73 | process_type (ProcessType): The type of processing to be done.
74 | file_path (str): The path to the file to be parsed.
75 | file_content (str): The content of the file to be parsed.
76 | extract_args (Optional[Dict]): Additional extraction arguments.
77 |
78 | Returns:
79 | str: The file id of the uploaded file.
80 | """
81 |
82 | file_name = Path(file_path).name
83 |
84 | # Create the JSON payload
85 | payload = {
86 | "file_name": file_name,
87 | "process_type": process_type.value,
88 | }
89 |
90 | if extract_args is not None and isinstance(extract_args, dict):
91 | payload["extract_args"] = extract_args # type: ignore
92 |
93 | # Send the POST request
94 | response = requests.post(
95 | self._async_upload_url,
96 | headers=self._headers,
97 | data=json.dumps(payload),
98 | timeout=TIMEOUT,
99 | )
100 |
101 | # If response successful, upload the file
102 | return upload_file_to_presigned_url(file_content, response)
103 |
104 | def handle_async_response(self, response) -> str:
105 | if response is None:
106 | return "Error: timeout, no response received"
107 | if response.status_code == 202:
108 | return ""
109 | if response.status_code == 200:
110 | extract_resume_processor = ExtractResumeKeyValuePostProcessor()
111 | key_value_processor = KeyValuePostProcessor(extract_resume_processor)
112 | extract_pii_processor = ExtractPIIPostProcessor(key_value_processor)
113 | handler = ParsePostProcessor(extract_pii_processor)
114 | try:
115 | return handler.process(response.json())
116 | except json.JSONDecodeError:
117 | return f"Error: Invalid JSON response: {response.text}"
118 |
119 | return f"Error: {response.status_code} {response.text}"
120 |
--------------------------------------------------------------------------------
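
The post-processors in any_parser/async_parser.py form a small chain of responsibility: each handler returns its own field if present and otherwise delegates to its successor. A minimal sketch of the same chain that `handle_async_response` builds, using made-up response dictionaries purely for illustration.

```python
from any_parser.async_parser import (
    ExtractPIIPostProcessor,
    ExtractResumeKeyValuePostProcessor,
    KeyValuePostProcessor,
    ParsePostProcessor,
)

# Same chain order as handle_async_response:
# parse -> pii -> key-value -> resume key-value.
chain = ParsePostProcessor(
    ExtractPIIPostProcessor(
        KeyValuePostProcessor(ExtractResumeKeyValuePostProcessor())
    )
)

# Each fake response is resolved by the handler that recognizes its field.
print(chain.process({"markdown": "# Parsed document"}))         # ParsePostProcessor
print(chain.process({"pii_extraction": {"name": "Jane Doe"}}))  # ExtractPIIPostProcessor
print(chain.process({"json": {"total": "42"}}))                 # KeyValuePostProcessor
print(chain.process({"unexpected": 1}))                         # falls through to an error string
```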
/any_parser/base_parser.py:
--------------------------------------------------------------------------------
1 | """Base parser implementation."""
2 |
3 |
4 | class BaseParser:
5 | def __init__(self, api_key: str, base_url: str) -> None:
6 | self._api_key = api_key
7 | self._base_url = base_url
8 | self._headers = {
9 | "Content-Type": "application/json",
10 | "x-api-key": self._api_key,
11 | }
12 |
--------------------------------------------------------------------------------
/any_parser/batch_parser.py:
--------------------------------------------------------------------------------
1 | """Batch parser implementation."""
2 |
3 | import logging
4 | import os
5 | from concurrent.futures import ThreadPoolExecutor, as_completed
6 | from pathlib import Path
7 | from typing import List, Optional, Union
8 |
9 | import requests
10 | from pydantic import BaseModel, Field
11 |
12 | from any_parser.base_parser import BaseParser
13 |
14 | TIMEOUT = 60
15 | MAX_WORKERS = 10
16 |
17 | logger = logging.getLogger(__name__)
18 |
19 |
20 | class UploadResponse(BaseModel):
21 | """
22 | Response from the batch upload endpoint.
23 | """
24 |
25 | fileName: str
26 | requestId: str
27 | requestStatus: str
28 |
29 |
30 | class UsageResponse(BaseModel):
31 | """
32 | Response from the batch usage endpoint.
33 | """
34 |
35 | pageLimit: int
36 | pageRemaining: int
37 |
38 |
39 | class FileStatusResponse(BaseModel):
40 | """
41 | Response from the batch file status endpoint.
42 | """
43 |
44 | fileName: str
45 | fileType: str
46 | requestId: str
47 | requestStatus: str
48 | uploadTime: str
49 | completionTime: Optional[str] = None
50 | result: Optional[List[str]] = Field(default_factory=list)
51 | error: Optional[List[str]] = Field(default_factory=list)
52 |
53 |
54 | class BatchParser(BaseParser):
55 | def __init__(self, api_key: str, base_url: str) -> None:
56 | super().__init__(api_key, base_url)
57 | self._file_upload_url = f"{self._base_url}/files/"
58 | self._processing_status_url = f"{self._base_url}/files/" + "{request_id}"
59 | self._usage_url = f"{self._base_url}/users/current/usage"
60 |
61 | # remove "Content-Type" from headers
62 | self._headers.pop("Content-Type")
63 |
64 | def create(self, file_path: str) -> Union[UploadResponse, List[UploadResponse]]:
65 | """Upload a single file or folder for batch processing.
66 |
67 | Args:
68 | file_path: Path to the file or folder to upload
69 |
70 | Returns:
71 | If file: Single UploadResponse object containing upload details
72 | If folder: List of UploadResponse objects for each file
73 | """
74 | path = Path(file_path)
75 | if path.is_file():
76 | return self._upload_single_file(path)
77 | elif path.is_dir():
78 | return self._upload_folder(path)
79 | else:
80 | raise ValueError(f"Path {file_path} does not exist")
81 |
82 | def _upload_single_file(self, file_path: Path) -> UploadResponse:
83 | """Upload a single file for batch processing."""
84 | if not os.path.isfile(file_path):
85 | raise FileNotFoundError(f"The file path '{file_path}' does not exist.")
86 |
87 | with open(file_path, "rb") as f:
88 | files = {"file": f}
89 | response = requests.post(
90 | self._file_upload_url,
91 | headers=self._headers,
92 | files=files,
93 | timeout=TIMEOUT,
94 | )
95 |
96 | if response.status_code != 200:
97 | raise Exception(f"Upload failed: {response.text}")
98 |
99 | data = response.json()
100 | return UploadResponse(
101 | fileName=data["fileName"],
102 | requestId=data["requestId"],
103 | requestStatus=data["requestStatus"],
104 | )
105 |
106 | def _upload_folder(self, folder_path: Path) -> List[UploadResponse]:
107 | """Upload all files in a folder for batch processing.
108 |
109 | Args:
110 | folder_path: Path to the folder containing files to upload
111 |
112 | Returns:
113 | List of UploadResponse objects for each uploaded file
114 | """
115 | # Get all files in folder and subfolders
116 | files = []
117 | for root, _, filenames in os.walk(folder_path):
118 | for filename in filenames:
119 | files.append(Path(root) / filename)
120 |
121 | # Upload files concurrently using thread pool
122 | responses = []
123 | with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
124 | future_to_file = {
125 | executor.submit(self._upload_single_file, file_path): file_path
126 | for file_path in files
127 | }
128 |
129 | for future in as_completed(future_to_file):
130 | file_path = future_to_file[future]
131 | try:
132 | response = future.result()
133 | responses.append(response)
134 | except Exception as e:
135 | logger.error(f"Failed to upload {file_path}: {str(e)}")
136 |
137 | return responses
138 |
139 | def retrieve(self, request_id: str) -> FileStatusResponse:
140 | """Get the processing status of a file.
141 |
142 | Args:
143 | request_id: The ID of the file processing request
144 |
145 | Returns:
146 | FileStatusResponse object containing status details
147 | """
148 | response = requests.get(
149 | self._processing_status_url.format(request_id=request_id),
150 | headers=self._headers,
151 | timeout=TIMEOUT,
152 | )
153 |
154 | if response.status_code != 200:
155 | raise Exception(f"Status check failed: {response.text}")
156 |
157 | data = response.json()
158 | return FileStatusResponse(**data)
159 |
160 | def get_usage(self) -> UsageResponse:
161 | """Get current usage information.
162 |
163 | Returns:
164 | UsageResponse object containing usage details
165 | """
166 | response = requests.get(
167 | self._usage_url,
168 | headers=self._headers,
169 | timeout=TIMEOUT,
170 | )
171 |
172 | if response.status_code != 200:
173 | raise Exception(f"Usage check failed: {response.text}")
174 |
175 | data = response.json()
176 | return UsageResponse(
177 | pageLimit=data["pageLimit"], pageRemaining=data["pageRemaining"]
178 | )
179 |
--------------------------------------------------------------------------------
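
A minimal sketch of the folder-upload flow described in the README and implemented by `BatchParser.create` above, writing one JSON line per uploaded file so the request IDs can be retrieved later; the folder path and output filename are illustrative.

```python
import json
import os

from any_parser import AnyParser

ap = AnyParser(api_key=os.environ["CAMBIO_API_KEY"])

# Uploading a folder returns one UploadResponse per file.
responses = ap.batches.create("./sample_data")

# Persist filename / request ID pairs so results can be fetched later.
with open("batch_requests.jsonl", "w", encoding="utf-8") as f:
    for r in responses:
        record = {
            "fileName": r.fileName,
            "requestId": r.requestId,
            "requestStatus": r.requestStatus,
        }
        f.write(json.dumps(record) + "\n")

# Later: read the JSONL back and check the status of each request.
with open("batch_requests.jsonl", encoding="utf-8") as f:
    for line in f:
        entry = json.loads(line)
        status = ap.batches.retrieve(entry["requestId"])
        print(entry["fileName"], status.requestStatus)
```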
/any_parser/constants.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 |
3 |
4 | class ProcessType(Enum):
5 | EXTRACT_PII = "extract_pii"
6 | EXTRACT_TABLES = "extract_tables"
7 | EXTRACT_KEY_VALUE = "extract_key_value"
8 | EXTRACT_RESUME_KEY_VALUE = "extract_resume_key_value"
9 | PARSE = "parse"
10 | PARSE_WITH_OCR = "parse_with_ocr"
11 | PARSE_WITH_LAYOUT = "parse_with_layout"
12 |
--------------------------------------------------------------------------------
/any_parser/sync_parser.py:
--------------------------------------------------------------------------------
1 | """Synchronous parser implementation."""
2 |
3 | import json
4 | import time
5 | from typing import Any, Dict, Optional, Tuple
6 |
7 | import requests
8 |
9 | from any_parser.base_parser import BaseParser
10 |
11 | TIMEOUT = 60
12 |
13 |
14 | class BaseSyncParser(BaseParser):
15 |
16 | def get_sync_response(
17 | self,
18 | url_endpoint: str,
19 | file_content: str,
20 | file_type: str,
21 | extract_args: Optional[Dict[str, Any]] = None,
22 | ) -> Tuple[Optional[requests.Response], str]:
23 | payload = {
24 | "file_content": file_content,
25 | "file_type": file_type,
26 | }
27 | if extract_args:
28 | payload["extract_args"] = extract_args # type: ignore
29 |
30 | start_time = time.time()
31 | response = requests.post(
32 | url_endpoint,
33 | headers=self._headers,
34 | data=json.dumps(payload),
35 | timeout=TIMEOUT,
36 | )
37 | end_time = time.time()
38 |
39 | if response.status_code != 200:
40 | return None, f"Error: {response.status_code} {response.text}"
41 |
42 | return response, f"{end_time - start_time:.2f} seconds"
43 |
44 | def parse(
45 | self,
46 | file_path=None,
47 | file_content=None,
48 | file_type=None,
49 | extract_args=None,
50 | ):
51 | """Converts the given file to markdown."""
52 | raise NotImplementedError
53 |
54 | def extract(
55 | self,
56 | file_path=None,
57 | file_content=None,
58 | file_type=None,
59 | extract_args=None,
60 | ):
61 | """Extracts information from the given file."""
62 | raise NotImplementedError
63 |
64 |
65 | class ParseSyncParser(BaseSyncParser):
66 | """Parse parser implementation."""
67 |
68 | def parse(
69 | self,
70 | file_path=None,
71 | file_content=None,
72 | file_type=None,
73 | extract_args=None,
74 | ):
75 | response, info = self.get_sync_response(
76 | f"{self._base_url}/parse",
77 | file_content=file_content, # type: ignore
78 | file_type=file_type, # type: ignore
79 | extract_args=extract_args,
80 | )
81 |
82 | if response is None:
83 | return info, ""
84 |
85 | try:
86 | response_data = response.json()
87 | result = response_data["markdown"]
88 | return result, f"Time Elapsed: {info}"
89 | except json.JSONDecodeError:
90 | return f"Error: Invalid JSON response: {response.text}", ""
91 |
92 |
93 | class ExtractPIISyncParser(BaseSyncParser):
94 | """Extract PII parser implementation."""
95 |
96 | def extract(
97 | self,
98 | file_path=None,
99 | file_content=None,
100 | file_type=None,
101 | extract_args=None,
102 | ):
103 | response, info = self.get_sync_response(
104 | f"{self._base_url}/extract_pii",
105 | file_content=file_content, # type: ignore
106 | file_type=file_type, # type: ignore
107 | extract_args=None,
108 | )
109 |
110 | if response is None:
111 | return info, ""
112 |
113 | try:
114 | response_data = response.json()
115 | result = response_data["pii_extraction"]
116 | return result, f"Time Elapsed: {info}"
117 | except json.JSONDecodeError:
118 | return f"Error: Invalid JSON response: {response.text}", ""
119 |
120 |
121 | class ExtractTablesSyncParser(BaseSyncParser):
122 | """Extract tables parser implementation."""
123 |
124 | def extract(
125 | self,
126 | file_path=None,
127 | file_content=None,
128 | file_type=None,
129 | extract_args=None,
130 | ):
131 | response, info = self.get_sync_response(
132 | f"{self._base_url}/extract_tables",
133 | file_content=file_content, # type: ignore
134 | file_type=file_type, # type: ignore
135 | extract_args=None,
136 | )
137 |
138 | if response is None:
139 | return info, ""
140 |
141 | try:
142 | response_data = response.json()
143 | result = response_data["markdown"]
144 | return result, f"Time Elapsed: {info}"
145 | except json.JSONDecodeError:
146 | return f"Error: Invalid JSON response: {response.text}", ""
147 |
148 |
149 | class ExtractKeyValueSyncParser(BaseSyncParser):
150 | """Extract key-value parser implementation."""
151 |
152 | def extract(
153 | self,
154 | file_path=None,
155 | file_content=None,
156 | file_type=None,
157 | extract_args=None,
158 | ):
159 | response, info = self.get_sync_response(
160 | f"{self._base_url}/extract_key_value",
161 | file_content=file_content, # type: ignore
162 | file_type=file_type, # type: ignore
163 | extract_args={"extract_instruction": extract_args},
164 | )
165 |
166 | if response is None:
167 | return info, ""
168 |
169 | try:
170 | response_data = response.json()
171 | result = response_data["json"]
172 | return result, f"Time Elapsed: {info}"
173 | except json.JSONDecodeError:
174 | return f"Error: Invalid JSON response: {response.text}", ""
175 |
176 |
177 | class ExtractResumeKeyValueSyncParser(BaseSyncParser):
178 | """Extract resume key-value parser implementation."""
179 |
180 | def extract(
181 | self,
182 | file_path=None,
183 | file_content=None,
184 | file_type=None,
185 | extract_args=None,
186 | ):
187 | response, info = self.get_sync_response(
188 | f"{self._base_url}/extract_resume_key_value",
189 | file_content=file_content, # type: ignore
190 | file_type=file_type, # type: ignore
191 | extract_args=None,
192 | )
193 |
194 | if response is None:
195 | return info, ""
196 |
197 | try:
198 | response_data = response.json()
199 | result = response_data["extraction_result"]
200 | return result, f"Time Elapsed: {info}"
201 | except json.JSONDecodeError:
202 | return f"Error: Invalid JSON response: {response.text}", ""
203 |
--------------------------------------------------------------------------------
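
The sync parsers above all post a JSON payload of base64 `file_content` plus `file_type` to their endpoint, which means the top-level API can also be called with pre-encoded content instead of a path. A small sketch under that assumption, using the `AnyParser.parse` signature shown earlier in this dump; the sample file is illustrative.

```python
import base64
import os

from any_parser import AnyParser

ap = AnyParser(api_key=os.environ["CAMBIO_API_KEY"])

# When passing raw content instead of a path, file_type is required
# because it cannot be inferred from a file extension.
with open("./sample_data/test_odf.docx", "rb") as f:
    encoded = base64.b64encode(f.read()).decode("utf-8")

markdown, timing = ap.parse(file_content=encoded, file_type="docx")
print(timing)
print(markdown)
```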
/any_parser/utils.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import io
3 | import json
4 | from enum import Enum
5 | from pathlib import Path
6 | from typing import Optional, Tuple
7 |
8 | import requests
9 |
10 | SUPPORTED_FILE_EXTENSIONS = [
11 | "pdf",
12 | "doc",
13 | "docx",
14 | "ppt",
15 | "pptx",
16 | "jpg",
17 | "jpeg",
18 | "png",
19 | "gif",
20 | ]
21 |
22 |
23 | class ValidationError(Enum):
24 | MISSING_INPUTS = "Either file_content or file_path must be provided"
25 | MISSING_FILE_TYPE = "file_type must be provided when using file_content"
26 | NOT_FOUND = "File does not exist: {}"
27 | UNSUPPORTED_FILE_TYPE = "Unsupported file type: {}. Supported file types: {}"
28 | FILE_EMPTY = "File is empty: {}"
29 | FILE_TOO_LARGE = "File size exceeds maximum limit of {} MB: {}"
30 | OTHER = "{}"
31 |
32 |
33 | def validate_file_inputs(
34 | file_path: Optional[str],
35 | file_content: Optional[str],
36 | file_type: Optional[str],
37 | ) -> Tuple[bool, str]:
38 | """Validate inputs for the parser or extractor.
39 |
40 | Args:
41 | file_content (Optional[str]): Base64 encoded file content
42 | file_path (Optional[str]): Path to the file
43 | file_type (Optional[str]): File extension/type
44 |
45 | Returns:
46 | Tuple[bool, str]: (is_valid, error_message)
47 | - is_valid: True if validation passes, False otherwise
48 | - error_message: "" if validation passes, error if validation fails
49 | """
50 | # Check if at least one input method is provided
51 | if file_content is None and file_path is None:
52 | return False, ValidationError.MISSING_INPUTS.value
53 |
54 | # Validate file_content path
55 | if file_content is not None and file_type is None:
56 | return False, ValidationError.MISSING_FILE_TYPE.value
57 |
58 | # Validate file path if provided
59 | if file_path is not None:
60 | path = Path(file_path)
61 |
62 | # Check if file exists
63 | if not path.is_file():
64 | return False, ValidationError.NOT_FOUND.value.format(file_path)
65 |
66 | # Check if file is empty
67 | if path.stat().st_size == 0:
68 | return False, ValidationError.FILE_EMPTY.value.format(file_path)
69 |
70 | # If file_type not provided, extract it from file_path
71 | if file_type is None:
72 | file_type = path.suffix.lower().lstrip(".")
73 |
74 | # Validate file type
75 | if file_type not in SUPPORTED_FILE_EXTENSIONS:
76 | supported_types = ", ".join(sorted(SUPPORTED_FILE_EXTENSIONS))
77 | return False, ValidationError.UNSUPPORTED_FILE_TYPE.value.format(
78 | file_type, supported_types
79 | )
80 |
81 | return True, ""
82 |
83 |
84 | def upload_file_to_presigned_url(
85 | file_content: str, response: requests.Response, timeout: int = 10
86 | ) -> str:
87 | if response.status_code == 200:
88 | try:
89 | file_id = response.json().get("fileId")
90 | presigned_url = response.json().get("presignedUrl")
91 |
92 | # Decode base64 content
93 | decoded_content = base64.b64decode(file_content)
94 |
95 | # Create file-like object from decoded content
96 | files = {"file": ("file", io.BytesIO(decoded_content))}
97 |
98 | upload_resp = requests.post(
99 | presigned_url["url"],
100 | data=presigned_url["fields"],
101 | files=files,
102 | timeout=timeout,
103 | )
104 | if upload_resp.status_code != 204:
105 | return f"Error: {upload_resp.status_code} {upload_resp.text}"
106 | return file_id
107 | except json.JSONDecodeError:
108 | return "Error: Invalid JSON response"
109 | else:
110 | return f"Error: {response.status_code} {response.text}"
111 |
--------------------------------------------------------------------------------
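
`validate_file_inputs` above is the same check that the decorator in any_parser.py runs before any request is sent, so it can also be used to pre-screen files cheaply. A minimal sketch; the paths are illustrative.

```python
from any_parser.utils import validate_file_inputs

candidates = [
    "./sample_data/test_invoice.pdf",  # supported type, should pass if present
    "./sample_data/notes.txt",         # illustrative; fails validation (missing or unsupported)
]

for path in candidates:
    is_valid, error = validate_file_inputs(
        file_path=path, file_content=None, file_type=None
    )
    if is_valid:
        print(f"OK to submit: {path}")
    else:
        print(f"Skipping {path}: {error}")
```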
/examples/async_extract_pii.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
10 | "# !pip3 install --upgrade ipython\n",
11 | "# !pip3 install --upgrade any-parser"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from any_parser import AnyParser"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "ap = AnyParser(api_key=\"...\")"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "file_path = \"./sample_data/resume_1.pdf\"\n",
39 | "file_id = ap.async_extract_pii(file_path)"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 6,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "pii_info = ap.async_fetch(file_id=file_id)"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 7,
54 | "metadata": {},
55 | "outputs": [
56 | {
57 | "data": {
58 | "text/plain": [
59 | "{'name': 'Gary Jiang',\n",
60 | " 'phone_number': '+1-213-725-7637',\n",
61 | " 'address': None,\n",
62 | " 'email_address': 'jiangzhehuan0105@gmail.com',\n",
63 | " 'linkedin_url': 'https://linkedin.com/in/gary-jiang',\n",
64 | " 'github_url': None,\n",
65 | " 'summary': 'Full-stack Software Engineer'}"
66 | ]
67 | },
68 | "execution_count": 7,
69 | "metadata": {},
70 | "output_type": "execute_result"
71 | }
72 | ],
73 | "source": [
74 | "pii_info"
75 | ]
76 | }
77 | ],
78 | "metadata": {
79 | "kernelspec": {
80 | "display_name": "any",
81 | "language": "python",
82 | "name": "python3"
83 | },
84 | "language_info": {
85 | "codemirror_mode": {
86 | "name": "ipython",
87 | "version": 3
88 | },
89 | "file_extension": ".py",
90 | "mimetype": "text/x-python",
91 | "name": "python",
92 | "nbconvert_exporter": "python",
93 | "pygments_lexer": "ipython3",
94 | "version": "-1.-1.-1"
95 | }
96 | },
97 | "nbformat": 4,
98 | "nbformat_minor": 2
99 | }
100 |
--------------------------------------------------------------------------------
/examples/async_extract_resume_key_value.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
10 | "# !pip3 install --upgrade ipython\n",
11 | "# !pip3 install --upgrade any-parser"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from IPython.display import display\n",
21 | "from any_parser import AnyParser"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "ap = AnyParser(api_key=\"...\")"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "file_path = \"./sample_data/resume_1.pdf\"\n",
40 | "file_id = ap.async_extract_resume_key_value(file_path)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 4,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "name": "stdout",
50 | "output_type": "stream",
51 | "text": [
52 | "Waiting for response...\n",
53 | "Waiting for response...\n",
54 | "Waiting for response...\n"
55 | ]
56 | }
57 | ],
58 | "source": [
59 | "json_result = ap.async_fetch(file_id=file_id)"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 5,
65 | "metadata": {},
66 | "outputs": [
67 | {
68 | "data": {
69 | "text/plain": [
70 | "{'pii': {'full_name': 'GARY JIANG',\n",
71 | " 'email': 'jiangzhehuan0105@gmail.com',\n",
72 | " 'phone': '+1 (213) 725-7637'},\n",
73 | " 'education': [{'organization': 'Shenyang University of Technology',\n",
74 | " 'degree': \"Bachelor's Degree\",\n",
75 | " 'major': 'Computer Science',\n",
76 | " 'start_date': '2008-01-01',\n",
77 | " 'end_date': '2012-12-31',\n",
78 | " 'courses': None,\n",
79 | " 'achievements': None}],\n",
80 | " 'work_experience': [{'job_title': 'Full Stack Developer',\n",
81 | " 'company_name': 'VIMMERSE',\n",
82 | " 'location': None,\n",
83 | " 'start_date': '2023-06-01',\n",
84 | " 'end_date': 'present',\n",
85 | " 'job_type': None,\n",
86 | " 'summary': 'Developed an AI-powered editor web application that brings photos to life by animating images in 3D.',\n",
87 | " 'bullet_points': ['Developed robust back-end services utilizing Python (Flask/FastAPI) and Node.js (AWS Lambda) for efficient and scalable web applications',\n",
88 | " 'Built user-friendly and interactive websites using Next.js, ensuring a seamless user experience',\n",
89 | " 'Deployed and managed AWS infrastructure, including EC2 instances, S3 storage buckets, DynamoDB for NoSQL data management, and Cognito user pools for secure user authentication',\n",
90 | " 'Experienced Agile and Scrum methodologies within a fast-paced startup environment to ensure efficient project delivery and continuous improvement',\n",
91 | " 'Collaborated effectively with cross-functional teams (design, product management) to deliver projects on time, fostering a positive and collaborative work environment']},\n",
92 | " {'job_title': 'Full Stack Developer',\n",
93 | " 'company_name': 'VIKING SASQUATCH',\n",
94 | " 'location': None,\n",
95 | " 'start_date': '2023-01-01',\n",
96 | " 'end_date': '2023-06-01',\n",
97 | " 'job_type': None,\n",
98 | " 'summary': 'Developed APIs and Integrations for all of the parties that work on Real Estate Transactions.',\n",
99 | " 'bullet_points': ['Connecting Mortgage, Title, and Real Estate to solve pain points and improve automation and create efficiencies',\n",
100 | " 'Implemented a user-friendly front-end interface using Nuxt.js, ensuring a seamless user experience',\n",
101 | " 'Built backend APIs utilizing Node.js serverless functions for optimal performance',\n",
102 | " 'Managed data storage and security by implementing a MySQL database',\n",
103 | " 'Collaborated effectively within a team using Agile methodologies like sprint planning, daily standups, retrospectives to ensure project delivery and continuous improvement']},\n",
104 | " {'job_title': 'Full Stack Developer',\n",
105 | " 'company_name': 'ROX PAY SRL',\n",
106 | " 'location': None,\n",
107 | " 'start_date': '2021-12-01',\n",
108 | " 'end_date': '2022-12-31',\n",
109 | " 'job_type': None,\n",
110 | " 'summary': 'Built Fintech Software House that aims to optimize B2B payments by offering a systemic solution that gives value-added services in collection of payments, financial information and corporate liquidity by essentially creating a Commission Free, Open Loop, Payment Gateway system.',\n",
111 | " 'bullet_points': ['Developed front-end by using React.js and Redux, Javascript/Typescript',\n",
112 | " 'Contributed developing backend utilizing Django/Python']},\n",
113 | " {'job_title': 'Freelancer',\n",
114 | " 'company_name': 'FREELANCE',\n",
115 | " 'location': None,\n",
116 | " 'start_date': '2017-09-01',\n",
117 | " 'end_date': '2021-10-31',\n",
118 | " 'job_type': None,\n",
119 | " 'summary': 'Developed and managed many web and mobile applications while working as freelancer at Internet Dzyns LLC company.',\n",
120 | " 'bullet_points': ['Developed multiple web applications, participating in the whole process of their development: product design and estimation, code design and development, DevOps, UI/UX design, product launch and maintenance',\n",
121 | " 'Developed cross-platform mobile application using Flutter and Ionic/Angular',\n",
122 | " 'Developed NFT marketplace websites and wrote smart contracts']},\n",
123 | " {'job_title': 'Server Administrator, Java Developer',\n",
124 | " 'company_name': 'NEUSOFT',\n",
125 | " 'location': None,\n",
126 | " 'start_date': '2014-06-01',\n",
127 | " 'end_date': '2017-08-31',\n",
128 | " 'job_type': None,\n",
129 | " 'summary': 'Worked as intern and software developer after graduated university.',\n",
130 | " 'bullet_points': ['Correct analytical and reasoning skills to troubleshoot and repair server issues',\n",
131 | " 'Operating Systems & Security Software',\n",
132 | " 'Java / Spring Boot / Hibernate']}],\n",
133 | " 'personal_info': {'name': 'GARY JIANG',\n",
134 | " 'phone_number': '+1-213-725-7637',\n",
135 | " 'address': None,\n",
136 | " 'email_address': 'jiangzhehuan0105@gmail.com',\n",
137 | " 'linkedin_url': 'linkedin.com/in/gary-jiang',\n",
138 | " 'github_url': None,\n",
139 | " 'summary': None},\n",
140 | " 'skills': {'Programming Languages': ['Python',\n",
141 | " 'PHP',\n",
142 | " 'Javascript',\n",
143 | " 'Typescript',\n",
144 | " 'HTML',\n",
145 | " 'CSS'],\n",
146 | " 'Tools': ['Flask',\n",
147 | " 'Django',\n",
148 | " 'FastAPI',\n",
149 | " 'Laravel',\n",
150 | " 'Node.js',\n",
151 | " 'SQL databases',\n",
152 | " 'Next.js',\n",
153 | " 'React',\n",
154 | " 'Redux',\n",
155 | " 'Nuxt.js',\n",
156 | " 'Vue',\n",
157 | " 'AWS Lambda',\n",
158 | " 'Cognito',\n",
159 | " 'EC2',\n",
160 | " 'S3',\n",
161 | " 'DynamoDB',\n",
162 | " 'API Gateway',\n",
163 | " 'Flutter',\n",
164 | " 'Ionic',\n",
165 | " 'Angular',\n",
166 | " 'Git',\n",
167 | " 'Version Control',\n",
168 | " 'DevOps',\n",
169 | " 'CI/CD'],\n",
170 | " 'Other': ['Startup Experience',\n",
171 | " 'Adaptable',\n",
172 | " 'Resourceful',\n",
173 | " 'Prioritization',\n",
174 | " 'Hybrid Mobile App Development',\n",
175 | " 'AGILE',\n",
176 | " 'SCRUM']},\n",
177 | " 'certifications': [],\n",
178 | " 'projects': []}"
179 | ]
180 | },
181 | "metadata": {},
182 | "output_type": "display_data"
183 | }
184 | ],
185 | "source": [
186 | "display(json_result)"
187 | ]
188 | }
189 | ],
190 | "metadata": {
191 | "kernelspec": {
192 | "display_name": "any",
193 | "language": "python",
194 | "name": "python3"
195 | },
196 | "language_info": {
197 | "codemirror_mode": {
198 | "name": "ipython",
199 | "version": 3
200 | },
201 | "file_extension": ".py",
202 | "mimetype": "text/x-python",
203 | "name": "python",
204 | "nbconvert_exporter": "python",
205 | "pygments_lexer": "ipython3",
206 | "version": "3.11.10"
207 | }
208 | },
209 | "nbformat": 4,
210 | "nbformat_minor": 2
211 | }
212 |
--------------------------------------------------------------------------------
/examples/async_extract_tables.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
10 | "# !pip3 install --upgrade ipython\n",
11 | "# !pip3 install --upgrade any-parser"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from IPython.display import display, Markdown\n",
21 | "from any_parser import AnyParser\n",
22 | "import os"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 3,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "ap = AnyParser(api_key=os.getenv(\"CAMBIO_API_KEY\"))"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 4,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "file_path = \"./sample_data/sample.pdf\"\n",
41 | "file_id = ap.async_extract_tables(file_path)"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 5,
47 | "metadata": {},
48 | "outputs": [
49 | {
50 | "name": "stdout",
51 | "output_type": "stream",
52 | "text": [
53 | "Waiting for response...\n",
54 | "Waiting for response...\n"
55 | ]
56 | }
57 | ],
58 | "source": [
59 | "markdown_output = ap.async_fetch(file_id=file_id)"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 6,
65 | "metadata": {},
66 | "outputs": [
67 | {
68 | "data": {
69 | "text/plain": [
70 | "['\\n\\n1 Overview 3 | Technical information |
\\n2 Key requirements 4 | Ordering information |
\\n3 Planned availability date 5 | Terms and conditions |
\\n3 Program number 8 | Prices |
\\n3 Publications 8 | Announcement countries |
\\n\\n
\\n\\n',\n",
71 | " '',\n",
72 | " '\\n\\nProgram number | VRM | Program name |
\\n5737-L70 | 2.8.0 | IBM InfoSphere Optim Data Privacy for Unstructured Data |
\\n\\n
\\n\\n',\n",
73 | " '\\n\\nPart number description | Part number |
\\nIBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte License + SW Subscription & Support 12 Months | D2604LL |
\\n\\n
\\n\\n',\n",
74 | " '\\n\\nPart number description | Part number |
\\nIBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte SW Subscription & Support Reinstatement 12 Months | D2605LL |
\\nIBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte Annual SW Subscription & Support Renewal 12 Months | E0QGMLL |
\\nIBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte Monthly License | D2608LL |
\\n\\n
\\n\\n\\n\\nPart number description | Part number |
\\nIBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems(R) Annual Terabyte License + SW Subscription & Support 12 Months | D2606LL |
\\nIBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems Annual Terabyte SW Subscription & Support Reinstatement 12 Months | D2607LL |
\\nIBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems Annual Terabyte Annual SW Subscription & Support Renewal 12 Months | E0QGNLL |
\\nIBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems Annual Terabyte Monthly License | D2609LL |
\\n\\n
\\n\\n\\n\\nProgram identifier | License Information document title | License Information document number |
\\n5737-L70 | IBM InfoSphere Optim Data Privacy for Unstructured Data | L-JERN-BFQ3KR |
\\n\\n
\\n\\n',\n",
75 | " '\\n\\nProgram identifier | License Information document title | License Information document number |
\\n5737-L70 | IBM InfoSphere Optim Data Privacy for Unstructured Data | L-JERN-BFQ3KR |
\\n\\n
\\n\\n',\n",
76 | " '',\n",
77 | " '',\n",
78 | " '']"
79 | ]
80 | },
81 | "execution_count": 6,
82 | "metadata": {},
83 | "output_type": "execute_result"
84 | }
85 | ],
86 | "source": [
87 | "markdown_output"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 9,
93 | "metadata": {},
94 | "outputs": [
95 | {
96 | "data": {
97 | "text/markdown": [
98 | "\n",
99 | "\n",
100 | "1 Overview 3 | Technical information |
\n",
101 | "2 Key requirements 4 | Ordering information |
\n",
102 | "3 Planned availability date 5 | Terms and conditions |
\n",
103 | "3 Program number 8 | Prices |
\n",
104 | "3 Publications 8 | Announcement countries |
\n",
105 | "\n",
106 | "
\n",
107 | "\n"
108 | ],
109 | "text/plain": [
110 | ""
111 | ]
112 | },
113 | "metadata": {},
114 | "output_type": "display_data"
115 | }
116 | ],
117 | "source": [
118 | "display(Markdown(markdown_output[0]))"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 14,
124 | "metadata": {},
125 | "outputs": [
126 | {
127 | "data": {
128 | "text/markdown": [
129 | "\n",
130 | "\n",
131 | "Part number description | Part number |
\n",
132 | "IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte SW Subscription & Support Reinstatement 12 Months | D2605LL |
\n",
133 | "IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte Annual SW Subscription & Support Renewal 12 Months | E0QGMLL |
\n",
134 | "IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte Monthly License | D2608LL |
\n",
135 | "\n",
136 | "
\n",
137 | "\n",
138 | "\n",
139 | "\n",
140 | "Part number description | Part number |
\n",
141 | "IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems(R) Annual Terabyte License + SW Subscription & Support 12 Months | D2606LL |
\n",
142 | "IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems Annual Terabyte SW Subscription & Support Reinstatement 12 Months | D2607LL |
\n",
143 | "IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems Annual Terabyte Annual SW Subscription & Support Renewal 12 Months | E0QGNLL |
\n",
144 | "IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems Annual Terabyte Monthly License | D2609LL |
\n",
145 | "\n",
146 | "
\n",
147 | "\n",
148 | "\n",
149 | "\n",
150 | "Program identifier | License Information document title | License Information document number |
\n",
151 | "5737-L70 | IBM InfoSphere Optim Data Privacy for Unstructured Data | L-JERN-BFQ3KR |
\n",
152 | "\n",
153 | "
\n",
154 | "\n"
155 | ],
156 | "text/plain": [
157 | ""
158 | ]
159 | },
160 | "metadata": {},
161 | "output_type": "display_data"
162 | }
163 | ],
164 | "source": [
165 | "display(Markdown(markdown_output[4]))"
166 | ]
167 | }
168 | ],
169 | "metadata": {
170 | "kernelspec": {
171 | "display_name": "any",
172 | "language": "python",
173 | "name": "python3"
174 | },
175 | "language_info": {
176 | "codemirror_mode": {
177 | "name": "ipython",
178 | "version": 3
179 | },
180 | "file_extension": ".py",
181 | "mimetype": "text/x-python",
182 | "name": "python",
183 | "nbconvert_exporter": "python",
184 | "pygments_lexer": "ipython3",
185 | "version": "-1.-1.-1"
186 | }
187 | },
188 | "nbformat": 4,
189 | "nbformat_minor": 2
190 | }
191 |
--------------------------------------------------------------------------------
/examples/async_parse_pdf2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
10 | "# !pip3 install --upgrade ipython\n",
11 | "# !pip3 install --upgrade any-parser"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from IPython.display import display, Markdown\n",
21 | "from any_parser import AnyParser\n",
22 | "import os\n",
23 | "from dotenv import load_dotenv"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 3,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "load_dotenv(override=True)\n",
33 | "example_apikey = os.getenv(\"CAMBIO_API_KEY\")"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 4,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "ap = AnyParser(example_apikey)\n",
43 | "\n",
44 | "# Define extract_args as a dictionary with your desired parameters\n",
45 | "extract_args = {\n",
46 | " \"vqa_figures_flag\": True,\n",
47 | " \"vqa_charts_flag\": True\n",
48 | "}\n",
49 | "\n",
50 | "file_id = ap.async_parse(file_path=\"./sample_data/Earnings-Presentation-Q2-2024.pdf\", extract_args=extract_args)"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 5,
56 | "metadata": {},
57 | "outputs": [
58 | {
59 | "name": "stdout",
60 | "output_type": "stream",
61 | "text": [
62 | "Waiting for response...\n",
63 | "Waiting for response...\n",
64 | "Waiting for response...\n",
65 | "Waiting for response...\n",
66 | "Waiting for response...\n"
67 | ]
68 | }
69 | ],
70 | "source": [
71 | "markdown_output = ap.async_fetch(file_id=file_id)"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 7,
77 | "metadata": {},
78 | "outputs": [
79 | {
80 | "data": {
81 | "text/markdown": [
82 | "Meta Earnings Presentation Q2 2024 \n",
83 | "\n",
84 | "investor.fb.com\n",
85 | "\n",
86 | " Meta logo, consisting of an infinity symbol followed by the text \"Meta\"\n",
87 | "\n",
88 | "Revenue by User Geography Meta logo \n",
89 | "\n",
90 | "In Millions\n",
91 | "\n",
92 | " \n",
93 | "| Quarter | US & Canada | Europe | Asia-Pacific | Rest of World | Total |\n",
94 | "|---|---|---|---|---|---|\n",
95 | "| Q2'24 | 16,847 | 9,300 | 7,888 | 5,036 | 39,071 |\n",
96 | "| Q1'24 | 15,824 | 8,483 | 7,481 | 4,667 | 36,455 |\n",
97 | "| Q4'23 | 18,585 | 9,441 | 7,512 | 4,573 | 40,111 |\n",
98 | "| Q3'23 | 15,190 | 7,777 | 6,928 | 4,251 | 34,146 |\n",
99 | "| Q2'23 | 14,422 | 7,323 | 6,515 | 3,739 | 31,999 |\n",
100 | "| Q1'23 | 13,048 | 6,345 | 5,960 | 3,292 | 28,645 |\n",
101 | "| Q4'22 | 15,636 | 7,050 | 6,050 | 3,429 | 32,165 |\n",
102 | "| Q3'22 | 13,035 | 5,797 | 5,782 | 3,100 | 27,714 |\n",
103 | "| Q2'22 | 13,249 | 6,452 | 5,797 | 3,213 | 28,822 |\n",
104 | "\n",
105 | "This stacked bar chart shows the revenue by user geography for Meta from Q2'22 to Q2'24. The revenue is divided into four categories: US & Canada, Europe, Asia-Pacific, and Rest of World. The total revenue for each quarter is shown at the top of each bar.\n",
106 | " \n",
107 | "\n",
108 | "Our revenue by user geography is geographically apportioned based on our estimation of the geographic location of our users when they perform a revenue-generating activity. This allocation differs from our revenue disaggregated by geography disclosure in our condensed consolidated financial statements where revenue is geographically apportioned based on the addresses of our customers.\n",
109 | "\n",
110 | " 3\n",
111 | "\n",
112 | "Segment Results Meta logo \n",
113 | "\n",
114 | "In Millions\n",
115 | "\n",
116 | " \n",
117 | "| | Q2'22 | Q3'22 | Q4'22 | Q1'23 | Q2'23 | Q3'23 | Q4'23 | Q1'24 | Q2'24 |\n",
118 | "|---|---|---|---|---|---|---|---|---|---|\n",
119 | "| Advertising | $ 28,152 | $ 27,237 | $ 31,254 | $ 28,101 | $ 31,498 | $ 33,643 | $ 38,706 | $ 35,635 | $ 38,329 |\n",
120 | "| Other | 218 | 192 | 184 | 205 | 225 | 293 | 334 | 380 | 389 |\n",
121 | "| Family of Apps Revenue | 28,370 | 27,429 | 31,438 | 28,306 | 31,723 | 33,936 | 39,040 | 36,015 | 38,718 |\n",
122 | "| Reality Labs Revenue | 452 | 285 | 727 | 339 | 276 | 210 | 1,071 | 440 | 353 |\n",
123 | "| Total Revenue | $ 28,822 | $ 27,714 | $ 32,165 | $ 28,645 | $ 31,999 | $ 34,146 | $ 40,111 | $ 36,455 | $ 39,071 |\n",
124 | "| Family of Apps Operating Income | $ 11,164 | $ 9,336 | $ 10,678 | $ 11,219 | $ 13,131 | $ 17,490 | $ 21,030 | $ 17,664 | $ 19,335 |\n",
125 | "| Reality Labs Operating (Loss) | (2,806) | (3,672) | (4,279) | (3,992) | (3,739) | (3,742) | (4,646) | (3,846) | (4,488) |\n",
126 | "| Total Income from Operations | $ 8,358 | $ 5,664 | $ 6,399 | $ 7,227 | $ 9,392 | $ 13,748 | $ 16,384 | $ 13,818 | $ 14,847 |\n",
127 | "| Operating Margin | 29% | 20% | 20% | 25% | 29% | 40% | 41% | 38% | 38% |\n",
128 | " \n",
129 | "\n",
130 | "We report our financial results based on two reportable segments: Family of Apps (FoA) and Reality Labs (RL). FoA includes Facebook, Instagram, Messenger, WhatsApp, and other services. RL includes our virtual, augmented, and mixed reality related consumer hardware, software, and content.\n",
131 | "\n",
132 | " 4\n",
133 | "\n",
134 | "Net Income Meta logo \n",
135 | "\n",
136 | "In Millions\n",
137 | "\n",
138 | " \n",
139 | "| Quarter | Net Income |\n",
140 | "|---|---|\n",
141 | "| Q2'22 | $6,687 |\n",
142 | "| Q3'22 | $4,395 |\n",
143 | "| Q4'22 | $4,652 |\n",
144 | "| Q1'23 | $5,709 |\n",
145 | "| Q2'23 | $7,788 |\n",
146 | "| Q3'23 | $11,583 |\n",
147 | "| Q4'23 | $14,017 |\n",
148 | "| Q1'24 | $12,369 |\n",
149 | "| Q2'24 | $13,465 |\n",
150 | "\n",
151 | "This bar chart shows the Net Income in millions for Meta from Q2'22 to Q2'24. The y-axis ranges from $0 to $14,017 million, with increments of $1,000 million. The highest net income was $14,017 million in Q4'23, while the lowest was $4,395 million in Q3'22.\n",
152 | " \n",
153 | "\n",
154 | " 7\n",
155 | "\n",
156 | "Diluted Earnings Per Share Meta logo \n",
157 | "\n",
158 | " \n",
159 | "| Quarter | Earnings Per Share |\n",
160 | "|---|---|\n",
161 | "| Q2'22 | $2.46 |\n",
162 | "| Q3'22 | $1.64 |\n",
163 | "| Q4'22 | $1.76 |\n",
164 | "| Q1'23 | $2.20 |\n",
165 | "| Q2'23 | $2.98 |\n",
166 | "| Q3'23 | $4.39 |\n",
167 | "| Q4'23 | $5.33 |\n",
168 | "| Q1'24 | $4.71 |\n",
169 | "| Q2'24 | $5.16 |\n",
170 | "\n",
171 | "This bar chart shows the Diluted Earnings Per Share for Meta from Q2'22 to Q2'24. The y-axis ranges from $1.64 to $5.33, with increments of $0.02. The chart demonstrates an overall increasing trend in earnings per share over the period, with the highest point in Q4'23 at $5.33 and the lowest in Q3'22 at $1.64.\n",
172 | " \n",
173 | "\n",
174 | " 8\n",
175 | "\n",
176 | "Limitations of Key Metrics and Other Data Meta logo \n",
177 | "\n",
178 | "To calculate our estimates of DAP, we currently use a series of machine learning models that are developed based on internal reviews of limited samples of user accounts and calibrated against user survey data. We apply significant judgment in designing these models and calculating these estimates. For example, to match user accounts within individual products and across multiple products, we use data signals such as similar device information, IP addresses, and user names. We also calibrate our models against data from periodic user surveys of varying sizes and frequency across our products, which survey questions are based on monthly usage, and which are inherently subject to error. The timing and results of such user surveys have in the past contributed, and may in the future contribute, to changes in our reported Family metrics from period to period. In addition, our data limitations may affect our understanding of certain details of our business and increase the risk of error for our Family metrics estimates. Our techniques and models rely on a variety of data signals from different products, and we rely on more limited data signals for some products compared to others. For example, as a result of limited visibility into encrypted products, we have fewer data signals from WhatsApp user accounts and primarily rely on phone numbers and device information to match WhatsApp user accounts with accounts on our other products. Any loss of access to data signals we use in our process for calculating Family metrics, whether as a result of our own product decisions, actions by third-party browser or mobile platforms, regulatory or legislative requirements, or other factors, also may impact the stability or accuracy of our reported Family metrics, as well as our ability to report these metrics at all. Our estimates of Family metrics also may change as our methodologies evolve, including through the application of new data signals or technologies, product changes, or other improvements in our user surveys, algorithms, or machine learning that may improve our ability to match accounts within and across our products or otherwise evaluate the broad population of our users. In addition, such evolution may allow us to identify previously undetected violating accounts (as defined below).\n",
179 | "\n",
180 | "We regularly evaluate our Family metrics to estimate the percentage of our DAP consisting solely of \"violating\" accounts. We define \"violating\" accounts as accounts which we believe are intended to be used for purposes that violate our terms of service, including bots and spam. In the first quarter of 2024, we estimated that less than 3% of our worldwide DAP consisted solely of violating accounts. Such estimation is based on an internal review of a limited sample of accounts, and we apply significant judgment in making this determination. For example, we look for account information and behaviors associated with Facebook and Instagram accounts that appear to be inauthentic to the reviewers, but we have limited visibility into WhatsApp user activity due to encryption. In addition, if we believe an individual person has one or more violating accounts, we do not include such person in our violating accounts estimation as long as we believe they have one account that does not constitute a violating account. From time to time, we disable certain user accounts, make product changes, or take other actions to reduce the number of violating accounts among our users, which may also reduce our DAP estimates in a particular period. We intend to disclose our estimates of the percentage of our DAP consisting solely of violating accounts on an annual basis. Violating accounts are very difficult to measure at our scale, and it is possible that the actual number of violating accounts may vary significantly from our estimates.\n",
181 | "\n",
182 | "## User Geography\n",
183 | "\n",
184 | "Our estimates for revenue by user location, as well as year-over-year percentage changes in ad impressions delivered and the average price per ad by user location, are also affected by data limitations and other challenges in measuring user geography. Our data regarding the geographic location of our users is estimated based on a number of factors, such as the user's IP address and self-disclosed location. These factors may not always accurately reflect the user's actual location. For example, a user may appear to be accessing our products from the location of the proxy server that the user connects to rather than from the user's actual location. The methodologies used to measure our metrics are also susceptible to algorithm or other technical errors.\n",
185 | "\n",
186 | " 17"
187 | ],
188 | "text/plain": [
189 | ""
190 | ]
191 | },
192 | "metadata": {},
193 | "output_type": "display_data"
194 | }
195 | ],
196 | "source": [
197 | "# Join the list elements with newlines to create a single string\n",
198 | "markdown_text = '\\n\\n'.join(markdown_output)\n",
199 | "display(Markdown(markdown_text))"
200 | ]
201 | }
202 | ],
203 | "metadata": {
204 | "kernelspec": {
205 | "display_name": "any",
206 | "language": "python",
207 | "name": "python3"
208 | },
209 | "language_info": {
210 | "codemirror_mode": {
211 | "name": "ipython",
212 | "version": 3
213 | },
214 | "file_extension": ".py",
215 | "mimetype": "text/x-python",
216 | "name": "python",
217 | "nbconvert_exporter": "python",
218 | "pygments_lexer": "ipython3",
219 | "version": "3.10.15"
220 | }
221 | },
222 | "nbformat": 4,
223 | "nbformat_minor": 2
224 | }
225 |
--------------------------------------------------------------------------------
/examples/async_parse_with_layout.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
10 | "# !pip3 install --upgrade ipython\n",
11 | "# !pip3 install --upgrade any-parser"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from IPython.display import display, Markdown\n",
21 | "from any_parser import AnyParser"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "ap = AnyParser(api_key=\"...\")"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "file_path = \"./sample_data/test_1figure_1table.png\"\n",
40 | "file_id = ap.async_parse_with_layout(file_path)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 4,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "name": "stdout",
50 | "output_type": "stream",
51 | "text": [
52 | "Waiting for response...\n",
53 | "Waiting for response...\n",
54 | "Waiting for response...\n",
55 | "Waiting for response...\n",
56 | "Waiting for response...\n",
57 | "Waiting for response...\n",
58 | "Waiting for response...\n",
59 | "Waiting for response...\n"
60 | ]
61 | }
62 | ],
63 | "source": [
64 | "markdown_output = ap.async_fetch(file_id=file_id)"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 5,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "data": {
74 | "text/markdown": [
75 | "\n",
76 | "\n",
77 | "\n",
78 | " | latency | (ms) |
\n",
79 | "participants | mean | 99th percentile |
\n",
80 | "1 | 17.0 +1.4 | 75.0 34.9 |
\n",
81 | "2 | 24.5 +2.5 | 87.6 +35.9 |
\n",
82 | "5 | 31.5 +6.2 | 104.5 52.2 |
\n",
83 | "10 | 30.0 +3.7 | 95.6 +25.4 |
\n",
84 | "25 | 35.5 +5.6 | 100.4 42.7 |
\n",
85 | "50 | 42.7 4.1 | 93.7 22.9 |
\n",
86 | "100 | 71.4 7.6 | 131.2 +17.6 |
\n",
87 | "200 | 150.5 +11.0 | 320.3 35.1 |
\n",
88 | "\n",
89 | "
\n",
90 | "\n",
91 | "\n",
92 | "\n",
93 | "Table 4: Two-phase commit scalability. Mean and standard deviations over 10 runs.\n",
94 | "\n",
95 | "CPUs. Snapshot reads can execute at any up-to-date replicas, so their throughput increases almost linearly with the number of replicas. Single-read read-only transactions only execute at leaders because timestamp assignment must happen at leaders. Read-only-transaction throughput increases with the number of replicas because the number of effective spanservers increases: in the experimental setup, the number of spanservers equaled the number of replicas, and leaders were randomly distributed among the zones. Write throughput benefits from the same experimental artifact (which explains the increase in throughput from 3 to 5 replicas), but that benefit is outweighed by the linear increase in the amount of work performed per write, as the number of replicas increases.\n",
96 | "\n",
97 | "Table 4 demonstrates that two-phase commit can scale to a reasonable number of participants: it summarizes a set of experiments run across 3 zones, each with 25 spanservers. Scaling up to 50 participants is reasonable in both mean and 99th-percentile, and latencies start to rise noticeably at 100 participants.\n",
98 | "\n",
99 | "5.2 Availability\n",
100 | "\n",
101 | "Figure 5 illustrates the availability benefits of running Spanner in multiple datacenters. It shows the results of three experiments on throughput in the presence of datacenter failure, all of which are overlaid onto the same time scale. The test universe consisted of 5 zones Zi, each of which had 25 spanservers. The test database was sharded into 1250 Paxos groups, and 100 test clients constantly issued non-snapshot reads at an aggregate rate of 50K reads/second. All of the leaders were explicitly placed in Z1. Five seconds into each test, all of the servers in one zone were killed: non-leader kills Z2; leader-hard kills Z1; leader-soft kills Z1, but it gives notifications to all of the servers that they should handoff leadership first.\n",
102 | "\n",
103 | "Killing Z2 has no effect on read throughput. Killing Z1 while giving the leaders time to handoff leadership to a different zone has a minor effect: the throughput drop is not visible in the graph, but is around 3-4%. On the other hand, killing Z1 with no warning has a severe effect: the rate of completion drops almost to 0. As leaders get re-elected, though, the throughput of the system rises to approximately 100K reads/second because of two artifacts of our experiment: there is extra capacity in the system, and operations are queued while the leader is unavailable. As a result, the throughput of the system rises before leveling off again at its steady-state rate.\n",
104 | "\n",
105 | "We can also see the effect of the fact that Paxos leader leases are set to 10 seconds. When we kill the zone, the leader-lease expiration times for the groups should be evenly distributed over the next 10 seconds. Soon after each lease from a dead leader expires, a new leader is elected. Approximately 10 seconds after the kill time, all of the groups have leaders and throughput has recovered. Shorter lease times would reduce the effect of server deaths on availability, but would require greater amounts of lease-renewal network traffic. We are in the process of designing and implementing a mechanism that will cause slaves to release Paxos leader leases upon leader failure.\n",
106 | "\n",
107 | "5.3 TrueTime\n",
108 | "\n",
109 | "Two questions must be answered with respect to TrueTime: is ε truly a bound on clock uncertainty, and how bad does ε get? For the former, the most serious problem would be if a local clock’s drift were greater than 200usec/sec: that would break assumptions made by TrueTime. Our machine statistics show that bad CPUs are 6 times more likely than bad clocks. That is, clock issues are extremely infrequent, relative to much more serious hardware problems. As a result, we believe that TrueTime’s implementation is as trustworthy as any other piece of software upon which Spanner depends.\n",
110 | "\n",
111 | "(timeout=1h)](https://anyparser-realtime-test-j-assetsconstructfilebucke-2wg0ln280yvz.s3.amazonaws.com/result_parse_with_layout/async_S4iyw7RAEE8CTGkVgHYeI8nsTmSALI1U2HXvAN6j/2024/11/14/test_1figure_1table_f00964f1-abcc-4e62-b2af-46249d9c70d4.png/%3C%40mask_p0_e1_figure_s3%3E.png?AWSAccessKeyId=ASIAXM24X76XLDJJDDZX&Signature=Ef8urOX4Oj%2Bdxx%2F1IOh0OqgJ0%2B4%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEFoaCXVzLXdlc3QtMiJGMEQCIBJHF0qjs7xZL9IBZf0a7YooU6WJP1EeclCbGaKCaLFPAiB%2BFjaYEyzmBWPFVh%2FRSUVhrEEdc%2FlQdUaLSTP%2FgclPaSrcAwjj%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAAaDDUwODYxMTI2NDQzMCIMGyjwrhVEC7fYAvneKrADV3HpyrnA8A6QUdLRnfZZM74MpeETlq%2BvlIjpQ5CPxB%2BTWpNRlq4c3eo%2BzKRX87bl9kpFmBaFXJPc9ot%2BN3L3Vcp%2FzvnI0iB4gqlN4jGexU5wVpTclORB1TAK%2FcO6AFfGACTLrUg0GzgcbwYR%2FGIvhxSGj1Ule9MDXL%2FG2YGMnqFDndKirbwufY4dlBYehDzqNii3kB3v5nGFsYKmAdVEocKdeIK6cv%2Fybj3w58l9vDyRMgr0%2FLWposZ160WIEvPMWMseKe6Q87%2BbEL8hcyl5i0aFxeGf4xv1Foiz74tcJcPL7RuwpQYCb3BztfD11Vo8334cla8p5LlEfkj1OEDHVXW15FJpw29pZN1q0IBIQNeBHtajkpu7BPzURXYZIUnvnWnpCPHTThM8z2Az1mhtou69uKWjO6iVeOe%2BrbqGMXbKEJxuKraEh%2BXVukZWmzlxwaiyJ2gomNXTQmO0gaLpiU934WqlJu9mGl0mw686KPwwdVOudV4RUgXAZhpT7j%2FzydhxVNK0sHX%2F02lTm1v6%2BRpsUN1Xvd%2FXMuj1%2FM8q5B86wkwUj1YjgFoQ9qcljZu8MPik1bkGOp8BvunCWNInmGehKh0yaRGfQn0y%2FgecCbOQoOqRUuLahI8ZBrixkIBUOkyinWTmsdLG6ItJXkiKFBOAHU0tq97U0Fbb0mq0v6L%2Bfr1INT52vqWsaXTwxiLSJeGJTEve1SCCRttFsIpkZF5MEmB3V0irDz3lVQbyV1Z2lWSe%2Br13a5DSeH4REoiwqEKtKN%2FCV4WPDhK5G%2FUm%2B8LmNrgUGm77&Expires=1731551406)\n",
112 | "\n",
113 | "Figure 5: Effect of killing servers on throughput.\n"
114 | ],
115 | "text/plain": [
116 | ""
117 | ]
118 | },
119 | "metadata": {},
120 | "output_type": "display_data"
121 | }
122 | ],
123 | "source": [
124 | "display(Markdown(markdown_output))"
125 | ]
126 | }
127 | ],
128 | "metadata": {
129 | "kernelspec": {
130 | "display_name": "any",
131 | "language": "python",
132 | "name": "python3"
133 | },
134 | "language_info": {
135 | "codemirror_mode": {
136 | "name": "ipython",
137 | "version": 3
138 | },
139 | "file_extension": ".py",
140 | "mimetype": "text/x-python",
141 | "name": "python",
142 | "nbconvert_exporter": "python",
143 | "pygments_lexer": "ipython3",
144 | "version": "-1.-1.-1"
145 | }
146 | },
147 | "nbformat": 4,
148 | "nbformat_minor": 2
149 | }
150 |
--------------------------------------------------------------------------------
/examples/async_parse_with_ocr.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
10 | "# !pip3 install --upgrade ipython\n",
11 | "# !pip3 install --upgrade any-parser"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from IPython.display import display, Markdown\n",
21 | "from any_parser import AnyParser"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "ap = AnyParser(api_key=\"...\")"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "file_path = \"./sample_data/test_1figure_1table.png\"\n",
40 | "file_id = ap.async_parse_with_ocr(file_path)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 4,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "name": "stdout",
50 | "output_type": "stream",
51 | "text": [
52 | "Waiting for response...\n",
53 | "Waiting for response...\n",
54 | "Waiting for response...\n",
55 | "Waiting for response...\n",
56 | "Waiting for response...\n"
57 | ]
58 | }
59 | ],
60 | "source": [
61 | "markdown_output = ap.async_fetch(file_id=file_id)"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 6,
67 | "metadata": {},
68 | "outputs": [
69 | {
70 | "data": {
71 | "text/markdown": [
72 | "## Table 4: Two-phase commit scalability. Mean and standard deviations over 10 runs.\n",
73 | "\n",
74 | "| participants | mean | 99th percentile |\n",
75 | "|--------------|-------------|-----------------|\n",
76 | "| 1 | 17.0 ±1.4 | 75.0 ±34.9 |\n",
77 | "| 2 | 24.5 ±2.5 | 87.6 ±35.9 |\n",
78 | "| 5 | 31.5 ±6.2 | 104.5 ±52.2 |\n",
79 | "| 10 | 30.0 ±3.7 | 95.6 ±25.4 |\n",
80 | "| 25 | 35.5 ±5.6 | 100.4 ±42.7 |\n",
81 | "| 50 | 42.7 ±4.1 | 93.7 ±22.9 |\n",
82 | "| 100 | 71.4 ±7.6 | 131.2 ±17.6 |\n",
83 | "| 200 | 150.5 ±11.0 | 320.3 ±35.1 |\n",
84 | "\n",
85 | "## 5.2 Availability\n",
86 | "\n",
87 | "Figure 5 illustrates the availability benefits of running Spanner in multiple datacenters. It shows the results of three experiments on throughput in the presence of datacenter failure, all of which are overlaid onto the same time scale. The test universe consisted of 5 zones Z1-Z5, each of which had 25 spanservers. The test database was sharded into 1250 Paxos groups, and 100 test clients constantly issued non-snapshot reads at an aggregate rate of 50K reads/second. All of the leaders were explicitly placed in Z1. Five seconds into each test, all of the servers in one zone were killed: non-leader kills Z2; leader-hard kills Z1; leader-soft kills Z1, but it gives notifications to all of the servers that they should handoff leadership first.\n",
88 | "\n",
89 | "Killing Z2 has no effect on read throughput. Killing Z1 while giving the leaders time to handoff leadership to a different zone has a minor effect: the throughput drop is not visible in the graph, but is around 3-4%. On the other hand, killing Z1 with no warning has a severe effect: the rate of completion drops almost to 0. As leaders get re-elected, though, the throughput of the system rises to approximately 100K reads/second because of two artifacts of our experiment: there is extra capacity in the system, and operations are queued while the leader is unavailable. As a result, the throughput of the system rises before leveling off again at its steady-state rate.\n",
90 | "\n",
91 | "We can also see the effect of the fact that Paxos leader leases are set to 10 seconds. When we kill the zone, the leader-lease expiration times for the groups should be evenly distributed over the next 10 seconds. Soon after each lease from a dead leader expires, a new leader is elected. Approximately 10 seconds after the kill time, all of the groups have leaders and throughput has recovered. Shorter lease times would reduce the effect of server deaths on availability, but would require greater amounts of lease-renewal network traffic. We are in the process of designing and implementing a mechanism that will cause slaves to release Paxos leader leases upon leader failure.\n",
92 | "\n",
93 | "## 5.3 TrueTime\n",
94 | "\n",
95 | "Two questions must be answered with respect to TrueTime: is ε truly a bound on clock uncertainty, and how bad does ε get? For the former, the most serious problem would be if a local clock's drift were greater than 200us/sec: that would break assumptions made by TrueTime. Our machine statistics show that bad CPUs are 6 times more likely than bad clocks. That is, clock issues are extremely infrequent, relative to much more serious hardware problems. As a result, we believe that TrueTime's implementation is as trustworthy as any other piece of software upon which Spanner depends.\n",
96 | "\n",
97 | "Figure 6 presents TrueTime data taken at several thousand spanserver machines across datacenters up to 2200"
98 | ],
99 | "text/plain": [
100 | ""
101 | ]
102 | },
103 | "metadata": {},
104 | "output_type": "display_data"
105 | }
106 | ],
107 | "source": [
108 | "display(Markdown(markdown_output))"
109 | ]
110 | }
111 | ],
112 | "metadata": {
113 | "kernelspec": {
114 | "display_name": "any",
115 | "language": "python",
116 | "name": "python3"
117 | },
118 | "language_info": {
119 | "codemirror_mode": {
120 | "name": "ipython",
121 | "version": 3
122 | },
123 | "file_extension": ".py",
124 | "mimetype": "text/x-python",
125 | "name": "python",
126 | "nbconvert_exporter": "python",
127 | "pygments_lexer": "ipython3",
128 | "version": "-1.-1.-1"
129 | }
130 | },
131 | "nbformat": 4,
132 | "nbformat_minor": 2
133 | }
134 |
--------------------------------------------------------------------------------
/examples/extract_pii.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
10 | "# !pip3 install --upgrade ipython\n",
11 | "# !pip3 install --upgrade any-parser"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from any_parser import AnyParser"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "ap = AnyParser(api_key=\"...\")"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "file_path = \"./sample_data/resume_1.pdf\"\n",
39 | "pii_info, time = ap.extract_pii(file_path)"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 5,
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "data": {
49 | "text/plain": [
50 | "'Time Elapsed: 8.02 seconds'"
51 | ]
52 | },
53 | "execution_count": 5,
54 | "metadata": {},
55 | "output_type": "execute_result"
56 | }
57 | ],
58 | "source": [
59 | "time"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 6,
65 | "metadata": {},
66 | "outputs": [
67 | {
68 | "data": {
69 | "text/plain": [
70 | "{'name': 'Gary Jiang',\n",
71 | " 'phone_number': '+1-213-725-7637',\n",
72 | " 'address': None,\n",
73 | " 'email_address': 'jiangzhehuan0105@gmail.com',\n",
74 | " 'linkedin_url': 'https://linkedin.com/in/gary-jiang',\n",
75 | " 'github_url': None,\n",
76 | " 'summary': 'Full-stack Software Engineer'}"
77 | ]
78 | },
79 | "execution_count": 6,
80 | "metadata": {},
81 | "output_type": "execute_result"
82 | }
83 | ],
84 | "source": [
85 | "pii_info"
86 | ]
87 | }
88 | ],
89 | "metadata": {
90 | "kernelspec": {
91 | "display_name": "any",
92 | "language": "python",
93 | "name": "python3"
94 | },
95 | "language_info": {
96 | "codemirror_mode": {
97 | "name": "ipython",
98 | "version": 3
99 | },
100 | "file_extension": ".py",
101 | "mimetype": "text/x-python",
102 | "name": "python",
103 | "nbconvert_exporter": "python",
104 | "pygments_lexer": "ipython3",
105 | "version": "-1.-1.-1"
106 | }
107 | },
108 | "nbformat": 4,
109 | "nbformat_minor": 2
110 | }
111 |
--------------------------------------------------------------------------------
/examples/extract_resume_key_value.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
10 | "# !pip3 install --upgrade ipython\n",
11 | "# !pip3 install --upgrade any-parser"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from IPython.display import display\n",
21 | "from any_parser import AnyParser"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "ap = AnyParser(api_key=\"...\")"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 5,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "file_path = \"./sample_data/resume_1.pdf\"\n",
40 | "json_result = ap.extract_resume_key_value(file_path)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 6,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "data": {
50 | "text/plain": [
51 | "({'pii': {'full_name': 'GARY JIANG',\n",
52 | " 'email': 'jiangzhehuan0105@gmail.com',\n",
53 | " 'phone': '+1 (213) 725-7637'},\n",
54 | " 'education': [{'organization': 'Shenyang University of Technology',\n",
55 | " 'degree': \"Bachelor's Degree\",\n",
56 | " 'major': 'Computer Science',\n",
57 | " 'start_date': '2008-01-01',\n",
58 | " 'end_date': '2012-12-31',\n",
59 | " 'courses': None,\n",
60 | " 'achievements': None}],\n",
61 | " 'work_experience': [{'job_title': 'Full Stack Developer',\n",
62 | " 'company_name': 'VIMMERSE',\n",
63 | " 'location': None,\n",
64 | " 'start_date': '2023-06-01',\n",
65 | " 'end_date': 'present',\n",
66 | " 'job_type': None,\n",
67 | " 'summary': 'Developed an AI-powered editor web application that brings photos to life by animating images in 3D.',\n",
68 | " 'bullet_points': ['Developed robust back-end services utilizing Python (Flask/FastAPI) and Node.js (AWS Lambda) for efficient and scalable web applications',\n",
69 | " 'Built user-friendly and interactive websites using Next.js, ensuring a seamless user experience',\n",
70 | " 'Deployed and managed AWS infrastructure, including EC2 instances, S3 storage buckets, DynamoDB for NoSQL data management, and Cognito user pools for secure user authentication',\n",
71 | " 'Experienced Agile and Scrum methodologies within a fast-paced startup environment to ensure efficient project delivery and continuous improvement',\n",
72 | " 'Collaborated effectively with cross-functional teams (design, product management) to deliver projects on time, fostering a positive and collaborative work environment']},\n",
73 | " {'job_title': 'Full Stack Developer',\n",
74 | " 'company_name': 'VIKING SASQUATCH',\n",
75 | " 'location': None,\n",
76 | " 'start_date': '2023-01-01',\n",
77 | " 'end_date': '2023-06-30',\n",
78 | " 'job_type': None,\n",
79 | " 'summary': 'Developed APIs and Integrations for all of the parties that work on Real Estate Transactions.',\n",
80 | " 'bullet_points': ['Connecting Mortgage, Title, and Real Estate to solve pain points and improve automation and create efficiencies',\n",
81 | " 'Implemented a user-friendly front-end interface using Nuxt.js, ensuring a seamless user experience',\n",
82 | " 'Built backend APIs utilizing Node.js serverless functions for optimal performance',\n",
83 | " 'Managed data storage and security by implementing a MySQL database',\n",
84 | " 'Collaborated effectively within a team using Agile methodologies like sprint planning, daily standups, retrospectives to ensure project delivery and continuous improvement']},\n",
85 | " {'job_title': 'Full Stack Developer',\n",
86 | " 'company_name': 'ROX PAY SRL',\n",
87 | " 'location': None,\n",
88 | " 'start_date': '2021-12-01',\n",
89 | " 'end_date': '2022-12-31',\n",
90 | " 'job_type': None,\n",
91 | " 'summary': 'Built Fintech Software House that aims to optimize B2B payments by offering a systemic solution that gives value-added services in collection of payments, financial information and corporate liquidity by essentially creating a Commission Free, Open Loop, Payment Gateway system.',\n",
92 | " 'bullet_points': ['Developed front-end by using React.js and Redux, Javascript/Typescript',\n",
93 | " 'Contributed developing backend utilizing Django/Python']},\n",
94 | " {'job_title': 'Freelancer',\n",
95 | " 'company_name': 'FREELANCE',\n",
96 | " 'location': None,\n",
97 | " 'start_date': '2017-09-01',\n",
98 | " 'end_date': '2021-10-31',\n",
99 | " 'job_type': None,\n",
100 | " 'summary': 'Developed and managed many web and mobile applications while working as freelancer at Internet Dzyns LLC company.',\n",
101 | " 'bullet_points': ['Developed multiple web applications, participating in the whole process of their development: product design and estimation, code design and development, DevOps, UI/UX design, product launch and maintenance',\n",
102 | " 'Developed cross-platform mobile application using Flutter and Ionic/Angular',\n",
103 | " 'Developed NFT marketplace websites and wrote smart contracts']},\n",
104 | " {'job_title': 'Server Administrator, Java Developer',\n",
105 | " 'company_name': 'NEUSOFT',\n",
106 | " 'location': None,\n",
107 | " 'start_date': '2014-06-01',\n",
108 | " 'end_date': '2017-08-31',\n",
109 | " 'job_type': None,\n",
110 | " 'summary': 'Worked as intern and software developer after graduated university.',\n",
111 | " 'bullet_points': ['Correct analytical and reasoning skills to troubleshoot and repair server issues',\n",
112 | " 'Operating Systems & Security Software',\n",
113 | " 'Java / Spring Boot / Hibernate']}],\n",
114 | " 'personal_info': {'name': 'GARY JIANG',\n",
115 | " 'phone_number': '+1-213-725-7637',\n",
116 | " 'address': None,\n",
117 | " 'email_address': 'jiangzhehuan0105@gmail.com',\n",
118 | " 'linkedin_url': 'linkedin.com/in/gary-jiang',\n",
119 | " 'github_url': None,\n",
120 | " 'summary': None},\n",
121 | " 'skills': {'Programming Languages': ['Python',\n",
122 | " 'PHP',\n",
123 | " 'Javascript',\n",
124 | " 'Typescript',\n",
125 | " 'HTML',\n",
126 | " 'CSS'],\n",
127 | " 'Tools': ['Flask',\n",
128 | " 'Django',\n",
129 | " 'FastAPI',\n",
130 | " 'Laravel',\n",
131 | " 'Node.js',\n",
132 | " 'SQL databases',\n",
133 | " 'Next.js',\n",
134 | " 'React',\n",
135 | " 'Redux',\n",
136 | " 'Nuxt.js',\n",
137 | " 'Vue',\n",
138 | " 'AWS Lambda',\n",
139 | " 'Cognito',\n",
140 | " 'EC2',\n",
141 | " 'S3',\n",
142 | " 'DynamoDB',\n",
143 | " 'API gateway',\n",
144 | " 'Git',\n",
145 | " 'Version Control',\n",
146 | " 'DevOps',\n",
147 | " 'CI/CD'],\n",
148 | " 'Other': ['Startup Experience',\n",
149 | " 'Adaptable',\n",
150 | " 'Resourceful',\n",
151 | " 'Prioritization',\n",
152 | " 'Hybrid Mobile App Development',\n",
153 | " 'Flutter',\n",
154 | " 'Ionic',\n",
155 | " 'Angular',\n",
156 | " 'AGILE',\n",
157 | " 'SCRUM']},\n",
158 | " 'certifications': [],\n",
159 | " 'projects': []},\n",
160 | " 'Time Elapsed: 27.27 seconds')"
161 | ]
162 | },
163 | "metadata": {},
164 | "output_type": "display_data"
165 | }
166 | ],
167 | "source": [
168 | "display(json_result)"
169 | ]
170 | }
171 | ],
172 | "metadata": {
173 | "kernelspec": {
174 | "display_name": "any",
175 | "language": "python",
176 | "name": "python3"
177 | },
178 | "language_info": {
179 | "codemirror_mode": {
180 | "name": "ipython",
181 | "version": 3
182 | },
183 | "file_extension": ".py",
184 | "mimetype": "text/x-python",
185 | "name": "python",
186 | "nbconvert_exporter": "python",
187 | "pygments_lexer": "ipython3",
188 | "version": "-1.-1.-1"
189 | }
190 | },
191 | "nbformat": 4,
192 | "nbformat_minor": 2
193 | }
194 |
--------------------------------------------------------------------------------
/examples/extract_tables.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
10 | "# !pip3 install --upgrade ipython\n",
11 | "# !pip3 install --upgrade any-parser\n",
12 | "# !pip3 install pandas lxml html5lib bs4\n"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "from IPython.display import display, Markdown\n",
22 | "from any_parser import AnyParser"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 3,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "ap = AnyParser(api_key=\"...\")"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 7,
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "csv_output, time_info = ap.extract_tables(\n",
41 | " file_path=\"./sample_data/test_1figure_1table.png\", return_type=\"csv\"\n",
42 | ")\n",
43 | "\n",
44 | "html_output, time_info = ap.extract_tables(\n",
45 | " file_path=\"./sample_data/test_1figure_1table.png\", return_type=\"html\"\n",
46 | ")"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 8,
52 | "metadata": {},
53 | "outputs": [
54 | {
55 | "name": "stdout",
56 | "output_type": "stream",
57 | "text": [
58 | "CPU times: user 3 μs, sys: 1 μs, total: 4 μs\n",
59 | "Wall time: 5.96 μs\n"
60 | ]
61 | }
62 | ],
63 | "source": [
64 | "time"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 10,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "data": {
74 | "text/markdown": [
75 | "0,1,2\n",
76 | ",latency,(ms)\n",
77 | "participants,mean,99th percentile\n",
78 | "1,17.0 +1.4,75.0 34.9\n",
79 | "2,24.5 +2.5,87.6 35.9\n",
80 | "5,31.5 +6.2,104.5 52.2\n",
81 | "10,30.0 +3.7,95.6 25.4\n",
82 | "25,35.5 +5.6,100.4 42.7\n",
83 | "50,42.7 +4.1,93.7 22.9\n",
84 | "100,71.4 +7.6,131.2 +17.6\n",
85 | "200,150.5 +11.0,320.3 35.1\n"
86 | ],
87 | "text/plain": [
88 | ""
89 | ]
90 | },
91 | "metadata": {},
92 | "output_type": "display_data"
93 | },
94 | {
95 | "data": {
96 | "text/markdown": [
97 | "\n",
98 | "\n",
99 | " | latency | (ms) |
\n",
100 | "participants | mean | 99th percentile |
\n",
101 | "1 | 17.0 +1.4 | 75.0 34.9 |
\n",
102 | "2 | 24.5 +2.5 | 87.6 35.9 |
\n",
103 | "5 | 31.5 +6.2 | 104.5 52.2 |
\n",
104 | "10 | 30.0 +3.7 | 95.6 25.4 |
\n",
105 | "25 | 35.5 +5.6 | 100.4 42.7 |
\n",
106 | "50 | 42.7 +4.1 | 93.7 22.9 |
\n",
107 | "100 | 71.4 +7.6 | 131.2 +17.6 |
\n",
108 | "200 | 150.5 +11.0 | 320.3 35.1 |
\n",
109 | "\n",
110 | "
\n",
111 | "\n"
112 | ],
113 | "text/plain": [
114 | ""
115 | ]
116 | },
117 | "metadata": {},
118 | "output_type": "display_data"
119 | }
120 | ],
121 | "source": [
122 | "if isinstance(csv_output, list):\n",
123 | " csv_output_str = \"\\n\".join(csv_output)\n",
124 | "else:\n",
125 | " csv_output_str = csv_output\n",
126 | "\n",
127 | "display(Markdown(csv_output_str))\n",
128 | "display(Markdown(html_output))"
129 | ]
130 | }
131 | ],
132 | "metadata": {
133 | "kernelspec": {
134 | "display_name": "any",
135 | "language": "python",
136 | "name": "python3"
137 | },
138 | "language_info": {
139 | "codemirror_mode": {
140 | "name": "ipython",
141 | "version": 3
142 | },
143 | "file_extension": ".py",
144 | "mimetype": "text/x-python",
145 | "name": "python",
146 | "nbconvert_exporter": "python",
147 | "pygments_lexer": "ipython3",
148 | "version": "3.10.16"
149 | }
150 | },
151 | "nbformat": 4,
152 | "nbformat_minor": 2
153 | }
154 |
--------------------------------------------------------------------------------
/examples/parse_batch_api.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Anyparser Batch API Example"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
17 | "# !pip3 install --upgrade ipython\n",
18 | "# !pip3 install --upgrade any-parser"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "### Step1: Batch API Folder Processing Upload"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 1,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "import json\n",
35 | "import os\n",
36 | "from datetime import datetime\n",
37 | "\n",
38 | "from dotenv import load_dotenv\n",
39 | "\n",
40 | "from any_parser import AnyParser"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 2,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "# Load environment variables\n",
50 | "load_dotenv(override=True)\n",
51 | "\n",
52 | "# Get API key and create parser\n",
53 | "api_key = os.environ.get(\"CAMBIO_API_KEY\")\n",
54 | "if not api_key:\n",
55 | " raise ValueError(\"CAMBIO_API_KEY is not set\")\n",
56 | "ap = AnyParser(api_key)"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "Create Batch Request"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 3,
69 | "metadata": {},
70 | "outputs": [
71 | {
72 | "name": "stdout",
73 | "output_type": "stream",
74 | "text": [
75 | "Upload responses saved to: ./sample_data_20250103003352.jsonl\n"
76 | ]
77 | }
78 | ],
79 | "source": [
80 | "# Upload folder for batch processing\n",
81 | "WORKING_FOLDER = \"./sample_data\"\n",
82 | "responses = ap.batches.create(WORKING_FOLDER)\n",
83 | "\n",
84 | "# Save responses to JSONL file with timestamp\n",
85 | "timestamp = datetime.now().strftime(\"%Y%m%d%H%M%S\")\n",
86 | "output_file = f\"./sample_data_{timestamp}.jsonl\"\n",
87 | "\n",
88 | "with open(output_file, \"w\") as f:\n",
89 | " for response in responses:\n",
90 | " f.write(json.dumps(response.model_dump()) + \"\\n\")\n",
91 | "\n",
92 | "print(f\"Upload responses saved to: {output_file}\")"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "Check the first element status in the jsonl using the requestId"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 4,
105 | "metadata": {},
106 | "outputs": [
107 | {
108 | "name": "stdout",
109 | "output_type": "stream",
110 | "text": [
111 | "Checking status for file: Earnings-Presentation-Q2-2024.pdf\n",
112 | "Content not yet available\n"
113 | ]
114 | }
115 | ],
116 | "source": [
117 | "# Get first response from the JSONL file\n",
118 | "with open(output_file, \"r\") as f:\n",
119 | " first_response = json.loads(f.readline())\n",
120 | "\n",
121 | "request_id = first_response[\"requestId\"]\n",
122 | "print(f\"Checking status for file: {first_response['fileName']}\")\n",
123 | "\n",
124 | "# Retrieve status using request ID\n",
125 | "markdown = ap.batches.retrieve(request_id)\n",
126 | "if markdown and markdown.result:\n",
127 | " print(\"Content retrieved successfully\")\n",
128 | "else:\n",
129 | " print(\"Content not yet available\")"
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "metadata": {},
135 | "source": [
136 | "Note: Batch extraction is currently in beta testing. Processing time may take up to 2 hours to complete."
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {},
142 | "source": [
143 | "After 2 hours, you can check the content of the first file in the folder again"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 6,
149 | "metadata": {},
150 | "outputs": [
151 | {
152 | "name": "stdout",
153 | "output_type": "stream",
154 | "text": [
155 | "Content retrieved successfully\n"
156 | ]
157 | }
158 | ],
159 | "source": [
160 | "# Retrieve status using request ID\n",
161 | "markdown = ap.batches.retrieve(request_id)\n",
162 | "if markdown and markdown.result:\n",
163 | " print(\"Content retrieved successfully\")\n",
164 | "else:\n",
165 | " print(\"Content not yet available\")"
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "metadata": {},
171 | "source": [
172 | "### Step2: Batch API folder fetch response\n"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 16,
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "import json\n",
182 | "import logging\n",
183 | "import os\n",
184 | "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
185 | "\n",
186 | "from dotenv import load_dotenv\n",
187 | "\n",
188 | "from any_parser import AnyParser\n",
189 | "\n",
190 | "# Configure logging\n",
191 | "logging.basicConfig(level=logging.INFO)\n",
192 | "logger = logging.getLogger(__name__)\n",
193 | "\n",
194 | "# Load environment variables\n",
195 | "load_dotenv(override=True)\n",
196 | "\n",
197 | "MAX_WORKER = 10"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": 17,
203 | "metadata": {},
204 | "outputs": [],
205 | "source": [
206 | "# Get API key and create parser\n",
207 | "api_key = os.environ.get(\"CAMBIO_API_KEY\")\n",
208 | "if not api_key:\n",
209 | " raise ValueError(\"CAMBIO_API_KEY is not set\")\n",
210 | "ap = AnyParser(api_key)"
211 | ]
212 | },
213 | {
214 | "cell_type": "markdown",
215 | "metadata": {},
216 | "source": [
217 | "Read responses from JSONL file"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 18,
223 | "metadata": {},
224 | "outputs": [],
225 | "source": [
226 | "# Change to your real output json from parse_batch_upload.py\n",
227 | "response_file = \"./sample_data_20250102103047.jsonl\"\n",
228 | "with open(response_file, \"r\") as f:\n",
229 | " responses = [json.loads(line) for line in f]"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": 19,
235 | "metadata": {},
236 | "outputs": [],
237 | "source": [
238 | "# Process responses concurrently\n",
239 | "def process_response(response):\n",
240 | " \"\"\"Process a single response by retrieving markdown content\"\"\"\n",
241 | " request_id = response[\"requestId\"]\n",
242 | " try:\n",
243 | " markdown = ap.batches.retrieve(request_id)\n",
244 | " if markdown and markdown.result:\n",
245 | " response[\"result\"] = [markdown.result[0] if markdown.result else \"\"]\n",
246 | " response[\"requestStatus\"] = \"COMPLETED\"\n",
247 | " response[\"completionTime\"] = markdown.completionTime\n",
248 | " except Exception as e:\n",
249 | " logger.error(f\"Error processing {request_id}: {str(e)}\")\n",
250 | " response[\"error\"] = [str(e)]\n",
251 | " return response"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": 20,
257 | "metadata": {},
258 | "outputs": [
259 | {
260 | "name": "stdout",
261 | "output_type": "stream",
262 | "text": [
263 | "Updated all responses in ./sample_data_20250102103047.jsonl with markdown content\n"
264 | ]
265 | }
266 | ],
267 | "source": [
268 | "# Process responses concurrently\n",
269 | "with ThreadPoolExecutor(max_workers=MAX_WORKER) as executor:\n",
270 | " future_to_response = {\n",
271 | " executor.submit(process_response, response): response\n",
272 | " for response in responses\n",
273 | " }\n",
274 | "\n",
275 | " updated_responses = []\n",
276 | " for future in as_completed(future_to_response):\n",
277 | " updated_response = future.result()\n",
278 | " updated_responses.append(updated_response)\n",
279 | "\n",
280 | "# Write all updated responses back to file\n",
281 | "with open(response_file, \"w\") as f:\n",
282 | " for response in updated_responses:\n",
283 | " f.write(json.dumps(response) + \"\\n\")\n",
284 | "\n",
285 | "print(f\"Updated all responses in {response_file} with markdown content\")"
286 | ]
287 | },
288 | {
289 | "cell_type": "markdown",
290 | "metadata": {},
291 | "source": [
292 | "Print out the first row from the updated file"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": 21,
298 | "metadata": {},
299 | "outputs": [
300 | {
301 | "name": "stdout",
302 | "output_type": "stream",
303 | "text": [
304 | "First row from updated file:\n",
305 | "{\n",
306 | " \"fileName\": \"Earnings-Presentation-Q2-2024.pdf\",\n",
307 | " \"requestId\": \"cfb556cb-e5f9-4b6c-a2f7-6ba982858a92\",\n",
308 | " \"requestStatus\": \"COMPLETED\",\n",
309 | " \"result\": [\n",
310 | " \"## Meta Earnings Presentation\\n## Q2 2024\\n\\ninvestor.fb.com Meta logo, consisting of a stylized infinity symbol next to the text \\\"Meta\\\"\"\n",
311 | " ],\n",
312 | " \"completionTime\": \"2025-01-02T04:34:56.494827+00:00\"\n",
313 | "}\n"
314 | ]
315 | }
316 | ],
317 | "source": [
318 | "# Read and print first row from the updated file\n",
319 | "with open(response_file, \"r\") as f:\n",
320 | " first_row = json.loads(f.readline())\n",
321 | " print(\"First row from updated file:\")\n",
322 | " print(json.dumps(first_row, indent=2))"
323 | ]
324 | },
325 | {
326 | "cell_type": "markdown",
327 | "metadata": {},
328 | "source": [
329 | "## End of the notebook\n",
330 | "\n",
331 | "Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n",
332 | "\n",
333 | "\n",
334 | "
\n",
335 | ""
336 | ]
337 | }
338 | ],
339 | "metadata": {
340 | "kernelspec": {
341 | "display_name": "any-parse",
342 | "language": "python",
343 | "name": "python3"
344 | },
345 | "language_info": {
346 | "codemirror_mode": {
347 | "name": "ipython",
348 | "version": 3
349 | },
350 | "file_extension": ".py",
351 | "mimetype": "text/x-python",
352 | "name": "python",
353 | "nbconvert_exporter": "python",
354 | "pygments_lexer": "ipython3",
355 | "version": "3.10.15"
356 | }
357 | },
358 | "nbformat": 4,
359 | "nbformat_minor": 2
360 | }
361 |
--------------------------------------------------------------------------------
/examples/parse_pdf2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 15,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n",
10 | "# !pip3 install --upgrade ipython\n",
11 | "# !pip3 install --upgrade any-parser"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 16,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "from IPython.display import display, Markdown\n",
21 | "from any_parser import AnyParser\n",
22 | "import os\n",
23 | "from dotenv import load_dotenv"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 17,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "load_dotenv(override=True)\n",
33 | "example_apikey = os.getenv(\"CAMBIO_API_KEY\")"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 18,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "ap = AnyParser(example_apikey)\n",
43 | "\n",
44 | "# Define extract_args as a dictionary with your desired parameters\n",
45 | "extract_args = {\n",
46 | " \"vqa_figures_flag\": True,\n",
47 | " \"vqa_charts_flag\": True\n",
48 | "}\n",
49 | "\n",
50 | "# Pass extract_args to the parse method\n",
51 | "markdown_output, time = ap.parse(\n",
52 | " file_path=\"./sample_data/Earnings-Presentation-Q2-2024.pdf\",\n",
53 | " extract_args=extract_args\n",
54 | ")"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 19,
60 | "metadata": {},
61 | "outputs": [
62 | {
63 | "data": {
64 | "text/plain": [
65 | "'Time Elapsed: 23.25 seconds'"
66 | ]
67 | },
68 | "execution_count": 19,
69 | "metadata": {},
70 | "output_type": "execute_result"
71 | }
72 | ],
73 | "source": [
74 | "time"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 20,
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "data": {
84 | "text/markdown": [
85 | "Meta Earnings Presentation Q2 2024 \n",
86 | "\n",
87 | "investor.fb.com\n",
88 | "\n",
89 | " Meta logo, consisting of an infinity symbol followed by the text \"Meta\"\n",
90 | "\n",
91 | "Revenue by User Geography Meta logo \n",
92 | "\n",
93 | "In Millions\n",
94 | "\n",
95 | " \n",
96 | "| Quarter | US & Canada | Europe | Asia-Pacific | Rest of World | Total |\n",
97 | "|---|---|---|---|---|---|\n",
98 | "| Q2'24 | 16,847 | 9,300 | 7,888 | 5,036 | 39,071 |\n",
99 | "| Q1'24 | 15,824 | 8,483 | 7,481 | 4,667 | 36,455 |\n",
100 | "| Q4'23 | 18,585 | 9,441 | 7,512 | 4,573 | 40,111 |\n",
101 | "| Q3'23 | 15,190 | 7,777 | 6,928 | 4,251 | 34,146 |\n",
102 | "| Q2'23 | 14,422 | 7,323 | 6,515 | 3,739 | 31,999 |\n",
103 | "| Q1'23 | 13,048 | 6,345 | 5,960 | 3,292 | 28,645 |\n",
104 | "| Q4'22 | 15,636 | 7,050 | 6,050 | 3,429 | 32,165 |\n",
105 | "| Q3'22 | 13,035 | 5,797 | 5,782 | 3,100 | 27,714 |\n",
106 | "| Q2'22 | 13,249 | 6,452 | 5,797 | 3,213 | 28,822 |\n",
107 | "\n",
108 | "This stacked bar chart shows the revenue by user geography for Meta from Q2'22 to Q2'24. The revenue is divided into four categories: US & Canada, Europe, Asia-Pacific, and Rest of World. The total revenue for each quarter is shown at the top of each bar.\n",
109 | " \n",
110 | "\n",
111 | "Our revenue by user geography is geographically apportioned based on our estimation of the geographic location of our users when they perform a revenue-generating activity. This allocation differs from our revenue disaggregated by geography disclosure in our condensed consolidated financial statements where revenue is geographically apportioned based on the addresses of our customers.\n",
112 | "\n",
113 | " 3\n",
114 | "\n",
115 | "Segment Results Meta logo \n",
116 | "\n",
117 | "In Millions\n",
118 | "\n",
119 | " \n",
120 | "| | Q2'22 | Q3'22 | Q4'22 | Q1'23 | Q2'23 | Q3'23 | Q4'23 | Q1'24 | Q2'24 |\n",
121 | "|---|---|---|---|---|---|---|---|---|---|\n",
122 | "| Advertising | $ 28,152 | $ 27,237 | $ 31,254 | $ 28,101 | $ 31,498 | $ 33,643 | $ 38,706 | $ 35,635 | $ 38,329 |\n",
123 | "| Other | 218 | 192 | 184 | 205 | 225 | 293 | 334 | 380 | 389 |\n",
124 | "| Family of Apps Revenue | 28,370 | 27,429 | 31,438 | 28,306 | 31,723 | 33,936 | 39,040 | 36,015 | 38,718 |\n",
125 | "| Reality Labs Revenue | 452 | 285 | 727 | 339 | 276 | 210 | 1,071 | 440 | 353 |\n",
126 | "| Total Revenue | $ 28,822 | $ 27,714 | $ 32,165 | $ 28,645 | $ 31,999 | $ 34,146 | $ 40,111 | $ 36,455 | $ 39,071 |\n",
127 | "| Family of Apps Operating Income | $ 11,164 | $ 9,336 | $ 10,678 | $ 11,219 | $ 13,131 | $ 17,490 | $ 21,030 | $ 17,664 | $ 19,335 |\n",
128 | "| Reality Labs Operating (Loss) | (2,806) | (3,672) | (4,279) | (3,992) | (3,739) | (3,742) | (4,646) | (3,846) | (4,488) |\n",
129 | "| Total Income from Operations | $ 8,358 | $ 5,664 | $ 6,399 | $ 7,227 | $ 9,392 | $ 13,748 | $ 16,384 | $ 13,818 | $ 14,847 |\n",
130 | "| Operating Margin | 29% | 20% | 20% | 25% | 29% | 40% | 41% | 38% | 38% |\n",
131 | " \n",
132 | "\n",
133 | "We report our financial results based on two reportable segments: Family of Apps (FoA) and Reality Labs (RL). FoA includes Facebook, Instagram, Messenger, WhatsApp, and other services. RL includes our virtual, augmented, and mixed reality related consumer hardware, software, and content.\n",
134 | "\n",
135 | " 4\n",
136 | "\n",
137 | "Net Income Meta logo \n",
138 | "\n",
139 | "In Millions\n",
140 | "\n",
141 | " \n",
142 | "| Quarter | Net Income |\n",
143 | "|---|---|\n",
144 | "| Q2'22 | $6,687 |\n",
145 | "| Q3'22 | $4,395 |\n",
146 | "| Q4'22 | $4,652 |\n",
147 | "| Q1'23 | $5,709 |\n",
148 | "| Q2'23 | $7,788 |\n",
149 | "| Q3'23 | $11,583 |\n",
150 | "| Q4'23 | $14,017 |\n",
151 | "| Q1'24 | $12,369 |\n",
152 | "| Q2'24 | $13,465 |\n",
153 | "\n",
154 | "This bar chart shows the Net Income in millions for Meta from Q2'22 to Q2'24. The y-axis ranges from $0 to $14,017 million, with increments of $1,000 million. The highest net income was $14,017 million in Q4'23, while the lowest was $4,395 million in Q3'22.\n",
155 | " \n",
156 | "\n",
157 | " 7\n",
158 | "\n",
159 | "Diluted Earnings Per Share Meta logo \n",
160 | "\n",
161 | " \n",
162 | "| Quarter | Earnings Per Share |\n",
163 | "|---|---|\n",
164 | "| Q2'22 | $2.46 |\n",
165 | "| Q3'22 | $1.64 |\n",
166 | "| Q4'22 | $1.76 |\n",
167 | "| Q1'23 | $2.20 |\n",
168 | "| Q2'23 | $2.98 |\n",
169 | "| Q3'23 | $4.39 |\n",
170 | "| Q4'23 | $5.33 |\n",
171 | "| Q1'24 | $4.71 |\n",
172 | "| Q2'24 | $5.16 |\n",
173 | "\n",
174 | "This bar chart shows the Diluted Earnings Per Share for Meta from Q2'22 to Q2'24. The y-axis ranges from $1.64 to $5.33, with increments of $0.02. The chart demonstrates an overall increasing trend in earnings per share over the period, with the highest point in Q4'23 at $5.33 and the lowest in Q3'22 at $1.64.\n",
175 | " \n",
176 | "\n",
177 | " 8\n",
178 | "\n",
179 | "Limitations of Key Metrics and Other Data Meta logo \n",
180 | "\n",
181 | "To calculate our estimates of DAP, we currently use a series of machine learning models that are developed based on internal reviews of limited samples of user accounts and calibrated against user survey data. We apply significant judgment in designing these models and calculating these estimates. For example, to match user accounts within individual products and across multiple products, we use data signals such as similar device information, IP addresses, and user names. We also calibrate our models against data from periodic user surveys of varying sizes and frequency across our products, which survey questions are based on monthly usage, and which are inherently subject to error. The timing and results of such user surveys have in the past contributed, and may in the future contribute, to changes in our reported Family metrics from period to period. In addition, our data limitations may affect our understanding of certain details of our business and increase the risk of error for our Family metrics estimates. Our techniques and models rely on a variety of data signals from different products, and we rely on more limited data signals for some products compared to others. For example, as a result of limited visibility into encrypted products, we have fewer data signals from WhatsApp user accounts and primarily rely on phone numbers and device information to match WhatsApp user accounts with accounts on our other products. Any loss of access to data signals we use in our process for calculating Family metrics, whether as a result of our own product decisions, actions by third-party browser or mobile platforms, regulatory or legislative requirements, or other factors, also may impact the stability or accuracy of our reported Family metrics, as well as our ability to report these metrics at all. Our estimates of Family metrics also may change as our methodologies evolve, including through the application of new data signals or technologies, product changes, or other improvements in our user surveys, algorithms, or machine learning that may improve our ability to match accounts within and across our products or otherwise evaluate the broad population of our users. In addition, such evolution may allow us to identify previously undetected violating accounts (as defined below).\n",
182 | "\n",
183 | "We regularly evaluate our Family metrics to estimate the percentage of our DAP consisting solely of \"violating\" accounts. We define \"violating\" accounts as accounts which we believe are intended to be used for purposes that violate our terms of service, including bots and spam. In the first quarter of 2024, we estimated that less than 3% of our worldwide DAP consisted solely of violating accounts. Such estimation is based on an internal review of a limited sample of accounts, and we apply significant judgment in making this determination. For example, we look for account information and behaviors associated with Facebook and Instagram accounts that appear to be inauthentic to the reviewers, but we have limited visibility into WhatsApp user activity due to encryption. In addition, if we believe an individual person has one or more violating accounts, we do not include such person in our violating accounts estimation as long as we believe they have one account that does not constitute a violating account. From time to time, we disable certain user accounts, make product changes, or take other actions to reduce the number of violating accounts among our users, which may also reduce our DAP estimates in a particular period. We intend to disclose our estimates of the percentage of our DAP consisting solely of violating accounts on an annual basis. Violating accounts are very difficult to measure at our scale, and it is possible that the actual number of violating accounts may vary significantly from our estimates.\n",
184 | "\n",
185 | "## User Geography\n",
186 | "\n",
187 | "Our estimates for revenue by user location, as well as year-over-year percentage changes in ad impressions delivered and the average price per ad by user location, are also affected by data limitations and other challenges in measuring user geography. Our data regarding the geographic location of our users is estimated based on a number of factors, such as the user's IP address and self-disclosed location. These factors may not always accurately reflect the user's actual location. For example, a user may appear to be accessing our products from the location of the proxy server that the user connects to rather than from the user's actual location. The methodologies used to measure our metrics are also susceptible to algorithm or other technical errors.\n",
188 | "\n",
189 | " 17"
190 | ],
191 | "text/plain": [
192 | ""
193 | ]
194 | },
195 | "metadata": {},
196 | "output_type": "display_data"
197 | }
198 | ],
199 | "source": [
200 | "# Join the list elements with newlines to create a single string\n",
201 | "markdown_text = '\\n\\n'.join(markdown_output)\n",
202 | "display(Markdown(markdown_text))"
203 | ]
204 | }
205 | ],
206 | "metadata": {
207 | "kernelspec": {
208 | "display_name": "any",
209 | "language": "python",
210 | "name": "python3"
211 | },
212 | "language_info": {
213 | "codemirror_mode": {
214 | "name": "ipython",
215 | "version": 3
216 | },
217 | "file_extension": ".py",
218 | "mimetype": "text/x-python",
219 | "name": "python",
220 | "nbconvert_exporter": "python",
221 | "pygments_lexer": "ipython3",
222 | "version": "3.10.15"
223 | }
224 | },
225 | "nbformat": 4,
226 | "nbformat_minor": 2
227 | }
228 |
--------------------------------------------------------------------------------
/examples/sample_data/Earnings-Presentation-Q2-2024.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/Earnings-Presentation-Q2-2024.pdf
--------------------------------------------------------------------------------
/examples/sample_data/cambioml_logo_large.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/cambioml_logo_large.png
--------------------------------------------------------------------------------
/examples/sample_data/resume_1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/resume_1.pdf
--------------------------------------------------------------------------------
/examples/sample_data/resume_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/resume_1.png
--------------------------------------------------------------------------------
/examples/sample_data/sample.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/sample.pdf
--------------------------------------------------------------------------------
/examples/sample_data/stoxx_index_guide_0003.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/stoxx_index_guide_0003.pdf
--------------------------------------------------------------------------------
/examples/sample_data/test1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test1.pdf
--------------------------------------------------------------------------------
/examples/sample_data/test2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test2.pdf
--------------------------------------------------------------------------------
/examples/sample_data/test3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test3.pdf
--------------------------------------------------------------------------------
/examples/sample_data/test3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test3.png
--------------------------------------------------------------------------------
/examples/sample_data/test_1figure_1table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_1figure_1table.png
--------------------------------------------------------------------------------
/examples/sample_data/test_invoice.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_invoice.pdf
--------------------------------------------------------------------------------
/examples/sample_data/test_medical_report.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_medical_report.jpeg
--------------------------------------------------------------------------------
/examples/sample_data/test_odf.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_odf.docx
--------------------------------------------------------------------------------
/examples/sample_data/test_odf.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_odf.pptx
--------------------------------------------------------------------------------
/examples/sample_data/test_w2.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_w2.docx
--------------------------------------------------------------------------------
/examples/sample_data/test_w2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_w2.png
--------------------------------------------------------------------------------
/examples/sample_data/test_w2.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_w2.pptx
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "any-parser"
3 | version = "0.0.24"
4 | description = "Parser for all."
5 | authors = ["CambioML "]
6 | maintainers = ["Rachel Hu "]
7 | readme = "README.md"
8 |
9 | [tool.poetry.dependencies]
10 | python = ">=3.9,<3.13"
11 | requests = "^2.25.0"
12 | python-dotenv = "^1.0.0"
13 | pydantic = "^2.10.3"
14 |
15 | [tool.poetry.group.dev.dependencies]
16 | black = "^24.8.0"
17 | isort = "^5.13.2"
18 | autoflake = "^2.3.1"
19 | pytest = "^8.3.3"
20 | pre-commit = "^4.0.1"
21 |
22 | [tool.poetry.group.optional.dependencies]
23 | Levenshtein = [
24 | { version = "0.25.1", python = "<3.9" },
25 | { version = "0.26.0", python = ">=3.9" }
26 | ]
27 |
28 | [build-system]
29 | requires = ["poetry-core"]
30 | build-backend = "poetry.core.masonry.api"
31 |
--------------------------------------------------------------------------------
/run_tests.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | python -m unittest discover tests -v
3 |
--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
1 | # Testing
2 | Overview of running tests for the AnyParser SDK. These tests should be run before submitting any pull request.
3 |
4 | These tests are written using the unittest framework in Python. The tests are located in the `tests/test.py` file. Test data is located in the `tests/test_data.py` file.
5 |
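Each entry in `EXTRACT_JSON_TEST_DATA` (defined in `tests/test_data.py`) pairs a sample file with an extraction instruction and the expected key/value result. A condensed sketch of one entry's shape, trimmed down from that file:

```python
# One entry from EXTRACT_JSON_TEST_DATA (condensed; the real entries list more fields)
{
    "working_file": "./examples/sample_data/test1.pdf",  # input document
    "extract_instruction": {
        "first_name": "the first name of the employee",  # field -> natural-language instruction
    },
    "correct_output": {
        "first_name": ["Jesan"],  # expected extracted value(s)
    },
}
```
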
6 | ## Setup
7 | 1. Install the required packages by running the following command:
8 | ```bash
9 | poetry install
10 | ```
11 | The development packages that get installed are listed under the `[tool.poetry.group.dev.dependencies]` section of `pyproject.toml`.
12 |
13 | 2. Add a `.env` file in the `tests` folder with the following content:
14 | ```bash
15 | CAMBIO_API_KEY=*************
16 | ```
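
The test suite reads this key through `python-dotenv` (see `tests/test.py`, which calls `load_dotenv(override=True)` at import time). A minimal sanity check you can run from the repo root to confirm the key is picked up; the explicit `tests/.env` path below is only for this standalone snippet, since the tests locate the file automatically:

```python
import os

from dotenv import load_dotenv

# Load the key from the .env file created above.
load_dotenv("tests/.env", override=True)

if not os.environ.get("CAMBIO_API_KEY"):
    raise ValueError("CAMBIO_API_KEY is not set")
print("CAMBIO_API_KEY loaded")
```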
17 |
18 | ## Pre-commit
19 | This project uses pre-commit to run checks before committing code. To initialize `pre-commit` for this repo, run the following command:
20 | ```bash
21 | pre-commit install
22 | ```
23 |
24 | Now, with every commit, the checks will run automatically on the files added to the commit. The checks include:
25 | - `black` for code formatting
26 | - `flake8` for linting
27 | - `isort` for import sorting
28 | - running the unit tests in `tests/test.py`
29 |
30 | If you want to run the checks manually, you can run the following command:
31 | ```bash
32 | pre-commit run --all-files
33 | ```
34 |
35 | ## Running Tests Manually
36 | 1. Make sure you are in the project root folder.
37 | 2. Run the following command:
38 | ```bash
39 | ./run_tests.sh
40 | ```
41 |
42 | If you just want to run an individual test within the test.py file, you can run the following command:
43 | ```bash
44 | python -m unittest -k <test_name>
45 | ```
46 |
47 | For example, if you want to run `test_pdf_sync_extract`, you can run the following command:
48 | ```bash
49 | python -m unittest -k test_pdf_sync_extract
50 | ```
51 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/tests/__init__.py
--------------------------------------------------------------------------------
/tests/outputs/correct_docx_output.txt:
--------------------------------------------------------------------------------
1 | ## Test document
2 |
3 | Here is an example chart:
4 |
5 |
6 | | Investor Metrics | FY23 Q1 | FY23 Q2 | FY23 Q3 | FY23 Q4 | FY24 Q1 |
7 | |---|---|---|---|---|---|
8 | | Office Commercial products and cloud services revenue growth (y/y) | 7% / 13% | 7% / 14% | 13% / 17% | 12% / 14% | 15% / 14% |
9 | | Office Consumer products and cloud services revenue growth (y/y) | 7% / 11% | (2)% / 3% | 1% / 4% | 3% / 6% | 3% / 4% |
10 | | Office 365 Commercial seat growth (a/o) | 14% | 12% | 11% | 11% | 10% |
11 | | Microsoft 365 Consumer subscribers (in millions) | 65.1 | 67.7 | 70.8 | 74.9 | 76.7 |
12 | | Dynamics products and cloud services revenue growth (y/y) | 15% / 22% | 13% / 20% | 17% / 21% | 19% / 21% | 22% / 21% |
13 | | LinkedIn revenue growth (y/y) | 17% / 21% | 10% / 14% | 8% / 11% | 6% / 8% | 8% |
14 |
15 |
16 | Growth rates include non-GAAP CC growth (GAAP % / CC %)
--------------------------------------------------------------------------------
/tests/outputs/correct_pdf_output.txt:
--------------------------------------------------------------------------------
1 | STOXX INDEX METHODOLOGY GUIDE
2 |
3 | ## CONTENTS
4 |
5 | 6.5.1. OVERVIEW 49
6 | 6.5.2. INDEX REVIEW 49
7 | 6.5.3. ONGOING MAINTENANCE 51
8 |
9 | 7. STOXX BENCHMARK INDICES (BMI) 52
10 |
11 | 7.1. STOXX GLOBAL INDICES 52
12 | 7.1.1. OVERVIEW 52
13 | 7.1.2. INDEX REVIEW 53
14 | 7.1.3. ONGOING MAINTENANCE 55
15 |
16 | 7.2. STOXX GLOBAL 1800 AND DERIVED INDICES 56
17 | 7.2.1. OVERVIEW 56
18 | 7.2.2. INDEX REVIEW 56
19 | 7.2.3. ONGOING MAINTENANCE 58
20 |
21 | 7.3. SIZE INDICES BASED ON THE STOXX GLOBAL INDICES 60
22 | 7.3.1. OVERVIEW 60
23 | 7.3.2. INDEX REVIEW 60
24 | 7.3.3. ONGOING MAINTENANCE 62
25 |
26 | 7.4. SECTOR INDICES BASED ON THE STOXX GLOBAL INDICES 63
27 | 7.4.1. OVERVIEW 63
28 | 7.4.2. INDEX REVIEW 63
29 | 7.4.3. ONGOING MAINTENANCE 64
30 |
31 | 7.5. STOXX EUROPE 600 AND EURO STOXX SUPERSECTOR INDICES: 30% / 15% CAPS 65
32 | 7.5.1. OVERVIEW 65
33 | 7.5.2. INDEX REVIEW 65
34 | 7.5.3. ONGOING MAINTENANCE 66
35 |
36 | 7.6. STOXX REGIONAL REAL ESTATE INDICES: 20% CAPS67
37 | 7.6.1. OVERVIEW 67
38 | 7.6.2. INDEX REVIEW 67
39 | 7.6.3. ONGOING MAINTENANCE 67
40 |
41 | 7.7. STOXX EMERGING MARKETS 800 LO 68
42 | 7.7.1. OVERVIEW 68
43 | 7.7.2. INDEX REVIEW 68
44 | 7.7.3. ONGOING MAINTENANCE 68
45 |
46 | 7.8. STOXX INDUSTRY AND SUPERSECTOR LEGACY INDICES 70
47 | 7.8.1. OVERVIEW 70
48 | 7.8.2. INDEX REVIEW 71
49 | 7.8.3. ONGOING MAINTENANCE 71
50 |
51 | 7.9. EURO STOXX SUPERSECTOR 5/10/40 INDICES 72
52 | 7.9.1. OVERVIEW 72
53 | 7.9.2. INDEX REVIEW 72
54 | 7.9.3. ONGOING MAINTENANCE 73
55 |
56 | 7.10. STOXX EUROPE 600 INDUSTRY 30-15 INDICES 74
57 | 7.10.1. OVERVIEW 74
58 | 7.10.2. INDEX REVIEW 74
59 | 7.10.3. ONGOING MAINTENANCE 75
60 |
61 | 7.11. STOXX SEMICONDUCTOR 30 INDEX 76
62 | 7.11.1. OVERVIEW 76
63 | 7.11.2. INDEX REVIEW 76
64 | 7.11.3. ONGOING MAINTENANCE 77
65 |
66 | ## 8. STOXX EQUAL WEIGHT INDICES 78
67 |
68 | 8.1. STOXX EQUAL WEIGHT INDICES 78
69 | 8.1.1. OVERVIEW 78
70 | 8.1.2. INDEX REVIEW 78
71 | 8.1.3. ONGOING MAINTENANCE 78
72 |
73 | ## 9. STOXX BLUE-CHIP INDICES 80
74 |
75 | 9.1. STOXX GLOBAL AND COUNTRY BLUE-CHIP INDICES 80
76 | 9.1.1. OVERVIEW 80
77 | 9.1.2. INDEX REVIEW 81
78 | 9.1.3. ONGOING MAINTENANCE 84
79 |
80 | 9.2. EURO STOXX 50 85
81 | 9.2.1. OVERVIEW 85
82 | 9.2.2. INDEX REVIEW 85
83 | 9.2.3. ONGOING MAINTENANCE 86
84 |
85 | 9.3. STOXX REGIONAL BLUE-CHIP INDICES 88
86 | 9.3.1. OVERVIEW 88
87 | 9.3.2. INDEX REVIEW 88
88 | 9.3.3. ONGOING MAINTENANCE 89
89 |
90 | 9.4. STOXX GLOBAL 150 91
91 | 9.4.1. OVERVIEW 91
92 | 9.4.2. INDEX REVIEW 91
93 | 9.4.3. ONGOING MAINTENANCE 91
94 |
95 | 9.5. STOXX BALKAN 50 EQUAL WEIGHT 92
96 | 9.5.1. OVERVIEW 92
97 | 9.5.2. INDEX REVIEW 92
98 | 9.5.3. ONGOING MAINTENANCE 93
99 |
100 | 9.6. STOXX CANADA 60 94
101 | 9.6.1. OVERVIEW 94
102 | 9.6.2. INDEX REVIEW 94
103 | 9.6.3. ONGOING MAINTENANCE 95
104 |
105 | ## 10. STOXX DIVIDEND INDICES 96
106 |
107 | 10.1. STOXX SELECT DIVIDEND INDICES 96
108 | 10.1.1. OVERVIEW 96
109 | 10.1.2. INDEX REVIEW 96
110 | 10.1.3. STOXX SELECT DIVIDEND INDICES 99
111 | 10.1.4. ONGOING MAINTENANCE 101
112 |
113 | 10.2. STOXX ASEAN-FIVE SELECT DIVIDEND 50 104
114 | 10.2.1. OVERVIEW 104
115 | 10.2.2. INDEX REVIEW 104
116 | 10.2.3. ONGOING MAINTENANCE 105
117 |
118 | 10.3. STOXX ASEAN SELECT DIVIDEND 30 106
119 |
120 | 3/529
121 |
122 | Part of DEUTSCHE BÖRSE GROUP
--------------------------------------------------------------------------------
/tests/outputs/correct_png_output.txt:
--------------------------------------------------------------------------------
1 | | Investor Metrics | FY23 Q1 | FY23 Q2 | FY23 Q3 | FY23 Q4 | FY24 Q1 |
2 | |---|---|---|---|---|---|
3 | | Office Commercial products and cloud services revenue growth (y/y) | 7% / 13% | 7% / 14% | 13% / 17% | 12% / 14% | 15% / 14% |
4 | | Office Consumer products and cloud services revenue growth (y/y) | 7% / 11% | (2)% / 3% | 1% / 4% | 3% / 6% | 3% / 4% |
5 | | Office 365 Commercial seat growth (y/y) | 14% | 12% | 11% | 11% | 10% |
6 | | Microsoft 365 Consumer subscribers (in millions) | 65.1 | 67.7 | 70.8 | 74.9 | 76.7 |
7 | | Dynamics products and cloud services revenue growth (y/y) | 15% / 22% | 13% / 20% | 17% / 21% | 19% / 21% | 22% / 21% |
8 | | LinkedIn revenue growth (y/y) | 17% / 21% | 10% / 14% | 8% / 11% | 6% / 8% | 8% |
--------------------------------------------------------------------------------
/tests/outputs/correct_pptx_output.txt:
--------------------------------------------------------------------------------
1 | ## Test finical report
2 | ## Title
3 |
4 | • Chart 1 example
5 |
6 |
7 | | Investor Metrics | FY23 Q1 | FY23 Q2 | FY23 Q3 | FY23 Q4 | FY24 Q1 |
8 | |---|---|---|---|---|---|
9 | | Office Commercial products and cloud services revenue growth (y/y) | 7% / 13% | 7% / 14% | 13% / 17% | 12% / 14% | 15% / 14% |
10 | | Office Consumer products and cloud services revenue growth (y/y) | 7% / 11% | (2)% / 3% | 1% / 4% | 3% / 6% | 3% / 4% |
11 | | Office 365 Commercial seat growth (y/y) | 14% | 12% | 11% | 11% | 10% |
12 | | Microsoft 365 Consumer subscribers (in millions) | 65.1 | 67.7 | 70.8 | 74.9 | 76.7 |
13 | | Dynamics products and cloud services revenue growth (y/y) | 15% / 22% | 13% / 20% | 17% / 21% | 19% / 21% | 22% / 21% |
14 | | LinkedIn revenue growth (y/y) | 17% / 21% | 10% / 14% | 8% / 11% | 6% / 8% | 8% |
15 |
16 |
17 | Growth rates include non-GAAP CC growth (GAAP % / CC %).
18 | Thanks
--------------------------------------------------------------------------------
/tests/test.py:
--------------------------------------------------------------------------------
1 | """Testing Synchronous and Asynchronous Extraction"""
2 |
3 | import base64
4 | import os
5 | import sys
6 | import time
7 | import unittest
8 | from pathlib import Path
9 |
10 | import Levenshtein
11 | from dotenv import load_dotenv
12 |
13 | from tests.test_data import EXTRACT_JSON_TEST_DATA
14 |
15 | sys.path.append(".")
16 | load_dotenv(override=True)
17 | from any_parser import AnyParser # noqa: E402
18 |
19 |
20 | def get_ground_truth(file_path: str) -> str:
21 | """Get the ground truth from the file."""
22 | with open(file_path, "r", encoding="utf-8") as file:
23 | return file.read()
24 |
25 |
26 | def _preprocess_markdown_text(text: str) -> str:
27 | """Clean the markdown text."""
28 | return text.replace("#", "").replace("\n", "")
29 |
30 |
31 | def compare_markdown(generated_output: str, correct_output: str) -> float:
32 | """
33 | Compare the generated markdown to the correct markdown using
34 | Levenshtein Distance.
35 | """
36 | # Preprocess both outputs to clean markdown text
37 | generated_output = _preprocess_markdown_text(generated_output)
38 | correct_output = _preprocess_markdown_text(correct_output)
39 |
40 | distance = Levenshtein.distance(generated_output, correct_output)
41 |
42 | max_len = max(len(generated_output), len(correct_output))
43 | similarity_percentage = ((max_len - distance) / max_len) * 100
44 |
45 | return similarity_percentage
46 |
47 |
48 | class TestAnyParser(unittest.TestCase):
49 | """Testing Any Parser"""
50 |
51 | def setUp(self):
52 | self.api_key = os.environ.get("CAMBIO_API_KEY")
53 | if not self.api_key:
54 | raise ValueError("CAMBIO_API_KEY is not set")
55 | self.ap = AnyParser(self.api_key)
56 |
57 | def test_pdf_sync_parse(self):
58 | """Synchronous PDF Parse"""
59 | working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf"
60 | correct_output_file = "./tests/outputs/correct_pdf_output.txt"
61 |
62 | # extract
63 | markdown_list, elapsed_time = self.ap.parse(file_path=working_file)
64 | markdown = "\n".join(markdown_list)
65 | self.assertFalse(markdown.startswith("Error:"), markdown)
66 | correct_output = get_ground_truth(correct_output_file)
67 | percentage = compare_markdown(markdown, correct_output)
68 |
69 | self.assertGreaterEqual(
70 | percentage, 90, f"Output similarity too low: {percentage:.2f}%"
71 | )
72 | self.assertIn("Time Elapsed", elapsed_time)
73 |
74 | def test_pdf_sync_parse_with_file_content(self):
75 | """Synchronous PDF Parse with file content"""
76 | working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf"
77 | correct_output_file = "./tests/outputs/correct_pdf_output.txt"
78 |
79 | with open(working_file, "rb") as file:
80 | file_content = base64.b64encode(file.read()).decode("utf-8")
81 | file_type = Path(working_file).suffix.lower().lstrip(".")
82 |
83 | # extract
84 | markdown_list, elapsed_time = self.ap.parse(
85 | file_content=file_content, file_type=file_type
86 | )
87 | markdown = "\n".join(markdown_list)
88 |
89 | self.assertFalse(markdown.startswith("Error:"), markdown)
90 | correct_output = get_ground_truth(correct_output_file)
91 | percentage = compare_markdown(markdown, correct_output)
92 |
93 | self.assertGreaterEqual(
94 | percentage, 90, f"Output similarity too low: {percentage:.2f}%"
95 | )
96 | self.assertIn("Time Elapsed", elapsed_time)
97 |
98 | def test_pdf_async_parse_and_fetch(self):
99 | """Asynchronous PDF Parse and Fetch"""
100 | working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf"
101 | correct_output_file = "./tests/outputs/correct_pdf_output.txt"
102 |
103 | # extract
104 | file_id = self.ap.async_parse(file_path=working_file)
105 | self.assertFalse(file_id.startswith("Error:"), file_id)
106 | # fetch
107 | markdown_list = self.ap.async_fetch(file_id=file_id)
108 | markdown = "\n".join(markdown_list)
109 | self.assertFalse(markdown.startswith("Error:"), markdown)
110 | correct_output = get_ground_truth(correct_output_file)
111 | percentage = compare_markdown(markdown, correct_output)
112 |
113 | self.assertGreaterEqual(
114 | percentage, 90, f"Output similarity too low: {percentage:.2f}%"
115 | )
116 |
117 | def test_pdf_async_parse_and_fetch_with_file_content(self):
118 | """Asynchronous PDF Parse and Fetch with file content"""
119 | working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf"
120 | correct_output_file = "./tests/outputs/correct_pdf_output.txt"
121 |
122 | with open(working_file, "rb") as file:
123 | file_content = base64.b64encode(file.read()).decode("utf-8")
124 | file_type = Path(working_file).suffix.lower().lstrip(".")
125 |
126 | # extract
127 | file_id = self.ap.async_parse(file_content=file_content, file_type=file_type)
128 | self.assertFalse(file_id.startswith("Error:"), file_id)
129 | # fetch
130 | markdown_list = self.ap.async_fetch(file_id=file_id)
131 | markdown = "\n".join(markdown_list)
132 | self.assertFalse(markdown.startswith("Error:"), markdown)
133 | correct_output = get_ground_truth(correct_output_file)
134 | percentage = compare_markdown(markdown, correct_output)
135 |
136 | self.assertGreaterEqual(
137 | percentage, 90, f"Output similarity too low: {percentage:.2f}%"
138 | )
139 |
140 | def test_docx_sync_extract(self):
141 | """Synchronous Word Extraction"""
142 | working_file = "./examples/sample_data/test_odf.docx"
143 | correct_output_file = "./tests/outputs/correct_docx_output.txt"
144 |
145 | # extract
146 | markdown_list, elapsed_time = self.ap.parse(file_path=working_file)
147 | markdown = "\n".join(markdown_list)
148 | self.assertFalse(markdown.startswith("Error:"), markdown)
149 | correct_output = get_ground_truth(correct_output_file)
150 | percentage = compare_markdown(markdown, correct_output)
151 |
152 | self.assertGreaterEqual(
153 | percentage, 90, f"Output similarity too low: {percentage:.2f}%"
154 | )
155 | self.assertIn("Time Elapsed", elapsed_time)
156 |
157 | def test_docx_async_parse_and_fetch(self):
158 | """Asynchronous Word Parse and Fetch"""
159 | working_file = "./examples/sample_data/test_odf.docx"
160 | correct_output_file = "./tests/outputs/correct_docx_output.txt"
161 |
162 | # extract
163 | file_id = self.ap.async_parse(file_path=working_file)
164 | self.assertFalse(file_id.startswith("Error:"), file_id)
165 | # fetch
166 | markdown_list = self.ap.async_fetch(file_id=file_id)
167 | markdown = "\n".join(markdown_list)
168 | self.assertFalse(markdown.startswith("Error:"), markdown)
169 | correct_output = get_ground_truth(correct_output_file)
170 | percentage = compare_markdown(markdown, correct_output)
171 |
172 | self.assertGreaterEqual(
173 | percentage, 90, f"Output similarity too low: {percentage:.2f}%"
174 | )
175 |
176 | def test_pptx_sync_extract(self):
177 | """Synchronous Powerpoint Extraction"""
178 | working_file = "./examples/sample_data/test_odf.pptx"
179 | correct_output_file = "./tests/outputs/correct_pptx_output.txt"
180 |
181 | # extract
182 | markdown_list, elapsed_time = self.ap.parse(file_path=working_file)
183 | markdown = "\n".join(markdown_list)
184 | self.assertFalse(markdown.startswith("Error:"), markdown)
185 | correct_output = get_ground_truth(correct_output_file)
186 | percentage = compare_markdown(markdown, correct_output)
187 |
188 | self.assertGreaterEqual(
189 | percentage, 90, f"Output similarity too low: {percentage:.2f}%"
190 | )
191 | self.assertIn("Time Elapsed", elapsed_time)
192 |
193 | def test_pptx_async_parse_and_fetch(self):
194 | """Asynchronous Powerpoint Parse and Fetch"""
195 | working_file = "./examples/sample_data/test_odf.pptx"
196 | correct_output_file = "./tests/outputs/correct_pptx_output.txt"
197 |
198 | # extract
199 | file_id = self.ap.async_parse(file_path=working_file)
200 | self.assertFalse(file_id.startswith("Error:"), file_id)
201 | # fetch
202 | markdown_list = self.ap.async_fetch(file_id=file_id)
203 | markdown = "\n".join(markdown_list)
204 | self.assertFalse(markdown.startswith("Error:"), markdown)
205 | correct_output = get_ground_truth(correct_output_file)
206 | percentage = compare_markdown(markdown, correct_output)
207 |
208 | self.assertGreaterEqual(
209 | percentage, 90, f"Output similarity too low: {percentage:.2f}%"
210 | )
211 |
212 | def test_image_sync_extract(self):
213 | """Synchronous Image Extraction"""
214 | working_file = "./examples/sample_data/test3.png"
215 | correct_output_file = "./tests/outputs/correct_png_output.txt"
216 |
217 | # extract
218 | markdown_list, elapsed_time = self.ap.parse(file_path=working_file)
219 | markdown = "\n".join(markdown_list)
220 | self.assertFalse(markdown.startswith("Error:"), markdown)
221 | correct_output = get_ground_truth(correct_output_file)
222 | percentage = compare_markdown(markdown, correct_output)
223 |
224 | self.assertGreaterEqual(
225 | percentage, 90, f"Output similarity too low: {percentage:.2f}%"
226 | )
227 | self.assertIn("Time Elapsed", elapsed_time)
228 |
229 | def test_image_async_parse_and_fetch(self):
230 | """Asynchronous Image Parse and Fetch"""
231 | working_file = "./examples/sample_data/test3.png"
232 | correct_output_file = "./tests/outputs/correct_png_output.txt"
233 |
234 | # extract
235 | file_id = self.ap.async_parse(file_path=working_file)
236 | self.assertFalse(file_id.startswith("Error:"), file_id)
237 | # fetch
238 | markdown_list = self.ap.async_fetch(file_id=file_id)
239 | markdown = "\n".join(markdown_list)
240 | self.assertFalse(markdown.startswith("Error:"), markdown)
241 | correct_output = get_ground_truth(correct_output_file)
242 | percentage = compare_markdown(markdown, correct_output)
243 |
244 | self.assertGreaterEqual(
245 | percentage, 90, f"Output similarity too low: {percentage:.2f}%"
246 | )
247 |
248 | def test_sync_extract_key_value(self):
249 | """
250 | Synchronous JSON Extraction with subtests for different file formats
251 | """
252 | for data in EXTRACT_JSON_TEST_DATA:
253 | with self.subTest(working_file=data["working_file"]):
254 | # extract
255 | key_value_result, elapsed_time = self.ap.extract_key_value(
256 | file_path=data["working_file"],
257 | extract_instruction=data["extract_instruction"],
258 | )
259 |
260 | # assertions
261 | self.assertEqual(key_value_result, data["correct_output"])
262 | self.assertIn("Time Elapsed", elapsed_time)
263 |
264 | def test_async_extract_key_value_and_fetch(self):
265 | """
266 | Asynchronous JSON Extraction with subtests for different file formats
267 | """
268 | for data in EXTRACT_JSON_TEST_DATA:
269 | with self.subTest(working_file=data["working_file"]):
270 | # extract
271 | file_id = self.ap.async_extract_key_value(
272 | file_path=data["working_file"],
273 | extract_instruction=data["extract_instruction"],
274 | )
275 | self.assertFalse(file_id.startswith("Error:"), file_id)
276 | # fetch
277 | key_value_result = self.ap.async_fetch(file_id=file_id)
278 | # assertions
279 | self.assertEqual(key_value_result, data["correct_output"])
280 | # wait 1 s between requests
281 | time.sleep(1)
282 |
283 |
284 | if __name__ == "__main__":
285 | unittest.main(verbosity=2)
286 |
--------------------------------------------------------------------------------
/tests/test_batch_api.py:
--------------------------------------------------------------------------------
1 | """Testing Batch API Extraction"""
2 |
3 | import os
4 | import sys
5 | import unittest
6 |
7 | from dotenv import load_dotenv
8 |
9 | sys.path.append(".")
10 | load_dotenv(override=True)
11 | from any_parser import AnyParser # noqa: E402
12 |
13 |
14 | class TestAnyParserBatchAPI(unittest.TestCase):
15 | """Testing Any Parser Batch API"""
16 |
17 | def setUp(self):
18 | self.api_key = os.environ.get("CAMBIO_API_KEY")
19 | if not self.api_key:
20 | raise ValueError("CAMBIO_API_KEY is not set")
21 | self.ap = AnyParser(self.api_key)
22 |
23 | def test_batch_api_create(self):
24 | """Batch API Create"""
25 | working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf"
26 |
27 | response = self.ap.batches.create(working_file)
28 |
29 | self.assertIsNotNone(response)
30 | self.assertEqual(response.requestStatus, "UPLOADED")
31 |
32 | request_id = response.requestId
33 | status = self.ap.batches.retrieve(request_id)
34 | self.assertEqual(status.requestStatus, "UPLOADED")
35 |
36 | quota = self.ap.batches.get_usage()
37 | self.assertGreaterEqual(quota.pageRemaining, 0)
38 |
--------------------------------------------------------------------------------
/tests/test_data.py:
--------------------------------------------------------------------------------
1 | EXTRACT_JSON_TEST_DATA = [
2 | {
3 | "working_file": "./examples/sample_data/test1.pdf",
4 | "extract_instruction": {
5 | "social_security_number": "the social security number of the employee",
6 | "ein": "the employer identification number",
7 | "first_name": "the first name of the employee",
8 | "last_name": "the last name of the employee",
9 | },
10 | "correct_output": {
11 | "social_security_number": ["758-58-5787"],
12 | "ein": ["78-8778788"],
13 | "first_name": ["Jesan"],
14 | "last_name": ["Rahaman"],
15 | },
16 | },
17 | # {
18 | # "working_file": "./examples/sample_data/test_w2.pptx",
19 | # "extract_instruction": {
20 | # "social_security_number": "the social security number of the employee",
21 | # "ein": "the employer identification number",
22 | # "first_name": "the first name of the employee",
23 | # "last_name": "the last name of the employee",
24 | # },
25 | # "correct_output": [
26 | # {
27 | # "social_security_number": ["758-58-5787"],
28 | # "ein": ["78-8778788"],
29 | # "first_name": ["Jesan"],
30 | # "last_name": ["Rahaman"],
31 | # }
32 | # ],
33 | # },
34 | # {
35 | # "working_file": "./examples/sample_data/test_w2.docx",
36 | # "extract_instruction": {
37 | # "social_security_number": "the social security number of the employee",
38 | # "ein": "the employer identification number",
39 | # "first_name": "the first name of the employee",
40 | # "last_name": "the last name of the employee",
41 | # },
42 | # "correct_output": [
43 | # {
44 | # "social_security_number": ["758-58-5787"],
45 | # "ein": ["78-8778788"],
46 | # "first_name": ["Jesan"],
47 | # "last_name": ["Rahaman"],
48 | # }
49 | # ],
50 | # },
51 | {
52 | "working_file": "./examples/sample_data/test_w2.png",
53 | "extract_instruction": {
54 | "social_security_number": "the social security number of the employee",
55 | "ein": "the employer identification number",
56 | "first_name": "the first name of the employee",
57 | "last_name": "the last name of the employee",
58 | },
59 | "correct_output": {
60 | "social_security_number": ["758-58-5787"],
61 | "ein": ["78-8778788"],
62 | "first_name": ["Jesan"],
63 | "last_name": ["Rahaman"],
64 | },
65 | },
66 | ]
67 |
--------------------------------------------------------------------------------