├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug-report.yml │ ├── config.yml │ ├── documentation.yml │ └── feature-request.yml ├── pull_request_template.md └── workflows │ ├── pr_agent.yml │ └── python-app.yml ├── .gitignore ├── .pre-commit-config.yaml ├── README.md ├── any_parser ├── __init__.py ├── any_parser.py ├── async_parser.py ├── base_parser.py ├── batch_parser.py ├── constants.py ├── sync_parser.py └── utils.py ├── examples ├── async_extract_key_value_img.ipynb ├── async_extract_key_value_pdf.ipynb ├── async_extract_pii.ipynb ├── async_extract_resume_key_value.ipynb ├── async_extract_tables.ipynb ├── async_parse_pdf.ipynb ├── async_parse_pdf2.ipynb ├── async_parse_with_layout.ipynb ├── async_parse_with_ocr.ipynb ├── extract_key_value_img.ipynb ├── extract_key_value_pdf.ipynb ├── extract_pii.ipynb ├── extract_resume_key_value.ipynb ├── extract_tables.ipynb ├── parse_batch_api.ipynb ├── parse_docx.ipynb ├── parse_img.ipynb ├── parse_pdf.ipynb ├── parse_pdf2.ipynb └── sample_data │ ├── Earnings-Presentation-Q2-2024.pdf │ ├── cambioml_logo_large.png │ ├── resume_1.pdf │ ├── resume_1.png │ ├── sample.pdf │ ├── stoxx_index_guide_0003.pdf │ ├── test1.pdf │ ├── test2.pdf │ ├── test3.pdf │ ├── test3.png │ ├── test_1figure_1table.png │ ├── test_invoice.pdf │ ├── test_medical_report.jpeg │ ├── test_odf.docx │ ├── test_odf.pptx │ ├── test_w2.docx │ ├── test_w2.png │ └── test_w2.pptx ├── pyproject.toml ├── run_tests.sh └── tests ├── README.md ├── __init__.py ├── outputs ├── correct_docx_output.txt ├── correct_pdf_output.txt ├── correct_png_output.txt └── correct_pptx_output.txt ├── test.py ├── test_batch_api.py └── test_data.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Default codeowners/reviewers for all code changes 2 | * @CambioML @Sdddell @goldmermaid @lingjiekong @boqiny 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.yml: -------------------------------------------------------------------------------- 1 | name: 🐛 Bug Report 2 | description: Create a report to help us reproduce and fix the bug 3 | 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: > 8 | #### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/CambioML/any-parser/issues?q=is%3Aissue+sort%3Acreated-desc+). 9 | - type: textarea 10 | attributes: 11 | label: 🐛 Describe the bug 12 | description: | 13 | Please provide a clear and concise description of what the bug is. 14 | 15 | If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example: 16 | 17 | ```python 18 | ... 19 | ``` 20 | 21 | If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com. 22 | 23 | Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````. 
24 | placeholder: | 25 | A clear and concise description of what the bug is. 26 | 27 | ```python 28 | # Sample code to reproduce the problem 29 | ``` 30 | 31 | ``` 32 | The error message you got, with the full traceback. 33 | ``` 34 | validations: 35 | required: true 36 | - type: textarea 37 | attributes: 38 | label: Versions 39 | description: | 40 | Please run the following and paste the output below. 41 | ```sh 42 | wget https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py 43 | # For security purposes, please check the contents of collect_env.py before running it. 44 | python collect_env.py 45 | ``` 46 | validations: 47 | required: true 48 | - type: markdown 49 | attributes: 50 | value: > 51 | Thanks for contributing 🎉! 52 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: Questions 4 | url: https://cambiomlworkspace.slack.com/join/shared_invite/zt-1zes33rmt-20Rag043uvExUaUdvt5_xQ#/shared-invite/email 5 | about: Ask questions and discuss with other CambioML community members 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to https://www.cambioml.com/docs/any-parser/index.html 3 | 4 | body: 5 | - type: textarea 6 | attributes: 7 | label: 📚 The doc issue 8 | description: > 9 | A clear and concise description of what content in https://www.cambioml.com/docs/any-parser/index.html is an issue. 10 | validations: 11 | required: true 12 | - type: textarea 13 | attributes: 14 | label: Suggest a potential alternative/fix 15 | description: > 16 | Tell us how we could improve the documentation in this regard. 17 | - type: markdown 18 | attributes: 19 | value: > 20 | Thanks for contributing 🎉! 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Submit a proposal/request for a new any-parser feature 3 | 4 | body: 5 | - type: textarea 6 | attributes: 7 | label: 🚀 The feature, motivation and pitch 8 | description: > 9 | A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. 10 | validations: 11 | required: true 12 | - type: textarea 13 | attributes: 14 | label: Alternatives 15 | description: > 16 | A description of any alternative solutions or features you've considered, if any. 17 | - type: textarea 18 | attributes: 19 | label: Additional context 20 | description: > 21 | Add any other context or screenshots about the feature request. 22 | - type: markdown 23 | attributes: 24 | value: > 25 | Thanks for contributing 🎉! 
26 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | 4 | ## Related Issue 5 | 6 | 7 | ## Type of Change 8 | 9 | 10 | - [ ] Bug fix (non-breaking change which fixes an issue) 11 | - [ ] New feature (non-breaking change which adds functionality) 12 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) 13 | - [ ] Documentation update 14 | - [ ] Code refactoring 15 | - [ ] Performance improvement 16 | 17 | ## How Has This Been Tested? 18 | 19 | 20 | ## Screenshots (if applicable) 21 | 22 | 23 | ## Checklist 24 | 25 | 26 | - [ ] My code follows the project's style guidelines 27 | - [ ] I have performed a self-review of my own code 28 | - [ ] I have commented my code, particularly in hard-to-understand areas 29 | - [ ] I have made corresponding changes to the documentation 30 | - [ ] My changes generate no new warnings 31 | - [ ] I have added tests that prove my fix is effective or that my feature works 32 | - [ ] New and existing unit tests pass locally with my changes 33 | 34 | ## Additional Notes 35 | 36 | -------------------------------------------------------------------------------- /.github/workflows/pr_agent.yml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: 3 | types: [opened, reopened, ready_for_review] 4 | issue_comment: 5 | jobs: 6 | pr_agent_job: 7 | if: ${{ github.event.sender.type != 'Bot' }} 8 | runs-on: ubuntu-latest 9 | permissions: 10 | issues: write 11 | pull-requests: write 12 | contents: write 13 | name: Run pr agent on every pull request, respond to user comments 14 | steps: 15 | - name: PR Agent action step 16 | id: pragent 17 | uses: Codium-ai/pr-agent@main 18 | env: 19 | OPENAI_KEY: ${{ secrets.OPENAI_API_KEY }} 20 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 21 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | name: Python application 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | permissions: 10 | contents: read 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.10"] 19 | max-parallel: 1 # Ensures the tests run sequentially 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install flake8 31 | pip install black 32 | pip install isort 33 | python -m pip install poetry 34 | poetry install --no-root # This will install the project dependencies defined in pyproject.toml 35 | - name: Lint with flake8 36 | run: | 37 | # stop the build if there are Python syntax errors or undefined names 38 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 39 | # exit-zero treats all errors as warnings. 40 | flake8 . --count --exit-zero --max-complexity=10 --statistics 41 | - name: Format code with Black 42 | run: | 43 | black . --exclude="" --check --verbose 44 | - name: Sort imports with isort 45 | run: | 46 | isort . 
--profile=black --check-only --verbose 47 | - name: Test with unittest 48 | env: 49 | CAMBIO_API_KEY: ${{ secrets.CAMBIO_API_KEY }} 50 | run: | 51 | poetry run python -m unittest discover -v tests 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # mac 163 | .DS_Store 164 | 165 | # vscode 166 | .vscode/ 167 | 168 | # data/ 169 | *.xlsx 170 | *.csv 171 | *.jsonl -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 24.8.0 4 | hooks: 5 | - id: black 6 | args: [--exclude=""] 7 | 8 | # this is not technically always safe but usually is 9 | # use comments `# isort: off` and `# isort: on` to disable/re-enable isort 10 | - repo: https://github.com/pycqa/isort 11 | rev: 5.13.2 12 | hooks: 13 | - id: isort 14 | args: [--profile=black] 15 | 16 | # this is slightly dangerous because python imports have side effects 17 | # and this tool removes unused imports, which may be providing 18 | # necessary side effects for the code to run 19 | - repo: https://github.com/PyCQA/autoflake 20 | rev: v2.3.1 21 | hooks: 22 | - id: autoflake 23 | args: 24 | - "--in-place" 25 | - "--expand-star-imports" 26 | - "--remove-duplicate-keys" 27 | - "--remove-unused-variables" 28 | - "--remove-all-unused-imports" 29 | exclude: "any-parser/__init__.py" 30 | 31 | # run all unittests 32 | - repo: local 33 | hooks: 34 | - id: unittests 35 | name: unittests 36 | entry: ./run_tests.sh 37 | language: script 38 | pass_filenames: false 39 | # Optional: Specify types of files that trigger this hook 40 | # types: [python] 41 | # Optional: Specify files or directories to exclude 42 | # exclude: '^docs/' 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🌊 AnyParser 2 |

3 | [pypi_status badge] 4 | [Commit activity badge] 5 | [Slack badge] 6 |

7 | 8 | **AnyParser** provides an API to accurately extract unstructured data (e.g., PDFs, images, charts) into a structured format. 9 | 10 | ## :seedling: Set up your AnyParser API key 11 | 12 | To get started, generate your API key from the [Sandbox Account Page](https://www.cambioml.com/account). Each account comes with **100 free pages**. 13 | 14 | > ⚠️ **Note:** The free API is limited to 10 pages/call. 15 | 16 | For more information or to inquire about larger usage plans, feel free to contact us at info@cambioml.com. 17 | 18 | To set up your API key (`CAMBIO_API_KEY`), follow these steps: 19 | 1. Create a `.env` file in the root directory of your project. 20 | 2. Add the following line to the `.env` file: 21 | ``` 22 | CAMBIO_API_KEY=0cam************************ 23 | ``` 24 | 25 | 26 | ## :computer: Installation 27 | ### 1. Set Up a New Conda Environment and Install AnyParser 28 | First, create and activate a new Conda environment, then install AnyParser: 29 | ```bash 30 | conda create -n any-parse python=3.10 -y 31 | conda activate any-parse 32 | pip3 install any-parser 33 | ``` 34 | ### 2. Create an AnyParser Instance Using Your API Key 35 | Use your API key to create an instance of AnyParser. Make sure you’ve set up your .env file to store your API key securely: 36 | ```python 37 | import os 38 | from dotenv import load_dotenv 39 | from any_parser import AnyParser 40 | 41 | # Load environment variables 42 | load_dotenv(override=True) 43 | 44 | # Get the API key from the environment 45 | example_apikey = os.getenv("CAMBIO_API_KEY") 46 | 47 | # Create an AnyParser instance 48 | ap = AnyParser(api_key=example_apikey) 49 | ``` 50 | 51 | ### 3. Run Synchronous Extraction 52 | To extract data synchronously and receive immediate results: 53 | ```python 54 | # Extract content from the file and get the markdown output along with processing time 55 | markdown, total_time = ap.parse(file_path="./data/test.pdf") 56 | ``` 57 | 58 | ### 4. Run Asynchronous Extraction 59 | For asynchronous extraction, send the file for processing and fetch results later: 60 | ```python 61 | # Send the file to begin asynchronous extraction 62 | file_id = ap.async_parse(file_path="./data/test.pdf") 63 | 64 | # Fetch the extracted content using the file ID 65 | markdown = ap.async_fetch(file_id=file_id) 66 | ``` 67 | 68 | ### 5. 
Run Batch Extraction (Beta) 69 | For batch extraction, send the file to begin processing and fetch results later: 70 | ```python 71 | # Send the file to begin batch extraction 72 | response = ap.batches.create(file_path="./data/test.pdf") 73 | request_id = response.requestId 74 | 75 | # Fetch the extracted content using the request ID 76 | markdown = ap.batches.retrieve(request_id) 77 | ``` 78 | 79 | Batch API for folder input: 80 | ```python 81 | # Send the folder to begin batch extraction 82 | WORKING_FOLDER = "./sample_data" 83 | # This will generate a jsonl with filename and requestID 84 | response = ap.batches.create(WORKING_FOLDER) 85 | ``` 86 | 87 | Each response in the JSONL file contains: 88 | - The filename 89 | - A unique request ID 90 | - Additional processing metadata 91 | 92 | You can later use these request IDs to retrieve the extracted content for each file: 93 | 94 | ```python 95 | # Fetch the extracted content using the request ID from the jsonl file 96 | markdown = ap.batches.retrieve(request_id) 97 | ``` 98 | For more details about code implementation of batch API, refer to 99 | [examples/parse_batch_upload.py](examples/parse_batch_upload.py) and [examples/parse_batch_fetch.py](examples/parse_batch_fetch.py) 100 | 101 | > ⚠️ **Note:** Batch extraction is currently in beta testing. Processing time may take up to 12 hours to complete. 102 | > 103 | > ⚠️ **Important:** API keys generated from cambioml.com do not automatically have batch processing permissions. Please contact info@cambioml.com to request batch processing access for your API key. 104 | 105 | ## :scroll: Examples 106 | Check out these examples to see how you can utilize **AnyParser** to extract text, numbers, and symbols in fewer than 10 lines of code! 107 | 108 | ### [Extract all text and layout from PDF into Markdown Format](https://github.com/CambioML/any-parser/blob/rt-migration/examples/pdf_to_markdown.ipynb) 109 | Are you an AI engineer looking to **accurately** extract both the text and layout (e.g., table of contents or Markdown headers hierarchy) from a PDF? Check out this [3-minute notebook demo](https://github.com/CambioML/any-parser/blob/rt-migration/examples/pdf_to_markdown.ipynb). 110 | 111 | ### [Extract a Table from an Image into Markdown Format](https://github.com/CambioML/any-parser/blob/rt-migration/examples/image_to_markdown.ipynb) 112 | Are you a financial analyst needing to **accurately** extract numbers from a table within an image? Explore this [3-minute notebook example](https://github.com/CambioML/any-parser/blob/rt-migration/examples/image_to_markdown.ipynb). 
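### Retrieve Folder Batch Results (sketch)
The folder-based Batch API call in the Batch Extraction (Beta) section above returns one upload response per file. The sketch below shows one way to collect the per-file results with the same `batches` client. Field names (`fileName`, `requestId`, `requestStatus`, `result`, `error`) follow the SDK's response models; the concrete status strings (`"pending"`, `"processing"`) and the polling interval are assumptions, not documented values.

```python
import time

# Assumes `ap` is an AnyParser instance (see setup above). For a folder,
# ap.batches.create(...) returns a list of upload responses, one per file.
WORKING_FOLDER = "./sample_data"
uploads = ap.batches.create(WORKING_FOLDER)

results = {}
for upload in uploads:
    # Poll each request until it leaves the pending/processing state.
    # NOTE: the exact status strings are assumptions; check the values
    # returned for your account.
    while True:
        status = ap.batches.retrieve(upload.requestId)
        if status.requestStatus not in ("pending", "processing"):
            break
        time.sleep(60)  # batch jobs can take hours; poll sparingly

    if status.result:
        results[upload.fileName] = "\n".join(status.result)
    else:
        print(f"{upload.fileName}: no result yet, error={status.error}")
```

Because batch processing can take up to 12 hours, the upload and retrieval steps are usually run as two separate scripts (as in the batch examples referenced above) rather than in a single long-lived polling loop.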
113 | -------------------------------------------------------------------------------- /any_parser/__init__.py: -------------------------------------------------------------------------------- 1 | """AnyParser module for parsing data.""" 2 | 3 | from any_parser.any_parser import AnyParser 4 | 5 | __all__ = ["AnyParser"] 6 | 7 | __version__ = "0.0.24" 8 | -------------------------------------------------------------------------------- /any_parser/any_parser.py: -------------------------------------------------------------------------------- 1 | """AnyParser RT: Real-time parser for any data format.""" 2 | 3 | import base64 4 | import json 5 | import time 6 | import uuid 7 | from collections.abc import Iterable 8 | from io import StringIO 9 | from pathlib import Path 10 | 11 | import requests 12 | 13 | from any_parser.async_parser import AsyncParser 14 | from any_parser.batch_parser import BatchParser 15 | from any_parser.constants import ProcessType 16 | from any_parser.sync_parser import ( 17 | ExtractKeyValueSyncParser, 18 | ExtractPIISyncParser, 19 | ExtractResumeKeyValueSyncParser, 20 | ExtractTablesSyncParser, 21 | ParseSyncParser, 22 | ) 23 | from any_parser.utils import validate_file_inputs 24 | 25 | PUBLIC_SHARED_BASE_URL = "https://public-api.cambioml.com" 26 | PUBLIC_BATCH_BASE_URL = "http://batch-api.cambioml.com" 27 | TIMEOUT = 180 28 | 29 | 30 | def handle_file_processing(func): 31 | """ 32 | Decorator to handle common file processing logic for parsing 33 | and extraction operations. 34 | 35 | This decorator manages file input validation and processing, supporting 36 | either direct file content or file path inputs. It performs base64 encoding 37 | of file contents when a file path is provided. 38 | 39 | Args: 40 | func: The decorated function that performs the actual parsing or 41 | extraction. 42 | 43 | Parameters for decorated functions: 44 | file_path (str, optional): Path to the file to be processed. If 45 | provided, the file will be read and encoded in base64. 46 | file_content (str, optional): Base64-encoded content of the file. If 47 | provided, file_path will be ignored. 48 | file_type (str, optional): The file extension/type (e.g., 'pdf'). 49 | If not provided and file_path is given, it will be inferred from 50 | the file extension. 51 | *args, **kwargs: Additional arguments passed to the decorated function. 52 | 53 | Returns: 54 | tuple: A tuple containing (error_message, result), where: 55 | - error_message (str): Error message if processing fails, empty 56 | string on success 57 | - result (str): Empty string if error occurs, otherwise the 58 | processed result from func 59 | 60 | Usage: 61 | @handle_file_processing 62 | def parse(self, file_path=None, file_content=None, file_type=None): 63 | # Implementation 64 | pass 65 | 66 | Note: 67 | Either file_path or file_content must be provided, but not both. 68 | If file_path is provided, the file content will be read and encoded in 69 | base64, and file_type will be inferred from the file extension. 
70 | If file_content is provided, file_type will be validated, and a 71 | temporary file path will be generated for generating presigned url(for 72 | async parsing and extraction) 73 | """ 74 | 75 | def wrapper( 76 | self, 77 | file_path=None, 78 | file_content=None, 79 | file_type=None, 80 | *args, 81 | **kwargs, 82 | ): 83 | # pylint: disable=too-many-arguments 84 | # Validate inputs 85 | is_valid, error_message = validate_file_inputs( 86 | file_path=file_path, 87 | file_content=file_content, 88 | file_type=file_type, 89 | ) 90 | 91 | if not is_valid: 92 | return error_message, "" 93 | 94 | # Encode the file content in base64 if file_path is provided 95 | if file_path: 96 | try: 97 | with open(file_path, "rb") as file: 98 | file_content = base64.b64encode(file.read()).decode("utf-8") 99 | file_type = Path(file_path).suffix.lower().lstrip(".") 100 | except Exception as e: 101 | return f"Error: {e}", "" 102 | else: 103 | # generate a random file path for genrating presigned url 104 | file_path = f"/tmp/{uuid.uuid4()}.{file_type}" 105 | 106 | return func( 107 | self, 108 | file_path=file_path, 109 | file_content=file_content, 110 | file_type=file_type, 111 | *args, 112 | **kwargs, 113 | ) 114 | 115 | return wrapper 116 | 117 | 118 | class AnyParser: 119 | """Real-time parser for processing various data formats. 120 | 121 | Provides both synchronous and asynchronous methods for parsing and 122 | extracting information from different types of files. 123 | """ 124 | 125 | def __init__( 126 | self, 127 | api_key: str, 128 | base_url: str = PUBLIC_SHARED_BASE_URL, 129 | batch_url: str = PUBLIC_BATCH_BASE_URL, 130 | ) -> None: 131 | """Initialize AnyParser with API credentials. 132 | 133 | Args: 134 | api_key: Authentication key for API access 135 | base_url: API endpoint URL, defaults to public endpoint 136 | """ 137 | self._async_parser = AsyncParser(api_key, base_url) 138 | self._sync_parse = ParseSyncParser(api_key, base_url) 139 | self._sync_extract_key_value = ExtractKeyValueSyncParser(api_key, base_url) 140 | self._sync_extract_resume_key_value = ExtractResumeKeyValueSyncParser( 141 | api_key, base_url 142 | ) 143 | self._sync_extract_pii = ExtractPIISyncParser(api_key, base_url) 144 | self._sync_extract_tables = ExtractTablesSyncParser(api_key, base_url) 145 | self.batches = BatchParser(api_key, batch_url) 146 | 147 | @handle_file_processing 148 | def parse( 149 | self, 150 | file_path=None, 151 | file_content=None, 152 | file_type=None, 153 | extract_args=None, 154 | ): 155 | """Extract full content from a file synchronously. 156 | 157 | Args: 158 | file_path: Path to input file 159 | file_content: Base64 encoded file content 160 | file_type: File format extension 161 | extract_args: Additional extraction parameters 162 | 163 | Returns: 164 | tuple: (result, timing_info) or (error_message, "") 165 | """ 166 | return self._sync_parse.parse( 167 | file_path=file_path, 168 | file_content=file_content, 169 | file_type=file_type, 170 | extract_args=extract_args, 171 | ) 172 | 173 | @handle_file_processing 174 | def extract_pii( 175 | self, 176 | file_path=None, 177 | file_content=None, 178 | file_type=None, 179 | ): 180 | """ 181 | Extract PII data from a file synchronously. 182 | """ 183 | return self._sync_extract_pii.extract( 184 | file_path=file_path, 185 | file_content=file_content, 186 | file_type=file_type, 187 | ) 188 | 189 | @staticmethod 190 | def flatten_to_string(item): 191 | """ 192 | Flatten any iterable object to a string. 
193 | """ 194 | 195 | if isinstance(item, str): 196 | return item 197 | 198 | # if item is a dict, flatten all keys and values 199 | if isinstance(item, dict): 200 | parts = [] 201 | for k, v in item.items(): 202 | parts.append(AnyParser.flatten_to_string(k)) 203 | parts.append(AnyParser.flatten_to_string(v)) 204 | return "".join(parts) 205 | 206 | # item is other iterable objects 207 | if isinstance(item, Iterable): 208 | parts = [] 209 | for sub_item in item: 210 | parts.append(AnyParser.flatten_to_string(sub_item)) 211 | return "".join(parts) 212 | 213 | # item is not iterable objects 214 | return str(item) 215 | 216 | @handle_file_processing 217 | def extract_tables( 218 | self, 219 | file_path=None, 220 | file_content=None, 221 | file_type=None, 222 | return_type="html", 223 | ): 224 | """Extract tables from a file in real-time. 225 | 226 | Args: 227 | file_path (str): The path to the file to be parsed. 228 | return_type (str): 'html' or 'csv' 229 | Returns: 230 | tuple(str, str) 231 | """ 232 | extracted_html, time_elapsed = self._sync_extract_tables.extract( 233 | file_path=file_path, 234 | file_content=file_content, 235 | file_type=file_type, 236 | ) 237 | 238 | if isinstance(extracted_html, list): 239 | extracted_html = AnyParser.flatten_to_string(extracted_html) 240 | 241 | if return_type.lower() == "csv": 242 | try: 243 | import pandas as pd 244 | except ImportError: 245 | raise ImportError("Please install pandas to use CSV return_type") 246 | 247 | if isinstance(extracted_html, list): 248 | extracted_html = "".join(str(item) for item in extracted_html) 249 | 250 | df_list = pd.read_html(StringIO(extracted_html)) 251 | combined_df = pd.concat(df_list, ignore_index=True) 252 | csv_output = combined_df.to_csv(index=False) 253 | 254 | return csv_output, time_elapsed 255 | 256 | return extracted_html, time_elapsed 257 | 258 | @handle_file_processing 259 | def extract_key_value( 260 | self, 261 | file_path=None, 262 | file_content=None, 263 | file_type=None, 264 | extract_instruction=None, 265 | ): 266 | """Extract key-value pairs from a file in real-time. 267 | 268 | Args: 269 | file_path (str): The path to the file to be parsed. 270 | extract_instruction (Dict): A dictionary containing the keys to be 271 | extracted, with their values as the description of those keys. 272 | Returns: 273 | tuple(str, str): The extracted data and the time taken. 274 | """ 275 | return self._sync_extract_key_value.extract( 276 | file_path=file_path, 277 | file_content=file_content, 278 | file_type=file_type, 279 | extract_args={"extract_instruction": extract_instruction}, 280 | ) 281 | 282 | @handle_file_processing 283 | def extract_resume_key_value( 284 | self, file_path=None, file_content=None, file_type=None 285 | ): 286 | """Extract resume in real-time. 287 | 288 | Args: 289 | file_path (str): The path to the file to be parsed. 290 | Returns: 291 | tuple(str, str): The extracted data and the time taken. 
292 | extracted data includes: 293 | - "education": Education 294 | - "work_experience": Work Experience 295 | - "personal_info": Personal Information 296 | - "skills": Skills 297 | - "certifications": Certifications 298 | - "projects": Projects 299 | - "pii": Personally Identifiable Information - includes 300 | only name, email, and phone 301 | """ 302 | return self._sync_extract_resume_key_value.extract( 303 | file_path=file_path, 304 | file_content=file_content, 305 | file_type=file_type, 306 | ) 307 | 308 | # Example of decorated methods: 309 | @handle_file_processing 310 | def async_parse( 311 | self, 312 | file_path=None, 313 | file_content=None, 314 | file_type=None, 315 | extract_args=None, 316 | ): 317 | """Extract full content from a file asynchronously.""" 318 | return self._async_parser.send_async_request( 319 | process_type=ProcessType.PARSE, 320 | file_path=file_path, # type: ignore 321 | file_content=file_content, # type: ignore 322 | extract_args=extract_args, 323 | ) 324 | 325 | @handle_file_processing 326 | def async_parse_with_layout( 327 | self, file_path=None, file_content=None, file_type=None 328 | ): 329 | """Extract content from a file asynchronously with layout analysis.""" 330 | return self._async_parser.send_async_request( 331 | process_type=ProcessType.PARSE_WITH_LAYOUT, 332 | file_path=file_path, # type: ignore 333 | file_content=file_content, # type: ignore 334 | ) 335 | 336 | @handle_file_processing 337 | def async_parse_with_ocr(self, file_path=None, file_content=None, file_type=None): 338 | """Extract full content from a file asynchronously with OCR.""" 339 | return self._async_parser.send_async_request( 340 | process_type=ProcessType.PARSE_WITH_OCR, 341 | file_path=file_path, # type: ignore 342 | file_content=file_content, # type: ignore 343 | ) 344 | 345 | @handle_file_processing 346 | def async_extract_pii( 347 | self, 348 | file_path=None, 349 | file_content=None, 350 | file_type=None, 351 | extract_args=None, 352 | ): 353 | """Extract PII from a file asynchronously.""" 354 | return self._async_parser.send_async_request( 355 | process_type=ProcessType.EXTRACT_PII, 356 | file_path=file_path, # type: ignore 357 | file_content=file_content, # type: ignore 358 | extract_args=extract_args, 359 | ) 360 | 361 | @handle_file_processing 362 | def async_extract_tables(self, file_path=None, file_content=None, file_type=None): 363 | """Extract tables from a file asynchronously.""" 364 | return self._async_parser.send_async_request( 365 | process_type=ProcessType.EXTRACT_TABLES, 366 | file_path=file_path, # type: ignore 367 | file_content=file_content, # type: ignore 368 | ) 369 | 370 | @handle_file_processing 371 | def async_extract_key_value( 372 | self, 373 | file_path=None, 374 | file_content=None, 375 | file_type=None, 376 | extract_instruction=None, 377 | ): 378 | """Extract key-value pairs from a file asynchronously.""" 379 | return self._async_parser.send_async_request( 380 | process_type=ProcessType.EXTRACT_KEY_VALUE, 381 | file_path=file_path, # type: ignore 382 | file_content=file_content, # type: ignore 383 | extract_args={"extract_instruction": extract_instruction}, 384 | ) 385 | 386 | @handle_file_processing 387 | def async_extract_resume_key_value( 388 | self, file_path=None, file_content=None, file_type=None 389 | ): 390 | """Extract resume key-value pairs from a file asynchronously.""" 391 | return self._async_parser.send_async_request( 392 | process_type=ProcessType.EXTRACT_RESUME_KEY_VALUE, 393 | file_path=file_path, # type: ignore 394 | 
file_content=file_content, # type: ignore 395 | extract_args=None, 396 | ) 397 | 398 | def async_fetch( 399 | self, 400 | file_id: str, 401 | sync: bool = True, 402 | sync_timeout: int = 180, 403 | sync_interval: int = 5, 404 | ) -> str: 405 | """Fetches extraction results asynchronously. 406 | 407 | Args: 408 | file_id (str): The ID of the file to fetch results for. 409 | sync (bool, optional): Whether to wait for the results 410 | synchronously. 411 | sync_timeout (int, optional): Maximum time to wait for results in 412 | seconds. Defaults to 180. 413 | sync_interval (int, optional): Time interval between polling 414 | attempts in seconds. Defaults to 5. 415 | 416 | Returns: 417 | str: The extracted results as a markdown string. 418 | None: If the extraction is still in progress (when sync is False). 419 | """ 420 | 421 | response = None 422 | # Create the JSON payload 423 | payload = {"file_id": file_id} 424 | if sync: 425 | start_time = time.time() 426 | while time.time() < start_time + sync_timeout: 427 | response = requests.post( 428 | self._async_parser._async_fetch_url, 429 | headers=self._async_parser._headers, 430 | data=json.dumps(payload), 431 | timeout=TIMEOUT, 432 | ) 433 | if response.status_code == 202: 434 | print("Waiting for response...") 435 | time.sleep(sync_interval) 436 | continue 437 | break 438 | else: 439 | response = requests.post( 440 | self._async_parser._async_fetch_url, 441 | headers=self._async_parser._headers, 442 | data=json.dumps(payload), 443 | timeout=TIMEOUT, 444 | ) 445 | 446 | return self._async_parser.handle_async_response(response) 447 | -------------------------------------------------------------------------------- /any_parser/async_parser.py: -------------------------------------------------------------------------------- 1 | """Asynchronous parser implementation.""" 2 | 3 | import json 4 | from pathlib import Path 5 | from typing import Dict, Optional 6 | 7 | import requests 8 | 9 | from any_parser.base_parser import BaseParser 10 | from any_parser.constants import ProcessType 11 | from any_parser.utils import upload_file_to_presigned_url 12 | 13 | TIMEOUT = 60 14 | 15 | 16 | class BasePostProcessor: 17 | def __init__(self, successor=None) -> None: 18 | self.successor = successor 19 | 20 | def process(self, json_response: Dict) -> str: 21 | if self.successor: 22 | return self.successor.process(json_response) 23 | return f"Error: Invalid JSON response: {json_response}" 24 | 25 | 26 | class ParsePostProcessor(BasePostProcessor): 27 | def process(self, json_response: Dict) -> str: 28 | if "markdown" in json_response: 29 | return json_response["markdown"] 30 | if "result" in json_response: 31 | return json_response["result"] 32 | return super().process(json_response) 33 | 34 | 35 | class KeyValuePostProcessor(BasePostProcessor): 36 | def process(self, json_response: Dict) -> str: 37 | if "json" in json_response: 38 | return json_response["json"] 39 | return super().process(json_response) 40 | 41 | 42 | class ExtractPIIPostProcessor(BasePostProcessor): 43 | def process(self, json_response: Dict) -> str: 44 | if "pii_extraction" in json_response: 45 | return json_response["pii_extraction"] 46 | return super().process(json_response) 47 | 48 | 49 | class ExtractResumeKeyValuePostProcessor(BasePostProcessor): 50 | 51 | def process(self, json_response: Dict) -> str: 52 | if "resume_extraction" in json_response: 53 | return json_response["resume_extraction"] 54 | return super().process(json_response) 55 | 56 | 57 | class AsyncParser(BaseParser): 58 | def 
__init__(self, api_key: str, base_url: str) -> None: 59 | super().__init__(api_key, base_url) 60 | self._async_upload_url = f"{self._base_url}/async/upload" 61 | self._async_fetch_url = f"{self._base_url}/async/fetch" 62 | 63 | def send_async_request( 64 | self, 65 | process_type: ProcessType, 66 | file_path: str, 67 | file_content: str, 68 | extract_args: Optional[Dict] = None, 69 | ) -> str: 70 | """Extract full content from a file asynchronously. 71 | 72 | Args: 73 | process_type (ProcessType): The type of processing to be done. 74 | file_path (str): The path to the file to be parsed. 75 | file_content (str): The content of the file to be parsed. 76 | extract_args (Optional[Dict]): Additional extraction arguments. 77 | 78 | Returns: 79 | str: The file id of the uploaded file. 80 | """ 81 | 82 | file_name = Path(file_path).name 83 | 84 | # Create the JSON payload 85 | payload = { 86 | "file_name": file_name, 87 | "process_type": process_type.value, 88 | } 89 | 90 | if extract_args is not None and isinstance(extract_args, dict): 91 | payload["extract_args"] = extract_args # type: ignore 92 | 93 | # Send the POST request 94 | response = requests.post( 95 | self._async_upload_url, 96 | headers=self._headers, 97 | data=json.dumps(payload), 98 | timeout=TIMEOUT, 99 | ) 100 | 101 | # If response successful, upload the file 102 | return upload_file_to_presigned_url(file_content, response) 103 | 104 | def handle_async_response(self, response) -> str: 105 | if response is None: 106 | return "Error: timeout, no response received" 107 | if response.status_code == 202: 108 | return "" 109 | if response.status_code == 200: 110 | extract_resume_processor = ExtractResumeKeyValuePostProcessor() 111 | key_value_processor = KeyValuePostProcessor(extract_resume_processor) 112 | extract_pii_processor = ExtractPIIPostProcessor(key_value_processor) 113 | handler = ParsePostProcessor(extract_pii_processor) 114 | try: 115 | return handler.process(response.json()) 116 | except json.JSONDecodeError: 117 | return f"Error: Invalid JSON response: {response.text}" 118 | 119 | return f"Error: {response.status_code} {response.text}" 120 | -------------------------------------------------------------------------------- /any_parser/base_parser.py: -------------------------------------------------------------------------------- 1 | """Base parser implementation.""" 2 | 3 | 4 | class BaseParser: 5 | def __init__(self, api_key: str, base_url: str) -> None: 6 | self._api_key = api_key 7 | self._base_url = base_url 8 | self._headers = { 9 | "Content-Type": "application/json", 10 | "x-api-key": self._api_key, 11 | } 12 | -------------------------------------------------------------------------------- /any_parser/batch_parser.py: -------------------------------------------------------------------------------- 1 | """Batch parser implementation.""" 2 | 3 | import logging 4 | import os 5 | from concurrent.futures import ThreadPoolExecutor, as_completed 6 | from pathlib import Path 7 | from typing import List, Optional, Union 8 | 9 | import requests 10 | from pydantic import BaseModel, Field 11 | 12 | from any_parser.base_parser import BaseParser 13 | 14 | TIMEOUT = 60 15 | MAX_WORKERS = 10 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class UploadResponse(BaseModel): 21 | """ 22 | Response from the batch upload endpoint. 23 | """ 24 | 25 | fileName: str 26 | requestId: str 27 | requestStatus: str 28 | 29 | 30 | class UsageResponse(BaseModel): 31 | """ 32 | Response from the batch usage endpoint. 
33 | """ 34 | 35 | pageLimit: int 36 | pageRemaining: int 37 | 38 | 39 | class FileStatusResponse(BaseModel): 40 | """ 41 | Response from the batch file status endpoint. 42 | """ 43 | 44 | fileName: str 45 | fileType: str 46 | requestId: str 47 | requestStatus: str 48 | uploadTime: str 49 | completionTime: Optional[str] = None 50 | result: Optional[List[str]] = Field(default_factory=list) 51 | error: Optional[List[str]] = Field(default_factory=list) 52 | 53 | 54 | class BatchParser(BaseParser): 55 | def __init__(self, api_key: str, base_url: str) -> None: 56 | super().__init__(api_key, base_url) 57 | self._file_upload_url = f"{self._base_url}/files/" 58 | self._processing_status_url = f"{self._base_url}/files/" + "{request_id}" 59 | self._usage_url = f"{self._base_url}/users/current/usage" 60 | 61 | # remove "Content-Type" from headers 62 | self._headers.pop("Content-Type") 63 | 64 | def create(self, file_path: str) -> Union[UploadResponse, List[UploadResponse]]: 65 | """Upload a single file or folder for batch processing. 66 | 67 | Args: 68 | file_path: Path to the file or folder to upload 69 | 70 | Returns: 71 | If file: Single UploadResponse object containing upload details 72 | If folder: List of UploadResponse objects for each file 73 | """ 74 | path = Path(file_path) 75 | if path.is_file(): 76 | return self._upload_single_file(path) 77 | elif path.is_dir(): 78 | return self._upload_folder(path) 79 | else: 80 | raise ValueError(f"Path {file_path} does not exist") 81 | 82 | def _upload_single_file(self, file_path: Path) -> UploadResponse: 83 | """Upload a single file for batch processing.""" 84 | if not os.path.isfile(file_path): 85 | raise FileNotFoundError(f"The file path '{file_path}' does not exist.") 86 | 87 | with open(file_path, "rb") as f: 88 | files = {"file": f} 89 | response = requests.post( 90 | self._file_upload_url, 91 | headers=self._headers, 92 | files=files, 93 | timeout=TIMEOUT, 94 | ) 95 | 96 | if response.status_code != 200: 97 | raise Exception(f"Upload failed: {response.text}") 98 | 99 | data = response.json() 100 | return UploadResponse( 101 | fileName=data["fileName"], 102 | requestId=data["requestId"], 103 | requestStatus=data["requestStatus"], 104 | ) 105 | 106 | def _upload_folder(self, folder_path: Path) -> List[UploadResponse]: 107 | """Upload all files in a folder for batch processing. 108 | 109 | Args: 110 | folder_path: Path to the folder containing files to upload 111 | 112 | Returns: 113 | List of UploadResponse objects for each uploaded file 114 | """ 115 | # Get all files in folder and subfolders 116 | files = [] 117 | for root, _, filenames in os.walk(folder_path): 118 | for filename in filenames: 119 | files.append(Path(root) / filename) 120 | 121 | # Upload files concurrently using thread pool 122 | responses = [] 123 | with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: 124 | future_to_file = { 125 | executor.submit(self._upload_single_file, file_path): file_path 126 | for file_path in files 127 | } 128 | 129 | for future in as_completed(future_to_file): 130 | file_path = future_to_file[future] 131 | try: 132 | response = future.result() 133 | responses.append(response) 134 | except Exception as e: 135 | logger.error(f"Failed to upload {file_path}: {str(e)}") 136 | 137 | return responses 138 | 139 | def retrieve(self, request_id: str) -> FileStatusResponse: 140 | """Get the processing status of a file. 
141 | 142 | Args: 143 | request_id: The ID of the file processing request 144 | 145 | Returns: 146 | FileProcessingStatus object containing status details 147 | """ 148 | response = requests.get( 149 | self._processing_status_url.format(request_id=request_id), 150 | headers=self._headers, 151 | timeout=TIMEOUT, 152 | ) 153 | 154 | if response.status_code != 200: 155 | raise Exception(f"Status check failed: {response.text}") 156 | 157 | data = response.json() 158 | return FileStatusResponse(**data) 159 | 160 | def get_usage(self) -> UsageResponse: 161 | """Get current usage information. 162 | 163 | Returns: 164 | UsageResponse object containing usage details 165 | """ 166 | response = requests.get( 167 | self._usage_url, 168 | headers=self._headers, 169 | timeout=TIMEOUT, 170 | ) 171 | 172 | if response.status_code != 200: 173 | raise Exception(f"Usage check failed: {response.text}") 174 | 175 | data = response.json() 176 | return UsageResponse( 177 | pageLimit=data["pageLimit"], pageRemaining=data["pageRemaining"] 178 | ) 179 | -------------------------------------------------------------------------------- /any_parser/constants.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class ProcessType(Enum): 5 | EXTRACT_PII = "extract_pii" 6 | EXTRACT_TABLES = "extract_tables" 7 | EXTRACT_KEY_VALUE = "extract_key_value" 8 | EXTRACT_RESUME_KEY_VALUE = "extract_resume_key_value" 9 | PARSE = "parse" 10 | PARSE_WITH_OCR = "parse_with_ocr" 11 | PARSE_WITH_LAYOUT = "parse_with_layout" 12 | -------------------------------------------------------------------------------- /any_parser/sync_parser.py: -------------------------------------------------------------------------------- 1 | """Synchronous parser implementation.""" 2 | 3 | import json 4 | import time 5 | from typing import Any, Dict, Optional, Tuple 6 | 7 | import requests 8 | 9 | from any_parser.base_parser import BaseParser 10 | 11 | TIMEOUT = 60 12 | 13 | 14 | class BaseSyncParser(BaseParser): 15 | 16 | def get_sync_response( 17 | self, 18 | url_endpoint: str, 19 | file_content: str, 20 | file_type: str, 21 | extract_args: Optional[Dict[str, Any]] = None, 22 | ) -> Tuple[Optional[requests.Response], str]: 23 | payload = { 24 | "file_content": file_content, 25 | "file_type": file_type, 26 | } 27 | if extract_args: 28 | payload["extract_args"] = extract_args # type: ignore 29 | 30 | start_time = time.time() 31 | response = requests.post( 32 | url_endpoint, 33 | headers=self._headers, 34 | data=json.dumps(payload), 35 | timeout=TIMEOUT, 36 | ) 37 | end_time = time.time() 38 | 39 | if response.status_code != 200: 40 | return None, f"Error: {response.status_code} {response.text}" 41 | 42 | return response, f"{end_time - start_time:.2f} seconds" 43 | 44 | def parse( 45 | self, 46 | file_path=None, 47 | file_content=None, 48 | file_type=None, 49 | extract_args=None, 50 | ): 51 | """Converts the given file to markdown.""" 52 | raise NotImplementedError 53 | 54 | def extract( 55 | self, 56 | file_path=None, 57 | file_content=None, 58 | file_type=None, 59 | extract_args=None, 60 | ): 61 | """Extracts information from the given file.""" 62 | raise NotImplementedError 63 | 64 | 65 | class ParseSyncParser(BaseSyncParser): 66 | """Parse parser implementation.""" 67 | 68 | def parse( 69 | self, 70 | file_path=None, 71 | file_content=None, 72 | file_type=None, 73 | extract_args=None, 74 | ): 75 | response, info = self.get_sync_response( 76 | f"{self._base_url}/parse", 77 | 
file_content=file_content, # type: ignore 78 | file_type=file_type, # type: ignore 79 | extract_args=extract_args, 80 | ) 81 | 82 | if response is None: 83 | return info, "" 84 | 85 | try: 86 | response_data = response.json() 87 | result = response_data["markdown"] 88 | return result, f"Time Elapsed: {info}" 89 | except json.JSONDecodeError: 90 | return f"Error: Invalid JSON response: {response.text}", "" 91 | 92 | 93 | class ExtractPIISyncParser(BaseSyncParser): 94 | """Extract PII parser implementation.""" 95 | 96 | def extract( 97 | self, 98 | file_path=None, 99 | file_content=None, 100 | file_type=None, 101 | extract_args=None, 102 | ): 103 | response, info = self.get_sync_response( 104 | f"{self._base_url}/extract_pii", 105 | file_content=file_content, # type: ignore 106 | file_type=file_type, # type: ignore 107 | extract_args=None, 108 | ) 109 | 110 | if response is None: 111 | return info, "" 112 | 113 | try: 114 | response_data = response.json() 115 | result = response_data["pii_extraction"] 116 | return result, f"Time Elapsed: {info}" 117 | except json.JSONDecodeError: 118 | return f"Error: Invalid JSON response: {response.text}", "" 119 | 120 | 121 | class ExtractTablesSyncParser(BaseSyncParser): 122 | """Extract tables parser implementation.""" 123 | 124 | def extract( 125 | self, 126 | file_path=None, 127 | file_content=None, 128 | file_type=None, 129 | extract_args=None, 130 | ): 131 | response, info = self.get_sync_response( 132 | f"{self._base_url}/extract_tables", 133 | file_content=file_content, # type: ignore 134 | file_type=file_type, # type: ignore 135 | extract_args=None, 136 | ) 137 | 138 | if response is None: 139 | return info, "" 140 | 141 | try: 142 | response_data = response.json() 143 | result = response_data["markdown"] 144 | return result, f"Time Elapsed: {info}" 145 | except json.JSONDecodeError: 146 | return f"Error: Invalid JSON response: {response.text}", "" 147 | 148 | 149 | class ExtractKeyValueSyncParser(BaseSyncParser): 150 | """Extract key-value parser implementation.""" 151 | 152 | def extract( 153 | self, 154 | file_path=None, 155 | file_content=None, 156 | file_type=None, 157 | extract_args=None, 158 | ): 159 | response, info = self.get_sync_response( 160 | f"{self._base_url}/extract_key_value", 161 | file_content=file_content, # type: ignore 162 | file_type=file_type, # type: ignore 163 | extract_args={"extract_instruction": extract_args}, 164 | ) 165 | 166 | if response is None: 167 | return info, "" 168 | 169 | try: 170 | response_data = response.json() 171 | result = response_data["json"] 172 | return result, f"Time Elapsed: {info}" 173 | except json.JSONDecodeError: 174 | return f"Error: Invalid JSON response: {response.text}", "" 175 | 176 | 177 | class ExtractResumeKeyValueSyncParser(BaseSyncParser): 178 | """Extract resume key-value parser implementation.""" 179 | 180 | def extract( 181 | self, 182 | file_path=None, 183 | file_content=None, 184 | file_type=None, 185 | extract_args=None, 186 | ): 187 | response, info = self.get_sync_response( 188 | f"{self._base_url}/extract_resume_key_value", 189 | file_content=file_content, # type: ignore 190 | file_type=file_type, # type: ignore 191 | extract_args=None, 192 | ) 193 | 194 | if response is None: 195 | return info, "" 196 | 197 | try: 198 | response_data = response.json() 199 | result = response_data["extraction_result"] 200 | return result, f"Time Elapsed: {info}" 201 | except json.JSONDecodeError: 202 | return f"Error: Invalid JSON response: {response.text}", "" 203 | 
-------------------------------------------------------------------------------- /any_parser/utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import json 4 | from enum import Enum 5 | from pathlib import Path 6 | from typing import Optional, Tuple 7 | 8 | import requests 9 | 10 | SUPPORTED_FILE_EXTENSIONS = [ 11 | "pdf", 12 | "doc", 13 | "docx", 14 | "ppt", 15 | "pptx", 16 | "jpg", 17 | "jpeg", 18 | "png", 19 | "gif", 20 | ] 21 | 22 | 23 | class ValidationError(Enum): 24 | MISSING_INPUTS = "Either file_content or file_path must be provided" 25 | MISSING_FILE_TYPE = "file_type must be provided when using file_content" 26 | NOT_FOUND = "File does not exist: {}" 27 | UNSUPPORTED_FILE_TYPE = "Unsupported file type: {}. Supported file types: {}" 28 | FILE_EMPTY = "File is empty: {}" 29 | FILE_TOO_LARGE = "File size exceeds maximum limit of {} MB: {}" 30 | OTHER = "{}" 31 | 32 | 33 | def validate_file_inputs( 34 | file_path: Optional[str], 35 | file_content: Optional[str], 36 | file_type: Optional[str], 37 | ) -> Tuple[bool, str]: 38 | """Validate inputs for the parser or extractor. 39 | 40 | Args: 41 | file_content (Optional[str]): Base64 encoded file content 42 | file_path (Optional[str]): Path to the file 43 | file_type (Optional[str]): File extension/type 44 | 45 | Returns: 46 | Tuple[bool, str]: (is_valid, error_message) 47 | - is_valid: True if validation passes, False otherwise 48 | - error_message: "" if validation passes, error if validation fails 49 | """ 50 | # Check if at least one input method is provided 51 | if file_content is None and file_path is None: 52 | return False, ValidationError.MISSING_INPUTS.value 53 | 54 | # Validate file_content path 55 | if file_content is not None and file_type is None: 56 | return False, ValidationError.MISSING_FILE_TYPE.value 57 | 58 | # Validate file path if provided 59 | if file_path is not None: 60 | path = Path(file_path) 61 | 62 | # Check if file exists 63 | if not path.is_file(): 64 | return False, ValidationError.NOT_FOUND.value.format(file_path) 65 | 66 | # Check if file is empty 67 | if path.stat().st_size == 0: 68 | return False, ValidationError.FILE_EMPTY.value.format(file_path) 69 | 70 | # If file_type not provided, extract it from file_path 71 | if file_type is None: 72 | file_type = path.suffix.lower().lstrip(".") 73 | 74 | # Validate file type 75 | if file_type not in SUPPORTED_FILE_EXTENSIONS: 76 | supported_types = ", ".join(sorted(SUPPORTED_FILE_EXTENSIONS)) 77 | return False, ValidationError.UNSUPPORTED_FILE_TYPE.value.format( 78 | file_type, supported_types 79 | ) 80 | 81 | return True, "" 82 | 83 | 84 | def upload_file_to_presigned_url( 85 | file_content: str, response: requests.Response, timeout: int = 10 86 | ) -> str: 87 | if response.status_code == 200: 88 | try: 89 | file_id = response.json().get("fileId") 90 | presigned_url = response.json().get("presignedUrl") 91 | 92 | # Decode base64 content 93 | decoded_content = base64.b64decode(file_content) 94 | 95 | # Create file-like object from decoded content 96 | files = {"file": ("file", io.BytesIO(decoded_content))} 97 | 98 | upload_resp = requests.post( 99 | presigned_url["url"], 100 | data=presigned_url["fields"], 101 | files=files, 102 | timeout=timeout, 103 | ) 104 | if upload_resp.status_code != 204: 105 | return f"Error: {upload_resp.status_code} {upload_resp.text}" 106 | return file_id 107 | except json.JSONDecodeError: 108 | return "Error: Invalid JSON response" 109 | else: 110 | return 
f"Error: {response.status_code} {response.text}" 111 | -------------------------------------------------------------------------------- /examples/async_extract_pii.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n", 10 | "# !pip3 install --upgrade ipython\n", 11 | "# !pip3 install --upgrade any-parser" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from any_parser import AnyParser" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "ap = AnyParser(api_key=\"...\")" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "file_path = \"./sample_data/resume_1.pdf\"\n", 39 | "file_id = ap.async_extract_pii(file_path)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 6, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "pii_info = ap.async_fetch(file_id=file_id)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 7, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "{'name': 'Gary Jiang',\n", 60 | " 'phone_number': '+1-213-725-7637',\n", 61 | " 'address': None,\n", 62 | " 'email_address': 'jiangzhehuan0105@gmail.com',\n", 63 | " 'linkedin_url': 'https://linkedin.com/in/gary-jiang',\n", 64 | " 'github_url': None,\n", 65 | " 'summary': 'Full-stack Software Engineer'}" 66 | ] 67 | }, 68 | "execution_count": 7, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "pii_info" 75 | ] 76 | } 77 | ], 78 | "metadata": { 79 | "kernelspec": { 80 | "display_name": "any", 81 | "language": "python", 82 | "name": "python3" 83 | }, 84 | "language_info": { 85 | "codemirror_mode": { 86 | "name": "ipython", 87 | "version": 3 88 | }, 89 | "file_extension": ".py", 90 | "mimetype": "text/x-python", 91 | "name": "python", 92 | "nbconvert_exporter": "python", 93 | "pygments_lexer": "ipython3", 94 | "version": "-1.-1.-1" 95 | } 96 | }, 97 | "nbformat": 4, 98 | "nbformat_minor": 2 99 | } 100 | -------------------------------------------------------------------------------- /examples/async_extract_resume_key_value.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n", 10 | "# !pip3 install --upgrade ipython\n", 11 | "# !pip3 install --upgrade any-parser" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from IPython.display import display\n", 21 | "from any_parser import AnyParser" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "ap = AnyParser(api_key=\"...\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "file_path = 
\"./sample_data/resume_1.pdf\"\n", 40 | "file_id = ap.async_extract_resume_key_value(file_path)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 4, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "Waiting for response...\n", 53 | "Waiting for response...\n", 54 | "Waiting for response...\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "json_result = ap.async_fetch(file_id=file_id)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 5, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "{'pii': {'full_name': 'GARY JIANG',\n", 71 | " 'email': 'jiangzhehuan0105@gmail.com',\n", 72 | " 'phone': '+1 (213) 725-7637'},\n", 73 | " 'education': [{'organization': 'Shenyang University of Technology',\n", 74 | " 'degree': \"Bachelor's Degree\",\n", 75 | " 'major': 'Computer Science',\n", 76 | " 'start_date': '2008-01-01',\n", 77 | " 'end_date': '2012-12-31',\n", 78 | " 'courses': None,\n", 79 | " 'achievements': None}],\n", 80 | " 'work_experience': [{'job_title': 'Full Stack Developer',\n", 81 | " 'company_name': 'VIMMERSE',\n", 82 | " 'location': None,\n", 83 | " 'start_date': '2023-06-01',\n", 84 | " 'end_date': 'present',\n", 85 | " 'job_type': None,\n", 86 | " 'summary': 'Developed an AI-powered editor web application that brings photos to life by animating images in 3D.',\n", 87 | " 'bullet_points': ['Developed robust back-end services utilizing Python (Flask/FastAPI) and Node.js (AWS Lambda) for efficient and scalable web applications',\n", 88 | " 'Built user-friendly and interactive websites using Next.js, ensuring a seamless user experience',\n", 89 | " 'Deployed and managed AWS infrastructure, including EC2 instances, S3 storage buckets, DynamoDB for NoSQL data management, and Cognito user pools for secure user authentication',\n", 90 | " 'Experienced Agile and Scrum methodologies within a fast-paced startup environment to ensure efficient project delivery and continuous improvement',\n", 91 | " 'Collaborated effectively with cross-functional teams (design, product management) to deliver projects on time, fostering a positive and collaborative work environment']},\n", 92 | " {'job_title': 'Full Stack Developer',\n", 93 | " 'company_name': 'VIKING SASQUATCH',\n", 94 | " 'location': None,\n", 95 | " 'start_date': '2023-01-01',\n", 96 | " 'end_date': '2023-06-01',\n", 97 | " 'job_type': None,\n", 98 | " 'summary': 'Developed APIs and Integrations for all of the parties that work on Real Estate Transactions.',\n", 99 | " 'bullet_points': ['Connecting Mortgage, Title, and Real Estate to solve pain points and improve automation and create efficiencies',\n", 100 | " 'Implemented a user-friendly front-end interface using Nuxt.js, ensuring a seamless user experience',\n", 101 | " 'Built backend APIs utilizing Node.js serverless functions for optimal performance',\n", 102 | " 'Managed data storage and security by implementing a MySQL database',\n", 103 | " 'Collaborated effectively within a team using Agile methodologies like sprint planning, daily standups, retrospectives to ensure project delivery and continuous improvement']},\n", 104 | " {'job_title': 'Full Stack Developer',\n", 105 | " 'company_name': 'ROX PAY SRL',\n", 106 | " 'location': None,\n", 107 | " 'start_date': '2021-12-01',\n", 108 | " 'end_date': '2022-12-31',\n", 109 | " 'job_type': None,\n", 110 | " 'summary': 'Built Fintech Software House that aims to optimize B2B payments by offering 
a systemic solution that gives value-added services in collection of payments, financial information and corporate liquidity by essentially creating a Commission Free, Open Loop, Payment Gateway system.',\n", 111 | " 'bullet_points': ['Developed front-end by using React.js and Redux, Javascript/Typescript',\n", 112 | " 'Contributed developing backend utilizing Django/Python']},\n", 113 | " {'job_title': 'Freelancer',\n", 114 | " 'company_name': 'FREELANCE',\n", 115 | " 'location': None,\n", 116 | " 'start_date': '2017-09-01',\n", 117 | " 'end_date': '2021-10-31',\n", 118 | " 'job_type': None,\n", 119 | " 'summary': 'Developed and managed many web and mobile applications while working as freelancer at Internet Dzyns LLC company.',\n", 120 | " 'bullet_points': ['Developed multiple web applications, participating in the whole process of their development: product design and estimation, code design and development, DevOps, UI/UX design, product launch and maintenance',\n", 121 | " 'Developed cross-platform mobile application using Flutter and Ionic/Angular',\n", 122 | " 'Developed NFT marketplace websites and wrote smart contracts']},\n", 123 | " {'job_title': 'Server Administrator, Java Developer',\n", 124 | " 'company_name': 'NEUSOFT',\n", 125 | " 'location': None,\n", 126 | " 'start_date': '2014-06-01',\n", 127 | " 'end_date': '2017-08-31',\n", 128 | " 'job_type': None,\n", 129 | " 'summary': 'Worked as intern and software developer after graduated university.',\n", 130 | " 'bullet_points': ['Correct analytical and reasoning skills to troubleshoot and repair server issues',\n", 131 | " 'Operating Systems & Security Software',\n", 132 | " 'Java / Spring Boot / Hibernate']}],\n", 133 | " 'personal_info': {'name': 'GARY JIANG',\n", 134 | " 'phone_number': '+1-213-725-7637',\n", 135 | " 'address': None,\n", 136 | " 'email_address': 'jiangzhehuan0105@gmail.com',\n", 137 | " 'linkedin_url': 'linkedin.com/in/gary-jiang',\n", 138 | " 'github_url': None,\n", 139 | " 'summary': None},\n", 140 | " 'skills': {'Programming Languages': ['Python',\n", 141 | " 'PHP',\n", 142 | " 'Javascript',\n", 143 | " 'Typescript',\n", 144 | " 'HTML',\n", 145 | " 'CSS'],\n", 146 | " 'Tools': ['Flask',\n", 147 | " 'Django',\n", 148 | " 'FastAPI',\n", 149 | " 'Laravel',\n", 150 | " 'Node.js',\n", 151 | " 'SQL databases',\n", 152 | " 'Next.js',\n", 153 | " 'React',\n", 154 | " 'Redux',\n", 155 | " 'Nuxt.js',\n", 156 | " 'Vue',\n", 157 | " 'AWS Lambda',\n", 158 | " 'Cognito',\n", 159 | " 'EC2',\n", 160 | " 'S3',\n", 161 | " 'DynamoDB',\n", 162 | " 'API Gateway',\n", 163 | " 'Flutter',\n", 164 | " 'Ionic',\n", 165 | " 'Angular',\n", 166 | " 'Git',\n", 167 | " 'Version Control',\n", 168 | " 'DevOps',\n", 169 | " 'CI/CD'],\n", 170 | " 'Other': ['Startup Experience',\n", 171 | " 'Adaptable',\n", 172 | " 'Resourceful',\n", 173 | " 'Prioritization',\n", 174 | " 'Hybrid Mobile App Development',\n", 175 | " 'AGILE',\n", 176 | " 'SCRUM']},\n", 177 | " 'certifications': [],\n", 178 | " 'projects': []}" 179 | ] 180 | }, 181 | "metadata": {}, 182 | "output_type": "display_data" 183 | } 184 | ], 185 | "source": [ 186 | "display(json_result)" 187 | ] 188 | } 189 | ], 190 | "metadata": { 191 | "kernelspec": { 192 | "display_name": "any", 193 | "language": "python", 194 | "name": "python3" 195 | }, 196 | "language_info": { 197 | "codemirror_mode": { 198 | "name": "ipython", 199 | "version": 3 200 | }, 201 | "file_extension": ".py", 202 | "mimetype": "text/x-python", 203 | "name": "python", 204 | "nbconvert_exporter": "python", 205 | 
"pygments_lexer": "ipython3", 206 | "version": "3.11.10" 207 | } 208 | }, 209 | "nbformat": 4, 210 | "nbformat_minor": 2 211 | } 212 | -------------------------------------------------------------------------------- /examples/async_extract_tables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n", 10 | "# !pip3 install --upgrade ipython\n", 11 | "# !pip3 install --upgrade any-parser" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from IPython.display import display, Markdown\n", 21 | "from any_parser import AnyParser\n", 22 | "import os" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "ap = AnyParser(api_key=os.getenv(\"CAMBIO_API_KEY\"))" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 4, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "file_path = \"./sample_data/sample.pdf\"\n", 41 | "file_id = ap.async_extract_tables(file_path)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 5, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "Waiting for response...\n", 54 | "Waiting for response...\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "markdown_output = ap.async_fetch(file_id=file_id)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 6, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "['\\n\\n\\n\\n\\n\\n\\n\\n
1 Overview 3 Technical information
2 Key requirements 4 Ordering information
3 Planned availability date 5Terms and conditions
3 Program number 8 Prices
3 Publications 8 Announcement countries
\\n\\n',\n", 71 | " '',\n", 72 | " '\\n\\n\\n\\n\\n
Program numberVRM Program name
5737-L70 2.8.0IBM InfoSphere Optim Data Privacy for Unstructured Data
\\n\\n',\n", 73 | " '\\n\\n\\n\\n\\n
Part number description Part number
IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte License + SW Subscription & Support 12 MonthsD2604LL
\\n\\n',\n", 74 | " '\\n\\n\\n\\n\\n\\n\\n
Part number description Part number
IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte SW Subscription & Support Reinstatement 12 Months D2605LL
IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte Annual SW Subscription & Support Renewal 12 MonthsE0QGMLL
IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte Monthly License D2608LL
\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n
Part number description Part number
IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems(R) Annual Terabyte License + SW Subscription & Support 12 Months D2606LL
IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems Annual Terabyte SW Subscription & Support Reinstatement 12 Months D2607LL
IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems Annual Terabyte Annual SW Subscription & Support Renewal 12 MonthsE0QGNLL
IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems Annual Terabyte Monthly License D2609LL
\\n\\n\\n\\n\\n\\n\\n
Program identifierLicense Information document title License Information document number
5737-L70 IBM InfoSphere Optim Data Privacy for Unstructured DataL-JERN-BFQ3KR
\\n\\n',\n", 75 | " '\\n\\n\\n\\n\\n
Program identifierLicense Information document title License Information document number
5737-L70 IBM InfoSphere Optim Data Privacy for Unstructured DataL-JERN-BFQ3KR
\\n\\n',\n", 76 | " '',\n", 77 | " '',\n", 78 | " '']" 79 | ] 80 | }, 81 | "execution_count": 6, 82 | "metadata": {}, 83 | "output_type": "execute_result" 84 | } 85 | ], 86 | "source": [ 87 | "markdown_output" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 9, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "data": { 97 | "text/markdown": [ 98 | "\n", 99 | "\n", 100 | "\n", 101 | "\n", 102 | "\n", 103 | "\n", 104 | "\n", 105 | "\n", 106 | "
1 Overview 3 Technical information
2 Key requirements 4 Ordering information
3 Planned availability date 5Terms and conditions
3 Program number 8 Prices
3 Publications 8 Announcement countries
\n", 107 | "\n" 108 | ], 109 | "text/plain": [ 110 | "" 111 | ] 112 | }, 113 | "metadata": {}, 114 | "output_type": "display_data" 115 | } 116 | ], 117 | "source": [ 118 | "display(Markdown(markdown_output[0]))" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 14, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/markdown": [ 129 | "\n", 130 | "\n", 131 | "\n", 132 | "\n", 133 | "\n", 134 | "\n", 135 | "\n", 136 | "
Part number description Part number
IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte SW Subscription & Support Reinstatement 12 Months D2605LL
IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte Annual SW Subscription & Support Renewal 12 MonthsE0QGMLL
IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Annual Terabyte Monthly License D2608LL
\n", 137 | "\n", 138 | "\n", 139 | "\n", 140 | "\n", 141 | "\n", 142 | "\n", 143 | "\n", 144 | "\n", 145 | "\n", 146 | "
Part number description Part number
IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems(R) Annual Terabyte License + SW Subscription & Support 12 Months D2606LL
IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems Annual Terabyte SW Subscription & Support Reinstatement 12 Months D2607LL
IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems Annual Terabyte Annual SW Subscription & Support Renewal 12 MonthsE0QGNLL
IBM Analytics - Platform InfoSphere Optim Data Privacy for Unstructured Data Z Systems Annual Terabyte Monthly License D2609LL
\n", 147 | "\n", 148 | "\n", 149 | "\n", 150 | "\n", 151 | "\n", 152 | "\n", 153 | "
Program identifierLicense Information document title License Information document number
5737-L70 IBM InfoSphere Optim Data Privacy for Unstructured DataL-JERN-BFQ3KR
\n", 154 | "\n" 155 | ], 156 | "text/plain": [ 157 | "" 158 | ] 159 | }, 160 | "metadata": {}, 161 | "output_type": "display_data" 162 | } 163 | ], 164 | "source": [ 165 | "display(Markdown(markdown_output[4]))" 166 | ] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "any", 172 | "language": "python", 173 | "name": "python3" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "-1.-1.-1" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 2 190 | } 191 | -------------------------------------------------------------------------------- /examples/async_parse_pdf2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n", 10 | "# !pip3 install --upgrade ipython\n", 11 | "# !pip3 install --upgrade any-parser" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from IPython.display import display, Markdown\n", 21 | "from any_parser import AnyParser\n", 22 | "import os\n", 23 | "from dotenv import load_dotenv" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "load_dotenv(override=True)\n", 33 | "example_apikey = os.getenv(\"CAMBIO_API_KEY\")" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "ap = AnyParser(example_apikey)\n", 43 | "\n", 44 | "# Define extract_args as a dictionary with your desired parameters\n", 45 | "extract_args = {\n", 46 | " \"vqa_figures_flag\": True,\n", 47 | " \"vqa_charts_flag\": True\n", 48 | "}\n", 49 | "\n", 50 | "file_id = ap.async_parse(file_path=\"./sample_data/Earnings-Presentation-Q2-2024.pdf\", extract_args=extract_args)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 5, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "Waiting for response...\n", 63 | "Waiting for response...\n", 64 | "Waiting for response...\n", 65 | "Waiting for response...\n", 66 | "Waiting for response...\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "markdown_output = ap.async_fetch(file_id=file_id)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 7, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/markdown": [ 82 | "Meta Earnings Presentation Q2 2024 \n", 83 | "\n", 84 | "investor.fb.com\n", 85 | "\n", 86 | " Meta logo, consisting of an infinity symbol followed by the text \"Meta\"\n", 87 | "\n", 88 | "Revenue by User Geography Meta logo \n", 89 | "\n", 90 | "In Millions\n", 91 | "\n", 92 | " \n", 93 | "| Quarter | US & Canada | Europe | Asia-Pacific | Rest of World | Total |\n", 94 | "|---|---|---|---|---|---|\n", 95 | "| Q2'24 | 16,847 | 9,300 | 7,888 | 5,036 | 39,071 |\n", 96 | "| Q1'24 | 15,824 | 8,483 | 7,481 | 4,667 | 36,455 |\n", 97 | "| Q4'23 | 18,585 | 9,441 | 7,512 | 4,573 | 40,111 |\n", 98 | "| Q3'23 | 15,190 | 7,777 | 6,928 | 4,251 | 34,146 |\n", 99 | 
"| Q2'23 | 14,422 | 7,323 | 6,515 | 3,739 | 31,999 |\n", 100 | "| Q1'23 | 13,048 | 6,345 | 5,960 | 3,292 | 28,645 |\n", 101 | "| Q4'22 | 15,636 | 7,050 | 6,050 | 3,429 | 32,165 |\n", 102 | "| Q3'22 | 13,035 | 5,797 | 5,782 | 3,100 | 27,714 |\n", 103 | "| Q2'22 | 13,249 | 6,452 | 5,797 | 3,213 | 28,822 |\n", 104 | "\n", 105 | "This stacked bar chart shows the revenue by user geography for Meta from Q2'22 to Q2'24. The revenue is divided into four categories: US & Canada, Europe, Asia-Pacific, and Rest of World. The total revenue for each quarter is shown at the top of each bar.\n", 106 | " \n", 107 | "\n", 108 | "Our revenue by user geography is geographically apportioned based on our estimation of the geographic location of our users when they perform a revenue-generating activity. This allocation differs from our revenue disaggregated by geography disclosure in our condensed consolidated financial statements where revenue is geographically apportioned based on the addresses of our customers.\n", 109 | "\n", 110 | " 3\n", 111 | "\n", 112 | "Segment Results Meta logo \n", 113 | "\n", 114 | "In Millions\n", 115 | "\n", 116 | " \n", 117 | "| | Q2'22 | Q3'22 | Q4'22 | Q1'23 | Q2'23 | Q3'23 | Q4'23 | Q1'24 | Q2'24 |\n", 118 | "|---|---|---|---|---|---|---|---|---|---|\n", 119 | "| Advertising | $ 28,152 | $ 27,237 | $ 31,254 | $ 28,101 | $ 31,498 | $ 33,643 | $ 38,706 | $ 35,635 | $ 38,329 |\n", 120 | "| Other | 218 | 192 | 184 | 205 | 225 | 293 | 334 | 380 | 389 |\n", 121 | "| Family of Apps Revenue | 28,370 | 27,429 | 31,438 | 28,306 | 31,723 | 33,936 | 39,040 | 36,015 | 38,718 |\n", 122 | "| Reality Labs Revenue | 452 | 285 | 727 | 339 | 276 | 210 | 1,071 | 440 | 353 |\n", 123 | "| Total Revenue | $ 28,822 | $ 27,714 | $ 32,165 | $ 28,645 | $ 31,999 | $ 34,146 | $ 40,111 | $ 36,455 | $ 39,071 |\n", 124 | "| Family of Apps Operating Income | $ 11,164 | $ 9,336 | $ 10,678 | $ 11,219 | $ 13,131 | $ 17,490 | $ 21,030 | $ 17,664 | $ 19,335 |\n", 125 | "| Reality Labs Operating (Loss) | (2,806) | (3,672) | (4,279) | (3,992) | (3,739) | (3,742) | (4,646) | (3,846) | (4,488) |\n", 126 | "| Total Income from Operations | $ 8,358 | $ 5,664 | $ 6,399 | $ 7,227 | $ 9,392 | $ 13,748 | $ 16,384 | $ 13,818 | $ 14,847 |\n", 127 | "| Operating Margin | 29% | 20% | 20% | 25% | 29% | 40% | 41% | 38% | 38% |\n", 128 | " \n", 129 | "\n", 130 | "We report our financial results based on two reportable segments: Family of Apps (FoA) and Reality Labs (RL). FoA includes Facebook, Instagram, Messenger, WhatsApp, and other services. RL includes our virtual, augmented, and mixed reality related consumer hardware, software, and content.\n", 131 | "\n", 132 | " 4\n", 133 | "\n", 134 | "Net Income Meta logo \n", 135 | "\n", 136 | "In Millions\n", 137 | "\n", 138 | " \n", 139 | "| Quarter | Net Income |\n", 140 | "|---|---|\n", 141 | "| Q2'22 | $6,687 |\n", 142 | "| Q3'22 | $4,395 |\n", 143 | "| Q4'22 | $4,652 |\n", 144 | "| Q1'23 | $5,709 |\n", 145 | "| Q2'23 | $7,788 |\n", 146 | "| Q3'23 | $11,583 |\n", 147 | "| Q4'23 | $14,017 |\n", 148 | "| Q1'24 | $12,369 |\n", 149 | "| Q2'24 | $13,465 |\n", 150 | "\n", 151 | "This bar chart shows the Net Income in millions for Meta from Q2'22 to Q2'24. The y-axis ranges from $0 to $14,017 million, with increments of $1,000 million. 
The highest net income was $14,017 million in Q4'23, while the lowest was $4,395 million in Q3'22.\n", 152 | " \n", 153 | "\n", 154 | " 7\n", 155 | "\n", 156 | "Diluted Earnings Per Share Meta logo \n", 157 | "\n", 158 | " \n", 159 | "| Quarter | Earnings Per Share |\n", 160 | "|---|---|\n", 161 | "| Q2'22 | $2.46 |\n", 162 | "| Q3'22 | $1.64 |\n", 163 | "| Q4'22 | $1.76 |\n", 164 | "| Q1'23 | $2.20 |\n", 165 | "| Q2'23 | $2.98 |\n", 166 | "| Q3'23 | $4.39 |\n", 167 | "| Q4'23 | $5.33 |\n", 168 | "| Q1'24 | $4.71 |\n", 169 | "| Q2'24 | $5.16 |\n", 170 | "\n", 171 | "This bar chart shows the Diluted Earnings Per Share for Meta from Q2'22 to Q2'24. The y-axis ranges from $1.64 to $5.33, with increments of $0.02. The chart demonstrates an overall increasing trend in earnings per share over the period, with the highest point in Q4'23 at $5.33 and the lowest in Q3'22 at $1.64.\n", 172 | " \n", 173 | "\n", 174 | " 8\n", 175 | "\n", 176 | "Limitations of Key Metrics and Other Data Meta logo \n", 177 | "\n", 178 | "To calculate our estimates of DAP, we currently use a series of machine learning models that are developed based on internal reviews of limited samples of user accounts and calibrated against user survey data. We apply significant judgment in designing these models and calculating these estimates. For example, to match user accounts within individual products and across multiple products, we use data signals such as similar device information, IP addresses, and user names. We also calibrate our models against data from periodic user surveys of varying sizes and frequency across our products, which survey questions are based on monthly usage, and which are inherently subject to error. The timing and results of such user surveys have in the past contributed, and may in the future contribute, to changes in our reported Family metrics from period to period. In addition, our data limitations may affect our understanding of certain details of our business and increase the risk of error for our Family metrics estimates. Our techniques and models rely on a variety of data signals from different products, and we rely on more limited data signals for some products compared to others. For example, as a result of limited visibility into encrypted products, we have fewer data signals from WhatsApp user accounts and primarily rely on phone numbers and device information to match WhatsApp user accounts with accounts on our other products. Any loss of access to data signals we use in our process for calculating Family metrics, whether as a result of our own product decisions, actions by third-party browser or mobile platforms, regulatory or legislative requirements, or other factors, also may impact the stability or accuracy of our reported Family metrics, as well as our ability to report these metrics at all. Our estimates of Family metrics also may change as our methodologies evolve, including through the application of new data signals or technologies, product changes, or other improvements in our user surveys, algorithms, or machine learning that may improve our ability to match accounts within and across our products or otherwise evaluate the broad population of our users. In addition, such evolution may allow us to identify previously undetected violating accounts (as defined below).\n", 179 | "\n", 180 | "We regularly evaluate our Family metrics to estimate the percentage of our DAP consisting solely of \"violating\" accounts. 
We define \"violating\" accounts as accounts which we believe are intended to be used for purposes that violate our terms of service, including bots and spam. In the first quarter of 2024, we estimated that less than 3% of our worldwide DAP consisted solely of violating accounts. Such estimation is based on an internal review of a limited sample of accounts, and we apply significant judgment in making this determination. For example, we look for account information and behaviors associated with Facebook and Instagram accounts that appear to be inauthentic to the reviewers, but we have limited visibility into WhatsApp user activity due to encryption. In addition, if we believe an individual person has one or more violating accounts, we do not include such person in our violating accounts estimation as long as we believe they have one account that does not constitute a violating account. From time to time, we disable certain user accounts, make product changes, or take other actions to reduce the number of violating accounts among our users, which may also reduce our DAP estimates in a particular period. We intend to disclose our estimates of the percentage of our DAP consisting solely of violating accounts on an annual basis. Violating accounts are very difficult to measure at our scale, and it is possible that the actual number of violating accounts may vary significantly from our estimates.\n", 181 | "\n", 182 | "## User Geography\n", 183 | "\n", 184 | "Our estimates for revenue by user location, as well as year-over-year percentage changes in ad impressions delivered and the average price per ad by user location, are also affected by data limitations and other challenges in measuring user geography. Our data regarding the geographic location of our users is estimated based on a number of factors, such as the user's IP address and self-disclosed location. These factors may not always accurately reflect the user's actual location. For example, a user may appear to be accessing our products from the location of the proxy server that the user connects to rather than from the user's actual location. 
The methodologies used to measure our metrics are also susceptible to algorithm or other technical errors.\n", 185 | "\n", 186 | " 17" 187 | ], 188 | "text/plain": [ 189 | "" 190 | ] 191 | }, 192 | "metadata": {}, 193 | "output_type": "display_data" 194 | } 195 | ], 196 | "source": [ 197 | "# Join the list elements with newlines to create a single string\n", 198 | "markdown_text = '\\n\\n'.join(markdown_output)\n", 199 | "display(Markdown(markdown_text))" 200 | ] 201 | } 202 | ], 203 | "metadata": { 204 | "kernelspec": { 205 | "display_name": "any", 206 | "language": "python", 207 | "name": "python3" 208 | }, 209 | "language_info": { 210 | "codemirror_mode": { 211 | "name": "ipython", 212 | "version": 3 213 | }, 214 | "file_extension": ".py", 215 | "mimetype": "text/x-python", 216 | "name": "python", 217 | "nbconvert_exporter": "python", 218 | "pygments_lexer": "ipython3", 219 | "version": "3.10.15" 220 | } 221 | }, 222 | "nbformat": 4, 223 | "nbformat_minor": 2 224 | } 225 | -------------------------------------------------------------------------------- /examples/async_parse_with_layout.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n", 10 | "# !pip3 install --upgrade ipython\n", 11 | "# !pip3 install --upgrade any-parser" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from IPython.display import display, Markdown\n", 21 | "from any_parser import AnyParser" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "ap = AnyParser(api_key=\"...\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "file_path = \"./sample_data/test_1figure_1table.png\"\n", 40 | "file_id = ap.async_parse_with_layout(file_path)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 4, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "Waiting for response...\n", 53 | "Waiting for response...\n", 54 | "Waiting for response...\n", 55 | "Waiting for response...\n", 56 | "Waiting for response...\n", 57 | "Waiting for response...\n", 58 | "Waiting for response...\n", 59 | "Waiting for response...\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "markdown_output = ap.async_fetch(file_id=file_id)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/markdown": [ 75 | "\n", 76 | "\n", 77 | "\n", 78 | "\n", 79 | "\n", 80 | "\n", 81 | "\n", 82 | "\n", 83 | "\n", 84 | "\n", 85 | "\n", 86 | "\n", 87 | "\n", 88 | "\n", 89 | "
latency (ms)
participantsmean 99th percentile
1 17.0 +1.4 75.0 34.9
2 24.5 +2.5 87.6 +35.9
5 31.5 +6.2 104.5 52.2
10 30.0 +3.7 95.6 +25.4
25 35.5 +5.6 100.4 42.7
50 42.7 4.1 93.7 22.9
100 71.4 7.6 131.2 +17.6
200 150.5 +11.0320.3 35.1
\n", 90 | "\n", 91 | "\n", 92 | "\n", 93 | "Table 4: Two-phase commit scalability. Mean and standard deviations over 10 runs.\n", 94 | "\n", 95 | "CPUs. Snapshot reads can execute at any up-to-date replicas, so their throughput increases almost linearly with the number of replicas. Single-read read-only transactions only execute at leaders because timestamp assignment must happen at leaders. Read-only-transaction throughput increases with the number of replicas because the number of effective spanservers increases: in the experimental setup, the number of spanservers equaled the number of replicas, and leaders were randomly distributed among the zones. Write throughput benefits from the same experimental artifact (which explains the increase in throughput from 3 to 5 replicas), but that benefit is outweighed by the linear increase in the amount of work performed per write, as the number of replicas increases.\n", 96 | "\n", 97 | "Table 4 demonstrates that two-phase commit can scale to a reasonable number of participants: it summarizes a set of experiments run across 3 zones, each with 25 spanservers. Scaling up to 50 participants is reasonable in both mean and 99th-percentile, and latencies start to rise noticeably at 100 participants.\n", 98 | "\n", 99 | "5.2 Availability\n", 100 | "\n", 101 | "Figure 5 illustrates the availability benefits of running Spanner in multiple datacenters. It shows the results of three experiments on throughput in the presence of datacenter failure, all of which are overlaid onto the same time scale. The test universe consisted of 5 zones Zi, each of which had 25 spanservers. The test database was sharded into 1250 Paxos groups, and 100 test clients constantly issued non-snapshot reads at an aggregate rate of 50K reads/second. All of the leaders were explicitly placed in Z1. Five seconds into each test, all of the servers in one zone were killed: non-leader kills Z2; leader-hard kills Z1; leader-soft kills Z1, but it gives notifications to all of the servers that they should handoff leadership first.\n", 102 | "\n", 103 | "Killing Z2 has no effect on read throughput. Killing Z1 while giving the leaders time to handoff leadership to a different zone has a minor effect: the throughput drop is not visible in the graph, but is around 3-4%. On the other hand, killing Z1 with no warning has a severe effect: the rate of completion drops almost to 0. As leaders get re-elected, though, the throughput of the system rises to approximately 100K reads/second because of two artifacts of our experiment: there is extra capacity in the system, and operations are queued while the leader is unavailable. As a result, the throughput of the system rises before leveling off again at its steady-state rate.\n", 104 | "\n", 105 | "We can also see the effect of the fact that Paxos leader leases are set to 10 seconds. When we kill the zone, the leader-lease expiration times for the groups should be evenly distributed over the next 10 seconds. Soon after each lease from a dead leader expires, a new leader is elected. Approximately 10 seconds after the kill time, all of the groups have leaders and throughput has recovered. Shorter lease times would reduce the effect of server deaths on availability, but would require greater amounts of lease-renewal network traffic. 
We are in the process of designing and implementing a mechanism that will cause slaves to release Paxos leader leases upon leader failure.\n", 106 | "\n", 107 | "5.3 TrueTime\n", 108 | "\n", 109 | "Two questions must be answered with respect to TrueTime: is ε truly a bound on clock uncertainty, and how bad does ε get? For the former, the most serious problem would be if a local clock’s drift were greater than 200usec/sec: that would break assumptions made by TrueTime. Our machine statistics show that bad CPUs are 6 times more likely than bad clocks. That is, clock issues are extremely infrequent, relative to much more serious hardware problems. As a result, we believe that TrueTime’s implementation is as trustworthy as any other piece of software upon which Spanner depends.\n", 110 | "\n", 111 | "![![<@mask_p0_e1_figure>(timeout=1h)](https://anyparser-realtime-test-j-assetsconstructfilebucke-2wg0ln280yvz.s3.amazonaws.com/result_parse_with_layout/async_S4iyw7RAEE8CTGkVgHYeI8nsTmSALI1U2HXvAN6j/2024/11/14/test_1figure_1table_f00964f1-abcc-4e62-b2af-46249d9c70d4.png/%3C%40mask_p0_e1_figure_s3%3E.png?AWSAccessKeyId=ASIAXM24X76XLDJJDDZX&Signature=Ef8urOX4Oj%2Bdxx%2F1IOh0OqgJ0%2B4%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEFoaCXVzLXdlc3QtMiJGMEQCIBJHF0qjs7xZL9IBZf0a7YooU6WJP1EeclCbGaKCaLFPAiB%2BFjaYEyzmBWPFVh%2FRSUVhrEEdc%2FlQdUaLSTP%2FgclPaSrcAwjj%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAAaDDUwODYxMTI2NDQzMCIMGyjwrhVEC7fYAvneKrADV3HpyrnA8A6QUdLRnfZZM74MpeETlq%2BvlIjpQ5CPxB%2BTWpNRlq4c3eo%2BzKRX87bl9kpFmBaFXJPc9ot%2BN3L3Vcp%2FzvnI0iB4gqlN4jGexU5wVpTclORB1TAK%2FcO6AFfGACTLrUg0GzgcbwYR%2FGIvhxSGj1Ule9MDXL%2FG2YGMnqFDndKirbwufY4dlBYehDzqNii3kB3v5nGFsYKmAdVEocKdeIK6cv%2Fybj3w58l9vDyRMgr0%2FLWposZ160WIEvPMWMseKe6Q87%2BbEL8hcyl5i0aFxeGf4xv1Foiz74tcJcPL7RuwpQYCb3BztfD11Vo8334cla8p5LlEfkj1OEDHVXW15FJpw29pZN1q0IBIQNeBHtajkpu7BPzURXYZIUnvnWnpCPHTThM8z2Az1mhtou69uKWjO6iVeOe%2BrbqGMXbKEJxuKraEh%2BXVukZWmzlxwaiyJ2gomNXTQmO0gaLpiU934WqlJu9mGl0mw686KPwwdVOudV4RUgXAZhpT7j%2FzydhxVNK0sHX%2F02lTm1v6%2BRpsUN1Xvd%2FXMuj1%2FM8q5B86wkwUj1YjgFoQ9qcljZu8MPik1bkGOp8BvunCWNInmGehKh0yaRGfQn0y%2FgecCbOQoOqRUuLahI8ZBrixkIBUOkyinWTmsdLG6ItJXkiKFBOAHU0tq97U0Fbb0mq0v6L%2Bfr1INT52vqWsaXTwxiLSJeGJTEve1SCCRttFsIpkZF5MEmB3V0irDz3lVQbyV1Z2lWSe%2Br13a5DSeH4REoiwqEKtKN%2FCV4WPDhK5G%2FUm%2B8LmNrgUGm77&Expires=1731551406)(timeout=1h)](https://anyparser-realtime-test-j-assetsconstructfilebucke-2wg0ln280yvz.s3.amazonaws.com/result_parse_with_layout/async_S4iyw7RAEE8CTGkVgHYeI8nsTmSALI1U2HXvAN6j/2024/11/14/test_1figure_1table_f00964f1-abcc-4e62-b2af-46249d9c70d4.png/%3C%40mask_p0_e1_figure_s3%3E.png?AWSAccessKeyId=ASIAXM24X76XLDJJDDZX&Signature=Ef8urOX4Oj%2Bdxx%2F1IOh0OqgJ0%2B4%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEFoaCXVzLXdlc3QtMiJGMEQCIBJHF0qjs7xZL9IBZf0a7YooU6WJP1EeclCbGaKCaLFPAiB%2BFjaYEyzmBWPFVh%2FRSUVhrEEdc%2FlQdUaLSTP%2FgclPaSrcAwjj%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAAaDDUwODYxMTI2NDQzMCIMGyjwrhVEC7fYAvneKrADV3HpyrnA8A6QUdLRnfZZM74MpeETlq%2BvlIjpQ5CPxB%2BTWpNRlq4c3eo%2BzKRX87bl9kpFmBaFXJPc9ot%2BN3L3Vcp%2FzvnI0iB4gqlN4jGexU5wVpTclORB1TAK%2FcO6AFfGACTLrUg0GzgcbwYR%2FGIvhxSGj1Ule9MDXL%2FG2YGMnqFDndKirbwufY4dlBYehDzqNii3kB3v5nGFsYKmAdVEocKdeIK6cv%2Fybj3w58l9vDyRMgr0%2FLWposZ160WIEvPMWMseKe6Q87%2BbEL8hcyl5i0aFxeGf4xv1Foiz74tcJcPL7RuwpQYCb3BztfD11Vo8334cla8p5LlEfkj1OEDHVXW15FJpw29pZN1q0IBIQNeBHtajkpu7BPzURXYZIUnvnWnpCPHTThM8z2Az1mhtou69uKWjO6iVeOe%2BrbqGMXbKEJxuKraEh%2BXVukZWmzlxwaiyJ2gomNXTQmO0gaLpiU934WqlJu9mGl0mw686KPwwdVOudV4RUgXAZhpT7j%2FzydhxVNK0sHX%2F02lTm1v6%2BRpsUN1Xvd%2FXMuj1%2FM8q5B86wkwUj1YjgFoQ9qcljZu8MPik1bkGOp8BvunCWNInmGehKh0yaRGfQ
n0y%2FgecCbOQoOqRUuLahI8ZBrixkIBUOkyinWTmsdLG6ItJXkiKFBOAHU0tq97U0Fbb0mq0v6L%2Bfr1INT52vqWsaXTwxiLSJeGJTEve1SCCRttFsIpkZF5MEmB3V0irDz3lVQbyV1Z2lWSe%2Br13a5DSeH4REoiwqEKtKN%2FCV4WPDhK5G%2FUm%2B8LmNrgUGm77&Expires=1731551406)\n", 112 | "\n", 113 | "Figure 5: Effect of killing servers on throughput.\n" 114 | ], 115 | "text/plain": [ 116 | "" 117 | ] 118 | }, 119 | "metadata": {}, 120 | "output_type": "display_data" 121 | } 122 | ], 123 | "source": [ 124 | "display(Markdown(markdown_output))" 125 | ] 126 | } 127 | ], 128 | "metadata": { 129 | "kernelspec": { 130 | "display_name": "any", 131 | "language": "python", 132 | "name": "python3" 133 | }, 134 | "language_info": { 135 | "codemirror_mode": { 136 | "name": "ipython", 137 | "version": 3 138 | }, 139 | "file_extension": ".py", 140 | "mimetype": "text/x-python", 141 | "name": "python", 142 | "nbconvert_exporter": "python", 143 | "pygments_lexer": "ipython3", 144 | "version": "-1.-1.-1" 145 | } 146 | }, 147 | "nbformat": 4, 148 | "nbformat_minor": 2 149 | } 150 | -------------------------------------------------------------------------------- /examples/async_parse_with_ocr.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n", 10 | "# !pip3 install --upgrade ipython\n", 11 | "# !pip3 install --upgrade any-parser" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from IPython.display import display, Markdown\n", 21 | "from any_parser import AnyParser" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "ap = AnyParser(api_key=\"...\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "file_path = \"./sample_data/test_1figure_1table.png\"\n", 40 | "file_id = ap.async_parse_with_ocr(file_path)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 4, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "Waiting for response...\n", 53 | "Waiting for response...\n", 54 | "Waiting for response...\n", 55 | "Waiting for response...\n", 56 | "Waiting for response...\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "markdown_output = ap.async_fetch(file_id=file_id)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 6, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/markdown": [ 72 | "## Table 4: Two-phase commit scalability. Mean and standard deviations over 10 runs.\n", 73 | "\n", 74 | "| participants | mean | 99th percentile |\n", 75 | "|--------------|-------------|-----------------|\n", 76 | "| 1 | 17.0 ±1.4 | 75.0 ±34.9 |\n", 77 | "| 2 | 24.5 ±2.5 | 87.6 ±35.9 |\n", 78 | "| 5 | 31.5 ±6.2 | 104.5 ±52.2 |\n", 79 | "| 10 | 30.0 ±3.7 | 95.6 ±25.4 |\n", 80 | "| 25 | 35.5 ±5.6 | 100.4 ±42.7 |\n", 81 | "| 50 | 42.7 ±4.1 | 93.7 ±22.9 |\n", 82 | "| 100 | 71.4 ±7.6 | 131.2 ±17.6 |\n", 83 | "| 200 | 150.5 ±11.0 | 320.3 ±35.1 |\n", 84 | "\n", 85 | "## 5.2 Availability\n", 86 | "\n", 87 | "Figure 5 illustrates the availability benefits of running Spanner in multiple datacenters. 
It shows the results of three experiments on throughput in the presence of datacenter failure, all of which are overlaid onto the same time scale. The test universe consisted of 5 zones Z1-Z5, each of which had 25 spanservers. The test database was sharded into 1250 Paxos groups, and 100 test clients constantly issued non-snapshot reads at an aggregate rate of 50K reads/second. All of the leaders were explicitly placed in Z1. Five seconds into each test, all of the servers in one zone were killed: non-leader kills Z2; leader-hard kills Z1; leader-soft kills Z1, but it gives notifications to all of the servers that they should handoff leadership first.\n", 88 | "\n", 89 | "Killing Z2 has no effect on read throughput. Killing Z1 while giving the leaders time to handoff leadership to a different zone has a minor effect: the throughput drop is not visible in the graph, but is around 3-4%. On the other hand, killing Z1 with no warning has a severe effect: the rate of completion drops almost to 0. As leaders get re-elected, though, the throughput of the system rises to approximately 100K reads/second because of two artifacts of our experiment: there is extra capacity in the system, and operations are queued while the leader is unavailable. As a result, the throughput of the system rises before leveling off again at its steady-state rate.\n", 90 | "\n", 91 | "We can also see the effect of the fact that Paxos leader leases are set to 10 seconds. When we kill the zone, the leader-lease expiration times for the groups should be evenly distributed over the next 10 seconds. Soon after each lease from a dead leader expires, a new leader is elected. Approximately 10 seconds after the kill time, all of the groups have leaders and throughput has recovered. Shorter lease times would reduce the effect of server deaths on availability, but would require greater amounts of lease-renewal network traffic. We are in the process of designing and implementing a mechanism that will cause slaves to release Paxos leader leases upon leader failure.\n", 92 | "\n", 93 | "## 5.3 TrueTime\n", 94 | "\n", 95 | "Two questions must be answered with respect to TrueTime: is ε truly a bound on clock uncertainty, and how bad does ε get? For the former, the most serious problem would be if a local clock's drift were greater than 200us/sec: that would break assumptions made by TrueTime. Our machine statistics show that bad CPUs are 6 times more likely than bad clocks. That is, clock issues are extremely infrequent, relative to much more serious hardware problems. 
As a result, we believe that TrueTime's implementation is as trustworthy as any other piece of software upon which Spanner depends.\n", 96 | "\n", 97 | "Figure 6 presents TrueTime data taken at several thousand spanserver machines across datacenters up to 2200" 98 | ], 99 | "text/plain": [ 100 | "" 101 | ] 102 | }, 103 | "metadata": {}, 104 | "output_type": "display_data" 105 | } 106 | ], 107 | "source": [ 108 | "display(Markdown(markdown_output))" 109 | ] 110 | } 111 | ], 112 | "metadata": { 113 | "kernelspec": { 114 | "display_name": "any", 115 | "language": "python", 116 | "name": "python3" 117 | }, 118 | "language_info": { 119 | "codemirror_mode": { 120 | "name": "ipython", 121 | "version": 3 122 | }, 123 | "file_extension": ".py", 124 | "mimetype": "text/x-python", 125 | "name": "python", 126 | "nbconvert_exporter": "python", 127 | "pygments_lexer": "ipython3", 128 | "version": "-1.-1.-1" 129 | } 130 | }, 131 | "nbformat": 4, 132 | "nbformat_minor": 2 133 | } 134 | -------------------------------------------------------------------------------- /examples/extract_pii.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n", 10 | "# !pip3 install --upgrade ipython\n", 11 | "# !pip3 install --upgrade any-parser" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from any_parser import AnyParser" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "ap = AnyParser(api_key=\"...\")" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "file_path = \"./sample_data/resume_1.pdf\"\n", 39 | "pii_info, time = ap.extract_pii(file_path)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 5, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "'Time Elapsed: 8.02 seconds'" 51 | ] 52 | }, 53 | "execution_count": 5, 54 | "metadata": {}, 55 | "output_type": "execute_result" 56 | } 57 | ], 58 | "source": [ 59 | "time" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 6, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "{'name': 'Gary Jiang',\n", 71 | " 'phone_number': '+1-213-725-7637',\n", 72 | " 'address': None,\n", 73 | " 'email_address': 'jiangzhehuan0105@gmail.com',\n", 74 | " 'linkedin_url': 'https://linkedin.com/in/gary-jiang',\n", 75 | " 'github_url': None,\n", 76 | " 'summary': 'Full-stack Software Engineer'}" 77 | ] 78 | }, 79 | "execution_count": 6, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "pii_info" 86 | ] 87 | } 88 | ], 89 | "metadata": { 90 | "kernelspec": { 91 | "display_name": "any", 92 | "language": "python", 93 | "name": "python3" 94 | }, 95 | "language_info": { 96 | "codemirror_mode": { 97 | "name": "ipython", 98 | "version": 3 99 | }, 100 | "file_extension": ".py", 101 | "mimetype": "text/x-python", 102 | "name": "python", 103 | "nbconvert_exporter": "python", 104 | "pygments_lexer": "ipython3", 105 | "version": "-1.-1.-1" 106 | } 107 | }, 108 | "nbformat": 4, 109 | "nbformat_minor": 2 110 | } 111 | 
-------------------------------------------------------------------------------- /examples/extract_resume_key_value.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n", 10 | "# !pip3 install --upgrade ipython\n", 11 | "# !pip3 install --upgrade any-parser" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from IPython.display import display\n", 21 | "from any_parser import AnyParser" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "ap = AnyParser(api_key=\"...\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 5, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "file_path = \"./sample_data/resume_1.pdf\"\n", 40 | "json_result = ap.extract_resume_key_value(file_path)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 6, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "data": { 50 | "text/plain": [ 51 | "({'pii': {'full_name': 'GARY JIANG',\n", 52 | " 'email': 'jiangzhehuan0105@gmail.com',\n", 53 | " 'phone': '+1 (213) 725-7637'},\n", 54 | " 'education': [{'organization': 'Shenyang University of Technology',\n", 55 | " 'degree': \"Bachelor's Degree\",\n", 56 | " 'major': 'Computer Science',\n", 57 | " 'start_date': '2008-01-01',\n", 58 | " 'end_date': '2012-12-31',\n", 59 | " 'courses': None,\n", 60 | " 'achievements': None}],\n", 61 | " 'work_experience': [{'job_title': 'Full Stack Developer',\n", 62 | " 'company_name': 'VIMMERSE',\n", 63 | " 'location': None,\n", 64 | " 'start_date': '2023-06-01',\n", 65 | " 'end_date': 'present',\n", 66 | " 'job_type': None,\n", 67 | " 'summary': 'Developed an AI-powered editor web application that brings photos to life by animating images in 3D.',\n", 68 | " 'bullet_points': ['Developed robust back-end services utilizing Python (Flask/FastAPI) and Node.js (AWS Lambda) for efficient and scalable web applications',\n", 69 | " 'Built user-friendly and interactive websites using Next.js, ensuring a seamless user experience',\n", 70 | " 'Deployed and managed AWS infrastructure, including EC2 instances, S3 storage buckets, DynamoDB for NoSQL data management, and Cognito user pools for secure user authentication',\n", 71 | " 'Experienced Agile and Scrum methodologies within a fast-paced startup environment to ensure efficient project delivery and continuous improvement',\n", 72 | " 'Collaborated effectively with cross-functional teams (design, product management) to deliver projects on time, fostering a positive and collaborative work environment']},\n", 73 | " {'job_title': 'Full Stack Developer',\n", 74 | " 'company_name': 'VIKING SASQUATCH',\n", 75 | " 'location': None,\n", 76 | " 'start_date': '2023-01-01',\n", 77 | " 'end_date': '2023-06-30',\n", 78 | " 'job_type': None,\n", 79 | " 'summary': 'Developed APIs and Integrations for all of the parties that work on Real Estate Transactions.',\n", 80 | " 'bullet_points': ['Connecting Mortgage, Title, and Real Estate to solve pain points and improve automation and create efficiencies',\n", 81 | " 'Implemented a user-friendly front-end interface using Nuxt.js, ensuring a seamless user experience',\n", 82 | " 'Built 
backend APIs utilizing Node.js serverless functions for optimal performance',\n", 83 | " 'Managed data storage and security by implementing a MySQL database',\n", 84 | " 'Collaborated effectively within a team using Agile methodologies like sprint planning, daily standups, retrospectives to ensure project delivery and continuous improvement']},\n", 85 | " {'job_title': 'Full Stack Developer',\n", 86 | " 'company_name': 'ROX PAY SRL',\n", 87 | " 'location': None,\n", 88 | " 'start_date': '2021-12-01',\n", 89 | " 'end_date': '2022-12-31',\n", 90 | " 'job_type': None,\n", 91 | " 'summary': 'Built Fintech Software House that aims to optimize B2B payments by offering a systemic solution that gives value-added services in collection of payments, financial information and corporate liquidity by essentially creating a Commission Free, Open Loop, Payment Gateway system.',\n", 92 | " 'bullet_points': ['Developed front-end by using React.js and Redux, Javascript/Typescript',\n", 93 | " 'Contributed developing backend utilizing Django/Python']},\n", 94 | " {'job_title': 'Freelancer',\n", 95 | " 'company_name': 'FREELANCE',\n", 96 | " 'location': None,\n", 97 | " 'start_date': '2017-09-01',\n", 98 | " 'end_date': '2021-10-31',\n", 99 | " 'job_type': None,\n", 100 | " 'summary': 'Developed and managed many web and mobile applications while working as freelancer at Internet Dzyns LLC company.',\n", 101 | " 'bullet_points': ['Developed multiple web applications, participating in the whole process of their development: product design and estimation, code design and development, DevOps, UI/UX design, product launch and maintenance',\n", 102 | " 'Developed cross-platform mobile application using Flutter and Ionic/Angular',\n", 103 | " 'Developed NFT marketplace websites and wrote smart contracts']},\n", 104 | " {'job_title': 'Server Administrator, Java Developer',\n", 105 | " 'company_name': 'NEUSOFT',\n", 106 | " 'location': None,\n", 107 | " 'start_date': '2014-06-01',\n", 108 | " 'end_date': '2017-08-31',\n", 109 | " 'job_type': None,\n", 110 | " 'summary': 'Worked as intern and software developer after graduated university.',\n", 111 | " 'bullet_points': ['Correct analytical and reasoning skills to troubleshoot and repair server issues',\n", 112 | " 'Operating Systems & Security Software',\n", 113 | " 'Java / Spring Boot / Hibernate']}],\n", 114 | " 'personal_info': {'name': 'GARY JIANG',\n", 115 | " 'phone_number': '+1-213-725-7637',\n", 116 | " 'address': None,\n", 117 | " 'email_address': 'jiangzhehuan0105@gmail.com',\n", 118 | " 'linkedin_url': 'linkedin.com/in/gary-jiang',\n", 119 | " 'github_url': None,\n", 120 | " 'summary': None},\n", 121 | " 'skills': {'Programming Languages': ['Python',\n", 122 | " 'PHP',\n", 123 | " 'Javascript',\n", 124 | " 'Typescript',\n", 125 | " 'HTML',\n", 126 | " 'CSS'],\n", 127 | " 'Tools': ['Flask',\n", 128 | " 'Django',\n", 129 | " 'FastAPI',\n", 130 | " 'Laravel',\n", 131 | " 'Node.js',\n", 132 | " 'SQL databases',\n", 133 | " 'Next.js',\n", 134 | " 'React',\n", 135 | " 'Redux',\n", 136 | " 'Nuxt.js',\n", 137 | " 'Vue',\n", 138 | " 'AWS Lambda',\n", 139 | " 'Cognito',\n", 140 | " 'EC2',\n", 141 | " 'S3',\n", 142 | " 'DynamoDB',\n", 143 | " 'API gateway',\n", 144 | " 'Git',\n", 145 | " 'Version Control',\n", 146 | " 'DevOps',\n", 147 | " 'CI/CD'],\n", 148 | " 'Other': ['Startup Experience',\n", 149 | " 'Adaptable',\n", 150 | " 'Resourceful',\n", 151 | " 'Prioritization',\n", 152 | " 'Hybrid Mobile App Development',\n", 153 | " 'Flutter',\n", 154 | " 'Ionic',\n", 155 
| " 'Angular',\n", 156 | " 'AGILE',\n", 157 | " 'SCRUM']},\n", 158 | " 'certifications': [],\n", 159 | " 'projects': []},\n", 160 | " 'Time Elapsed: 27.27 seconds')" 161 | ] 162 | }, 163 | "metadata": {}, 164 | "output_type": "display_data" 165 | } 166 | ], 167 | "source": [ 168 | "display(json_result)" 169 | ] 170 | } 171 | ], 172 | "metadata": { 173 | "kernelspec": { 174 | "display_name": "any", 175 | "language": "python", 176 | "name": "python3" 177 | }, 178 | "language_info": { 179 | "codemirror_mode": { 180 | "name": "ipython", 181 | "version": 3 182 | }, 183 | "file_extension": ".py", 184 | "mimetype": "text/x-python", 185 | "name": "python", 186 | "nbconvert_exporter": "python", 187 | "pygments_lexer": "ipython3", 188 | "version": "-1.-1.-1" 189 | } 190 | }, 191 | "nbformat": 4, 192 | "nbformat_minor": 2 193 | } 194 | -------------------------------------------------------------------------------- /examples/extract_tables.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n", 10 | "# !pip3 install --upgrade ipython\n", 11 | "# !pip3 install --upgrade any-parser\n", 12 | "# !pip3 install pandas lxml html5lib bs4\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "from IPython.display import display, Markdown\n", 22 | "from any_parser import AnyParser" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "ap = AnyParser(api_key=\"...\")" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 7, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "csv_output, time_info = ap.extract_tables(\n", 41 | " file_path=\"./sample_data/test_1figure_1table.png\", return_type=\"csv\"\n", 42 | ")\n", 43 | "\n", 44 | "html_output, time_info = ap.extract_tables(\n", 45 | " file_path=\"./sample_data/test_1figure_1table.png\", return_type=\"html\"\n", 46 | ")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 8, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "CPU times: user 3 μs, sys: 1 μs, total: 4 μs\n", 59 | "Wall time: 5.96 μs\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "time" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 10, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/markdown": [ 75 | "0,1,2\n", 76 | ",latency,(ms)\n", 77 | "participants,mean,99th percentile\n", 78 | "1,17.0 +1.4,75.0 34.9\n", 79 | "2,24.5 +2.5,87.6 35.9\n", 80 | "5,31.5 +6.2,104.5 52.2\n", 81 | "10,30.0 +3.7,95.6 25.4\n", 82 | "25,35.5 +5.6,100.4 42.7\n", 83 | "50,42.7 +4.1,93.7 22.9\n", 84 | "100,71.4 +7.6,131.2 +17.6\n", 85 | "200,150.5 +11.0,320.3 35.1\n" 86 | ], 87 | "text/plain": [ 88 | "" 89 | ] 90 | }, 91 | "metadata": {}, 92 | "output_type": "display_data" 93 | }, 94 | { 95 | "data": { 96 | "text/markdown": [ 97 | "\n", 98 | "\n", 99 | "\n", 100 | "\n", 101 | "\n", 102 | "\n", 103 | "\n", 104 | "\n", 105 | "\n", 106 | "\n", 107 | "\n", 108 | "\n", 109 | "\n", 110 | "
latency (ms)
participantsmean 99th percentile
1 17.0 +1.4 75.0 34.9
2 24.5 +2.5 87.6 35.9
5 31.5 +6.2 104.5 52.2
10 30.0 +3.7 95.6 25.4
25 35.5 +5.6 100.4 42.7
50 42.7 +4.1 93.7 22.9
100 71.4 +7.6 131.2 +17.6
200 150.5 +11.0320.3 35.1
\n", 111 | "\n" 112 | ], 113 | "text/plain": [ 114 | "" 115 | ] 116 | }, 117 | "metadata": {}, 118 | "output_type": "display_data" 119 | } 120 | ], 121 | "source": [ 122 | "if isinstance(csv_output, list):\n", 123 | " csv_output_str = \"\\n\".join(csv_output)\n", 124 | "else:\n", 125 | " csv_output_str = csv_output\n", 126 | "\n", 127 | "display(Markdown(csv_output_str))\n", 128 | "display(Markdown(html_output))" 129 | ] 130 | } 131 | ], 132 | "metadata": { 133 | "kernelspec": { 134 | "display_name": "any", 135 | "language": "python", 136 | "name": "python3" 137 | }, 138 | "language_info": { 139 | "codemirror_mode": { 140 | "name": "ipython", 141 | "version": 3 142 | }, 143 | "file_extension": ".py", 144 | "mimetype": "text/x-python", 145 | "name": "python", 146 | "nbconvert_exporter": "python", 147 | "pygments_lexer": "ipython3", 148 | "version": "3.10.16" 149 | } 150 | }, 151 | "nbformat": 4, 152 | "nbformat_minor": 2 153 | } 154 | -------------------------------------------------------------------------------- /examples/parse_batch_api.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Anyparser Batch API Example" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n", 17 | "# !pip3 install --upgrade ipython\n", 18 | "# !pip3 install --upgrade any-parser" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "### Step1: Batch API Folder Processing Upload" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import json\n", 35 | "import os\n", 36 | "from datetime import datetime\n", 37 | "\n", 38 | "from dotenv import load_dotenv\n", 39 | "\n", 40 | "from any_parser import AnyParser" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# Load environment variables\n", 50 | "load_dotenv(override=True)\n", 51 | "\n", 52 | "# Get API key and create parser\n", 53 | "api_key = os.environ.get(\"CAMBIO_API_KEY\")\n", 54 | "if not api_key:\n", 55 | " raise ValueError(\"CAMBIO_API_KEY is not set\")\n", 56 | "ap = AnyParser(api_key)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "Create Batch Request" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "Upload responses saved to: ./sample_data_20250103003352.jsonl\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "# Upload folder for batch processing\n", 81 | "WORKING_FOLDER = \"./sample_data\"\n", 82 | "responses = ap.batches.create(WORKING_FOLDER)\n", 83 | "\n", 84 | "# Save responses to JSONL file with timestamp\n", 85 | "timestamp = datetime.now().strftime(\"%Y%m%d%H%M%S\")\n", 86 | "output_file = f\"./sample_data_{timestamp}.jsonl\"\n", 87 | "\n", 88 | "with open(output_file, \"w\") as f:\n", 89 | " for response in responses:\n", 90 | " f.write(json.dumps(response.model_dump()) + \"\\n\")\n", 91 | "\n", 92 | "print(f\"Upload responses saved to: {output_file}\")" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 
99 | "Check the first element status in the jsonl using the requestId" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 4, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "Checking status for file: Earnings-Presentation-Q2-2024.pdf\n", 112 | "Content not yet available\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "# Get first response from the JSONL file\n", 118 | "with open(output_file, \"r\") as f:\n", 119 | " first_response = json.loads(f.readline())\n", 120 | "\n", 121 | "request_id = first_response[\"requestId\"]\n", 122 | "print(f\"Checking status for file: {first_response['fileName']}\")\n", 123 | "\n", 124 | "# Retrieve status using request ID\n", 125 | "markdown = ap.batches.retrieve(request_id)\n", 126 | "if markdown and markdown.result:\n", 127 | " print(\"Content retrieved successfully\")\n", 128 | "else:\n", 129 | " print(\"Content not yet available\")" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "Note: Batch extraction is currently in beta testing. Processing time may take up to 2 hours to complete." 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "After 2 hours, you can check the content of the first file in the folder again" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 6, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "Content retrieved successfully\n" 156 | ] 157 | } 158 | ], 159 | "source": [ 160 | "# Retrieve status using request ID\n", 161 | "markdown = ap.batches.retrieve(request_id)\n", 162 | "if markdown and markdown.result:\n", 163 | " print(\"Content retrieved successfully\")\n", 164 | "else:\n", 165 | " print(\"Content not yet available\")" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "### Step2: Batch API folder fetch response\n" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 16, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "import json\n", 182 | "import logging\n", 183 | "import os\n", 184 | "from concurrent.futures import ThreadPoolExecutor, as_completed\n", 185 | "\n", 186 | "from dotenv import load_dotenv\n", 187 | "\n", 188 | "from any_parser import AnyParser\n", 189 | "\n", 190 | "# Configure logging\n", 191 | "logging.basicConfig(level=logging.INFO)\n", 192 | "logger = logging.getLogger(__name__)\n", 193 | "\n", 194 | "# Load environment variables\n", 195 | "load_dotenv(override=True)\n", 196 | "\n", 197 | "MAX_WORKER = 10" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 17, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "# Get API key and create parser\n", 207 | "api_key = os.environ.get(\"CAMBIO_API_KEY\")\n", 208 | "if not api_key:\n", 209 | " raise ValueError(\"CAMBIO_API_KEY is not set\")\n", 210 | "ap = AnyParser(api_key)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "Read responses from JSONL file" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 18, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "# Change to your real output json from parse_batch_upload.py\n", 227 | "response_file = \"./sample_data_20250102103047.jsonl\"\n", 228 | "with open(response_file, 
\"r\") as f:\n", 229 | " responses = [json.loads(line) for line in f]" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 19, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "# Process responses concurrently\n", 239 | "def process_response(response):\n", 240 | " \"\"\"Process a single response by retrieving markdown content\"\"\"\n", 241 | " request_id = response[\"requestId\"]\n", 242 | " try:\n", 243 | " markdown = ap.batches.retrieve(request_id)\n", 244 | " if markdown and markdown.result:\n", 245 | " response[\"result\"] = [markdown.result[0] if markdown.result else \"\"]\n", 246 | " response[\"requestStatus\"] = \"COMPLETED\"\n", 247 | " response[\"completionTime\"] = markdown.completionTime\n", 248 | " except Exception as e:\n", 249 | " logger.error(f\"Error processing {request_id}: {str(e)}\")\n", 250 | " response[\"error\"] = [str(e)]\n", 251 | " return response" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 20, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "name": "stdout", 261 | "output_type": "stream", 262 | "text": [ 263 | "Updated all responses in ./sample_data_20250102103047.jsonl with markdown content\n" 264 | ] 265 | } 266 | ], 267 | "source": [ 268 | "# Process responses concurrently\n", 269 | "with ThreadPoolExecutor(max_workers=MAX_WORKER) as executor:\n", 270 | " future_to_response = {\n", 271 | " executor.submit(process_response, response): response\n", 272 | " for response in responses\n", 273 | " }\n", 274 | "\n", 275 | " updated_responses = []\n", 276 | " for future in as_completed(future_to_response):\n", 277 | " updated_response = future.result()\n", 278 | " updated_responses.append(updated_response)\n", 279 | "\n", 280 | "# Write all updated responses back to file\n", 281 | "with open(response_file, \"w\") as f:\n", 282 | " for response in updated_responses:\n", 283 | " f.write(json.dumps(response) + \"\\n\")\n", 284 | "\n", 285 | "print(f\"Updated all responses in {response_file} with markdown content\")" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "Print out the first row from the updated file" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 21, 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "name": "stdout", 302 | "output_type": "stream", 303 | "text": [ 304 | "First row from updated file:\n", 305 | "{\n", 306 | " \"fileName\": \"Earnings-Presentation-Q2-2024.pdf\",\n", 307 | " \"requestId\": \"cfb556cb-e5f9-4b6c-a2f7-6ba982858a92\",\n", 308 | " \"requestStatus\": \"COMPLETED\",\n", 309 | " \"result\": [\n", 310 | " \"## Meta Earnings Presentation\\n## Q2 2024\\n\\ninvestor.fb.com Meta logo, consisting of a stylized infinity symbol next to the text \\\"Meta\\\"\"\n", 311 | " ],\n", 312 | " \"completionTime\": \"2025-01-02T04:34:56.494827+00:00\"\n", 313 | "}\n" 314 | ] 315 | } 316 | ], 317 | "source": [ 318 | "# Read and print first row from the updated file\n", 319 | "with open(response_file, \"r\") as f:\n", 320 | " first_row = json.loads(f.readline())\n", 321 | " print(\"First row from updated file:\")\n", 322 | " print(json.dumps(first_row, indent=2))" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "## End of the notebook\n", 330 | "\n", 331 | "Check more [case studies](https://www.cambioml.com/blog) of CambioML!\n", 332 | "\n", 333 | "\n", 334 | " \n", 335 | "" 336 | ] 337 | } 338 | ], 339 | "metadata": { 340 | "kernelspec": 
{ 341 | "display_name": "any-parse", 342 | "language": "python", 343 | "name": "python3" 344 | }, 345 | "language_info": { 346 | "codemirror_mode": { 347 | "name": "ipython", 348 | "version": 3 349 | }, 350 | "file_extension": ".py", 351 | "mimetype": "text/x-python", 352 | "name": "python", 353 | "nbconvert_exporter": "python", 354 | "pygments_lexer": "ipython3", 355 | "version": "3.10.15" 356 | } 357 | }, 358 | "nbformat": 4, 359 | "nbformat_minor": 2 360 | } 361 | -------------------------------------------------------------------------------- /examples/parse_pdf2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 15, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Install the libraries (ipython is used for displaying markdown in this demo)\n", 10 | "# !pip3 install --upgrade ipython\n", 11 | "# !pip3 install --upgrade any-parser" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 16, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from IPython.display import display, Markdown\n", 21 | "from any_parser import AnyParser\n", 22 | "import os\n", 23 | "from dotenv import load_dotenv" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 17, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "load_dotenv(override=True)\n", 33 | "example_apikey = os.getenv(\"CAMBIO_API_KEY\")" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 18, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "ap = AnyParser(example_apikey)\n", 43 | "\n", 44 | "# Define extract_args as a dictionary with your desired parameters\n", 45 | "extract_args = {\n", 46 | " \"vqa_figures_flag\": True,\n", 47 | " \"vqa_charts_flag\": True\n", 48 | "}\n", 49 | "\n", 50 | "# Pass extract_args to the parse method\n", 51 | "markdown_output, time = ap.parse(\n", 52 | " file_path=\"./sample_data/Earnings-Presentation-Q2-2024.pdf\",\n", 53 | " extract_args=extract_args\n", 54 | ")" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 19, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "'Time Elapsed: 23.25 seconds'" 66 | ] 67 | }, 68 | "execution_count": 19, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "time" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 20, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/markdown": [ 85 | "Meta Earnings Presentation Q2 2024 \n", 86 | "\n", 87 | "investor.fb.com\n", 88 | "\n", 89 | " Meta logo, consisting of an infinity symbol followed by the text \"Meta\"\n", 90 | "\n", 91 | "Revenue by User Geography Meta logo \n", 92 | "\n", 93 | "In Millions\n", 94 | "\n", 95 | " \n", 96 | "| Quarter | US & Canada | Europe | Asia-Pacific | Rest of World | Total |\n", 97 | "|---|---|---|---|---|---|\n", 98 | "| Q2'24 | 16,847 | 9,300 | 7,888 | 5,036 | 39,071 |\n", 99 | "| Q1'24 | 15,824 | 8,483 | 7,481 | 4,667 | 36,455 |\n", 100 | "| Q4'23 | 18,585 | 9,441 | 7,512 | 4,573 | 40,111 |\n", 101 | "| Q3'23 | 15,190 | 7,777 | 6,928 | 4,251 | 34,146 |\n", 102 | "| Q2'23 | 14,422 | 7,323 | 6,515 | 3,739 | 31,999 |\n", 103 | "| Q1'23 | 13,048 | 6,345 | 5,960 | 3,292 | 28,645 |\n", 104 | "| Q4'22 | 15,636 | 7,050 | 6,050 | 3,429 | 32,165 |\n", 105 | "| Q3'22 | 13,035 | 5,797 | 5,782 | 3,100 | 27,714 |\n", 106 | "| Q2'22 | 13,249 | 6,452 | 5,797 | 3,213 
| 28,822 |\n", 107 | "\n", 108 | "This stacked bar chart shows the revenue by user geography for Meta from Q2'22 to Q2'24. The revenue is divided into four categories: US & Canada, Europe, Asia-Pacific, and Rest of World. The total revenue for each quarter is shown at the top of each bar.\n", 109 | " \n", 110 | "\n", 111 | "Our revenue by user geography is geographically apportioned based on our estimation of the geographic location of our users when they perform a revenue-generating activity. This allocation differs from our revenue disaggregated by geography disclosure in our condensed consolidated financial statements where revenue is geographically apportioned based on the addresses of our customers.\n", 112 | "\n", 113 | " 3\n", 114 | "\n", 115 | "Segment Results Meta logo \n", 116 | "\n", 117 | "In Millions\n", 118 | "\n", 119 | " \n", 120 | "| | Q2'22 | Q3'22 | Q4'22 | Q1'23 | Q2'23 | Q3'23 | Q4'23 | Q1'24 | Q2'24 |\n", 121 | "|---|---|---|---|---|---|---|---|---|---|\n", 122 | "| Advertising | $ 28,152 | $ 27,237 | $ 31,254 | $ 28,101 | $ 31,498 | $ 33,643 | $ 38,706 | $ 35,635 | $ 38,329 |\n", 123 | "| Other | 218 | 192 | 184 | 205 | 225 | 293 | 334 | 380 | 389 |\n", 124 | "| Family of Apps Revenue | 28,370 | 27,429 | 31,438 | 28,306 | 31,723 | 33,936 | 39,040 | 36,015 | 38,718 |\n", 125 | "| Reality Labs Revenue | 452 | 285 | 727 | 339 | 276 | 210 | 1,071 | 440 | 353 |\n", 126 | "| Total Revenue | $ 28,822 | $ 27,714 | $ 32,165 | $ 28,645 | $ 31,999 | $ 34,146 | $ 40,111 | $ 36,455 | $ 39,071 |\n", 127 | "| Family of Apps Operating Income | $ 11,164 | $ 9,336 | $ 10,678 | $ 11,219 | $ 13,131 | $ 17,490 | $ 21,030 | $ 17,664 | $ 19,335 |\n", 128 | "| Reality Labs Operating (Loss) | (2,806) | (3,672) | (4,279) | (3,992) | (3,739) | (3,742) | (4,646) | (3,846) | (4,488) |\n", 129 | "| Total Income from Operations | $ 8,358 | $ 5,664 | $ 6,399 | $ 7,227 | $ 9,392 | $ 13,748 | $ 16,384 | $ 13,818 | $ 14,847 |\n", 130 | "| Operating Margin | 29% | 20% | 20% | 25% | 29% | 40% | 41% | 38% | 38% |\n", 131 | " \n", 132 | "\n", 133 | "We report our financial results based on two reportable segments: Family of Apps (FoA) and Reality Labs (RL). FoA includes Facebook, Instagram, Messenger, WhatsApp, and other services. RL includes our virtual, augmented, and mixed reality related consumer hardware, software, and content.\n", 134 | "\n", 135 | " 4\n", 136 | "\n", 137 | "Net Income Meta logo \n", 138 | "\n", 139 | "In Millions\n", 140 | "\n", 141 | " \n", 142 | "| Quarter | Net Income |\n", 143 | "|---|---|\n", 144 | "| Q2'22 | $6,687 |\n", 145 | "| Q3'22 | $4,395 |\n", 146 | "| Q4'22 | $4,652 |\n", 147 | "| Q1'23 | $5,709 |\n", 148 | "| Q2'23 | $7,788 |\n", 149 | "| Q3'23 | $11,583 |\n", 150 | "| Q4'23 | $14,017 |\n", 151 | "| Q1'24 | $12,369 |\n", 152 | "| Q2'24 | $13,465 |\n", 153 | "\n", 154 | "This bar chart shows the Net Income in millions for Meta from Q2'22 to Q2'24. The y-axis ranges from $0 to $14,017 million, with increments of $1,000 million. 
The highest net income was $14,017 million in Q4'23, while the lowest was $4,395 million in Q3'22.\n", 155 | " \n", 156 | "\n", 157 | " 7\n", 158 | "\n", 159 | "Diluted Earnings Per Share Meta logo \n", 160 | "\n", 161 | " \n", 162 | "| Quarter | Earnings Per Share |\n", 163 | "|---|---|\n", 164 | "| Q2'22 | $2.46 |\n", 165 | "| Q3'22 | $1.64 |\n", 166 | "| Q4'22 | $1.76 |\n", 167 | "| Q1'23 | $2.20 |\n", 168 | "| Q2'23 | $2.98 |\n", 169 | "| Q3'23 | $4.39 |\n", 170 | "| Q4'23 | $5.33 |\n", 171 | "| Q1'24 | $4.71 |\n", 172 | "| Q2'24 | $5.16 |\n", 173 | "\n", 174 | "This bar chart shows the Diluted Earnings Per Share for Meta from Q2'22 to Q2'24. The y-axis ranges from $1.64 to $5.33, with increments of $0.02. The chart demonstrates an overall increasing trend in earnings per share over the period, with the highest point in Q4'23 at $5.33 and the lowest in Q3'22 at $1.64.\n", 175 | " \n", 176 | "\n", 177 | " 8\n", 178 | "\n", 179 | "Limitations of Key Metrics and Other Data Meta logo \n", 180 | "\n", 181 | "To calculate our estimates of DAP, we currently use a series of machine learning models that are developed based on internal reviews of limited samples of user accounts and calibrated against user survey data. We apply significant judgment in designing these models and calculating these estimates. For example, to match user accounts within individual products and across multiple products, we use data signals such as similar device information, IP addresses, and user names. We also calibrate our models against data from periodic user surveys of varying sizes and frequency across our products, which survey questions are based on monthly usage, and which are inherently subject to error. The timing and results of such user surveys have in the past contributed, and may in the future contribute, to changes in our reported Family metrics from period to period. In addition, our data limitations may affect our understanding of certain details of our business and increase the risk of error for our Family metrics estimates. Our techniques and models rely on a variety of data signals from different products, and we rely on more limited data signals for some products compared to others. For example, as a result of limited visibility into encrypted products, we have fewer data signals from WhatsApp user accounts and primarily rely on phone numbers and device information to match WhatsApp user accounts with accounts on our other products. Any loss of access to data signals we use in our process for calculating Family metrics, whether as a result of our own product decisions, actions by third-party browser or mobile platforms, regulatory or legislative requirements, or other factors, also may impact the stability or accuracy of our reported Family metrics, as well as our ability to report these metrics at all. Our estimates of Family metrics also may change as our methodologies evolve, including through the application of new data signals or technologies, product changes, or other improvements in our user surveys, algorithms, or machine learning that may improve our ability to match accounts within and across our products or otherwise evaluate the broad population of our users. In addition, such evolution may allow us to identify previously undetected violating accounts (as defined below).\n", 182 | "\n", 183 | "We regularly evaluate our Family metrics to estimate the percentage of our DAP consisting solely of \"violating\" accounts. 
We define \"violating\" accounts as accounts which we believe are intended to be used for purposes that violate our terms of service, including bots and spam. In the first quarter of 2024, we estimated that less than 3% of our worldwide DAP consisted solely of violating accounts. Such estimation is based on an internal review of a limited sample of accounts, and we apply significant judgment in making this determination. For example, we look for account information and behaviors associated with Facebook and Instagram accounts that appear to be inauthentic to the reviewers, but we have limited visibility into WhatsApp user activity due to encryption. In addition, if we believe an individual person has one or more violating accounts, we do not include such person in our violating accounts estimation as long as we believe they have one account that does not constitute a violating account. From time to time, we disable certain user accounts, make product changes, or take other actions to reduce the number of violating accounts among our users, which may also reduce our DAP estimates in a particular period. We intend to disclose our estimates of the percentage of our DAP consisting solely of violating accounts on an annual basis. Violating accounts are very difficult to measure at our scale, and it is possible that the actual number of violating accounts may vary significantly from our estimates.\n", 184 | "\n", 185 | "## User Geography\n", 186 | "\n", 187 | "Our estimates for revenue by user location, as well as year-over-year percentage changes in ad impressions delivered and the average price per ad by user location, are also affected by data limitations and other challenges in measuring user geography. Our data regarding the geographic location of our users is estimated based on a number of factors, such as the user's IP address and self-disclosed location. These factors may not always accurately reflect the user's actual location. For example, a user may appear to be accessing our products from the location of the proxy server that the user connects to rather than from the user's actual location. 
The methodologies used to measure our metrics are also susceptible to algorithm or other technical errors.\n", 188 | "\n", 189 | " 17" 190 | ], 191 | "text/plain": [ 192 | "" 193 | ] 194 | }, 195 | "metadata": {}, 196 | "output_type": "display_data" 197 | } 198 | ], 199 | "source": [ 200 | "# Join the list elements with newlines to create a single string\n", 201 | "markdown_text = '\\n\\n'.join(markdown_output)\n", 202 | "display(Markdown(markdown_text))" 203 | ] 204 | } 205 | ], 206 | "metadata": { 207 | "kernelspec": { 208 | "display_name": "any", 209 | "language": "python", 210 | "name": "python3" 211 | }, 212 | "language_info": { 213 | "codemirror_mode": { 214 | "name": "ipython", 215 | "version": 3 216 | }, 217 | "file_extension": ".py", 218 | "mimetype": "text/x-python", 219 | "name": "python", 220 | "nbconvert_exporter": "python", 221 | "pygments_lexer": "ipython3", 222 | "version": "3.10.15" 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 2 227 | } 228 | -------------------------------------------------------------------------------- /examples/sample_data/Earnings-Presentation-Q2-2024.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/Earnings-Presentation-Q2-2024.pdf -------------------------------------------------------------------------------- /examples/sample_data/cambioml_logo_large.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/cambioml_logo_large.png -------------------------------------------------------------------------------- /examples/sample_data/resume_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/resume_1.pdf -------------------------------------------------------------------------------- /examples/sample_data/resume_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/resume_1.png -------------------------------------------------------------------------------- /examples/sample_data/sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/sample.pdf -------------------------------------------------------------------------------- /examples/sample_data/stoxx_index_guide_0003.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/stoxx_index_guide_0003.pdf -------------------------------------------------------------------------------- /examples/sample_data/test1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test1.pdf -------------------------------------------------------------------------------- /examples/sample_data/test2.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test2.pdf -------------------------------------------------------------------------------- /examples/sample_data/test3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test3.pdf -------------------------------------------------------------------------------- /examples/sample_data/test3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test3.png -------------------------------------------------------------------------------- /examples/sample_data/test_1figure_1table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_1figure_1table.png -------------------------------------------------------------------------------- /examples/sample_data/test_invoice.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_invoice.pdf -------------------------------------------------------------------------------- /examples/sample_data/test_medical_report.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_medical_report.jpeg -------------------------------------------------------------------------------- /examples/sample_data/test_odf.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_odf.docx -------------------------------------------------------------------------------- /examples/sample_data/test_odf.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_odf.pptx -------------------------------------------------------------------------------- /examples/sample_data/test_w2.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_w2.docx -------------------------------------------------------------------------------- /examples/sample_data/test_w2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_w2.png -------------------------------------------------------------------------------- /examples/sample_data/test_w2.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/examples/sample_data/test_w2.pptx 
-------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "any-parser" 3 | version = "0.0.24" 4 | description = "Parser for all." 5 | authors = ["CambioML "] 6 | maintainers = ["Rachel Hu "] 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = ">=3.9,<3.13" 11 | requests = "^2.25.0" 12 | python-dotenv = "^1.0.0" 13 | pydantic = "^2.10.3" 14 | 15 | [tool.poetry.group.dev.dependencies] 16 | black = "^24.8.0" 17 | isort = "^5.13.2" 18 | autoflake = "^2.3.1" 19 | pytest = "^8.3.3" 20 | pre-commit = "^4.0.1" 21 | 22 | [tool.poetry.group.optional.dependencies] 23 | Levenshtein = [ 24 | { version = "0.25.1", python = "<3.9" }, 25 | { version = "0.26.0", python = ">=3.9" } 26 | ] 27 | 28 | [build-system] 29 | requires = ["poetry-core"] 30 | build-backend = "poetry.core.masonry.api" 31 | -------------------------------------------------------------------------------- /run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python -m unittest discover tests -v 3 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Testing 2 | Overview of running tests for the AnyParser SDK. These should be run before submitting any pull request. 3 | 4 | These tests are written using the unittest framework in Python. The tests are located in the `tests/test.py` file. Test data is located in the `tests/test_data.py` file. 5 | 6 | ## Setup 7 | 1. Install the required packages by running the following command: 8 | ```bash 9 | poetry install 10 | ``` 11 | The installed development packages are listed in the `[tool.poetry.group.dev.dependencies]` section of the `pyproject.toml` file. 12 | 13 | 2. Add a `.env` file in the `tests` folder with the following content: 14 | ```bash 15 | CAMBIO_API_KEY=************* 16 | ``` 17 | 18 | ## Pre-commit 19 | This project uses pre-commit to run checks before committing code. To initialize `pre-commit` for this repo, run the following command: 20 | ```bash 21 | pre-commit install 22 | ``` 23 | 24 | Now, with every commit, the checks will run automatically on the files added to the commit. The checks include: 25 | - `black` for code formatting 26 | - `flake8` for linting 27 | - `isort` for import sorting 28 | - running the unit tests in `tests/test.py` 29 | 30 | If you want to run the checks manually, you can run the following command: 31 | ```bash 32 | pre-commit run --all-files 33 | ``` 34 | 35 | ## Running Tests Manually 36 | 1. Make sure you are in the project root folder. 37 | 2.
Run the following command: 38 | ```bash 39 | ./run_tests.sh 40 | ``` 41 | 42 | If you just want to run an individual test within the `test.py` file, you can run the following command: 43 | ```bash 44 | python -m unittest -k <test_name> 45 | ``` 46 | 47 | For example, if you want to run `test_pdf_sync_extract`, you can run the following command: 48 | ```bash 49 | python -m unittest -k test_pdf_sync_extract 50 | ``` 51 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CambioML/any-parser/70dcb61db464c6878d02fd31e68b016c167d17d2/tests/__init__.py -------------------------------------------------------------------------------- /tests/outputs/correct_docx_output.txt: -------------------------------------------------------------------------------- 1 | ## Test document 2 | 3 | Here is an example chart: 4 | 5 | 6 | | Investor Metrics | FY23 Q1 | FY23 Q2 | FY23 Q3 | FY23 Q4 | FY24 Q1 | 7 | |---|---|---|---|---|---| 8 | | Office Commercial products and cloud services revenue growth (y/y) | 7% / 13% | 7% / 14% | 13% / 17% | 12% / 14% | 15% / 14% | 9 | | Office Consumer products and cloud services revenue growth (y/y) | 7% / 11% | (2)% / 3% | 1% / 4% | 3% / 6% | 3% / 4% | 10 | | Office 365 Commercial seat growth (a/o) | 14% | 12% | 11% | 11% | 10% | 11 | | Microsoft 365 Consumer subscribers (in millions) | 65.1 | 67.7 | 70.8 | 74.9 | 76.7 | 12 | | Dynamics products and cloud services revenue growth (y/y) | 15% / 22% | 13% / 20% | 17% / 21% | 19% / 21% | 22% / 21% | 13 | | LinkedIn revenue growth (y/y) | 17% / 21% | 10% / 14% | 8% / 11% | 6% / 8% | 8% | 14 | 15 | 16 | Growth rates include non-GAAP CC growth (GAAP % / CC %) -------------------------------------------------------------------------------- /tests/outputs/correct_pdf_output.txt: -------------------------------------------------------------------------------- 1 | STOXX INDEX METHODOLOGY GUIDE 2 | 3 | ## CONTENTS 4 | 5 | 6.5.1. OVERVIEW 49 6 | 6.5.2. INDEX REVIEW 49 7 | 6.5.3. ONGOING MAINTENANCE 51 8 | 9 | 7. STOXX BENCHMARK INDICES (BMI) 52 10 | 11 | 7.1. STOXX GLOBAL INDICES 52 12 | 7.1.1. OVERVIEW 52 13 | 7.1.2. INDEX REVIEW 53 14 | 7.1.3. ONGOING MAINTENANCE 55 15 | 16 | 7.2. STOXX GLOBAL 1800 AND DERIVED INDICES 56 17 | 7.2.1. OVERVIEW 56 18 | 7.2.2. INDEX REVIEW 56 19 | 7.2.3. ONGOING MAINTENANCE 58 20 | 21 | 7.3. SIZE INDICES BASED ON THE STOXX GLOBAL INDICES 60 22 | 7.3.1. OVERVIEW 60 23 | 7.3.2. INDEX REVIEW 60 24 | 7.3.3. ONGOING MAINTENANCE 62 25 | 26 | 7.4. SECTOR INDICES BASED ON THE STOXX GLOBAL INDICES 63 27 | 7.4.1. OVERVIEW 63 28 | 7.4.2. INDEX REVIEW 63 29 | 7.4.3. ONGOING MAINTENANCE 64 30 | 31 | 7.5. STOXX EUROPE 600 AND EURO STOXX SUPERSECTOR INDICES: 30% / 15% CAPS 65 32 | 7.5.1. OVERVIEW 65 33 | 7.5.2. INDEX REVIEW 65 34 | 7.5.3. ONGOING MAINTENANCE 66 35 | 36 | 7.6. STOXX REGIONAL REAL ESTATE INDICES: 20% CAPS67 37 | 7.6.1. OVERVIEW 67 38 | 7.6.2. INDEX REVIEW 67 39 | 7.6.3. ONGOING MAINTENANCE 67 40 | 41 | 7.7. STOXX EMERGING MARKETS 800 LO 68 42 | 7.7.1. OVERVIEW 68 43 | 7.7.2. INDEX REVIEW 68 44 | 7.7.3. ONGOING MAINTENANCE 68 45 | 46 | 7.8. STOXX INDUSTRY AND SUPERSECTOR LEGACY INDICES 70 47 | 7.8.1. OVERVIEW 70 48 | 7.8.2. INDEX REVIEW 71 49 | 7.8.3. ONGOING MAINTENANCE 71 50 | 51 | 7.9. EURO STOXX SUPERSECTOR 5/10/40 INDICES 72 52 | 7.9.1. OVERVIEW 72 53 | 7.9.2. INDEX REVIEW 72 54 | 7.9.3. ONGOING MAINTENANCE 73 55 | 56 | 7.10.
STOXX EUROPE 600 INDUSTRY 30-15 INDICES 74 57 | 7.10.1. OVERVIEW 74 58 | 7.10.2. INDEX REVIEW 74 59 | 7.10.3. ONGOING MAINTENANCE 75 60 | 61 | 7.11. STOXX SEMICONDUCTOR 30 INDEX 76 62 | 7.11.1. OVERVIEW 76 63 | 7.11.2. INDEX REVIEW 76 64 | 7.11.3. ONGOING MAINTENANCE 77 65 | 66 | ## 8. STOXX EQUAL WEIGHT INDICES 78 67 | 68 | 8.1. STOXX EQUAL WEIGHT INDICES 78 69 | 8.1.1. OVERVIEW 78 70 | 8.1.2. INDEX REVIEW 78 71 | 8.1.3. ONGOING MAINTENANCE 78 72 | 73 | ## 9. STOXX BLUE-CHIP INDICES 80 74 | 75 | 9.1. STOXX GLOBAL AND COUNTRY BLUE-CHIP INDICES 80 76 | 9.1.1. OVERVIEW 80 77 | 9.1.2. INDEX REVIEW 81 78 | 9.1.3. ONGOING MAINTENANCE 84 79 | 80 | 9.2. EURO STOXX 50 85 81 | 9.2.1. OVERVIEW 85 82 | 9.2.2. INDEX REVIEW 85 83 | 9.2.3. ONGOING MAINTENANCE 86 84 | 85 | 9.3. STOXX REGIONAL BLUE-CHIP INDICES 88 86 | 9.3.1. OVERVIEW 88 87 | 9.3.2. INDEX REVIEW 88 88 | 9.3.3. ONGOING MAINTENANCE 89 89 | 90 | 9.4. STOXX GLOBAL 150 91 91 | 9.4.1. OVERVIEW 91 92 | 9.4.2. INDEX REVIEW 91 93 | 9.4.3. ONGOING MAINTENANCE 91 94 | 95 | 9.5. STOXX BALKAN 50 EQUAL WEIGHT 92 96 | 9.5.1. OVERVIEW 92 97 | 9.5.2. INDEX REVIEW 92 98 | 9.5.3. ONGOING MAINTENANCE 93 99 | 100 | 9.6. STOXX CANADA 60 94 101 | 9.6.1. OVERVIEW 94 102 | 9.6.2. INDEX REVIEW 94 103 | 9.6.3. ONGOING MAINTENANCE 95 104 | 105 | ## 10. STOXX DIVIDEND INDICES 96 106 | 107 | 10.1. STOXX SELECT DIVIDEND INDICES 96 108 | 10.1.1. OVERVIEW 96 109 | 10.1.2. INDEX REVIEW 96 110 | 10.1.3. STOXX SELECT DIVIDEND INDICES 99 111 | 10.1.4. ONGOING MAINTENANCE 101 112 | 113 | 10.2. STOXX ASEAN-FIVE SELECT DIVIDEND 50 104 114 | 10.2.1. OVERVIEW 104 115 | 10.2.2. INDEX REVIEW 104 116 | 10.2.3. ONGOING MAINTENANCE 105 117 | 118 | 10.3. STOXX ASEAN SELECT DIVIDEND 30 106 119 | 120 | 3/529 121 | 122 | Part of DEUTSCHE BÖRSE GROUP -------------------------------------------------------------------------------- /tests/outputs/correct_png_output.txt: -------------------------------------------------------------------------------- 1 | | Investor Metrics | FY23 Q1 | FY23 Q2 | FY23 Q3 | FY23 Q4 | FY24 Q1 | 2 | |---|---|---|---|---|---| 3 | | Office Commercial products and cloud services revenue growth (y/y) | 7% / 13% | 7% / 14% | 13% / 17% | 12% / 14% | 15% / 14% | 4 | | Office Consumer products and cloud services revenue growth (y/y) | 7% / 11% | (2)% / 3% | 1% / 4% | 3% / 6% | 3% / 4% | 5 | | Office 365 Commercial seat growth (y/y) | 14% | 12% | 11% | 11% | 10% | 6 | | Microsoft 365 Consumer subscribers (in millions) | 65.1 | 67.7 | 70.8 | 74.9 | 76.7 | 7 | | Dynamics products and cloud services revenue growth (y/y) | 15% / 22% | 13% / 20% | 17% / 21% | 19% / 21% | 22% / 21% | 8 | | LinkedIn revenue growth (y/y) | 17% / 21% | 10% / 14% | 8% / 11% | 6% / 8% | 8% | -------------------------------------------------------------------------------- /tests/outputs/correct_pptx_output.txt: -------------------------------------------------------------------------------- 1 | ## Test finical report 2 | ## Title 3 | 4 | • Chart 1 example 5 | 6 | 7 | | Investor Metrics | FY23 Q1 | FY23 Q2 | FY23 Q3 | FY23 Q4 | FY24 Q1 | 8 | |---|---|---|---|---|---| 9 | | Office Commercial products and cloud services revenue growth (y/y) | 7% / 13% | 7% / 14% | 13% / 17% | 12% / 14% | 15% / 14% | 10 | | Office Consumer products and cloud services revenue growth (y/y) | 7% / 11% | (2)% / 3% | 1% / 4% | 3% / 6% | 3% / 4% | 11 | | Office 365 Commercial seat growth (y/y) | 14% | 12% | 11% | 11% | 10% | 12 | | Microsoft 365 Consumer subscribers (in millions) | 65.1 | 67.7 | 70.8 | 74.9 | 76.7 | 13 | | 
Dynamics products and cloud services revenue growth (y/y) | 15% / 22% | 13% / 20% | 17% / 21% | 19% / 21% | 22% / 21% | 14 | | LinkedIn revenue growth (y/y) | 17% / 21% | 10% / 14% | 8% / 11% | 6% / 8% | 8% | 15 | 16 | 17 | Growth rates include non-GAAP CC growth (GAAP % / CC %). 18 | Thanks -------------------------------------------------------------------------------- /tests/test.py: -------------------------------------------------------------------------------- 1 | """Testing Synchronous and Asynchronous Extraction""" 2 | 3 | import base64 4 | import os 5 | import sys 6 | import time 7 | import unittest 8 | from pathlib import Path 9 | 10 | import Levenshtein 11 | from dotenv import load_dotenv 12 | 13 | from tests.test_data import EXTRACT_JSON_TEST_DATA 14 | 15 | sys.path.append(".") 16 | load_dotenv(override=True) 17 | from any_parser import AnyParser # noqa: E402 18 | 19 | 20 | def get_ground_truth(file_path: str) -> str: 21 | """Get the ground truth from the file.""" 22 | with open(file_path, "r", encoding="utf-8") as file: 23 | return file.read() 24 | 25 | 26 | def _preprocess_markdown_text(text: str) -> str: 27 | """Clean the markdown text.""" 28 | return text.replace("#", "").replace("\n", "") 29 | 30 | 31 | def compare_markdown(generated_output: str, correct_output: str) -> float: 32 | """ 33 | Compare the generated markdown to the correct markdown using 34 | Levenshtein Distance. 35 | """ 36 | # Preprocess both outputs to clean markdown text 37 | generated_output = _preprocess_markdown_text(generated_output) 38 | correct_output = _preprocess_markdown_text(correct_output) 39 | 40 | distance = Levenshtein.distance(generated_output, correct_output) 41 | 42 | max_len = max(len(generated_output), len(correct_output)) 43 | similarity_percentage = ((max_len - distance) / max_len) * 100 44 | 45 | return similarity_percentage 46 | 47 | 48 | class TestAnyParser(unittest.TestCase): 49 | """Testing Any Parser""" 50 | 51 | def setUp(self): 52 | self.api_key = os.environ.get("CAMBIO_API_KEY") 53 | if not self.api_key: 54 | raise ValueError("CAMBIO_API_KEY is not set") 55 | self.ap = AnyParser(self.api_key) 56 | 57 | def test_pdf_sync_parse(self): 58 | """Synchronous PDF Parse""" 59 | working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf" 60 | correct_output_file = "./tests/outputs/correct_pdf_output.txt" 61 | 62 | # extract 63 | markdown_list, elapsed_time = self.ap.parse(file_path=working_file) 64 | markdown = "\n".join(markdown_list) 65 | self.assertFalse(markdown.startswith("Error:"), markdown) 66 | correct_output = get_ground_truth(correct_output_file) 67 | percentage = compare_markdown(markdown, correct_output) 68 | 69 | self.assertGreaterEqual( 70 | percentage, 90, f"Output similarity too low: {percentage:.2f}%" 71 | ) 72 | self.assertIn("Time Elapsed", elapsed_time) 73 | 74 | def test_pdf_sync_parse_with_file_content(self): 75 | """Synchronous PDF Parse with file content""" 76 | working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf" 77 | correct_output_file = "./tests/outputs/correct_pdf_output.txt" 78 | 79 | with open(working_file, "rb") as file: 80 | file_content = base64.b64encode(file.read()).decode("utf-8") 81 | file_type = Path(working_file).suffix.lower().lstrip(".") 82 | 83 | # extract 84 | markdown_list, elapsed_time = self.ap.parse( 85 | file_content=file_content, file_type=file_type 86 | ) 87 | markdown = "\n".join(markdown_list) 88 | 89 | self.assertFalse(markdown.startswith("Error:"), markdown) 90 | correct_output = 
get_ground_truth(correct_output_file) 91 | percentage = compare_markdown(markdown, correct_output) 92 | 93 | self.assertGreaterEqual( 94 | percentage, 90, f"Output similarity too low: {percentage:.2f}%" 95 | ) 96 | self.assertIn("Time Elapsed", elapsed_time) 97 | 98 | def test_pdf_async_parse_and_fetch(self): 99 | """Asynchronous PDF Parse and Fetch""" 100 | working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf" 101 | correct_output_file = "./tests/outputs/correct_pdf_output.txt" 102 | 103 | # extract 104 | file_id = self.ap.async_parse(file_path=working_file) 105 | self.assertFalse(file_id.startswith("Error:"), file_id) 106 | # fetch 107 | markdown_list = self.ap.async_fetch(file_id=file_id) 108 | markdown = "\n".join(markdown_list) 109 | self.assertFalse(markdown.startswith("Error:"), markdown) 110 | correct_output = get_ground_truth(correct_output_file) 111 | percentage = compare_markdown(markdown, correct_output) 112 | 113 | self.assertGreaterEqual( 114 | percentage, 90, f"Output similarity too low: {percentage:.2f}%" 115 | ) 116 | 117 | def test_pdf_async_parse_and_fetch_with_file_content(self): 118 | """Asynchronous PDF Parse and Fetch with file content""" 119 | working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf" 120 | correct_output_file = "./tests/outputs/correct_pdf_output.txt" 121 | 122 | with open(working_file, "rb") as file: 123 | file_content = base64.b64encode(file.read()).decode("utf-8") 124 | file_type = Path(working_file).suffix.lower().lstrip(".") 125 | 126 | # extract 127 | file_id = self.ap.async_parse(file_content=file_content, file_type=file_type) 128 | self.assertFalse(file_id.startswith("Error:"), file_id) 129 | # fetch 130 | markdown_list = self.ap.async_fetch(file_id=file_id) 131 | markdown = "\n".join(markdown_list) 132 | self.assertFalse(markdown.startswith("Error:"), markdown) 133 | correct_output = get_ground_truth(correct_output_file) 134 | percentage = compare_markdown(markdown, correct_output) 135 | 136 | self.assertGreaterEqual( 137 | percentage, 90, f"Output similarity too low: {percentage:.2f}%" 138 | ) 139 | 140 | def test_docx_sync_extract(self): 141 | """Synchronous Word Extraction""" 142 | working_file = "./examples/sample_data/test_odf.docx" 143 | correct_output_file = "./tests/outputs/correct_docx_output.txt" 144 | 145 | # extract 146 | markdown_list, elapsed_time = self.ap.parse(file_path=working_file) 147 | markdown = "\n".join(markdown_list) 148 | self.assertFalse(markdown.startswith("Error:"), markdown) 149 | correct_output = get_ground_truth(correct_output_file) 150 | percentage = compare_markdown(markdown, correct_output) 151 | 152 | self.assertGreaterEqual( 153 | percentage, 90, f"Output similarity too low: {percentage:.2f}%" 154 | ) 155 | self.assertIn("Time Elapsed", elapsed_time) 156 | 157 | def test_docx_async_parse_and_fetch(self): 158 | """Asynchronous Word Parse and Fetch""" 159 | working_file = "./examples/sample_data/test_odf.docx" 160 | correct_output_file = "./tests/outputs/correct_docx_output.txt" 161 | 162 | # extract 163 | file_id = self.ap.async_parse(file_path=working_file) 164 | self.assertFalse(file_id.startswith("Error:"), file_id) 165 | # fetch 166 | markdown_list = self.ap.async_fetch(file_id=file_id) 167 | markdown = "\n".join(markdown_list) 168 | self.assertFalse(markdown.startswith("Error:"), markdown) 169 | correct_output = get_ground_truth(correct_output_file) 170 | percentage = compare_markdown(markdown, correct_output) 171 | 172 | self.assertGreaterEqual( 173 | percentage, 90, f"Output 
similarity too low: {percentage:.2f}%" 174 | ) 175 | 176 | def test_pptx_sync_extract(self): 177 | """Synchronous Powerpoint Extraction""" 178 | working_file = "./examples/sample_data/test_odf.pptx" 179 | correct_output_file = "./tests/outputs/correct_pptx_output.txt" 180 | 181 | # extract 182 | markdown_list, elapsed_time = self.ap.parse(file_path=working_file) 183 | markdown = "\n".join(markdown_list) 184 | self.assertFalse(markdown.startswith("Error:"), markdown) 185 | correct_output = get_ground_truth(correct_output_file) 186 | percentage = compare_markdown(markdown, correct_output) 187 | 188 | self.assertGreaterEqual( 189 | percentage, 90, f"Output similarity too low: {percentage:.2f}%" 190 | ) 191 | self.assertIn("Time Elapsed", elapsed_time) 192 | 193 | def test_pptx_async_parse_and_fetch(self): 194 | """Asynchronous Powerpoint Parse and Fetch""" 195 | working_file = "./examples/sample_data/test_odf.pptx" 196 | correct_output_file = "./tests/outputs/correct_pptx_output.txt" 197 | 198 | # extract 199 | file_id = self.ap.async_parse(file_path=working_file) 200 | self.assertFalse(file_id.startswith("Error:"), file_id) 201 | # fetch 202 | markdown_list = self.ap.async_fetch(file_id=file_id) 203 | markdown = "\n".join(markdown_list) 204 | self.assertFalse(markdown.startswith("Error:"), markdown) 205 | correct_output = get_ground_truth(correct_output_file) 206 | percentage = compare_markdown(markdown, correct_output) 207 | 208 | self.assertGreaterEqual( 209 | percentage, 90, f"Output similarity too low: {percentage:.2f}%" 210 | ) 211 | 212 | def test_image_sync_extract(self): 213 | """Synchronous Image Extraction""" 214 | working_file = "./examples/sample_data/test3.png" 215 | correct_output_file = "./tests/outputs/correct_png_output.txt" 216 | 217 | # extract 218 | markdown_list, elapsed_time = self.ap.parse(file_path=working_file) 219 | markdown = "\n".join(markdown_list) 220 | self.assertFalse(markdown.startswith("Error:"), markdown) 221 | correct_output = get_ground_truth(correct_output_file) 222 | percentage = compare_markdown(markdown, correct_output) 223 | 224 | self.assertGreaterEqual( 225 | percentage, 90, f"Output similarity too low: {percentage:.2f}%" 226 | ) 227 | self.assertIn("Time Elapsed", elapsed_time) 228 | 229 | def test_image_async_parse_and_fetch(self): 230 | """Asynchronous Image Parse and Fetch""" 231 | working_file = "./examples/sample_data/test3.png" 232 | correct_output_file = "./tests/outputs/correct_png_output.txt" 233 | 234 | # extract 235 | file_id = self.ap.async_parse(file_path=working_file) 236 | self.assertFalse(file_id.startswith("Error:"), file_id) 237 | # fetch 238 | markdown_list = self.ap.async_fetch(file_id=file_id) 239 | markdown = "\n".join(markdown_list) 240 | self.assertFalse(markdown.startswith("Error:"), markdown) 241 | correct_output = get_ground_truth(correct_output_file) 242 | percentage = compare_markdown(markdown, correct_output) 243 | 244 | self.assertGreaterEqual( 245 | percentage, 90, f"Output similarity too low: {percentage:.2f}%" 246 | ) 247 | 248 | def test_sync_extract_key_value(self): 249 | """ 250 | Synchronous JSON Extraction with subtests for different file formats 251 | """ 252 | for data in EXTRACT_JSON_TEST_DATA: 253 | with self.subTest(working_file=data["working_file"]): 254 | # extract 255 | key_value_result, elapsed_time = self.ap.extract_key_value( 256 | file_path=data["working_file"], 257 | extract_instruction=data["extract_instruction"], 258 | ) 259 | 260 | # assertions 261 | self.assertEqual(key_value_result, 
data["correct_output"]) 262 | self.assertIn("Time Elapsed", elapsed_time) 263 | 264 | def test_async_extract_key_value_and_fetch(self): 265 | """ 266 | Asynchronous JSON Extraction with subtests for different file formats 267 | """ 268 | for data in EXTRACT_JSON_TEST_DATA: 269 | with self.subTest(working_file=data["working_file"]): 270 | # extract 271 | file_id = self.ap.async_extract_key_value( 272 | file_path=data["working_file"], 273 | extract_instruction=data["extract_instruction"], 274 | ) 275 | self.assertFalse(file_id.startswith("Error:"), file_id) 276 | # fetch 277 | key_value_result = self.ap.async_fetch(file_id=file_id) 278 | # assertions 279 | self.assertEqual(key_value_result, data["correct_output"]) 280 | # wait 1 s between requests 281 | time.sleep(1) 282 | 283 | 284 | if __name__ == "__main__": 285 | unittest.main(verbosity=2) 286 | -------------------------------------------------------------------------------- /tests/test_batch_api.py: -------------------------------------------------------------------------------- 1 | """Testing Batch API Extraction""" 2 | 3 | import os 4 | import sys 5 | import unittest 6 | 7 | from dotenv import load_dotenv 8 | 9 | sys.path.append(".") 10 | load_dotenv(override=True) 11 | from any_parser import AnyParser # noqa: E402 12 | 13 | 14 | class TestAnyParserBatchAPI(unittest.TestCase): 15 | """Testing Any Parser Batch API""" 16 | 17 | def setUp(self): 18 | self.api_key = os.environ.get("CAMBIO_API_KEY") 19 | if not self.api_key: 20 | raise ValueError("CAMBIO_API_KEY is not set") 21 | self.ap = AnyParser(self.api_key) 22 | 23 | def test_batch_api_create(self): 24 | """Batch API Create""" 25 | working_file = "./examples/sample_data/stoxx_index_guide_0003.pdf" 26 | 27 | response = self.ap.batches.create(working_file) 28 | 29 | self.assertIsNotNone(response) 30 | self.assertEqual(response.requestStatus, "UPLOADED") 31 | 32 | request_id = response.requestId 33 | status = self.ap.batches.retrieve(request_id) 34 | self.assertEqual(status.requestStatus, "UPLOADED") 35 | 36 | quota = self.ap.batches.get_usage() 37 | self.assertGreaterEqual(quota.pageRemaining, 0) 38 | -------------------------------------------------------------------------------- /tests/test_data.py: -------------------------------------------------------------------------------- 1 | EXTRACT_JSON_TEST_DATA = [ 2 | { 3 | "working_file": "./examples/sample_data/test1.pdf", 4 | "extract_instruction": { 5 | "social_security_number": "the social security number of the employee", 6 | "ein": "the employer identification number", 7 | "first_name": "the first name of the employee", 8 | "last_name": "the last name of the employee", 9 | }, 10 | "correct_output": { 11 | "social_security_number": ["758-58-5787"], 12 | "ein": ["78-8778788"], 13 | "first_name": ["Jesan"], 14 | "last_name": ["Rahaman"], 15 | }, 16 | }, 17 | # { 18 | # "working_file": "./examples/sample_data/test_w2.pptx", 19 | # "extract_instruction": { 20 | # "social_security_number": "the social security number of the employee", 21 | # "ein": "the employer identification number", 22 | # "first_name": "the first name of the employee", 23 | # "last_name": "the last name of the employee", 24 | # }, 25 | # "correct_output": [ 26 | # { 27 | # "social_security_number": ["758-58-5787"], 28 | # "ein": ["78-8778788"], 29 | # "first_name": ["Jesan"], 30 | # "last_name": ["Rahaman"], 31 | # } 32 | # ], 33 | # }, 34 | # { 35 | # "working_file": "./examples/sample_data/test_w2.docx", 36 | # "extract_instruction": { 37 | # 
"social_security_number": "the social security number of the employee", 38 | # "ein": "the employer identification number", 39 | # "first_name": "the first name of the employee", 40 | # "last_name": "the last name of the employee", 41 | # }, 42 | # "correct_output": [ 43 | # { 44 | # "social_security_number": ["758-58-5787"], 45 | # "ein": ["78-8778788"], 46 | # "first_name": ["Jesan"], 47 | # "last_name": ["Rahaman"], 48 | # } 49 | # ], 50 | # }, 51 | { 52 | "working_file": "./examples/sample_data/test_w2.png", 53 | "extract_instruction": { 54 | "social_security_number": "the social security number of the employee", 55 | "ein": "the employer identification number", 56 | "first_name": "the first name of the employee", 57 | "last_name": "the last name of the employee", 58 | }, 59 | "correct_output": { 60 | "social_security_number": ["758-58-5787"], 61 | "ein": ["78-8778788"], 62 | "first_name": ["Jesan"], 63 | "last_name": ["Rahaman"], 64 | }, 65 | }, 66 | ] 67 | --------------------------------------------------------------------------------