├── .github
│   ├── ISSUE_TEMPLATE
│   │   └── bug_report.md
│   ├── dependabot.yml
│   └── workflows
│       ├── bump_libraries.yaml
│       ├── ci.yml
│       └── docker-publish.yml
├── .gitignore
├── CHANGELOG.md
├── Dockerfile
├── LICENSE.md
├── Makefile
├── README.md
├── docker
│   └── rockylinux-9.4
│       └── Dockerfile
├── exploration-notebooks
│   ├── exploration-email.ipynb
│   ├── exploration-html.ipynb
│   └── exploration-powerpoint.ipynb
├── img
│   ├── email-screenshot.png
│   └── unstructured_logo.png
├── logger_config.yaml
├── prepline_general
│   ├── __init__.py
│   └── api
│       ├── __init__.py
│       ├── app.py
│       ├── filetypes.py
│       ├── general.py
│       ├── models
│       │   ├── __init__.py
│       │   └── form_params.py
│       ├── openapi.py
│       └── utils.py
├── preprocessing-pipeline-family.yaml
├── pyproject.toml
├── requirements
│   ├── base.in
│   ├── base.txt
│   ├── constraints.in
│   ├── test.in
│   └── test.txt
├── sample-docs
│   ├── .gitkeep
│   ├── DA-1p-with-duplicate-pages.pdf
│   ├── DA-1p.bmp
│   ├── DA-1p.heic
│   ├── README.md
│   ├── README.rst
│   ├── alert.eml
│   ├── announcement.eml
│   ├── embedded-images-tables.jpg
│   ├── embedded-images-tables.pdf
│   ├── english-and-korean.png
│   ├── fake-doc.rtf
│   ├── fake-email-attachment.eml
│   ├── fake-email-image-embedded.eml
│   ├── fake-email.eml
│   ├── fake-email.msg
│   ├── fake-html.html
│   ├── fake-power-point.ppt
│   ├── fake-power-point.pptx
│   ├── fake-text-utf-32.txt
│   ├── fake-text.txt
│   ├── fake-xml.xml
│   ├── fake.doc
│   ├── fake.docx
│   ├── fake.odt
│   ├── family-day.eml
│   ├── layout-parser-paper-fast.jpg
│   ├── layout-parser-paper-fast.pdf
│   ├── layout-parser-paper-fast.tiff
│   ├── layout-parser-paper-with-table.jpg
│   ├── layout-parser-paper.pdf
│   ├── layout-parser-paper.pdf.gz
│   ├── list-item-example.pdf
│   ├── notes.ppt
│   ├── notes.pptx
│   ├── spring-weather.html.json
│   ├── stanley-cups.csv
│   ├── stanley-cups.tsv
│   ├── stanley-cups.xlsx
│   └── winter-sports.epub
├── scripts
│   ├── app-start.sh
│   ├── docker-build.sh
│   ├── docker-smoke-test.sh
│   ├── install-pandoc.sh
│   ├── parallel-mode-test.sh
│   ├── shellcheck.sh
│   ├── smoketest.py
│   ├── version-increment.sh
│   └── version-sync.sh
├── setup.cfg
└── test_general
    ├── __init__.py
    └── api
        ├── .gitkeep
        ├── __init__.py
        ├── test_app.py
        ├── test_deprecated_api.py
        ├── test_gzip.py
        └── test_utils.py
/.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Please provide as much info as possible: 15 | 16 | - Filetype: 17 | - Any additional API parameters: 18 | 19 | **Environment:** 20 | - Using the hosted API or self hosting? 21 | - How are you calling the API? (Langchain, SDKs, cUrl, etc.) 22 | 23 | **Additional context** 24 | Add any other context about the problem here. 
25 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/requirements" 5 | schedule: 6 | interval: "daily" 7 | # Only use this to bump our libraries 8 | allow: 9 | - dependency-name: "unstructured[local-inference]" 10 | 11 | - package-ecosystem: "github-actions" 12 | # NOTE(robinson) - Workflow files stored in the 13 | # default location of `.github/workflows` 14 | directory: "/" 15 | schedule: 16 | interval: "weekly" 17 | -------------------------------------------------------------------------------- /.github/workflows/bump_libraries.yaml: -------------------------------------------------------------------------------- 1 | name: Dependabot - Bump libs and cut release 2 | 3 | on: 4 | pull_request: 5 | types: 6 | - opened 7 | - reopened 8 | paths: 9 | - 'requirements/**' 10 | 11 | env: 12 | PYTHON_VERSION: "3.8" 13 | 14 | jobs: 15 | bump-changelog: 16 | runs-on: ubuntu-latest 17 | if: ${{ github.actor == 'dependabot[bot]' }} 18 | permissions: 19 | contents: write 20 | steps: 21 | - uses: actions/checkout@v4 22 | - name: Set up Python ${{ env.PYTHON_VERSION }} 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: ${{ env.PYTHON_VERSION }} 26 | - name: Dependabot metadata 27 | id: metadata 28 | uses: dependabot/fetch-metadata@v2 29 | with: 30 | github-token: "${{ secrets.GITHUB_TOKEN }}" 31 | - name: Create release version 32 | run: | 33 | pip install pip-tools 34 | make pip-compile 35 | package=${{ steps.metadata.outputs.dependency-names }} 36 | # Strip any [extras] from name 37 | package=${package%\[*} 38 | changelog_message="Bump $package to ${{ steps.metadata.outputs.new-version }}" 39 | ./scripts/version-increment.sh "$changelog_message" 40 | make version-sync 41 | - uses: stefanzweifel/git-auto-commit-action@v5 42 | with: 43 | commit_message: "Bump libraries and release" 44 | 45 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | env: 10 | PYTHON_VERSION: "3.10" 11 | PIPELINE_FAMILY: "general" 12 | 13 | jobs: 14 | setup: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | - uses: actions/cache@v4 19 | id: virtualenv-cache 20 | with: 21 | path: | 22 | .venv 23 | key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/base.txt') }} 24 | - name: Set up Python ${{ env.PYTHON_VERSION }} 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{ env.PYTHON_VERSION }} 28 | - name: Setup virtual environment (no cache hit) 29 | if: steps.virtualenv-cache.outputs.cache-hit != 'true' 30 | run: | 31 | python${{ env.PYTHON_VERSION }} -m venv .venv 32 | source .venv/bin/activate 33 | make install-ci 34 | 35 | lint: 36 | runs-on: ubuntu-latest 37 | needs: setup 38 | steps: 39 | - uses: actions/checkout@v4 40 | - uses: actions/cache@v4 41 | id: virtualenv-cache 42 | with: 43 | path: | 44 | .venv 45 | key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/base.txt') }} 46 | - name: Lint 47 | run: | 48 | source .venv/bin/activate 49 | make check 50 | 51 | shellcheck: 52 | runs-on: ubuntu-latest 53 | steps: 54 | - uses: actions/checkout@v4 55 | - name: ShellCheck 56 | uses: 
ludeeus/action-shellcheck@master 57 | 58 | test: 59 | runs-on: ubuntu-latest-m 60 | needs: [setup, lint] 61 | steps: 62 | - uses: actions/checkout@v4 63 | - uses: actions/cache@v4 64 | id: virtualenv-cache 65 | with: 66 | path: | 67 | .venv 68 | key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/base.txt') }} 69 | - name: Run core tests 70 | run: | 71 | source .venv/bin/activate 72 | sudo apt-get update && sudo apt-get install --yes poppler-utils libreoffice 73 | make install-pandoc 74 | sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 75 | sudo apt-get install -y tesseract-ocr tesseract-ocr-kor 76 | tesseract --version 77 | make install-nltk-models 78 | make test 79 | make check-coverage 80 | 81 | changelog: 82 | runs-on: ubuntu-latest 83 | steps: 84 | - uses: actions/checkout@v4 85 | - if: github.ref != 'refs/heads/main' 86 | uses: dorny/paths-filter@v3 87 | id: changes 88 | with: 89 | filters: | 90 | src: 91 | - 'doc_recipe/**' 92 | - 'recipe-notebooks/**' 93 | 94 | - if: steps.changes.outputs.src == 'true' && github.ref != 'refs/heads/main' 95 | uses: dangoslen/changelog-enforcer@v3 96 | 97 | # TODO - figure out best practice for caching docker images 98 | # (Using the virtualenv to get pytest) 99 | test_dockerfile: 100 | runs-on: ubuntu-latest-m 101 | needs: [setup, lint] 102 | steps: 103 | - uses: actions/checkout@v4 104 | - uses: actions/cache@v4 105 | id: virtualenv-cache 106 | with: 107 | path: | 108 | .venv 109 | key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/base.txt') }} 110 | - name: Test Dockerfile 111 | run: | 112 | source .venv/bin/activate 113 | make docker-build 114 | make docker-test 115 | # - name: Scan image 116 | # uses: anchore/scan-action@v3 117 | # with: 118 | # image: "pipeline-family-${{ env.PIPELINE_FAMILY }}-dev" 119 | # # NOTE(robinson) - revert this to medium when we bump libreoffice 120 | # severity-cutoff: critical 121 | -------------------------------------------------------------------------------- /.github/workflows/docker-publish.yml: -------------------------------------------------------------------------------- 1 | name: Build And Push Docker Image 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | env: 9 | DOCKER_REPOSITORY: quay.io/unstructured-io/unstructured-api 10 | DOCKER_BUILD_REPOSITORY: quay.io/unstructured-io/build-unstructured-api 11 | PACKAGE: "unstructured-api" 12 | PIPELINE_FAMILY: "general" 13 | PIP_VERSION: "25.1.1" 14 | PYTHON_VERSION: "3.10" 15 | 16 | jobs: 17 | setup: 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v4 21 | - uses: actions/cache@v4 22 | id: virtualenv-cache 23 | with: 24 | path: | 25 | .venv 26 | key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }} 27 | - name: Set up Python ${{ env.PYTHON_VERSION }} 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: ${{ env.PYTHON_VERSION }} 31 | - name: Setup virtual environment (no cache hit) 32 | if: steps.virtualenv-cache.outputs.cache-hit != 'true' 33 | run: | 34 | python${{ env.PYTHON_VERSION }} -m venv .venv 35 | source .venv/bin/activate 36 | make install-ci 37 | set-short-sha: 38 | runs-on: ubuntu-latest 39 | outputs: 40 | short_sha: ${{ steps.set_short_sha.outputs.short_sha }} 41 | steps: 42 | - name: Set Short SHA 43 | id: set_short_sha 44 | run: echo "short_sha=$(echo ${{ github.sha }} | cut -c1-7)" >> $GITHUB_OUTPUT 45 | build-images: 46 | strategy: 47 | matrix: 48 | #arch: ["arm64", "amd64"] 49 | # NOTE(luke): temporary disable arm64 since its failing the 
smoke test 50 | arch: ["amd64"] 51 | runs-on: ubuntu-latest-m 52 | needs: [setup, set-short-sha] 53 | env: 54 | SHORT_SHA: ${{ needs.set-short-sha.outputs.short_sha }} 55 | DOCKER_PLATFORM: linux/${{ matrix.arch }} 56 | steps: 57 | - name: Set up Docker Buildx 58 | uses: docker/setup-buildx-action@v3 59 | with: 60 | driver: ${{ matrix.arch == 'amd64' && 'docker' || 'docker-container' }} 61 | - name: Checkout code 62 | uses: actions/checkout@v4 63 | - name: Login to Quay.io 64 | uses: docker/login-action@v3 65 | with: 66 | registry: quay.io 67 | username: ${{ secrets.QUAY_IO_ROBOT_USERNAME }} 68 | password: ${{ secrets.QUAY_IO_ROBOT_TOKEN }} 69 | - name: Build image 70 | run: | 71 | # Clear some space (https://github.com/actions/runner-images/issues/2840) 72 | sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/share/boost 73 | 74 | DOCKER_BUILDKIT=1 docker buildx build --load -f Dockerfile \ 75 | --platform=$DOCKER_PLATFORM \ 76 | --build-arg PIP_VERSION=$PIP_VERSION \ 77 | --build-arg BUILDKIT_INLINE_CACHE=1 \ 78 | --build-arg PIPELINE_PACKAGE=${{ env.PIPELINE_FAMILY }} \ 79 | --provenance=false \ 80 | --progress plain \ 81 | --cache-from $DOCKER_BUILD_REPOSITORY:${{ matrix.arch }} \ 82 | -t $DOCKER_BUILD_REPOSITORY:${{ matrix.arch }}-$SHORT_SHA . 83 | - name: Set virtualenv cache 84 | uses: actions/cache@v4 85 | id: virtualenv-cache 86 | with: 87 | path: | 88 | .venv 89 | key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }} 90 | - name: Set up QEMU 91 | uses: docker/setup-qemu-action@v3 92 | - name: Test image 93 | run: | 94 | source .venv/bin/activate 95 | export DOCKER_IMAGE="$DOCKER_BUILD_REPOSITORY:${{ matrix.arch }}-$SHORT_SHA" 96 | if [ "$DOCKER_PLATFORM" == "linux/arm64" ]; then 97 | SKIP_INFERENCE_TESTS=true make docker-test 98 | else 99 | make docker-test 100 | fi 101 | - name: Push image 102 | run: | 103 | # write to the build repository to cache for the publish-images job 104 | docker push $DOCKER_BUILD_REPOSITORY:${{ matrix.arch }}-$SHORT_SHA 105 | publish-images: 106 | runs-on: ubuntu-latest-m 107 | needs: [setup, set-short-sha, build-images] 108 | env: 109 | SHORT_SHA: ${{ needs.set-short-sha.outputs.short_sha }} 110 | steps: 111 | - name: Checkout code 112 | uses: actions/checkout@v4 113 | - name: Set SHORT_SHA 114 | run: echo "SHORT_SHA=$(git rev-parse --short HEAD)" >> $GITHUB_ENV 115 | - name: Login to Quay.io 116 | uses: docker/login-action@v3 117 | with: 118 | registry: quay.io 119 | username: ${{ secrets.QUAY_IO_ROBOT_USERNAME }} 120 | password: ${{ secrets.QUAY_IO_ROBOT_TOKEN }} 121 | - name: Pull AMD image 122 | run: | 123 | docker pull $DOCKER_BUILD_REPOSITORY:amd64-$SHORT_SHA 124 | # - name: Pull ARM image 125 | # run: | 126 | # docker pull $DOCKER_BUILD_REPOSITORY:arm64-$SHORT_SHA 127 | - name: Push AMD and ARM tags 128 | run: | 129 | # these are used to construct the final manifest but also cache-from in subsequent runs 130 | docker tag $DOCKER_BUILD_REPOSITORY:amd64-$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd64 131 | docker push $DOCKER_BUILD_REPOSITORY:amd64 132 | #docker tag $DOCKER_BUILD_REPOSITORY:arm64-$SHORT_SHA $DOCKER_BUILD_REPOSITORY:arm64 133 | #docker push $DOCKER_BUILD_REPOSITORY:arm64 134 | - name: Push multiarch manifest 135 | run: | 136 | #docker manifest create ${DOCKER_REPOSITORY}:latest $DOCKER_BUILD_REPOSITORY:amd64 $DOCKER_BUILD_REPOSITORY:arm64 137 | docker manifest create ${DOCKER_REPOSITORY}:latest $DOCKER_BUILD_REPOSITORY:amd64 138 | docker manifest push $DOCKER_REPOSITORY:latest 139 | #docker manifest create 
${DOCKER_REPOSITORY}:$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd64 $DOCKER_BUILD_REPOSITORY:arm64 140 | docker manifest create ${DOCKER_REPOSITORY}:$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd64 141 | docker manifest push $DOCKER_REPOSITORY:$SHORT_SHA 142 | VERSION=$(grep -m1 version preprocessing-pipeline-family.yaml | cut -d ' ' -f2) 143 | #docker manifest create ${DOCKER_REPOSITORY}:$VERSION $DOCKER_BUILD_REPOSITORY:amd64 $DOCKER_BUILD_REPOSITORY:arm64 144 | docker manifest create ${DOCKER_REPOSITORY}:$VERSION $DOCKER_BUILD_REPOSITORY:amd64 145 | docker manifest push ${DOCKER_REPOSITORY}:$VERSION 146 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pycharm 129 | .idea/ 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # VSCode 135 | .vscode/ 136 | 137 | # Mac 138 | .DS_Store 139 | 140 | nbs/ 141 | 142 | # Celery files that are created when the mercury dashboard is run 143 | celery.sqlite 144 | celerybeat-schedule.db 145 | 146 | # temporarily generated files by project-specific Makefile 147 | tmp* 148 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.0.85 2 | * Patch various CVEs 3 | * Bump Python version to 3.12, some packages no longer support 3.9 4 | 5 | ## 0.0.84 6 | * Patch h11 CVE 7 | * bump httpcore version due to h11 dependency 8 | 9 | ## 0.0.83 10 | 11 | * Patch various CVEs 12 | * Fix Starlette vulnerability 13 | 14 | ## 0.0.82 15 | 16 | * Patch various python CVEs 17 | * Bump to `unstructured` 0.16.11 18 | * No longer attempts to download NLTK asset from S3 which could result in a 403 19 | 20 | ## 0.0.81 21 | 22 | * Update `strategy` parameter to allow `'` and `"` as input surrounding the value. 23 | 24 | ## 0.0.80 25 | 26 | * Bump to `unstructured` 0.15.10 27 | * Add `include_slide_notes` parameter, indicating whether slide notes in `ppt` and `pptx` files should be partitioned. Default is `True`. Now, when slide notes are present in the file, they will be included alongside other elements, which may shift the index numbers of non-note elements. 28 | 29 | ## 0.0.79 30 | 31 | * Bump to `unstructured` 0.15.7 32 | 33 | ## 0.0.78 34 | 35 | * Resolve NLTK CVE. 36 | * Bump to `unstructured` 0.15.6 37 | 38 | ## 0.0.77 39 | 40 | * Bump to `unstructured` 0.15.5 41 | 42 | ## 0.0.76 43 | * Use the library's `detect_filetype` in API to determine mimetype 44 | * Add content_type api parameter 45 | * Bump to `unstructured` 0.15.1 46 | 47 | ## 0.0.75 48 | 49 | * Remove constraint on `safetensors` that preventing us from bumping `transformers`. 50 | 51 | ## 0.0.74 52 | 53 | * Bump to `unstructured` 0.15.0 54 | 55 | ## 0.0.73 56 | 57 | * Bump to `unstructured` 0.14.10 58 | 59 | ## 0.0.72 60 | 61 | * Fix certain filetypes failing mimetype lookup in the new base image 62 | 63 | ## 0.0.71 64 | 65 | * replace rockylinux with chainguard/wolfi as a base image for `amd64` 66 | 67 | ## 0.0.70 68 | 69 | * Bump to `unstructured` 0.14.6 70 | * Bump to `unstructured-inference` 0.7.35 71 | 72 | ## 0.0.69 73 | 74 | * Bump to `unstructured` 0.14.4 75 | * Add handling for `pdf_infer_table_structure` to reflect the "tables off by default" behavior in `unstructured`. 76 | 77 | ## 0.0.68 78 | 79 | * Fix list params such as `extract_image_block_types` not working via the python/js clients 80 | 81 | ## 0.0.67 82 | 83 | * Allow for a different server port with the PORT variable 84 | * Change pdf_infer_table_structure parameter from being disabled in auto strategy. 85 | 86 | ## 0.0.66 87 | 88 | * Add support for `unique_element_ids` parameter. 
89 | * Add max lifetime, via MAX_LIFETIME_SECONDS env-var, to API containers 90 | * Bump unstructured to 0.13.5 91 | * Change default values for `pdf_infer_table_structure` and `skip_infer_table_types`. Mark `pdf_infer_table_structure` deprecated. 92 | * Add support for the `starting_page_number` param. 93 | 94 | ## 0.0.65 95 | 96 | * Bump unstructured to 0.12.4 97 | * Add support for both `list[str]` and `str` input formats for `ocr_languages` parameter 98 | * Adds support for additional MIME types from `unstructured` 99 | * Document the support for gzip files and add additional testing 100 | 101 | ## 0.0.64 102 | 103 | * Bump Pydantic to 2.5.x and remove it from explicit dependencies list (will be managed by fastapi) 104 | * Introduce Form params description in the code, which will form openapi and swagger documentation 105 | * Roll back some openapi customizations 106 | * Keep backward compatibility for passing parameters in form of `list[str]` (will not be shown in the documentation) 107 | 108 | ## 0.0.63 109 | 110 | * Bump unstructured to 0.12.2 111 | * Fix bug that ignored `combine_under_n_chars` chunking option argument. 112 | 113 | ## 0.0.62 114 | 115 | * Add hi_res_model_name to partition and deprecate model_name 116 | * Bump unstructured to 0.12.0 117 | * Add support for returning extracted image blocks as base64 encoded data stored in metadata fields 118 | 119 | ## 0.0.61 120 | 121 | * Bump unstructured to 0.11.6 122 | * Handle invalid hi_res_model_name kwarg 123 | 124 | ## 0.0.60 125 | 126 | * Enable self-hosted authorization using UNSTRUCTURED_API_KEY env variable 127 | 128 | ## 0.0.59 129 | 130 | * Bump unstructured to 0.11.0 131 | 132 | ## 0.0.58 133 | 134 | * Bump unstructured to 0.10.30 135 | 136 | ## 0.0.57 137 | * Make sure `multipage_sections` param defaults to `true` as per the readme 138 | * Bump unstructured to 0.10.29 139 | 140 | 141 | ## 0.0.56 142 | * **Add `max_characters` param for chunking** This param gives users additional control to "chunk" elements into larger or smaller `CompositeElement`s 143 | * Bump unstructured to 0.10.28 144 | * Make sure chipperv2 is called when `hi_res_model_name==chipper` 145 | 146 | 147 | ## 0.0.55 148 | 149 | * Bump unstructured to 0.10.26 150 | * Bring parent_id metadata field back after fixing a backwards compatibility bug 151 | * Restrict Chipper usage to one at a time. The model is very resource intense, and this will prevent issues while we improve it. 152 | 153 | ## 0.0.54 154 | 155 | * Bump unstructured to 0.10.25 156 | * Use a generator when splitting pdfs in parallel mode 157 | * Add a default memory minimum for 503 check 158 | * Fix an UnboundLocalError when an invalid docx file is caught 159 | 160 | ## 0.0.53 161 | 162 | * Bump unstructured to 0.10.23 163 | * Simplify the error message for BadZipFile errors 164 | 165 | ## 0.0.52 166 | 167 | * Bump unstructured to 0.10.21 168 | * Fix an unhandled error when a non pdf file is sent with content-type pdf 169 | * Fix an unhandled error when a non docx file is sent with content-type docx 170 | * Fix an unhandled error when a non-Unstructured json schema is sent 171 | 172 | ## 0.0.51 173 | 174 | * Bump unstructured to 0.10.19 175 | 176 | ## 0.0.50 177 | 178 | * Bump unstructured to 0.10.18 179 | 180 | ## 0.0.49 181 | 182 | * Remove spurious whitespace in `app-start.sh`. **This fixes deployments in some envs such as Google Cloud Run**. 
183 | 184 | ## 0.0.48 185 | 186 | * **Adds `languages` kwarg** `ocr_languages` will eventually be deprecated and replaced by `languages` to specify what languages to use for OCR 187 | * Adds a startup log and other minor cleanups 188 | 189 | ## 0.0.47 190 | 191 | * **Adds `chunking_strategy` kwarg and associated params** These params allow users to "chunk" elements into larger or smaller `CompositeElement`s 192 | * **Remove `parent_id` from the element metadata**. New metadata fields are causing errors with existing installs. We'll readd this once a fix is widely available. 193 | * **Fix some pdfs incorrectly returning a file is encrypted error**. The `pypdf.is_encrypted` check caused us to return this error even if the file is readable. 194 | 195 | ## 0.0.46 196 | 197 | * Bump unstructured to 0.10.16 198 | 199 | ## 0.0.45 200 | 201 | * Drop `detection_class_prob` from the element metadata. This broke backwards compatibility when library users called `partition_via_api`. 202 | * Bump unstructured to 0.10.15 203 | 204 | ## 0.0.44 205 | 206 | * Bump unstructured to 0.10.14 207 | * Improve parallel mode retry handling 208 | * Improve logging during error handling. We don't need to log stack traces for expected errors. 209 | 210 | ## 0.0.43 211 | 212 | * Bump unstructured to 0.10.13 213 | * Bump unstructured-inference to 0.5.25 214 | * Remove dependency on unstructured-api-tools 215 | * Add a top level error handler for more consistent response bodies 216 | * Tesseract minor version bump to 5.3.2 217 | 218 | ## 0.0.42 219 | 220 | * Update readme for parameter `hi_res_model_name` 221 | * Fix a bug using `hi_res_model_name` in parallel mode 222 | * Bump unstructured library to 0.10.12 223 | * Bump unstructured-inference to 0.5.22 224 | 225 | ## 0.0.41 226 | 227 | * Bump unstructured library to 0.10.8 228 | * Bump unstructured-inference to 0.5.17 229 | 230 | ## 0.0.40 231 | 232 | * Reject traffic when overloaded via `UNSTRUCTURED_MEMORY_FREE_MINIMUM_MB` 233 | * Docker image built with Python 3.10 rather than 3.8 234 | 235 | ## 0.0.39 236 | 237 | * Fix incorrect handling on param skip_infer_table_types 238 | * Pin `safetensors` to fix a build error with 0.0.38 239 | 240 | ## 0.0.38 241 | 242 | * Fix page break has None page number bug 243 | * Bump unstructured to 0.10.5 244 | * Bump unstructured-ingest to 0.5.15 245 | * Fix UnboundLocalError using pdfs in parallel mode 246 | 247 | ## 0.0.37 248 | 249 | * Bump unstructured to 0.10.4 250 | 251 | ## 0.0.36 252 | 253 | * Fix a bug in parallel mode causing `not a valid pdf` errors 254 | * Bump unstructured to 0.10.2, unstructured-inference to 0.5.13 255 | 256 | ## 0.0.35 257 | 258 | * Bump unstructured library to 0.9.2 259 | * Fix a misleading error in make docker-test 260 | 261 | ## 0.0.34 262 | 263 | * Bump unstructured library to 0.9.0 264 | * Add table support for image with parameter `skip_infer_table_types` 265 | * Add support for gzipped files 266 | 267 | ## 0.0.33 268 | 269 | * Image tweak, move application entrypoint to scripts/app-start.sh 270 | 271 | ## 0.0.32 272 | 273 | * Throw 400 error if a PDF is password protected 274 | * Improve logging of params to single line json 275 | * Add support for `include_page_breaks` parameter 276 | 277 | ## 0.0.31 278 | 279 | * Support model name as api parameter 280 | * Add retry parameters on fanout requests 281 | * Bump unstructured library to 0.8.1 282 | * Fix how to remove an element's coordinate information 283 | 284 | ## 0.0.30 285 | 286 | * Add table extraction support for hi_res strategy 287 | * 
Add support for `encoding` parameter 288 | * Add support for `xml_keep_tags` parameter 289 | * Add env variables for additional parallel mode tweaking 290 | 291 | ## 0.0.29 292 | 293 | * Support .msg files 294 | * Refactor parallel mode and add smoke test 295 | * Fix header value for api key 296 | 297 | ## 0.0.28 298 | 299 | * Bump unstructured library to 0.7.8 for bug fixes 300 | 301 | ## 0.0.27 302 | 303 | * Update documentation and tests for filetypes to sync with partition.auto 304 | * Add support for .rst, .tsv, .xml 305 | * Move PYPDF2 to pypdf since PYPDF2 is deprecated 306 | 307 | ## 0.0.26 308 | 309 | * Add support for `ocr_only` strategy and `ocr_languages` parameter 310 | * Remove building `detectron2` from source in Dockerfile 311 | * Convert strategy from fast to auto for images since there is no fast strategy for images 312 | 313 | ## 0.0.25 314 | 315 | * Bump image to use python 3.8.17 instead of 3.8.15 316 | 317 | ## 0.0.24 318 | 319 | * Add returning text/csv to pipeline_api 320 | 321 | ## 0.0.23 322 | 323 | * Add support for csv files 324 | 325 | ## 0.0.22 326 | 327 | * Add parallel processing mode for pages within a pdf 328 | 329 | ## 0.0.21 330 | 331 | * Bump version of base image to use new stable version of tesseract 332 | * Bump to unstructured==0.7.1 for various bug fixes. 333 | 334 | ## 0.0.20 335 | 336 | * Supports additional filetypes: epub, odt, rft 337 | 338 | ## 0.0.19 339 | 340 | * Updating data type of optional os env var `ALLOWED_ORIGINS` 341 | 342 | ## 0.0.18 343 | 344 | * Add optional CORS to api if os env var `ALLOWED_ORIGINS` is set 345 | 346 | ## 0.0.17 347 | 348 | * Add config for unstructured.trace logger 349 | 350 | ## 0.0.16 351 | 352 | * Fix image build steps to support detectron2 install from Mac M1/M2 353 | * Upgrade to openssl 1.1.1 to accomodate the latest urllib3 354 | * Bump unstructured for SpooledTemporaryFile fix 355 | 356 | ## 0.0.15 357 | 358 | * Add msg and json types to supported 359 | 360 | ## 0.0.14 361 | 362 | * Bump unstructured to the latest version 363 | 364 | ## 0.0.13 365 | 366 | * Posting a bad .pdf results in a 400 367 | 368 | ## 0.0.12 369 | 370 | * Remove coordinates field from response elements by default 371 | 372 | ## 0.0.11 373 | 374 | * Add caching from the registry for `make docker-build` 375 | * Add fix for empty content type error 376 | 377 | ## 0.0.10 378 | 379 | * Bump unstructured-api-tools for better 'file type not supported' response messages 380 | 381 | ## 0.0.9 382 | 383 | * Updated detectron version 384 | * Update docker-build to use the public registry as a cache 385 | * Adds a strategy parameter to pipeline_api 386 | * Passing file, file_filename, and content_type to `partition` 387 | 388 | ## 0.0.8 389 | 390 | * Sensible logging config 391 | 392 | ## 0.0.7 393 | 394 | * Minor version bump 395 | 396 | ## 0.0.6 397 | 398 | * Minor version bump 399 | 400 | ## 0.0.5 401 | 402 | * Updated Dockerfile for public release 403 | * Remove rate limiting in the API 404 | * Add file type validation via UNSTRUCTURED_ALLOWED_MIMETYPES 405 | * Major semver route also supported: /general/v0/general 406 | 407 | ## 0.0.4 408 | 409 | * Changed pipeline name to `pipeline-general` 410 | * Changed pipeline to handle a variety of documents not just emails 411 | * Update Dockerfile, all supported library files. 412 | * Add sample-docs for pdf and pdf image. 
413 | 414 | ## 0.0.3 415 | 416 | * Add emails pipeline Dockerfile 417 | 418 | ## 0.0.2 419 | 420 | * Add pipeline notebook 421 | 422 | ## 0.0.1 423 | 424 | * Initial pipeline setup 425 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:experimental 2 | FROM quay.io/unstructured-io/base-images:wolfi-base-latest as base 3 | 4 | # NOTE(crag): NB_USER ARG for mybinder.org compat: 5 | # https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html 6 | ARG NB_USER=notebook-user 7 | ARG NB_UID=1000 8 | ARG PIP_VERSION 9 | ARG PIPELINE_PACKAGE 10 | ARG PYTHON_VERSION="3.11" 11 | 12 | # Set up environment 13 | ENV PYTHON python${PYTHON_VERSION} 14 | ENV PIP ${PYTHON} -m pip 15 | 16 | WORKDIR ${HOME} 17 | USER ${NB_USER} 18 | 19 | ENV PYTHONPATH="${PYTHONPATH}:${HOME}" 20 | ENV PATH="/home/${NB_USER}/.local/bin:${PATH}" 21 | 22 | FROM base as python-deps 23 | COPY --chown=${NB_USER}:${NB_USER} requirements/base.txt requirements-base.txt 24 | RUN ${PIP} install pip==${PIP_VERSION} 25 | RUN ${PIP} install --no-cache -r requirements-base.txt 26 | 27 | FROM python-deps as model-deps 28 | RUN ${PYTHON} -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \ 29 | ${PYTHON} -c "from unstructured.partition.model_init import initialize; initialize()" 30 | 31 | FROM model-deps as code 32 | COPY --chown=${NB_USER}:${NB_USER} CHANGELOG.md CHANGELOG.md 33 | COPY --chown=${NB_USER}:${NB_USER} logger_config.yaml logger_config.yaml 34 | COPY --chown=${NB_USER}:${NB_USER} prepline_${PIPELINE_PACKAGE}/ prepline_${PIPELINE_PACKAGE}/ 35 | COPY --chown=${NB_USER}:${NB_USER} exploration-notebooks exploration-notebooks 36 | COPY --chown=${NB_USER}:${NB_USER} scripts/app-start.sh scripts/app-start.sh 37 | 38 | ENTRYPOINT ["scripts/app-start.sh"] 39 | # Expose a default port of 8000. Note: The EXPOSE instruction does not actually publish the port, 40 | # but some tooling will inspect containers and perform work contingent on networking support declared. 41 | 42 | EXPOSE 8000 43 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2022 Unstructured Technologies, Inc 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PIPELINE_FAMILY := general 2 | PIPELINE_PACKAGE := general 3 | PACKAGE_NAME := prepline_${PIPELINE_PACKAGE} 4 | PIP_VERSION := 25.1.1 5 | ARCH := $(shell uname -m) 6 | 7 | .PHONY: help 8 | help: Makefile 9 | @sed -n 's/^\(## \)\([a-zA-Z]\)/\2/p' $< 10 | 11 | 12 | ########### 13 | # Install # 14 | ########### 15 | 16 | ## install-base: installs minimum requirements to run the API 17 | .PHONY: install-base 18 | install-base: install-base-pip-packages install-nltk-models 19 | 20 | ## install: installs all test and dev requirements 21 | .PHONY: install 22 | install:install-base install-test 23 | 24 | .PHONY: install-base-pip-packages 25 | install-base-pip-packages: 26 | python3 -m pip install pip==${PIP_VERSION} 27 | python3 -m pip install -r requirements/base.txt 28 | 29 | .PHONY: install-test 30 | install-test: install-base 31 | python3 -m pip install -r requirements/test.txt 32 | 33 | .PHONY: install-ci 34 | install-ci: install-test 35 | 36 | .PHONY: install-nltk-models 37 | install-nltk-models: 38 | python3 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" 39 | 40 | ## pip-compile: compiles all base/dev/test requirements 41 | .PHONY: pip-compile 42 | pip-compile: 43 | pip-compile --upgrade requirements/base.in 44 | pip-compile --upgrade -o requirements/test.txt requirements/base.txt requirements/test.in 45 | 46 | .PHONY: install-pandoc 47 | install-pandoc: 48 | ARCH=${ARCH} ./scripts/install-pandoc.sh 49 | 50 | ########## 51 | # Docker # 52 | ########## 53 | 54 | # Docker targets are provided for convenience only and are not required in a standard development environment 55 | 56 | # Note that the image has notebooks baked in, however the current working directory 57 | # is mounted under /home/notebook-user/local/ when the image is started with 58 | # docker-start-api or docker-start-jupyter 59 | 60 | DOCKER_IMAGE ?= pipeline-family-${PIPELINE_FAMILY}-dev:latest 61 | 62 | .PHONY: docker-build 63 | docker-build: 64 | PIP_VERSION=${PIP_VERSION} PIPELINE_FAMILY=${PIPELINE_FAMILY} PIPELINE_PACKAGE=${PIPELINE_PACKAGE} ./scripts/docker-build.sh 65 | 66 | .PHONY: docker-start-api 67 | docker-start-api: 68 | docker run -p 8000:8000 \ 69 | -it --rm \ 70 | --mount type=bind,source=$(realpath .),target=/home/notebook-user/local \ 71 | $(if $(MAX_LIFETIME_SECONDS),-e MAX_LIFETIME_SECONDS=$(MAX_LIFETIME_SECONDS)) \ 72 | pipeline-family-${PIPELINE_FAMILY}-dev:latest scripts/app-start.sh 73 | 74 | .PHONY: docker-start-bash 75 | docker-start-bash: 76 | docker run -p 8000:8000 -it --rm --mount type=bind,source=$(realpath .),target=/home/notebook-user/local --entrypoint /bin/bash pipeline-family-${PIPELINE_FAMILY}-dev:latest 77 | 78 | .PHONY: docker-test 79 | docker-test: 80 | DOCKER_IMAGE=${DOCKER_IMAGE} ./scripts/docker-smoke-test.sh 81 | 82 | ######### 83 | # Local # 84 | ######### 85 | 86 | ## run-web-app: runs the FastAPI api with hot reloading 87 | .PHONY: run-web-app 88 | run-web-app: 89 | PYTHONPATH=$(realpath .) uvicorn ${PACKAGE_NAME}.api.app:app --reload --log-config logger_config.yaml 90 | 91 | ################# 92 | # Test and Lint # 93 | ################# 94 | 95 | ## test: runs core tests 96 | .PHONY: test 97 | test: 98 | PYTHONPATH=. pytest -v test_${PIPELINE_PACKAGE} --cov=${PACKAGE_NAME} --cov-report term-missing 99 | 100 | # Setting a low bar here - need more tests! 
101 | .PHONY: check-coverage 102 | check-coverage: 103 | coverage report --fail-under=60 104 | 105 | ## check: runs linters (includes tests) 106 | .PHONY: check 107 | check: check-src check-tests check-version 108 | 109 | ## check-src: runs linters (source only, no tests) 110 | .PHONY: check-src 111 | check-src: 112 | black --line-length 100 ${PACKAGE_NAME} --check 113 | flake8 ${PACKAGE_NAME} 114 | mypy ${PACKAGE_NAME} --ignore-missing-imports --install-types --non-interactive --implicit-optional 115 | 116 | .PHONY: check-tests 117 | check-tests: 118 | black --line-length 100 test_${PIPELINE_PACKAGE} --check 119 | flake8 test_${PIPELINE_PACKAGE} scripts/smoketest.py 120 | 121 | ## tidy: run black 122 | .PHONY: tidy 123 | tidy: 124 | black --line-length 100 ${PACKAGE_NAME} 125 | black --line-length 100 test_${PIPELINE_PACKAGE} scripts/smoketest.py 126 | 127 | ## check-scripts: run shellcheck 128 | .PHONY: check-scripts 129 | check-scripts: 130 | # Fail if any of these files have warnings 131 | scripts/shellcheck.sh 132 | 133 | ## check-version: run check to ensure version in CHANGELOG.md matches references in files 134 | .PHONY: check-version 135 | check-version: 136 | # Fail if syncing version would produce changes 137 | scripts/version-sync.sh -c \ 138 | -s CHANGELOG.md \ 139 | -f preprocessing-pipeline-family.yaml release \ 140 | -f ${PACKAGE_NAME}/api/app.py release \ 141 | -f ${PACKAGE_NAME}/api/general.py release 142 | 143 | ## version-sync: update references to version with most recent version from CHANGELOG.md 144 | .PHONY: version-sync 145 | version-sync: 146 | scripts/version-sync.sh \ 147 | -s CHANGELOG.md \ 148 | -f preprocessing-pipeline-family.yaml release \ 149 | -f ${PACKAGE_NAME}/api/app.py release \ 150 | -f ${PACKAGE_NAME}/api/general.py release 151 | -------------------------------------------------------------------------------- /docker/rockylinux-9.4/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:experimental 2 | FROM quay.io/unstructured-io/base-images:rocky9.2-9@sha256:73d8492452f086144d4b92b7931aa04719f085c74d16cae81e8826ef873729c9 as base 3 | 4 | # NOTE(crag): NB_USER ARG for mybinder.org compat: 5 | # https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html 6 | ARG NB_USER=notebook-user 7 | ARG NB_UID=1000 8 | ARG PIP_VERSION 9 | ARG PIPELINE_PACKAGE 10 | 11 | # Set up environment 12 | ENV USER ${NB_USER} 13 | ENV HOME /home/${NB_USER} 14 | 15 | RUN groupadd --gid ${NB_UID} ${NB_USER} 16 | RUN useradd --uid ${NB_UID} --gid ${NB_UID} ${NB_USER} 17 | WORKDIR ${HOME} 18 | 19 | ENV PYTHONPATH="${PYTHONPATH}:${HOME}" 20 | ENV PATH="/home/${NB_USER}/.local/bin:${PATH}" 21 | 22 | FROM base as python-deps 23 | # COPY requirements/dev.txt requirements-dev.txt 24 | COPY requirements/base.txt requirements-base.txt 25 | RUN python3.10 -m pip install pip==${PIP_VERSION} \ 26 | && dnf -y groupinstall "Development Tools" \ 27 | && su -l ${NB_USER} -c 'pip3.10 install --no-cache -r requirements-base.txt' \ 28 | && dnf -y groupremove "Development Tools" \ 29 | && dnf clean all \ 30 | && ln -s /home/notebook-user/.local/bin/pip3.10 /usr/local/bin/pip3.10 || true 31 | 32 | USER ${NB_USER} 33 | 34 | FROM python-deps as model-deps 35 | RUN python3.10 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \ 36 | python3.10 -c "from unstructured.partition.model_init import initialize; initialize()" 37 | 38 | FROM model-deps as code 39 | COPY 
--chown=${NB_USER}:${NB_USER} CHANGELOG.md CHANGELOG.md 40 | COPY --chown=${NB_USER}:${NB_USER} logger_config.yaml logger_config.yaml 41 | COPY --chown=${NB_USER}:${NB_USER} prepline_${PIPELINE_PACKAGE}/ prepline_${PIPELINE_PACKAGE}/ 42 | COPY --chown=${NB_USER}:${NB_USER} exploration-notebooks exploration-notebooks 43 | COPY --chown=${NB_USER}:${NB_USER} scripts/app-start.sh scripts/app-start.sh 44 | 45 | ENTRYPOINT ["scripts/app-start.sh"] 46 | # Expose a default port of 8000. Note: The EXPOSE instruction does not actually publish the port, 47 | # but some tooling will inspect containers and perform work contingent on networking support declared. 48 | EXPOSE 8000 49 | -------------------------------------------------------------------------------- /exploration-notebooks/exploration-html.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "35227754", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/html": [ 12 | "\n", 16 | "\n", 63 | "To toggle visibility of explanation cells click here\n" 64 | ], 65 | "text/plain": [ 66 | "" 67 | ] 68 | }, 69 | "metadata": {}, 70 | "output_type": "display_data" 71 | } 72 | ], 73 | "source": [ 74 | "%%html\n", 75 | "\n", 79 | "\n", 126 | "To toggle visibility of explanation cells click here\n" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "id": "e908195c", 132 | "metadata": {}, 133 | "source": [ 134 | "# HTML Preprocessing" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "727614ba", 140 | "metadata": {}, 141 | "source": [ 142 | "This notebook defines the steps for extracting information from an HTML file. To see how to create a generalized API for all documents in the `pipeline-notebooks` directory\n", 143 | "\n", 144 | "To demonstrate how off-the-shelf Unstructured Bricks extract meaningful data from complex source documents, we will apply a series of Bricks with explanations.\n", 145 | "\n", 146 | "#### Table of Contents\n", 147 | "\n", 148 | "1. [Take a Look at a HTML File](#explore)\n", 149 | "1. [Custom Partitioning Bricks](#custom)\n", 150 | "1. [Cleaning Bricks](#cleaning)\n", 151 | "1. 
[Staging Bricks](#staging)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "id": "3848e558", 157 | "metadata": {}, 158 | "source": [ 159 | "## Section 1: Take a Look at a HTML File " 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "id": "71814e12", 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "import os\n", 170 | "import json\n", 171 | "\n", 172 | "\n", 173 | "def get_filename(directory, filename):\n", 174 | " cwd = os.getcwd()\n", 175 | " local_directory = os.path.join(os.path.split(cwd)[0], directory)\n", 176 | " ci_directory = os.path.join(cwd, directory)\n", 177 | "\n", 178 | " if os.path.exists(local_directory) and filename in os.listdir(local_directory):\n", 179 | " return os.path.join(local_directory, filename)\n", 180 | " elif os.path.exists(ci_directory) and filename in os.listdir(ci_directory):\n", 181 | " return os.path.join(ci_directory, filename)\n", 182 | " else:\n", 183 | " raise FileNotFoundError" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "id": "72f0ebc4", 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "filename = get_filename(\"sample-docs\", \"fake-html.html\")" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "id": "ea3b2b58", 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "from unstructured.documents.html import HTMLDocument\n", 204 | "\n", 205 | "document = HTMLDocument.from_file(filename)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "id": "fa146f41", 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "name": "stdout", 216 | "output_type": "stream", 217 | "text": [ 218 | "My First Heading\n", 219 | "\n", 220 | "My first paragraph.\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "print(document)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "id": "15d69b6b", 231 | "metadata": {}, 232 | "source": [ 233 | "## Section 2: Custom Partition Bricks" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "id": "ff34cce7", 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "from unstructured.partition.html import partition_html\n", 244 | "\n", 245 | "elements = partition_html(filename)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "id": "7a46b93f", 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "[, ]\n" 259 | ] 260 | } 261 | ], 262 | "source": [ 263 | "print(elements)" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "id": "e0312c8c", 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "My First Heading\n", 277 | "My first paragraph.\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "for element in elements:\n", 283 | " print(element.text)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "id": "10e1d3df", 289 | "metadata": {}, 290 | "source": [ 291 | "## Section 3: Cleaning Bricks " 292 | ] 293 | }, 294 | { 295 | "attachments": {}, 296 | "cell_type": "markdown", 297 | "id": "52943c00", 298 | "metadata": {}, 299 | "source": [ 300 | "In addition to partitioning bricks, the Unstructured library has\n", 301 | "***cleaning*** bricks for removing unwanted content from text. 
In this\n", 302 | "case, we'll solve our punctuation problem by using the \n", 303 | "`remove_punctuation`. Other uses for cleaning bricks include\n", 304 | "cleaning out boilerplate, sentence fragments, and other segments\n", 305 | "of text that could impact labeling tasks or the accuracy of\n", 306 | "machine learning models. As with partitioning bricks, users can\n", 307 | "include custom cleaning bricks in a pipeline." 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "id": "268e7dcd", 314 | "metadata": {}, 315 | "outputs": [ 316 | { 317 | "data": { 318 | "text/plain": [ 319 | "'My first paragraph.'" 320 | ] 321 | }, 322 | "execution_count": null, 323 | "metadata": {}, 324 | "output_type": "execute_result" 325 | } 326 | ], 327 | "source": [ 328 | "#This element has a lot of new line characters\n", 329 | "elements[1].text" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "id": "485198a5", 336 | "metadata": {}, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/plain": [ 341 | "'My first paragraph'" 342 | ] 343 | }, 344 | "execution_count": null, 345 | "metadata": {}, 346 | "output_type": "execute_result" 347 | } 348 | ], 349 | "source": [ 350 | "from unstructured.cleaners.core import remove_punctuation\n", 351 | "\n", 352 | "remove_punctuation(elements[1].text)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "id": "0f7fea99", 358 | "metadata": {}, 359 | "source": [ 360 | "## Section 4: Staging Bricks" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "id": "4f41f82c", 367 | "metadata": {}, 368 | "outputs": [ 369 | { 370 | "data": { 371 | "text/plain": [ 372 | "[{'data': {'text': 'My First Heading',\n", 373 | " 'ref_id': '0540311f6c077fe8f797080918b8d74b'}},\n", 374 | " {'data': {'text': 'My first paragraph.',\n", 375 | " 'ref_id': '399af454cb1368b8257ed406b430de84'}}]" 376 | ] 377 | }, 378 | "execution_count": null, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "from unstructured.staging.label_studio import stage_for_label_studio\n", 385 | "\n", 386 | "label_studio_data = stage_for_label_studio(elements)\n", 387 | "label_studio_data" 388 | ] 389 | } 390 | ], 391 | "metadata": { 392 | "kernelspec": { 393 | "display_name": "python3", 394 | "language": "python", 395 | "name": "python3" 396 | } 397 | }, 398 | "nbformat": 4, 399 | "nbformat_minor": 5 400 | } 401 | -------------------------------------------------------------------------------- /exploration-notebooks/exploration-powerpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "35227754", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/html": [ 12 | "\n", 16 | "\n", 63 | "To toggle visibility of explanation cells click here\n" 64 | ], 65 | "text/plain": [ 66 | "" 67 | ] 68 | }, 69 | "metadata": {}, 70 | "output_type": "display_data" 71 | } 72 | ], 73 | "source": [ 74 | "%%html\n", 75 | "\n", 79 | "\n", 126 | "To toggle visibility of explanation cells click here\n" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "id": "e908195c", 132 | "metadata": {}, 133 | "source": [ 134 | "# Powerpoint Preprocessing" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "727614ba", 140 | "metadata": {}, 141 | "source": [ 142 | "This notebook defines the steps for extracting 
information from a PowerPoint file. To see how to create a generalized API for all documents, see the `pipeline-notebooks` directory.\n", 143 | "\n", 144 | "To demonstrate how off-the-shelf Unstructured Bricks extract meaningful data from complex source documents, we will apply a series of Bricks with explanations.\n", 145 | "\n", 146 | "#### Table of Contents\n", 147 | "\n", 148 | "1. [Take a Look at a PowerPoint File](#explore)\n", 149 | "1. [Custom Partitioning Bricks](#custom)\n", 150 | "1. [Cleaning Bricks](#cleaning)\n", 151 | "1. [Staging Bricks](#staging)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "id": "3848e558", 157 | "metadata": {}, 158 | "source": [ 159 | "## Section 1: Take a Look at a PowerPoint File " 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "id": "71814e12", 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "import os\n", 170 | "import json\n", 171 | "\n", 172 | "\n", 173 | "def get_filename(directory, filename):\n", 174 | " cwd = os.getcwd()\n", 175 | " local_directory = os.path.join(os.path.split(cwd)[0], directory)\n", 176 | " ci_directory = os.path.join(cwd, directory)\n", 177 | "\n", 178 | " if os.path.exists(local_directory) and filename in os.listdir(local_directory):\n", 179 | " return os.path.join(local_directory, filename)\n", 180 | " elif os.path.exists(ci_directory) and filename in os.listdir(ci_directory):\n", 181 | " return os.path.join(ci_directory, filename)\n", 182 | " else:\n", 183 | " raise FileNotFoundError" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "id": "72f0ebc4", 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "filename = get_filename(\"sample-docs\", \"fake-power-point.pptx\")" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "id": "ea3b2b58", 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "import pptx\n", 204 | "\n", 205 | "presentation = pptx.Presentation(filename)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "id": "fa146f41", 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "shape = presentation.slides[0].shapes[0]" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "id": "7c938979", 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "text = shape.text_frame.paragraphs[0].text" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "id": "f3848757", 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "name": "stdout", 236 | "output_type": "stream", 237 | "text": [ 238 | "Adding a Bullet Slide\n" 239 | ] 240 | } 241 | ], 242 | "source": [ 243 | "print(text)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "id": "15d69b6b", 249 | "metadata": {}, 250 | "source": [ 251 | "## Section 2: Custom Partition Bricks" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "id": "ff34cce7", 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "from unstructured.partition.pptx import partition_pptx\n", 262 | "\n", 263 | "elements = partition_pptx(filename)" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "id": "7a46b93f", 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "[, , , ]\n" 277 | ] 278 | } 279 | ], 280 | "source": [ 281 |
"print(elements)" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "id": "e0312c8c", 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "name": "stdout", 292 | "output_type": "stream", 293 | "text": [ 294 | "Adding a Bullet Slide\n", 295 | "Find the bullet slide layout\n", 296 | "Use _TextFrame.text for first bullet\n", 297 | "Use _TextFrame.add_paragraph() for subsequent bullets\n" 298 | ] 299 | } 300 | ], 301 | "source": [ 302 | "for element in elements:\n", 303 | " print(element.text)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "id": "10e1d3df", 309 | "metadata": {}, 310 | "source": [ 311 | "## Section 3: Cleaning Bricks " 312 | ] 313 | }, 314 | { 315 | "attachments": {}, 316 | "cell_type": "markdown", 317 | "id": "52943c00", 318 | "metadata": {}, 319 | "source": [ 320 | "In addition to partitioning bricks, the Unstructured library has\n", 321 | "***cleaning*** bricks for removing unwanted content from text. In this\n", 322 | "case, we'll solve our punctuation problem by using the \n", 323 | "`remove_punctuation`. Other uses for cleaning bricks include\n", 324 | "cleaning out boilerplate, sentence fragments, and other segments\n", 325 | "of text that could impact labeling tasks or the accuracy of\n", 326 | "machine learning models. As with partitioning bricks, users can\n", 327 | "include custom cleaning bricks in a pipeline." 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "id": "268e7dcd", 334 | "metadata": {}, 335 | "outputs": [ 336 | { 337 | "data": { 338 | "text/plain": [ 339 | "'Use _TextFrame.add_paragraph() for subsequent bullets'" 340 | ] 341 | }, 342 | "execution_count": null, 343 | "metadata": {}, 344 | "output_type": "execute_result" 345 | } 346 | ], 347 | "source": [ 348 | "#This element has a lot of new line characters\n", 349 | "elements[3].text" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "id": "485198a5", 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "data": { 360 | "text/plain": [ 361 | "'Use TextFrameaddparagraph for subsequent bullets'" 362 | ] 363 | }, 364 | "execution_count": null, 365 | "metadata": {}, 366 | "output_type": "execute_result" 367 | } 368 | ], 369 | "source": [ 370 | "from unstructured.cleaners.core import remove_punctuation\n", 371 | "\n", 372 | "remove_punctuation(elements[3].text)" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "id": "0f7fea99", 378 | "metadata": {}, 379 | "source": [ 380 | "## Section 4: Staging Bricks" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "id": "a4cb2037", 387 | "metadata": {}, 388 | "outputs": [ 389 | { 390 | "data": { 391 | "text/plain": [ 392 | "'Use _TextFrame.text for first bullet'" 393 | ] 394 | }, 395 | "execution_count": null, 396 | "metadata": {}, 397 | "output_type": "execute_result" 398 | } 399 | ], 400 | "source": [ 401 | "elements[2].text" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "id": "4f41f82c", 408 | "metadata": {}, 409 | "outputs": [ 410 | { 411 | "data": { 412 | "text/plain": [ 413 | "[{'data': {'text': 'Adding a Bullet Slide',\n", 414 | " 'ref_id': '50b70366a51804855c6dd48a3865cb87'}},\n", 415 | " {'data': {'text': 'Find the bullet slide layout',\n", 416 | " 'ref_id': '3c0332d3515a039dee82e4f3388594c8'}},\n", 417 | " {'data': {'text': 'Use _TextFrame.text for first bullet',\n", 418 | " 'ref_id': 'ca8d08c97f0eeb554cac4758c9229614'}},\n", 419 | 
" {'data': {'text': 'Use _TextFrame.add_paragraph() for subsequent bullets',\n", 420 | " 'ref_id': '83d53564b64b558f77c7c33b5a029213'}}]" 421 | ] 422 | }, 423 | "execution_count": null, 424 | "metadata": {}, 425 | "output_type": "execute_result" 426 | } 427 | ], 428 | "source": [ 429 | "from unstructured.staging.label_studio import stage_for_label_studio\n", 430 | "\n", 431 | "label_studio_data = stage_for_label_studio(elements)\n", 432 | "label_studio_data" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "id": "7bd176e1", 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [] 442 | } 443 | ], 444 | "metadata": { 445 | "kernelspec": { 446 | "display_name": "python3", 447 | "language": "python", 448 | "name": "python3" 449 | } 450 | }, 451 | "nbformat": 4, 452 | "nbformat_minor": 5 453 | } 454 | -------------------------------------------------------------------------------- /img/email-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/img/email-screenshot.png -------------------------------------------------------------------------------- /img/unstructured_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/img/unstructured_logo.png -------------------------------------------------------------------------------- /logger_config.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | disable_existing_loggers: False 3 | formatters: 4 | default_format: 5 | "()": uvicorn.logging.DefaultFormatter 6 | format: '%(asctime)s %(name)s %(levelname)s %(message)s' 7 | access: 8 | "()": uvicorn.logging.AccessFormatter 9 | format: '%(asctime)s %(client_addr)s %(request_line)s - %(status_code)s' 10 | handlers: 11 | access_handler: 12 | formatter: access 13 | class: logging.StreamHandler 14 | stream: ext://sys.stderr 15 | standard_handler: 16 | formatter: default_format 17 | class: logging.StreamHandler 18 | stream: ext://sys.stderr 19 | loggers: 20 | uvicorn.error: 21 | level: INFO 22 | handlers: 23 | - standard_handler 24 | propagate: no 25 | # disable logging for uvicorn.error by not having a handler 26 | uvicorn.access: 27 | level: INFO 28 | handlers: 29 | - access_handler 30 | propagate: no 31 | # disable logging for uvicorn.access by not having a handler 32 | unstructured: 33 | level: INFO 34 | handlers: 35 | - standard_handler 36 | propagate: no 37 | unstructured.trace: 38 | level: CRITICAL 39 | handlers: 40 | - standard_handler 41 | propagate: no 42 | unstructured_inference: 43 | level: DEBUG 44 | handlers: 45 | - standard_handler 46 | propagate: no 47 | unstructured_api: 48 | level: DEBUG 49 | handlers: 50 | - standard_handler 51 | propagate: no 52 | 53 | -------------------------------------------------------------------------------- /prepline_general/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/prepline_general/__init__.py -------------------------------------------------------------------------------- /prepline_general/api/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/prepline_general/api/__init__.py -------------------------------------------------------------------------------- /prepline_general/api/app.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from fastapi import FastAPI, HTTPException, Request, status 5 | from fastapi.datastructures import FormData 6 | from fastapi.responses import JSONResponse 7 | 8 | from .general import router as general_router 9 | from .openapi import set_custom_openapi 10 | 11 | logger = logging.getLogger("unstructured_api") 12 | 13 | app = FastAPI( 14 | title="Unstructured Pipeline API", 15 | summary="Partition documents with the Unstructured library", 16 | version="0.0.85", 17 | docs_url="/general/docs", 18 | openapi_url="/general/openapi.json", 19 | servers=[ 20 | { 21 | "url": "https://api.unstructured.io", 22 | "description": "Hosted API", 23 | "x-speakeasy-server-id": "prod", 24 | }, 25 | { 26 | "url": "http://localhost:8000", 27 | "description": "Development server", 28 | "x-speakeasy-server-id": "local", 29 | }, 30 | ], 31 | openapi_tags=[{"name": "general"}], 32 | ) 33 | 34 | # Note(austin) - This logger just dumps exceptions 35 | # We'd rather handle those below, so disable this in deployments 36 | uvicorn_logger = logging.getLogger("uvicorn.error") 37 | if os.environ.get("ENV") in ["dev", "prod"]: 38 | uvicorn_logger.disabled = True 39 | 40 | 41 | # Catch all HTTPException for uniform logging and response 42 | @app.exception_handler(HTTPException) 43 | async def http_error_handler(request: Request, e: HTTPException): 44 | logger.error(e.detail) 45 | return JSONResponse(status_code=e.status_code, content={"detail": e.detail}) 46 | 47 | 48 | # Catch any other errors and return as 500 49 | @app.exception_handler(Exception) 50 | async def error_handler(request: Request, e: Exception): 51 | return JSONResponse(status_code=500, content={"detail": str(e)}) 52 | 53 | 54 | allowed_origins = os.environ.get("ALLOWED_ORIGINS", None) 55 | if allowed_origins: 56 | from fastapi.middleware.cors import CORSMiddleware 57 | 58 | app.add_middleware( 59 | CORSMiddleware, 60 | allow_origins=allowed_origins.split(","), 61 | allow_methods=["OPTIONS", "POST"], 62 | allow_headers=["Content-Type"], 63 | ) 64 | 65 | app.include_router(general_router) 66 | 67 | set_custom_openapi(app) 68 | 69 | 70 | # Note(austin) - When FastAPI parses our FormData params, 71 | # it builds lists out of duplicate keys, like so: 72 | # FormData([('key', 'value1'), ('key', 'value2')]) 73 | # 74 | # The Speakeasy clients send a more explicit form: 75 | # FormData([('key[]', 'value1'), ('key[]', 'value2')]) 76 | # 77 | # FastAPI doesn't understand these, so we need to transform them. 78 | # Can't do this in middleware before the data stream is read, nor in the endpoint 79 | # after the fields are parsed. Thus, we have to patch it into Request.form() on startup. 
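# For example (an illustrative request; any list-valued parameter behaves the same way),
# a client that posts repeated fields in the bracketed style:
#
#   curl https://api.unstructured.io/general/v0/general \
#     -H 'unstructured-api-key: YOUR_API_KEY' \
#     -F 'files=@sample-docs/fake-html.html' \
#     -F 'extract_image_block_types[]=image' \
#     -F 'extract_image_block_types[]=table'
#
# arrives as FormData([('extract_image_block_types[]', 'image'), ('extract_image_block_types[]', 'table')])
# and is rewritten by the patched form parser below to
# FormData([('extract_image_block_types', 'image'), ('extract_image_block_types', 'table')]).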
80 | get_form = Request._get_form 81 | 82 | 83 | async def patched_get_form( 84 | self, 85 | *, 86 | max_files: int | float = 1000, 87 | max_fields: int | float = 1000, 88 | ) -> FormData: 89 | """ 90 | Call the original get_form, and iterate the results 91 | If a key has brackets at the end, remove them before returning the final FormData 92 | Note the extra params here are unused, but needed to match the signature 93 | """ 94 | form_params = await get_form(self) 95 | 96 | fixed_params = [] 97 | for key, value in form_params.multi_items(): 98 | # Transform key[] into key 99 | if key and key.endswith("[]"): 100 | key = key[:-2] 101 | 102 | fixed_params.append((key, value)) 103 | 104 | return FormData(fixed_params) 105 | 106 | 107 | # Replace the private method with our wrapper 108 | Request._get_form = patched_get_form # type: ignore[assignment] 109 | 110 | 111 | # Filter out /healthcheck noise 112 | class HealthCheckFilter(logging.Filter): 113 | def filter(self, record: logging.LogRecord) -> bool: 114 | return record.getMessage().find("/healthcheck") == -1 115 | 116 | 117 | # Filter out /metrics noise 118 | class MetricsCheckFilter(logging.Filter): 119 | def filter(self, record: logging.LogRecord) -> bool: 120 | return record.getMessage().find("/metrics") == -1 121 | 122 | 123 | logging.getLogger("uvicorn.access").addFilter(HealthCheckFilter()) 124 | logging.getLogger("uvicorn.access").addFilter(MetricsCheckFilter()) 125 | 126 | 127 | @app.get("/healthcheck", status_code=status.HTTP_200_OK, include_in_schema=False) 128 | def healthcheck(request: Request): 129 | return {"healthcheck": "HEALTHCHECK STATUS: EVERYTHING OK!"} 130 | 131 | 132 | logger.info("Started Unstructured API") 133 | -------------------------------------------------------------------------------- /prepline_general/api/filetypes.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional 3 | from io import BytesIO 4 | 5 | from fastapi import HTTPException, UploadFile 6 | 7 | from unstructured.file_utils.filetype import detect_filetype 8 | from unstructured.file_utils.model import FileType 9 | 10 | 11 | def _remove_optional_info_from_mime_type(content_type: str | None) -> str | None: 12 | """removes charset information from mime types, e.g., 13 | "application/json; charset=utf-8" -> "application/json" 14 | """ 15 | if not content_type: 16 | return content_type 17 | return content_type.split(";")[0] 18 | 19 | 20 | def get_validated_mimetype(file: UploadFile, content_type_hint: str | None = None) -> Optional[str]: 21 | """Given the incoming file, identify and return the correct mimetype. 22 | 23 | Order of operations: 24 | - If user passed content_type as a form param, take it as truth. 25 | - Otherwise, use file.content_type (as set by the Content-Type header) 26 | - If no content_type was passed and the header wasn't useful, call the library's detect_filetype 27 | 28 | Once we have a filetype, check is_partitionable and return 400 if we don't support this file.
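For example (an illustrative case): an upload of sample-docs/layout-parser-paper.pdf sent with a generic "application/octet-stream" header and no content_type form param falls through to detect_filetype(), which should resolve it to "application/pdf"; a file whose detected type is not partitionable raises the 400 described above.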
29 | """ 30 | content_type: str | None = None 31 | 32 | if content_type_hint is not None: 33 | content_type = content_type_hint 34 | else: 35 | content_type = _remove_optional_info_from_mime_type(file.content_type) 36 | 37 | filetype = FileType.from_mime_type(content_type) 38 | 39 | # If content_type was not specified, use the library to identify the file 40 | # We inspect the bytes to do this, so we need to buffer the file 41 | if not filetype or filetype == FileType.UNK: 42 | file_buffer = BytesIO(file.file.read()) 43 | file.file.seek(0) 44 | 45 | file_buffer.name = file.filename 46 | 47 | filetype = detect_filetype(file=file_buffer) 48 | 49 | if not filetype.is_partitionable: 50 | raise HTTPException( 51 | status_code=400, 52 | detail=(f"File type {filetype.mime_type} is not supported."), 53 | ) 54 | 55 | return filetype.mime_type 56 | -------------------------------------------------------------------------------- /prepline_general/api/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/prepline_general/api/models/__init__.py -------------------------------------------------------------------------------- /prepline_general/api/models/form_params.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated, List, Literal, Optional 2 | 3 | from fastapi import Form 4 | from pydantic import BaseModel, BeforeValidator 5 | 6 | from prepline_general.api.utils import SmartValueParser 7 | 8 | 9 | class GeneralFormParams(BaseModel): 10 | """General partition API form parameters for the prepline API. 11 | To add a new parameter, add it here and in the as_form classmethod. 12 | Use Annotated to add a description and example for the parameter. 13 | """ 14 | 15 | xml_keep_tags: bool 16 | languages: Optional[List[str]] 17 | ocr_languages: Optional[List[str]] 18 | skip_infer_table_types: Optional[List[str]] 19 | gz_uncompressed_content_type: Optional[str] 20 | output_format: str 21 | coordinates: bool 22 | encoding: str 23 | content_type: Optional[str] 24 | hi_res_model_name: Optional[str] 25 | include_page_breaks: bool 26 | pdf_infer_table_structure: bool 27 | strategy: str 28 | extract_image_block_types: Optional[List[str]] 29 | unique_element_ids: bool 30 | # -- chunking options -- 31 | chunking_strategy: Optional[str] 32 | combine_under_n_chars: Optional[int] 33 | max_characters: int 34 | multipage_sections: bool 35 | new_after_n_chars: Optional[int] 36 | overlap: int 37 | overlap_all: bool 38 | starting_page_number: Optional[int] = None 39 | include_slide_notes: bool 40 | 41 | @classmethod 42 | def as_form( 43 | cls, 44 | xml_keep_tags: Annotated[ 45 | bool, 46 | Form( 47 | title="Xml Keep Tags", 48 | description="If True, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. 
Only applies to partition_xml.", 49 | ), 50 | BeforeValidator(SmartValueParser[bool]().value_or_first_element), 51 | ] = False, 52 | languages: Annotated[ 53 | List[str], 54 | Form( 55 | title="OCR Languages", 56 | description="The languages present in the document, for use in partitioning and/or OCR", 57 | example="[eng]", 58 | ), 59 | BeforeValidator(SmartValueParser[List[str]]().value_or_first_element), 60 | ] = [], # noqa 61 | ocr_languages: Annotated[ 62 | List[str], 63 | Form( 64 | title="OCR Languages", 65 | description="The languages present in the document, for use in partitioning and/or OCR", 66 | example="[eng]", 67 | ), 68 | BeforeValidator(SmartValueParser[List[str]]().value_or_first_element), 69 | ] = [], 70 | skip_infer_table_types: Annotated[ 71 | List[str], 72 | Form( 73 | title="Skip Infer Table Types", 74 | description=( 75 | "The document types that you want to skip table extraction with. Default: []" 76 | ), 77 | example="['pdf', 'jpg', 'png']", 78 | ), 79 | BeforeValidator(SmartValueParser[List[str]]().value_or_first_element), 80 | ] = [], # noqa 81 | gz_uncompressed_content_type: Annotated[ 82 | Optional[str], 83 | Form( 84 | title="Uncompressed Content Type", 85 | description="If file is gzipped, use this content type after unzipping", 86 | example="application/pdf", 87 | ), 88 | ] = None, 89 | output_format: Annotated[ 90 | Literal["application/json", "text/csv"], 91 | Form( 92 | title="Output Format", 93 | description="The format of the response. Supported formats are application/json and text/csv. Default: application/json.", 94 | example="application/json", 95 | ), 96 | ] = "application/json", 97 | coordinates: Annotated[ 98 | bool, 99 | Form( 100 | title="Coordinates", 101 | description="If true, return coordinates for each element. Default: false", 102 | ), 103 | BeforeValidator(SmartValueParser[bool]().value_or_first_element), 104 | ] = False, 105 | content_type: Annotated[ 106 | Optional[str], 107 | Form( 108 | title="Content type", 109 | description="A hint about the content type to use (such as text/markdown), when there are problems processing a specific file. This value is a MIME type in the format type/subtype.", 110 | example="text/markdown", 111 | ), 112 | BeforeValidator(SmartValueParser[str]().value_or_first_element), 113 | ] = None, 114 | encoding: Annotated[ 115 | str, 116 | Form( 117 | title="Encoding", 118 | description="The encoding method used to decode the text input. Default: utf-8", 119 | example="utf-8", 120 | ), 121 | BeforeValidator(SmartValueParser[str]().value_or_first_element), 122 | ] = "utf-8", 123 | hi_res_model_name: Annotated[ 124 | Optional[str], 125 | Form( 126 | title="Hi Res Model Name", 127 | description="The name of the inference model used when strategy is hi_res", 128 | example="yolox", 129 | ), 130 | BeforeValidator(SmartValueParser[str]().value_or_first_element), 131 | ] = None, 132 | include_page_breaks: Annotated[ 133 | bool, 134 | Form( 135 | title="Include Page Breaks", 136 | description="If True, the output will include page breaks if the filetype supports it. Default: false", 137 | ), 138 | BeforeValidator(SmartValueParser[str]().value_or_first_element), 139 | ] = False, 140 | pdf_infer_table_structure: Annotated[ 141 | bool, 142 | Form( 143 | title="Pdf Infer Table Structure", 144 | description=( 145 | "Deprecated! Use skip_infer_table_types to opt out of table extraction for any " 146 | "file type. 
If False and strategy=hi_res, no Table Elements will be extracted " 147 | "from pdf files regardless of skip_infer_table_types contents." 148 | ), 149 | ), 150 | BeforeValidator(SmartValueParser[bool]().value_or_first_element), 151 | ] = True, 152 | strategy: Annotated[ 153 | Literal["fast", "hi_res", "auto", "ocr_only"], 154 | Form( 155 | title="Strategy", 156 | description="The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto. Default: auto", 157 | examples=["auto", "hi_res"], 158 | ), 159 | BeforeValidator(SmartValueParser[str]().literal_value_stripped_or_first_element), 160 | ] = "auto", 161 | extract_image_block_types: Annotated[ 162 | List[str], 163 | Form( 164 | title="Image block types to extract", 165 | description="The types of elements to extract, for use in extracting image blocks as base64 encoded data stored in metadata fields", 166 | example="""["image", "table"]""", 167 | ), 168 | BeforeValidator(SmartValueParser[List[str]]().value_or_first_element), 169 | ] = [], # noqa 170 | unique_element_ids: Annotated[ 171 | bool, 172 | Form( 173 | title="unique_element_ids", 174 | description="""When `True`, assign UUIDs to element IDs, which guarantees their uniqueness 175 | (useful when using them as primary keys in database). Otherwise a SHA-256 of element text is used. Default: False""", 176 | example=True, 177 | ), 178 | ] = False, 179 | # -- chunking options -- 180 | chunking_strategy: Annotated[ 181 | Optional[Literal["by_title"]], 182 | Form( 183 | title="Chunking Strategy", 184 | description="Use one of the supported strategies to chunk the returned elements. Currently supports: by_title", 185 | examples=["by_title"], 186 | ), 187 | ] = None, 188 | combine_under_n_chars: Annotated[ 189 | Optional[int], 190 | Form( 191 | title="Combine Under N Chars", 192 | description="If chunking strategy is set, combine elements until a section reaches a length of n chars. Default: 500", 193 | example=500, 194 | ), 195 | ] = None, 196 | max_characters: Annotated[ 197 | int, 198 | Form( 199 | title="Max Characters", 200 | description="If chunking strategy is set, cut off new sections after reaching a length of n chars (hard max). Default: 1500", 201 | example=1500, 202 | ), 203 | ] = 500, 204 | multipage_sections: Annotated[ 205 | bool, 206 | Form( 207 | title="Multipage Sections", 208 | description="If chunking strategy is set, determines if sections can span multiple sections. Default: true", 209 | ), 210 | ] = True, 211 | new_after_n_chars: Annotated[ 212 | Optional[int], 213 | Form( 214 | title="New after n chars", 215 | description="If chunking strategy is set, cut off new sections after reaching a length of n chars (soft max). Default: 1500", 216 | example=1500, 217 | ), 218 | ] = None, 219 | overlap: Annotated[ 220 | int, 221 | Form( 222 | title="Overlap", 223 | description="""Specifies the length of a string ("tail") to be drawn from each chunk and prefixed to the 224 | next chunk as a context-preserving mechanism. By default, this only applies to split-chunks 225 | where an oversized element is divided into multiple chunks by text-splitting. Default: 0""", 226 | example=20, 227 | ), 228 | ] = 0, 229 | overlap_all: Annotated[ 230 | bool, 231 | Form( 232 | title="Overlap all", 233 | description="""When `True`, apply overlap between "normal" chunks formed from whole 234 | elements and not subject to text-splitting. Use this with caution as it entails a certain 235 | level of "pollution" of otherwise clean semantic chunk boundaries. 
Default: False""", 236 | example=True, 237 | ), 238 | ] = False, 239 | starting_page_number: Annotated[ 240 | Optional[int], 241 | Form( 242 | title="PDF Starting Page Number", 243 | description=( 244 | "When PDF is split into pages before sending it into the API, providing " 245 | "this information will allow the page number to be assigned correctly." 246 | ), 247 | example=3, 248 | ), 249 | ] = None, 250 | include_slide_notes: Annotated[ 251 | bool, 252 | Form( 253 | title="include_slide_notes", 254 | description=( 255 | "When `True`, slide notes from .ppt and .pptx files" 256 | " will be included in the response. Default: `True`" 257 | ), 258 | example=False, 259 | ), 260 | ] = True, 261 | ) -> "GeneralFormParams": 262 | return cls( 263 | xml_keep_tags=xml_keep_tags, 264 | languages=languages if languages else None, 265 | ocr_languages=ocr_languages if ocr_languages else None, 266 | skip_infer_table_types=skip_infer_table_types, 267 | gz_uncompressed_content_type=gz_uncompressed_content_type, 268 | output_format=output_format, 269 | coordinates=coordinates, 270 | content_type=content_type, 271 | encoding=encoding, 272 | hi_res_model_name=hi_res_model_name, 273 | include_page_breaks=include_page_breaks, 274 | pdf_infer_table_structure=pdf_infer_table_structure, 275 | strategy=strategy, 276 | extract_image_block_types=( 277 | extract_image_block_types if extract_image_block_types else None 278 | ), 279 | chunking_strategy=chunking_strategy, 280 | combine_under_n_chars=combine_under_n_chars, 281 | max_characters=max_characters, 282 | multipage_sections=multipage_sections, 283 | new_after_n_chars=new_after_n_chars, 284 | overlap=overlap, 285 | overlap_all=overlap_all, 286 | unique_element_ids=unique_element_ids, 287 | starting_page_number=starting_page_number, 288 | include_slide_notes=include_slide_notes, 289 | ) 290 | -------------------------------------------------------------------------------- /prepline_general/api/openapi.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from fastapi import FastAPI 4 | from fastapi.openapi.utils import get_openapi 5 | 6 | 7 | def set_custom_openapi(app: FastAPI) -> None: 8 | """Generate a custom OpenAPI schema for the app""" 9 | 10 | def custom_openapi() -> dict[str, Any]: 11 | if app.openapi_schema: 12 | return app.openapi_schema 13 | openapi_schema = get_openapi( 14 | title=app.title, 15 | version=app.version, 16 | summary=app.summary, 17 | description=app.description, 18 | servers=app.servers, 19 | routes=app.routes, 20 | tags=app.openapi_tags, 21 | ) 22 | _apply_customizations(openapi_schema) 23 | 24 | app.openapi_schema = openapi_schema 25 | return app.openapi_schema 26 | 27 | app.openapi = custom_openapi # type: ignore 28 | 29 | 30 | def _apply_customizations(openapi_schema: dict[str, Any]) -> None: 31 | """Add customizations to the OpenAPI schema""" 32 | 33 | # Add security 34 | openapi_schema["security"] = [{"ApiKeyAuth": []}] 35 | 36 | # Add retries 37 | openapi_schema["x-speakeasy-retries"] = { 38 | "strategy": "backoff", 39 | "backoff": { 40 | "initialInterval": 500, 41 | "maxInterval": 60000, 42 | "maxElapsedTime": 900000, 43 | "exponent": 1.5, 44 | }, 45 | "statusCodes": [ 46 | "5xx", 47 | ], 48 | "retryConnectionErrors": True, 49 | } 50 | 51 | # Response changes 52 | openapi_schema["paths"]["/general/v0/general"]["post"]["responses"]["200"]["content"][ 53 | "application/json" 54 | ]["schema"] = { 55 | "items": {"$ref": "#/components/schemas/Element"}, 56 | "title": 
"Response Partition Parameters", 57 | "type": "array", 58 | } 59 | 60 | # Schema changes 61 | 62 | # Add securitySchemes 63 | # TODO: Implement security per the FastAPI documentation: 64 | # https://fastapi.tiangolo.com/reference/security/?h=apikey 65 | openapi_schema["components"]["securitySchemes"] = { 66 | "ApiKeyAuth": { 67 | "type": "apiKey", 68 | "name": "unstructured-api-key", 69 | "in": "header", 70 | "x-speakeasy-example": "YOUR_API_KEY", 71 | } 72 | } 73 | 74 | # TODO: Instead of a list of parameters, crete a PartitionParameters model 75 | # and declare schema keys (type, format, description) as attributes 76 | # https://fastapi.tiangolo.com/reference/openapi/models/?h=model 77 | # Update the schema key from `Body_partition` to `partition_parameters` 78 | 79 | # TODO: Similarly, create an Element model 80 | # https://fastapi.tiangolo.com/reference/openapi/models/?h=model 81 | # Add Elements schema 82 | openapi_schema["components"]["schemas"]["Element"] = { 83 | "properties": { 84 | "type": {"type": "string", "title": "Type"}, 85 | "element_id": {"type": "string", "title": "Element Id"}, 86 | "metadata": {"type": "object", "title": "Metadata"}, 87 | "text": {"type": "string", "title": "Text"}, 88 | }, 89 | "type": "object", 90 | "required": ["type", "element_id", "metadata", "text"], 91 | "title": "Element", 92 | } 93 | 94 | # Must manually correct the schema for the files parameter as due to a bug 95 | # described here: https://github.com/tiangolo/fastapi/discussions/10280 96 | # files parameter cannot be described with an annotation. 97 | # TODO: Check if the bug is fixed and remove this workaround 98 | for key in openapi_schema["components"]["schemas"]: 99 | if "partition_parameters" in key: 100 | general_pipeline_schema = openapi_schema["components"]["schemas"][key] 101 | break 102 | else: 103 | # Could not find the schema to update, returning 104 | return 105 | 106 | general_pipeline_schema["properties"]["files"] = { 107 | "type": "string", 108 | "format": "binary", 109 | "description": "The file to extract", 110 | "required": "true", 111 | "examples": [ 112 | { 113 | "summary": "File to be partitioned", 114 | "externalValue": "https://github.com/Unstructured-IO/unstructured/blob/98d3541909f64290b5efb65a226fc3ee8a7cc5ee/example-docs/layout-parser-paper.pdf", 115 | } 116 | ], 117 | } 118 | -------------------------------------------------------------------------------- /prepline_general/api/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import TypeVar, Union, List, Optional, Generic, get_origin, get_args, Type, Any, Tuple 3 | 4 | T = TypeVar("T") 5 | E = TypeVar("E") 6 | 7 | 8 | def _cast_to_type(value: Any, origin_class: type) -> Any: 9 | """Cast a value to a type E 10 | 11 | Args: 12 | value (Any): value to cast to a type T 13 | origin_class (type): type to cast the value to. Should be one of simple types 14 | 15 | Returns: 16 | T: value cast to a type T 17 | """ 18 | if isinstance(value, str) and (origin_class == int or origin_class == float): 19 | return origin_class(value) # noqa 20 | if origin_class == bool and isinstance(value, str): 21 | return value.lower() == "true" 22 | return value 23 | 24 | 25 | def _return_cast_first_element(values: list[E], origin_class: type) -> E | None: 26 | """Return the first element of a list cast to a type T, or None if the list is empty 27 | 28 | Args: 29 | values (list[str]): list of strings 30 | origin_class (type): type to cast the first element to. 
Should be one of simple types 31 | 32 | Returns: 33 | T | None: first element cast to a type T, or None if the list is empty 34 | """ 35 | value = next(iter(values), None) 36 | if value is not None: 37 | return _cast_to_type(value, origin_class) # noqa 38 | return value 39 | 40 | 41 | def is_convertible_to_list(s: str) -> Tuple[bool, Union[List, str]]: 42 | """ 43 | Determines if a given string is convertible to a list. 44 | 45 | This function first tries to parse the string as JSON. If the parsed JSON is a list, it returns 46 | True along with the list. If parsing as JSON fails, it then checks if the string can be split 47 | into a list using predefined delimiters ("," or "+"). If so, it returns True and the resulting list. 48 | If neither condition is met, it returns False and a message indicating the string cannot 49 | be converted to a list. 50 | """ 51 | 52 | try: 53 | result = json.loads(s) 54 | if isinstance(result, list): 55 | return True, result # Return the list if conversion is successful 56 | else: 57 | return False, "Input is valid JSON but not a list." # Valid JSON but not a list 58 | except json.JSONDecodeError: 59 | pass # proceed to check using delimiters if JSON parsing fails 60 | 61 | delimiters = ["+", ","] 62 | for delimiter in delimiters: 63 | if delimiter in s: 64 | return True, s.split(delimiter) 65 | 66 | return False, "Input is not valid JSON." # Invalid JSON 67 | 68 | 69 | class SmartValueParser(Generic[T]): 70 | """Class to handle API parameters that are passed in the form of a specific value or as a list of strings from which 71 | the first element is used, cast to a proper type. 72 | Should be parametrized with the type to which the value should be cast. 73 | 74 | Examples: 75 | SmartValueParser[int]().value_or_first_element(value) 76 | SmartValueParser[list[int]]().value_or_first_element(value) 77 | """ 78 | 79 | def value_or_first_element(self, value: Union[T, list[T]]) -> list[T] | T | None: 80 | """If value is a list, return the first element cast to a type T, otherwise return the value itself 81 | 82 | Args: 83 | value (Union[T, List[str]]): value to cast to a type T or return as is 84 | """ 85 | origin_class, container_elems_class = self._get_origin_container_classes() 86 | if isinstance(value, list) and not isinstance(value, origin_class): 87 | extracted_value: T | None = _return_cast_first_element(value, origin_class) 88 | return extracted_value 89 | elif isinstance(value, list) and origin_class == list and container_elems_class: 90 | if len(value) == 1: 91 | is_list, result = is_convertible_to_list(str(value[0])) 92 | new_value = result if is_list else value 93 | return [_cast_to_type(elem, container_elems_class) for elem in new_value] 94 | return [_cast_to_type(elem, container_elems_class) for elem in value] 95 | return _cast_to_type(value, origin_class) # noqa 96 | 97 | def literal_value_stripped_or_first_element(self, value: str) -> str | None: 98 | """Returns the value itself for literal strings and strips quotation characters.
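For example, a raw form value of '"hi_res"' or "'auto'" is returned as hi_res / auto with the quote characters stripped, before being cast.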
99 | 100 | Args: 101 | value (Union[T, List[str]]): value to cast to a type T or return as is 102 | """ 103 | origin_class, container_elems_class = self._get_origin_container_classes() 104 | value = value.replace("'", "") 105 | value = value.replace('"', "") 106 | return _cast_to_type(value, origin_class) 107 | 108 | def _get_origin_container_classes(self) -> tuple[type, type | None]: 109 | """Extracts class (and container class if it's a list) from a type hint 110 | 111 | Returns: 112 | tuple[type, type | None]: class and container class of the type hint 113 | """ 114 | type_info = self.__orig_class__.__args__[0] # type: ignore 115 | origin_class = get_origin(type_info) 116 | if origin_class is None: 117 | # it's a basic type like int or bool - return it and no container class 118 | return type_info, None 119 | origin_args = get_args(type_info) 120 | container_elems_class = origin_args[0] if origin_args else None 121 | return origin_class, container_elems_class 122 | -------------------------------------------------------------------------------- /preprocessing-pipeline-family.yaml: -------------------------------------------------------------------------------- 1 | name: general 2 | version: 0.0.85 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 100 3 | 4 | [tool.pyright] 5 | pythonPlatform = "Linux" 6 | pythonVersion = "3.12" 7 | reportUnnecessaryCast = true 8 | typeCheckingMode = "strict" 9 | 10 | [tool.ruff] 11 | line-length = 100 12 | select = [ 13 | "C4", # -- flake8-comprehensions -- 14 | "COM", # -- flake8-commas -- 15 | "E", # -- pycodestyle errors -- 16 | "F", # -- pyflakes -- 17 | "I", # -- isort (imports) -- 18 | "PLR0402", # -- Name compared with itself like `foo == foo` -- 19 | "PT", # -- flake8-pytest-style -- 20 | "SIM", # -- flake8-simplify -- 21 | "UP015", # -- redundant `open()` mode parameter (like "r" is default) -- 22 | "UP018", # -- Unnecessary {literal_type} call like `str("abc")`. 
(rewrite as a literal) -- 23 | "UP032", # -- Use f-string instead of `.format()` call -- 24 | "UP034", # -- Avoid extraneous parentheses -- 25 | ] 26 | ignore = [ 27 | "COM812", # -- over aggressively insists on trailing commas where not desireable -- 28 | "PT011", # -- pytest.raises({exc}) too broad, use match param or more specific exception -- 29 | "PT012", # -- pytest.raises() block should contain a single simple statement -- 30 | "SIM117", # -- merge `with` statements for context managers that have same scope -- 31 | ] 32 | 33 | [tool.ruff.lint.isort] 34 | known-first-party = [ 35 | "unstructured", 36 | "unstructured_inference", 37 | ] 38 | -------------------------------------------------------------------------------- /requirements/base.in: -------------------------------------------------------------------------------- 1 | -c constraints.in 2 | unstructured[all-docs] 3 | # Pinning click due to a unicode issue in black 4 | # can remove after black drops support for Python 3.6 5 | # ref: https://github.com/psf/black/issues/2964 6 | click==8.1.3 7 | fastapi 8 | uvicorn 9 | ratelimit 10 | requests 11 | backoff 12 | pypdf 13 | pycryptodome 14 | psutil 15 | -------------------------------------------------------------------------------- /requirements/base.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.12 3 | # by the following command: 4 | # 5 | # pip-compile requirements/base.in 6 | # 7 | aiofiles==24.1.0 8 | # via unstructured-client 9 | annotated-types==0.7.0 10 | # via pydantic 11 | antlr4-python3-runtime==4.9.3 12 | # via omegaconf 13 | anyio==4.8.0 14 | # via 15 | # httpx 16 | # starlette 17 | backoff==2.2.1 18 | # via 19 | # -r requirements/base.in 20 | # unstructured 21 | beautifulsoup4==4.12.3 22 | # via unstructured 23 | cachetools==5.5.1 24 | # via google-auth 25 | certifi==2024.12.14 26 | # via 27 | # httpcore 28 | # httpx 29 | # requests 30 | cffi==1.17.1 31 | # via cryptography 32 | chardet==5.2.0 33 | # via unstructured 34 | charset-normalizer==3.4.1 35 | # via 36 | # pdfminer-six 37 | # requests 38 | click==8.1.3 39 | # via 40 | # -r requirements/base.in 41 | # nltk 42 | # python-oxmsg 43 | # uvicorn 44 | coloredlogs==15.0.1 45 | # via onnxruntime 46 | contourpy==1.3.1 47 | # via matplotlib 48 | cryptography==44.0.1 49 | # via 50 | # pdfminer-six 51 | # unstructured-client 52 | cycler==0.12.1 53 | # via matplotlib 54 | dataclasses-json==0.6.7 55 | # via unstructured 56 | deprecated==1.2.18 57 | # via pikepdf 58 | effdet==0.4.1 59 | # via unstructured 60 | emoji==2.14.1 61 | # via unstructured 62 | et-xmlfile==2.0.0 63 | # via openpyxl 64 | eval-type-backport==0.2.2 65 | # via unstructured-client 66 | fastapi==0.115.8 67 | # via -r requirements/base.in 68 | filelock==3.17.0 69 | # via 70 | # huggingface-hub 71 | # torch 72 | # transformers 73 | filetype==1.2.0 74 | # via unstructured 75 | flatbuffers==25.1.24 76 | # via onnxruntime 77 | fonttools==4.55.8 78 | # via matplotlib 79 | fsspec==2024.12.0 80 | # via 81 | # huggingface-hub 82 | # torch 83 | google-api-core[grpc]==2.24.1 84 | # via google-cloud-vision 85 | google-auth==2.38.0 86 | # via 87 | # google-api-core 88 | # google-cloud-vision 89 | google-cloud-vision==3.9.0 90 | # via unstructured 91 | googleapis-common-protos==1.66.0 92 | # via 93 | # google-api-core 94 | # grpcio-status 95 | grpcio==1.70.0 96 | # via 97 | # google-api-core 98 | # grpcio-status 99 | grpcio-status==1.70.0 100 | # via google-api-core 
101 | h11==0.16.0 102 | # via 103 | # httpcore 104 | # uvicorn 105 | html5lib==1.1 106 | # via unstructured 107 | httpcore==1.0.9 108 | # via httpx 109 | httpx==0.28.1 110 | # via unstructured-client 111 | huggingface-hub==0.32.1 112 | # via 113 | # timm 114 | # tokenizers 115 | # transformers 116 | # unstructured-inference 117 | humanfriendly==10.0 118 | # via coloredlogs 119 | idna==3.10 120 | # via 121 | # anyio 122 | # httpx 123 | # requests 124 | jinja2==3.1.6 125 | # via torch 126 | joblib==1.4.2 127 | # via nltk 128 | jsonpath-python==1.0.6 129 | # via unstructured-client 130 | kiwisolver==1.4.8 131 | # via matplotlib 132 | langdetect==1.0.9 133 | # via unstructured 134 | lxml==5.3.0 135 | # via 136 | # pikepdf 137 | # python-docx 138 | # python-pptx 139 | # unstructured 140 | markdown==3.7 141 | # via unstructured 142 | markupsafe==3.0.2 143 | # via jinja2 144 | marshmallow==3.26.0 145 | # via dataclasses-json 146 | matplotlib==3.10.0 147 | # via 148 | # pycocotools 149 | # unstructured-inference 150 | mpmath==1.3.0 151 | # via sympy 152 | mypy-extensions==1.0.0 153 | # via typing-inspect 154 | nest-asyncio==1.6.0 155 | # via unstructured-client 156 | networkx==3.4.2 157 | # via 158 | # torch 159 | # unstructured 160 | nltk==3.9.1 161 | # via unstructured 162 | numpy==1.26.4 163 | # via 164 | # -c requirements/constraints.in 165 | # contourpy 166 | # matplotlib 167 | # onnx 168 | # onnxruntime 169 | # opencv-python 170 | # pandas 171 | # pycocotools 172 | # scipy 173 | # torchvision 174 | # transformers 175 | # unstructured 176 | # unstructured-inference 177 | olefile==0.47 178 | # via python-oxmsg 179 | omegaconf==2.3.0 180 | # via effdet 181 | onnx==1.17.0 182 | # via 183 | # unstructured 184 | # unstructured-inference 185 | onnxruntime==1.20.1 186 | # via unstructured-inference 187 | opencv-python==4.11.0.86 188 | # via unstructured-inference 189 | openpyxl==3.1.5 190 | # via unstructured 191 | packaging==24.2 192 | # via 193 | # huggingface-hub 194 | # marshmallow 195 | # matplotlib 196 | # onnxruntime 197 | # pikepdf 198 | # transformers 199 | # unstructured-pytesseract 200 | pandas==2.2.3 201 | # via 202 | # unstructured 203 | # unstructured-inference 204 | pdf2image==1.17.0 205 | # via unstructured 206 | pdfminer-six==20240706 207 | # via 208 | # unstructured 209 | # unstructured-inference 210 | pi-heif==0.21.0 211 | # via unstructured 212 | pikepdf==9.5.1 213 | # via unstructured 214 | pillow==11.1.0 215 | # via 216 | # matplotlib 217 | # pdf2image 218 | # pi-heif 219 | # pikepdf 220 | # python-pptx 221 | # torchvision 222 | # unstructured-pytesseract 223 | proto-plus==1.26.0 224 | # via 225 | # google-api-core 226 | # google-cloud-vision 227 | protobuf==5.29.3 228 | # via 229 | # google-api-core 230 | # google-cloud-vision 231 | # googleapis-common-protos 232 | # grpcio-status 233 | # onnx 234 | # onnxruntime 235 | # proto-plus 236 | psutil==6.1.1 237 | # via 238 | # -r requirements/base.in 239 | # unstructured 240 | pyasn1==0.6.1 241 | # via 242 | # pyasn1-modules 243 | # rsa 244 | pyasn1-modules==0.4.1 245 | # via google-auth 246 | pycocotools==2.0.8 247 | # via effdet 248 | pycparser==2.22 249 | # via cffi 250 | pycryptodome==3.21.0 251 | # via -r requirements/base.in 252 | pydantic==2.10.6 253 | # via 254 | # fastapi 255 | # unstructured-client 256 | pydantic-core==2.27.2 257 | # via pydantic 258 | pypandoc==1.15 259 | # via unstructured 260 | pyparsing==3.2.1 261 | # via matplotlib 262 | pypdf==5.2.0 263 | # via 264 | # -r requirements/base.in 265 | # unstructured 266 
| # unstructured-client 267 | pypdfium2==4.30.1 268 | # via unstructured-inference 269 | python-dateutil==2.9.0.post0 270 | # via 271 | # matplotlib 272 | # pandas 273 | # unstructured-client 274 | python-docx==1.1.2 275 | # via unstructured 276 | python-iso639==2025.1.28 277 | # via unstructured 278 | python-magic==0.4.27 279 | # via unstructured 280 | python-multipart==0.0.20 281 | # via unstructured-inference 282 | python-oxmsg==0.0.1 283 | # via unstructured 284 | python-pptx==1.0.2 285 | # via unstructured 286 | pytz==2024.2 287 | # via pandas 288 | pyyaml==6.0.2 289 | # via 290 | # huggingface-hub 291 | # omegaconf 292 | # timm 293 | # transformers 294 | rapidfuzz==3.12.1 295 | # via 296 | # unstructured 297 | # unstructured-inference 298 | ratelimit==2.2.1 299 | # via -r requirements/base.in 300 | regex==2024.11.6 301 | # via 302 | # nltk 303 | # transformers 304 | requests==2.32.3 305 | # via 306 | # -r requirements/base.in 307 | # google-api-core 308 | # huggingface-hub 309 | # requests-toolbelt 310 | # transformers 311 | # unstructured 312 | requests-toolbelt==1.0.0 313 | # via unstructured-client 314 | rsa==4.9 315 | # via google-auth 316 | safetensors==0.5.2 317 | # via 318 | # timm 319 | # transformers 320 | scipy==1.15.1 321 | # via unstructured-inference 322 | six==1.17.0 323 | # via 324 | # html5lib 325 | # langdetect 326 | # python-dateutil 327 | sniffio==1.3.1 328 | # via anyio 329 | soupsieve==2.6 330 | # via beautifulsoup4 331 | starlette==0.41.2 332 | # via 333 | # -c requirements/constraints.in 334 | # fastapi 335 | sympy==1.13.3 336 | # via 337 | # onnxruntime 338 | # torch 339 | timm==1.0.14 340 | # via 341 | # effdet 342 | # unstructured-inference 343 | tokenizers==0.21.0 344 | # via transformers 345 | torch==2.7.0 346 | # via 347 | # effdet 348 | # timm 349 | # torchvision 350 | # unstructured-inference 351 | torchvision==0.22.0 352 | # via 353 | # effdet 354 | # timm 355 | tqdm==4.67.1 356 | # via 357 | # huggingface-hub 358 | # nltk 359 | # transformers 360 | # unstructured 361 | transformers==4.50.0 362 | # via unstructured-inference 363 | typing-extensions==4.12.2 364 | # via 365 | # anyio 366 | # fastapi 367 | # huggingface-hub 368 | # pydantic 369 | # pydantic-core 370 | # python-docx 371 | # python-oxmsg 372 | # python-pptx 373 | # torch 374 | # typing-inspect 375 | # unstructured 376 | typing-inspect==0.9.0 377 | # via 378 | # dataclasses-json 379 | # unstructured-client 380 | tzdata==2025.1 381 | # via pandas 382 | unstructured[all-docs]==0.16.17 383 | # via -r requirements/base.in 384 | unstructured-client==0.29.0 385 | # via unstructured 386 | unstructured-inference==0.8.6 387 | # via unstructured 388 | unstructured-pytesseract==0.3.13 389 | # via unstructured 390 | urllib3==2.3.0 391 | # via requests 392 | uvicorn==0.34.0 393 | # via -r requirements/base.in 394 | webencodings==0.5.1 395 | # via html5lib 396 | wrapt==1.17.2 397 | # via 398 | # deprecated 399 | # unstructured 400 | xlrd==2.0.1 401 | # via unstructured 402 | xlsxwriter==3.2.2 403 | # via python-pptx 404 | 405 | # The following packages are considered to be unsafe in a requirements file: 406 | # setuptools 407 | -------------------------------------------------------------------------------- /requirements/constraints.in: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # This file can house global constraints that aren't *direct* requirements of the 
package or any 3 | # extras. Putting a dependency here will only affect dependency sets that contain them -- in other 4 | # words, if something does not require a constraint, it will not be installed. 5 | #################################################################################################### 6 | numpy<2.0.0 7 | # later versions of Starlette break middleware 8 | starlette==0.41.2 -------------------------------------------------------------------------------- /requirements/test.in: -------------------------------------------------------------------------------- 1 | -c constraints.in 2 | black 3 | # NOTE(mrobinson) - Pinning click due to a unicode issue in black 4 | # can remove after black drops support for Python 3.6 5 | # ref: https://github.com/psf/black/issues/2964 6 | click==8.1.3 7 | flake8 8 | mypy 9 | pytest-cov 10 | pytest-mock 11 | nbdev 12 | jupyter 13 | httpx 14 | deepdiff 15 | -------------------------------------------------------------------------------- /requirements/test.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.12 3 | # by the following command: 4 | # 5 | # pip-compile --output-file=requirements/test.txt requirements/base.txt requirements/test.in 6 | # 7 | aiofiles==24.1.0 8 | # via 9 | # -r requirements/base.txt 10 | # unstructured-client 11 | annotated-types==0.7.0 12 | # via 13 | # -r requirements/base.txt 14 | # pydantic 15 | antlr4-python3-runtime==4.9.3 16 | # via 17 | # -r requirements/base.txt 18 | # omegaconf 19 | anyio==4.8.0 20 | # via 21 | # -r requirements/base.txt 22 | # httpx 23 | # jupyter-server 24 | # starlette 25 | appnope==0.1.4 26 | # via ipykernel 27 | argon2-cffi==23.1.0 28 | # via jupyter-server 29 | argon2-cffi-bindings==21.2.0 30 | # via argon2-cffi 31 | arrow==1.3.0 32 | # via isoduration 33 | asttokens==3.0.0 34 | # via 35 | # nbdev 36 | # stack-data 37 | astunparse==1.6.3 38 | # via nbdev 39 | async-lru==2.0.4 40 | # via jupyterlab 41 | attrs==25.1.0 42 | # via 43 | # jsonschema 44 | # referencing 45 | babel==2.16.0 46 | # via jupyterlab-server 47 | backoff==2.2.1 48 | # via 49 | # -r requirements/base.txt 50 | # unstructured 51 | beautifulsoup4==4.12.3 52 | # via 53 | # -r requirements/base.txt 54 | # nbconvert 55 | # unstructured 56 | black==25.1.0 57 | # via -r requirements/test.in 58 | bleach[css]==6.2.0 59 | # via nbconvert 60 | cachetools==5.5.1 61 | # via 62 | # -r requirements/base.txt 63 | # google-auth 64 | certifi==2024.12.14 65 | # via 66 | # -r requirements/base.txt 67 | # httpcore 68 | # httpx 69 | # requests 70 | cffi==1.17.1 71 | # via 72 | # -r requirements/base.txt 73 | # argon2-cffi-bindings 74 | # cryptography 75 | chardet==5.2.0 76 | # via 77 | # -r requirements/base.txt 78 | # unstructured 79 | charset-normalizer==3.4.1 80 | # via 81 | # -r requirements/base.txt 82 | # pdfminer-six 83 | # requests 84 | click==8.1.3 85 | # via 86 | # -r requirements/base.txt 87 | # -r requirements/test.in 88 | # black 89 | # nltk 90 | # python-oxmsg 91 | # uvicorn 92 | coloredlogs==15.0.1 93 | # via 94 | # -r requirements/base.txt 95 | # onnxruntime 96 | comm==0.2.2 97 | # via 98 | # ipykernel 99 | # ipywidgets 100 | contourpy==1.3.1 101 | # via 102 | # -r requirements/base.txt 103 | # matplotlib 104 | coverage[toml]==7.6.10 105 | # via pytest-cov 106 | cryptography==44.0.1 107 | # via 108 | # -r requirements/base.txt 109 | # pdfminer-six 110 | # unstructured-client 111 | cycler==0.12.1 112 | # via 113 | # -r 
requirements/base.txt 114 | # matplotlib 115 | dataclasses-json==0.6.7 116 | # via 117 | # -r requirements/base.txt 118 | # unstructured 119 | debugpy==1.8.12 120 | # via ipykernel 121 | decorator==5.1.1 122 | # via ipython 123 | deepdiff==8.1.1 124 | # via -r requirements/test.in 125 | defusedxml==0.7.1 126 | # via nbconvert 127 | deprecated==1.2.18 128 | # via 129 | # -r requirements/base.txt 130 | # pikepdf 131 | effdet==0.4.1 132 | # via 133 | # -r requirements/base.txt 134 | # unstructured 135 | emoji==2.14.1 136 | # via 137 | # -r requirements/base.txt 138 | # unstructured 139 | et-xmlfile==2.0.0 140 | # via 141 | # -r requirements/base.txt 142 | # openpyxl 143 | eval-type-backport==0.2.2 144 | # via 145 | # -r requirements/base.txt 146 | # unstructured-client 147 | execnb==0.1.11 148 | # via nbdev 149 | executing==2.2.0 150 | # via stack-data 151 | fastapi==0.115.8 152 | # via -r requirements/base.txt 153 | fastcore==1.7.28 154 | # via 155 | # execnb 156 | # ghapi 157 | # nbdev 158 | fastjsonschema==2.21.1 159 | # via nbformat 160 | filelock==3.17.0 161 | # via 162 | # -r requirements/base.txt 163 | # huggingface-hub 164 | # torch 165 | # transformers 166 | filetype==1.2.0 167 | # via 168 | # -r requirements/base.txt 169 | # unstructured 170 | flake8==7.1.1 171 | # via -r requirements/test.in 172 | flatbuffers==25.1.24 173 | # via 174 | # -r requirements/base.txt 175 | # onnxruntime 176 | fonttools==4.55.8 177 | # via 178 | # -r requirements/base.txt 179 | # matplotlib 180 | fqdn==1.5.1 181 | # via jsonschema 182 | fsspec==2024.12.0 183 | # via 184 | # -r requirements/base.txt 185 | # huggingface-hub 186 | # torch 187 | ghapi==1.0.6 188 | # via nbdev 189 | google-api-core[grpc]==2.24.1 190 | # via 191 | # -r requirements/base.txt 192 | # google-cloud-vision 193 | google-auth==2.38.0 194 | # via 195 | # -r requirements/base.txt 196 | # google-api-core 197 | # google-cloud-vision 198 | google-cloud-vision==3.9.0 199 | # via 200 | # -r requirements/base.txt 201 | # unstructured 202 | googleapis-common-protos==1.66.0 203 | # via 204 | # -r requirements/base.txt 205 | # google-api-core 206 | # grpcio-status 207 | grpcio==1.70.0 208 | # via 209 | # -r requirements/base.txt 210 | # google-api-core 211 | # grpcio-status 212 | grpcio-status==1.70.0 213 | # via 214 | # -r requirements/base.txt 215 | # google-api-core 216 | h11==0.16.0 217 | # via 218 | # -r requirements/base.txt 219 | # httpcore 220 | # uvicorn 221 | html5lib==1.1 222 | # via 223 | # -r requirements/base.txt 224 | # unstructured 225 | httpcore==1.0.9 226 | # via 227 | # -r requirements/base.txt 228 | # httpx 229 | httpx==0.28.1 230 | # via 231 | # -r requirements/base.txt 232 | # -r requirements/test.in 233 | # jupyterlab 234 | # unstructured-client 235 | huggingface-hub==0.32.1 236 | # via 237 | # -r requirements/base.txt 238 | # timm 239 | # tokenizers 240 | # transformers 241 | # unstructured-inference 242 | humanfriendly==10.0 243 | # via 244 | # -r requirements/base.txt 245 | # coloredlogs 246 | idna==3.10 247 | # via 248 | # -r requirements/base.txt 249 | # anyio 250 | # httpx 251 | # jsonschema 252 | # requests 253 | iniconfig==2.0.0 254 | # via pytest 255 | ipykernel==6.29.5 256 | # via 257 | # jupyter 258 | # jupyter-console 259 | # jupyterlab 260 | ipython==8.31.0 261 | # via 262 | # execnb 263 | # ipykernel 264 | # ipywidgets 265 | # jupyter-console 266 | ipywidgets==8.1.5 267 | # via jupyter 268 | isoduration==20.11.0 269 | # via jsonschema 270 | jedi==0.19.2 271 | # via ipython 272 | jinja2==3.1.6 273 | # via 274 
| # -r requirements/base.txt 275 | # jupyter-server 276 | # jupyterlab 277 | # jupyterlab-server 278 | # nbconvert 279 | # torch 280 | joblib==1.4.2 281 | # via 282 | # -r requirements/base.txt 283 | # nltk 284 | json5==0.10.0 285 | # via jupyterlab-server 286 | jsonpath-python==1.0.6 287 | # via 288 | # -r requirements/base.txt 289 | # unstructured-client 290 | jsonpointer==3.0.0 291 | # via jsonschema 292 | jsonschema[format-nongpl]==4.23.0 293 | # via 294 | # jupyter-events 295 | # jupyterlab-server 296 | # nbformat 297 | jsonschema-specifications==2024.10.1 298 | # via jsonschema 299 | jupyter==1.1.1 300 | # via -r requirements/test.in 301 | jupyter-client==8.6.3 302 | # via 303 | # ipykernel 304 | # jupyter-console 305 | # jupyter-server 306 | # nbclient 307 | jupyter-console==6.6.3 308 | # via jupyter 309 | jupyter-core==5.7.2 310 | # via 311 | # ipykernel 312 | # jupyter-client 313 | # jupyter-console 314 | # jupyter-server 315 | # jupyterlab 316 | # nbclient 317 | # nbconvert 318 | # nbformat 319 | jupyter-events==0.11.0 320 | # via jupyter-server 321 | jupyter-lsp==2.2.5 322 | # via jupyterlab 323 | jupyter-server==2.15.0 324 | # via 325 | # jupyter-lsp 326 | # jupyterlab 327 | # jupyterlab-server 328 | # notebook 329 | # notebook-shim 330 | jupyter-server-terminals==0.5.3 331 | # via jupyter-server 332 | jupyterlab==4.3.5 333 | # via 334 | # jupyter 335 | # notebook 336 | jupyterlab-pygments==0.3.0 337 | # via nbconvert 338 | jupyterlab-server==2.27.3 339 | # via 340 | # jupyterlab 341 | # notebook 342 | jupyterlab-widgets==3.0.13 343 | # via ipywidgets 344 | kiwisolver==1.4.8 345 | # via 346 | # -r requirements/base.txt 347 | # matplotlib 348 | langdetect==1.0.9 349 | # via 350 | # -r requirements/base.txt 351 | # unstructured 352 | lxml==5.3.0 353 | # via 354 | # -r requirements/base.txt 355 | # pikepdf 356 | # python-docx 357 | # python-pptx 358 | # unstructured 359 | markdown==3.7 360 | # via 361 | # -r requirements/base.txt 362 | # unstructured 363 | markupsafe==3.0.2 364 | # via 365 | # -r requirements/base.txt 366 | # jinja2 367 | # nbconvert 368 | marshmallow==3.26.0 369 | # via 370 | # -r requirements/base.txt 371 | # dataclasses-json 372 | matplotlib==3.10.0 373 | # via 374 | # -r requirements/base.txt 375 | # pycocotools 376 | # unstructured-inference 377 | matplotlib-inline==0.1.7 378 | # via 379 | # ipykernel 380 | # ipython 381 | mccabe==0.7.0 382 | # via flake8 383 | mistune==3.1.1 384 | # via nbconvert 385 | mpmath==1.3.0 386 | # via 387 | # -r requirements/base.txt 388 | # sympy 389 | mypy==1.14.1 390 | # via -r requirements/test.in 391 | mypy-extensions==1.0.0 392 | # via 393 | # -r requirements/base.txt 394 | # black 395 | # mypy 396 | # typing-inspect 397 | nbclient==0.10.2 398 | # via nbconvert 399 | nbconvert==7.16.6 400 | # via 401 | # jupyter 402 | # jupyter-server 403 | nbdev==2.3.34 404 | # via -r requirements/test.in 405 | nbformat==5.10.4 406 | # via 407 | # jupyter-server 408 | # nbclient 409 | # nbconvert 410 | nest-asyncio==1.6.0 411 | # via 412 | # -r requirements/base.txt 413 | # ipykernel 414 | # unstructured-client 415 | networkx==3.4.2 416 | # via 417 | # -r requirements/base.txt 418 | # torch 419 | # unstructured 420 | nltk==3.9.1 421 | # via 422 | # -r requirements/base.txt 423 | # unstructured 424 | notebook==7.3.2 425 | # via jupyter 426 | notebook-shim==0.2.4 427 | # via 428 | # jupyterlab 429 | # notebook 430 | numpy==1.26.4 431 | # via 432 | # -c requirements/constraints.in 433 | # -r requirements/base.txt 434 | # contourpy 435 | # 
matplotlib 436 | # onnx 437 | # onnxruntime 438 | # opencv-python 439 | # pandas 440 | # pycocotools 441 | # scipy 442 | # torchvision 443 | # transformers 444 | # unstructured 445 | # unstructured-inference 446 | olefile==0.47 447 | # via 448 | # -r requirements/base.txt 449 | # python-oxmsg 450 | omegaconf==2.3.0 451 | # via 452 | # -r requirements/base.txt 453 | # effdet 454 | onnx==1.17.0 455 | # via 456 | # -r requirements/base.txt 457 | # unstructured 458 | # unstructured-inference 459 | onnxruntime==1.20.1 460 | # via 461 | # -r requirements/base.txt 462 | # unstructured-inference 463 | opencv-python==4.11.0.86 464 | # via 465 | # -r requirements/base.txt 466 | # unstructured-inference 467 | openpyxl==3.1.5 468 | # via 469 | # -r requirements/base.txt 470 | # unstructured 471 | orderly-set==5.2.3 472 | # via deepdiff 473 | overrides==7.7.0 474 | # via jupyter-server 475 | packaging==24.2 476 | # via 477 | # -r requirements/base.txt 478 | # black 479 | # fastcore 480 | # ghapi 481 | # huggingface-hub 482 | # ipykernel 483 | # jupyter-server 484 | # jupyterlab 485 | # jupyterlab-server 486 | # marshmallow 487 | # matplotlib 488 | # nbconvert 489 | # nbdev 490 | # onnxruntime 491 | # pikepdf 492 | # pytest 493 | # transformers 494 | # unstructured-pytesseract 495 | pandas==2.2.3 496 | # via 497 | # -r requirements/base.txt 498 | # unstructured 499 | # unstructured-inference 500 | pandocfilters==1.5.1 501 | # via nbconvert 502 | parso==0.8.4 503 | # via jedi 504 | pathspec==0.12.1 505 | # via black 506 | pdf2image==1.17.0 507 | # via 508 | # -r requirements/base.txt 509 | # unstructured 510 | pdfminer-six==20240706 511 | # via 512 | # -r requirements/base.txt 513 | # unstructured 514 | # unstructured-inference 515 | pexpect==4.9.0 516 | # via ipython 517 | pi-heif==0.21.0 518 | # via 519 | # -r requirements/base.txt 520 | # unstructured 521 | pikepdf==9.5.1 522 | # via 523 | # -r requirements/base.txt 524 | # unstructured 525 | pillow==11.1.0 526 | # via 527 | # -r requirements/base.txt 528 | # matplotlib 529 | # pdf2image 530 | # pi-heif 531 | # pikepdf 532 | # python-pptx 533 | # torchvision 534 | # unstructured-pytesseract 535 | platformdirs==4.3.6 536 | # via 537 | # black 538 | # jupyter-core 539 | pluggy==1.5.0 540 | # via pytest 541 | prometheus-client==0.21.1 542 | # via jupyter-server 543 | prompt-toolkit==3.0.50 544 | # via 545 | # ipython 546 | # jupyter-console 547 | proto-plus==1.26.0 548 | # via 549 | # -r requirements/base.txt 550 | # google-api-core 551 | # google-cloud-vision 552 | protobuf==5.29.3 553 | # via 554 | # -r requirements/base.txt 555 | # google-api-core 556 | # google-cloud-vision 557 | # googleapis-common-protos 558 | # grpcio-status 559 | # onnx 560 | # onnxruntime 561 | # proto-plus 562 | psutil==6.1.1 563 | # via 564 | # -r requirements/base.txt 565 | # ipykernel 566 | # unstructured 567 | ptyprocess==0.7.0 568 | # via 569 | # pexpect 570 | # terminado 571 | pure-eval==0.2.3 572 | # via stack-data 573 | pyasn1==0.6.1 574 | # via 575 | # -r requirements/base.txt 576 | # pyasn1-modules 577 | # rsa 578 | pyasn1-modules==0.4.1 579 | # via 580 | # -r requirements/base.txt 581 | # google-auth 582 | pycocotools==2.0.8 583 | # via 584 | # -r requirements/base.txt 585 | # effdet 586 | pycodestyle==2.12.1 587 | # via flake8 588 | pycparser==2.22 589 | # via 590 | # -r requirements/base.txt 591 | # cffi 592 | pycryptodome==3.21.0 593 | # via -r requirements/base.txt 594 | pydantic==2.10.6 595 | # via 596 | # -r requirements/base.txt 597 | # fastapi 598 | # 
unstructured-client 599 | pydantic-core==2.27.2 600 | # via 601 | # -r requirements/base.txt 602 | # pydantic 603 | pyflakes==3.2.0 604 | # via flake8 605 | pygments==2.19.1 606 | # via 607 | # ipython 608 | # jupyter-console 609 | # nbconvert 610 | pypandoc==1.15 611 | # via 612 | # -r requirements/base.txt 613 | # unstructured 614 | pyparsing==3.2.1 615 | # via 616 | # -r requirements/base.txt 617 | # matplotlib 618 | pypdf==5.2.0 619 | # via 620 | # -r requirements/base.txt 621 | # unstructured 622 | # unstructured-client 623 | pypdfium2==4.30.1 624 | # via 625 | # -r requirements/base.txt 626 | # unstructured-inference 627 | pytest==8.3.4 628 | # via 629 | # pytest-cov 630 | # pytest-mock 631 | pytest-cov==6.0.0 632 | # via -r requirements/test.in 633 | pytest-mock==3.14.0 634 | # via -r requirements/test.in 635 | python-dateutil==2.9.0.post0 636 | # via 637 | # -r requirements/base.txt 638 | # arrow 639 | # jupyter-client 640 | # matplotlib 641 | # pandas 642 | # unstructured-client 643 | python-docx==1.1.2 644 | # via 645 | # -r requirements/base.txt 646 | # unstructured 647 | python-iso639==2025.1.28 648 | # via 649 | # -r requirements/base.txt 650 | # unstructured 651 | python-json-logger==3.2.1 652 | # via jupyter-events 653 | python-magic==0.4.27 654 | # via 655 | # -r requirements/base.txt 656 | # unstructured 657 | python-multipart==0.0.20 658 | # via 659 | # -r requirements/base.txt 660 | # unstructured-inference 661 | python-oxmsg==0.0.1 662 | # via 663 | # -r requirements/base.txt 664 | # unstructured 665 | python-pptx==1.0.2 666 | # via 667 | # -r requirements/base.txt 668 | # unstructured 669 | pytz==2024.2 670 | # via 671 | # -r requirements/base.txt 672 | # pandas 673 | pyyaml==6.0.2 674 | # via 675 | # -r requirements/base.txt 676 | # huggingface-hub 677 | # jupyter-events 678 | # nbdev 679 | # omegaconf 680 | # timm 681 | # transformers 682 | pyzmq==26.2.1 683 | # via 684 | # ipykernel 685 | # jupyter-client 686 | # jupyter-console 687 | # jupyter-server 688 | rapidfuzz==3.12.1 689 | # via 690 | # -r requirements/base.txt 691 | # unstructured 692 | # unstructured-inference 693 | ratelimit==2.2.1 694 | # via -r requirements/base.txt 695 | referencing==0.36.2 696 | # via 697 | # jsonschema 698 | # jsonschema-specifications 699 | # jupyter-events 700 | regex==2024.11.6 701 | # via 702 | # -r requirements/base.txt 703 | # nltk 704 | # transformers 705 | requests==2.32.3 706 | # via 707 | # -r requirements/base.txt 708 | # google-api-core 709 | # huggingface-hub 710 | # jupyterlab-server 711 | # requests-toolbelt 712 | # transformers 713 | # unstructured 714 | requests-toolbelt==1.0.0 715 | # via 716 | # -r requirements/base.txt 717 | # unstructured-client 718 | rfc3339-validator==0.1.4 719 | # via 720 | # jsonschema 721 | # jupyter-events 722 | rfc3986-validator==0.1.1 723 | # via 724 | # jsonschema 725 | # jupyter-events 726 | rpds-py==0.22.3 727 | # via 728 | # jsonschema 729 | # referencing 730 | rsa==4.9 731 | # via 732 | # -r requirements/base.txt 733 | # google-auth 734 | safetensors==0.5.2 735 | # via 736 | # -r requirements/base.txt 737 | # timm 738 | # transformers 739 | scipy==1.15.1 740 | # via 741 | # -r requirements/base.txt 742 | # unstructured-inference 743 | send2trash==1.8.3 744 | # via jupyter-server 745 | six==1.17.0 746 | # via 747 | # -r requirements/base.txt 748 | # astunparse 749 | # html5lib 750 | # langdetect 751 | # python-dateutil 752 | # rfc3339-validator 753 | sniffio==1.3.1 754 | # via 755 | # -r requirements/base.txt 756 | # anyio 757 | 
soupsieve==2.6 758 | # via 759 | # -r requirements/base.txt 760 | # beautifulsoup4 761 | stack-data==0.6.3 762 | # via ipython 763 | starlette==0.41.2 764 | # via 765 | # -c requirements/constraints.in 766 | # -r requirements/base.txt 767 | # fastapi 768 | sympy==1.13.3 769 | # via 770 | # -r requirements/base.txt 771 | # onnxruntime 772 | # torch 773 | terminado==0.18.1 774 | # via 775 | # jupyter-server 776 | # jupyter-server-terminals 777 | timm==1.0.14 778 | # via 779 | # -r requirements/base.txt 780 | # effdet 781 | # unstructured-inference 782 | tinycss2==1.4.0 783 | # via bleach 784 | tokenizers==0.21.0 785 | # via 786 | # -r requirements/base.txt 787 | # transformers 788 | torch==2.7.0 789 | # via 790 | # -r requirements/base.txt 791 | # effdet 792 | # timm 793 | # torchvision 794 | # unstructured-inference 795 | torchvision==0.22.0 796 | # via 797 | # -r requirements/base.txt 798 | # effdet 799 | # timm 800 | tornado==6.5.0 801 | # via 802 | # ipykernel 803 | # jupyter-client 804 | # jupyter-server 805 | # jupyterlab 806 | # notebook 807 | # terminado 808 | tqdm==4.67.1 809 | # via 810 | # -r requirements/base.txt 811 | # huggingface-hub 812 | # nltk 813 | # transformers 814 | # unstructured 815 | traitlets==5.14.3 816 | # via 817 | # comm 818 | # ipykernel 819 | # ipython 820 | # ipywidgets 821 | # jupyter-client 822 | # jupyter-console 823 | # jupyter-core 824 | # jupyter-events 825 | # jupyter-server 826 | # jupyterlab 827 | # matplotlib-inline 828 | # nbclient 829 | # nbconvert 830 | # nbformat 831 | transformers==4.50.0 832 | # via 833 | # -r requirements/base.txt 834 | # unstructured-inference 835 | types-python-dateutil==2.9.0.20241206 836 | # via arrow 837 | typing-extensions==4.12.2 838 | # via 839 | # -r requirements/base.txt 840 | # anyio 841 | # fastapi 842 | # huggingface-hub 843 | # mypy 844 | # pydantic 845 | # pydantic-core 846 | # python-docx 847 | # python-oxmsg 848 | # python-pptx 849 | # referencing 850 | # torch 851 | # typing-inspect 852 | # unstructured 853 | typing-inspect==0.9.0 854 | # via 855 | # -r requirements/base.txt 856 | # dataclasses-json 857 | # unstructured-client 858 | tzdata==2025.1 859 | # via 860 | # -r requirements/base.txt 861 | # pandas 862 | unstructured[all-docs]==0.16.17 863 | # via -r requirements/base.txt 864 | unstructured-client==0.29.0 865 | # via 866 | # -r requirements/base.txt 867 | # unstructured 868 | unstructured-inference==0.8.6 869 | # via 870 | # -r requirements/base.txt 871 | # unstructured 872 | unstructured-pytesseract==0.3.13 873 | # via 874 | # -r requirements/base.txt 875 | # unstructured 876 | uri-template==1.3.0 877 | # via jsonschema 878 | urllib3==2.3.0 879 | # via 880 | # -r requirements/base.txt 881 | # requests 882 | uvicorn==0.34.0 883 | # via -r requirements/base.txt 884 | watchdog==6.0.0 885 | # via nbdev 886 | wcwidth==0.2.13 887 | # via prompt-toolkit 888 | webcolors==24.11.1 889 | # via jsonschema 890 | webencodings==0.5.1 891 | # via 892 | # -r requirements/base.txt 893 | # bleach 894 | # html5lib 895 | # tinycss2 896 | websocket-client==1.8.0 897 | # via jupyter-server 898 | wheel==0.45.1 899 | # via astunparse 900 | widgetsnbextension==4.0.13 901 | # via ipywidgets 902 | wrapt==1.17.2 903 | # via 904 | # -r requirements/base.txt 905 | # deprecated 906 | # unstructured 907 | xlrd==2.0.1 908 | # via 909 | # -r requirements/base.txt 910 | # unstructured 911 | xlsxwriter==3.2.2 912 | # via 913 | # -r requirements/base.txt 914 | # python-pptx 915 | 916 | # The following packages are considered to be 
unsafe in a requirements file: 917 | # setuptools 918 | -------------------------------------------------------------------------------- /sample-docs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/.gitkeep -------------------------------------------------------------------------------- /sample-docs/DA-1p-with-duplicate-pages.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/DA-1p-with-duplicate-pages.pdf -------------------------------------------------------------------------------- /sample-docs/DA-1p.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/DA-1p.bmp -------------------------------------------------------------------------------- /sample-docs/DA-1p.heic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/DA-1p.heic -------------------------------------------------------------------------------- /sample-docs/README.md: -------------------------------------------------------------------------------- 1 | ## Example Docs 2 | 3 | The sample docs directory contains the following files: 4 | 5 | - `example-10k.html` - A 10-K SEC filing in HTML format 6 | - `layout-parser-paper.pdf` - A PDF copy of the layout parser paper 7 | - `factbook.xml`/`factbook.xsl` - Example XML/XLS files that you can use to test stylesheets 8 | 9 | These documents can be used to test out the parsers in the library. In addition, here are 10 | instructions for pulling in some sample docs that are too big to store in the repo. 11 | 12 | #### XBRL 10-K 13 | 14 | You can get an example 10-K in inline XBRL format using the following `curl`. Note, you need 15 | to have the user agent set in the header or the SEC site will reject your request. 16 | 17 | ```bash 18 | curl -O \ 19 | -A '${organization} ${email}' 20 | https://www.sec.gov/Archives/edgar/data/311094/000117184321001344/0001171843-21-001344.txt 21 | ``` 22 | 23 | You can parse this document using the HTML parser. 24 | -------------------------------------------------------------------------------- /sample-docs/README.rst: -------------------------------------------------------------------------------- 1 | Example Docs 2 | ------------ 3 | 4 | The sample docs directory contains the following files: 5 | 6 | - ``example-10k.html`` - A 10-K SEC filing in HTML format 7 | - ``layout-parser-paper.pdf`` - A PDF copy of the layout parser paper 8 | - ``factbook.xml``/``factbook.xsl`` - Example XML/XLS files that you 9 | can use to test stylesheets 10 | 11 | These documents can be used to test out the parsers in the library. In 12 | addition, here are instructions for pulling in some sample docs that are 13 | too big to store in the repo. 14 | 15 | XBRL 10-K 16 | ^^^^^^^^^ 17 | 18 | You can get an example 10-K in inline XBRL format using the following 19 | ``curl``. Note, you need to have the user agent set in the header or the 20 | SEC site will reject your request. 21 | 22 | .. 
code:: bash 23 | 24 | curl -O \ 25 | -A '${organization} ${email}' 26 | https://www.sec.gov/Archives/edgar/data/311094/000117184321001344/0001171843-21-001344.txt 27 | 28 | You can parse this document using the HTML parser. 29 | -------------------------------------------------------------------------------- /sample-docs/alert.eml: -------------------------------------------------------------------------------- 1 | MIME-Version: 1.0 2 | Date: Wed, 21 Dec 2022 09:55:33 -0600 3 | Message-ID: 4 | Subject: ALERT: Stolen Lunch 5 | From: Mallori Harrell 6 | To: Mallori Harrell 7 | Content-Type: multipart/alternative; boundary="0000000000002f0ea105f0589582" 8 | 9 | --0000000000002f0ea105f0589582 10 | Content-Type: text/plain; charset="UTF-8" 11 | 12 | Hi, 13 | 14 | It has come to our attention that as of 9:00am this morning, Harold's lunch 15 | is missing. If this was done in error please return the lunch immediately 16 | to the fridge on the 2nd floor by noon. 17 | 18 | If the lunch has not been returned by noon, we will be reviewing camera 19 | footage to determine who stole Harold's lunch. 20 | 21 | The perpetrators will be PUNISHED to the full extent of our employee code 22 | of conduct handbook. 23 | 24 | Thank you for your time, 25 | 26 | -- 27 | Mallori Harrell 28 | Unstructured Technologies 29 | Data Scientist 30 | 31 | --0000000000002f0ea105f0589582 32 | Content-Type: text/html; charset="UTF-8" 33 | Content-Transfer-Encoding: quoted-printable 34 | 35 |
Hi,

It has come to our atten= 36 | tion that as of 9:00am this morning, Harold's lunch is missing. If this= 37 | was done in error please return the lunch immediately to the fridge on the= 38 | 2nd floor by noon.

If the lunch has not been retu= 39 | rned by noon, we will be reviewing camera footage to determine who stole Ha= 40 | rold's lunch.

The perpetrators=C2=A0will be PU= 41 | NISHED to the full extent of our employee code of conduct handbook.

Thank you for your time,

--
Mallori Harrell
Unstructured Technologies
Data S= 45 | cientist

46 | 47 | --0000000000002f0ea105f0589582-- -------------------------------------------------------------------------------- /sample-docs/announcement.eml: -------------------------------------------------------------------------------- 1 | MIME-Version: 1.0 2 | Date: Wed, 21 Dec 2022 11:09:08 -0600 3 | Message-ID: 4 | Subject: ANNOUNCEMENT: The holidays are coming! 5 | From: Mallori Harrell 6 | To: Mallori Harrell 7 | Content-Type: multipart/alternative; boundary="00000000000054448805f0599c48" 8 | 9 | --00000000000054448805f0599c48 10 | Content-Type: text/plain; charset="UTF-8" 11 | 12 | To All, 13 | 14 | As the holiday approaches, be sure to let your manager and team know the 15 | following: 16 | 17 | - Your days off 18 | - The location of your work's documentation 19 | - How to reach you or your secondary in case of an emergency 20 | 21 | 22 | Hope you all have a Happy Holidays! 23 | 24 | Best, 25 | 26 | -- 27 | Mallori Harrell 28 | Unstructured Technologies 29 | Data Scientist 30 | 31 | --00000000000054448805f0599c48 32 | Content-Type: text/html; charset="UTF-8" 33 | Content-Transfer-Encoding: quoted-printable 34 | 35 |
To All,

As the holiday approaches, be s= 36 | ure to let your manager and team know the following:
  • Your= 37 | days off
  • The location of your work's documentation
  • How= 38 | to reach you or your secondary in case of an emergency

Hope you all have a Happy Holidays!

Best,

--
Mallori Harrell
= 42 | Unstructured Technologies
Data Scientist

= 43 |
44 | 45 | --00000000000054448805f0599c48-- -------------------------------------------------------------------------------- /sample-docs/embedded-images-tables.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/embedded-images-tables.jpg -------------------------------------------------------------------------------- /sample-docs/embedded-images-tables.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/embedded-images-tables.pdf -------------------------------------------------------------------------------- /sample-docs/english-and-korean.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/english-and-korean.png -------------------------------------------------------------------------------- /sample-docs/fake-doc.rtf: -------------------------------------------------------------------------------- 1 | {\pard \ql \f0 \sa180 \li0 \fi0 \outlinelevel0 \b \fs36 My First Heading\par} 2 | {\pard \ql \f0 \sa180 \li0 \fi0 My first paragraph.\par} 3 | -------------------------------------------------------------------------------- /sample-docs/fake-email-attachment.eml: -------------------------------------------------------------------------------- 1 | MIME-Version: 1.0 2 | Date: Fri, 23 Dec 2022 12:08:48 -0600 3 | Message-ID: 4 | Subject: Fake email with attachment 5 | From: Mallori Harrell 6 | To: Mallori Harrell 7 | Content-Type: multipart/mixed; boundary="0000000000005d654405f082adb7" 8 | 9 | --0000000000005d654405f082adb7 10 | Content-Type: multipart/alternative; boundary="0000000000005d654205f082adb5" 11 | 12 | --0000000000005d654205f082adb5 13 | Content-Type: text/plain; charset="UTF-8" 14 | 15 | Hello! 16 | 17 | Here's the attachments! 18 | 19 | It includes: 20 | 21 | - Lots of whitespace 22 | - Little to no content 23 | - and is a quick read 24 | 25 | Best, 26 | 27 | Mallori 28 | 29 | --0000000000005d654205f082adb5 30 | Content-Type: text/html; charset="UTF-8" 31 | Content-Transfer-Encoding: quoted-printable 32 | 33 |
Hello!=C2=A0

Here's the attachments= 34 | !

It includes:
  • Lots of whitespace
  • Little=C2= 36 | =A0to no content
  • and is a quick read
Best,

Mallori

40 | 41 | --0000000000005d654205f082adb5-- 42 | --0000000000005d654405f082adb7 43 | Content-Type: text/plain; charset="US-ASCII"; name="fake-attachment.txt" 44 | Content-Disposition: attachment; filename="fake-attachment.txt" 45 | Content-Transfer-Encoding: base64 46 | X-Attachment-Id: f_lc0tto5j0 47 | Content-ID: 48 | 49 | SGV5IHRoaXMgaXMgYSBmYWtlIGF0dGFjaG1lbnQh 50 | --0000000000005d654405f082adb7-- -------------------------------------------------------------------------------- /sample-docs/fake-email.eml: -------------------------------------------------------------------------------- 1 | MIME-Version: 1.0 2 | Date: Fri, 16 Dec 2022 17:04:16 -0500 3 | Message-ID: 4 | Subject: Test Email 5 | From: Matthew Robinson 6 | To: Matthew Robinson 7 | Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630" 8 | 9 | --00000000000095c9b205eff92630 10 | Content-Type: text/plain; charset="UTF-8" 11 | 12 | This is a test email to use for unit tests. 13 | 14 | Important points: 15 | 16 | - Roses are red 17 | - Violets are blue 18 | 19 | --00000000000095c9b205eff92630 20 | Content-Type: text/html; charset="UTF-8" 21 | 22 |
This is a test email to use for unit tests.

Important points:
  • Roses are red
  • Violets are blue
23 | 24 | --00000000000095c9b205eff92630-- -------------------------------------------------------------------------------- /sample-docs/fake-email.msg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/fake-email.msg -------------------------------------------------------------------------------- /sample-docs/fake-html.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

My First Heading

6 |

My first paragraph.

7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /sample-docs/fake-power-point.ppt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/fake-power-point.ppt -------------------------------------------------------------------------------- /sample-docs/fake-power-point.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/fake-power-point.pptx -------------------------------------------------------------------------------- /sample-docs/fake-text-utf-32.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/fake-text-utf-32.txt -------------------------------------------------------------------------------- /sample-docs/fake-text.txt: -------------------------------------------------------------------------------- 1 | This is a test document to use for unit tests. 2 | 3 | Important points: 4 | 5 | - Hamburgers are delicious 6 | - Dogs are the best 7 | - I love fuzzy blankets -------------------------------------------------------------------------------- /sample-docs/fake-xml.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | United States 5 | Washington, DC 6 | Joe Biden 7 | Baseball 8 | 9 | 10 | Canada 11 | Ottawa 12 | Justin Trudeau 13 | Hockey 14 | 15 | 16 | France 17 | Paris 18 | Emmanuel Macron 19 | Soccer 20 | 21 | 22 | Trinidad & Tobado 23 | Port of Spain 24 | Keith Rowley 25 | Track & Field 26 | 27 | 28 | Trinidad & Tobado 29 | Port of Spain 30 | Keith Rowley 31 | Track & Field 32 | 33 | 34 | -------------------------------------------------------------------------------- /sample-docs/fake.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/fake.doc -------------------------------------------------------------------------------- /sample-docs/fake.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/fake.docx -------------------------------------------------------------------------------- /sample-docs/fake.odt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/fake.odt -------------------------------------------------------------------------------- /sample-docs/family-day.eml: -------------------------------------------------------------------------------- 1 | MIME-Version: 1.0 2 | Date: Wed, 21 Dec 2022 10:28:53 -0600 3 | Message-ID: 4 | Subject: Family Day 5 | From: Mallori Harrell 6 | To: Mallori Harrell 7 | Content-Type: multipart/alternative; boundary="0000000000005c115405f0590ce4" 8 | 9 | --0000000000005c115405f0590ce4 10 | Content-Type: text/plain; charset="UTF-8" 11 | 12 | Hi All, 13 | 14 | Get excited for our first annual family day! 
15 | 16 | There will be face painting, a petting zoo, funnel cake and more. 17 | 18 | Make sure to RSVP! 19 | 20 | Best. 21 | 22 | -- 23 | Mallori Harrell 24 | Unstructured Technologies 25 | Data Scientist 26 | 27 | --0000000000005c115405f0590ce4 28 | Content-Type: text/html; charset="UTF-8" 29 | Content-Transfer-Encoding: quoted-printable 30 | 31 |
Hi All,

Get excited for our first annua= 32 | l family day!=C2=A0

There will be face painting, = 33 | a petting zoo, funnel cake and more.

Make sure to = 34 | RSVP!

Best.

--
Mallori Harrell
Unstructured Technologies
= 37 | Data Scientist

38 | 39 | --0000000000005c115405f0590ce4-- -------------------------------------------------------------------------------- /sample-docs/layout-parser-paper-fast.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/layout-parser-paper-fast.jpg -------------------------------------------------------------------------------- /sample-docs/layout-parser-paper-fast.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/layout-parser-paper-fast.pdf -------------------------------------------------------------------------------- /sample-docs/layout-parser-paper-fast.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/layout-parser-paper-fast.tiff -------------------------------------------------------------------------------- /sample-docs/layout-parser-paper-with-table.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/layout-parser-paper-with-table.jpg -------------------------------------------------------------------------------- /sample-docs/layout-parser-paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/layout-parser-paper.pdf -------------------------------------------------------------------------------- /sample-docs/layout-parser-paper.pdf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/layout-parser-paper.pdf.gz -------------------------------------------------------------------------------- /sample-docs/list-item-example.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/list-item-example.pdf -------------------------------------------------------------------------------- /sample-docs/notes.ppt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/notes.ppt -------------------------------------------------------------------------------- /sample-docs/notes.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/notes.pptx -------------------------------------------------------------------------------- /sample-docs/spring-weather.html.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "element_id": "41f6e17bf5e9a407fcca74e902f802a0", 4 | "text": "News Around NOAA", 5 | "type": "Title", 6 | "metadata": { 7 | "page_number": 1 8 | } 9 | }, 10 | { 11 | "element_id": 
"aa589c25dc22dcc8a75baba1244e6c8f", 12 | "text": "National Program", 13 | "type": "Title", 14 | "metadata": { 15 | "page_number": 1 16 | } 17 | }, 18 | { 19 | "element_id": "62c26d2e16774d2334bd804c7bb6a711", 20 | "text": "Are You Weather-Ready for the Spring?", 21 | "type": "Title", 22 | "metadata": { 23 | "page_number": 1 24 | } 25 | }, 26 | { 27 | "element_id": "32709cd3bec72640bbbe32f58e6e23f6", 28 | "text": "Weather.gov >", 29 | "type": "Title", 30 | "metadata": { 31 | "page_number": 1 32 | } 33 | }, 34 | { 35 | "element_id": "2661da76db570876b075083aaeeaee55", 36 | "text": "News Around NOAA > Are You Weather-Ready for the Spring?", 37 | "type": "Title", 38 | "metadata": { 39 | "page_number": 1 40 | } 41 | }, 42 | { 43 | "element_id": "fab6c4df083f0fb6f324fff65b652c86", 44 | "text": "Weather Safety Air Quality Beach Hazards Cold Cold Water Drought Floods Fog Heat Hurricanes Lightning Safety Rip Currents Safe Boating Space Weather Sun (Ultraviolet Radiation) Thunderstorms & Tornadoes Tornado Tsunami Wildfire Wind Winter", 45 | "type": "ListItem", 46 | "metadata": { 47 | "page_number": 1 48 | } 49 | }, 50 | { 51 | "element_id": "45c26cf3457e6d18985a435e2c0fcc65", 52 | "text": "Safety Campaigns Seasonal Safety Campaigns #SafePlaceSelfie Deaf & Hard of Hearing Intellectual Disabilities Spanish-language Content The Great Outdoors", 53 | "type": "ListItem", 54 | "metadata": { 55 | "page_number": 1 56 | } 57 | }, 58 | { 59 | "element_id": "77f5acc603de9a165ed87a5c3fbaf14a", 60 | "text": "Ambassador About WRN Ambassadors Become an Ambassador Ambassadors of Excellence People of WRN FAQS Tell Your Success Story Success Stories Tri-fold Aviation Current Ambassadors Brochure En Español", 61 | "type": "ListItem", 62 | "metadata": { 63 | "page_number": 1 64 | } 65 | }, 66 | { 67 | "element_id": "8f19bcaabbd1bafa5e9826ac69766c8b", 68 | "text": "Education NWS Education Home Be A Force Of Nature WRN Kids Flyer Wireless Emergency Alerts NOAA Weather Radio Mobile Weather Brochures Hourly Weather Forecast Citizen Science Intellectual Disabilities", 69 | "type": "ListItem", 70 | "metadata": { 71 | "page_number": 1 72 | } 73 | }, 74 | { 75 | "element_id": "1245f9cf9e019713391e4ee3bac54a63", 76 | "text": "Collaboration Get Involved Social Media WRN Ambassadors ​ Enterprise Resources StormReady TsunamiReady NWSChat (core partners only) InteractiveNWS (iNWS) (core partners only)​ SKYWARN", 77 | "type": "ListItem", 78 | "metadata": { 79 | "page_number": 1 80 | } 81 | }, 82 | { 83 | "element_id": "23dfa7f98424dbf86e00b3d500096dfa", 84 | "text": "News & Events Latest News Calendar Meetings & Workshops NWS Aware Newsletter", 85 | "type": "ListItem", 86 | "metadata": { 87 | "page_number": 1 88 | } 89 | }, 90 | { 91 | "element_id": "93202df2ec7081b28b47901b5c287a5a", 92 | "text": "International", 93 | "type": "ListItem", 94 | "metadata": { 95 | "page_number": 1 96 | } 97 | }, 98 | { 99 | "element_id": "e53d6a9c615bdf1a8d7b98a67cade488", 100 | "text": "About Contact Us What is WRN? WRN FAQ WRN Brochure Hazard Simplification IDSS Brochure Roadmap Strategic Plan WRN International Social Science", 101 | "type": "ListItem", 102 | "metadata": { 103 | "page_number": 1 104 | } 105 | }, 106 | { 107 | "element_id": "6cbcf8c11f8c0781bd9ecc7f67169ff0", 108 | "text": "The spring season is all about change – a rebirth both literally and figuratively. 
Even though the spring season doesn’t officially (astronomically, that is) begin until March 20 this year, climatologically, it starts March 1.", 109 | "type": "NarrativeText", 110 | "metadata": { 111 | "page_number": 1 112 | } 113 | }, 114 | { 115 | "element_id": "7184168da442c6ef28553b274bf2be8f", 116 | "text": "As cold winter nights are replaced by the warmth of longer daylight hours, the National Weather Service invites you to do two important things that may save your life or the life of a loved one.", 117 | "type": "NarrativeText", 118 | "metadata": { 119 | "page_number": 1 120 | } 121 | }, 122 | { 123 | "element_id": "f3be9748ecd68b20d706548129baa22d", 124 | "text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”", 125 | "type": "NarrativeText", 126 | "metadata": { 127 | "page_number": 1 128 | } 129 | }, 130 | { 131 | "element_id": "126c3cd201fb259cfeabc6bffc0b5473", 132 | "text": "Second, encourage others to become Weather-Ready as well. Share the message by taking advantage of our vast array of weather safety content – everything posted on our Spring Safety website is freely available, and we encourage sharing on social media networks. Also remember those who are most vulnerable, like an elderly family member or neighbor who might have limited mobility or is isolated. Reach out to those who are at higher risk of being impacted by extreme weather, and help them get prepared. This simple act of caring could become heroic.", 133 | "type": "NarrativeText", 134 | "metadata": { 135 | "page_number": 1 136 | } 137 | }, 138 | { 139 | "element_id": "c1944fb037f3e1cb14969bc59a7dd9c2", 140 | "text": "This spring, the campaign is focused on heat dangers. Heat illness and death can occur even in spring’s moderately warm weather. The majority of all heat-related deaths occur outside of heat waves and roughly a third of child hot car deaths occur outside of the summer months. Learn more by viewing the infographics that are now available.", 141 | "type": "NarrativeText", 142 | "metadata": { 143 | "page_number": 1 144 | } 145 | }, 146 | { 147 | "element_id": "fa1b939ef6159d95260bc095f58ebbc2", 148 | "text": "Stay safe this spring, and every season, by being informed, prepared, and Weather-Ready.", 149 | "type": "NarrativeText", 150 | "metadata": { 151 | "page_number": 1 152 | } 153 | }, 154 | { 155 | "element_id": "47d5d0d27a35a36d7467dfc8b6e089b3", 156 | "text": "US Dept of Commerce\n National Oceanic and Atmospheric Administration\n National Weather Service\n News Around NOAA1325 East West HighwaySilver Spring, MD 20910Comments? Questions? 
Please Contact Us.", 157 | "type": "NarrativeText", 158 | "metadata": { 159 | "page_number": 1 160 | } 161 | }, 162 | { 163 | "element_id": "129c678fce59acee7ac6a6fdb67b6310", 164 | "text": "Disclaimer", 165 | "type": "Title", 166 | "metadata": { 167 | "page_number": 1 168 | } 169 | }, 170 | { 171 | "element_id": "3c96caaebd949e39d25b3ccf4133c5d8", 172 | "text": "Information Quality", 173 | "type": "Title", 174 | "metadata": { 175 | "page_number": 1 176 | } 177 | }, 178 | { 179 | "element_id": "b79cac926e0b2e347e72cc91d5174037", 180 | "text": "Help", 181 | "type": "Title", 182 | "metadata": { 183 | "page_number": 1 184 | } 185 | }, 186 | { 187 | "element_id": "4c4e436f9a453c776dbf011f98d932d6", 188 | "text": "Glossary", 189 | "type": "Title", 190 | "metadata": { 191 | "page_number": 1 192 | } 193 | }, 194 | { 195 | "element_id": "506ff394621596dd88138642eddfc1e4", 196 | "text": "Privacy Policy", 197 | "type": "Title", 198 | "metadata": { 199 | "page_number": 1 200 | } 201 | }, 202 | { 203 | "element_id": "c70ae8c30a61c450d2c5148d1b6a0447", 204 | "text": "Freedom of Information Act (FOIA)", 205 | "type": "Title", 206 | "metadata": { 207 | "page_number": 1 208 | } 209 | }, 210 | { 211 | "element_id": "5d8c71abc527284cd463aa58f3f48098", 212 | "text": "About Us", 213 | "type": "Title", 214 | "metadata": { 215 | "page_number": 1 216 | } 217 | }, 218 | { 219 | "element_id": "a8a00c355d2fa1461d532a1088274f32", 220 | "text": "Career Opportunities", 221 | "type": "Title", 222 | "metadata": { 223 | "page_number": 1 224 | } 225 | } 226 | ] -------------------------------------------------------------------------------- /sample-docs/stanley-cups.csv: -------------------------------------------------------------------------------- 1 | Stanley Cups,, 2 | Team,Location,Stanley Cups 3 | Blues,STL,1 4 | Flyers,PHI,2 5 | Maple Leafs,TOR,13 -------------------------------------------------------------------------------- /sample-docs/stanley-cups.tsv: -------------------------------------------------------------------------------- 1 | Stanley Cups 2 | Team Location Stanley Cups 3 | Blues STL 1 4 | Flyers PHI 2 5 | Maple Leafs TOR 13 6 | -------------------------------------------------------------------------------- /sample-docs/stanley-cups.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/stanley-cups.xlsx -------------------------------------------------------------------------------- /sample-docs/winter-sports.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/winter-sports.epub -------------------------------------------------------------------------------- /scripts/app-start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PORT=${PORT:-8000} 4 | export HOST=${HOST:-"0.0.0.0"} 5 | export WORKERS=${WORKERS:-1} 6 | 7 | NUMREGEX="^[0-9]+$" 8 | GRACEFUL_SHUTDOWN_PERIOD_SECONDS=3600 9 | TIMEOUT_COMMAND='timeout' 10 | OPTIONAL_TIMEOUT='' 11 | 12 | if [[ -n $MAX_LIFETIME_SECONDS ]]; then 13 | if ! command -v $TIMEOUT_COMMAND &> /dev/null; then 14 | TIMEOUT_COMMAND='gtimeout' 15 | echo "Warning! 'timeout' command is required but not available. Checking for gtimeout." 16 | elif ! 
command -v $TIMEOUT_COMMAND &> /dev/null; then 17 | echo "Warning! 'gtimeout' command is required but not available. Running without max lifetime." 18 | elif [[ $MAX_LIFETIME_SECONDS =~ $NUMREGEX ]]; then 19 | OPTIONAL_TIMEOUT="timeout --preserve-status --foreground --kill-after ${GRACEFUL_SHUTDOWN_PERIOD_SECONDS} ${MAX_LIFETIME_SECONDS}" 20 | echo "Server's lifetime set to ${MAX_LIFETIME_SECONDS} seconds." 21 | else 22 | echo "Warning! MAX_LIFETIME_SECONDS was not properly set, an integer was expected, got ${MAX_LIFETIME_SECONDS}. Running without max lifetime." 23 | fi 24 | fi 25 | 26 | ${OPTIONAL_TIMEOUT} \ 27 | uvicorn prepline_general.api.app:app \ 28 | --log-config logger_config.yaml \ 29 | --host "$HOST" \ 30 | --port "$PORT" \ 31 | --workers "$WORKERS" \ 32 | 33 | echo "Server was shutdown" 34 | [ -n "$MAX_LIFETIME_SECONDS" ] && echo "Reached timeout of $MAX_LIFETIME_SECONDS seconds" 35 | -------------------------------------------------------------------------------- /scripts/docker-build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | DOCKER_REPOSITORY="${DOCKER_REPOSITORY:-quay.io/unstructured-io/unstructured-api}" 5 | PIPELINE_PACKAGE=${PIPELINE_PACKAGE:-"general"} 6 | PIPELINE_FAMILY=${PIPELINE_FAMILY:-"general"} 7 | PIP_VERSION="${PIP_VERSION:-25.1.1}" 8 | DOCKER_IMAGE="${DOCKER_IMAGE:-pipeline-family-${PIPELINE_FAMILY}-dev}" 9 | DOCKER_PLATFORM="${DOCKER_PLATFORM:-}" 10 | 11 | 12 | DOCKER_BUILD_CMD=( 13 | docker buildx build --load -f Dockerfile 14 | --build-arg PIP_VERSION="$PIP_VERSION" 15 | --build-arg BUILDKIT_INLINE_CACHE=1 16 | --build-arg PIPELINE_PACKAGE="$PIPELINE_PACKAGE" 17 | --progress plain 18 | --platform linux/amd64 19 | --cache-from "$DOCKER_REPOSITORY:latest" 20 | -t "$DOCKER_IMAGE" 21 | . 22 | ) 23 | 24 | # only build for specific platform if DOCKER_PLATFORM is set 25 | if [ -n "${DOCKER_PLATFORM:-}" ]; then 26 | DOCKER_BUILD_CMD+=("--platform=$DOCKER_PLATFORM") 27 | fi 28 | 29 | DOCKER_BUILDKIT=1 "${DOCKER_BUILD_CMD[@]}" 30 | -------------------------------------------------------------------------------- /scripts/docker-smoke-test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | # docker-smoke-test.sh 5 | # Start the containerized api and run some end-to-end tests against it 6 | # There will be some overlap with just running a TestClient in the unit tests 7 | # Is there a good way to reuse code here? 8 | # Also note this can evolve into a generalized pipeline smoke test 9 | 10 | # shellcheck disable=SC2317 # Shellcheck complains that trap functions are unreachable... 
11 | 12 | set -e 13 | 14 | CONTAINER_NAME=unstructured-api-smoke-test 15 | CONTAINER_NAME_PARALLEL=unstructured-api-smoke-test-parallel 16 | PIPELINE_FAMILY=${PIPELINE_FAMILY:-"general"} 17 | DOCKER_IMAGE="${DOCKER_IMAGE:-pipeline-family-${PIPELINE_FAMILY}-dev:latest}" 18 | SKIP_INFERENCE_TESTS="${SKIP_INFERENCE_TESTS:-false}" 19 | 20 | start_container() { 21 | 22 | port=$1 23 | use_parallel_mode=$2 24 | 25 | if [ "$use_parallel_mode" = "true" ]; then 26 | name=$CONTAINER_NAME_PARALLEL 27 | else 28 | name=$CONTAINER_NAME 29 | fi 30 | 31 | echo Starting container "$name" 32 | docker run --platform "$DOCKER_PLATFORM" \ 33 | -p "$port":"$port" \ 34 | --entrypoint uvicorn \ 35 | -d \ 36 | --rm \ 37 | --name "$name" \ 38 | --env "UNSTRUCTURED_PARALLEL_MODE_URL=http://localhost:$port/general/v0/general" \ 39 | --env "UNSTRUCTURED_PARALLEL_MODE_ENABLED=$use_parallel_mode" \ 40 | "$DOCKER_IMAGE" \ 41 | prepline_general.api.app:app --port "$port" --host 0.0.0.0 42 | } 43 | 44 | await_server_ready() { 45 | port=$1 46 | url=localhost:$port/healthcheck 47 | 48 | # NOTE(rniko): Increasing the timeout to 120 seconds because emulated arm tests are slow to start 49 | for _ in {1..120}; do 50 | echo Waiting for response from "$url" 51 | if curl "$url" 2> /dev/null; then 52 | echo 53 | return 54 | fi 55 | 56 | sleep 1 57 | done 58 | 59 | echo Server did not respond! 60 | exit 1 61 | } 62 | 63 | stop_container() { 64 | echo Stopping container "$CONTAINER_NAME" 65 | # Note (austin) - if you're getting an error from the api, try dumping the logs 66 | # docker logs $CONTAINER_NAME 2> docker_logs.txt 67 | docker stop "$CONTAINER_NAME" 2> /dev/null || true 68 | 69 | echo Stopping container "$CONTAINER_NAME_PARALLEL" 70 | docker stop "$CONTAINER_NAME_PARALLEL" 2> /dev/null || true 71 | } 72 | 73 | # Always clean up the container 74 | trap stop_container EXIT 75 | 76 | start_container 8000 "false" 77 | await_server_ready 8000 78 | 79 | ####################### 80 | # Smoke Tests 81 | ####################### 82 | echo Running smoke tests with SKIP_INFERENCE_TESTS: "$SKIP_INFERENCE_TESTS" 83 | PYTHONPATH=. SKIP_INFERENCE_TESTS=$SKIP_INFERENCE_TESTS pytest -vv scripts/smoketest.py 84 | 85 | ####################### 86 | # Test parallel vs single mode 87 | ####################### 88 | if ! $SKIP_INFERENCE_TESTS; then 89 | start_container 9000 true 90 | await_server_ready 9000 91 | 92 | echo Running parallel mode test 93 | ./scripts/parallel-mode-test.sh localhost:8000 localhost:9000 94 | fi 95 | 96 | result=$? 97 | exit $result 98 | -------------------------------------------------------------------------------- /scripts/install-pandoc.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Mainly used for installing pandoc on CI 4 | 5 | if [ "$(uname)" == "Darwin" ]; then 6 | echo "This script is intended for Linux only." 7 | exit 0 8 | fi 9 | 10 | set -euo pipefail 11 | if [ "${ARCH}" = "x86_64" ]; then 12 | export PANDOC_ARCH="amd64" 13 | elif [ "${ARCH}" = "arm64" ] || [ "${ARCH}" = "aarch64" ]; then 14 | export PANDOC_ARCH="arm64" 15 | fi 16 | 17 | wget https://github.com/jgm/pandoc/releases/download/3.1.2/pandoc-3.1.2-linux-"${PANDOC_ARCH}".tar.gz 18 | tar xvf pandoc-3.1.2-linux-"${PANDOC_ARCH}".tar.gz 19 | cd pandoc-3.1.2 20 | sudo cp bin/pandoc /usr/local/bin/ 21 | cd .. 
22 | rm -rf pandoc-3.1.2* 23 | -------------------------------------------------------------------------------- /scripts/parallel-mode-test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # parallel-mode-test.sh 4 | # Iterate a list of curl commands, and run each one against two instances of the api 5 | # The smoke test will start one container with parallel mode and one without, and 6 | # diff the two outputs to make sure parallel mode does not alter the response. 7 | # Note the filepaths assume you ran this from the top level 8 | 9 | # shellcheck disable=SC2317 # Shellcheck complains that trap functions are unreachable... 10 | 11 | base_url_1=$1 12 | base_url_2=$2 13 | 14 | declare -a curl_params=( 15 | "-F files=@sample-docs/layout-parser-paper.pdf -F 'strategy=fast'" 16 | "-F files=@sample-docs/layout-parser-paper.pdf -F 'strategy=auto'" 17 | "-F files=@sample-docs/layout-parser-paper.pdf -F 'strategy=hi_res'" 18 | "-F files=@sample-docs/layout-parser-paper.pdf -F 'coordinates=true'" 19 | "-F files=@sample-docs/layout-parser-paper.pdf -F 'encoding=utf-8'" 20 | "-F files=@sample-docs/layout-parser-paper.pdf -F 'include_page_breaks=true'" 21 | "-F files=@sample-docs/layout-parser-paper.pdf -F 'hi_res_model_name=yolox'" 22 | ) 23 | 24 | for params in "${curl_params[@]}" 25 | do 26 | curl_command="curl $base_url_1/general/v0/general $params" 27 | echo Testing: "$curl_command" 28 | 29 | # Run in single mode 30 | # Note(austin): Parallel mode screws up hierarchy! While we deal with that, 31 | # let's ignore parent_id fields in the results 32 | $curl_command 2> /dev/null | jq -S 'del(..|.parent_id?)' > output.json 33 | original_length=$(jq 'length' output.json) 34 | 35 | # Stop if curl didn't work 36 | if [ ! -s output.json ]; then 37 | echo Command failed! 38 | $curl_command 39 | exit 1 40 | fi 41 | 42 | # Run in parallel mode 43 | curl_command="curl $base_url_2/general/v0/general $params" 44 | $curl_command 2> /dev/null | jq -S 'del(..|.parent_id?)' > parallel_output.json 45 | parallel_length=$(jq 'length' parallel_output.json) 46 | 47 | # Stop if curl didn't work 48 | if [ ! -s parallel_output.json ]; then 49 | echo Command failed! 50 | $curl_command 51 | exit 1 52 | fi 53 | 54 | if ! [[ "$original_length" == "$parallel_length" ]]; then 55 | echo Parallel mode returned a different number of elements!
56 | echo Params: "$params" 57 | exit 1 58 | fi 59 | 60 | rm -f output.json parallel_output.json 61 | echo 62 | done 63 | 64 | 65 | -------------------------------------------------------------------------------- /scripts/shellcheck.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | find scripts -name "*.sh" -exec shellcheck {} + 4 | 5 | -------------------------------------------------------------------------------- /scripts/smoketest.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import time 4 | import gzip 5 | import shutil 6 | from pathlib import Path 7 | from typing import List, Optional 8 | import tempfile 9 | 10 | import pytest 11 | import requests 12 | import pandas as pd 13 | 14 | API_URL = "http://localhost:8000/general/v0/general" 15 | # NOTE(rniko): Skip inference tests if we're running on an emulated architecture 16 | skip_inference_tests = os.getenv("SKIP_INFERENCE_TESTS", "").lower() in {"true", "yes", "y", "1"} 17 | 18 | 19 | def send_document( 20 | filenames: List[str], 21 | filenames_gzipped: Optional[List[str]] = None, 22 | content_type: str = "", 23 | strategy: str = "auto", 24 | output_format: str = "application/json", 25 | skip_infer_table_types: list[str] = [], 26 | uncompressed_content_type: str = "", 27 | ): 28 | if filenames_gzipped is None: 29 | filenames_gzipped = [] 30 | files = [] 31 | for filename in filenames: 32 | files.append(("files", (str(filename), open(filename, "rb"), content_type))) 33 | for filename in filenames_gzipped: 34 | files.append(("files", (str(filename), open(filename, "rb"), "application/gzip"))) 35 | 36 | options = { 37 | "strategy": strategy, 38 | "output_format": output_format, 39 | "skip_infer_table_types": skip_infer_table_types, 40 | } 41 | if uncompressed_content_type: 42 | options["gz_uncompressed_content_type"] = uncompressed_content_type 43 | 44 | return requests.post( 45 | API_URL, 46 | files=files, 47 | data=options, 48 | ) 49 | 50 | 51 | @pytest.mark.parametrize( 52 | ("extension", "example_filename", "content_type"), 53 | [ 54 | (".bmp", "DA-1p.bmp", "image/bmp"), 55 | (".csv", "stanley-cups.csv", "application/csv"), 56 | (".doc", "fake.doc", "application/msword"), 57 | ( 58 | ".docx", 59 | "fake.docx", 60 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 61 | ), 62 | (".eml", "fake-email-attachment.eml", "message/rfc822"), 63 | (".epub", "winter-sports.epub", "application/epub"), 64 | (".heic", "DA-1p.heic", "image/heic"), 65 | (".html", "fake-html.html", "text/html"), 66 | (".jpeg", "layout-parser-paper-fast.jpg", "image/jpeg"), 67 | (".md", "README.md", "text/markdown"), 68 | (".msg", "fake-email.msg", "application/x-ole-storage"), 69 | (".odt", "fake.odt", "application/vnd.oasis.opendocument.text"), 70 | (".pdf", "layout-parser-paper.pdf", "application/pdf"), 71 | (".png", "english-and-korean.png", "image/png"), 72 | (".ppt", "fake-power-point.ppt", "application/vnd.ms-powerpoint"), 73 | ( 74 | ".pptx", 75 | "fake-power-point.pptx", 76 | "application/vnd.openxmlformats-officedocument.presentationml.presentation", 77 | ), 78 | (".rst", "README.rst", "text/prs.fallenstein.rst"), 79 | (".rtf", "fake-doc.rtf", "application/rtf"), 80 | (".tiff", "layout-parser-paper-fast.tiff", "image/tiff"), 81 | (".tsv", "stanley-cups.tsv", "text/tab-separated-values"), 82 | (".txt", "fake-text.txt", "text/plain"), 83 | ( 84 | ".xlsx", 85 | "stanley-cups.xlsx", 86 | 
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", 87 | ), 88 | (".xml", "fake-xml.xml", "text/xml"), 89 | (".json", "spring-weather.html.json", "application/json"), 90 | ( 91 | ".gz", 92 | "layout-parser-paper.pdf.gz", 93 | "application/gzip", 94 | ), 95 | ], 96 | ) 97 | def test_happy_path_all_types(extension, example_filename: str, content_type: str): 98 | """ 99 | For the files in sample-docs, verify that we get a 200 100 | and some structured response 101 | """ 102 | # The auto strategy will run ocr on these files 103 | # This doesn't always work on our macs 104 | if skip_inference_tests and extension in [ 105 | ".bmp", 106 | ".heic", 107 | ".jpeg", 108 | ".pdf", 109 | ".png", 110 | ".tiff", 111 | ".gz", # Since we're using a gzipped pdf... 112 | ]: 113 | pytest.skip("emulated hardware") 114 | 115 | test_file = str(Path("sample-docs") / example_filename) 116 | 117 | # Verify we can send with explicit content type 118 | response = send_document(filenames=[test_file], content_type=content_type) 119 | 120 | if response.status_code != 200: 121 | assert False, response.text 122 | 123 | assert len(response.json()) > 0 124 | assert len("".join(elem["text"] for elem in response.json())) > 20 125 | 126 | # Verify we can infer the filetype on the server 127 | response = send_document(filenames=[test_file], content_type=None) 128 | 129 | if response.status_code != 200: 130 | assert False, response.text 131 | 132 | assert len(response.json()) > 0 133 | assert len("".join(elem["text"] for elem in response.json())) > 20 134 | 135 | json_response = response 136 | 137 | # Verify we can set output type to csv 138 | csv_response = send_document( 139 | filenames=[test_file], 140 | content_type=content_type, 141 | output_format="text/csv", 142 | ) 143 | assert csv_response.status_code == 200 144 | assert len(csv_response.text) > 0 145 | df = pd.read_csv(io.StringIO(csv_response.text)) 146 | assert len(df) == len(json_response.json()) 147 | 148 | 149 | @pytest.mark.parametrize("output_format", ["application/json", "text/csv"]) 150 | @pytest.mark.parametrize( 151 | "filenames_to_gzip, filenames_verbatim, uncompressed_content_type", 152 | [ 153 | (["fake-html.html"], [], "text/html"), 154 | (["stanley-cups.csv"], [], "application/csv"), 155 | (["fake.doc"], [], "application/msword"), 156 | # compressed and uncompressed 157 | pytest.param( 158 | ["layout-parser-paper-fast.pdf"], 159 | ["list-item-example.pdf"], 160 | "application/pdf", 161 | marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"), 162 | ), 163 | (["fake-email.eml"], ["fake-email-image-embedded.eml"], "message/rfc822"), 164 | # compressed and uncompressed 165 | # empty content-type means that API should detect filetype after decompressing. 
166 | pytest.param( 167 | ["layout-parser-paper-fast.pdf"], 168 | ["list-item-example.pdf"], 169 | "", 170 | marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"), 171 | ), 172 | (["fake-email.eml"], ["fake-email-image-embedded.eml"], ""), 173 | ], 174 | ) 175 | def test_gzip_sending( 176 | output_format: str, 177 | filenames_to_gzip: List[str], 178 | filenames_verbatim: List[str], 179 | uncompressed_content_type: str, 180 | ): 181 | temp_files = {} 182 | 183 | for filename in filenames_to_gzip: 184 | gz_file_extension = f"{Path(filename).suffix}.gz" 185 | temp_file = tempfile.NamedTemporaryFile(suffix=gz_file_extension) 186 | full_path = Path("sample-docs") / filename 187 | gzip_file(str(full_path), temp_file.name) 188 | temp_files[filename] = temp_file 189 | filenames_gzipped = [temp_file.name for temp_file in temp_files.values()] 190 | 191 | filenames = [] 192 | for filename in filenames_verbatim: 193 | filenames.append(str(Path("sample-docs") / filename)) 194 | 195 | json_response = send_document( 196 | filenames, 197 | filenames_gzipped, 198 | content_type=uncompressed_content_type, 199 | uncompressed_content_type=uncompressed_content_type, 200 | ) 201 | assert json_response.status_code == 200, json_response.text 202 | json_content = json_response.json() 203 | assert len(json_content) > 0 204 | if len(filenames_gzipped + filenames) > 1: 205 | for file in json_content: 206 | assert len("".join(elem["text"] for elem in file)) > 20 207 | else: 208 | assert len("".join(elem["text"] for elem in json_content)) > 20 209 | 210 | csv_response = send_document( 211 | filenames, 212 | filenames_gzipped, 213 | content_type=uncompressed_content_type, 214 | uncompressed_content_type=uncompressed_content_type, 215 | output_format="text/csv", 216 | ) 217 | assert csv_response.status_code == 200 218 | assert len(csv_response.text) > 0 219 | df = pd.read_csv(io.StringIO(csv_response.text)) 220 | if len(filenames_gzipped + filenames) > 1: 221 | json_size = 0 222 | for file in json_content: 223 | json_size += len(file) 224 | assert len(df) == json_size 225 | else: 226 | assert len(df) == len(json_content) 227 | 228 | for filename in filenames_to_gzip: 229 | temp_files[filename].close() 230 | 231 | 232 | @pytest.mark.skipif(skip_inference_tests, reason="emulated architecture") 233 | def test_strategy_performance(): 234 | """ 235 | For the files in sample-docs, verify that the fast strategy 236 | is significantly faster than the hi_res strategy 237 | """ 238 | performance_ratio = 4 239 | test_file = str(Path("sample-docs") / "layout-parser-paper.pdf") 240 | 241 | start_time = time.monotonic() 242 | response = send_document( 243 | filenames=[test_file], content_type="application/pdf", strategy="hi_res" 244 | ) 245 | hi_res_time = time.monotonic() - start_time 246 | assert response.status_code == 200 247 | 248 | start_time = time.monotonic() 249 | response = send_document(filenames=[test_file], content_type="application/pdf", strategy="fast") 250 | fast_time = time.monotonic() - start_time 251 | assert response.status_code == 200 252 | assert hi_res_time > performance_ratio * fast_time 253 | 254 | 255 | @pytest.mark.skipif(skip_inference_tests, reason="emulated architecture") 256 | @pytest.mark.parametrize( 257 | "strategy, skip_infer_table_types, expected_table_num", 258 | [ 259 | ("fast", [], 0), 260 | ("fast", ["pdf"], 0), 261 | ("hi_res", [], 2), 262 | ("hi_res", ["pdf"], 0), 263 | ], 264 | ) 265 | def test_table_support(strategy: str, skip_infer_table_types: list[str], 
expected_table_num: int): 266 | """ 267 | Test that table extraction works with the hi_res strategy 268 | """ 269 | test_file = str(Path("sample-docs") / "layout-parser-paper.pdf") 270 | response = send_document( 271 | filenames=[test_file], 272 | content_type="application/pdf", 273 | strategy=strategy, 274 | skip_infer_table_types=skip_infer_table_types, 275 | ) 276 | 277 | assert response.status_code == 200 278 | extracted_tables = [ 279 | el["metadata"]["text_as_html"] 280 | for el in response.json() 281 | if "text_as_html" in el["metadata"].keys() 282 | ] 283 | assert len(extracted_tables) == expected_table_num 284 | if expected_table_num > 0: 285 | # Test that text from a table is extracted 286 | # Note(austin) - table output has changed - this line isn't returned 287 | # assert "Layouts of scanned modern magazines and scientific reports" in extracted_tables[0] 288 | assert "Layouts of history" in extracted_tables[0] 289 | 290 | 291 | def gzip_file(in_filepath: str, out_filepath: str): 292 | with open(in_filepath, "rb") as f_in: 293 | with gzip.open(out_filepath, "wb", compresslevel=1) as f_out: 294 | shutil.copyfileobj(f_in, f_out) 295 | -------------------------------------------------------------------------------- /scripts/version-increment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | function usage { 3 | echo "Usage: $(basename "$0") CHANGELOG_MESSAGE" 2>&1 4 | echo 'Add the given message to the changelog and cut a release' 5 | echo "Example: $(basename "$0") \"Bump unstructured to x.y.z\"" 6 | } 7 | 8 | # Found at https://www.henryschmale.org/2019/04/30/incr-semver.html 9 | # $1 - semver string 10 | # $2 - level to incr {dev,release,minor,major} - release by default 11 | function incr_semver() { 12 | IFS='.'
read -ra ver <<< "$1" 13 | [[ "${#ver[@]}" -ne 3 ]] && echo "Invalid semver string" && return 1 14 | [[ "$#" -eq 1 ]] && level='release' || level=$2 15 | 16 | release=${ver[2]} 17 | minor=${ver[1]} 18 | major=${ver[0]} 19 | 20 | case $level in 21 | # Drop the dev tag 22 | dev) 23 | release=$(echo "$release" | awk -F '-' '{print $1}') 24 | ;; 25 | release) 26 | release=$((release+1)) 27 | ;; 28 | minor) 29 | release=0 30 | minor=$((minor+1)) 31 | ;; 32 | major) 33 | release=0 34 | minor=0 35 | major=$((major+1)) 36 | ;; 37 | *) 38 | echo "Invalid level passed" 39 | return 2 40 | esac 41 | echo "$major.$minor.$release" 42 | } 43 | 44 | 45 | if [[ -z "$1" ]]; then 46 | usage 47 | exit 0 48 | fi 49 | 50 | changelog_text="* $1" 51 | current_version=$(head -1 CHANGELOG.md | awk -F' ' '{print $2}') 52 | 53 | # If dev version, add to current change list and cut the release 54 | if [[ $current_version == *"dev"* ]]; then 55 | new_version=$(incr_semver "$current_version" dev) 56 | 57 | # Replace the version (drop the dev tag) 58 | sed -i 's/'"$current_version"'/'"$new_version"'/' CHANGELOG.md 59 | 60 | # Find the first bullet, add the new change above it 61 | sed -i '0,/^*/{s/\(^*.*\)/'"$changelog_text"'\n\1/}' CHANGELOG.md 62 | 63 | # If not dev version, create a new release 64 | else 65 | new_version=$(incr_semver "$current_version" release) 66 | 67 | cat <<EOF | cat - CHANGELOG.md > CHANGELOG.tmp 68 | ## $new_version 69 | 70 | $changelog_text 71 | 72 | EOF 73 | 74 | mv CHANGELOG.{tmp,md} 75 | 76 | fi 77 | -------------------------------------------------------------------------------- /scripts/version-sync.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | function usage { 3 | echo "Usage: $(basename "$0") [-c] -f FILE_TO_CHANGE REPLACEMENT_FORMAT [-f FILE_TO_CHANGE REPLACEMENT_FORMAT ...]" 2>&1 4 | echo 'Synchronize files to latest version in source file' 5 | echo ' -s Specifies source file for version (default is CHANGELOG.md)' 6 | echo ' -f Specifies a file to change and the format for searching and replacing versions' 7 | echo ' FILE_TO_CHANGE is the file to be updated/checked for updates' 8 | echo ' REPLACEMENT_FORMAT is one of (semver, release, api-release)' 9 | echo ' semver indicates to look for a full semver version and replace with the latest full version' 10 | echo ' release indicates to look for a release semver version (x.x.x) and replace with the latest release version' 11 | echo ' api-release indicates to look for a release semver version in the context of an api route and replace with the latest release version' 12 | echo ' -c Compare versions and output proposed changes without changing anything.'
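    # Illustrative invocation (the file arguments below are hypothetical examples, not necessarily the files this repo syncs):
    #   ./scripts/version-sync.sh -c -f preprocessing-pipeline-family.yaml release -f prepline_general/api/app.py api-release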
13 | } 14 | 15 | function getopts-extra () { 16 | declare -i i=1 17 | # if the next argument is not an option, then append it to array OPTARG 18 | while [[ ${OPTIND} -le $# && ${!OPTIND:0:1} != '-' ]]; do 19 | OPTARG[i]=${!OPTIND} 20 | ((i += 1)) 21 | ((OPTIND += 1)) 22 | done 23 | } 24 | 25 | # Parse input options 26 | declare CHECK=0 27 | declare SOURCE_FILE="CHANGELOG.md" 28 | declare -a FILES_TO_CHECK=() 29 | declare -a REPLACEMENT_FORMATS=() 30 | declare args 31 | declare OPTIND OPTARG opt 32 | while getopts ":hcs:f:" opt; do 33 | case $opt in 34 | h) 35 | usage 36 | exit 0 37 | ;; 38 | c) 39 | CHECK=1 40 | ;; 41 | s) 42 | SOURCE_FILE="$OPTARG" 43 | ;; 44 | f) 45 | getopts-extra "$@" 46 | args=( "${OPTARG[@]}" ) 47 | # validate length of args, should be 2 48 | if [ ${#args[@]} -eq 2 ]; then 49 | FILES_TO_CHECK+=( "${args[0]}" ) 50 | REPLACEMENT_FORMATS+=( "${args[1]}" ) 51 | else 52 | echo "Exactly 2 arguments must follow -f option." >&2 53 | exit 1 54 | fi 55 | ;; 56 | \?) 57 | echo "Invalid option: -$OPTARG." >&2 58 | usage 59 | exit 1 60 | ;; 61 | esac 62 | done 63 | 64 | # Parse REPLACEMENT_FORMATS 65 | RE_SEMVER_FULL="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-((0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?" 66 | RE_RELEASE="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)" 67 | RE_API_RELEASE="v(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)" 68 | # Pull out semver appearing earliest in SOURCE_FILE. 69 | LAST_VERSION=$(grep -o -m 1 -E "${RE_SEMVER_FULL}" "$SOURCE_FILE") 70 | LAST_RELEASE=$(grep -o -m 1 -E "${RE_RELEASE}($|[^-+])" "$SOURCE_FILE" | grep -o -m 1 -E "${RE_RELEASE}") 71 | LAST_API_RELEASE="v$(grep -o -m 1 -E "${RE_RELEASE}($|[^-+])$" "$SOURCE_FILE" | grep -o -m 1 -E "${RE_RELEASE}")" 72 | declare -a RE_SEMVERS=() 73 | declare -a UPDATED_VERSIONS=() 74 | for i in "${!REPLACEMENT_FORMATS[@]}"; do 75 | REPLACEMENT_FORMAT=${REPLACEMENT_FORMATS[$i]} 76 | case $REPLACEMENT_FORMAT in 77 | semver) 78 | RE_SEMVERS+=( "$RE_SEMVER_FULL" ) 79 | UPDATED_VERSIONS+=( "$LAST_VERSION" ) 80 | ;; 81 | release) 82 | RE_SEMVERS+=( "$RE_RELEASE" ) 83 | UPDATED_VERSIONS+=( "$LAST_RELEASE" ) 84 | ;; 85 | api-release) 86 | RE_SEMVERS+=( "$RE_API_RELEASE" ) 87 | UPDATED_VERSIONS+=( "$LAST_API_RELEASE" ) 88 | ;; 89 | *) 90 | echo "Invalid replacement format: \"${REPLACEMENT_FORMAT}\". Use semver, release, or api-release" >&2 91 | exit 1 92 | ;; 93 | esac 94 | done 95 | 96 | if [ -z "$LAST_VERSION" ]; 97 | then 98 | # No match to semver regex in SOURCE_FILE, so no version to go from. 99 | printf "Error: Unable to find latest version from %s.\n" "$SOURCE_FILE" 100 | exit 1 101 | fi 102 | 103 | # Search files in FILES_TO_CHECK and change (or get diffs) 104 | declare FAILED_CHECK=0 105 | 106 | for i in "${!FILES_TO_CHECK[@]}"; do 107 | FILE_TO_CHANGE=${FILES_TO_CHECK[$i]} 108 | RE_SEMVER=${RE_SEMVERS[$i]} 109 | UPDATED_VERSION=${UPDATED_VERSIONS[$i]} 110 | FILE_VERSION=$(grep -o -m 1 -E "${RE_SEMVER}" "$FILE_TO_CHANGE") 111 | if [ -z "$FILE_VERSION" ]; 112 | then 113 | # No match to semver regex in VERSIONFILE, so nothing to replace 114 | printf "Error: No semver version found in file %s.\n" "$FILE_TO_CHANGE" 115 | exit 1 116 | else 117 | # Replace semver in VERSIONFILE with semver obtained from SOURCE_FILE 118 | TMPFILE=$(mktemp /tmp/new_version.XXXXXX) 119 | # Check sed version, exit if version < 4.3 120 | if ! 
sed --version > /dev/null 2>&1; then 121 | CURRENT_VERSION=1.archaic 122 | else 123 | CURRENT_VERSION=$(sed --version | head -n1 | cut -d" " -f4) 124 | fi 125 | REQUIRED_VERSION="4.3" 126 | if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$CURRENT_VERSION" | sort -V | head -n1)" != "$REQUIRED_VERSION" ]; then 127 | echo "sed version must be >= ${REQUIRED_VERSION}" && exit 1 128 | fi 129 | sed -E -r "s/$RE_SEMVER/$UPDATED_VERSION/" "$FILE_TO_CHANGE" > "$TMPFILE" 130 | if [ $CHECK == 1 ]; 131 | then 132 | DIFF=$(diff "$FILE_TO_CHANGE" "$TMPFILE" ) 133 | if [ -z "$DIFF" ]; 134 | then 135 | printf "version sync would make no changes to %s.\n" "$FILE_TO_CHANGE" 136 | rm "$TMPFILE" 137 | else 138 | FAILED_CHECK=1 139 | printf "version sync would make the following changes to %s:\n%s\n" "$FILE_TO_CHANGE" "$DIFF" 140 | rm "$TMPFILE" 141 | fi 142 | else 143 | cp "$TMPFILE" "$FILE_TO_CHANGE" 144 | rm "$TMPFILE" 145 | fi 146 | fi 147 | done 148 | 149 | # Exit with code determined by whether changes were needed in a check. 150 | if [ ${FAILED_CHECK} -ne 0 ]; then 151 | exit 1 152 | else 153 | exit 0 154 | fi 155 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | exclude = 4 | prepline_*/api 5 | -------------------------------------------------------------------------------- /test_general/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/test_general/__init__.py -------------------------------------------------------------------------------- /test_general/api/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/test_general/api/.gitkeep -------------------------------------------------------------------------------- /test_general/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/test_general/api/__init__.py -------------------------------------------------------------------------------- /test_general/api/test_deprecated_api.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pathlib import Path 4 | from typing import Any 5 | 6 | from fastapi.testclient import TestClient 7 | from prepline_general.api.app import app 8 | 9 | MAIN_API_ROUTE = "general/v0/general" 10 | 11 | 12 | @pytest.mark.parametrize( 13 | "parameters", 14 | [ 15 | pytest.param({"coordinates": ["true"]}, id="coordinates_true"), 16 | pytest.param({"coordinates": ["false"]}, id="coordinates_false"), 17 | pytest.param({"encoding": ["utf-8"]}, id="encoding"), 18 | pytest.param({"hi_res_model_name": ["yolox"]}, id="hi_res_model_name"), 19 | pytest.param({"include_page_breaks": ["true"]}, id="include_page_breaks"), 20 | pytest.param({"ocr_languages": ["eng", "kor"]}, id="ocr_languages"), 21 | pytest.param({"languages": ["eng", "kor"]}, id="languages"), 22 | pytest.param({"languages": ["eng", "kor"]}, id="languages_inner"), 23 | pytest.param({"pdf_infer_table_structure": ["false"]}, id="pdf_infer_table_structure"), 24 | pytest.param({"skip_infer_table_types": ["false"]}, 
id="skip_infer_table_types"), 25 | pytest.param({"strategy": ["hi_res"]}, id="strategy"), 26 | pytest.param({"xml_keep_tags": ["false"]}, id="xml_keep_tags"), 27 | pytest.param({"extract_image_block_types": ["image"]}, id="extract_image_block_types"), 28 | pytest.param( 29 | {"extract_image_block_types": ['["image", "table"]']}, 30 | id="extract_image_block_types_json", 31 | ), 32 | pytest.param({"chunking_strategy": ["by_title"]}, id="chunking_strategy"), 33 | pytest.param({"multipage_sections": ["false"]}, id="multipage_sections"), 34 | pytest.param({"combine_under_n_chars": ["500"]}, id="combine_under_n_chars"), 35 | pytest.param({"new_after_n_chars": ["1500"]}, id="new_after_n_chars"), 36 | pytest.param({"max_characters": ["1500"]}, id="max_characters"), 37 | ], 38 | ) 39 | def test_form_params_passed_as_first_element_of_array_are_properly_handled( 40 | parameters: dict[str, Any], 41 | ): 42 | """ 43 | Verify that form params passed as the first element of an array 44 | are properly handled and the request still succeeds 45 | """ 46 | client = TestClient(app) 47 | test_file = Path("sample-docs") / "layout-parser-paper-fast.jpg" 48 | response = client.post( 49 | MAIN_API_ROUTE, 50 | files=[("files", (str(test_file), open(test_file, "rb")))], 51 | data=parameters, 52 | ) 53 | 54 | assert response.status_code == 200 55 | assert response.json() 56 | -------------------------------------------------------------------------------- /test_general/api/test_gzip.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import shutil 3 | import io 4 | import tempfile 5 | from pathlib import Path 6 | from typing import List 7 | 8 | import httpx 9 | import pandas as pd 10 | import pytest 11 | from fastapi.testclient import TestClient 12 | from deepdiff import DeepDiff 13 | 14 | from prepline_general.api.app import app 15 | 16 | MAIN_API_ROUTE = "general/v0/general" 17 | 18 | 19 | @pytest.mark.xfail(reason="The outputs are different as of unstructured==0.13.5") 20 | @pytest.mark.parametrize("output_format", ["application/json", "text/csv"]) 21 | @pytest.mark.parametrize( 22 | "filenames_to_gzip, filenames_verbatim, uncompressed_content_type", 23 | [ 24 | (["fake-html.html"], [], "text/html"), 25 | (["stanley-cups.csv"], [], "application/csv"), 26 | (["fake.doc"], [], "application/msword"), 27 | (["layout-parser-paper-fast.pdf"], [], "application/pdf"), 28 | (["fake-email-attachment.eml", "fake-email.eml"], [], "message/rfc822"), 29 | ( 30 | ["fake-email-attachment.eml", "fake-email.eml", "announcement.eml"], 31 | [], 32 | "message/rfc822", 33 | ), 34 | (["layout-parser-paper-fast.pdf", "list-item-example.pdf"], [], "application/pdf"), 35 | # now the same but without explicit content type 36 | # to make the system guess the un-gzipped type based on content.
37 | (["fake-html.html"], [], ""), 38 | (["fake-email-attachment.eml", "fake-email.eml"], [], ""), 39 | (["layout-parser-paper-fast.pdf", "list-item-example.pdf"], [], ""), 40 | # mix of compressed and uncompressed 41 | (["layout-parser-paper-fast.pdf"], ["list-item-example.pdf"], "application/pdf"), 42 | # mix of compressed and uncompressed, and guessing of content type 43 | (["layout-parser-paper-fast.pdf"], ["list-item-example.pdf"], ""), 44 | # have to use OCR which is slow, so minimum cases 45 | (["embedded-images-tables.jpg"], ["english-and-korean.png"], "image/png"), 46 | (["embedded-images-tables.jpg"], ["english-and-korean.png"], ""), 47 | ], 48 | ) 49 | def test_gzipped_files_are_parsed_like_original( 50 | output_format: str, 51 | filenames_to_gzip: List[str], 52 | filenames_verbatim: List[str], 53 | uncompressed_content_type: str, 54 | ): 55 | """ 56 | Verify that API supports un-gzipping and correctly interprets gz_uncompressed_content_type, 57 | by comparing response to directly parsing the same files. 58 | The one thing which changes is the filenames in metadata, which have to be ignored. 59 | """ 60 | client = TestClient(app) 61 | gz_options = { 62 | "gz_uncompressed_content_type": ( 63 | uncompressed_content_type if uncompressed_content_type else None 64 | ), 65 | "output_format": output_format, 66 | } 67 | response1 = get_gzipped_response( 68 | client, filenames_to_gzip, filenames_verbatim, gz_options, uncompressed_content_type 69 | ) 70 | response2 = call_api( 71 | client, 72 | [], 73 | filenames_to_gzip + filenames_verbatim, 74 | uncompressed_content_type, 75 | {"output_format": output_format}, 76 | ) 77 | compare_responses( 78 | response1, response2, output_format, len(filenames_to_gzip + filenames_verbatim) 79 | ) 80 | 81 | 82 | def compare_responses( 83 | response1: httpx.Response, response2: httpx.Response, output_format: str, files_count: int 84 | ) -> None: 85 | if output_format == "application/json": 86 | if files_count == 1: 87 | exclude_regex_paths = ( 88 | r"root\[\d+\]\['(metadata'\]\['(filename|parent_id)|element_id)'\]" 89 | ) 90 | 91 | else: 92 | exclude_regex_paths = ( 93 | r"root\[\d+\]\[\d+\]\['(metadata'\]\['(filename|parent_id)|element_id)'\]" 94 | ) 95 | diff = DeepDiff( 96 | t1=response1.json(), 97 | t2=response2.json(), 98 | exclude_regex_paths=exclude_regex_paths, 99 | ) 100 | assert len(diff) == 0 101 | else: 102 | df1 = pd.read_csv(io.StringIO(response1.text)) 103 | df2 = pd.read_csv(io.StringIO(response2.text)) 104 | diff = DeepDiff( 105 | t1=df1.to_dict(), 106 | t2=df2.to_dict(), 107 | exclude_regex_paths=r"root\['(filename|parent_id|element_id)'\]\[\d+\]", 108 | ) 109 | assert len(diff) == 0 110 | 111 | 112 | def call_api( 113 | client: TestClient, 114 | filenames_gzipped: List[str], 115 | filenames_verbatim: List[str], 116 | content_type: str, 117 | options: dict, 118 | samples_dir: str = "sample-docs", 119 | ) -> httpx.Response: 120 | files = [] 121 | for filename in filenames_gzipped: 122 | full_path = Path(samples_dir) / filename 123 | files.append(("files", (str(full_path), open(full_path, "rb"), "application/gzip"))) 124 | 125 | for filename in filenames_verbatim: 126 | full_path = Path(samples_dir) / filename 127 | files.append(("files", (str(full_path), open(full_path, "rb"), content_type))) 128 | 129 | response = client.post( 130 | MAIN_API_ROUTE, 131 | files=files, 132 | data=options, 133 | ) 134 | assert response.status_code == 200, response.text 135 | assert len(response.text) > 0 136 | return response 137 | 138 | 139 | def 
get_gzipped_response( 140 | client: TestClient, 141 | filenames_to_gzip: List[str], 142 | filenames_verbatim: List[str], 143 | options: dict, 144 | content_type: str, 145 | samples_dir: str = "sample-docs", 146 | ) -> httpx.Response: 147 | """ 148 | Gzips the filenames_to_gzip into temporary .gz files and sends them to the API, 149 | along with the uncompressed filenames_verbatim. 150 | """ 151 | temp_files = {} 152 | for filename in filenames_to_gzip: 153 | gz_file_extension = f"{Path(filename).suffix}.gz" 154 | temp_file = tempfile.NamedTemporaryFile(suffix=gz_file_extension) 155 | full_path = Path(samples_dir) / filename 156 | gzip_file(str(full_path), temp_file.name) 157 | temp_files[filename] = temp_file 158 | 159 | filenames_gzipped = [temp_file.name for temp_file in temp_files.values()] 160 | 161 | response = call_api(client, filenames_gzipped, filenames_verbatim, content_type, options) 162 | 163 | for filename in filenames_to_gzip: 164 | temp_files[filename].close() 165 | 166 | return response 167 | 168 | 169 | def gzip_file(in_filepath: str, out_filepath: str): 170 | with open(in_filepath, "rb") as f_in: 171 | with gzip.open(out_filepath, "wb", compresslevel=1) as f_out: 172 | shutil.copyfileobj(f_in, f_out) 173 | -------------------------------------------------------------------------------- /test_general/api/test_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import pytest 4 | 5 | from prepline_general.api.utils import SmartValueParser 6 | 7 | 8 | @pytest.mark.parametrize( 9 | "desired_type, value_to_parse, expected_result", 10 | [ 11 | (bool, ["true"], True), 12 | (bool, "true", True), 13 | (bool, ["false"], False), 14 | (bool, True, True), 15 | (bool, "false", False), 16 | (bool, False, False), 17 | (int, "1500", 1500), 18 | (int, ["1500"], 1500), 19 | (float, ["1500"], 1500.0), 20 | (list[int], [1000], [1000]), 21 | (int, 1500, 1500), 22 | (float, 1500, 1500.0), 23 | (str, "1500", "1500"), 24 | (float, "1500", 1500.0), 25 | (list[str], ["one", "two", "three"], ["one", "two", "three"]), 26 | (list[int], [1000], [1000]), 27 | (list[bool], ["true", "False", "True"], [True, False, True]), 28 | ], 29 | ) 30 | def test_smart_value_parser(desired_type: type, value_to_parse: Any, expected_result: Any): 31 | parsed_value = SmartValueParser[desired_type]().value_or_first_element(value_to_parse) 32 | assert expected_result == parsed_value 33 | 34 | 35 | @pytest.mark.parametrize( 36 | "desired_type, value_to_parse, expected_result", 37 | [ 38 | (str, "fast", "fast"), 39 | (str, "'fast'", "fast"), 40 | (str, '"fast"', "fast"), 41 | (str, "!fast", "!fast"), 42 | (str, "fa'st", "fast"), 43 | (str, "fast''''''", "fast"), 44 | ], 45 | ) 46 | def test_literal_value_stripped_or_first_element( 47 | desired_type: type, value_to_parse: Any, expected_result: Any 48 | ): 49 | parsed_value = SmartValueParser[desired_type]().literal_value_stripped_or_first_element( 50 | value_to_parse 51 | ) 52 | assert expected_result == parsed_value 53 | --------------------------------------------------------------------------------
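For reference, a minimal sketch of the gzip round-trip exercised in test_gzip.py above: it compresses one sample document, posts it to the general/v0/general route as application/gzip, and relies on the gz_uncompressed_content_type form field to declare the real filetype. The sample file and content type mirror the test parameters; treat this as an assumption-laden example rather than part of the test suite, and note it presumes the API's Python dependencies are installed locally.

import gzip
import shutil
import tempfile
from pathlib import Path

from fastapi.testclient import TestClient

from prepline_general.api.app import app

MAIN_API_ROUTE = "general/v0/general"


def post_gzipped(filename: str, uncompressed_content_type: str) -> list:
    """Gzip a sample doc, post it, and let the API decompress it server-side."""
    client = TestClient(app)
    source = Path("sample-docs") / filename
    with tempfile.NamedTemporaryFile(suffix=f"{source.suffix}.gz") as tmp:
        # Compress the sample document into the temporary .gz file
        with open(source, "rb") as f_in, gzip.open(tmp.name, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
        # Send it as application/gzip and declare what the payload really is
        response = client.post(
            MAIN_API_ROUTE,
            files=[("files", (tmp.name, open(tmp.name, "rb"), "application/gzip"))],
            data={"gz_uncompressed_content_type": uncompressed_content_type},
        )
    response.raise_for_status()
    return response.json()


if __name__ == "__main__":
    elements = post_gzipped("fake-email.eml", "message/rfc822")
    print(f"parsed {len(elements)} elements")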