├── .github
│   ├── ISSUE_TEMPLATE
│   │   └── bug_report.md
│   ├── dependabot.yml
│   └── workflows
│       ├── bump_libraries.yaml
│       ├── ci.yml
│       └── docker-publish.yml
├── .gitignore
├── CHANGELOG.md
├── Dockerfile
├── LICENSE.md
├── Makefile
├── README.md
├── docker
│   └── rockylinux-9.4
│       └── Dockerfile
├── exploration-notebooks
│   ├── exploration-email.ipynb
│   ├── exploration-html.ipynb
│   └── exploration-powerpoint.ipynb
├── img
│   ├── email-screenshot.png
│   └── unstructured_logo.png
├── logger_config.yaml
├── prepline_general
│   ├── __init__.py
│   └── api
│       ├── __init__.py
│       ├── app.py
│       ├── filetypes.py
│       ├── general.py
│       ├── models
│       │   ├── __init__.py
│       │   └── form_params.py
│       ├── openapi.py
│       └── utils.py
├── preprocessing-pipeline-family.yaml
├── pyproject.toml
├── requirements
│   ├── base.in
│   ├── base.txt
│   ├── constraints.in
│   ├── test.in
│   └── test.txt
├── sample-docs
│   ├── .gitkeep
│   ├── DA-1p-with-duplicate-pages.pdf
│   ├── DA-1p.bmp
│   ├── DA-1p.heic
│   ├── README.md
│   ├── README.rst
│   ├── alert.eml
│   ├── announcement.eml
│   ├── embedded-images-tables.jpg
│   ├── embedded-images-tables.pdf
│   ├── english-and-korean.png
│   ├── fake-doc.rtf
│   ├── fake-email-attachment.eml
│   ├── fake-email-image-embedded.eml
│   ├── fake-email.eml
│   ├── fake-email.msg
│   ├── fake-html.html
│   ├── fake-power-point.ppt
│   ├── fake-power-point.pptx
│   ├── fake-text-utf-32.txt
│   ├── fake-text.txt
│   ├── fake-xml.xml
│   ├── fake.doc
│   ├── fake.docx
│   ├── fake.odt
│   ├── family-day.eml
│   ├── layout-parser-paper-fast.jpg
│   ├── layout-parser-paper-fast.pdf
│   ├── layout-parser-paper-fast.tiff
│   ├── layout-parser-paper-with-table.jpg
│   ├── layout-parser-paper.pdf
│   ├── layout-parser-paper.pdf.gz
│   ├── list-item-example.pdf
│   ├── notes.ppt
│   ├── notes.pptx
│   ├── spring-weather.html.json
│   ├── stanley-cups.csv
│   ├── stanley-cups.tsv
│   ├── stanley-cups.xlsx
│   └── winter-sports.epub
├── scripts
│   ├── app-start.sh
│   ├── docker-build.sh
│   ├── docker-smoke-test.sh
│   ├── install-pandoc.sh
│   ├── parallel-mode-test.sh
│   ├── shellcheck.sh
│   ├── smoketest.py
│   ├── version-increment.sh
│   └── version-sync.sh
├── setup.cfg
└── test_general
    ├── __init__.py
    └── api
        ├── .gitkeep
        ├── __init__.py
        ├── test_app.py
        ├── test_deprecated_api.py
        ├── test_gzip.py
        └── test_utils.py
/.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Please provide as much info as possible: 15 | 16 | - Filetype: 17 | - Any additional API parameters: 18 | 19 | **Environment:** 20 | - Using the hosted API or self hosting? 21 | - How are you calling the API? (Langchain, SDKs, cUrl, etc.) 22 | 23 | **Additional context** 24 | Add any other context about the problem here. 
25 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/requirements" 5 | schedule: 6 | interval: "daily" 7 | # Only use this to bump our libraries 8 | allow: 9 | - dependency-name: "unstructured[local-inference]" 10 | 11 | - package-ecosystem: "github-actions" 12 | # NOTE(robinson) - Workflow files stored in the 13 | # default location of `.github/workflows` 14 | directory: "/" 15 | schedule: 16 | interval: "weekly" 17 | -------------------------------------------------------------------------------- /.github/workflows/bump_libraries.yaml: -------------------------------------------------------------------------------- 1 | name: Dependabot - Bump libs and cut release 2 | 3 | on: 4 | pull_request: 5 | types: 6 | - opened 7 | - reopened 8 | paths: 9 | - 'requirements/**' 10 | 11 | env: 12 | PYTHON_VERSION: "3.8" 13 | 14 | jobs: 15 | bump-changelog: 16 | runs-on: ubuntu-latest 17 | if: ${{ github.actor == 'dependabot[bot]' }} 18 | permissions: 19 | contents: write 20 | steps: 21 | - uses: actions/checkout@v4 22 | - name: Set up Python ${{ env.PYTHON_VERSION }} 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: ${{ env.PYTHON_VERSION }} 26 | - name: Dependabot metadata 27 | id: metadata 28 | uses: dependabot/fetch-metadata@v2 29 | with: 30 | github-token: "${{ secrets.GITHUB_TOKEN }}" 31 | - name: Create release version 32 | run: | 33 | pip install pip-tools 34 | make pip-compile 35 | package=${{ steps.metadata.outputs.dependency-names }} 36 | # Strip any [extras] from name 37 | package=${package%\[*} 38 | changelog_message="Bump $package to ${{ steps.metadata.outputs.new-version }}" 39 | ./scripts/version-increment.sh "$changelog_message" 40 | make version-sync 41 | - uses: stefanzweifel/git-auto-commit-action@v5 42 | with: 43 | commit_message: "Bump libraries and release" 44 | 45 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | env: 10 | PYTHON_VERSION: "3.10" 11 | PIPELINE_FAMILY: "general" 12 | 13 | jobs: 14 | setup: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v4 18 | - uses: actions/cache@v4 19 | id: virtualenv-cache 20 | with: 21 | path: | 22 | .venv 23 | key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/base.txt') }} 24 | - name: Set up Python ${{ env.PYTHON_VERSION }} 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{ env.PYTHON_VERSION }} 28 | - name: Setup virtual environment (no cache hit) 29 | if: steps.virtualenv-cache.outputs.cache-hit != 'true' 30 | run: | 31 | python${{ env.PYTHON_VERSION }} -m venv .venv 32 | source .venv/bin/activate 33 | make install-ci 34 | 35 | lint: 36 | runs-on: ubuntu-latest 37 | needs: setup 38 | steps: 39 | - uses: actions/checkout@v4 40 | - uses: actions/cache@v4 41 | id: virtualenv-cache 42 | with: 43 | path: | 44 | .venv 45 | key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/base.txt') }} 46 | - name: Lint 47 | run: | 48 | source .venv/bin/activate 49 | make check 50 | 51 | shellcheck: 52 | runs-on: ubuntu-latest 53 | steps: 54 | - uses: actions/checkout@v4 55 | - name: ShellCheck 56 | uses: 
ludeeus/action-shellcheck@master 57 | 58 | test: 59 | runs-on: ubuntu-latest-m 60 | needs: [setup, lint] 61 | steps: 62 | - uses: actions/checkout@v4 63 | - uses: actions/cache@v4 64 | id: virtualenv-cache 65 | with: 66 | path: | 67 | .venv 68 | key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/base.txt') }} 69 | - name: Run core tests 70 | run: | 71 | source .venv/bin/activate 72 | sudo apt-get update && sudo apt-get install --yes poppler-utils libreoffice 73 | make install-pandoc 74 | sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 75 | sudo apt-get install -y tesseract-ocr tesseract-ocr-kor 76 | tesseract --version 77 | make install-nltk-models 78 | make test 79 | make check-coverage 80 | 81 | changelog: 82 | runs-on: ubuntu-latest 83 | steps: 84 | - uses: actions/checkout@v4 85 | - if: github.ref != 'refs/heads/main' 86 | uses: dorny/paths-filter@v3 87 | id: changes 88 | with: 89 | filters: | 90 | src: 91 | - 'doc_recipe/**' 92 | - 'recipe-notebooks/**' 93 | 94 | - if: steps.changes.outputs.src == 'true' && github.ref != 'refs/heads/main' 95 | uses: dangoslen/changelog-enforcer@v3 96 | 97 | # TODO - figure out best practice for caching docker images 98 | # (Using the virtualenv to get pytest) 99 | test_dockerfile: 100 | runs-on: ubuntu-latest-m 101 | needs: [setup, lint] 102 | steps: 103 | - uses: actions/checkout@v4 104 | - uses: actions/cache@v4 105 | id: virtualenv-cache 106 | with: 107 | path: | 108 | .venv 109 | key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/base.txt') }} 110 | - name: Test Dockerfile 111 | run: | 112 | source .venv/bin/activate 113 | make docker-build 114 | make docker-test 115 | # - name: Scan image 116 | # uses: anchore/scan-action@v3 117 | # with: 118 | # image: "pipeline-family-${{ env.PIPELINE_FAMILY }}-dev" 119 | # # NOTE(robinson) - revert this to medium when we bump libreoffice 120 | # severity-cutoff: critical 121 | -------------------------------------------------------------------------------- /.github/workflows/docker-publish.yml: -------------------------------------------------------------------------------- 1 | name: Build And Push Docker Image 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | env: 9 | DOCKER_REPOSITORY: quay.io/unstructured-io/unstructured-api 10 | DOCKER_BUILD_REPOSITORY: quay.io/unstructured-io/build-unstructured-api 11 | PACKAGE: "unstructured-api" 12 | PIPELINE_FAMILY: "general" 13 | PIP_VERSION: "25.1.1" 14 | PYTHON_VERSION: "3.10" 15 | 16 | jobs: 17 | setup: 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v4 21 | - uses: actions/cache@v4 22 | id: virtualenv-cache 23 | with: 24 | path: | 25 | .venv 26 | key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }} 27 | - name: Set up Python ${{ env.PYTHON_VERSION }} 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: ${{ env.PYTHON_VERSION }} 31 | - name: Setup virtual environment (no cache hit) 32 | if: steps.virtualenv-cache.outputs.cache-hit != 'true' 33 | run: | 34 | python${{ env.PYTHON_VERSION }} -m venv .venv 35 | source .venv/bin/activate 36 | make install-ci 37 | set-short-sha: 38 | runs-on: ubuntu-latest 39 | outputs: 40 | short_sha: ${{ steps.set_short_sha.outputs.short_sha }} 41 | steps: 42 | - name: Set Short SHA 43 | id: set_short_sha 44 | run: echo "short_sha=$(echo ${{ github.sha }} | cut -c1-7)" >> $GITHUB_OUTPUT 45 | build-images: 46 | strategy: 47 | matrix: 48 | #arch: ["arm64", "amd64"] 49 | # NOTE(luke): temporary disable arm64 since its failing the 
smoke test 50 | arch: ["amd64"] 51 | runs-on: ubuntu-latest-m 52 | needs: [setup, set-short-sha] 53 | env: 54 | SHORT_SHA: ${{ needs.set-short-sha.outputs.short_sha }} 55 | DOCKER_PLATFORM: linux/${{ matrix.arch }} 56 | steps: 57 | - name: Set up Docker Buildx 58 | uses: docker/setup-buildx-action@v3 59 | with: 60 | driver: ${{ matrix.arch == 'amd64' && 'docker' || 'docker-container' }} 61 | - name: Checkout code 62 | uses: actions/checkout@v4 63 | - name: Login to Quay.io 64 | uses: docker/login-action@v3 65 | with: 66 | registry: quay.io 67 | username: ${{ secrets.QUAY_IO_ROBOT_USERNAME }} 68 | password: ${{ secrets.QUAY_IO_ROBOT_TOKEN }} 69 | - name: Build image 70 | run: | 71 | # Clear some space (https://github.com/actions/runner-images/issues/2840) 72 | sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/share/boost 73 | 74 | DOCKER_BUILDKIT=1 docker buildx build --load -f Dockerfile \ 75 | --platform=$DOCKER_PLATFORM \ 76 | --build-arg PIP_VERSION=$PIP_VERSION \ 77 | --build-arg BUILDKIT_INLINE_CACHE=1 \ 78 | --build-arg PIPELINE_PACKAGE=${{ env.PIPELINE_FAMILY }} \ 79 | --provenance=false \ 80 | --progress plain \ 81 | --cache-from $DOCKER_BUILD_REPOSITORY:${{ matrix.arch }} \ 82 | -t $DOCKER_BUILD_REPOSITORY:${{ matrix.arch }}-$SHORT_SHA . 83 | - name: Set virtualenv cache 84 | uses: actions/cache@v4 85 | id: virtualenv-cache 86 | with: 87 | path: | 88 | .venv 89 | key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/test.txt') }} 90 | - name: Set up QEMU 91 | uses: docker/setup-qemu-action@v3 92 | - name: Test image 93 | run: | 94 | source .venv/bin/activate 95 | export DOCKER_IMAGE="$DOCKER_BUILD_REPOSITORY:${{ matrix.arch }}-$SHORT_SHA" 96 | if [ "$DOCKER_PLATFORM" == "linux/arm64" ]; then 97 | SKIP_INFERENCE_TESTS=true make docker-test 98 | else 99 | make docker-test 100 | fi 101 | - name: Push image 102 | run: | 103 | # write to the build repository to cache for the publish-images job 104 | docker push $DOCKER_BUILD_REPOSITORY:${{ matrix.arch }}-$SHORT_SHA 105 | publish-images: 106 | runs-on: ubuntu-latest-m 107 | needs: [setup, set-short-sha, build-images] 108 | env: 109 | SHORT_SHA: ${{ needs.set-short-sha.outputs.short_sha }} 110 | steps: 111 | - name: Checkout code 112 | uses: actions/checkout@v4 113 | - name: Set SHORT_SHA 114 | run: echo "SHORT_SHA=$(git rev-parse --short HEAD)" >> $GITHUB_ENV 115 | - name: Login to Quay.io 116 | uses: docker/login-action@v3 117 | with: 118 | registry: quay.io 119 | username: ${{ secrets.QUAY_IO_ROBOT_USERNAME }} 120 | password: ${{ secrets.QUAY_IO_ROBOT_TOKEN }} 121 | - name: Pull AMD image 122 | run: | 123 | docker pull $DOCKER_BUILD_REPOSITORY:amd64-$SHORT_SHA 124 | # - name: Pull ARM image 125 | # run: | 126 | # docker pull $DOCKER_BUILD_REPOSITORY:arm64-$SHORT_SHA 127 | - name: Push AMD and ARM tags 128 | run: | 129 | # these are used to construct the final manifest but also cache-from in subsequent runs 130 | docker tag $DOCKER_BUILD_REPOSITORY:amd64-$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd64 131 | docker push $DOCKER_BUILD_REPOSITORY:amd64 132 | #docker tag $DOCKER_BUILD_REPOSITORY:arm64-$SHORT_SHA $DOCKER_BUILD_REPOSITORY:arm64 133 | #docker push $DOCKER_BUILD_REPOSITORY:arm64 134 | - name: Push multiarch manifest 135 | run: | 136 | #docker manifest create ${DOCKER_REPOSITORY}:latest $DOCKER_BUILD_REPOSITORY:amd64 $DOCKER_BUILD_REPOSITORY:arm64 137 | docker manifest create ${DOCKER_REPOSITORY}:latest $DOCKER_BUILD_REPOSITORY:amd64 138 | docker manifest push $DOCKER_REPOSITORY:latest 139 | #docker manifest create 
${DOCKER_REPOSITORY}:$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd64 $DOCKER_BUILD_REPOSITORY:arm64 140 | docker manifest create ${DOCKER_REPOSITORY}:$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd64 141 | docker manifest push $DOCKER_REPOSITORY:$SHORT_SHA 142 | VERSION=$(grep -m1 version preprocessing-pipeline-family.yaml | cut -d ' ' -f2) 143 | #docker manifest create ${DOCKER_REPOSITORY}:$VERSION $DOCKER_BUILD_REPOSITORY:amd64 $DOCKER_BUILD_REPOSITORY:arm64 144 | docker manifest create ${DOCKER_REPOSITORY}:$VERSION $DOCKER_BUILD_REPOSITORY:amd64 145 | docker manifest push ${DOCKER_REPOSITORY}:$VERSION 146 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pycharm 129 | .idea/ 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # VSCode 135 | .vscode/ 136 | 137 | # Mac 138 | .DS_Store 139 | 140 | nbs/ 141 | 142 | # Celery files that are created when the mercury dashboard is run 143 | celery.sqlite 144 | celerybeat-schedule.db 145 | 146 | # temporarily generated files by project-specific Makefile 147 | tmp* 148 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.0.85 2 | * Patch various CVEs 3 | * Bump Python version to 3.12, some packages no longer support 3.9 4 | 5 | ## 0.0.84 6 | * Patch h11 CVE 7 | * bump httpcore version due to h11 dependency 8 | 9 | ## 0.0.83 10 | 11 | * Patch various CVEs 12 | * Fix Starlette vulnerability 13 | 14 | ## 0.0.82 15 | 16 | * Patch various python CVEs 17 | * Bump to `unstructured` 0.16.11 18 | * No longer attempts to download NLTK asset from S3 which could result in a 403 19 | 20 | ## 0.0.81 21 | 22 | * Update `strategy` parameter to allow `'` and `"` as input surrounding the value. 23 | 24 | ## 0.0.80 25 | 26 | * Bump to `unstructured` 0.15.10 27 | * Add `include_slide_notes` parameter, indicating whether slide notes in `ppt` and `pptx` files should be partitioned. Default is `True`. Now, when slide notes are present in the file, they will be included alongside other elements, which may shift the index numbers of non-note elements. 28 | 29 | ## 0.0.79 30 | 31 | * Bump to `unstructured` 0.15.7 32 | 33 | ## 0.0.78 34 | 35 | * Resolve NLTK CVE. 36 | * Bump to `unstructured` 0.15.6 37 | 38 | ## 0.0.77 39 | 40 | * Bump to `unstructured` 0.15.5 41 | 42 | ## 0.0.76 43 | * Use the library's `detect_filetype` in API to determine mimetype 44 | * Add content_type api parameter 45 | * Bump to `unstructured` 0.15.1 46 | 47 | ## 0.0.75 48 | 49 | * Remove constraint on `safetensors` that preventing us from bumping `transformers`. 50 | 51 | ## 0.0.74 52 | 53 | * Bump to `unstructured` 0.15.0 54 | 55 | ## 0.0.73 56 | 57 | * Bump to `unstructured` 0.14.10 58 | 59 | ## 0.0.72 60 | 61 | * Fix certain filetypes failing mimetype lookup in the new base image 62 | 63 | ## 0.0.71 64 | 65 | * replace rockylinux with chainguard/wolfi as a base image for `amd64` 66 | 67 | ## 0.0.70 68 | 69 | * Bump to `unstructured` 0.14.6 70 | * Bump to `unstructured-inference` 0.7.35 71 | 72 | ## 0.0.69 73 | 74 | * Bump to `unstructured` 0.14.4 75 | * Add handling for `pdf_infer_table_structure` to reflect the "tables off by default" behavior in `unstructured`. 76 | 77 | ## 0.0.68 78 | 79 | * Fix list params such as `extract_image_block_types` not working via the python/js clients 80 | 81 | ## 0.0.67 82 | 83 | * Allow for a different server port with the PORT variable 84 | * Change pdf_infer_table_structure parameter from being disabled in auto strategy. 85 | 86 | ## 0.0.66 87 | 88 | * Add support for `unique_element_ids` parameter. 
89 | * Add max lifetime, via MAX_LIFETIME_SECONDS env-var, to API containers 90 | * Bump unstructured to 0.13.5 91 | * Change default values for `pdf_infer_table_structure` and `skip_infer_table_types`. Mark `pdf_infer_table_structure` deprecated. 92 | * Add support for the `starting_page_number` param. 93 | 94 | ## 0.0.65 95 | 96 | * Bump unstructured to 0.12.4 97 | * Add support for both `list[str]` and `str` input formats for `ocr_languages` parameter 98 | * Adds support for additional MIME types from `unstructured` 99 | * Document the support for gzip files and add additional testing 100 | 101 | ## 0.0.64 102 | 103 | * Bump Pydantic to 2.5.x and remove it from explicit dependencies list (will be managed by fastapi) 104 | * Introduce Form params description in the code, which will form openapi and swagger documentation 105 | * Roll back some openapi customizations 106 | * Keep backward compatibility for passing parameters in form of `list[str]` (will not be shown in the documentation) 107 | 108 | ## 0.0.63 109 | 110 | * Bump unstructured to 0.12.2 111 | * Fix bug that ignored `combine_under_n_chars` chunking option argument. 112 | 113 | ## 0.0.62 114 | 115 | * Add hi_res_model_name to partition and deprecate model_name 116 | * Bump unstructured to 0.12.0 117 | * Add support for returning extracted image blocks as base64 encoded data stored in metadata fields 118 | 119 | ## 0.0.61 120 | 121 | * Bump unstructured to 0.11.6 122 | * Handle invalid hi_res_model_name kwarg 123 | 124 | ## 0.0.60 125 | 126 | * Enable self-hosted authorization using UNSTRUCTURED_API_KEY env variable 127 | 128 | ## 0.0.59 129 | 130 | * Bump unstructured to 0.11.0 131 | 132 | ## 0.0.58 133 | 134 | * Bump unstructured to 0.10.30 135 | 136 | ## 0.0.57 137 | * Make sure `multipage_sections` param defaults to `true` as per the readme 138 | * Bump unstructured to 0.10.29 139 | 140 | 141 | ## 0.0.56 142 | * **Add `max_characters` param for chunking** This param gives users additional control to "chunk" elements into larger or smaller `CompositeElement`s 143 | * Bump unstructured to 0.10.28 144 | * Make sure chipperv2 is called when `hi_res_model_name==chipper` 145 | 146 | 147 | ## 0.0.55 148 | 149 | * Bump unstructured to 0.10.26 150 | * Bring parent_id metadata field back after fixing a backwards compatibility bug 151 | * Restrict Chipper usage to one at a time. The model is very resource intense, and this will prevent issues while we improve it. 152 | 153 | ## 0.0.54 154 | 155 | * Bump unstructured to 0.10.25 156 | * Use a generator when splitting pdfs in parallel mode 157 | * Add a default memory minimum for 503 check 158 | * Fix an UnboundLocalError when an invalid docx file is caught 159 | 160 | ## 0.0.53 161 | 162 | * Bump unstructured to 0.10.23 163 | * Simplify the error message for BadZipFile errors 164 | 165 | ## 0.0.52 166 | 167 | * Bump unstructured to 0.10.21 168 | * Fix an unhandled error when a non pdf file is sent with content-type pdf 169 | * Fix an unhandled error when a non docx file is sent with content-type docx 170 | * Fix an unhandled error when a non-Unstructured json schema is sent 171 | 172 | ## 0.0.51 173 | 174 | * Bump unstructured to 0.10.19 175 | 176 | ## 0.0.50 177 | 178 | * Bump unstructured to 0.10.18 179 | 180 | ## 0.0.49 181 | 182 | * Remove spurious whitespace in `app-start.sh`. **This fixes deployments in some envs such as Google Cloud Run**. 
183 | 184 | ## 0.0.48 185 | 186 | * **Adds `languages` kwarg** `ocr_languages` will eventually be deprecated and replaced by `languages` to specify what languages to use for OCR 187 | * Adds a startup log and other minor cleanups 188 | 189 | ## 0.0.47 190 | 191 | * **Adds `chunking_strategy` kwarg and associated params** These params allow users to "chunk" elements into larger or smaller `CompositeElement`s 192 | * **Remove `parent_id` from the element metadata**. New metadata fields are causing errors with existing installs. We'll readd this once a fix is widely available. 193 | * **Fix some pdfs incorrectly returning a file is encrypted error**. The `pypdf.is_encrypted` check caused us to return this error even if the file is readable. 194 | 195 | ## 0.0.46 196 | 197 | * Bump unstructured to 0.10.16 198 | 199 | ## 0.0.45 200 | 201 | * Drop `detection_class_prob` from the element metadata. This broke backwards compatibility when library users called `partition_via_api`. 202 | * Bump unstructured to 0.10.15 203 | 204 | ## 0.0.44 205 | 206 | * Bump unstructured to 0.10.14 207 | * Improve parallel mode retry handling 208 | * Improve logging during error handling. We don't need to log stack traces for expected errors. 209 | 210 | ## 0.0.43 211 | 212 | * Bump unstructured to 0.10.13 213 | * Bump unstructured-inference to 0.5.25 214 | * Remove dependency on unstructured-api-tools 215 | * Add a top level error handler for more consistent response bodies 216 | * Tesseract minor version bump to 5.3.2 217 | 218 | ## 0.0.42 219 | 220 | * Update readme for parameter `hi_res_model_name` 221 | * Fix a bug using `hi_res_model_name` in parallel mode 222 | * Bump unstructured library to 0.10.12 223 | * Bump unstructured-inference to 0.5.22 224 | 225 | ## 0.0.41 226 | 227 | * Bump unstructured library to 0.10.8 228 | * Bump unstructured-inference to 0.5.17 229 | 230 | ## 0.0.40 231 | 232 | * Reject traffic when overloaded via `UNSTRUCTURED_MEMORY_FREE_MINIMUM_MB` 233 | * Docker image built with Python 3.10 rather than 3.8 234 | 235 | ## 0.0.39 236 | 237 | * Fix incorrect handling on param skip_infer_table_types 238 | * Pin `safetensors` to fix a build error with 0.0.38 239 | 240 | ## 0.0.38 241 | 242 | * Fix page break has None page number bug 243 | * Bump unstructured to 0.10.5 244 | * Bump unstructured-ingest to 0.5.15 245 | * Fix UnboundLocalError using pdfs in parallel mode 246 | 247 | ## 0.0.37 248 | 249 | * Bump unstructured to 0.10.4 250 | 251 | ## 0.0.36 252 | 253 | * Fix a bug in parallel mode causing `not a valid pdf` errors 254 | * Bump unstructured to 0.10.2, unstructured-inference to 0.5.13 255 | 256 | ## 0.0.35 257 | 258 | * Bump unstructured library to 0.9.2 259 | * Fix a misleading error in make docker-test 260 | 261 | ## 0.0.34 262 | 263 | * Bump unstructured library to 0.9.0 264 | * Add table support for image with parameter `skip_infer_table_types` 265 | * Add support for gzipped files 266 | 267 | ## 0.0.33 268 | 269 | * Image tweak, move application entrypoint to scripts/app-start.sh 270 | 271 | ## 0.0.32 272 | 273 | * Throw 400 error if a PDF is password protected 274 | * Improve logging of params to single line json 275 | * Add support for `include_page_breaks` parameter 276 | 277 | ## 0.0.31 278 | 279 | * Support model name as api parameter 280 | * Add retry parameters on fanout requests 281 | * Bump unstructured library to 0.8.1 282 | * Fix how to remove an element's coordinate information 283 | 284 | ## 0.0.30 285 | 286 | * Add table extraction support for hi_res strategy 287 | * 
Add support for `encoding` parameter 288 | * Add support for `xml_keep_tags` parameter 289 | * Add env variables for additional parallel mode tweaking 290 | 291 | ## 0.0.29 292 | 293 | * Support .msg files 294 | * Refactor parallel mode and add smoke test 295 | * Fix header value for api key 296 | 297 | ## 0.0.28 298 | 299 | * Bump unstructured library to 0.7.8 for bug fixes 300 | 301 | ## 0.0.27 302 | 303 | * Update documentation and tests for filetypes to sync with partition.auto 304 | * Add support for .rst, .tsv, .xml 305 | * Move PYPDF2 to pypdf since PYPDF2 is deprecated 306 | 307 | ## 0.0.26 308 | 309 | * Add support for `ocr_only` strategy and `ocr_languages` parameter 310 | * Remove building `detectron2` from source in Dockerfile 311 | * Convert strategy from fast to auto for images since there is no fast strategy for images 312 | 313 | ## 0.0.25 314 | 315 | * Bump image to use python 3.8.17 instead of 3.8.15 316 | 317 | ## 0.0.24 318 | 319 | * Add returning text/csv to pipeline_api 320 | 321 | ## 0.0.23 322 | 323 | * Add support for csv files 324 | 325 | ## 0.0.22 326 | 327 | * Add parallel processing mode for pages within a pdf 328 | 329 | ## 0.0.21 330 | 331 | * Bump version of base image to use new stable version of tesseract 332 | * Bump to unstructured==0.7.1 for various bug fixes. 333 | 334 | ## 0.0.20 335 | 336 | * Supports additional filetypes: epub, odt, rft 337 | 338 | ## 0.0.19 339 | 340 | * Updating data type of optional os env var `ALLOWED_ORIGINS` 341 | 342 | ## 0.0.18 343 | 344 | * Add optional CORS to api if os env var `ALLOWED_ORIGINS` is set 345 | 346 | ## 0.0.17 347 | 348 | * Add config for unstructured.trace logger 349 | 350 | ## 0.0.16 351 | 352 | * Fix image build steps to support detectron2 install from Mac M1/M2 353 | * Upgrade to openssl 1.1.1 to accomodate the latest urllib3 354 | * Bump unstructured for SpooledTemporaryFile fix 355 | 356 | ## 0.0.15 357 | 358 | * Add msg and json types to supported 359 | 360 | ## 0.0.14 361 | 362 | * Bump unstructured to the latest version 363 | 364 | ## 0.0.13 365 | 366 | * Posting a bad .pdf results in a 400 367 | 368 | ## 0.0.12 369 | 370 | * Remove coordinates field from response elements by default 371 | 372 | ## 0.0.11 373 | 374 | * Add caching from the registry for `make docker-build` 375 | * Add fix for empty content type error 376 | 377 | ## 0.0.10 378 | 379 | * Bump unstructured-api-tools for better 'file type not supported' response messages 380 | 381 | ## 0.0.9 382 | 383 | * Updated detectron version 384 | * Update docker-build to use the public registry as a cache 385 | * Adds a strategy parameter to pipeline_api 386 | * Passing file, file_filename, and content_type to `partition` 387 | 388 | ## 0.0.8 389 | 390 | * Sensible logging config 391 | 392 | ## 0.0.7 393 | 394 | * Minor version bump 395 | 396 | ## 0.0.6 397 | 398 | * Minor version bump 399 | 400 | ## 0.0.5 401 | 402 | * Updated Dockerfile for public release 403 | * Remove rate limiting in the API 404 | * Add file type validation via UNSTRUCTURED_ALLOWED_MIMETYPES 405 | * Major semver route also supported: /general/v0/general 406 | 407 | ## 0.0.4 408 | 409 | * Changed pipeline name to `pipeline-general` 410 | * Changed pipeline to handle a variety of documents not just emails 411 | * Update Dockerfile, all supported library files. 412 | * Add sample-docs for pdf and pdf image. 
413 | 414 | ## 0.0.3 415 | 416 | * Add emails pipeline Dockerfile 417 | 418 | ## 0.0.2 419 | 420 | * Add pipeline notebook 421 | 422 | ## 0.0.1 423 | 424 | * Initial pipeline setup 425 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:experimental 2 | FROM quay.io/unstructured-io/base-images:wolfi-base-latest as base 3 | 4 | # NOTE(crag): NB_USER ARG for mybinder.org compat: 5 | # https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html 6 | ARG NB_USER=notebook-user 7 | ARG NB_UID=1000 8 | ARG PIP_VERSION 9 | ARG PIPELINE_PACKAGE 10 | ARG PYTHON_VERSION="3.11" 11 | 12 | # Set up environment 13 | ENV PYTHON python${PYTHON_VERSION} 14 | ENV PIP ${PYTHON} -m pip 15 | 16 | WORKDIR ${HOME} 17 | USER ${NB_USER} 18 | 19 | ENV PYTHONPATH="${PYTHONPATH}:${HOME}" 20 | ENV PATH="/home/${NB_USER}/.local/bin:${PATH}" 21 | 22 | FROM base as python-deps 23 | COPY --chown=${NB_USER}:${NB_USER} requirements/base.txt requirements-base.txt 24 | RUN ${PIP} install pip==${PIP_VERSION} 25 | RUN ${PIP} install --no-cache -r requirements-base.txt 26 | 27 | FROM python-deps as model-deps 28 | RUN ${PYTHON} -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \ 29 | ${PYTHON} -c "from unstructured.partition.model_init import initialize; initialize()" 30 | 31 | FROM model-deps as code 32 | COPY --chown=${NB_USER}:${NB_USER} CHANGELOG.md CHANGELOG.md 33 | COPY --chown=${NB_USER}:${NB_USER} logger_config.yaml logger_config.yaml 34 | COPY --chown=${NB_USER}:${NB_USER} prepline_${PIPELINE_PACKAGE}/ prepline_${PIPELINE_PACKAGE}/ 35 | COPY --chown=${NB_USER}:${NB_USER} exploration-notebooks exploration-notebooks 36 | COPY --chown=${NB_USER}:${NB_USER} scripts/app-start.sh scripts/app-start.sh 37 | 38 | ENTRYPOINT ["scripts/app-start.sh"] 39 | # Expose a default port of 8000. Note: The EXPOSE instruction does not actually publish the port, 40 | # but some tooling will inspect containers and perform work contingent on networking support declared. 41 | 42 | EXPOSE 8000 43 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2022 Unstructured Technologies, Inc 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PIPELINE_FAMILY := general 2 | PIPELINE_PACKAGE := general 3 | PACKAGE_NAME := prepline_${PIPELINE_PACKAGE} 4 | PIP_VERSION := 25.1.1 5 | ARCH := $(shell uname -m) 6 | 7 | .PHONY: help 8 | help: Makefile 9 | @sed -n 's/^\(## \)\([a-zA-Z]\)/\2/p' $< 10 | 11 | 12 | ########### 13 | # Install # 14 | ########### 15 | 16 | ## install-base: installs minimum requirements to run the API 17 | .PHONY: install-base 18 | install-base: install-base-pip-packages install-nltk-models 19 | 20 | ## install: installs all test and dev requirements 21 | .PHONY: install 22 | install:install-base install-test 23 | 24 | .PHONY: install-base-pip-packages 25 | install-base-pip-packages: 26 | python3 -m pip install pip==${PIP_VERSION} 27 | python3 -m pip install -r requirements/base.txt 28 | 29 | .PHONY: install-test 30 | install-test: install-base 31 | python3 -m pip install -r requirements/test.txt 32 | 33 | .PHONY: install-ci 34 | install-ci: install-test 35 | 36 | .PHONY: install-nltk-models 37 | install-nltk-models: 38 | python3 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" 39 | 40 | ## pip-compile: compiles all base/dev/test requirements 41 | .PHONY: pip-compile 42 | pip-compile: 43 | pip-compile --upgrade requirements/base.in 44 | pip-compile --upgrade -o requirements/test.txt requirements/base.txt requirements/test.in 45 | 46 | .PHONY: install-pandoc 47 | install-pandoc: 48 | ARCH=${ARCH} ./scripts/install-pandoc.sh 49 | 50 | ########## 51 | # Docker # 52 | ########## 53 | 54 | # Docker targets are provided for convenience only and are not required in a standard development environment 55 | 56 | # Note that the image has notebooks baked in, however the current working directory 57 | # is mounted under /home/notebook-user/local/ when the image is started with 58 | # docker-start-api or docker-start-jupyter 59 | 60 | DOCKER_IMAGE ?= pipeline-family-${PIPELINE_FAMILY}-dev:latest 61 | 62 | .PHONY: docker-build 63 | docker-build: 64 | PIP_VERSION=${PIP_VERSION} PIPELINE_FAMILY=${PIPELINE_FAMILY} PIPELINE_PACKAGE=${PIPELINE_PACKAGE} ./scripts/docker-build.sh 65 | 66 | .PHONY: docker-start-api 67 | docker-start-api: 68 | docker run -p 8000:8000 \ 69 | -it --rm \ 70 | --mount type=bind,source=$(realpath .),target=/home/notebook-user/local \ 71 | $(if $(MAX_LIFETIME_SECONDS),-e MAX_LIFETIME_SECONDS=$(MAX_LIFETIME_SECONDS)) \ 72 | pipeline-family-${PIPELINE_FAMILY}-dev:latest scripts/app-start.sh 73 | 74 | .PHONY: docker-start-bash 75 | docker-start-bash: 76 | docker run -p 8000:8000 -it --rm --mount type=bind,source=$(realpath .),target=/home/notebook-user/local --entrypoint /bin/bash pipeline-family-${PIPELINE_FAMILY}-dev:latest 77 | 78 | .PHONY: docker-test 79 | docker-test: 80 | DOCKER_IMAGE=${DOCKER_IMAGE} ./scripts/docker-smoke-test.sh 81 | 82 | ######### 83 | # Local # 84 | ######### 85 | 86 | ## run-web-app: runs the FastAPI api with hot reloading 87 | .PHONY: run-web-app 88 | run-web-app: 89 | PYTHONPATH=$(realpath .) uvicorn ${PACKAGE_NAME}.api.app:app --reload --log-config logger_config.yaml 90 | 91 | ################# 92 | # Test and Lint # 93 | ################# 94 | 95 | ## test: runs core tests 96 | .PHONY: test 97 | test: 98 | PYTHONPATH=. pytest -v test_${PIPELINE_PACKAGE} --cov=${PACKAGE_NAME} --cov-report term-missing 99 | 100 | # Setting a low bar here - need more tests! 
101 | .PHONY: check-coverage 102 | check-coverage: 103 | coverage report --fail-under=60 104 | 105 | ## check: runs linters (includes tests) 106 | .PHONY: check 107 | check: check-src check-tests check-version 108 | 109 | ## check-src: runs linters (source only, no tests) 110 | .PHONY: check-src 111 | check-src: 112 | black --line-length 100 ${PACKAGE_NAME} --check 113 | flake8 ${PACKAGE_NAME} 114 | mypy ${PACKAGE_NAME} --ignore-missing-imports --install-types --non-interactive --implicit-optional 115 | 116 | .PHONY: check-tests 117 | check-tests: 118 | black --line-length 100 test_${PIPELINE_PACKAGE} --check 119 | flake8 test_${PIPELINE_PACKAGE} scripts/smoketest.py 120 | 121 | ## tidy: run black 122 | .PHONY: tidy 123 | tidy: 124 | black --line-length 100 ${PACKAGE_NAME} 125 | black --line-length 100 test_${PIPELINE_PACKAGE} scripts/smoketest.py 126 | 127 | ## check-scripts: run shellcheck 128 | .PHONY: check-scripts 129 | check-scripts: 130 | # Fail if any of these files have warnings 131 | scripts/shellcheck.sh 132 | 133 | ## check-version: run check to ensure version in CHANGELOG.md matches references in files 134 | .PHONY: check-version 135 | check-version: 136 | # Fail if syncing version would produce changes 137 | scripts/version-sync.sh -c \ 138 | -s CHANGELOG.md \ 139 | -f preprocessing-pipeline-family.yaml release \ 140 | -f ${PACKAGE_NAME}/api/app.py release \ 141 | -f ${PACKAGE_NAME}/api/general.py release 142 | 143 | ## version-sync: update references to version with most recent version from CHANGELOG.md 144 | .PHONY: version-sync 145 | version-sync: 146 | scripts/version-sync.sh \ 147 | -s CHANGELOG.md \ 148 | -f preprocessing-pipeline-family.yaml release \ 149 | -f ${PACKAGE_NAME}/api/app.py release \ 150 | -f ${PACKAGE_NAME}/api/general.py release 151 | -------------------------------------------------------------------------------- /docker/rockylinux-9.4/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:experimental 2 | FROM quay.io/unstructured-io/base-images:rocky9.2-9@sha256:73d8492452f086144d4b92b7931aa04719f085c74d16cae81e8826ef873729c9 as base 3 | 4 | # NOTE(crag): NB_USER ARG for mybinder.org compat: 5 | # https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html 6 | ARG NB_USER=notebook-user 7 | ARG NB_UID=1000 8 | ARG PIP_VERSION 9 | ARG PIPELINE_PACKAGE 10 | 11 | # Set up environment 12 | ENV USER ${NB_USER} 13 | ENV HOME /home/${NB_USER} 14 | 15 | RUN groupadd --gid ${NB_UID} ${NB_USER} 16 | RUN useradd --uid ${NB_UID} --gid ${NB_UID} ${NB_USER} 17 | WORKDIR ${HOME} 18 | 19 | ENV PYTHONPATH="${PYTHONPATH}:${HOME}" 20 | ENV PATH="/home/${NB_USER}/.local/bin:${PATH}" 21 | 22 | FROM base as python-deps 23 | # COPY requirements/dev.txt requirements-dev.txt 24 | COPY requirements/base.txt requirements-base.txt 25 | RUN python3.10 -m pip install pip==${PIP_VERSION} \ 26 | && dnf -y groupinstall "Development Tools" \ 27 | && su -l ${NB_USER} -c 'pip3.10 install --no-cache -r requirements-base.txt' \ 28 | && dnf -y groupremove "Development Tools" \ 29 | && dnf clean all \ 30 | && ln -s /home/notebook-user/.local/bin/pip3.10 /usr/local/bin/pip3.10 || true 31 | 32 | USER ${NB_USER} 33 | 34 | FROM python-deps as model-deps 35 | RUN python3.10 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \ 36 | python3.10 -c "from unstructured.partition.model_init import initialize; initialize()" 37 | 38 | FROM model-deps as code 39 | COPY 
--chown=${NB_USER}:${NB_USER} CHANGELOG.md CHANGELOG.md 40 | COPY --chown=${NB_USER}:${NB_USER} logger_config.yaml logger_config.yaml 41 | COPY --chown=${NB_USER}:${NB_USER} prepline_${PIPELINE_PACKAGE}/ prepline_${PIPELINE_PACKAGE}/ 42 | COPY --chown=${NB_USER}:${NB_USER} exploration-notebooks exploration-notebooks 43 | COPY --chown=${NB_USER}:${NB_USER} scripts/app-start.sh scripts/app-start.sh 44 | 45 | ENTRYPOINT ["scripts/app-start.sh"] 46 | # Expose a default port of 8000. Note: The EXPOSE instruction does not actually publish the port, 47 | # but some tooling will inspect containers and perform work contingent on networking support declared. 48 | EXPOSE 8000 49 | -------------------------------------------------------------------------------- /exploration-notebooks/exploration-html.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "35227754", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/html": [ 12 | "\n", 16 | "\n", 63 | "To toggle visibility of explanation cells click here\n" 64 | ], 65 | "text/plain": [ 66 | "" 67 | ] 68 | }, 69 | "metadata": {}, 70 | "output_type": "display_data" 71 | } 72 | ], 73 | "source": [ 74 | "%%html\n", 75 | "\n", 79 | "\n", 126 | "To toggle visibility of explanation cells click here\n" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "id": "e908195c", 132 | "metadata": {}, 133 | "source": [ 134 | "# HTML Preprocessing" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "727614ba", 140 | "metadata": {}, 141 | "source": [ 142 | "This notebook defines the steps for extracting information from an HTML file. To see how to create a generalized API for all documents in the `pipeline-notebooks` directory\n", 143 | "\n", 144 | "To demonstrate how off-the-shelf Unstructured Bricks extract meaningful data from complex source documents, we will apply a series of Bricks with explanations.\n", 145 | "\n", 146 | "#### Table of Contents\n", 147 | "\n", 148 | "1. [Take a Look at a HTML File](#explore)\n", 149 | "1. [Custom Partitioning Bricks](#custom)\n", 150 | "1. [Cleaning Bricks](#cleaning)\n", 151 | "1. 
[Staging Bricks](#staging)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "id": "3848e558", 157 | "metadata": {}, 158 | "source": [ 159 | "## Section 1: Take a Look at a HTML File " 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "id": "71814e12", 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "import os\n", 170 | "import json\n", 171 | "\n", 172 | "\n", 173 | "def get_filename(directory, filename):\n", 174 | " cwd = os.getcwd()\n", 175 | " local_directory = os.path.join(os.path.split(cwd)[0], directory)\n", 176 | " ci_directory = os.path.join(cwd, directory)\n", 177 | "\n", 178 | " if os.path.exists(local_directory) and filename in os.listdir(local_directory):\n", 179 | " return os.path.join(local_directory, filename)\n", 180 | " elif os.path.exists(ci_directory) and filename in os.listdir(ci_directory):\n", 181 | " return os.path.join(ci_directory, filename)\n", 182 | " else:\n", 183 | " raise FileNotFoundError" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "id": "72f0ebc4", 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "filename = get_filename(\"sample-docs\", \"fake-html.html\")" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "id": "ea3b2b58", 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "from unstructured.documents.html import HTMLDocument\n", 204 | "\n", 205 | "document = HTMLDocument.from_file(filename)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "id": "fa146f41", 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "name": "stdout", 216 | "output_type": "stream", 217 | "text": [ 218 | "My First Heading\n", 219 | "\n", 220 | "My first paragraph.\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "print(document)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "id": "15d69b6b", 231 | "metadata": {}, 232 | "source": [ 233 | "## Section 2: Custom Partition Bricks" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "id": "ff34cce7", 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "from unstructured.partition.html import partition_html\n", 244 | "\n", 245 | "elements = partition_html(filename)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "id": "7a46b93f", 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "[, ]\n" 259 | ] 260 | } 261 | ], 262 | "source": [ 263 | "print(elements)" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "id": "e0312c8c", 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "My First Heading\n", 277 | "My first paragraph.\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "for element in elements:\n", 283 | " print(element.text)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "id": "10e1d3df", 289 | "metadata": {}, 290 | "source": [ 291 | "## Section 3: Cleaning Bricks " 292 | ] 293 | }, 294 | { 295 | "attachments": {}, 296 | "cell_type": "markdown", 297 | "id": "52943c00", 298 | "metadata": {}, 299 | "source": [ 300 | "In addition to partitioning bricks, the Unstructured library has\n", 301 | "***cleaning*** bricks for removing unwanted content from text. 
In this\n", 302 | "case, we'll solve our punctuation problem by using the \n", 303 | "`remove_punctuation`. Other uses for cleaning bricks include\n", 304 | "cleaning out boilerplate, sentence fragments, and other segments\n", 305 | "of text that could impact labeling tasks or the accuracy of\n", 306 | "machine learning models. As with partitioning bricks, users can\n", 307 | "include custom cleaning bricks in a pipeline." 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "id": "268e7dcd", 314 | "metadata": {}, 315 | "outputs": [ 316 | { 317 | "data": { 318 | "text/plain": [ 319 | "'My first paragraph.'" 320 | ] 321 | }, 322 | "execution_count": null, 323 | "metadata": {}, 324 | "output_type": "execute_result" 325 | } 326 | ], 327 | "source": [ 328 | "#This element has a lot of new line characters\n", 329 | "elements[1].text" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "id": "485198a5", 336 | "metadata": {}, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/plain": [ 341 | "'My first paragraph'" 342 | ] 343 | }, 344 | "execution_count": null, 345 | "metadata": {}, 346 | "output_type": "execute_result" 347 | } 348 | ], 349 | "source": [ 350 | "from unstructured.cleaners.core import remove_punctuation\n", 351 | "\n", 352 | "remove_punctuation(elements[1].text)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "id": "0f7fea99", 358 | "metadata": {}, 359 | "source": [ 360 | "## Section 4: Staging Bricks" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "id": "4f41f82c", 367 | "metadata": {}, 368 | "outputs": [ 369 | { 370 | "data": { 371 | "text/plain": [ 372 | "[{'data': {'text': 'My First Heading',\n", 373 | " 'ref_id': '0540311f6c077fe8f797080918b8d74b'}},\n", 374 | " {'data': {'text': 'My first paragraph.',\n", 375 | " 'ref_id': '399af454cb1368b8257ed406b430de84'}}]" 376 | ] 377 | }, 378 | "execution_count": null, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "from unstructured.staging.label_studio import stage_for_label_studio\n", 385 | "\n", 386 | "label_studio_data = stage_for_label_studio(elements)\n", 387 | "label_studio_data" 388 | ] 389 | } 390 | ], 391 | "metadata": { 392 | "kernelspec": { 393 | "display_name": "python3", 394 | "language": "python", 395 | "name": "python3" 396 | } 397 | }, 398 | "nbformat": 4, 399 | "nbformat_minor": 5 400 | } 401 | -------------------------------------------------------------------------------- /exploration-notebooks/exploration-powerpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "35227754", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/html": [ 12 | "\n", 16 | "\n", 63 | "To toggle visibility of explanation cells click here\n" 64 | ], 65 | "text/plain": [ 66 | "" 67 | ] 68 | }, 69 | "metadata": {}, 70 | "output_type": "display_data" 71 | } 72 | ], 73 | "source": [ 74 | "%%html\n", 75 | "\n", 79 | "\n", 126 | "To toggle visibility of explanation cells click here\n" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "id": "e908195c", 132 | "metadata": {}, 133 | "source": [ 134 | "# Powerpoint Preprocessing" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "727614ba", 140 | "metadata": {}, 141 | "source": [ 142 | "This notebook defines the steps for extracting 
information from a PowerPoint file. To see how to create a generalized API for all documents, see the `pipeline-notebooks` directory.\n", 143 | "\n", 144 | "To demonstrate how off-the-shelf Unstructured Bricks extract meaningful data from complex source documents, we will apply a series of Bricks with explanations.\n", 145 | "\n", 146 | "#### Table of Contents\n", 147 | "\n", 148 | "1. [Take a Look at a PowerPoint File](#explore)\n", 149 | "1. [Custom Partitioning Bricks](#custom)\n", 150 | "1. [Cleaning Bricks](#cleaning)\n", 151 | "1. [Staging Bricks](#staging)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "id": "3848e558", 157 | "metadata": {}, 158 | "source": [ 159 | "## Section 1: Take a Look at a PowerPoint File " 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "id": "71814e12", 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "import os\n", 170 | "import json\n", 171 | "\n", 172 | "\n", 173 | "def get_filename(directory, filename):\n", 174 | " cwd = os.getcwd()\n", 175 | " local_directory = os.path.join(os.path.split(cwd)[0], directory)\n", 176 | " ci_directory = os.path.join(cwd, directory)\n", 177 | "\n", 178 | " if os.path.exists(local_directory) and filename in os.listdir(local_directory):\n", 179 | " return os.path.join(local_directory, filename)\n", 180 | " elif os.path.exists(ci_directory) and filename in os.listdir(ci_directory):\n", 181 | " return os.path.join(ci_directory, filename)\n", 182 | " else:\n", 183 | " raise FileNotFoundError" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "id": "72f0ebc4", 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "filename = get_filename(\"sample-docs\", \"fake-power-point.pptx\")" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "id": "ea3b2b58", 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "import pptx\n", 204 | "\n", 205 | "presentation = pptx.Presentation(filename)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "id": "fa146f41", 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "shape = presentation.slides[0].shapes[0]" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "id": "7c938979", 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "text = shape.text_frame.paragraphs[0].text" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "id": "f3848757", 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "name": "stdout", 236 | "output_type": "stream", 237 | "text": [ 238 | "Adding a Bullet Slide\n" 239 | ] 240 | } 241 | ], 242 | "source": [ 243 | "print(text)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "id": "15d69b6b", 249 | "metadata": {}, 250 | "source": [ 251 | "## Section 2: Custom Partition Bricks" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "id": "ff34cce7", 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "from unstructured.partition.pptx import partition_pptx\n", 262 | "\n", 263 | "elements = partition_pptx(filename)" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "id": "7a46b93f", 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "[, , , ]\n" 277 | ] 278 | } 279 | ], 280 | "source": [ 281 |
"print(elements)" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "id": "e0312c8c", 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "name": "stdout", 292 | "output_type": "stream", 293 | "text": [ 294 | "Adding a Bullet Slide\n", 295 | "Find the bullet slide layout\n", 296 | "Use _TextFrame.text for first bullet\n", 297 | "Use _TextFrame.add_paragraph() for subsequent bullets\n" 298 | ] 299 | } 300 | ], 301 | "source": [ 302 | "for element in elements:\n", 303 | " print(element.text)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "id": "10e1d3df", 309 | "metadata": {}, 310 | "source": [ 311 | "## Section 3: Cleaning Bricks " 312 | ] 313 | }, 314 | { 315 | "attachments": {}, 316 | "cell_type": "markdown", 317 | "id": "52943c00", 318 | "metadata": {}, 319 | "source": [ 320 | "In addition to partitioning bricks, the Unstructured library has\n", 321 | "***cleaning*** bricks for removing unwanted content from text. In this\n", 322 | "case, we'll solve our punctuation problem by using the \n", 323 | "`remove_punctuation`. Other uses for cleaning bricks include\n", 324 | "cleaning out boilerplate, sentence fragments, and other segments\n", 325 | "of text that could impact labeling tasks or the accuracy of\n", 326 | "machine learning models. As with partitioning bricks, users can\n", 327 | "include custom cleaning bricks in a pipeline." 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "id": "268e7dcd", 334 | "metadata": {}, 335 | "outputs": [ 336 | { 337 | "data": { 338 | "text/plain": [ 339 | "'Use _TextFrame.add_paragraph() for subsequent bullets'" 340 | ] 341 | }, 342 | "execution_count": null, 343 | "metadata": {}, 344 | "output_type": "execute_result" 345 | } 346 | ], 347 | "source": [ 348 | "#This element has a lot of new line characters\n", 349 | "elements[3].text" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "id": "485198a5", 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "data": { 360 | "text/plain": [ 361 | "'Use TextFrameaddparagraph for subsequent bullets'" 362 | ] 363 | }, 364 | "execution_count": null, 365 | "metadata": {}, 366 | "output_type": "execute_result" 367 | } 368 | ], 369 | "source": [ 370 | "from unstructured.cleaners.core import remove_punctuation\n", 371 | "\n", 372 | "remove_punctuation(elements[3].text)" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "id": "0f7fea99", 378 | "metadata": {}, 379 | "source": [ 380 | "## Section 4: Staging Bricks" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "id": "a4cb2037", 387 | "metadata": {}, 388 | "outputs": [ 389 | { 390 | "data": { 391 | "text/plain": [ 392 | "'Use _TextFrame.text for first bullet'" 393 | ] 394 | }, 395 | "execution_count": null, 396 | "metadata": {}, 397 | "output_type": "execute_result" 398 | } 399 | ], 400 | "source": [ 401 | "elements[2].text" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "id": "4f41f82c", 408 | "metadata": {}, 409 | "outputs": [ 410 | { 411 | "data": { 412 | "text/plain": [ 413 | "[{'data': {'text': 'Adding a Bullet Slide',\n", 414 | " 'ref_id': '50b70366a51804855c6dd48a3865cb87'}},\n", 415 | " {'data': {'text': 'Find the bullet slide layout',\n", 416 | " 'ref_id': '3c0332d3515a039dee82e4f3388594c8'}},\n", 417 | " {'data': {'text': 'Use _TextFrame.text for first bullet',\n", 418 | " 'ref_id': 'ca8d08c97f0eeb554cac4758c9229614'}},\n", 419 | 
" {'data': {'text': 'Use _TextFrame.add_paragraph() for subsequent bullets',\n", 420 | " 'ref_id': '83d53564b64b558f77c7c33b5a029213'}}]" 421 | ] 422 | }, 423 | "execution_count": null, 424 | "metadata": {}, 425 | "output_type": "execute_result" 426 | } 427 | ], 428 | "source": [ 429 | "from unstructured.staging.label_studio import stage_for_label_studio\n", 430 | "\n", 431 | "label_studio_data = stage_for_label_studio(elements)\n", 432 | "label_studio_data" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "id": "7bd176e1", 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [] 442 | } 443 | ], 444 | "metadata": { 445 | "kernelspec": { 446 | "display_name": "python3", 447 | "language": "python", 448 | "name": "python3" 449 | } 450 | }, 451 | "nbformat": 4, 452 | "nbformat_minor": 5 453 | } 454 | -------------------------------------------------------------------------------- /img/email-screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/img/email-screenshot.png -------------------------------------------------------------------------------- /img/unstructured_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/img/unstructured_logo.png -------------------------------------------------------------------------------- /logger_config.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | disable_existing_loggers: False 3 | formatters: 4 | default_format: 5 | "()": uvicorn.logging.DefaultFormatter 6 | format: '%(asctime)s %(name)s %(levelname)s %(message)s' 7 | access: 8 | "()": uvicorn.logging.AccessFormatter 9 | format: '%(asctime)s %(client_addr)s %(request_line)s - %(status_code)s' 10 | handlers: 11 | access_handler: 12 | formatter: access 13 | class: logging.StreamHandler 14 | stream: ext://sys.stderr 15 | standard_handler: 16 | formatter: default_format 17 | class: logging.StreamHandler 18 | stream: ext://sys.stderr 19 | loggers: 20 | uvicorn.error: 21 | level: INFO 22 | handlers: 23 | - standard_handler 24 | propagate: no 25 | # disable logging for uvicorn.error by not having a handler 26 | uvicorn.access: 27 | level: INFO 28 | handlers: 29 | - access_handler 30 | propagate: no 31 | # disable logging for uvicorn.access by not having a handler 32 | unstructured: 33 | level: INFO 34 | handlers: 35 | - standard_handler 36 | propagate: no 37 | unstructured.trace: 38 | level: CRITICAL 39 | handlers: 40 | - standard_handler 41 | propagate: no 42 | unstructured_inference: 43 | level: DEBUG 44 | handlers: 45 | - standard_handler 46 | propagate: no 47 | unstructured_api: 48 | level: DEBUG 49 | handlers: 50 | - standard_handler 51 | propagate: no 52 | 53 | -------------------------------------------------------------------------------- /prepline_general/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/prepline_general/__init__.py -------------------------------------------------------------------------------- /prepline_general/api/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/prepline_general/api/__init__.py -------------------------------------------------------------------------------- /prepline_general/api/app.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from fastapi import FastAPI, HTTPException, Request, status 5 | from fastapi.datastructures import FormData 6 | from fastapi.responses import JSONResponse 7 | 8 | from .general import router as general_router 9 | from .openapi import set_custom_openapi 10 | 11 | logger = logging.getLogger("unstructured_api") 12 | 13 | app = FastAPI( 14 | title="Unstructured Pipeline API", 15 | summary="Partition documents with the Unstructured library", 16 | version="0.0.85", 17 | docs_url="/general/docs", 18 | openapi_url="/general/openapi.json", 19 | servers=[ 20 | { 21 | "url": "https://api.unstructured.io", 22 | "description": "Hosted API", 23 | "x-speakeasy-server-id": "prod", 24 | }, 25 | { 26 | "url": "http://localhost:8000", 27 | "description": "Development server", 28 | "x-speakeasy-server-id": "local", 29 | }, 30 | ], 31 | openapi_tags=[{"name": "general"}], 32 | ) 33 | 34 | # Note(austin) - This logger just dumps exceptions 35 | # We'd rather handle those below, so disable this in deployments 36 | uvicorn_logger = logging.getLogger("uvicorn.error") 37 | if os.environ.get("ENV") in ["dev", "prod"]: 38 | uvicorn_logger.disabled = True 39 | 40 | 41 | # Catch all HTTPException for uniform logging and response 42 | @app.exception_handler(HTTPException) 43 | async def http_error_handler(request: Request, e: HTTPException): 44 | logger.error(e.detail) 45 | return JSONResponse(status_code=e.status_code, content={"detail": e.detail}) 46 | 47 | 48 | # Catch any other errors and return as 500 49 | @app.exception_handler(Exception) 50 | async def error_handler(request: Request, e: Exception): 51 | return JSONResponse(status_code=500, content={"detail": str(e)}) 52 | 53 | 54 | allowed_origins = os.environ.get("ALLOWED_ORIGINS", None) 55 | if allowed_origins: 56 | from fastapi.middleware.cors import CORSMiddleware 57 | 58 | app.add_middleware( 59 | CORSMiddleware, 60 | allow_origins=allowed_origins.split(","), 61 | allow_methods=["OPTIONS", "POST"], 62 | allow_headers=["Content-Type"], 63 | ) 64 | 65 | app.include_router(general_router) 66 | 67 | set_custom_openapi(app) 68 | 69 | 70 | # Note(austin) - When FastAPI parses our FormData params, 71 | # it builds lists out of duplicate keys, like so: 72 | # FormData([('key', 'value1'), ('key', 'value2')]) 73 | # 74 | # The Speakeasy clients send a more explicit form: 75 | # FormData([('key[]', 'value1'), ('key[]', 'value2')]) 76 | # 77 | # FastAPI doesn't understand these, so we need to transform them. 78 | # Can't do this in middleware before the data stream is read, nor in the endpoint 79 | # after the fields are parsed. Thus, we have to patch it into Request.form() on startup. 
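# For example (an illustrative request; any list-valued parameter behaves the same way),
# a client that posts repeated fields in the bracketed style:
#
#   curl https://api.unstructured.io/general/v0/general \
#     -H 'unstructured-api-key: YOUR_API_KEY' \
#     -F 'files=@sample-docs/fake-html.html' \
#     -F 'extract_image_block_types[]=image' \
#     -F 'extract_image_block_types[]=table'
#
# arrives as FormData([('extract_image_block_types[]', 'image'), ('extract_image_block_types[]', 'table')])
# and is rewritten by the patched form parser below to
# FormData([('extract_image_block_types', 'image'), ('extract_image_block_types', 'table')]).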
80 | get_form = Request._get_form 81 | 82 | 83 | async def patched_get_form( 84 | self, 85 | *, 86 | max_files: int | float = 1000, 87 | max_fields: int | float = 1000, 88 | ) -> FormData: 89 | """ 90 | Call the original get_form, and iterate the results 91 | If a key has brackets at the end, remove them before returning the final FormData 92 | Note the extra params here are unused, but needed to match the signature 93 | """ 94 | form_params = await get_form(self) 95 | 96 | fixed_params = [] 97 | for key, value in form_params.multi_items(): 98 | # Transform key[] into key 99 | if key and key.endswith("[]"): 100 | key = key[:-2] 101 | 102 | fixed_params.append((key, value)) 103 | 104 | return FormData(fixed_params) 105 | 106 | 107 | # Replace the private method with our wrapper 108 | Request._get_form = patched_get_form # type: ignore[assignment] 109 | 110 | 111 | # Filter out /healthcheck noise 112 | class HealthCheckFilter(logging.Filter): 113 | def filter(self, record: logging.LogRecord) -> bool: 114 | return record.getMessage().find("/healthcheck") == -1 115 | 116 | 117 | # Filter out /metrics noise 118 | class MetricsCheckFilter(logging.Filter): 119 | def filter(self, record: logging.LogRecord) -> bool: 120 | return record.getMessage().find("/metrics") == -1 121 | 122 | 123 | logging.getLogger("uvicorn.access").addFilter(HealthCheckFilter()) 124 | logging.getLogger("uvicorn.access").addFilter(MetricsCheckFilter()) 125 | 126 | 127 | @app.get("/healthcheck", status_code=status.HTTP_200_OK, include_in_schema=False) 128 | def healthcheck(request: Request): 129 | return {"healthcheck": "HEALTHCHECK STATUS: EVERYTHING OK!"} 130 | 131 | 132 | logger.info("Started Unstructured API") 133 | -------------------------------------------------------------------------------- /prepline_general/api/filetypes.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional 3 | from io import BytesIO 4 | 5 | from fastapi import HTTPException, UploadFile 6 | 7 | from unstructured.file_utils.filetype import detect_filetype 8 | from unstructured.file_utils.model import FileType 9 | 10 | 11 | def _remove_optional_info_from_mime_type(content_type: str | None) -> str | None: 12 | """removes charset information from mime types, e.g., 13 | "application/json; charset=utf-8" -> "application/json" 14 | """ 15 | if not content_type: 16 | return content_type 17 | return content_type.split(";")[0] 18 | 19 | 20 | def get_validated_mimetype(file: UploadFile, content_type_hint: str | None = None) -> Optional[str]: 21 | """Given the incoming file, identify and return the correct mimetype. 22 | 23 | Order of operations: 24 | - If user passed content_type as a form param, take it as truth. 25 | - Otherwise, use file.content_type (as set by the Content-Type header) 26 | - If no content_type was passed and the header wasn't useful, call the library's detect_filetype 27 | 28 | Once we have a filetype, check is_partitionable and return 400 if we don't support this file.
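For example (an illustrative case): an upload of sample-docs/layout-parser-paper.pdf sent with a generic "application/octet-stream" header and no content_type form param falls through to detect_filetype(), which should resolve it to "application/pdf"; a file whose detected type is not partitionable raises the 400 described above.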
29 | """ 30 | content_type: str | None = None 31 | 32 | if content_type_hint is not None: 33 | content_type = content_type_hint 34 | else: 35 | content_type = _remove_optional_info_from_mime_type(file.content_type) 36 | 37 | filetype = FileType.from_mime_type(content_type) 38 | 39 | # If content_type was not specified, use the library to identify the file 40 | # We inspect the bytes to do this, so we need to buffer the file 41 | if not filetype or filetype == FileType.UNK: 42 | file_buffer = BytesIO(file.file.read()) 43 | file.file.seek(0) 44 | 45 | file_buffer.name = file.filename 46 | 47 | filetype = detect_filetype(file=file_buffer) 48 | 49 | if not filetype.is_partitionable: 50 | raise HTTPException( 51 | status_code=400, 52 | detail=(f"File type {filetype.mime_type} is not supported."), 53 | ) 54 | 55 | return filetype.mime_type 56 | -------------------------------------------------------------------------------- /prepline_general/api/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/prepline_general/api/models/__init__.py -------------------------------------------------------------------------------- /prepline_general/api/models/form_params.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated, List, Literal, Optional 2 | 3 | from fastapi import Form 4 | from pydantic import BaseModel, BeforeValidator 5 | 6 | from prepline_general.api.utils import SmartValueParser 7 | 8 | 9 | class GeneralFormParams(BaseModel): 10 | """General partition API form parameters for the prepline API. 11 | To add a new parameter, add it here and in the as_form classmethod. 12 | Use Annotated to add a description and example for the parameter. 13 | """ 14 | 15 | xml_keep_tags: bool 16 | languages: Optional[List[str]] 17 | ocr_languages: Optional[List[str]] 18 | skip_infer_table_types: Optional[List[str]] 19 | gz_uncompressed_content_type: Optional[str] 20 | output_format: str 21 | coordinates: bool 22 | encoding: str 23 | content_type: Optional[str] 24 | hi_res_model_name: Optional[str] 25 | include_page_breaks: bool 26 | pdf_infer_table_structure: bool 27 | strategy: str 28 | extract_image_block_types: Optional[List[str]] 29 | unique_element_ids: bool 30 | # -- chunking options -- 31 | chunking_strategy: Optional[str] 32 | combine_under_n_chars: Optional[int] 33 | max_characters: int 34 | multipage_sections: bool 35 | new_after_n_chars: Optional[int] 36 | overlap: int 37 | overlap_all: bool 38 | starting_page_number: Optional[int] = None 39 | include_slide_notes: bool 40 | 41 | @classmethod 42 | def as_form( 43 | cls, 44 | xml_keep_tags: Annotated[ 45 | bool, 46 | Form( 47 | title="Xml Keep Tags", 48 | description="If True, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. 
Only applies to partition_xml.", 49 | ), 50 | BeforeValidator(SmartValueParser[bool]().value_or_first_element), 51 | ] = False, 52 | languages: Annotated[ 53 | List[str], 54 | Form( 55 | title="OCR Languages", 56 | description="The languages present in the document, for use in partitioning and/or OCR", 57 | example="[eng]", 58 | ), 59 | BeforeValidator(SmartValueParser[List[str]]().value_or_first_element), 60 | ] = [], # noqa 61 | ocr_languages: Annotated[ 62 | List[str], 63 | Form( 64 | title="OCR Languages", 65 | description="The languages present in the document, for use in partitioning and/or OCR", 66 | example="[eng]", 67 | ), 68 | BeforeValidator(SmartValueParser[List[str]]().value_or_first_element), 69 | ] = [], 70 | skip_infer_table_types: Annotated[ 71 | List[str], 72 | Form( 73 | title="Skip Infer Table Types", 74 | description=( 75 | "The document types that you want to skip table extraction with. Default: []" 76 | ), 77 | example="['pdf', 'jpg', 'png']", 78 | ), 79 | BeforeValidator(SmartValueParser[List[str]]().value_or_first_element), 80 | ] = [], # noqa 81 | gz_uncompressed_content_type: Annotated[ 82 | Optional[str], 83 | Form( 84 | title="Uncompressed Content Type", 85 | description="If file is gzipped, use this content type after unzipping", 86 | example="application/pdf", 87 | ), 88 | ] = None, 89 | output_format: Annotated[ 90 | Literal["application/json", "text/csv"], 91 | Form( 92 | title="Output Format", 93 | description="The format of the response. Supported formats are application/json and text/csv. Default: application/json.", 94 | example="application/json", 95 | ), 96 | ] = "application/json", 97 | coordinates: Annotated[ 98 | bool, 99 | Form( 100 | title="Coordinates", 101 | description="If true, return coordinates for each element. Default: false", 102 | ), 103 | BeforeValidator(SmartValueParser[bool]().value_or_first_element), 104 | ] = False, 105 | content_type: Annotated[ 106 | Optional[str], 107 | Form( 108 | title="Content type", 109 | description="A hint about the content type to use (such as text/markdown), when there are problems processing a specific file. This value is a MIME type in the format type/subtype.", 110 | example="text/markdown", 111 | ), 112 | BeforeValidator(SmartValueParser[str]().value_or_first_element), 113 | ] = None, 114 | encoding: Annotated[ 115 | str, 116 | Form( 117 | title="Encoding", 118 | description="The encoding method used to decode the text input. Default: utf-8", 119 | example="utf-8", 120 | ), 121 | BeforeValidator(SmartValueParser[str]().value_or_first_element), 122 | ] = "utf-8", 123 | hi_res_model_name: Annotated[ 124 | Optional[str], 125 | Form( 126 | title="Hi Res Model Name", 127 | description="The name of the inference model used when strategy is hi_res", 128 | example="yolox", 129 | ), 130 | BeforeValidator(SmartValueParser[str]().value_or_first_element), 131 | ] = None, 132 | include_page_breaks: Annotated[ 133 | bool, 134 | Form( 135 | title="Include Page Breaks", 136 | description="If True, the output will include page breaks if the filetype supports it. Default: false", 137 | ), 138 | BeforeValidator(SmartValueParser[str]().value_or_first_element), 139 | ] = False, 140 | pdf_infer_table_structure: Annotated[ 141 | bool, 142 | Form( 143 | title="Pdf Infer Table Structure", 144 | description=( 145 | "Deprecated! Use skip_infer_table_types to opt out of table extraction for any " 146 | "file type. 
If False and strategy=hi_res, no Table Elements will be extracted " 147 | "from pdf files regardless of skip_infer_table_types contents." 148 | ), 149 | ), 150 | BeforeValidator(SmartValueParser[bool]().value_or_first_element), 151 | ] = True, 152 | strategy: Annotated[ 153 | Literal["fast", "hi_res", "auto", "ocr_only"], 154 | Form( 155 | title="Strategy", 156 | description="The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto. Default: auto", 157 | examples=["auto", "hi_res"], 158 | ), 159 | BeforeValidator(SmartValueParser[str]().literal_value_stripped_or_first_element), 160 | ] = "auto", 161 | extract_image_block_types: Annotated[ 162 | List[str], 163 | Form( 164 | title="Image block types to extract", 165 | description="The types of elements to extract, for use in extracting image blocks as base64 encoded data stored in metadata fields", 166 | example="""["image", "table"]""", 167 | ), 168 | BeforeValidator(SmartValueParser[List[str]]().value_or_first_element), 169 | ] = [], # noqa 170 | unique_element_ids: Annotated[ 171 | bool, 172 | Form( 173 | title="unique_element_ids", 174 | description="""When `True`, assign UUIDs to element IDs, which guarantees their uniqueness 175 | (useful when using them as primary keys in database). Otherwise a SHA-256 of element text is used. Default: False""", 176 | example=True, 177 | ), 178 | ] = False, 179 | # -- chunking options -- 180 | chunking_strategy: Annotated[ 181 | Optional[Literal["by_title"]], 182 | Form( 183 | title="Chunking Strategy", 184 | description="Use one of the supported strategies to chunk the returned elements. Currently supports: by_title", 185 | examples=["by_title"], 186 | ), 187 | ] = None, 188 | combine_under_n_chars: Annotated[ 189 | Optional[int], 190 | Form( 191 | title="Combine Under N Chars", 192 | description="If chunking strategy is set, combine elements until a section reaches a length of n chars. Default: 500", 193 | example=500, 194 | ), 195 | ] = None, 196 | max_characters: Annotated[ 197 | int, 198 | Form( 199 | title="Max Characters", 200 | description="If chunking strategy is set, cut off new sections after reaching a length of n chars (hard max). Default: 1500", 201 | example=1500, 202 | ), 203 | ] = 500, 204 | multipage_sections: Annotated[ 205 | bool, 206 | Form( 207 | title="Multipage Sections", 208 | description="If chunking strategy is set, determines if sections can span multiple sections. Default: true", 209 | ), 210 | ] = True, 211 | new_after_n_chars: Annotated[ 212 | Optional[int], 213 | Form( 214 | title="New after n chars", 215 | description="If chunking strategy is set, cut off new sections after reaching a length of n chars (soft max). Default: 1500", 216 | example=1500, 217 | ), 218 | ] = None, 219 | overlap: Annotated[ 220 | int, 221 | Form( 222 | title="Overlap", 223 | description="""Specifies the length of a string ("tail") to be drawn from each chunk and prefixed to the 224 | next chunk as a context-preserving mechanism. By default, this only applies to split-chunks 225 | where an oversized element is divided into multiple chunks by text-splitting. Default: 0""", 226 | example=20, 227 | ), 228 | ] = 0, 229 | overlap_all: Annotated[ 230 | bool, 231 | Form( 232 | title="Overlap all", 233 | description="""When `True`, apply overlap between "normal" chunks formed from whole 234 | elements and not subject to text-splitting. Use this with caution as it entails a certain 235 | level of "pollution" of otherwise clean semantic chunk boundaries. 
Default: False""", 236 | example=True, 237 | ), 238 | ] = False, 239 | starting_page_number: Annotated[ 240 | Optional[int], 241 | Form( 242 | title="PDF Starting Page Number", 243 | description=( 244 | "When PDF is split into pages before sending it into the API, providing " 245 | "this information will allow the page number to be assigned correctly." 246 | ), 247 | example=3, 248 | ), 249 | ] = None, 250 | include_slide_notes: Annotated[ 251 | bool, 252 | Form( 253 | title="include_slide_notes", 254 | description=( 255 | "When `True`, slide notes from .ppt and .pptx files" 256 | " will be included in the response. Default: `True`" 257 | ), 258 | example=False, 259 | ), 260 | ] = True, 261 | ) -> "GeneralFormParams": 262 | return cls( 263 | xml_keep_tags=xml_keep_tags, 264 | languages=languages if languages else None, 265 | ocr_languages=ocr_languages if ocr_languages else None, 266 | skip_infer_table_types=skip_infer_table_types, 267 | gz_uncompressed_content_type=gz_uncompressed_content_type, 268 | output_format=output_format, 269 | coordinates=coordinates, 270 | content_type=content_type, 271 | encoding=encoding, 272 | hi_res_model_name=hi_res_model_name, 273 | include_page_breaks=include_page_breaks, 274 | pdf_infer_table_structure=pdf_infer_table_structure, 275 | strategy=strategy, 276 | extract_image_block_types=( 277 | extract_image_block_types if extract_image_block_types else None 278 | ), 279 | chunking_strategy=chunking_strategy, 280 | combine_under_n_chars=combine_under_n_chars, 281 | max_characters=max_characters, 282 | multipage_sections=multipage_sections, 283 | new_after_n_chars=new_after_n_chars, 284 | overlap=overlap, 285 | overlap_all=overlap_all, 286 | unique_element_ids=unique_element_ids, 287 | starting_page_number=starting_page_number, 288 | include_slide_notes=include_slide_notes, 289 | ) 290 | -------------------------------------------------------------------------------- /prepline_general/api/openapi.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from fastapi import FastAPI 4 | from fastapi.openapi.utils import get_openapi 5 | 6 | 7 | def set_custom_openapi(app: FastAPI) -> None: 8 | """Generate a custom OpenAPI schema for the app""" 9 | 10 | def custom_openapi() -> dict[str, Any]: 11 | if app.openapi_schema: 12 | return app.openapi_schema 13 | openapi_schema = get_openapi( 14 | title=app.title, 15 | version=app.version, 16 | summary=app.summary, 17 | description=app.description, 18 | servers=app.servers, 19 | routes=app.routes, 20 | tags=app.openapi_tags, 21 | ) 22 | _apply_customizations(openapi_schema) 23 | 24 | app.openapi_schema = openapi_schema 25 | return app.openapi_schema 26 | 27 | app.openapi = custom_openapi # type: ignore 28 | 29 | 30 | def _apply_customizations(openapi_schema: dict[str, Any]) -> None: 31 | """Add customizations to the OpenAPI schema""" 32 | 33 | # Add security 34 | openapi_schema["security"] = [{"ApiKeyAuth": []}] 35 | 36 | # Add retries 37 | openapi_schema["x-speakeasy-retries"] = { 38 | "strategy": "backoff", 39 | "backoff": { 40 | "initialInterval": 500, 41 | "maxInterval": 60000, 42 | "maxElapsedTime": 900000, 43 | "exponent": 1.5, 44 | }, 45 | "statusCodes": [ 46 | "5xx", 47 | ], 48 | "retryConnectionErrors": True, 49 | } 50 | 51 | # Response changes 52 | openapi_schema["paths"]["/general/v0/general"]["post"]["responses"]["200"]["content"][ 53 | "application/json" 54 | ]["schema"] = { 55 | "items": {"$ref": "#/components/schemas/Element"}, 56 | "title": 
"Response Partition Parameters", 57 | "type": "array", 58 | } 59 | 60 | # Schema changes 61 | 62 | # Add securitySchemes 63 | # TODO: Implement security per the FastAPI documentation: 64 | # https://fastapi.tiangolo.com/reference/security/?h=apikey 65 | openapi_schema["components"]["securitySchemes"] = { 66 | "ApiKeyAuth": { 67 | "type": "apiKey", 68 | "name": "unstructured-api-key", 69 | "in": "header", 70 | "x-speakeasy-example": "YOUR_API_KEY", 71 | } 72 | } 73 | 74 | # TODO: Instead of a list of parameters, crete a PartitionParameters model 75 | # and declare schema keys (type, format, description) as attributes 76 | # https://fastapi.tiangolo.com/reference/openapi/models/?h=model 77 | # Update the schema key from `Body_partition` to `partition_parameters` 78 | 79 | # TODO: Similarly, create an Element model 80 | # https://fastapi.tiangolo.com/reference/openapi/models/?h=model 81 | # Add Elements schema 82 | openapi_schema["components"]["schemas"]["Element"] = { 83 | "properties": { 84 | "type": {"type": "string", "title": "Type"}, 85 | "element_id": {"type": "string", "title": "Element Id"}, 86 | "metadata": {"type": "object", "title": "Metadata"}, 87 | "text": {"type": "string", "title": "Text"}, 88 | }, 89 | "type": "object", 90 | "required": ["type", "element_id", "metadata", "text"], 91 | "title": "Element", 92 | } 93 | 94 | # Must manually correct the schema for the files parameter as due to a bug 95 | # described here: https://github.com/tiangolo/fastapi/discussions/10280 96 | # files parameter cannot be described with an annotation. 97 | # TODO: Check if the bug is fixed and remove this workaround 98 | for key in openapi_schema["components"]["schemas"]: 99 | if "partition_parameters" in key: 100 | general_pipeline_schema = openapi_schema["components"]["schemas"][key] 101 | break 102 | else: 103 | # Could not find the schema to update, returning 104 | return 105 | 106 | general_pipeline_schema["properties"]["files"] = { 107 | "type": "string", 108 | "format": "binary", 109 | "description": "The file to extract", 110 | "required": "true", 111 | "examples": [ 112 | { 113 | "summary": "File to be partitioned", 114 | "externalValue": "https://github.com/Unstructured-IO/unstructured/blob/98d3541909f64290b5efb65a226fc3ee8a7cc5ee/example-docs/layout-parser-paper.pdf", 115 | } 116 | ], 117 | } 118 | -------------------------------------------------------------------------------- /prepline_general/api/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import TypeVar, Union, List, Optional, Generic, get_origin, get_args, Type, Any, Tuple 3 | 4 | T = TypeVar("T") 5 | E = TypeVar("E") 6 | 7 | 8 | def _cast_to_type(value: Any, origin_class: type) -> Any: 9 | """Cast a value to a type E 10 | 11 | Args: 12 | value (Any): value to cast to a type T 13 | origin_class (type): type to cast the value to. Should be one of simple types 14 | 15 | Returns: 16 | T: value cast to a type T 17 | """ 18 | if isinstance(value, str) and (origin_class == int or origin_class == float): 19 | return origin_class(value) # noqa 20 | if origin_class == bool and isinstance(value, str): 21 | return value.lower() == "true" 22 | return value 23 | 24 | 25 | def _return_cast_first_element(values: list[E], origin_class: type) -> E | None: 26 | """Return the first element of a list cast to a type T, or None if the list is empty 27 | 28 | Args: 29 | values (list[str]): list of strings 30 | origin_class (type): type to cast the first element to. 
Should be one of simple types 31 | 32 | Returns: 33 | T | None: first element cast to a type T, or None if the list is empty 34 | """ 35 | value = next(iter(values), None) 36 | if value is not None: 37 | return _cast_to_type(value, origin_class) # noqa 38 | return value 39 | 40 | 41 | def is_convertible_to_list(s: str) -> Tuple[bool, Union[List, str]]: 42 | """ 43 | Determines if a given string is convertible to a list. 44 | 45 | This function first tries to parse the string as JSON. If the parsed JSON is a list, it returns 46 | True along with the list. If parsing as JSON fails, it then checks if the string can be split 47 | into a list using predefined delimiters ("," or "+"). If so, it returns True and the resulting list. 48 | If neither condition is met, it returns False and a message indicating the string cannot 49 | be converted to a list. 50 | """ 51 | 52 | try: 53 | result = json.loads(s) 54 | if isinstance(result, list): 55 | return True, result # Return the list if conversion is successful 56 | else: 57 | return False, "Input is valid JSON but not a list." # Valid JSON but not a list 58 | except json.JSONDecodeError: 59 | pass # proceed to check using delimiters if JSON parsing fails 60 | 61 | delimiters = ["+", ","] 62 | for delimiter in delimiters: 63 | if delimiter in s: 64 | return True, s.split(delimiter) 65 | 66 | return False, "Input is not valid JSON." # Invalid JSON 67 | 68 | 69 | class SmartValueParser(Generic[T]): 70 | """Class to handle API parameters that are passed in the form of a specific value or as a list of strings from which 71 | the first element is used, cast to a proper type. 72 | Should be parametrized with the type to which the value should be cast. 73 | 74 | Examples: 75 | SmartValueParser[int]().value_or_first_element(value) 76 | SmartValueParser[list[int]]().value_or_first_element(value) 77 | """ 78 | 79 | def value_or_first_element(self, value: Union[T, list[T]]) -> list[T] | T | None: 80 | """If value is a list, return the first element cast to a type T, otherwise return the value itself 81 | 82 | Args: 83 | value (Union[T, List[str]]): value to cast to a type T or return as is 84 | """ 85 | origin_class, container_elems_class = self._get_origin_container_classes() 86 | if isinstance(value, list) and not isinstance(value, origin_class): 87 | extracted_value: T | None = _return_cast_first_element(value, origin_class) 88 | return extracted_value 89 | elif isinstance(value, list) and origin_class == list and container_elems_class: 90 | if len(value) == 1: 91 | is_list, result = is_convertible_to_list(str(value[0])) 92 | new_value = result if is_list else value 93 | return [_cast_to_type(elem, container_elems_class) for elem in new_value] 94 | return [_cast_to_type(elem, container_elems_class) for elem in value] 95 | return _cast_to_type(value, origin_class) # noqa 96 | 97 | def literal_value_stripped_or_first_element(self, value: str) -> str | None: 98 | """Returns the value itself for literal strings and strips quotation characters.
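For example, a raw form value of '"hi_res"' or "'auto'" is returned as hi_res / auto with the quote characters stripped, before being cast.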
99 | 100 | Args: 101 | value (Union[T, List[str]]): value to cast to a type T or return as is 102 | """ 103 | origin_class, container_elems_class = self._get_origin_container_classes() 104 | value = value.replace("'", "") 105 | value = value.replace('"', "") 106 | return _cast_to_type(value, origin_class) 107 | 108 | def _get_origin_container_classes(self) -> tuple[type, type | None]: 109 | """Extracts class (and container class if it's a list) from a type hint 110 | 111 | Returns: 112 | tuple[type, type | None]: class and container class of the type hint 113 | """ 114 | type_info = self.__orig_class__.__args__[0] # type: ignore 115 | origin_class = get_origin(type_info) 116 | if origin_class is None: 117 | # it's a basic type like int or bool - return it and no container class 118 | return type_info, None 119 | origin_args = get_args(type_info) 120 | container_elems_class = origin_args[0] if origin_args else None 121 | return origin_class, container_elems_class 122 | -------------------------------------------------------------------------------- /preprocessing-pipeline-family.yaml: -------------------------------------------------------------------------------- 1 | name: general 2 | version: 0.0.85 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 100 3 | 4 | [tool.pyright] 5 | pythonPlatform = "Linux" 6 | pythonVersion = "3.12" 7 | reportUnnecessaryCast = true 8 | typeCheckingMode = "strict" 9 | 10 | [tool.ruff] 11 | line-length = 100 12 | select = [ 13 | "C4", # -- flake8-comprehensions -- 14 | "COM", # -- flake8-commas -- 15 | "E", # -- pycodestyle errors -- 16 | "F", # -- pyflakes -- 17 | "I", # -- isort (imports) -- 18 | "PLR0402", # -- Name compared with itself like `foo == foo` -- 19 | "PT", # -- flake8-pytest-style -- 20 | "SIM", # -- flake8-simplify -- 21 | "UP015", # -- redundant `open()` mode parameter (like "r" is default) -- 22 | "UP018", # -- Unnecessary {literal_type} call like `str("abc")`. 
(rewrite as a literal) -- 23 | "UP032", # -- Use f-string instead of `.format()` call -- 24 | "UP034", # -- Avoid extraneous parentheses -- 25 | ] 26 | ignore = [ 27 | "COM812", # -- over aggressively insists on trailing commas where not desireable -- 28 | "PT011", # -- pytest.raises({exc}) too broad, use match param or more specific exception -- 29 | "PT012", # -- pytest.raises() block should contain a single simple statement -- 30 | "SIM117", # -- merge `with` statements for context managers that have same scope -- 31 | ] 32 | 33 | [tool.ruff.lint.isort] 34 | known-first-party = [ 35 | "unstructured", 36 | "unstructured_inference", 37 | ] 38 | -------------------------------------------------------------------------------- /requirements/base.in: -------------------------------------------------------------------------------- 1 | -c constraints.in 2 | unstructured[all-docs] 3 | # Pinning click due to a unicode issue in black 4 | # can remove after black drops support for Python 3.6 5 | # ref: https://github.com/psf/black/issues/2964 6 | click==8.1.3 7 | fastapi 8 | uvicorn 9 | ratelimit 10 | requests 11 | backoff 12 | pypdf 13 | pycryptodome 14 | psutil 15 | -------------------------------------------------------------------------------- /requirements/base.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.12 3 | # by the following command: 4 | # 5 | # pip-compile requirements/base.in 6 | # 7 | aiofiles==24.1.0 8 | # via unstructured-client 9 | annotated-types==0.7.0 10 | # via pydantic 11 | antlr4-python3-runtime==4.9.3 12 | # via omegaconf 13 | anyio==4.8.0 14 | # via 15 | # httpx 16 | # starlette 17 | backoff==2.2.1 18 | # via 19 | # -r requirements/base.in 20 | # unstructured 21 | beautifulsoup4==4.12.3 22 | # via unstructured 23 | cachetools==5.5.1 24 | # via google-auth 25 | certifi==2024.12.14 26 | # via 27 | # httpcore 28 | # httpx 29 | # requests 30 | cffi==1.17.1 31 | # via cryptography 32 | chardet==5.2.0 33 | # via unstructured 34 | charset-normalizer==3.4.1 35 | # via 36 | # pdfminer-six 37 | # requests 38 | click==8.1.3 39 | # via 40 | # -r requirements/base.in 41 | # nltk 42 | # python-oxmsg 43 | # uvicorn 44 | coloredlogs==15.0.1 45 | # via onnxruntime 46 | contourpy==1.3.1 47 | # via matplotlib 48 | cryptography==44.0.1 49 | # via 50 | # pdfminer-six 51 | # unstructured-client 52 | cycler==0.12.1 53 | # via matplotlib 54 | dataclasses-json==0.6.7 55 | # via unstructured 56 | deprecated==1.2.18 57 | # via pikepdf 58 | effdet==0.4.1 59 | # via unstructured 60 | emoji==2.14.1 61 | # via unstructured 62 | et-xmlfile==2.0.0 63 | # via openpyxl 64 | eval-type-backport==0.2.2 65 | # via unstructured-client 66 | fastapi==0.115.8 67 | # via -r requirements/base.in 68 | filelock==3.17.0 69 | # via 70 | # huggingface-hub 71 | # torch 72 | # transformers 73 | filetype==1.2.0 74 | # via unstructured 75 | flatbuffers==25.1.24 76 | # via onnxruntime 77 | fonttools==4.55.8 78 | # via matplotlib 79 | fsspec==2024.12.0 80 | # via 81 | # huggingface-hub 82 | # torch 83 | google-api-core[grpc]==2.24.1 84 | # via google-cloud-vision 85 | google-auth==2.38.0 86 | # via 87 | # google-api-core 88 | # google-cloud-vision 89 | google-cloud-vision==3.9.0 90 | # via unstructured 91 | googleapis-common-protos==1.66.0 92 | # via 93 | # google-api-core 94 | # grpcio-status 95 | grpcio==1.70.0 96 | # via 97 | # google-api-core 98 | # grpcio-status 99 | grpcio-status==1.70.0 100 | # via google-api-core 
101 | h11==0.16.0 102 | # via 103 | # httpcore 104 | # uvicorn 105 | html5lib==1.1 106 | # via unstructured 107 | httpcore==1.0.9 108 | # via httpx 109 | httpx==0.28.1 110 | # via unstructured-client 111 | huggingface-hub==0.32.1 112 | # via 113 | # timm 114 | # tokenizers 115 | # transformers 116 | # unstructured-inference 117 | humanfriendly==10.0 118 | # via coloredlogs 119 | idna==3.10 120 | # via 121 | # anyio 122 | # httpx 123 | # requests 124 | jinja2==3.1.6 125 | # via torch 126 | joblib==1.4.2 127 | # via nltk 128 | jsonpath-python==1.0.6 129 | # via unstructured-client 130 | kiwisolver==1.4.8 131 | # via matplotlib 132 | langdetect==1.0.9 133 | # via unstructured 134 | lxml==5.3.0 135 | # via 136 | # pikepdf 137 | # python-docx 138 | # python-pptx 139 | # unstructured 140 | markdown==3.7 141 | # via unstructured 142 | markupsafe==3.0.2 143 | # via jinja2 144 | marshmallow==3.26.0 145 | # via dataclasses-json 146 | matplotlib==3.10.0 147 | # via 148 | # pycocotools 149 | # unstructured-inference 150 | mpmath==1.3.0 151 | # via sympy 152 | mypy-extensions==1.0.0 153 | # via typing-inspect 154 | nest-asyncio==1.6.0 155 | # via unstructured-client 156 | networkx==3.4.2 157 | # via 158 | # torch 159 | # unstructured 160 | nltk==3.9.1 161 | # via unstructured 162 | numpy==1.26.4 163 | # via 164 | # -c requirements/constraints.in 165 | # contourpy 166 | # matplotlib 167 | # onnx 168 | # onnxruntime 169 | # opencv-python 170 | # pandas 171 | # pycocotools 172 | # scipy 173 | # torchvision 174 | # transformers 175 | # unstructured 176 | # unstructured-inference 177 | olefile==0.47 178 | # via python-oxmsg 179 | omegaconf==2.3.0 180 | # via effdet 181 | onnx==1.17.0 182 | # via 183 | # unstructured 184 | # unstructured-inference 185 | onnxruntime==1.20.1 186 | # via unstructured-inference 187 | opencv-python==4.11.0.86 188 | # via unstructured-inference 189 | openpyxl==3.1.5 190 | # via unstructured 191 | packaging==24.2 192 | # via 193 | # huggingface-hub 194 | # marshmallow 195 | # matplotlib 196 | # onnxruntime 197 | # pikepdf 198 | # transformers 199 | # unstructured-pytesseract 200 | pandas==2.2.3 201 | # via 202 | # unstructured 203 | # unstructured-inference 204 | pdf2image==1.17.0 205 | # via unstructured 206 | pdfminer-six==20240706 207 | # via 208 | # unstructured 209 | # unstructured-inference 210 | pi-heif==0.21.0 211 | # via unstructured 212 | pikepdf==9.5.1 213 | # via unstructured 214 | pillow==11.1.0 215 | # via 216 | # matplotlib 217 | # pdf2image 218 | # pi-heif 219 | # pikepdf 220 | # python-pptx 221 | # torchvision 222 | # unstructured-pytesseract 223 | proto-plus==1.26.0 224 | # via 225 | # google-api-core 226 | # google-cloud-vision 227 | protobuf==5.29.3 228 | # via 229 | # google-api-core 230 | # google-cloud-vision 231 | # googleapis-common-protos 232 | # grpcio-status 233 | # onnx 234 | # onnxruntime 235 | # proto-plus 236 | psutil==6.1.1 237 | # via 238 | # -r requirements/base.in 239 | # unstructured 240 | pyasn1==0.6.1 241 | # via 242 | # pyasn1-modules 243 | # rsa 244 | pyasn1-modules==0.4.1 245 | # via google-auth 246 | pycocotools==2.0.8 247 | # via effdet 248 | pycparser==2.22 249 | # via cffi 250 | pycryptodome==3.21.0 251 | # via -r requirements/base.in 252 | pydantic==2.10.6 253 | # via 254 | # fastapi 255 | # unstructured-client 256 | pydantic-core==2.27.2 257 | # via pydantic 258 | pypandoc==1.15 259 | # via unstructured 260 | pyparsing==3.2.1 261 | # via matplotlib 262 | pypdf==5.2.0 263 | # via 264 | # -r requirements/base.in 265 | # unstructured 266 
| # unstructured-client 267 | pypdfium2==4.30.1 268 | # via unstructured-inference 269 | python-dateutil==2.9.0.post0 270 | # via 271 | # matplotlib 272 | # pandas 273 | # unstructured-client 274 | python-docx==1.1.2 275 | # via unstructured 276 | python-iso639==2025.1.28 277 | # via unstructured 278 | python-magic==0.4.27 279 | # via unstructured 280 | python-multipart==0.0.20 281 | # via unstructured-inference 282 | python-oxmsg==0.0.1 283 | # via unstructured 284 | python-pptx==1.0.2 285 | # via unstructured 286 | pytz==2024.2 287 | # via pandas 288 | pyyaml==6.0.2 289 | # via 290 | # huggingface-hub 291 | # omegaconf 292 | # timm 293 | # transformers 294 | rapidfuzz==3.12.1 295 | # via 296 | # unstructured 297 | # unstructured-inference 298 | ratelimit==2.2.1 299 | # via -r requirements/base.in 300 | regex==2024.11.6 301 | # via 302 | # nltk 303 | # transformers 304 | requests==2.32.3 305 | # via 306 | # -r requirements/base.in 307 | # google-api-core 308 | # huggingface-hub 309 | # requests-toolbelt 310 | # transformers 311 | # unstructured 312 | requests-toolbelt==1.0.0 313 | # via unstructured-client 314 | rsa==4.9 315 | # via google-auth 316 | safetensors==0.5.2 317 | # via 318 | # timm 319 | # transformers 320 | scipy==1.15.1 321 | # via unstructured-inference 322 | six==1.17.0 323 | # via 324 | # html5lib 325 | # langdetect 326 | # python-dateutil 327 | sniffio==1.3.1 328 | # via anyio 329 | soupsieve==2.6 330 | # via beautifulsoup4 331 | starlette==0.41.2 332 | # via 333 | # -c requirements/constraints.in 334 | # fastapi 335 | sympy==1.13.3 336 | # via 337 | # onnxruntime 338 | # torch 339 | timm==1.0.14 340 | # via 341 | # effdet 342 | # unstructured-inference 343 | tokenizers==0.21.0 344 | # via transformers 345 | torch==2.7.0 346 | # via 347 | # effdet 348 | # timm 349 | # torchvision 350 | # unstructured-inference 351 | torchvision==0.22.0 352 | # via 353 | # effdet 354 | # timm 355 | tqdm==4.67.1 356 | # via 357 | # huggingface-hub 358 | # nltk 359 | # transformers 360 | # unstructured 361 | transformers==4.50.0 362 | # via unstructured-inference 363 | typing-extensions==4.12.2 364 | # via 365 | # anyio 366 | # fastapi 367 | # huggingface-hub 368 | # pydantic 369 | # pydantic-core 370 | # python-docx 371 | # python-oxmsg 372 | # python-pptx 373 | # torch 374 | # typing-inspect 375 | # unstructured 376 | typing-inspect==0.9.0 377 | # via 378 | # dataclasses-json 379 | # unstructured-client 380 | tzdata==2025.1 381 | # via pandas 382 | unstructured[all-docs]==0.16.17 383 | # via -r requirements/base.in 384 | unstructured-client==0.29.0 385 | # via unstructured 386 | unstructured-inference==0.8.6 387 | # via unstructured 388 | unstructured-pytesseract==0.3.13 389 | # via unstructured 390 | urllib3==2.3.0 391 | # via requests 392 | uvicorn==0.34.0 393 | # via -r requirements/base.in 394 | webencodings==0.5.1 395 | # via html5lib 396 | wrapt==1.17.2 397 | # via 398 | # deprecated 399 | # unstructured 400 | xlrd==2.0.1 401 | # via unstructured 402 | xlsxwriter==3.2.2 403 | # via python-pptx 404 | 405 | # The following packages are considered to be unsafe in a requirements file: 406 | # setuptools 407 | -------------------------------------------------------------------------------- /requirements/constraints.in: -------------------------------------------------------------------------------- 1 | #################################################################################################### 2 | # This file can house global constraints that aren't *direct* requirements of the 
package or any 3 | # extras. Putting a dependency here will only affect dependency sets that contain them -- in other 4 | # words, if something does not require a constraint, it will not be installed. 5 | #################################################################################################### 6 | numpy<2.0.0 7 | # later versions of Starlette break middleware 8 | starlette==0.41.2 -------------------------------------------------------------------------------- /requirements/test.in: -------------------------------------------------------------------------------- 1 | -c constraints.in 2 | black 3 | # NOTE(mrobinson) - Pinning click due to a unicode issue in black 4 | # can remove after black drops support for Python 3.6 5 | # ref: https://github.com/psf/black/issues/2964 6 | click==8.1.3 7 | flake8 8 | mypy 9 | pytest-cov 10 | pytest-mock 11 | nbdev 12 | jupyter 13 | httpx 14 | deepdiff 15 | -------------------------------------------------------------------------------- /requirements/test.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.12 3 | # by the following command: 4 | # 5 | # pip-compile --output-file=requirements/test.txt requirements/base.txt requirements/test.in 6 | # 7 | aiofiles==24.1.0 8 | # via 9 | # -r requirements/base.txt 10 | # unstructured-client 11 | annotated-types==0.7.0 12 | # via 13 | # -r requirements/base.txt 14 | # pydantic 15 | antlr4-python3-runtime==4.9.3 16 | # via 17 | # -r requirements/base.txt 18 | # omegaconf 19 | anyio==4.8.0 20 | # via 21 | # -r requirements/base.txt 22 | # httpx 23 | # jupyter-server 24 | # starlette 25 | appnope==0.1.4 26 | # via ipykernel 27 | argon2-cffi==23.1.0 28 | # via jupyter-server 29 | argon2-cffi-bindings==21.2.0 30 | # via argon2-cffi 31 | arrow==1.3.0 32 | # via isoduration 33 | asttokens==3.0.0 34 | # via 35 | # nbdev 36 | # stack-data 37 | astunparse==1.6.3 38 | # via nbdev 39 | async-lru==2.0.4 40 | # via jupyterlab 41 | attrs==25.1.0 42 | # via 43 | # jsonschema 44 | # referencing 45 | babel==2.16.0 46 | # via jupyterlab-server 47 | backoff==2.2.1 48 | # via 49 | # -r requirements/base.txt 50 | # unstructured 51 | beautifulsoup4==4.12.3 52 | # via 53 | # -r requirements/base.txt 54 | # nbconvert 55 | # unstructured 56 | black==25.1.0 57 | # via -r requirements/test.in 58 | bleach[css]==6.2.0 59 | # via nbconvert 60 | cachetools==5.5.1 61 | # via 62 | # -r requirements/base.txt 63 | # google-auth 64 | certifi==2024.12.14 65 | # via 66 | # -r requirements/base.txt 67 | # httpcore 68 | # httpx 69 | # requests 70 | cffi==1.17.1 71 | # via 72 | # -r requirements/base.txt 73 | # argon2-cffi-bindings 74 | # cryptography 75 | chardet==5.2.0 76 | # via 77 | # -r requirements/base.txt 78 | # unstructured 79 | charset-normalizer==3.4.1 80 | # via 81 | # -r requirements/base.txt 82 | # pdfminer-six 83 | # requests 84 | click==8.1.3 85 | # via 86 | # -r requirements/base.txt 87 | # -r requirements/test.in 88 | # black 89 | # nltk 90 | # python-oxmsg 91 | # uvicorn 92 | coloredlogs==15.0.1 93 | # via 94 | # -r requirements/base.txt 95 | # onnxruntime 96 | comm==0.2.2 97 | # via 98 | # ipykernel 99 | # ipywidgets 100 | contourpy==1.3.1 101 | # via 102 | # -r requirements/base.txt 103 | # matplotlib 104 | coverage[toml]==7.6.10 105 | # via pytest-cov 106 | cryptography==44.0.1 107 | # via 108 | # -r requirements/base.txt 109 | # pdfminer-six 110 | # unstructured-client 111 | cycler==0.12.1 112 | # via 113 | # -r 
requirements/base.txt 114 | # matplotlib 115 | dataclasses-json==0.6.7 116 | # via 117 | # -r requirements/base.txt 118 | # unstructured 119 | debugpy==1.8.12 120 | # via ipykernel 121 | decorator==5.1.1 122 | # via ipython 123 | deepdiff==8.1.1 124 | # via -r requirements/test.in 125 | defusedxml==0.7.1 126 | # via nbconvert 127 | deprecated==1.2.18 128 | # via 129 | # -r requirements/base.txt 130 | # pikepdf 131 | effdet==0.4.1 132 | # via 133 | # -r requirements/base.txt 134 | # unstructured 135 | emoji==2.14.1 136 | # via 137 | # -r requirements/base.txt 138 | # unstructured 139 | et-xmlfile==2.0.0 140 | # via 141 | # -r requirements/base.txt 142 | # openpyxl 143 | eval-type-backport==0.2.2 144 | # via 145 | # -r requirements/base.txt 146 | # unstructured-client 147 | execnb==0.1.11 148 | # via nbdev 149 | executing==2.2.0 150 | # via stack-data 151 | fastapi==0.115.8 152 | # via -r requirements/base.txt 153 | fastcore==1.7.28 154 | # via 155 | # execnb 156 | # ghapi 157 | # nbdev 158 | fastjsonschema==2.21.1 159 | # via nbformat 160 | filelock==3.17.0 161 | # via 162 | # -r requirements/base.txt 163 | # huggingface-hub 164 | # torch 165 | # transformers 166 | filetype==1.2.0 167 | # via 168 | # -r requirements/base.txt 169 | # unstructured 170 | flake8==7.1.1 171 | # via -r requirements/test.in 172 | flatbuffers==25.1.24 173 | # via 174 | # -r requirements/base.txt 175 | # onnxruntime 176 | fonttools==4.55.8 177 | # via 178 | # -r requirements/base.txt 179 | # matplotlib 180 | fqdn==1.5.1 181 | # via jsonschema 182 | fsspec==2024.12.0 183 | # via 184 | # -r requirements/base.txt 185 | # huggingface-hub 186 | # torch 187 | ghapi==1.0.6 188 | # via nbdev 189 | google-api-core[grpc]==2.24.1 190 | # via 191 | # -r requirements/base.txt 192 | # google-cloud-vision 193 | google-auth==2.38.0 194 | # via 195 | # -r requirements/base.txt 196 | # google-api-core 197 | # google-cloud-vision 198 | google-cloud-vision==3.9.0 199 | # via 200 | # -r requirements/base.txt 201 | # unstructured 202 | googleapis-common-protos==1.66.0 203 | # via 204 | # -r requirements/base.txt 205 | # google-api-core 206 | # grpcio-status 207 | grpcio==1.70.0 208 | # via 209 | # -r requirements/base.txt 210 | # google-api-core 211 | # grpcio-status 212 | grpcio-status==1.70.0 213 | # via 214 | # -r requirements/base.txt 215 | # google-api-core 216 | h11==0.16.0 217 | # via 218 | # -r requirements/base.txt 219 | # httpcore 220 | # uvicorn 221 | html5lib==1.1 222 | # via 223 | # -r requirements/base.txt 224 | # unstructured 225 | httpcore==1.0.9 226 | # via 227 | # -r requirements/base.txt 228 | # httpx 229 | httpx==0.28.1 230 | # via 231 | # -r requirements/base.txt 232 | # -r requirements/test.in 233 | # jupyterlab 234 | # unstructured-client 235 | huggingface-hub==0.32.1 236 | # via 237 | # -r requirements/base.txt 238 | # timm 239 | # tokenizers 240 | # transformers 241 | # unstructured-inference 242 | humanfriendly==10.0 243 | # via 244 | # -r requirements/base.txt 245 | # coloredlogs 246 | idna==3.10 247 | # via 248 | # -r requirements/base.txt 249 | # anyio 250 | # httpx 251 | # jsonschema 252 | # requests 253 | iniconfig==2.0.0 254 | # via pytest 255 | ipykernel==6.29.5 256 | # via 257 | # jupyter 258 | # jupyter-console 259 | # jupyterlab 260 | ipython==8.31.0 261 | # via 262 | # execnb 263 | # ipykernel 264 | # ipywidgets 265 | # jupyter-console 266 | ipywidgets==8.1.5 267 | # via jupyter 268 | isoduration==20.11.0 269 | # via jsonschema 270 | jedi==0.19.2 271 | # via ipython 272 | jinja2==3.1.6 273 | # via 274 
| # -r requirements/base.txt 275 | # jupyter-server 276 | # jupyterlab 277 | # jupyterlab-server 278 | # nbconvert 279 | # torch 280 | joblib==1.4.2 281 | # via 282 | # -r requirements/base.txt 283 | # nltk 284 | json5==0.10.0 285 | # via jupyterlab-server 286 | jsonpath-python==1.0.6 287 | # via 288 | # -r requirements/base.txt 289 | # unstructured-client 290 | jsonpointer==3.0.0 291 | # via jsonschema 292 | jsonschema[format-nongpl]==4.23.0 293 | # via 294 | # jupyter-events 295 | # jupyterlab-server 296 | # nbformat 297 | jsonschema-specifications==2024.10.1 298 | # via jsonschema 299 | jupyter==1.1.1 300 | # via -r requirements/test.in 301 | jupyter-client==8.6.3 302 | # via 303 | # ipykernel 304 | # jupyter-console 305 | # jupyter-server 306 | # nbclient 307 | jupyter-console==6.6.3 308 | # via jupyter 309 | jupyter-core==5.7.2 310 | # via 311 | # ipykernel 312 | # jupyter-client 313 | # jupyter-console 314 | # jupyter-server 315 | # jupyterlab 316 | # nbclient 317 | # nbconvert 318 | # nbformat 319 | jupyter-events==0.11.0 320 | # via jupyter-server 321 | jupyter-lsp==2.2.5 322 | # via jupyterlab 323 | jupyter-server==2.15.0 324 | # via 325 | # jupyter-lsp 326 | # jupyterlab 327 | # jupyterlab-server 328 | # notebook 329 | # notebook-shim 330 | jupyter-server-terminals==0.5.3 331 | # via jupyter-server 332 | jupyterlab==4.3.5 333 | # via 334 | # jupyter 335 | # notebook 336 | jupyterlab-pygments==0.3.0 337 | # via nbconvert 338 | jupyterlab-server==2.27.3 339 | # via 340 | # jupyterlab 341 | # notebook 342 | jupyterlab-widgets==3.0.13 343 | # via ipywidgets 344 | kiwisolver==1.4.8 345 | # via 346 | # -r requirements/base.txt 347 | # matplotlib 348 | langdetect==1.0.9 349 | # via 350 | # -r requirements/base.txt 351 | # unstructured 352 | lxml==5.3.0 353 | # via 354 | # -r requirements/base.txt 355 | # pikepdf 356 | # python-docx 357 | # python-pptx 358 | # unstructured 359 | markdown==3.7 360 | # via 361 | # -r requirements/base.txt 362 | # unstructured 363 | markupsafe==3.0.2 364 | # via 365 | # -r requirements/base.txt 366 | # jinja2 367 | # nbconvert 368 | marshmallow==3.26.0 369 | # via 370 | # -r requirements/base.txt 371 | # dataclasses-json 372 | matplotlib==3.10.0 373 | # via 374 | # -r requirements/base.txt 375 | # pycocotools 376 | # unstructured-inference 377 | matplotlib-inline==0.1.7 378 | # via 379 | # ipykernel 380 | # ipython 381 | mccabe==0.7.0 382 | # via flake8 383 | mistune==3.1.1 384 | # via nbconvert 385 | mpmath==1.3.0 386 | # via 387 | # -r requirements/base.txt 388 | # sympy 389 | mypy==1.14.1 390 | # via -r requirements/test.in 391 | mypy-extensions==1.0.0 392 | # via 393 | # -r requirements/base.txt 394 | # black 395 | # mypy 396 | # typing-inspect 397 | nbclient==0.10.2 398 | # via nbconvert 399 | nbconvert==7.16.6 400 | # via 401 | # jupyter 402 | # jupyter-server 403 | nbdev==2.3.34 404 | # via -r requirements/test.in 405 | nbformat==5.10.4 406 | # via 407 | # jupyter-server 408 | # nbclient 409 | # nbconvert 410 | nest-asyncio==1.6.0 411 | # via 412 | # -r requirements/base.txt 413 | # ipykernel 414 | # unstructured-client 415 | networkx==3.4.2 416 | # via 417 | # -r requirements/base.txt 418 | # torch 419 | # unstructured 420 | nltk==3.9.1 421 | # via 422 | # -r requirements/base.txt 423 | # unstructured 424 | notebook==7.3.2 425 | # via jupyter 426 | notebook-shim==0.2.4 427 | # via 428 | # jupyterlab 429 | # notebook 430 | numpy==1.26.4 431 | # via 432 | # -c requirements/constraints.in 433 | # -r requirements/base.txt 434 | # contourpy 435 | # 
matplotlib 436 | # onnx 437 | # onnxruntime 438 | # opencv-python 439 | # pandas 440 | # pycocotools 441 | # scipy 442 | # torchvision 443 | # transformers 444 | # unstructured 445 | # unstructured-inference 446 | olefile==0.47 447 | # via 448 | # -r requirements/base.txt 449 | # python-oxmsg 450 | omegaconf==2.3.0 451 | # via 452 | # -r requirements/base.txt 453 | # effdet 454 | onnx==1.17.0 455 | # via 456 | # -r requirements/base.txt 457 | # unstructured 458 | # unstructured-inference 459 | onnxruntime==1.20.1 460 | # via 461 | # -r requirements/base.txt 462 | # unstructured-inference 463 | opencv-python==4.11.0.86 464 | # via 465 | # -r requirements/base.txt 466 | # unstructured-inference 467 | openpyxl==3.1.5 468 | # via 469 | # -r requirements/base.txt 470 | # unstructured 471 | orderly-set==5.2.3 472 | # via deepdiff 473 | overrides==7.7.0 474 | # via jupyter-server 475 | packaging==24.2 476 | # via 477 | # -r requirements/base.txt 478 | # black 479 | # fastcore 480 | # ghapi 481 | # huggingface-hub 482 | # ipykernel 483 | # jupyter-server 484 | # jupyterlab 485 | # jupyterlab-server 486 | # marshmallow 487 | # matplotlib 488 | # nbconvert 489 | # nbdev 490 | # onnxruntime 491 | # pikepdf 492 | # pytest 493 | # transformers 494 | # unstructured-pytesseract 495 | pandas==2.2.3 496 | # via 497 | # -r requirements/base.txt 498 | # unstructured 499 | # unstructured-inference 500 | pandocfilters==1.5.1 501 | # via nbconvert 502 | parso==0.8.4 503 | # via jedi 504 | pathspec==0.12.1 505 | # via black 506 | pdf2image==1.17.0 507 | # via 508 | # -r requirements/base.txt 509 | # unstructured 510 | pdfminer-six==20240706 511 | # via 512 | # -r requirements/base.txt 513 | # unstructured 514 | # unstructured-inference 515 | pexpect==4.9.0 516 | # via ipython 517 | pi-heif==0.21.0 518 | # via 519 | # -r requirements/base.txt 520 | # unstructured 521 | pikepdf==9.5.1 522 | # via 523 | # -r requirements/base.txt 524 | # unstructured 525 | pillow==11.1.0 526 | # via 527 | # -r requirements/base.txt 528 | # matplotlib 529 | # pdf2image 530 | # pi-heif 531 | # pikepdf 532 | # python-pptx 533 | # torchvision 534 | # unstructured-pytesseract 535 | platformdirs==4.3.6 536 | # via 537 | # black 538 | # jupyter-core 539 | pluggy==1.5.0 540 | # via pytest 541 | prometheus-client==0.21.1 542 | # via jupyter-server 543 | prompt-toolkit==3.0.50 544 | # via 545 | # ipython 546 | # jupyter-console 547 | proto-plus==1.26.0 548 | # via 549 | # -r requirements/base.txt 550 | # google-api-core 551 | # google-cloud-vision 552 | protobuf==5.29.3 553 | # via 554 | # -r requirements/base.txt 555 | # google-api-core 556 | # google-cloud-vision 557 | # googleapis-common-protos 558 | # grpcio-status 559 | # onnx 560 | # onnxruntime 561 | # proto-plus 562 | psutil==6.1.1 563 | # via 564 | # -r requirements/base.txt 565 | # ipykernel 566 | # unstructured 567 | ptyprocess==0.7.0 568 | # via 569 | # pexpect 570 | # terminado 571 | pure-eval==0.2.3 572 | # via stack-data 573 | pyasn1==0.6.1 574 | # via 575 | # -r requirements/base.txt 576 | # pyasn1-modules 577 | # rsa 578 | pyasn1-modules==0.4.1 579 | # via 580 | # -r requirements/base.txt 581 | # google-auth 582 | pycocotools==2.0.8 583 | # via 584 | # -r requirements/base.txt 585 | # effdet 586 | pycodestyle==2.12.1 587 | # via flake8 588 | pycparser==2.22 589 | # via 590 | # -r requirements/base.txt 591 | # cffi 592 | pycryptodome==3.21.0 593 | # via -r requirements/base.txt 594 | pydantic==2.10.6 595 | # via 596 | # -r requirements/base.txt 597 | # fastapi 598 | # 
unstructured-client 599 | pydantic-core==2.27.2 600 | # via 601 | # -r requirements/base.txt 602 | # pydantic 603 | pyflakes==3.2.0 604 | # via flake8 605 | pygments==2.19.1 606 | # via 607 | # ipython 608 | # jupyter-console 609 | # nbconvert 610 | pypandoc==1.15 611 | # via 612 | # -r requirements/base.txt 613 | # unstructured 614 | pyparsing==3.2.1 615 | # via 616 | # -r requirements/base.txt 617 | # matplotlib 618 | pypdf==5.2.0 619 | # via 620 | # -r requirements/base.txt 621 | # unstructured 622 | # unstructured-client 623 | pypdfium2==4.30.1 624 | # via 625 | # -r requirements/base.txt 626 | # unstructured-inference 627 | pytest==8.3.4 628 | # via 629 | # pytest-cov 630 | # pytest-mock 631 | pytest-cov==6.0.0 632 | # via -r requirements/test.in 633 | pytest-mock==3.14.0 634 | # via -r requirements/test.in 635 | python-dateutil==2.9.0.post0 636 | # via 637 | # -r requirements/base.txt 638 | # arrow 639 | # jupyter-client 640 | # matplotlib 641 | # pandas 642 | # unstructured-client 643 | python-docx==1.1.2 644 | # via 645 | # -r requirements/base.txt 646 | # unstructured 647 | python-iso639==2025.1.28 648 | # via 649 | # -r requirements/base.txt 650 | # unstructured 651 | python-json-logger==3.2.1 652 | # via jupyter-events 653 | python-magic==0.4.27 654 | # via 655 | # -r requirements/base.txt 656 | # unstructured 657 | python-multipart==0.0.20 658 | # via 659 | # -r requirements/base.txt 660 | # unstructured-inference 661 | python-oxmsg==0.0.1 662 | # via 663 | # -r requirements/base.txt 664 | # unstructured 665 | python-pptx==1.0.2 666 | # via 667 | # -r requirements/base.txt 668 | # unstructured 669 | pytz==2024.2 670 | # via 671 | # -r requirements/base.txt 672 | # pandas 673 | pyyaml==6.0.2 674 | # via 675 | # -r requirements/base.txt 676 | # huggingface-hub 677 | # jupyter-events 678 | # nbdev 679 | # omegaconf 680 | # timm 681 | # transformers 682 | pyzmq==26.2.1 683 | # via 684 | # ipykernel 685 | # jupyter-client 686 | # jupyter-console 687 | # jupyter-server 688 | rapidfuzz==3.12.1 689 | # via 690 | # -r requirements/base.txt 691 | # unstructured 692 | # unstructured-inference 693 | ratelimit==2.2.1 694 | # via -r requirements/base.txt 695 | referencing==0.36.2 696 | # via 697 | # jsonschema 698 | # jsonschema-specifications 699 | # jupyter-events 700 | regex==2024.11.6 701 | # via 702 | # -r requirements/base.txt 703 | # nltk 704 | # transformers 705 | requests==2.32.3 706 | # via 707 | # -r requirements/base.txt 708 | # google-api-core 709 | # huggingface-hub 710 | # jupyterlab-server 711 | # requests-toolbelt 712 | # transformers 713 | # unstructured 714 | requests-toolbelt==1.0.0 715 | # via 716 | # -r requirements/base.txt 717 | # unstructured-client 718 | rfc3339-validator==0.1.4 719 | # via 720 | # jsonschema 721 | # jupyter-events 722 | rfc3986-validator==0.1.1 723 | # via 724 | # jsonschema 725 | # jupyter-events 726 | rpds-py==0.22.3 727 | # via 728 | # jsonschema 729 | # referencing 730 | rsa==4.9 731 | # via 732 | # -r requirements/base.txt 733 | # google-auth 734 | safetensors==0.5.2 735 | # via 736 | # -r requirements/base.txt 737 | # timm 738 | # transformers 739 | scipy==1.15.1 740 | # via 741 | # -r requirements/base.txt 742 | # unstructured-inference 743 | send2trash==1.8.3 744 | # via jupyter-server 745 | six==1.17.0 746 | # via 747 | # -r requirements/base.txt 748 | # astunparse 749 | # html5lib 750 | # langdetect 751 | # python-dateutil 752 | # rfc3339-validator 753 | sniffio==1.3.1 754 | # via 755 | # -r requirements/base.txt 756 | # anyio 757 | 
soupsieve==2.6 758 | # via 759 | # -r requirements/base.txt 760 | # beautifulsoup4 761 | stack-data==0.6.3 762 | # via ipython 763 | starlette==0.41.2 764 | # via 765 | # -c requirements/constraints.in 766 | # -r requirements/base.txt 767 | # fastapi 768 | sympy==1.13.3 769 | # via 770 | # -r requirements/base.txt 771 | # onnxruntime 772 | # torch 773 | terminado==0.18.1 774 | # via 775 | # jupyter-server 776 | # jupyter-server-terminals 777 | timm==1.0.14 778 | # via 779 | # -r requirements/base.txt 780 | # effdet 781 | # unstructured-inference 782 | tinycss2==1.4.0 783 | # via bleach 784 | tokenizers==0.21.0 785 | # via 786 | # -r requirements/base.txt 787 | # transformers 788 | torch==2.7.0 789 | # via 790 | # -r requirements/base.txt 791 | # effdet 792 | # timm 793 | # torchvision 794 | # unstructured-inference 795 | torchvision==0.22.0 796 | # via 797 | # -r requirements/base.txt 798 | # effdet 799 | # timm 800 | tornado==6.5.0 801 | # via 802 | # ipykernel 803 | # jupyter-client 804 | # jupyter-server 805 | # jupyterlab 806 | # notebook 807 | # terminado 808 | tqdm==4.67.1 809 | # via 810 | # -r requirements/base.txt 811 | # huggingface-hub 812 | # nltk 813 | # transformers 814 | # unstructured 815 | traitlets==5.14.3 816 | # via 817 | # comm 818 | # ipykernel 819 | # ipython 820 | # ipywidgets 821 | # jupyter-client 822 | # jupyter-console 823 | # jupyter-core 824 | # jupyter-events 825 | # jupyter-server 826 | # jupyterlab 827 | # matplotlib-inline 828 | # nbclient 829 | # nbconvert 830 | # nbformat 831 | transformers==4.50.0 832 | # via 833 | # -r requirements/base.txt 834 | # unstructured-inference 835 | types-python-dateutil==2.9.0.20241206 836 | # via arrow 837 | typing-extensions==4.12.2 838 | # via 839 | # -r requirements/base.txt 840 | # anyio 841 | # fastapi 842 | # huggingface-hub 843 | # mypy 844 | # pydantic 845 | # pydantic-core 846 | # python-docx 847 | # python-oxmsg 848 | # python-pptx 849 | # referencing 850 | # torch 851 | # typing-inspect 852 | # unstructured 853 | typing-inspect==0.9.0 854 | # via 855 | # -r requirements/base.txt 856 | # dataclasses-json 857 | # unstructured-client 858 | tzdata==2025.1 859 | # via 860 | # -r requirements/base.txt 861 | # pandas 862 | unstructured[all-docs]==0.16.17 863 | # via -r requirements/base.txt 864 | unstructured-client==0.29.0 865 | # via 866 | # -r requirements/base.txt 867 | # unstructured 868 | unstructured-inference==0.8.6 869 | # via 870 | # -r requirements/base.txt 871 | # unstructured 872 | unstructured-pytesseract==0.3.13 873 | # via 874 | # -r requirements/base.txt 875 | # unstructured 876 | uri-template==1.3.0 877 | # via jsonschema 878 | urllib3==2.3.0 879 | # via 880 | # -r requirements/base.txt 881 | # requests 882 | uvicorn==0.34.0 883 | # via -r requirements/base.txt 884 | watchdog==6.0.0 885 | # via nbdev 886 | wcwidth==0.2.13 887 | # via prompt-toolkit 888 | webcolors==24.11.1 889 | # via jsonschema 890 | webencodings==0.5.1 891 | # via 892 | # -r requirements/base.txt 893 | # bleach 894 | # html5lib 895 | # tinycss2 896 | websocket-client==1.8.0 897 | # via jupyter-server 898 | wheel==0.45.1 899 | # via astunparse 900 | widgetsnbextension==4.0.13 901 | # via ipywidgets 902 | wrapt==1.17.2 903 | # via 904 | # -r requirements/base.txt 905 | # deprecated 906 | # unstructured 907 | xlrd==2.0.1 908 | # via 909 | # -r requirements/base.txt 910 | # unstructured 911 | xlsxwriter==3.2.2 912 | # via 913 | # -r requirements/base.txt 914 | # python-pptx 915 | 916 | # The following packages are considered to be 
unsafe in a requirements file: 917 | # setuptools 918 | -------------------------------------------------------------------------------- /sample-docs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/.gitkeep -------------------------------------------------------------------------------- /sample-docs/DA-1p-with-duplicate-pages.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/DA-1p-with-duplicate-pages.pdf -------------------------------------------------------------------------------- /sample-docs/DA-1p.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/DA-1p.bmp -------------------------------------------------------------------------------- /sample-docs/DA-1p.heic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/DA-1p.heic -------------------------------------------------------------------------------- /sample-docs/README.md: -------------------------------------------------------------------------------- 1 | ## Example Docs 2 | 3 | The sample docs directory contains the following files: 4 | 5 | - `example-10k.html` - A 10-K SEC filing in HTML format 6 | - `layout-parser-paper.pdf` - A PDF copy of the layout parser paper 7 | - `factbook.xml`/`factbook.xsl` - Example XML/XLS files that you can use to test stylesheets 8 | 9 | These documents can be used to test out the parsers in the library. In addition, here are 10 | instructions for pulling in some sample docs that are too big to store in the repo. 11 | 12 | #### XBRL 10-K 13 | 14 | You can get an example 10-K in inline XBRL format using the following `curl`. Note, you need 15 | to have the user agent set in the header or the SEC site will reject your request. 16 | 17 | ```bash 18 | curl -O \ 19 | -A '${organization} ${email}' 20 | https://www.sec.gov/Archives/edgar/data/311094/000117184321001344/0001171843-21-001344.txt 21 | ``` 22 | 23 | You can parse this document using the HTML parser. 24 | -------------------------------------------------------------------------------- /sample-docs/README.rst: -------------------------------------------------------------------------------- 1 | Example Docs 2 | ------------ 3 | 4 | The sample docs directory contains the following files: 5 | 6 | - ``example-10k.html`` - A 10-K SEC filing in HTML format 7 | - ``layout-parser-paper.pdf`` - A PDF copy of the layout parser paper 8 | - ``factbook.xml``/``factbook.xsl`` - Example XML/XLS files that you 9 | can use to test stylesheets 10 | 11 | These documents can be used to test out the parsers in the library. In 12 | addition, here are instructions for pulling in some sample docs that are 13 | too big to store in the repo. 14 | 15 | XBRL 10-K 16 | ^^^^^^^^^ 17 | 18 | You can get an example 10-K in inline XBRL format using the following 19 | ``curl``. Note, you need to have the user agent set in the header or the 20 | SEC site will reject your request. 21 | 22 | .. 
code:: bash 23 | 24 | curl -O \ 25 | -A '${organization} ${email}' 26 | https://www.sec.gov/Archives/edgar/data/311094/000117184321001344/0001171843-21-001344.txt 27 | 28 | You can parse this document using the HTML parser. 29 | -------------------------------------------------------------------------------- /sample-docs/alert.eml: -------------------------------------------------------------------------------- 1 | MIME-Version: 1.0 2 | Date: Wed, 21 Dec 2022 09:55:33 -0600 3 | Message-ID: 4 | Subject: ALERT: Stolen Lunch 5 | From: Mallori Harrell 6 | To: Mallori Harrell 7 | Content-Type: multipart/alternative; boundary="0000000000002f0ea105f0589582" 8 | 9 | --0000000000002f0ea105f0589582 10 | Content-Type: text/plain; charset="UTF-8" 11 | 12 | Hi, 13 | 14 | It has come to our attention that as of 9:00am this morning, Harold's lunch 15 | is missing. If this was done in error please return the lunch immediately 16 | to the fridge on the 2nd floor by noon. 17 | 18 | If the lunch has not been returned by noon, we will be reviewing camera 19 | footage to determine who stole Harold's lunch. 20 | 21 | The perpetrators will be PUNISHED to the full extent of our employee code 22 | of conduct handbook. 23 | 24 | Thank you for your time, 25 | 26 | -- 27 | Mallori Harrell 28 | Unstructured Technologies 29 | Data Scientist 30 | 31 | --0000000000002f0ea105f0589582 32 | Content-Type: text/html; charset="UTF-8" 33 | Content-Transfer-Encoding: quoted-printable 34 | 35 |
Hi,

It has come to our atten= 36 | tion that as of 9:00am this morning, Harold's lunch is missing. If this= 37 | was done in error please return the lunch immediately to the fridge on the= 38 | 2nd floor by noon.

If the lunch has not been retu= 39 | rned by noon, we will be reviewing camera footage to determine who stole Ha= 40 | rold's lunch.

The perpetrators=C2=A0will be PU= 41 | NISHED to the full extent of our employee code of conduct handbook.

Thank you for your time,

--
Mallori Harrell
Unstructured Technologies
Data S= 45 | cientist

46 | 47 | --0000000000002f0ea105f0589582-- -------------------------------------------------------------------------------- /sample-docs/announcement.eml: -------------------------------------------------------------------------------- 1 | MIME-Version: 1.0 2 | Date: Wed, 21 Dec 2022 11:09:08 -0600 3 | Message-ID: 4 | Subject: ANNOUNCEMENT: The holidays are coming! 5 | From: Mallori Harrell 6 | To: Mallori Harrell 7 | Content-Type: multipart/alternative; boundary="00000000000054448805f0599c48" 8 | 9 | --00000000000054448805f0599c48 10 | Content-Type: text/plain; charset="UTF-8" 11 | 12 | To All, 13 | 14 | As the holiday approaches, be sure to let your manager and team know the 15 | following: 16 | 17 | - Your days off 18 | - The location of your work's documentation 19 | - How to reach you or your secondary in case of an emergency 20 | 21 | 22 | Hope you all have a Happy Holidays! 23 | 24 | Best, 25 | 26 | -- 27 | Mallori Harrell 28 | Unstructured Technologies 29 | Data Scientist 30 | 31 | --00000000000054448805f0599c48 32 | Content-Type: text/html; charset="UTF-8" 33 | Content-Transfer-Encoding: quoted-printable 34 | 35 |
To All,

As the holiday approaches, be s= 36 | ure to let your manager and team know the following:
  • Your= 37 | days off
  • The location of your work's documentation
  • How= 38 | to reach you or your secondary in case of an emergency

Hope you all have a Happy Holidays!

Best,

--
Mallori Harrell
= 42 | Unstructured Technologies
Data Scientist

= 43 |
44 | 45 | --00000000000054448805f0599c48-- -------------------------------------------------------------------------------- /sample-docs/embedded-images-tables.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/embedded-images-tables.jpg -------------------------------------------------------------------------------- /sample-docs/embedded-images-tables.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/embedded-images-tables.pdf -------------------------------------------------------------------------------- /sample-docs/english-and-korean.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/english-and-korean.png -------------------------------------------------------------------------------- /sample-docs/fake-doc.rtf: -------------------------------------------------------------------------------- 1 | {\pard \ql \f0 \sa180 \li0 \fi0 \outlinelevel0 \b \fs36 My First Heading\par} 2 | {\pard \ql \f0 \sa180 \li0 \fi0 My first paragraph.\par} 3 | -------------------------------------------------------------------------------- /sample-docs/fake-email-attachment.eml: -------------------------------------------------------------------------------- 1 | MIME-Version: 1.0 2 | Date: Fri, 23 Dec 2022 12:08:48 -0600 3 | Message-ID: 4 | Subject: Fake email with attachment 5 | From: Mallori Harrell 6 | To: Mallori Harrell 7 | Content-Type: multipart/mixed; boundary="0000000000005d654405f082adb7" 8 | 9 | --0000000000005d654405f082adb7 10 | Content-Type: multipart/alternative; boundary="0000000000005d654205f082adb5" 11 | 12 | --0000000000005d654205f082adb5 13 | Content-Type: text/plain; charset="UTF-8" 14 | 15 | Hello! 16 | 17 | Here's the attachments! 18 | 19 | It includes: 20 | 21 | - Lots of whitespace 22 | - Little to no content 23 | - and is a quick read 24 | 25 | Best, 26 | 27 | Mallori 28 | 29 | --0000000000005d654205f082adb5 30 | Content-Type: text/html; charset="UTF-8" 31 | Content-Transfer-Encoding: quoted-printable 32 | 33 |
Hello!=C2=A0

Here's the attachments= 34 | !

It includes:
  • Lots of whitespace
  • Little=C2= 36 | =A0to no content
  • and is a quick read
Best,

Mallori

40 | 41 | --0000000000005d654205f082adb5-- 42 | --0000000000005d654405f082adb7 43 | Content-Type: text/plain; charset="US-ASCII"; name="fake-attachment.txt" 44 | Content-Disposition: attachment; filename="fake-attachment.txt" 45 | Content-Transfer-Encoding: base64 46 | X-Attachment-Id: f_lc0tto5j0 47 | Content-ID: 48 | 49 | SGV5IHRoaXMgaXMgYSBmYWtlIGF0dGFjaG1lbnQh 50 | --0000000000005d654405f082adb7-- -------------------------------------------------------------------------------- /sample-docs/fake-email.eml: -------------------------------------------------------------------------------- 1 | MIME-Version: 1.0 2 | Date: Fri, 16 Dec 2022 17:04:16 -0500 3 | Message-ID: 4 | Subject: Test Email 5 | From: Matthew Robinson 6 | To: Matthew Robinson 7 | Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630" 8 | 9 | --00000000000095c9b205eff92630 10 | Content-Type: text/plain; charset="UTF-8" 11 | 12 | This is a test email to use for unit tests. 13 | 14 | Important points: 15 | 16 | - Roses are red 17 | - Violets are blue 18 | 19 | --00000000000095c9b205eff92630 20 | Content-Type: text/html; charset="UTF-8" 21 | 22 |
This is a test email to use for unit tests.

Important points:
  • Roses are red
  • Violets are blue
23 | 24 | --00000000000095c9b205eff92630-- -------------------------------------------------------------------------------- /sample-docs/fake-email.msg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/fake-email.msg -------------------------------------------------------------------------------- /sample-docs/fake-html.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

My First Heading

6 |

My first paragraph.

7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /sample-docs/fake-power-point.ppt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/fake-power-point.ppt -------------------------------------------------------------------------------- /sample-docs/fake-power-point.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/fake-power-point.pptx -------------------------------------------------------------------------------- /sample-docs/fake-text-utf-32.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/fake-text-utf-32.txt -------------------------------------------------------------------------------- /sample-docs/fake-text.txt: -------------------------------------------------------------------------------- 1 | This is a test document to use for unit tests. 2 | 3 | Important points: 4 | 5 | - Hamburgers are delicious 6 | - Dogs are the best 7 | - I love fuzzy blankets -------------------------------------------------------------------------------- /sample-docs/fake-xml.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | United States 5 | Washington, DC 6 | Joe Biden 7 | Baseball 8 | 9 | 10 | Canada 11 | Ottawa 12 | Justin Trudeau 13 | Hockey 14 | 15 | 16 | France 17 | Paris 18 | Emmanuel Macron 19 | Soccer 20 | 21 | 22 | Trinidad & Tobado 23 | Port of Spain 24 | Keith Rowley 25 | Track & Field 26 | 27 | 28 | Trinidad & Tobado 29 | Port of Spain 30 | Keith Rowley 31 | Track & Field 32 | 33 | 34 | -------------------------------------------------------------------------------- /sample-docs/fake.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/fake.doc -------------------------------------------------------------------------------- /sample-docs/fake.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/fake.docx -------------------------------------------------------------------------------- /sample-docs/fake.odt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/fake.odt -------------------------------------------------------------------------------- /sample-docs/family-day.eml: -------------------------------------------------------------------------------- 1 | MIME-Version: 1.0 2 | Date: Wed, 21 Dec 2022 10:28:53 -0600 3 | Message-ID: 4 | Subject: Family Day 5 | From: Mallori Harrell 6 | To: Mallori Harrell 7 | Content-Type: multipart/alternative; boundary="0000000000005c115405f0590ce4" 8 | 9 | --0000000000005c115405f0590ce4 10 | Content-Type: text/plain; charset="UTF-8" 11 | 12 | Hi All, 13 | 14 | Get excited for our first annual family day! 
15 | 16 | There will be face painting, a petting zoo, funnel cake and more. 17 | 18 | Make sure to RSVP! 19 | 20 | Best. 21 | 22 | -- 23 | Mallori Harrell 24 | Unstructured Technologies 25 | Data Scientist 26 | 27 | --0000000000005c115405f0590ce4 28 | Content-Type: text/html; charset="UTF-8" 29 | Content-Transfer-Encoding: quoted-printable 30 | 31 |
Hi All,

Get excited for our first annua= 32 | l family day!=C2=A0

There will be face painting, = 33 | a petting zoo, funnel cake and more.

Make sure to = 34 | RSVP!

Best.

--
Mallori Harrell
Unstructured Technologies
= 37 | Data Scientist

38 | 39 | --0000000000005c115405f0590ce4-- -------------------------------------------------------------------------------- /sample-docs/layout-parser-paper-fast.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/layout-parser-paper-fast.jpg -------------------------------------------------------------------------------- /sample-docs/layout-parser-paper-fast.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/layout-parser-paper-fast.pdf -------------------------------------------------------------------------------- /sample-docs/layout-parser-paper-fast.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/layout-parser-paper-fast.tiff -------------------------------------------------------------------------------- /sample-docs/layout-parser-paper-with-table.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/layout-parser-paper-with-table.jpg -------------------------------------------------------------------------------- /sample-docs/layout-parser-paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/layout-parser-paper.pdf -------------------------------------------------------------------------------- /sample-docs/layout-parser-paper.pdf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/layout-parser-paper.pdf.gz -------------------------------------------------------------------------------- /sample-docs/list-item-example.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/list-item-example.pdf -------------------------------------------------------------------------------- /sample-docs/notes.ppt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/notes.ppt -------------------------------------------------------------------------------- /sample-docs/notes.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/notes.pptx -------------------------------------------------------------------------------- /sample-docs/spring-weather.html.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "element_id": "41f6e17bf5e9a407fcca74e902f802a0", 4 | "text": "News Around NOAA", 5 | "type": "Title", 6 | "metadata": { 7 | "page_number": 1 8 | } 9 | }, 10 | { 11 | "element_id": 
"aa589c25dc22dcc8a75baba1244e6c8f", 12 | "text": "National Program", 13 | "type": "Title", 14 | "metadata": { 15 | "page_number": 1 16 | } 17 | }, 18 | { 19 | "element_id": "62c26d2e16774d2334bd804c7bb6a711", 20 | "text": "Are You Weather-Ready for the Spring?", 21 | "type": "Title", 22 | "metadata": { 23 | "page_number": 1 24 | } 25 | }, 26 | { 27 | "element_id": "32709cd3bec72640bbbe32f58e6e23f6", 28 | "text": "Weather.gov >", 29 | "type": "Title", 30 | "metadata": { 31 | "page_number": 1 32 | } 33 | }, 34 | { 35 | "element_id": "2661da76db570876b075083aaeeaee55", 36 | "text": "News Around NOAA > Are You Weather-Ready for the Spring?", 37 | "type": "Title", 38 | "metadata": { 39 | "page_number": 1 40 | } 41 | }, 42 | { 43 | "element_id": "fab6c4df083f0fb6f324fff65b652c86", 44 | "text": "Weather Safety Air Quality Beach Hazards Cold Cold Water Drought Floods Fog Heat Hurricanes Lightning Safety Rip Currents Safe Boating Space Weather Sun (Ultraviolet Radiation) Thunderstorms & Tornadoes Tornado Tsunami Wildfire Wind Winter", 45 | "type": "ListItem", 46 | "metadata": { 47 | "page_number": 1 48 | } 49 | }, 50 | { 51 | "element_id": "45c26cf3457e6d18985a435e2c0fcc65", 52 | "text": "Safety Campaigns Seasonal Safety Campaigns #SafePlaceSelfie Deaf & Hard of Hearing Intellectual Disabilities Spanish-language Content The Great Outdoors", 53 | "type": "ListItem", 54 | "metadata": { 55 | "page_number": 1 56 | } 57 | }, 58 | { 59 | "element_id": "77f5acc603de9a165ed87a5c3fbaf14a", 60 | "text": "Ambassador About WRN Ambassadors Become an Ambassador Ambassadors of Excellence People of WRN FAQS Tell Your Success Story Success Stories Tri-fold Aviation Current Ambassadors Brochure En Español", 61 | "type": "ListItem", 62 | "metadata": { 63 | "page_number": 1 64 | } 65 | }, 66 | { 67 | "element_id": "8f19bcaabbd1bafa5e9826ac69766c8b", 68 | "text": "Education NWS Education Home Be A Force Of Nature WRN Kids Flyer Wireless Emergency Alerts NOAA Weather Radio Mobile Weather Brochures Hourly Weather Forecast Citizen Science Intellectual Disabilities", 69 | "type": "ListItem", 70 | "metadata": { 71 | "page_number": 1 72 | } 73 | }, 74 | { 75 | "element_id": "1245f9cf9e019713391e4ee3bac54a63", 76 | "text": "Collaboration Get Involved Social Media WRN Ambassadors ​ Enterprise Resources StormReady TsunamiReady NWSChat (core partners only) InteractiveNWS (iNWS) (core partners only)​ SKYWARN", 77 | "type": "ListItem", 78 | "metadata": { 79 | "page_number": 1 80 | } 81 | }, 82 | { 83 | "element_id": "23dfa7f98424dbf86e00b3d500096dfa", 84 | "text": "News & Events Latest News Calendar Meetings & Workshops NWS Aware Newsletter", 85 | "type": "ListItem", 86 | "metadata": { 87 | "page_number": 1 88 | } 89 | }, 90 | { 91 | "element_id": "93202df2ec7081b28b47901b5c287a5a", 92 | "text": "International", 93 | "type": "ListItem", 94 | "metadata": { 95 | "page_number": 1 96 | } 97 | }, 98 | { 99 | "element_id": "e53d6a9c615bdf1a8d7b98a67cade488", 100 | "text": "About Contact Us What is WRN? WRN FAQ WRN Brochure Hazard Simplification IDSS Brochure Roadmap Strategic Plan WRN International Social Science", 101 | "type": "ListItem", 102 | "metadata": { 103 | "page_number": 1 104 | } 105 | }, 106 | { 107 | "element_id": "6cbcf8c11f8c0781bd9ecc7f67169ff0", 108 | "text": "The spring season is all about change – a rebirth both literally and figuratively. 
Even though the spring season doesn’t officially (astronomically, that is) begin until March 20 this year, climatologically, it starts March 1.", 109 | "type": "NarrativeText", 110 | "metadata": { 111 | "page_number": 1 112 | } 113 | }, 114 | { 115 | "element_id": "7184168da442c6ef28553b274bf2be8f", 116 | "text": "As cold winter nights are replaced by the warmth of longer daylight hours, the National Weather Service invites you to do two important things that may save your life or the life of a loved one.", 117 | "type": "NarrativeText", 118 | "metadata": { 119 | "page_number": 1 120 | } 121 | }, 122 | { 123 | "element_id": "f3be9748ecd68b20d706548129baa22d", 124 | "text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”", 125 | "type": "NarrativeText", 126 | "metadata": { 127 | "page_number": 1 128 | } 129 | }, 130 | { 131 | "element_id": "126c3cd201fb259cfeabc6bffc0b5473", 132 | "text": "Second, encourage others to become Weather-Ready as well. Share the message by taking advantage of our vast array of weather safety content – everything posted on our Spring Safety website is freely available, and we encourage sharing on social media networks. Also remember those who are most vulnerable, like an elderly family member or neighbor who might have limited mobility or is isolated. Reach out to those who are at higher risk of being impacted by extreme weather, and help them get prepared. This simple act of caring could become heroic.", 133 | "type": "NarrativeText", 134 | "metadata": { 135 | "page_number": 1 136 | } 137 | }, 138 | { 139 | "element_id": "c1944fb037f3e1cb14969bc59a7dd9c2", 140 | "text": "This spring, the campaign is focused on heat dangers. Heat illness and death can occur even in spring’s moderately warm weather. The majority of all heat-related deaths occur outside of heat waves and roughly a third of child hot car deaths occur outside of the summer months. Learn more by viewing the infographics that are now available.", 141 | "type": "NarrativeText", 142 | "metadata": { 143 | "page_number": 1 144 | } 145 | }, 146 | { 147 | "element_id": "fa1b939ef6159d95260bc095f58ebbc2", 148 | "text": "Stay safe this spring, and every season, by being informed, prepared, and Weather-Ready.", 149 | "type": "NarrativeText", 150 | "metadata": { 151 | "page_number": 1 152 | } 153 | }, 154 | { 155 | "element_id": "47d5d0d27a35a36d7467dfc8b6e089b3", 156 | "text": "US Dept of Commerce\n National Oceanic and Atmospheric Administration\n National Weather Service\n News Around NOAA1325 East West HighwaySilver Spring, MD 20910Comments? Questions? 
Please Contact Us.", 157 | "type": "NarrativeText", 158 | "metadata": { 159 | "page_number": 1 160 | } 161 | }, 162 | { 163 | "element_id": "129c678fce59acee7ac6a6fdb67b6310", 164 | "text": "Disclaimer", 165 | "type": "Title", 166 | "metadata": { 167 | "page_number": 1 168 | } 169 | }, 170 | { 171 | "element_id": "3c96caaebd949e39d25b3ccf4133c5d8", 172 | "text": "Information Quality", 173 | "type": "Title", 174 | "metadata": { 175 | "page_number": 1 176 | } 177 | }, 178 | { 179 | "element_id": "b79cac926e0b2e347e72cc91d5174037", 180 | "text": "Help", 181 | "type": "Title", 182 | "metadata": { 183 | "page_number": 1 184 | } 185 | }, 186 | { 187 | "element_id": "4c4e436f9a453c776dbf011f98d932d6", 188 | "text": "Glossary", 189 | "type": "Title", 190 | "metadata": { 191 | "page_number": 1 192 | } 193 | }, 194 | { 195 | "element_id": "506ff394621596dd88138642eddfc1e4", 196 | "text": "Privacy Policy", 197 | "type": "Title", 198 | "metadata": { 199 | "page_number": 1 200 | } 201 | }, 202 | { 203 | "element_id": "c70ae8c30a61c450d2c5148d1b6a0447", 204 | "text": "Freedom of Information Act (FOIA)", 205 | "type": "Title", 206 | "metadata": { 207 | "page_number": 1 208 | } 209 | }, 210 | { 211 | "element_id": "5d8c71abc527284cd463aa58f3f48098", 212 | "text": "About Us", 213 | "type": "Title", 214 | "metadata": { 215 | "page_number": 1 216 | } 217 | }, 218 | { 219 | "element_id": "a8a00c355d2fa1461d532a1088274f32", 220 | "text": "Career Opportunities", 221 | "type": "Title", 222 | "metadata": { 223 | "page_number": 1 224 | } 225 | } 226 | ] -------------------------------------------------------------------------------- /sample-docs/stanley-cups.csv: -------------------------------------------------------------------------------- 1 | Stanley Cups,, 2 | Team,Location,Stanley Cups 3 | Blues,STL,1 4 | Flyers,PHI,2 5 | Maple Leafs,TOR,13 -------------------------------------------------------------------------------- /sample-docs/stanley-cups.tsv: -------------------------------------------------------------------------------- 1 | Stanley Cups 2 | Team Location Stanley Cups 3 | Blues STL 1 4 | Flyers PHI 2 5 | Maple Leafs TOR 13 6 | -------------------------------------------------------------------------------- /sample-docs/stanley-cups.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/stanley-cups.xlsx -------------------------------------------------------------------------------- /sample-docs/winter-sports.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/sample-docs/winter-sports.epub -------------------------------------------------------------------------------- /scripts/app-start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PORT=${PORT:-8000} 4 | export HOST=${HOST:-"0.0.0.0"} 5 | export WORKERS=${WORKERS:-1} 6 | 7 | NUMREGEX="^[0-9]+$" 8 | GRACEFUL_SHUTDOWN_PERIOD_SECONDS=3600 9 | TIMEOUT_COMMAND='timeout' 10 | OPTIONAL_TIMEOUT='' 11 | 12 | if [[ -n $MAX_LIFETIME_SECONDS ]]; then 13 | if ! command -v $TIMEOUT_COMMAND &> /dev/null; then 14 | TIMEOUT_COMMAND='gtimeout' 15 | echo "Warning! 'timeout' command is required but not available. Checking for gtimeout." 16 | elif ! 
command -v $TIMEOUT_COMMAND &> /dev/null; then 17 | echo "Warning! 'gtimeout' command is required but not available. Running without max lifetime." 18 | elif [[ $MAX_LIFETIME_SECONDS =~ $NUMREGEX ]]; then 19 | OPTIONAL_TIMEOUT="timeout --preserve-status --foreground --kill-after ${GRACEFUL_SHUTDOWN_PERIOD_SECONDS} ${MAX_LIFETIME_SECONDS}" 20 | echo "Server's lifetime set to ${MAX_LIFETIME_SECONDS} seconds." 21 | else 22 | echo "Warning! MAX_LIFETIME_SECONDS was not properly set, an integer was expected, got ${MAX_LIFETIME_SECONDS}. Running without max lifetime." 23 | fi 24 | fi 25 | 26 | ${OPTIONAL_TIMEOUT} \ 27 | uvicorn prepline_general.api.app:app \ 28 | --log-config logger_config.yaml \ 29 | --host "$HOST" \ 30 | --port "$PORT" \ 31 | --workers "$WORKERS" \ 32 | 33 | echo "Server was shutdown" 34 | [ -n "$MAX_LIFETIME_SECONDS" ] && echo "Reached timeout of $MAX_LIFETIME_SECONDS seconds" 35 | -------------------------------------------------------------------------------- /scripts/docker-build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | DOCKER_REPOSITORY="${DOCKER_REPOSITORY:-quay.io/unstructured-io/unstructured-api}" 5 | PIPELINE_PACKAGE=${PIPELINE_PACKAGE:-"general"} 6 | PIPELINE_FAMILY=${PIPELINE_FAMILY:-"general"} 7 | PIP_VERSION="${PIP_VERSION:-25.1.1}" 8 | DOCKER_IMAGE="${DOCKER_IMAGE:-pipeline-family-${PIPELINE_FAMILY}-dev}" 9 | DOCKER_PLATFORM="${DOCKER_PLATFORM:-}" 10 | 11 | 12 | DOCKER_BUILD_CMD=( 13 | docker buildx build --load -f Dockerfile 14 | --build-arg PIP_VERSION="$PIP_VERSION" 15 | --build-arg BUILDKIT_INLINE_CACHE=1 16 | --build-arg PIPELINE_PACKAGE="$PIPELINE_PACKAGE" 17 | --progress plain 18 | --platform linux/amd64 19 | --cache-from "$DOCKER_REPOSITORY:latest" 20 | -t "$DOCKER_IMAGE" 21 | . 22 | ) 23 | 24 | # only build for specific platform if DOCKER_PLATFORM is set 25 | if [ -n "${DOCKER_PLATFORM:-}" ]; then 26 | DOCKER_BUILD_CMD+=("--platform=$DOCKER_PLATFORM") 27 | fi 28 | 29 | DOCKER_BUILDKIT=1 "${DOCKER_BUILD_CMD[@]}" 30 | -------------------------------------------------------------------------------- /scripts/docker-smoke-test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | # docker-smoke-test.sh 5 | # Start the containerized api and run some end-to-end tests against it 6 | # There will be some overlap with just running a TestClient in the unit tests 7 | # Is there a good way to reuse code here? 8 | # Also note this can evolve into a generalized pipeline smoke test 9 | 10 | # shellcheck disable=SC2317 # Shellcheck complains that trap functions are unreachable... 
11 | 12 | set -e 13 | 14 | CONTAINER_NAME=unstructured-api-smoke-test 15 | CONTAINER_NAME_PARALLEL=unstructured-api-smoke-test-parallel 16 | PIPELINE_FAMILY=${PIPELINE_FAMILY:-"general"} 17 | DOCKER_IMAGE="${DOCKER_IMAGE:-pipeline-family-${PIPELINE_FAMILY}-dev:latest}" 18 | SKIP_INFERENCE_TESTS="${SKIP_INFERENCE_TESTS:-false}" 19 | 20 | start_container() { 21 | 22 | port=$1 23 | use_parallel_mode=$2 24 | 25 | if [ "$use_parallel_mode" = "true" ]; then 26 | name=$CONTAINER_NAME_PARALLEL 27 | else 28 | name=$CONTAINER_NAME 29 | fi 30 | 31 | echo Starting container "$name" 32 | docker run --platform "$DOCKER_PLATFORM" \ 33 | -p "$port":"$port" \ 34 | --entrypoint uvicorn \ 35 | -d \ 36 | --rm \ 37 | --name "$name" \ 38 | --env "UNSTRUCTURED_PARALLEL_MODE_URL=http://localhost:$port/general/v0/general" \ 39 | --env "UNSTRUCTURED_PARALLEL_MODE_ENABLED=$use_parallel_mode" \ 40 | "$DOCKER_IMAGE" \ 41 | prepline_general.api.app:app --port "$port" --host 0.0.0.0 42 | } 43 | 44 | await_server_ready() { 45 | port=$1 46 | url=localhost:$port/healthcheck 47 | 48 | # NOTE(rniko): Increasing the timeout to 120 seconds because emulated arm tests are slow to start 49 | for _ in {1..120}; do 50 | echo Waiting for response from "$url" 51 | if curl "$url" 2> /dev/null; then 52 | echo 53 | return 54 | fi 55 | 56 | sleep 1 57 | done 58 | 59 | echo Server did not respond! 60 | exit 1 61 | } 62 | 63 | stop_container() { 64 | echo Stopping container "$CONTAINER_NAME" 65 | # Note (austin) - if you're getting an error from the api, try dumping the logs 66 | # docker logs $CONTAINER_NAME 2> docker_logs.txt 67 | docker stop "$CONTAINER_NAME" 2> /dev/null || true 68 | 69 | echo Stopping container "$CONTAINER_NAME_PARALLEL" 70 | docker stop "$CONTAINER_NAME_PARALLEL" 2> /dev/null || true 71 | } 72 | 73 | # Always clean up the container 74 | trap stop_container EXIT 75 | 76 | start_container 8000 "false" 77 | await_server_ready 8000 78 | 79 | ####################### 80 | # Smoke Tests 81 | ####################### 82 | echo Running smoke tests with SKIP_INFERENCE_TESTS: "$SKIP_INFERENCE_TESTS" 83 | PYTHONPATH=. SKIP_INFERENCE_TESTS=$SKIP_INFERENCE_TESTS pytest -vv scripts/smoketest.py 84 | 85 | ####################### 86 | # Test parallel vs single mode 87 | ####################### 88 | if ! $SKIP_INFERENCE_TESTS; then 89 | start_container 9000 true 90 | await_server_ready 9000 91 | 92 | echo Running parallel mode test 93 | ./scripts/parallel-mode-test.sh localhost:8000 localhost:9000 94 | fi 95 | 96 | result=$? 97 | exit $result 98 | -------------------------------------------------------------------------------- /scripts/install-pandoc.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Mainly used for installing pandoc on CI 4 | 5 | if [ "$(uname)" == "Darwin" ]; then 6 | echo "This script is intended for Linux only." 7 | exit 0 8 | fi 9 | 10 | set -euo pipefail 11 | if [ "${ARCH}" = "x86_64" ]; then 12 | export PANDOC_ARCH="amd64" 13 | elif [ "${ARCH}" = "arm64" ] || [ "${ARCH}" = "aarch64" ]; then 14 | export PANDOC_ARCH="arm64" 15 | fi 16 | 17 | wget https://github.com/jgm/pandoc/releases/download/3.1.2/pandoc-3.1.2-linux-"${PANDOC_ARCH}".tar.gz 18 | tar xvf pandoc-3.1.2-linux-"${PANDOC_ARCH}".tar.gz 19 | cd pandoc-3.1.2 20 | sudo cp bin/pandoc /usr/local/bin/ 21 | cd .. 
22 | rm -rf pandoc-3.1.2* 23 | -------------------------------------------------------------------------------- /scripts/parallel-mode-test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # parallel-mode-test.sh 4 | # Iterate a list of curl commands, and run each one against two instances of the api 5 | # The smoke test will start one container with parallel mode and one without, and 6 | # diff the two outputs to make sure parallel mode does not alter the response. 7 | # Note the filepaths assume you ran this from the top level 8 | 9 | # shellcheck disable=SC2317 # Shellcheck complains that trap functions are unreachable... 10 | 11 | base_url_1=$1 12 | base_url_2=$2 13 | 14 | declare -a curl_params=( 15 | "-F files=@sample-docs/layout-parser-paper.pdf -F 'strategy=fast'" 16 | "-F files=@sample-docs/layout-parser-paper.pdf -F 'strategy=auto'" 17 | "-F files=@sample-docs/layout-parser-paper.pdf -F 'strategy=hi_res'" 18 | "-F files=@sample-docs/layout-parser-paper.pdf -F 'coordinates=true'" 19 | "-F files=@sample-docs/layout-parser-paper.pdf -F 'encoding=utf-8'" 20 | "-F files=@sample-docs/layout-parser-paper.pdf -F 'include_page_breaks=true'" 21 | "-F files=@sample-docs/layout-parser-paper.pdf -F 'hi_res_model_name=yolox'" 22 | ) 23 | 24 | for params in "${curl_params[@]}" 25 | do 26 | curl_command="curl $base_url_1/general/v0/general $params" 27 | echo Testing: "$curl_command" 28 | 29 | # Run in single mode 30 | # Note(austin): Parallel mode screws up hierarchy! While we deal with that, 31 | # let's ignore parent_id fields in the results 32 | $curl_command 2> /dev/null | jq -S 'del(..|.parent_id?)' > output.json 33 | original_length=$(jq 'length' output.json) 34 | 35 | # Stop if curl didn't work 36 | if [ ! -s output.json ]; then 37 | echo Command failed! 38 | $curl_command 39 | exit 1 40 | fi 41 | 42 | # Run in parallel mode 43 | curl_command="curl $base_url_2/general/v0/general $params" 44 | $curl_command 2> /dev/null | jq -S 'del(..|.parent_id?)' > parallel_output.json 45 | parallel_length=$(jq 'length' parallel_output.json) 46 | 47 | # Stop if curl didn't work 48 | if [ ! -s parallel_output.json ]; then 49 | echo Command failed! 50 | $curl_command 51 | exit 1 52 | fi 53 | 54 | if ! [[ "$original_length" == "$parallel_length" ]]; then 55 | echo Parallel mode returned a different number of elements!
56 | echo Params: "$params" 57 | exit 1 58 | fi 59 | 60 | rm -f output.json parallel_output.json 61 | echo 62 | done 63 | 64 | 65 | -------------------------------------------------------------------------------- /scripts/shellcheck.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | find scripts -name "*.sh" -exec shellcheck {} + 4 | 5 | -------------------------------------------------------------------------------- /scripts/smoketest.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import time 4 | import gzip 5 | import shutil 6 | from pathlib import Path 7 | from typing import List, Optional 8 | import tempfile 9 | 10 | import pytest 11 | import requests 12 | import pandas as pd 13 | 14 | API_URL = "http://localhost:8000/general/v0/general" 15 | # NOTE(rniko): Skip inference tests if we're running on an emulated architecture 16 | skip_inference_tests = os.getenv("SKIP_INFERENCE_TESTS", "").lower() in {"true", "yes", "y", "1"} 17 | 18 | 19 | def send_document( 20 | filenames: List[str], 21 | filenames_gzipped: Optional[List[str]] = None, 22 | content_type: str = "", 23 | strategy: str = "auto", 24 | output_format: str = "application/json", 25 | skip_infer_table_types: list[str] = [], 26 | uncompressed_content_type: str = "", 27 | ): 28 | if filenames_gzipped is None: 29 | filenames_gzipped = [] 30 | files = [] 31 | for filename in filenames: 32 | files.append(("files", (str(filename), open(filename, "rb"), content_type))) 33 | for filename in filenames_gzipped: 34 | files.append(("files", (str(filename), open(filename, "rb"), "application/gzip"))) 35 | 36 | options = { 37 | "strategy": strategy, 38 | "output_format": output_format, 39 | "skip_infer_table_types": skip_infer_table_types, 40 | } 41 | if uncompressed_content_type: 42 | options["gz_uncompressed_content_type"] = uncompressed_content_type 43 | 44 | return requests.post( 45 | API_URL, 46 | files=files, 47 | data=options, 48 | ) 49 | 50 | 51 | @pytest.mark.parametrize( 52 | ("extension", "example_filename", "content_type"), 53 | [ 54 | (".bmp", "DA-1p.bmp", "image/bmp"), 55 | (".csv", "stanley-cups.csv", "application/csv"), 56 | (".doc", "fake.doc", "application/msword"), 57 | ( 58 | ".docx", 59 | "fake.docx", 60 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 61 | ), 62 | (".eml", "fake-email-attachment.eml", "message/rfc822"), 63 | (".epub", "winter-sports.epub", "application/epub"), 64 | (".heic", "DA-1p.heic", "image/heic"), 65 | (".html", "fake-html.html", "text/html"), 66 | (".jpeg", "layout-parser-paper-fast.jpg", "image/jpeg"), 67 | (".md", "README.md", "text/markdown"), 68 | (".msg", "fake-email.msg", "application/x-ole-storage"), 69 | (".odt", "fake.odt", "application/vnd.oasis.opendocument.text"), 70 | (".pdf", "layout-parser-paper.pdf", "application/pdf"), 71 | (".png", "english-and-korean.png", "image/png"), 72 | (".ppt", "fake-power-point.ppt", "application/vnd.ms-powerpoint"), 73 | ( 74 | ".pptx", 75 | "fake-power-point.pptx", 76 | "application/vnd.openxmlformats-officedocument.presentationml.presentation", 77 | ), 78 | (".rst", "README.rst", "text/prs.fallenstein.rst"), 79 | (".rtf", "fake-doc.rtf", "application/rtf"), 80 | (".tiff", "layout-parser-paper-fast.tiff", "image/tiff"), 81 | (".tsv", "stanley-cups.tsv", "text/tab-separated-values"), 82 | (".txt", "fake-text.txt", "text/plain"), 83 | ( 84 | ".xlsx", 85 | "stanley-cups.xlsx", 86 | 
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", 87 | ), 88 | (".xml", "fake-xml.xml", "text/xml"), 89 | (".json", "spring-weather.html.json", "application/json"), 90 | ( 91 | ".gz", 92 | "layout-parser-paper.pdf.gz", 93 | "application/gzip", 94 | ), 95 | ], 96 | ) 97 | def test_happy_path_all_types(extension, example_filename: str, content_type: str): 98 | """ 99 | For the files in sample-docs, verify that we get a 200 100 | and some structured response 101 | """ 102 | # The auto strategy will run ocr on these files 103 | # This doesn't always work on our macs 104 | if skip_inference_tests and extension in [ 105 | ".bmp", 106 | ".heic", 107 | ".jpeg", 108 | ".pdf", 109 | ".png", 110 | ".tiff", 111 | ".gz", # Since we're using a gzipped pdf... 112 | ]: 113 | pytest.skip("emulated hardware") 114 | 115 | test_file = str(Path("sample-docs") / example_filename) 116 | 117 | # Verify we can send with explicit content type 118 | response = send_document(filenames=[test_file], content_type=content_type) 119 | 120 | if response.status_code != 200: 121 | assert False, response.text 122 | 123 | assert len(response.json()) > 0 124 | assert len("".join(elem["text"] for elem in response.json())) > 20 125 | 126 | # Verify we can infer the filetype on the server 127 | response = send_document(filenames=[test_file], content_type=None) 128 | 129 | if response.status_code != 200: 130 | assert False, response.text 131 | 132 | assert len(response.json()) > 0 133 | assert len("".join(elem["text"] for elem in response.json())) > 20 134 | 135 | json_response = response 136 | 137 | # Verify we can set output type to csv 138 | csv_response = send_document( 139 | filenames=[test_file], 140 | content_type=content_type, 141 | output_format="text/csv", 142 | ) 143 | assert csv_response.status_code == 200 144 | assert len(csv_response.text) > 0 145 | df = pd.read_csv(io.StringIO(csv_response.text)) 146 | assert len(df) == len(json_response.json()) 147 | 148 | 149 | @pytest.mark.parametrize("output_format", ["application/json", "text/csv"]) 150 | @pytest.mark.parametrize( 151 | "filenames_to_gzip, filenames_verbatim, uncompressed_content_type", 152 | [ 153 | (["fake-html.html"], [], "text/html"), 154 | (["stanley-cups.csv"], [], "application/csv"), 155 | (["fake.doc"], [], "application/msword"), 156 | # compressed and uncompressed 157 | pytest.param( 158 | ["layout-parser-paper-fast.pdf"], 159 | ["list-item-example.pdf"], 160 | "application/pdf", 161 | marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"), 162 | ), 163 | (["fake-email.eml"], ["fake-email-image-embedded.eml"], "message/rfc822"), 164 | # compressed and uncompressed 165 | # empty content-type means that API should detect filetype after decompressing. 
166 | pytest.param( 167 | ["layout-parser-paper-fast.pdf"], 168 | ["list-item-example.pdf"], 169 | "", 170 | marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"), 171 | ), 172 | (["fake-email.eml"], ["fake-email-image-embedded.eml"], ""), 173 | ], 174 | ) 175 | def test_gzip_sending( 176 | output_format: str, 177 | filenames_to_gzip: List[str], 178 | filenames_verbatim: List[str], 179 | uncompressed_content_type: str, 180 | ): 181 | temp_files = {} 182 | 183 | for filename in filenames_to_gzip: 184 | gz_file_extension = f"{Path(filename).suffix}.gz" 185 | temp_file = tempfile.NamedTemporaryFile(suffix=gz_file_extension) 186 | full_path = Path("sample-docs") / filename 187 | gzip_file(str(full_path), temp_file.name) 188 | temp_files[filename] = temp_file 189 | filenames_gzipped = [temp_file.name for temp_file in temp_files.values()] 190 | 191 | filenames = [] 192 | for filename in filenames_verbatim: 193 | filenames.append(str(Path("sample-docs") / filename)) 194 | 195 | json_response = send_document( 196 | filenames, 197 | filenames_gzipped, 198 | content_type=uncompressed_content_type, 199 | uncompressed_content_type=uncompressed_content_type, 200 | ) 201 | assert json_response.status_code == 200, json_response.text 202 | json_content = json_response.json() 203 | assert len(json_content) > 0 204 | if len(filenames_gzipped + filenames) > 1: 205 | for file in json_content: 206 | assert len("".join(elem["text"] for elem in file)) > 20 207 | else: 208 | assert len("".join(elem["text"] for elem in json_content)) > 20 209 | 210 | csv_response = send_document( 211 | filenames, 212 | filenames_gzipped, 213 | content_type=uncompressed_content_type, 214 | uncompressed_content_type=uncompressed_content_type, 215 | output_format="text/csv", 216 | ) 217 | assert csv_response.status_code == 200 218 | assert len(csv_response.text) > 0 219 | df = pd.read_csv(io.StringIO(csv_response.text)) 220 | if len(filenames_gzipped + filenames) > 1: 221 | json_size = 0 222 | for file in json_content: 223 | json_size += len(file) 224 | assert len(df) == json_size 225 | else: 226 | assert len(df) == len(json_content) 227 | 228 | for filename in filenames_to_gzip: 229 | temp_files[filename].close() 230 | 231 | 232 | @pytest.mark.skipif(skip_inference_tests, reason="emulated architecture") 233 | def test_strategy_performance(): 234 | """ 235 | For the files in sample-docs, verify that the fast strategy 236 | is significantly faster than the hi_res strategy 237 | """ 238 | performance_ratio = 4 239 | test_file = str(Path("sample-docs") / "layout-parser-paper.pdf") 240 | 241 | start_time = time.monotonic() 242 | response = send_document( 243 | filenames=[test_file], content_type="application/pdf", strategy="hi_res" 244 | ) 245 | hi_res_time = time.monotonic() - start_time 246 | assert response.status_code == 200 247 | 248 | start_time = time.monotonic() 249 | response = send_document(filenames=[test_file], content_type="application/pdf", strategy="fast") 250 | fast_time = time.monotonic() - start_time 251 | assert response.status_code == 200 252 | assert hi_res_time > performance_ratio * fast_time 253 | 254 | 255 | @pytest.mark.skipif(skip_inference_tests, reason="emulated architecture") 256 | @pytest.mark.parametrize( 257 | "strategy, skip_infer_table_types, expected_table_num", 258 | [ 259 | ("fast", [], 0), 260 | ("fast", ["pdf"], 0), 261 | ("hi_res", [], 2), 262 | ("hi_res", ["pdf"], 0), 263 | ], 264 | ) 265 | def test_table_support(strategy: str, skip_infer_table_types: list[str], 
expected_table_num: int): 266 | """ 267 | Test that table extraction works with the hi_res strategy 268 | """ 269 | test_file = str(Path("sample-docs") / "layout-parser-paper.pdf") 270 | response = send_document( 271 | filenames=[test_file], 272 | content_type="application/pdf", 273 | strategy=strategy, 274 | skip_infer_table_types=skip_infer_table_types, 275 | ) 276 | 277 | assert response.status_code == 200 278 | extracted_tables = [ 279 | el["metadata"]["text_as_html"] 280 | for el in response.json() 281 | if "text_as_html" in el["metadata"].keys() 282 | ] 283 | assert len(extracted_tables) == expected_table_num 284 | if expected_table_num > 0: 285 | # Test that text from a table is extracted 286 | # Note(austin) - table output has changed - this line isn't returned 287 | # assert "Layouts of scanned modern magazines and scientific reports" in extracted_tables[0] 288 | assert "Layouts of history" in extracted_tables[0] 289 | 290 | 291 | def gzip_file(in_filepath: str, out_filepath: str): 292 | with open(in_filepath, "rb") as f_in: 293 | with gzip.open(out_filepath, "wb", compresslevel=1) as f_out: 294 | shutil.copyfileobj(f_in, f_out) 295 | -------------------------------------------------------------------------------- /scripts/version-increment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | function usage { 3 | echo "Usage: $(basename "$0") CHANGELOG_MESSAGE" 2>&1 4 | echo 'Add the given message to the changelog and cut a release' 5 | echo "Example: $(basename "$0") \"Bump unstructured to x.y.z\"" 6 | } 7 | 8 | # Found at https://www.henryschmale.org/2019/04/30/incr-semver.html 9 | # $1 - semver string 10 | # $2 - level to incr {dev,release,minor,major} - release by default 11 | function incr_semver() { 12 | IFS='.'
read -ra ver <<< "$1" 13 | [[ "${#ver[@]}" -ne 3 ]] && echo "Invalid semver string" && return 1 14 | [[ "$#" -eq 1 ]] && level='release' || level=$2 15 | 16 | release=${ver[2]} 17 | minor=${ver[1]} 18 | major=${ver[0]} 19 | 20 | case $level in 21 | # Drop the dev tag 22 | dev) 23 | release=$(echo "$release" | awk -F '-' '{print $1}') 24 | ;; 25 | release) 26 | release=$((release+1)) 27 | ;; 28 | minor) 29 | release=0 30 | minor=$((minor+1)) 31 | ;; 32 | major) 33 | release=0 34 | minor=0 35 | major=$((major+1)) 36 | ;; 37 | *) 38 | echo "Invalid level passed" 39 | return 2 40 | esac 41 | echo "$major.$minor.$release" 42 | } 43 | 44 | 45 | if [[ -z "$1" ]]; then 46 | usage 47 | exit 0 48 | fi 49 | 50 | changelog_text="* $1" 51 | current_version=$(head -1 CHANGELOG.md | awk -F' ' '{print $2}') 52 | 53 | # If dev version, add to current change list and cut the release 54 | if [[ $current_version == *"dev"* ]]; then 55 | new_version=$(incr_semver "$current_version" dev) 56 | 57 | # Replace the version (drop the dev tag) 58 | sed -i 's/'"$current_version"'/'"$new_version"'/' CHANGELOG.md 59 | 60 | # Find the first bullet, add the new change above it 61 | sed -i '0,/^*/{s/\(^*.*\)/'"$changelog_text"'\n\1/}' CHANGELOG.md 62 | 63 | # If not dev version, create a new release 64 | else 65 | new_version=$(incr_semver "$current_version" release) 66 | 67 | cat <<EOF | cat - CHANGELOG.md > CHANGELOG.tmp 68 | ## $new_version 69 | 70 | $changelog_text 71 | 72 | EOF 73 | 74 | mv CHANGELOG.{tmp,md} 75 | 76 | fi 77 | -------------------------------------------------------------------------------- /scripts/version-sync.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | function usage { 3 | echo "Usage: $(basename "$0") [-c] -f FILE_TO_CHANGE REPLACEMENT_FORMAT [-f FILE_TO_CHANGE REPLACEMENT_FORMAT ...]" 2>&1 4 | echo 'Synchronize files to latest version in source file' 5 | echo ' -s Specifies source file for version (default is CHANGELOG.md)' 6 | echo ' -f Specifies a file to change and the format for searching and replacing versions' 7 | echo ' FILE_TO_CHANGE is the file to be updated/checked for updates' 8 | echo ' REPLACEMENT_FORMAT is one of (semver, release, api-release)' 9 | echo ' semver indicates to look for a full semver version and replace with the latest full version' 10 | echo ' release indicates to look for a release semver version (x.x.x) and replace with the latest release version' 11 | echo ' api-release indicates to look for a release semver version in the context of an api route and replace with the latest release version' 12 | echo ' -c Compare versions and output proposed changes without changing anything.'
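    # Illustrative invocation (the file arguments below are hypothetical examples, not necessarily the files this repo syncs):
    #   ./scripts/version-sync.sh -c -f preprocessing-pipeline-family.yaml release -f prepline_general/api/app.py api-release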
13 | } 14 | 15 | function getopts-extra () { 16 | declare -i i=1 17 | # if the next argument is not an option, then append it to array OPTARG 18 | while [[ ${OPTIND} -le $# && ${!OPTIND:0:1} != '-' ]]; do 19 | OPTARG[i]=${!OPTIND} 20 | ((i += 1)) 21 | ((OPTIND += 1)) 22 | done 23 | } 24 | 25 | # Parse input options 26 | declare CHECK=0 27 | declare SOURCE_FILE="CHANGELOG.md" 28 | declare -a FILES_TO_CHECK=() 29 | declare -a REPLACEMENT_FORMATS=() 30 | declare args 31 | declare OPTIND OPTARG opt 32 | while getopts ":hcs:f:" opt; do 33 | case $opt in 34 | h) 35 | usage 36 | exit 0 37 | ;; 38 | c) 39 | CHECK=1 40 | ;; 41 | s) 42 | SOURCE_FILE="$OPTARG" 43 | ;; 44 | f) 45 | getopts-extra "$@" 46 | args=( "${OPTARG[@]}" ) 47 | # validate length of args, should be 2 48 | if [ ${#args[@]} -eq 2 ]; then 49 | FILES_TO_CHECK+=( "${args[0]}" ) 50 | REPLACEMENT_FORMATS+=( "${args[1]}" ) 51 | else 52 | echo "Exactly 2 arguments must follow -f option." >&2 53 | exit 1 54 | fi 55 | ;; 56 | \?) 57 | echo "Invalid option: -$OPTARG." >&2 58 | usage 59 | exit 1 60 | ;; 61 | esac 62 | done 63 | 64 | # Parse REPLACEMENT_FORMATS 65 | RE_SEMVER_FULL="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-((0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?" 66 | RE_RELEASE="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)" 67 | RE_API_RELEASE="v(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)" 68 | # Pull out semver appearing earliest in SOURCE_FILE. 69 | LAST_VERSION=$(grep -o -m 1 -E "${RE_SEMVER_FULL}" "$SOURCE_FILE") 70 | LAST_RELEASE=$(grep -o -m 1 -E "${RE_RELEASE}($|[^-+])" "$SOURCE_FILE" | grep -o -m 1 -E "${RE_RELEASE}") 71 | LAST_API_RELEASE="v$(grep -o -m 1 -E "${RE_RELEASE}($|[^-+])$" "$SOURCE_FILE" | grep -o -m 1 -E "${RE_RELEASE}")" 72 | declare -a RE_SEMVERS=() 73 | declare -a UPDATED_VERSIONS=() 74 | for i in "${!REPLACEMENT_FORMATS[@]}"; do 75 | REPLACEMENT_FORMAT=${REPLACEMENT_FORMATS[$i]} 76 | case $REPLACEMENT_FORMAT in 77 | semver) 78 | RE_SEMVERS+=( "$RE_SEMVER_FULL" ) 79 | UPDATED_VERSIONS+=( "$LAST_VERSION" ) 80 | ;; 81 | release) 82 | RE_SEMVERS+=( "$RE_RELEASE" ) 83 | UPDATED_VERSIONS+=( "$LAST_RELEASE" ) 84 | ;; 85 | api-release) 86 | RE_SEMVERS+=( "$RE_API_RELEASE" ) 87 | UPDATED_VERSIONS+=( "$LAST_API_RELEASE" ) 88 | ;; 89 | *) 90 | echo "Invalid replacement format: \"${REPLACEMENT_FORMAT}\". Use semver, release, or api-release" >&2 91 | exit 1 92 | ;; 93 | esac 94 | done 95 | 96 | if [ -z "$LAST_VERSION" ]; 97 | then 98 | # No match to semver regex in SOURCE_FILE, so no version to go from. 99 | printf "Error: Unable to find latest version from %s.\n" "$SOURCE_FILE" 100 | exit 1 101 | fi 102 | 103 | # Search files in FILES_TO_CHECK and change (or get diffs) 104 | declare FAILED_CHECK=0 105 | 106 | for i in "${!FILES_TO_CHECK[@]}"; do 107 | FILE_TO_CHANGE=${FILES_TO_CHECK[$i]} 108 | RE_SEMVER=${RE_SEMVERS[$i]} 109 | UPDATED_VERSION=${UPDATED_VERSIONS[$i]} 110 | FILE_VERSION=$(grep -o -m 1 -E "${RE_SEMVER}" "$FILE_TO_CHANGE") 111 | if [ -z "$FILE_VERSION" ]; 112 | then 113 | # No match to semver regex in VERSIONFILE, so nothing to replace 114 | printf "Error: No semver version found in file %s.\n" "$FILE_TO_CHANGE" 115 | exit 1 116 | else 117 | # Replace semver in VERSIONFILE with semver obtained from SOURCE_FILE 118 | TMPFILE=$(mktemp /tmp/new_version.XXXXXX) 119 | # Check sed version, exit if version < 4.3 120 | if ! 
sed --version > /dev/null 2>&1; then 121 | CURRENT_VERSION=1.archaic 122 | else 123 | CURRENT_VERSION=$(sed --version | head -n1 | cut -d" " -f4) 124 | fi 125 | REQUIRED_VERSION="4.3" 126 | if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$CURRENT_VERSION" | sort -V | head -n1)" != "$REQUIRED_VERSION" ]; then 127 | echo "sed version must be >= ${REQUIRED_VERSION}" && exit 1 128 | fi 129 | sed -E -r "s/$RE_SEMVER/$UPDATED_VERSION/" "$FILE_TO_CHANGE" > "$TMPFILE" 130 | if [ $CHECK == 1 ]; 131 | then 132 | DIFF=$(diff "$FILE_TO_CHANGE" "$TMPFILE" ) 133 | if [ -z "$DIFF" ]; 134 | then 135 | printf "version sync would make no changes to %s.\n" "$FILE_TO_CHANGE" 136 | rm "$TMPFILE" 137 | else 138 | FAILED_CHECK=1 139 | printf "version sync would make the following changes to %s:\n%s\n" "$FILE_TO_CHANGE" "$DIFF" 140 | rm "$TMPFILE" 141 | fi 142 | else 143 | cp "$TMPFILE" "$FILE_TO_CHANGE" 144 | rm "$TMPFILE" 145 | fi 146 | fi 147 | done 148 | 149 | # Exit with code determined by whether changes were needed in a check. 150 | if [ ${FAILED_CHECK} -ne 0 ]; then 151 | exit 1 152 | else 153 | exit 0 154 | fi 155 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | exclude = 4 | prepline_*/api 5 | -------------------------------------------------------------------------------- /test_general/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/test_general/__init__.py -------------------------------------------------------------------------------- /test_general/api/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/test_general/api/.gitkeep -------------------------------------------------------------------------------- /test_general/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/df1edce95193c2876d193c6655b1f3870a05e536/test_general/api/__init__.py -------------------------------------------------------------------------------- /test_general/api/test_deprecated_api.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pathlib import Path 4 | from typing import Any 5 | 6 | from fastapi.testclient import TestClient 7 | from prepline_general.api.app import app 8 | 9 | MAIN_API_ROUTE = "general/v0/general" 10 | 11 | 12 | @pytest.mark.parametrize( 13 | "parameters", 14 | [ 15 | pytest.param({"coordinates": ["true"]}, id="coordinates_true"), 16 | pytest.param({"coordinates": ["false"]}, id="coordinates_false"), 17 | pytest.param({"encoding": ["utf-8"]}, id="encoding"), 18 | pytest.param({"hi_res_model_name": ["yolox"]}, id="hi_res_model_name"), 19 | pytest.param({"include_page_breaks": ["true"]}, id="include_page_breaks"), 20 | pytest.param({"ocr_languages": ["eng", "kor"]}, id="ocr_languages"), 21 | pytest.param({"languages": ["eng", "kor"]}, id="languages"), 22 | pytest.param({"languages": ["eng", "kor"]}, id="languages_inner"), 23 | pytest.param({"pdf_infer_table_structure": ["false"]}, id="pdf_infer_table_structure"), 24 | pytest.param({"skip_infer_table_types": ["false"]}, 
id="skip_infer_table_types"), 25 | pytest.param({"strategy": ["hi_res"]}, id="strategy"), 26 | pytest.param({"xml_keep_tags": ["false"]}, id="xml_keep_tags"), 27 | pytest.param({"extract_image_block_types": ["image"]}, id="extract_image_block_types"), 28 | pytest.param( 29 | {"extract_image_block_types": ['["image", "table"]']}, 30 | id="extract_image_block_types_json", 31 | ), 32 | pytest.param({"chunking_strategy": ["by_title"]}, id="chunking_strategy"), 33 | pytest.param({"multipage_sections": ["false"]}, id="multipage_sections"), 34 | pytest.param({"combine_under_n_chars": ["500"]}, id="combine_under_n_chars"), 35 | pytest.param({"new_after_n_chars": ["1500"]}, id="new_after_n_chars"), 36 | pytest.param({"max_characters": ["1500"]}, id="max_characters"), 37 | ], 38 | ) 39 | def test_form_params_passed_as_first_element_of_array_are_properly_handled( 40 | parameters: dict[str, Any], 41 | ): 42 | """ 43 | Verify that form params passed as the first element of an array 44 | are properly handled and the request still succeeds 45 | """ 46 | client = TestClient(app) 47 | test_file = Path("sample-docs") / "layout-parser-paper-fast.jpg" 48 | response = client.post( 49 | MAIN_API_ROUTE, 50 | files=[("files", (str(test_file), open(test_file, "rb")))], 51 | data=parameters, 52 | ) 53 | 54 | assert response.status_code == 200 55 | assert response.json() 56 | -------------------------------------------------------------------------------- /test_general/api/test_gzip.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import shutil 3 | import io 4 | import tempfile 5 | from pathlib import Path 6 | from typing import List 7 | 8 | import httpx 9 | import pandas as pd 10 | import pytest 11 | from fastapi.testclient import TestClient 12 | from deepdiff import DeepDiff 13 | 14 | from prepline_general.api.app import app 15 | 16 | MAIN_API_ROUTE = "general/v0/general" 17 | 18 | 19 | @pytest.mark.xfail(reason="The outputs are different as of unstructured==0.13.5") 20 | @pytest.mark.parametrize("output_format", ["application/json", "text/csv"]) 21 | @pytest.mark.parametrize( 22 | "filenames_to_gzip, filenames_verbatim, uncompressed_content_type", 23 | [ 24 | (["fake-html.html"], [], "text/html"), 25 | (["stanley-cups.csv"], [], "application/csv"), 26 | (["fake.doc"], [], "application/msword"), 27 | (["layout-parser-paper-fast.pdf"], [], "application/pdf"), 28 | (["fake-email-attachment.eml", "fake-email.eml"], [], "message/rfc822"), 29 | ( 30 | ["fake-email-attachment.eml", "fake-email.eml", "announcement.eml"], 31 | [], 32 | "message/rfc822", 33 | ), 34 | (["layout-parser-paper-fast.pdf", "list-item-example.pdf"], [], "application/pdf"), 35 | # now the same but without explicit content type 36 | # to make the system guess the un-gzipped type based on content.
37 | (["fake-html.html"], [], ""), 38 | (["fake-email-attachment.eml", "fake-email.eml"], [], ""), 39 | (["layout-parser-paper-fast.pdf", "list-item-example.pdf"], [], ""), 40 | # mix of compressed and uncompressed 41 | (["layout-parser-paper-fast.pdf"], ["list-item-example.pdf"], "application/pdf"), 42 | # mix of compressed and uncompressed, and guessing of content type 43 | (["layout-parser-paper-fast.pdf"], ["list-item-example.pdf"], ""), 44 | # have to use OCR which is slow, so minimum cases 45 | (["embedded-images-tables.jpg"], ["english-and-korean.png"], "image/png"), 46 | (["embedded-images-tables.jpg"], ["english-and-korean.png"], ""), 47 | ], 48 | ) 49 | def test_gzipped_files_are_parsed_like_original( 50 | output_format: str, 51 | filenames_to_gzip: List[str], 52 | filenames_verbatim: List[str], 53 | uncompressed_content_type: str, 54 | ): 55 | """ 56 | Verify that API supports un-gzipping and correctly interprets gz_uncompressed_content_type, 57 | by comparing response to directly parsing the same files. 58 | The one thing which changes is the filenames in metadata, which have to be ignored. 59 | """ 60 | client = TestClient(app) 61 | gz_options = { 62 | "gz_uncompressed_content_type": ( 63 | uncompressed_content_type if uncompressed_content_type else None 64 | ), 65 | "output_format": output_format, 66 | } 67 | response1 = get_gzipped_response( 68 | client, filenames_to_gzip, filenames_verbatim, gz_options, uncompressed_content_type 69 | ) 70 | response2 = call_api( 71 | client, 72 | [], 73 | filenames_to_gzip + filenames_verbatim, 74 | uncompressed_content_type, 75 | {"output_format": output_format}, 76 | ) 77 | compare_responses( 78 | response1, response2, output_format, len(filenames_to_gzip + filenames_verbatim) 79 | ) 80 | 81 | 82 | def compare_responses( 83 | response1: httpx.Response, response2: httpx.Response, output_format: str, files_count: int 84 | ) -> None: 85 | if output_format == "application/json": 86 | if files_count == 1: 87 | exclude_regex_paths = ( 88 | r"root\[\d+\]\['(metadata'\]\['(filename|parent_id)|element_id)'\]" 89 | ) 90 | 91 | else: 92 | exclude_regex_paths = ( 93 | r"root\[\d+\]\[\d+\]\['(metadata'\]\['(filename|parent_id)|element_id)'\]" 94 | ) 95 | diff = DeepDiff( 96 | t1=response1.json(), 97 | t2=response2.json(), 98 | exclude_regex_paths=exclude_regex_paths, 99 | ) 100 | assert len(diff) == 0 101 | else: 102 | df1 = pd.read_csv(io.StringIO(response1.text)) 103 | df2 = pd.read_csv(io.StringIO(response2.text)) 104 | diff = DeepDiff( 105 | t1=df1.to_dict(), 106 | t2=df2.to_dict(), 107 | exclude_regex_paths=r"root\['(filename|parent_id|element_id)'\]\[\d+\]", 108 | ) 109 | assert len(diff) == 0 110 | 111 | 112 | def call_api( 113 | client: TestClient, 114 | filenames_gzipped: List[str], 115 | filenames_verbatim: List[str], 116 | content_type: str, 117 | options: dict, 118 | samples_dir: str = "sample-docs", 119 | ) -> httpx.Response: 120 | files = [] 121 | for filename in filenames_gzipped: 122 | full_path = Path(samples_dir) / filename 123 | files.append(("files", (str(full_path), open(full_path, "rb"), "application/gzip"))) 124 | 125 | for filename in filenames_verbatim: 126 | full_path = Path(samples_dir) / filename 127 | files.append(("files", (str(full_path), open(full_path, "rb"), content_type))) 128 | 129 | response = client.post( 130 | MAIN_API_ROUTE, 131 | files=files, 132 | data=options, 133 | ) 134 | assert response.status_code == 200, response.text 135 | assert len(response.text) > 0 136 | return response 137 | 138 | 139 | def 
get_gzipped_response( 140 | client: TestClient, 141 | filenames_to_gzip: List[str], 142 | filenames_verbatim: List[str], 143 | options: dict, 144 | content_type: str, 145 | samples_dir: str = "sample-docs", 146 | ) -> httpx.Response: 147 | """ 148 | Gzips the filenames_to_gzip into temporary .gz files and sends them to the API, 149 | along with the uncompressed filenames_verbatim. 150 | """ 151 | temp_files = {} 152 | for filename in filenames_to_gzip: 153 | gz_file_extension = f"{Path(filename).suffix}.gz" 154 | temp_file = tempfile.NamedTemporaryFile(suffix=gz_file_extension) 155 | full_path = Path(samples_dir) / filename 156 | gzip_file(str(full_path), temp_file.name) 157 | temp_files[filename] = temp_file 158 | 159 | filenames_gzipped = [temp_file.name for temp_file in temp_files.values()] 160 | 161 | response = call_api(client, filenames_gzipped, filenames_verbatim, content_type, options) 162 | 163 | for filename in filenames_to_gzip: 164 | temp_files[filename].close() 165 | 166 | return response 167 | 168 | 169 | def gzip_file(in_filepath: str, out_filepath: str): 170 | with open(in_filepath, "rb") as f_in: 171 | with gzip.open(out_filepath, "wb", compresslevel=1) as f_out: 172 | shutil.copyfileobj(f_in, f_out) 173 | -------------------------------------------------------------------------------- /test_general/api/test_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import pytest 4 | 5 | from prepline_general.api.utils import SmartValueParser 6 | 7 | 8 | @pytest.mark.parametrize( 9 | "desired_type, value_to_parse, expected_result", 10 | [ 11 | (bool, ["true"], True), 12 | (bool, "true", True), 13 | (bool, ["false"], False), 14 | (bool, True, True), 15 | (bool, "false", False), 16 | (bool, False, False), 17 | (int, "1500", 1500), 18 | (int, ["1500"], 1500), 19 | (float, ["1500"], 1500.0), 20 | (list[int], [1000], [1000]), 21 | (int, 1500, 1500), 22 | (float, 1500, 1500.0), 23 | (str, "1500", "1500"), 24 | (float, "1500", 1500.0), 25 | (list[str], ["one", "two", "three"], ["one", "two", "three"]), 26 | (list[int], [1000], [1000]), 27 | (list[bool], ["true", "False", "True"], [True, False, True]), 28 | ], 29 | ) 30 | def test_smart_value_parser(desired_type: type, value_to_parse: Any, expected_result: Any): 31 | parsed_value = SmartValueParser[desired_type]().value_or_first_element(value_to_parse) 32 | assert expected_result == parsed_value 33 | 34 | 35 | @pytest.mark.parametrize( 36 | "desired_type, value_to_parse, expected_result", 37 | [ 38 | (str, "fast", "fast"), 39 | (str, "'fast'", "fast"), 40 | (str, '"fast"', "fast"), 41 | (str, "!fast", "!fast"), 42 | (str, "fa'st", "fast"), 43 | (str, "fast''''''", "fast"), 44 | ], 45 | ) 46 | def test_literal_value_stripped_or_first_element( 47 | desired_type: type, value_to_parse: Any, expected_result: Any 48 | ): 49 | parsed_value = SmartValueParser[desired_type]().literal_value_stripped_or_first_element( 50 | value_to_parse 51 | ) 52 | assert expected_result == parsed_value 53 | --------------------------------------------------------------------------------
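For reference, a minimal sketch of the gzip round-trip exercised in test_gzip.py above: it compresses one sample document, posts it to the general/v0/general route as application/gzip, and relies on the gz_uncompressed_content_type form field to declare the real filetype. The sample file and content type mirror the test parameters; treat this as an assumption-laden example rather than part of the test suite, and note it presumes the API's Python dependencies are installed locally.

import gzip
import shutil
import tempfile
from pathlib import Path

from fastapi.testclient import TestClient

from prepline_general.api.app import app

MAIN_API_ROUTE = "general/v0/general"


def post_gzipped(filename: str, uncompressed_content_type: str) -> list:
    """Gzip a sample doc, post it, and let the API decompress it server-side."""
    client = TestClient(app)
    source = Path("sample-docs") / filename
    with tempfile.NamedTemporaryFile(suffix=f"{source.suffix}.gz") as tmp:
        # Compress the sample document into the temporary .gz file
        with open(source, "rb") as f_in, gzip.open(tmp.name, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
        # Send it as application/gzip and declare what the payload really is
        response = client.post(
            MAIN_API_ROUTE,
            files=[("files", (tmp.name, open(tmp.name, "rb"), "application/gzip"))],
            data={"gz_uncompressed_content_type": uncompressed_content_type},
        )
    response.raise_for_status()
    return response.json()


if __name__ == "__main__":
    elements = post_gzipped("fake-email.eml", "message/rfc822")
    print(f"parsed {len(elements)} elements")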