├── .coveragerc ├── .github └── workflows │ ├── ci.yml │ └── codeql-analysis.yml ├── .gitignore ├── CHANGELOG.md ├── Dockerfile ├── LICENSE.md ├── MANIFEST.in ├── Makefile ├── README.md ├── img └── unstructured_logo.png ├── requirements ├── base.txt ├── test.in └── test.txt ├── scripts ├── docker-build.sh ├── shellcheck.sh └── version-sync.sh ├── setup.cfg ├── setup.py ├── test_unstructured_api_tools ├── api │ ├── fixtures │ │ ├── example.jpg │ │ ├── example.jpg.gz │ │ ├── fake-email.msg │ │ ├── fake.docx │ │ ├── fake.docx.gz │ │ ├── markdown.md │ │ ├── spring-weather.html.json │ │ ├── text_file.txt │ │ ├── text_file.txt.gz │ │ ├── text_file_2.txt │ │ └── text_file_2.txt.gz │ ├── functions_and_variables.py │ ├── test_docs.py │ ├── test_file_apis.py │ ├── test_file_text_apis.py │ └── test_text_apis.py ├── pipeline-test-project │ ├── README.md │ ├── pipeline-notebooks │ │ ├── pipeline-process-file-1.ipynb │ │ ├── pipeline-process-file-2.ipynb │ │ ├── pipeline-process-file-3.ipynb │ │ ├── pipeline-process-file-4.ipynb │ │ ├── pipeline-process-file-5.ipynb │ │ ├── pipeline-process-text-1.ipynb │ │ ├── pipeline-process-text-2.ipynb │ │ ├── pipeline-process-text-3.ipynb │ │ ├── pipeline-process-text-4.ipynb │ │ ├── pipeline-process-text-file-1.ipynb │ │ ├── pipeline-process-text-file-2.ipynb │ │ ├── pipeline-process-text-file-3.ipynb │ │ └── pipeline-process-text-file-4.ipynb │ ├── prepline_test_project │ │ └── api │ │ │ ├── __init__.py │ │ │ ├── app.py │ │ │ ├── process_file_1.py │ │ │ ├── process_file_2.py │ │ │ ├── process_file_3.py │ │ │ ├── process_file_4.py │ │ │ ├── process_file_5.py │ │ │ ├── process_text_1.py │ │ │ ├── process_text_2.py │ │ │ ├── process_text_3.py │ │ │ ├── process_text_4.py │ │ │ ├── process_text_file_1.py │ │ │ ├── process_text_file_2.py │ │ │ ├── process_text_file_3.py │ │ │ └── process_text_file_4.py │ ├── preprocessing-pipeline-family.yaml │ └── scripts │ │ ├── check-and-format-notebooks.py │ │ └── test-doc-pipeline-apis-consistent.sh ├── pipelines │ ├── test_api_conventions.py │ ├── test_convert.py │ └── test_lint.py └── test_cli.py └── unstructured_api_tools ├── __init__.py ├── __version__.py ├── cli.py └── pipelines ├── __init__.py ├── api_conventions.py ├── convert.py ├── lint.py └── templates ├── pipeline_api.txt └── pipeline_app.txt /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = *.txt 3 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | # NOTE(robinson) - We are limiting when we run CI to avoid exceeding our 2,000 min/month limit. 5 | # We can switch to running on push if we make this repo public or are fine with 6 | # paying for CI minutes. 
7 | push: 8 | branches: [ main ] 9 | pull_request: 10 | branches: [ main ] 11 | 12 | env: 13 | PYTHON_VERSION: 3.8 14 | 15 | jobs: 16 | setup: 17 | strategy: 18 | matrix: 19 | python-version: ["3.8", "3.9", "3.10"] 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: actions/checkout@v3 23 | - uses: actions/cache@v3 24 | id: virtualenv-cache 25 | with: 26 | path: | 27 | .venv 28 | key: ${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements/*.txt') }} 29 | - name: Set up Python ${{ matrix.python-version }} 30 | uses: actions/setup-python@v4 31 | with: 32 | python-version: ${{ matrix.python-version }} 33 | - name: Setup virtual environment (no cache hit) 34 | if: steps.virtualenv-cache.outputs.cache-hit != 'true' 35 | run: | 36 | python${{ matrix.python-version }} -m venv .venv 37 | source .venv/bin/activate 38 | make install-ci 39 | 40 | lint: 41 | runs-on: ubuntu-latest 42 | needs: setup 43 | steps: 44 | - uses: actions/checkout@v3 45 | - uses: actions/cache@v3 46 | id: virtualenv-cache 47 | with: 48 | path: .venv 49 | key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }} 50 | - name: Lint 51 | run: | 52 | source .venv/bin/activate 53 | make check 54 | 55 | shellcheck: 56 | runs-on: ubuntu-latest 57 | steps: 58 | - uses: actions/checkout@v2 59 | - name: ShellCheck 60 | uses: ludeeus/action-shellcheck@master 61 | 62 | test_api_consistency: 63 | strategy: 64 | matrix: 65 | python-version: ["3.8", "3.9", "3.10"] 66 | runs-on: ubuntu-latest 67 | needs: [setup, lint] 68 | steps: 69 | - uses: actions/checkout@v3 70 | - uses: actions/cache@v3 71 | id: virtualenv-cache 72 | with: 73 | path: | 74 | .venv 75 | key: ${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements/*.txt') }} 76 | - name: API Consistency 77 | run: | 78 | source .venv/bin/activate 79 | make api-check-test 80 | 81 | test: 82 | strategy: 83 | matrix: 84 | python-version: [ "3.8", "3.9", "3.10" ] 85 | runs-on: ubuntu-latest 86 | needs: test_api_consistency 87 | steps: 88 | - uses: actions/checkout@v3 89 | - uses: actions/cache@v3 90 | id: virtualenv-cache 91 | with: 92 | path: | 93 | .venv 94 | key: ${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements/*.txt') }} 95 | - name: Test 96 | run: | 97 | source .venv/bin/activate 98 | make test 99 | make check-coverage 100 | 101 | changelog: 102 | runs-on: ubuntu-latest 103 | steps: 104 | - if: github.ref != 'refs/heads/main' 105 | uses: dorny/paths-filter@v2 106 | id: changes 107 | with: 108 | filters: | 109 | src: 110 | - 'unstructured_api_tools/**' 111 | - if: steps.changes.outputs.src == 'true' && github.ref != 'refs/heads/main' 112 | uses: dangoslen/changelog-enforcer@v3 113 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "main" ] 20 | schedule: 21 | - cron: '35 10 * * 3' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v3 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v2 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | 52 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 53 | # queries: security-extended,security-and-quality 54 | 55 | 56 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 57 | # If this step fails, then you should remove it and run the build manually (see below) 58 | - name: Autobuild 59 | uses: github/codeql-action/autobuild@v2 60 | 61 | # ℹ️ Command-line programs to run using the OS shell. 62 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 63 | 64 | # If the Autobuild fails above, remove it and uncomment the following three lines. 65 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 66 | 67 | # - run: | 68 | # echo "Run, Build Application using script" 69 | # ./location_of_script_within_repo/buildscript.sh 70 | 71 | - name: Perform CodeQL Analysis 72 | uses: github/codeql-action/analyze@v2 73 | with: 74 | category: "/language:${{matrix.language}}" 75 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tmp* 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Pycharm 80 | .idea/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | 136 | # API test project test files 137 | /test_unstructured_api_tools/pipeline-test-project/tmp* -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 0.10.11 2 | 3 | * Fix use of the metrics filter for the logger 4 | 5 | # 0.10.10 6 | 7 | * Filter out metrics endpoint requests from logs 8 | 9 | # 0.10.9 10 | 11 | * Fix output formatting for csv responses 12 | 13 | # 0.10.8 14 | 15 | * Add autoflake and duplicate import removal to linting steps 16 | 17 | # 0.10.7 18 | 19 | * Add support for passing request into pipeline 20 | 21 | # 0.10.6 22 | 23 | * Fix ENV variable processing for CORS 24 | 25 | # 0.10.5 26 | 27 | * Add optional CORS to the API 28 | 29 | # 0.10.4 30 | 31 | * Add filter on /healthcheck logs 32 | 33 | # 0.10.3 34 | 35 | * Add support for json and msg file types 36 | 37 | # 0.10.2 38 | 39 | * Set black line length to 100 40 | 41 | # 0.10.1 42 | 43 | * Add ability to request one file as multipart/form-data 44 | 45 | # 0.10.0 46 | 47 | * Update templates for generated API. 48 | * Improve code for accepting gzip files. 
49 | 50 | # 0.9.4 51 | 52 | * Add dynamic openapi_url to match docs_url 53 | 54 | # 0.9.3 55 | 56 | * Removed /healthcheck endpoint from docs 57 | * Add fix for handling content type sent as None 58 | 59 | # 0.9.2 60 | 61 | * Add content_type to error message for unsupported file types 62 | 63 | # 0.9.1 64 | 65 | * Allow references to standard imports in pipeline cells 66 | * Removed unused /healthcheck endpoints 67 | 68 | # 0.9.0 69 | 70 | * Add support for gzip compressed files 71 | 72 | # 0.8.1 73 | 74 | * Removed async/await from endpoints. 75 | * Refactored template for generating endpoints with shorter semver. 76 | 77 | # 0.8.0 78 | 79 | * Add duplicate routes with semver major version 80 | 81 | # 0.7.0 82 | 83 | * Add dynamic docs_url 84 | 85 | # 0.6.0 86 | 87 | * Add file type validation via `UNSTRUCTURED_ALLOWED_MIMETYPES` 88 | 89 | # 0.5.0 90 | 91 | * Removed rate limit and slow API from project. Updated templates and tests. 92 | 93 | # 0.4.9 94 | 95 | * Bug fix: Generated code now consistent across operating systems 96 | 97 | # 0.4.8 98 | 99 | * Add ability to return JSON responses for multiple text_files 100 | 101 | # 0.4.7 102 | 103 | * Notebook conversion organizes module level imports at the top of the file 104 | * Allow for FastAPI metadata to be read from the config file 105 | * Add `__init__.py` to API module and add a default version for FastAPI. 106 | 107 | # 0.4.6 108 | 109 | * Add support for `response_schema` parameter in Pipeline API functions. 110 | 111 | # 0.4.5 112 | 113 | * Fix bug: get the `response_type` value before its first use in the template 114 | 115 | # 0.4.4 116 | 117 | * Implement generation of an app-level FastAPI module. 118 | 119 | # 0.4.3 120 | 121 | * Updates `mypy` type checking code to use `--implicit-optional` 122 | 123 | ## 0.4.2 124 | 125 | * Add types-ujson dependency 126 | 127 | ## 0.4.1 128 | 129 | * Implement feature to allow accepting multiple binary files to the autogenerated pipeline APIs. 130 | 131 | ## 0.4.0 132 | 133 | * Implement feature to allow accepting multiple text files to the autogenerated pipeline APIs. 
134 | 135 | ## 0.3.1 136 | 137 | * Removed the ratelimit on healthchecks 138 | * Dependency bumps 139 | 140 | ## 0.3.0 141 | 142 | * Add the ability to pass Accept MIME type headers to pipeline API's 143 | * Dependency bumps 144 | 145 | ## 0.2.0 146 | 147 | * Initial release of unstructured-api-tools 148 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:experimental 2 | FROM quay.io/unstructured-io/base-images:rocky8.7-2 as base 3 | 4 | RUN yum install -y make 5 | 6 | ARG PIP_VERSION 7 | 8 | # Set up environment 9 | ENV HOME /home/ 10 | WORKDIR ${HOME} 11 | RUN mkdir ${HOME}/.ssh && chmod go-rwx ${HOME}/.ssh \ 12 | && ssh-keyscan -t rsa github.com >> /home/.ssh/known_hosts 13 | ENV PYTHONPATH="${PYTHONPATH}:${HOME}" 14 | ENV PATH="/home/usr/.local/bin:${PATH}" 15 | 16 | FROM base as deps 17 | # Copy and install Unstructured 18 | COPY requirements requirements 19 | 20 | RUN python3.8 -m pip install pip==${PIP_VERSION} && \ 21 | dnf -y groupinstall "Development Tools" && \ 22 | pip install --no-cache -r requirements/base.txt && \ 23 | pip install --no-cache -r requirements/test.txt && \ 24 | dnf -y groupremove "Development Tools" && \ 25 | dnf clean all 26 | 27 | FROM deps as code 28 | COPY Makefile Makefile 29 | 30 | CMD ["/bin/bash"] 31 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2022 Unstructured Technologies, Inc 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include unstructured_api_tools/pipelines/templates/*.txt 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PACKAGE_NAME := unstructured_api_tools 2 | PIP_VERSION := 22.2.1 3 | CURRENT_DIR := $(shell pwd) 4 | 5 | 6 | .PHONY: help 7 | help: Makefile 8 | @sed -n 's/^\(## \)\([a-zA-Z]\)/\2/p' $< 9 | 10 | 11 | ########### 12 | # Install # 13 | ########### 14 | 15 | ## install: installs all base and test requirements 16 | .PHONY: install 17 | install: install-base install-test 18 | 19 | .PHONY: install-ci 20 | install-ci: install 21 | 22 | .PHONY: install-base 23 | install-base: 24 | python3 -m pip install pip==${PIP_VERSION} 25 | pip install -r requirements/base.txt 26 | 27 | .PHONY: install-test 28 | install-test: 29 | pip install -r requirements/test.txt 30 | 31 | ## pip-compile: compiles all base and test requirements 32 | .PHONY: pip-compile 33 | pip-compile: 34 | # NOTE(crag): you have to manually install pip-tools for now to run this. 35 | # There is a better way to do this with a pinned pip-compile version and a venv. 36 | bash -c "pip-compile -h >/dev/null || { echo please run \'pip install pip-tools\' and then rerun this command; exit 1; }" 37 | pip-compile --upgrade -o requirements/base.txt 38 | pip-compile --upgrade -o requirements/test.txt requirements/base.txt requirements/test.in 39 | 40 | ## install-project-local: install unstructured_api_tools into your local python environment 41 | .PHONY: install-project-local 42 | install-project-local: install 43 | # MAYBE TODO: fail if already exists? 44 | pip install -e . 45 | 46 | ## uninstall-project-local: uninstall unstructured_api_tools from your local python environment 47 | .PHONY: uninstall-project-local 48 | uninstall-project-local: 49 | pip uninstall ${PACKAGE_NAME} 50 | 51 | ################# 52 | # Test and Lint # 53 | ################# 54 | 55 | ## run-jupyter-test-notebooks: starts jupyter, allows execution of test notebooks 56 | .PHONY: run-jupyter-test-notebooks 57 | run-jupyter-test-notebooks: 58 | PYTHONPATH=$(realpath .)/test_unstructured_api_tools/pipeline-test-project/ JUPYTER_PATH=$(realpath .)/test_unstructured_api_tools/pipeline-test-project/ jupyter-notebook --NotebookApp.token='' --NotebookApp.password='' 59 | 60 | ## tidy-test-notebooks: execute notebooks and remove metadata 61 | .PHONY: tidy-test-notebooks 62 | tidy-test-notebooks: 63 | PYTHONPATH=. ./test_unstructured_api_tools/pipeline-test-project/scripts/check-and-format-notebooks.py 64 | 65 | ## generate-test-api: generates FastAPIs under ./test_unstructured_api_tools/pipeline-test-project 66 | .PHONY: generate-test-api 67 | generate-test-api: 68 | # generates FastAPI API's from notebooks in the test project ./test_unstructured_api_tools/pipeline-test-project 69 | PYTHONPATH=. 
PIPELINE_FAMILY_CONFIG=test_unstructured_api_tools/pipeline-test-project/preprocessing-pipeline-family.yaml \ 70 | python3 ./unstructured_api_tools/cli.py convert-pipeline-notebooks \ 71 | --input-directory ./test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks \ 72 | --output-directory ./test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api 73 | 74 | 75 | ## api-check-test: verifies auto-generated pipeline APIs match the existing ones 76 | .PHONY: api-check-test 77 | api-check-test: 78 | PYTHONPATH=. PACKAGE_NAME=prepline_test_project ./test_unstructured_api_tools/pipeline-test-project/scripts/test-doc-pipeline-apis-consistent.sh 79 | 80 | 81 | ## test: runs all unittests 82 | .PHONY: test 83 | test: 84 | PYTHONPATH=.:./test_unstructured_api_tools/pipeline-test-project pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov=prepline_test_project --cov-report term-missing -vvv 85 | 86 | ## check: runs linters (includes tests) 87 | .PHONY: check 88 | check: check-src check-tests check-version 89 | 90 | ## check-src: runs linters (source only, no tests) 91 | .PHONY: check-src 92 | check-src: 93 | black --line-length 100 ${PACKAGE_NAME} --check 94 | flake8 ${PACKAGE_NAME} 95 | mypy ${PACKAGE_NAME} --ignore-missing-imports --install-types --non-interactive 96 | autoflake --remove-unused-variables --remove-duplicate-keys --expand-star-imports \ 97 | --remove-all-unused-imports -cd -r ${PACKAGE_NAME} test_${PACKAGE_NAME} \ 98 | --exclude test_${PACKAGE_NAME}/pipeline-test-project 99 | 100 | 101 | .PHONY: check-tests 102 | check-tests: 103 | black --line-length 100 test_${PACKAGE_NAME} --check --exclude test_${PACKAGE_NAME}/pipeline-test-project 104 | flake8 test_${PACKAGE_NAME} --exclude test_${PACKAGE_NAME}/pipeline-test-project/prepline_test_project/api 105 | 106 | ## check-scripts: run shellcheck 107 | .PHONY: check-scripts 108 | check-scripts: 109 | # Fail if any of these files have warnings 110 | scripts/shellcheck.sh 111 | 112 | ## check-version: run check to ensure version in CHANGELOG.md matches version in package 113 | .PHONY: check-version 114 | check-version: 115 | # Fail if syncing version would produce changes 116 | scripts/version-sync.sh -c \ 117 | -f ${PACKAGE_NAME}/__version__.py semver 118 | 119 | ## tidy: run black 120 | .PHONY: tidy 121 | tidy: tidy-black tidy-autoflake 122 | 123 | tidy-autoflake: 124 | autoflake --remove-unused-variables --remove-duplicate-keys --expand-star-imports \ 125 | --remove-all-unused-imports -i -r ${PACKAGE_NAME} test_${PACKAGE_NAME} \ 126 | --exclude test_${PACKAGE_NAME}/pipeline-test-project 127 | 128 | 129 | tidy-black: 130 | black --line-length 100 ${PACKAGE_NAME} 131 | black --line-length 100 test_${PACKAGE_NAME} --exclude test_${PACKAGE_NAME}/pipeline-test-project 132 | 133 | 134 | ## version-sync: update __version__.py with most recent version from CHANGELOG.md 135 | .PHONY: version-sync 136 | version-sync: 137 | scripts/version-sync.sh \ 138 | -f ${PACKAGE_NAME}/__version__.py semver 139 | 140 | .PHONY: check-coverage 141 | check-coverage: 142 | # TODO(crag): add coverage check for test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/ 143 | coverage report --fail-under=95 144 | 145 | ########## 146 | # Docker # 147 | ########## 148 | 149 | # Docker targets are provided for convenience only and are not required in a standard development environment 150 | 151 | DOCKER_IMAGE ?= unstructured-api-tools:dev 152 | 153 | .PHONY: docker-build 154 | docker-build: 155 | 
PIP_VERSION=${PIP_VERSION} DOCKER_IMAGE_NAME=${DOCKER_IMAGE} ./scripts/docker-build.sh 156 | 157 | .PHONY: docker-start-bash 158 | docker-start-bash: 159 | docker run -ti --rm ${DOCKER_IMAGE} 160 | 161 | .PHONY: docker-test 162 | docker-test: docker-build 163 | docker run --rm \ 164 | -v ${CURRENT_DIR}/test_unstructured_api_tools:/home/test_unstructured_api_tools \ 165 | -v ${CURRENT_DIR}/unstructured_api_tools:/home/unstructured_api_tools \ 166 | $(DOCKER_IMAGE) \ 167 | bash -c "make test" 168 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | <h3 align="center"><img src="img/unstructured_logo.png"></h3>
 7 | <h3 align="center"><p>Open-Source Pre-Processing Tools for Unstructured Data</p></h3>

11 | 12 | 13 | The `unstructured_api_tools` library includes utilities for converting pipeline notebooks into 14 | REST API applications. `unstructured_api_tools` is intended for use in conjunction with 15 | pipeline repos. See [`pipeline-sec-filings`](https://github.com/Unstructured-IO/pipeline-sec-filings) 16 | for an example of a repo that uses `unstructured_api_tools`. 17 | 18 | ## Installation 19 | 20 | To install the library, run `pip install unstructured_api_tools`. 21 | 22 | ## Developer Quick Start 23 | 24 | * Using `pyenv` to manage virtualenvs is recommended 25 | * Mac install instructions (see [here](https://github.com/Unstructured-IO/community#mac--homebrew) for more detail): 26 | * `brew install pyenv-virtualenv` 27 | * `pyenv install 3.8.15` 28 | * Linux instructions are available [here](https://github.com/Unstructured-IO/community#linux). 29 | 30 | * Create a virtualenv to work in and activate it, e.g. for one named `unstructured_api_tools`: 32 | 
 `pyenv virtualenv 3.8.15 unstructured_api_tools`
33 | `pyenv activate unstructured_api_tools` 34 | 35 | * Run `make install-project-local` 36 | 37 | ## Usage 38 | 39 | Use the CLI command to convert pipeline notebooks to scripts, for example: 40 | 41 | ```bash 42 | unstructured_api_tools convert-pipeline-notebooks \ 43 | --input-directory pipeline-family-sec-filings/pipeline-notebooks \ 44 | --output-directory pipeline-family-sec-filings/prepline_sec_filings/api \ 45 | --pipeline-family sec-filings \ 46 | --semver 0.2.1 47 | ``` 48 | 49 | If you do not provide the `pipeline-family` and `semver` arguments, those values are parsed from 50 | `preprocessing-pipeline-family.yaml`. You can provide the `preprocessing-pipeline-family.yaml` file 51 | explicitly with `--config-filename` or the `PIPELINE_FAMILY_CONFIG` environment variable. If neither 52 | of those is specified, the fallback is to use the `preprocessing-pipeline-family.yaml` file in the 53 | current working directory. 54 | 55 | The API file undergoes `black`, `flake8` and `mypy` checks after being generated. If you want 56 | `flake8` to ignore specific errors, you can specify them through the CLI with 57 | `--flake8-ignore F401, E402`. 58 | See the [`flake8` docs](https://flake8.pycqa.org/en/latest/user/error-codes.html#error-violation-codes) 59 | for a full list of error codes. 60 | 61 | ### Conversion from `pipeline_api` to FastAPI 62 | 63 | The command described in [**Usage**](#usage) generates a FastAPI API route for each `pipeline_api` 64 | function defined in the notebook. The signature of the `pipeline_api` method determines what 65 | parameters the generated FastAPI accepts. 66 | 67 | For plain text uploads, the first argument must always be 68 | `text`, which receives the contents of the posted file; per the CHANGELOG, support for multiple text files and for binary files has since been added as well. 69 | 70 | In addition, any number of string array parameters may be specified. Any kwarg beginning with 71 | `m_` indicates a multi-value string parameter that is accepted by the FastAPI API. 72 | 73 | For example, in a notebook containing: 74 | 75 | def pipeline_api(text, m_subject=[], m_name=[]): 76 | 77 | `text` represents the content of a file posted to the FastAPI API, and the `m_subject` and `m_name` 78 | keyword args represent optional parameters that may be posted to the API as well, both allowing 79 | multiple string parameters. A `curl` request against such an API could look like this: 80 | 81 | curl -X 'POST' \ 82 | 'https://<host>/<pipeline-family>/<semver>/<api-name>' \ 83 | -H 'accept: application/json' \ 84 | -H 'Content-Type: multipart/form-data' \ 85 | -F 'file=@file-to-process.txt' \ 86 | -F 'subject=art' \ 87 | -F 'subject=history' \ 88 | -F 'subject=math' \ 89 | -F 'name=feynman' 90 | 91 | In addition, you can specify the response type if `pipeline_api` can support both "application/json" 92 | and "text/csv" as return types. 93 | 94 | For example, in a notebook containing a kwarg `response_type`: 95 | 96 | def pipeline_api(text, response_type="text/csv", m_subject=[], m_name=[]): 97 | 98 | The consumer of the API may then specify "text/csv" as the requested response content type with the usual 99 | HTTP Accept header, e.g. `Accept: application/json` or `Accept: text/csv`. A minimal example combining these conventions is sketched in the example section below. 100 | 101 | ## Security Policy 102 | 103 | See our [security policy](https://github.com/Unstructured-IO/unstructured-api-tools/security/policy) for 104 | information on how to report security vulnerabilities. 
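## Example `pipeline_api`

As referenced in the conversion section above, here is a minimal sketch of a notebook `pipeline_api` function that combines the documented conventions: `text` as the first argument, an optional `response_type`, and `m_`-prefixed multi-value kwargs. The function body is a toy subject counter invented for illustration and is not part of this library; only the signature conventions come from the documentation above.

```python
from typing import List, Union


def pipeline_api(
    text: str,
    response_type: str = "application/json",  # may also be requested as "text/csv"
    m_subject: List[str] = [],  # posted as repeated `subject` form fields
    m_name: List[str] = [],  # posted as repeated `name` form fields
) -> Union[dict, str]:
    """Toy pipeline: count how many lines of the posted file mention each subject."""
    lines = text.splitlines()
    rows = [
        {"subject": subject, "mentions": sum(subject in line for line in lines)}
        for subject in m_subject
    ]
    if response_type == "text/csv":
        # Returned as-is when the client sends `Accept: text/csv`.
        return "subject,mentions\n" + "\n".join(
            f"{row['subject']},{row['mentions']}" for row in rows
        )
    return {"names": m_name, "rows": rows}
```

Running `convert-pipeline-notebooks` against a notebook defining a function like this generates a FastAPI route that accepts a file upload plus repeated `subject` and `name` form fields, as in the `curl` example above.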
105 | 106 | ## Learn more 107 | 108 | | Section | Description | 109 | |-|-| 110 | | [Company Website](https://unstructured.io) | Unstructured.io product and company info | 111 | -------------------------------------------------------------------------------- /img/unstructured_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/img/unstructured_logo.png -------------------------------------------------------------------------------- /requirements/base.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.8 3 | # by the following command: 4 | # 5 | # pip-compile --output-file=requirements/base.txt 6 | # 7 | anyio==3.6.2 8 | # via 9 | # starlette 10 | # watchfiles 11 | attrs==22.2.0 12 | # via jsonschema 13 | autoflake==2.1.1 14 | # via unstructured-api-tools (setup.py) 15 | beautifulsoup4==4.12.1 16 | # via nbconvert 17 | bleach==6.0.0 18 | # via nbconvert 19 | click==8.1.3 20 | # via 21 | # unstructured-api-tools (setup.py) 22 | # uvicorn 23 | defusedxml==0.7.1 24 | # via nbconvert 25 | fastapi==0.95.0 26 | # via unstructured-api-tools (setup.py) 27 | fastjsonschema==2.16.3 28 | # via nbformat 29 | h11==0.14.0 30 | # via uvicorn 31 | httptools==0.5.0 32 | # via uvicorn 33 | idna==3.4 34 | # via anyio 35 | importlib-metadata==6.1.0 36 | # via 37 | # jupyter-client 38 | # nbconvert 39 | importlib-resources==5.12.0 40 | # via jsonschema 41 | jinja2==3.1.2 42 | # via 43 | # nbconvert 44 | # unstructured-api-tools (setup.py) 45 | jsonschema==4.17.3 46 | # via nbformat 47 | jupyter-client==8.1.0 48 | # via nbclient 49 | jupyter-core==5.3.0 50 | # via 51 | # jupyter-client 52 | # nbclient 53 | # nbconvert 54 | # nbformat 55 | jupyterlab-pygments==0.2.2 56 | # via nbconvert 57 | markupsafe==2.1.2 58 | # via 59 | # jinja2 60 | # nbconvert 61 | mistune==2.0.5 62 | # via nbconvert 63 | mypy==1.2.0 64 | # via unstructured-api-tools (setup.py) 65 | mypy-extensions==1.0.0 66 | # via mypy 67 | nbclient==0.7.3 68 | # via nbconvert 69 | nbconvert==7.3.0 70 | # via unstructured-api-tools (setup.py) 71 | nbformat==5.8.0 72 | # via 73 | # nbclient 74 | # nbconvert 75 | numpy==1.24.3 76 | # via pandas 77 | packaging==23.0 78 | # via nbconvert 79 | pandas==2.0.2 80 | # via unstructured-api-tools (setup.py) 81 | pandocfilters==1.5.0 82 | # via nbconvert 83 | pkgutil-resolve-name==1.3.10 84 | # via jsonschema 85 | platformdirs==3.2.0 86 | # via jupyter-core 87 | pydantic==1.10.7 88 | # via fastapi 89 | pyflakes==3.0.1 90 | # via autoflake 91 | pygments==2.14.0 92 | # via nbconvert 93 | pyrsistent==0.19.3 94 | # via jsonschema 95 | python-dateutil==2.8.2 96 | # via 97 | # jupyter-client 98 | # pandas 99 | python-dotenv==1.0.0 100 | # via uvicorn 101 | python-multipart==0.0.6 102 | # via unstructured-api-tools (setup.py) 103 | pytz==2023.3 104 | # via pandas 105 | pyyaml==6.0 106 | # via uvicorn 107 | pyzmq==25.0.2 108 | # via jupyter-client 109 | six==1.16.0 110 | # via 111 | # bleach 112 | # python-dateutil 113 | sniffio==1.3.0 114 | # via anyio 115 | soupsieve==2.4 116 | # via beautifulsoup4 117 | starlette==0.26.1 118 | # via fastapi 119 | tinycss2==1.2.1 120 | # via nbconvert 121 | tomli==2.0.1 122 | # via 123 | # autoflake 124 | # mypy 125 | tornado==6.2 126 | # via jupyter-client 127 | traitlets==5.9.0 128 | # via 129 | # jupyter-client 130 | # jupyter-core 131 
| # nbclient 132 | # nbconvert 133 | # nbformat 134 | types-requests==2.28.11.17 135 | # via unstructured-api-tools (setup.py) 136 | types-ujson==5.7.0.1 137 | # via unstructured-api-tools (setup.py) 138 | types-urllib3==1.26.25.10 139 | # via types-requests 140 | typing-extensions==4.5.0 141 | # via 142 | # mypy 143 | # pydantic 144 | # starlette 145 | tzdata==2023.3 146 | # via pandas 147 | uvicorn[standard]==0.21.1 148 | # via unstructured-api-tools (setup.py) 149 | uvloop==0.17.0 150 | # via uvicorn 151 | watchfiles==0.19.0 152 | # via uvicorn 153 | webencodings==0.5.1 154 | # via 155 | # bleach 156 | # tinycss2 157 | websockets==11.0.1 158 | # via uvicorn 159 | zipp==3.15.0 160 | # via 161 | # importlib-metadata 162 | # importlib-resources 163 | -------------------------------------------------------------------------------- /requirements/test.in: -------------------------------------------------------------------------------- 1 | black>=22.3.0 2 | coverage 3 | flake8 4 | httpx 5 | # NOTE(robinson) - Pinning version due to the NotOneFoundException crash described here. 6 | # ref: https://github.com/ipython/ipython/issues/13598 7 | ipython>=8.9.0 8 | pytest-cov 9 | # NOTE(mrobinson) - requests is needed for the fastapi test client 10 | requests 11 | requests_toolbelt 12 | nbdev 13 | pytest-mock 14 | -------------------------------------------------------------------------------- /requirements/test.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.8 3 | # by the following command: 4 | # 5 | # pip-compile --output-file=requirements/test.txt requirements/base.txt requirements/test.in 6 | # 7 | anyio==3.6.2 8 | # via 9 | # -r requirements/base.txt 10 | # httpcore 11 | # starlette 12 | # watchfiles 13 | asttokens==2.2.1 14 | # via 15 | # nbdev 16 | # stack-data 17 | astunparse==1.6.3 18 | # via nbdev 19 | attrs==22.2.0 20 | # via 21 | # -r requirements/base.txt 22 | # jsonschema 23 | # pytest 24 | backcall==0.2.0 25 | # via ipython 26 | beautifulsoup4==4.12.1 27 | # via 28 | # -r requirements/base.txt 29 | # nbconvert 30 | black==23.3.0 31 | # via -r requirements/test.in 32 | bleach==6.0.0 33 | # via 34 | # -r requirements/base.txt 35 | # nbconvert 36 | certifi==2022.12.7 37 | # via 38 | # httpcore 39 | # httpx 40 | # requests 41 | charset-normalizer==3.1.0 42 | # via requests 43 | click==8.1.3 44 | # via 45 | # -r requirements/base.txt 46 | # black 47 | # uvicorn 48 | coverage[toml]==7.2.3 49 | # via 50 | # -r requirements/test.in 51 | # pytest-cov 52 | decorator==5.1.1 53 | # via ipython 54 | defusedxml==0.7.1 55 | # via 56 | # -r requirements/base.txt 57 | # nbconvert 58 | exceptiongroup==1.1.1 59 | # via pytest 60 | execnb==0.1.5 61 | # via nbdev 62 | executing==1.2.0 63 | # via stack-data 64 | fastapi==0.95.0 65 | # via -r requirements/base.txt 66 | fastcore==1.5.29 67 | # via 68 | # execnb 69 | # ghapi 70 | # nbdev 71 | fastjsonschema==2.16.3 72 | # via 73 | # -r requirements/base.txt 74 | # nbformat 75 | flake8==6.0.0 76 | # via -r requirements/test.in 77 | ghapi==1.0.3 78 | # via nbdev 79 | h11==0.14.0 80 | # via 81 | # -r requirements/base.txt 82 | # httpcore 83 | # uvicorn 84 | httpcore==0.16.3 85 | # via httpx 86 | httptools==0.5.0 87 | # via 88 | # -r requirements/base.txt 89 | # uvicorn 90 | httpx==0.23.3 91 | # via -r requirements/test.in 92 | idna==3.4 93 | # via 94 | # -r requirements/base.txt 95 | # anyio 96 | # requests 97 | # rfc3986 98 | importlib-metadata==6.1.0 99 | 
# via 100 | # -r requirements/base.txt 101 | # jupyter-client 102 | # nbconvert 103 | importlib-resources==5.12.0 104 | # via 105 | # -r requirements/base.txt 106 | # jsonschema 107 | iniconfig==2.0.0 108 | # via pytest 109 | ipython==8.12.0 110 | # via 111 | # -r requirements/test.in 112 | # execnb 113 | jedi==0.18.2 114 | # via ipython 115 | jinja2==3.1.2 116 | # via 117 | # -r requirements/base.txt 118 | # nbconvert 119 | jsonschema==4.17.3 120 | # via 121 | # -r requirements/base.txt 122 | # nbformat 123 | jupyter-client==8.1.0 124 | # via 125 | # -r requirements/base.txt 126 | # nbclient 127 | jupyter-core==5.3.0 128 | # via 129 | # -r requirements/base.txt 130 | # jupyter-client 131 | # nbclient 132 | # nbconvert 133 | # nbformat 134 | jupyterlab-pygments==0.2.2 135 | # via 136 | # -r requirements/base.txt 137 | # nbconvert 138 | markupsafe==2.1.2 139 | # via 140 | # -r requirements/base.txt 141 | # jinja2 142 | # nbconvert 143 | matplotlib-inline==0.1.6 144 | # via ipython 145 | mccabe==0.7.0 146 | # via flake8 147 | mistune==2.0.5 148 | # via 149 | # -r requirements/base.txt 150 | # nbconvert 151 | mypy==1.2.0 152 | # via -r requirements/base.txt 153 | mypy-extensions==1.0.0 154 | # via 155 | # -r requirements/base.txt 156 | # black 157 | # mypy 158 | nbclient==0.7.3 159 | # via 160 | # -r requirements/base.txt 161 | # nbconvert 162 | nbconvert==7.3.0 163 | # via -r requirements/base.txt 164 | nbdev==2.3.12 165 | # via -r requirements/test.in 166 | nbformat==5.8.0 167 | # via 168 | # -r requirements/base.txt 169 | # nbclient 170 | # nbconvert 171 | packaging==23.0 172 | # via 173 | # -r requirements/base.txt 174 | # black 175 | # fastcore 176 | # ghapi 177 | # nbconvert 178 | # pytest 179 | pandocfilters==1.5.0 180 | # via 181 | # -r requirements/base.txt 182 | # nbconvert 183 | parso==0.8.3 184 | # via jedi 185 | pathspec==0.11.1 186 | # via black 187 | pexpect==4.8.0 188 | # via ipython 189 | pickleshare==0.7.5 190 | # via ipython 191 | pkgutil-resolve-name==1.3.10 192 | # via 193 | # -r requirements/base.txt 194 | # jsonschema 195 | platformdirs==3.2.0 196 | # via 197 | # -r requirements/base.txt 198 | # black 199 | # jupyter-core 200 | pluggy==1.0.0 201 | # via pytest 202 | prompt-toolkit==3.0.38 203 | # via ipython 204 | ptyprocess==0.7.0 205 | # via pexpect 206 | pure-eval==0.2.2 207 | # via stack-data 208 | pycodestyle==2.10.0 209 | # via flake8 210 | pydantic==1.10.7 211 | # via 212 | # -r requirements/base.txt 213 | # fastapi 214 | pyflakes==3.0.1 215 | # via flake8 216 | pygments==2.14.0 217 | # via 218 | # -r requirements/base.txt 219 | # ipython 220 | # nbconvert 221 | pyrsistent==0.19.3 222 | # via 223 | # -r requirements/base.txt 224 | # jsonschema 225 | pytest==7.2.2 226 | # via 227 | # pytest-cov 228 | # pytest-mock 229 | pytest-cov==4.0.0 230 | # via -r requirements/test.in 231 | pytest-mock==3.10.0 232 | # via -r requirements/test.in 233 | python-dateutil==2.8.2 234 | # via 235 | # -r requirements/base.txt 236 | # jupyter-client 237 | python-dotenv==1.0.0 238 | # via 239 | # -r requirements/base.txt 240 | # uvicorn 241 | python-multipart==0.0.6 242 | # via -r requirements/base.txt 243 | pyyaml==6.0 244 | # via 245 | # -r requirements/base.txt 246 | # nbdev 247 | # uvicorn 248 | pyzmq==25.0.2 249 | # via 250 | # -r requirements/base.txt 251 | # jupyter-client 252 | requests==2.28.2 253 | # via 254 | # -r requirements/test.in 255 | # requests-toolbelt 256 | requests-toolbelt==0.10.1 257 | # via -r requirements/test.in 258 | rfc3986[idna2008]==1.5.0 259 | # via httpx 
260 | six==1.16.0 261 | # via 262 | # -r requirements/base.txt 263 | # asttokens 264 | # astunparse 265 | # bleach 266 | # python-dateutil 267 | sniffio==1.3.0 268 | # via 269 | # -r requirements/base.txt 270 | # anyio 271 | # httpcore 272 | # httpx 273 | soupsieve==2.4 274 | # via 275 | # -r requirements/base.txt 276 | # beautifulsoup4 277 | stack-data==0.6.2 278 | # via ipython 279 | starlette==0.26.1 280 | # via 281 | # -r requirements/base.txt 282 | # fastapi 283 | tinycss2==1.2.1 284 | # via 285 | # -r requirements/base.txt 286 | # nbconvert 287 | tomli==2.0.1 288 | # via 289 | # -r requirements/base.txt 290 | # black 291 | # coverage 292 | # mypy 293 | # pytest 294 | tornado==6.2 295 | # via 296 | # -r requirements/base.txt 297 | # jupyter-client 298 | traitlets==5.9.0 299 | # via 300 | # -r requirements/base.txt 301 | # ipython 302 | # jupyter-client 303 | # jupyter-core 304 | # matplotlib-inline 305 | # nbclient 306 | # nbconvert 307 | # nbformat 308 | types-requests==2.28.11.17 309 | # via -r requirements/base.txt 310 | types-ujson==5.7.0.1 311 | # via -r requirements/base.txt 312 | types-urllib3==1.26.25.10 313 | # via 314 | # -r requirements/base.txt 315 | # types-requests 316 | typing-extensions==4.5.0 317 | # via 318 | # -r requirements/base.txt 319 | # black 320 | # ipython 321 | # mypy 322 | # pydantic 323 | # starlette 324 | urllib3==1.26.15 325 | # via requests 326 | uvicorn[standard]==0.21.1 327 | # via -r requirements/base.txt 328 | uvloop==0.17.0 329 | # via 330 | # -r requirements/base.txt 331 | # uvicorn 332 | watchdog==3.0.0 333 | # via nbdev 334 | watchfiles==0.19.0 335 | # via 336 | # -r requirements/base.txt 337 | # uvicorn 338 | wcwidth==0.2.6 339 | # via prompt-toolkit 340 | webencodings==0.5.1 341 | # via 342 | # -r requirements/base.txt 343 | # bleach 344 | # tinycss2 345 | websockets==11.0.1 346 | # via 347 | # -r requirements/base.txt 348 | # uvicorn 349 | wheel==0.40.0 350 | # via astunparse 351 | zipp==3.15.0 352 | # via 353 | # -r requirements/base.txt 354 | # importlib-metadata 355 | # importlib-resources 356 | 357 | # The following packages are considered to be unsafe in a requirements file: 358 | # pip 359 | -------------------------------------------------------------------------------- /scripts/docker-build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | DOCKER_REPOSITORY="${DOCKER_REPOSITORY:-quay.io/unstructured-io/unstructured}" 5 | PIP_VERSION="${PIP_VERSION:-23.1.2}" 6 | DOCKER_IMAGE="${DOCKER_IMAGE:-unstructured-api-tools:dev}" 7 | 8 | DOCKER_BUILD_CMD=(docker buildx build --load -f Dockerfile \ 9 | --build-arg PIP_VERSION="$PIP_VERSION" \ 10 | --build-arg BUILDKIT_INLINE_CACHE=1 \ 11 | --progress plain \ 12 | --cache-from "$DOCKER_REPOSITORY":latest \ 13 | -t "$DOCKER_IMAGE" .) 
14 | 15 | # only build for specific platform if DOCKER_BUILD_PLATFORM is set 16 | if [ -n "${DOCKER_BUILD_PLATFORM:-}" ]; then 17 | DOCKER_BUILD_CMD+=("--platform=$DOCKER_BUILD_PLATFORM") 18 | fi 19 | 20 | DOCKER_BUILDKIT=1 "${DOCKER_BUILD_CMD[@]}" 21 | -------------------------------------------------------------------------------- /scripts/shellcheck.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | find scripts -name "*.sh" -exec shellcheck {} + 4 | 5 | -------------------------------------------------------------------------------- /scripts/version-sync.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | function usage { 3 | echo "Usage: $(basename "$0") [-c] -f FILE_TO_CHANGE REPLACEMENT_FORMAT [-f FILE_TO_CHANGE REPLACEMENT_FORMAT ...]" 2>&1 4 | echo 'Synchronize files to latest version in source file' 5 | echo ' -s Specifies source file for version (default is CHANGELOG.md)' 6 | echo ' -f Specifies a file to change and the format for searching and replacing versions' 7 | echo ' FILE_TO_CHANGE is the file to be updated/checked for updates' 8 | echo ' REPLACEMENT_FORMAT is one of (semver, release, api-release)' 9 | echo ' semver indicates to look for a full semver version and replace with the latest full version' 10 | echo ' release indicates to look for a release semver version (x.x.x) and replace with the latest release version' 11 | echo ' api-release indicates to look for a release semver version in the context of an api route and replace with the latest release version' 12 | echo ' -c Compare versions and output proposed changes without changing anything.' 13 | } 14 | 15 | function getopts-extra () { 16 | declare i=1 17 | # if the next argument is not an option, then append it to array OPTARG 18 | while [[ ${OPTIND} -le $# && ${!OPTIND:0:1} != '-' ]]; do 19 | OPTARG[i]=${!OPTIND} 20 | i+=1 21 | OPTIND+=1 22 | done 23 | } 24 | 25 | # Parse input options 26 | declare CHECK=0 27 | declare SOURCE_FILE="CHANGELOG.md" 28 | declare -a FILES_TO_CHECK=() 29 | declare -a REPLACEMENT_FORMATS=() 30 | declare args 31 | declare OPTIND OPTARG opt 32 | while getopts ":hcs:f:" opt; do 33 | case $opt in 34 | h) 35 | usage 36 | exit 0 37 | ;; 38 | c) 39 | CHECK=1 40 | ;; 41 | s) 42 | SOURCE_FILE="$OPTARG" 43 | ;; 44 | f) 45 | getopts-extra "$@" 46 | args=( "${OPTARG[@]}" ) 47 | # validate length of args, should be 2 48 | if [ ${#args[@]} -eq 2 ]; then 49 | FILES_TO_CHECK+=( "${args[0]}" ) 50 | REPLACEMENT_FORMATS+=( "${args[1]}" ) 51 | else 52 | echo "Exactly 2 arguments must follow -f option." >&2 53 | exit 1 54 | fi 55 | ;; 56 | \?) 57 | echo "Invalid option: -$OPTARG." >&2 58 | usage 59 | exit 1 60 | ;; 61 | esac 62 | done 63 | 64 | # Parse REPLACEMENT_FORMATS 65 | RE_SEMVER_FULL="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-((0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?" 66 | RE_RELEASE="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)" 67 | RE_API_RELEASE="v(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)" 68 | # Pull out semver appearing earliest in SOURCE_FILE. 
69 | LAST_VERSION=$(grep -o -m 1 -E "${RE_SEMVER_FULL}" "$SOURCE_FILE") 70 | LAST_RELEASE=$(grep -o -m 1 -E "${RE_RELEASE}($|[^-+])" "$SOURCE_FILE" | grep -o -m 1 -E "${RE_RELEASE}") 71 | LAST_API_RELEASE="v$(grep -o -m 1 -E "${RE_RELEASE}($|[^-+])$" "$SOURCE_FILE" | grep -o -m 1 -E "${RE_RELEASE}")" 72 | declare -a RE_SEMVERS=() 73 | declare -a UPDATED_VERSIONS=() 74 | for i in "${!REPLACEMENT_FORMATS[@]}"; do 75 | REPLACEMENT_FORMAT=${REPLACEMENT_FORMATS[$i]} 76 | case $REPLACEMENT_FORMAT in 77 | semver) 78 | RE_SEMVERS+=( "$RE_SEMVER_FULL" ) 79 | UPDATED_VERSIONS+=( "$LAST_VERSION" ) 80 | ;; 81 | release) 82 | RE_SEMVERS+=( "$RE_RELEASE" ) 83 | UPDATED_VERSIONS+=( "$LAST_RELEASE" ) 84 | ;; 85 | api-release) 86 | RE_SEMVERS+=( "$RE_API_RELEASE" ) 87 | UPDATED_VERSIONS+=( "$LAST_API_RELEASE" ) 88 | ;; 89 | *) 90 | echo "Invalid replacement format: \"${REPLACEMENT_FORMAT}\". Use semver, release, or api-release" >&2 91 | exit 1 92 | ;; 93 | esac 94 | done 95 | 96 | if [ -z "$LAST_VERSION" ]; 97 | then 98 | # No match to semver regex in SOURCE_FILE, so no version to go from. 99 | printf "Error: Unable to find latest version from %s.\n" "$SOURCE_FILE" 100 | exit 1 101 | fi 102 | 103 | # Search files in FILES_TO_CHECK and change (or get diffs) 104 | declare FAILED_CHECK=0 105 | 106 | for i in "${!FILES_TO_CHECK[@]}"; do 107 | FILE_TO_CHANGE=${FILES_TO_CHECK[$i]} 108 | RE_SEMVER=${RE_SEMVERS[$i]} 109 | UPDATED_VERSION=${UPDATED_VERSIONS[$i]} 110 | FILE_VERSION=$(grep -o -m 1 -E "${RE_SEMVER}" "$FILE_TO_CHANGE") 111 | if [ -z "$FILE_VERSION" ]; 112 | then 113 | # No match to semver regex in VERSIONFILE, so nothing to replace 114 | printf "Error: No semver version found in file %s.\n" "$FILE_TO_CHANGE" 115 | exit 1 116 | else 117 | # Replace semver in VERSIONFILE with semver obtained from SOURCE_FILE 118 | TMPFILE=$(mktemp /tmp/new_version.XXXXXX) 119 | # Check sed version, exit if version < 4.3 120 | if ! sed --version > /dev/null 2>&1; then 121 | CURRENT_VERSION=1.archaic 122 | else 123 | CURRENT_VERSION=$(sed --version | head -n1 | cut -d" " -f4) 124 | fi 125 | REQUIRED_VERSION="4.3" 126 | if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$CURRENT_VERSION" | sort -V | head -n1)" != "$REQUIRED_VERSION" ]; then 127 | echo "sed version must be >= ${REQUIRED_VERSION}" && exit 1 128 | fi 129 | sed -E -r "s/$RE_SEMVER/$UPDATED_VERSION/" "$FILE_TO_CHANGE" > "$TMPFILE" 130 | if [ $CHECK == 1 ]; 131 | then 132 | DIFF=$(diff "$FILE_TO_CHANGE" "$TMPFILE" ) 133 | if [ -z "$DIFF" ]; 134 | then 135 | printf "version sync would make no changes to %s.\n" "$FILE_TO_CHANGE" 136 | rm "$TMPFILE" 137 | else 138 | FAILED_CHECK=1 139 | printf "version sync would make the following changes to %s:\n%s\n" "$FILE_TO_CHANGE" "$DIFF" 140 | rm "$TMPFILE" 141 | fi 142 | else 143 | cp "$TMPFILE" "$FILE_TO_CHANGE" 144 | rm "$TMPFILE" 145 | fi 146 | fi 147 | done 148 | 149 | # Exit with code determined by whether changes were needed in a check. 
150 | if [ ${FAILED_CHECK} -ne 0 ]; then 151 | exit 1 152 | else 153 | exit 0 154 | fi 155 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | license_files = LICENSE.md 3 | 4 | [flake8] 5 | max-line-length = 100 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | setup.py 3 | 4 | unstructured_api_tools - Utilities to manage APIs from notebooks 5 | 6 | Copyright 2022 Unstructured Technologies, Inc. 7 | 8 | Licensed under the Apache License, Version 2.0 (the "License"); 9 | you may not use this file except in compliance with the License. 10 | You may obtain a copy of the License at 11 | 12 | http://www.apache.org/licenses/LICENSE-2.0 13 | 14 | Unless required by applicable law or agreed to in writing, software 15 | distributed under the License is distributed on an "AS IS" BASIS, 16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | See the License for the specific language governing permissions and 18 | limitations under the License. 19 | """ 20 | 21 | from setuptools import setup, find_packages 22 | 23 | from unstructured_api_tools.__version__ import __version__ 24 | 25 | setup( 26 | name="unstructured_api_tools", 27 | description="A library that prepares raw documents for downstream ML tasks.", 28 | long_description=open("README.md", "r", encoding="utf-8").read(), 29 | long_description_content_type="text/markdown", 30 | keywords="NLP PDF HTML CV XML parsing preprocessing", 31 | url="https://github.com/Unstructured-IO/unstructured-api-tools", 32 | python_requires=">=3.8.0", 33 | classifiers=[ 34 | "Development Status :: 4 - Beta", 35 | "Intended Audience :: Developers", 36 | "Intended Audience :: Education", 37 | "Intended Audience :: Science/Research", 38 | "License :: OSI Approved :: Apache Software License", 39 | "Operating System :: OS Independent", 40 | "Programming Language :: Python :: 3", 41 | "Programming Language :: Python :: 3.8", 42 | "Programming Language :: Python :: 3.9", 43 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 44 | ], 45 | author="Unstructured Technologies", 46 | author_email="mrobinson@unstructuredai.io", 47 | license="Apache-2.0", 48 | packages=find_packages(), 49 | include_package_data=True, 50 | version=__version__, 51 | entry_points={ 52 | "console_scripts": "unstructured_api_tools=unstructured_api_tools.cli:cli" 53 | }, 54 | install_requires=[ 55 | "click>=8.1", 56 | "fastapi", 57 | "Jinja2", 58 | "mypy>=0.99", 59 | "nbconvert", 60 | "python-multipart", 61 | "pandas", 62 | "types-requests", 63 | "types-ujson", 64 | "uvicorn[standard]", 65 | "autoflake" 66 | ], 67 | extras_require={}, 68 | ) 69 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/fixtures/example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/example.jpg -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/fixtures/example.jpg.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/example.jpg.gz -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/fixtures/fake-email.msg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/fake-email.msg -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/fixtures/fake.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/fake.docx -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/fixtures/fake.docx.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/fake.docx.gz -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/fixtures/markdown.md: -------------------------------------------------------------------------------- 1 | # Test markdown file 2 | 3 | This is the test markdown file. 100% code coverage is what I aim for. -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/fixtures/text_file.txt: -------------------------------------------------------------------------------- 1 | this is the test text file -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/fixtures/text_file.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/text_file.txt.gz -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/fixtures/text_file_2.txt: -------------------------------------------------------------------------------- 1 | this is another test text file -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/fixtures/text_file_2.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/text_file_2.txt.gz -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/functions_and_variables.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | FILE_DOCX = "fake.docx" 4 | FILE_IMAGE = "example.jpg" 5 | FILE_TXT_1 = "text_file.txt" 6 | FILE_TXT_2 = "text_file_2.txt" 7 | FILE_MARKDOWN = "markdown.md" 8 | FILE_MSG = "fake-email.msg" 9 | FILE_JSON = "spring-weather.html.json" 10 | 11 | GZIP_FILE_DOCX = "fake.docx.gz" 12 | GZIP_FILE_IMAGE = "example.jpg.gz" 13 | GZIP_FILE_TXT_1 = "text_file.txt.gz" 14 | 
GZIP_FILE_TXT_2 = "text_file_2.txt.gz" 15 | 16 | FILENAME_LENGTHS = { 17 | FILE_DOCX: 36602, 18 | GZIP_FILE_DOCX: 36602, 19 | FILE_IMAGE: 32764, 20 | GZIP_FILE_IMAGE: 32764, 21 | FILE_TXT_1: 26, 22 | GZIP_FILE_TXT_1: 26, 23 | FILE_TXT_2: 30, 24 | GZIP_FILE_TXT_2: 30, 25 | FILE_MARKDOWN: 91, 26 | FILE_MSG: 11776, 27 | FILE_JSON: 13151, 28 | } 29 | FILENAME_FORMATS = { 30 | FILE_DOCX: "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 31 | FILE_IMAGE: "image/jpeg", 32 | FILE_TXT_1: "text/plain", 33 | FILE_TXT_2: "text/plain", 34 | GZIP_FILE_DOCX: "application/gzip", 35 | GZIP_FILE_IMAGE: "application/gzip", 36 | GZIP_FILE_TXT_1: "application/gzip", 37 | GZIP_FILE_TXT_2: "application/gzip", 38 | FILE_MARKDOWN: "text/markdown", 39 | FILE_MSG: "message/rfc822", 40 | FILE_JSON: "application/json", 41 | "octet_stream": "application/octet-stream", 42 | } 43 | 44 | P_INPUT_1_SINGLE = {"input1": ["hi"]} 45 | P_INPUT_1_MULTI = {"input1": ["hi", "water is better than ice"]} 46 | P_INPUT_1_EMPTY = {"input1": []} 47 | P_INPUT_2_SINGLE = {"input2": ["hello"]} 48 | P_INPUT_2_MULTI = {"input2": ["hello", "earth is better than mars"]} 49 | P_INPUT_2_EMPTY = {"input2": []} 50 | P_INPUT_1_AND_2_MULTI = {"input2": ["hello", "earth is better than mars"], "input1": ["hi"]} 51 | 52 | JSON = "application/json" 53 | MIXED = "multipart/mixed" 54 | TEXT_CSV = "text/csv" 55 | INVALID = "invalid" 56 | 57 | RESPONSE_SCHEMA_ISD = {"output_schema": "isd"} 58 | RESPONSE_SCHEMA_LABELSTUDIO = {"output_schema": "labelstudio"} 59 | 60 | 61 | def convert_files_for_api(files, use_octet_stream_type=False): 62 | files_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures") 63 | return [ 64 | ( 65 | "files", 66 | ( 67 | test_file, 68 | open(os.path.join(files_path, test_file), "rb"), 69 | FILENAME_FORMATS["octet_stream" if use_octet_stream_type else test_file], 70 | ), 71 | ) 72 | for test_file in files 73 | ] 74 | 75 | 76 | def convert_text_files_for_api(files, use_octet_stream_type=False): 77 | files_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures") 78 | return [ 79 | ( 80 | "text_files", 81 | ( 82 | test_file, 83 | open(os.path.join(files_path, test_file), "rb"), 84 | FILENAME_FORMATS["octet_stream" if use_octet_stream_type else test_file], 85 | ), 86 | ) 87 | for test_file in files 88 | ] 89 | 90 | 91 | def generate_header_kwargs(value=None): 92 | return ( 93 | { 94 | "headers": { 95 | "Accept": value, 96 | } 97 | } 98 | if value 99 | else {} 100 | ) 101 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/test_docs.py: -------------------------------------------------------------------------------- 1 | from starlette.testclient import TestClient 2 | from prepline_test_project.api.app import app 3 | 4 | DOCS_ROUTE = "/test-project/docs" 5 | OPENAPI_ROUTE = "/test-project/openapi.json" 6 | HEALTHCHECK_ROUTE = "/healthcheck" 7 | 8 | client = TestClient(app) 9 | 10 | 11 | def test_openapi(): 12 | response = client.get(OPENAPI_ROUTE) 13 | assert response.status_code == 200 14 | 15 | 16 | def test_docs(): 17 | response = client.get(DOCS_ROUTE) 18 | assert response.status_code == 200 19 | 20 | 21 | def test_healthcheck(): 22 | response = client.get(HEALTHCHECK_ROUTE) 23 | assert response.status_code == 200 24 | assert response.json() == {"healthcheck": "HEALTHCHECK STATUS: EVERYTHING OK!"} 25 | -------------------------------------------------------------------------------- 
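The fixture helpers in functions_and_variables.py above return the ("files", (filename, fileobj, content_type)) tuples that the requests-style multipart API expects, so a test can feed them straight into the same TestClient pattern used in test_docs.py. A minimal sketch, assuming the module is importable under the test_unstructured_api_tools.api package and using the /test-project/v1/process-file-1 route generated later in this dump:

    from starlette.testclient import TestClient
    from prepline_test_project.api.app import app
    from test_unstructured_api_tools.api.functions_and_variables import (
        FILE_DOCX,
        P_INPUT_2_SINGLE,
        convert_files_for_api,
        generate_header_kwargs,
    )

    client = TestClient(app)
    # Upload one .docx fixture as multipart/form-data, send input2 as a form
    # field, and request a JSON response via the Accept header.
    response = client.post(
        "/test-project/v1/process-file-1",
        files=convert_files_for_api([FILE_DOCX]),
        data=P_INPUT_2_SINGLE,
        **generate_header_kwargs("application/json"),
    )
    assert response.status_code == 200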
/test_unstructured_api_tools/pipeline-test-project/README.md: -------------------------------------------------------------------------------- 1 | This directory is the base of a barebones preprocessing-pipeline project 2 | used for the generation of FastAPI apps, which are then used as test fixtures. 3 | 4 | It includes notebooks under pipeline-notebooks/ as is normally the case 5 | for pipeline projects. APIs are generated and checked by the Makefile 6 | in the root of the unstructured-api-tools repo. 7 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-file-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 |  "cells": [ 3 |   { 4 |    "cell_type": "markdown", 5 |    "id": "3931743a", 6 |    "metadata": {}, 7 |    "source": [ 8 |     "# File Processing Pipeline" 9 |    ] 10 |   }, 11 |   { 12 |    "cell_type": "code", 13 |    "execution_count": null, 14 |    "id": "d83dab2a", 15 |    "metadata": {}, 16 |    "outputs": [], 17 |    "source": [ 18 |     "import os" 19 |    ] 20 |   }, 21 |   { 22 |    "cell_type": "code", 23 |    "execution_count": null, 24 |    "id": "7cb5e00b", 25 |    "metadata": {}, 26 |    "outputs": [], 27 |    "source": [ 28 |     "# pipeline-api\n", 29 |     "\n", 30 |     "# test that a duplicate import gets handled correctly, as this gets imported via the template as well\n", 31 |     "import json\n", 32 |     "\n", 33 |     "# test accessing os in a #pipeline-api cell does not break things\n", 34 |     "_ = os.environ\n", 35 |     "\n", 36 |     "def pipeline_api(\n", 37 |     "    file,\n", 38 |     "    filename=None,\n", 39 |     "    file_content_type=None,\n", 40 |     "    m_input2=[],\n", 41 |     "):\n", 42 |     "    return {\"silly_result\": ' : '.join([str(len(file.read())),\n", 43 |     "                                        filename,\n", 44 |     "                                        file_content_type,\n", 45 |     "                                        str(m_input2)])}" 46 |    ] 47 |   }, 48 |   { 49 |    "cell_type": "code", 50 |    "execution_count": null, 51 |    "id": "65911889", 52 |    "metadata": {}, 53 |    "outputs": [ 54 |     { 55 |      "name": "stdout", 56 |      "output_type": "stream", 57 |      "text": [ 58 |       "{'silly_result': \"17 : temp-file.txt : text/plain : ['my', 'inputs']\"}\n" 59 |      ] 60 |     } 61 |    ], 62 |    "source": [ 63 |     "import tempfile\n", 64 |     "with tempfile.TemporaryFile() as fp:\n", 65 |     "    fp.write(b'This is some data')\n", 66 |     "    fp.seek(0)\n", 67 |     "    print(\n", 68 |     "        pipeline_api(\n", 69 |     "            fp,\n", 70 |     "            filename=\"temp-file.txt\",\n", 71 |     "            file_content_type=\"text/plain\",\n", 72 |     "            m_input2=[\"my\",\"inputs\"]\n", 73 |     "        )\n", 74 |     "    )" 75 |    ] 76 |   }, 77 |   { 78 |    "cell_type": "code", 79 |    "execution_count": null, 80 |    "id": "edce40fa", 81 |    "metadata": {}, 82 |    "outputs": [], 83 |    "source": [] 84 |   } 85 |  ], 86 |  "metadata": { 87 |   "kernelspec": { 88 |    "display_name": "python3", 89 |    "language": "python", 90 |    "name": "python3" 91 |   } 92 |  }, 93 |  "nbformat": 4, 94 |  "nbformat_minor": 5 95 | } 96 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-file-2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 |  "cells": [ 3 |   { 4 |    "cell_type": "markdown", 5 |    "metadata": {}, 6 |    "source": [ 7 |     "# File Processing Pipeline" 8 |    ] 9 |   }, 10 |   { 11 |    "cell_type": "code", 12 |    "execution_count": null, 13 |    "metadata": {}, 14 |    "outputs": [], 15 |    "source": [ 16 |     "# pipeline-api\n", 17 |     "def pipeline_api(\n", 18 |     "    file\n", 19 |     "):\n", 20 |     "    return {\"silly_result\": ' : 
'.join([str(len(file.read()))])}" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "{'silly_result': '17'}\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "import tempfile\n", 38 | "with tempfile.TemporaryFile() as fp:\n", 39 | " fp.write(b'This is some data')\n", 40 | " fp.seek(0)\n", 41 | " print(pipeline_api(fp))" 42 | ] 43 | } 44 | ], 45 | "metadata": { 46 | "kernelspec": { 47 | "display_name": "python3", 48 | "language": "python", 49 | "name": "python3" 50 | } 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 0 54 | } 55 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-file-3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# File Processing Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pipeline-api\n", 17 | "import pandas as pd\n", 18 | "def pipeline_api(\n", 19 | " file, response_type=\"text/csv\", response_schema=\"isd\"\n", 20 | "):\n", 21 | " data = pd.DataFrame(data={\"silly_result\": [str(len(file.read())), str(response_type), str(response_schema)]})\n", 22 | " if response_type == \"text/csv\":\n", 23 | " return data.to_csv()\n", 24 | " else:\n", 25 | " text = \" : \".join(list(data[\"silly_result\"]))\n", 26 | " return {\"silly_result\": text}" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | ",silly_result\n", 39 | "0,17\n", 40 | "1,text/csv\n", 41 | "2,isd\n", 42 | "\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "import tempfile\n", 48 | "with tempfile.TemporaryFile() as fp:\n", 49 | " fp.write(b'This is some data')\n", 50 | " fp.seek(0)\n", 51 | " print(pipeline_api(file=fp, response_type=\"text/csv\", response_schema=\"isd\"))" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [] 60 | } 61 | ], 62 | "metadata": { 63 | "kernelspec": { 64 | "display_name": "python3", 65 | "language": "python", 66 | "name": "python3" 67 | } 68 | }, 69 | "nbformat": 4, 70 | "nbformat_minor": 1 71 | } 72 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-file-4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# File Processing Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pipeline-api\n", 17 | "def pipeline_api(\n", 18 | " file,\n", 19 | " file_content_type=None,\n", 20 | " response_type=\"application/json\",\n", 21 | " response_schema=\"labelstudio\",\n", 22 | " m_input1=[]\n", 23 | "):\n", 24 | " return {\"silly_result\": ' : '.join([\n", 25 | " str(len(file.read())),\n", 26 | " str(file_content_type),\n", 27 | " str(response_type),\n", 28 | " str(response_schema),\n", 29 | " str(m_input1)\n", 30 | " ])}" 31 
| ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "{'silly_result': \"17 : None : application/json : isd : ['input1', 'input2']\"}\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "import tempfile\n", 48 | "with tempfile.TemporaryFile() as fp:\n", 49 | " fp.write(b'This is some data')\n", 50 | " fp.seek(0)\n", 51 | " print(\n", 52 | " pipeline_api(\n", 53 | " fp,\n", 54 | " None,\n", 55 | " \"application/json\",\n", 56 | " \"isd\",\n", 57 | " [\"input1\", \"input2\"]\n", 58 | " )\n", 59 | " )" 60 | ] 61 | } 62 | ], 63 | "metadata": { 64 | "kernelspec": { 65 | "display_name": "python3", 66 | "language": "python", 67 | "name": "python3" 68 | } 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 0 72 | } 73 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-file-5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# File Processing Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pipeline-api\n", 17 | "import pandas as pd\n", 18 | "\n", 19 | "def pipeline_api(\n", 20 | " file,\n", 21 | " file_content_type=None,\n", 22 | " response_type=\"application/json\",\n", 23 | " response_schema=\"labelstudio\",\n", 24 | " m_input1=[],\n", 25 | " m_input2=[],\n", 26 | "):\n", 27 | " data = pd.DataFrame(data={\"silly_result\": [\n", 28 | " str(len(file.read())),\n", 29 | " str(file_content_type),\n", 30 | " str(response_type),\n", 31 | " str(response_schema),\n", 32 | " str(m_input1),\n", 33 | " str(m_input2),\n", 34 | " ]})\n", 35 | " if response_type == \"text/csv\":\n", 36 | " return data.to_csv()\n", 37 | " else:\n", 38 | " text = \" : \".join(list(data[\"silly_result\"]))\n", 39 | " return {\"silly_result\": text}" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | "{'silly_result': \"17 : None : application/json : isd : ['input1', 'input2'] : ['m_input2']\"}\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "import tempfile\n", 57 | "with tempfile.TemporaryFile() as fp:\n", 58 | " fp.write(b'This is some data')\n", 59 | " fp.seek(0)\n", 60 | " print(\n", 61 | " pipeline_api(\n", 62 | " fp,\n", 63 | " None,\n", 64 | " \"application/json\",\n", 65 | " \"isd\",\n", 66 | " [\"input1\", \"input2\"],\n", 67 | " [\"m_input2\"]\n", 68 | " )\n", 69 | " )" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "python3", 83 | "language": "python", 84 | "name": "python3" 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 1 89 | } 90 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text Processing Pipeline" 8 | ] 
9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pipeline-api\n", 17 | "def pipeline_api(\n", 18 | " text,\n", 19 | "):\n", 20 | " return {\"silly_result\": ' : '.join([str(len(text)), text])}" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "{'silly_result': '9 : some text'}\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "print(pipeline_api(\"some text\"))" 38 | ] 39 | } 40 | ], 41 | "metadata": { 42 | "kernelspec": { 43 | "display_name": "python3", 44 | "language": "python", 45 | "name": "python3" 46 | } 47 | }, 48 | "nbformat": 4, 49 | "nbformat_minor": 1 50 | } 51 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text Processing Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pipeline-api\n", 17 | "def pipeline_api(\n", 18 | " text,\n", 19 | " m_input1=[],\n", 20 | " m_input2=[]\n", 21 | "):\n", 22 | " return {\"silly_result\": ' : '.join([\n", 23 | " str(len(text)),\n", 24 | " text,\n", 25 | " str(m_input1),\n", 26 | " str(m_input2)\n", 27 | " ])}" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "{'silly_result': \"9 : some text : ['first_input'] : ['last', 'input']\"}\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "print(pipeline_api(\"some text\", m_input1=[\"first_input\"], m_input2=[\"last\", \"input\"]))" 45 | ] 46 | } 47 | ], 48 | "metadata": { 49 | "kernelspec": { 50 | "display_name": "python3", 51 | "language": "python", 52 | "name": "python3" 53 | } 54 | }, 55 | "nbformat": 4, 56 | "nbformat_minor": 1 57 | } 58 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "bafce76f", 6 | "metadata": {}, 7 | "source": [ 8 | "# Text Processing Pipeline" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "2524a9a4", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# pipeline-api\n", 19 | "import pandas as pd\n", 20 | "def pipeline_api(\n", 21 | " text,\n", 22 | " response_type=\"text/csv\"\n", 23 | "):\n", 24 | " data = pd.DataFrame(data={\"silly_result\": [str(len(text)), text, str(response_type)]})\n", 25 | " if response_type == \"text/csv\":\n", 26 | " return data.to_csv()\n", 27 | " else:\n", 28 | " text = \" : \".join(list(data[\"silly_result\"]))\n", 29 | " return {\"silly_result\": text}" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "id": "6a876bdf", 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | ",silly_result\n", 43 | "0,9\n", 44 | "1,some text\n", 45 | 
"2,text/csv\n", 46 | "\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "print(pipeline_api(\"some text\", \"text/csv\"))" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "83f27184", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [] 61 | } 62 | ], 63 | "metadata": { 64 | "kernelspec": { 65 | "display_name": "python3", 66 | "language": "python", 67 | "name": "python3" 68 | } 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 5 72 | } 73 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text Processing Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pipeline-api\n", 17 | "import pandas as pd\n", 18 | "def pipeline_api(\n", 19 | " text,\n", 20 | " response_type=\"text/csv\",\n", 21 | " response_schema=\"isd\",\n", 22 | "):\n", 23 | " data = pd.DataFrame(data={\"silly_result\": [str(len(text)), text, str(response_type), str(response_schema)]})\n", 24 | " if response_type == \"text/csv\":\n", 25 | " return data.to_csv()\n", 26 | " else:\n", 27 | " text = \" : \".join(list(data[\"silly_result\"]))\n", 28 | " return {\"silly_result\": text}" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | ",silly_result\n", 41 | "0,9\n", 42 | "1,some text\n", 43 | "2,text/csv\n", 44 | "3,isd\n", 45 | "\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "print(pipeline_api(\"some text\", \"text/csv\", \"isd\"))" 51 | ] 52 | } 53 | ], 54 | "metadata": { 55 | "kernelspec": { 56 | "display_name": "python3", 57 | "language": "python", 58 | "name": "python3" 59 | } 60 | }, 61 | "nbformat": 4, 62 | "nbformat_minor": 1 63 | } 64 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-file-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text & File Processing Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pipeline-api\n", 17 | "def pipeline_api(\n", 18 | " text,\n", 19 | " file=None,\n", 20 | " filename=None,\n", 21 | " file_content_type=None,\n", 22 | "):\n", 23 | " return {\"silly_result\": ' : '.join([\n", 24 | " str(len(text if text else \"\")),\n", 25 | " str(text),\n", 26 | " str(len(file.read()) if file else None),\n", 27 | " str(filename),\n", 28 | " str(file_content_type),\n", 29 | " ])}" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "{'silly_result': '9 : some text : 17 : temp-file.txt : None'}\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "import tempfile\n", 47 | "with tempfile.TemporaryFile() as fp:\n", 48 | " fp.write(b'This is some data')\n", 49 | " fp.seek(0)\n", 50 | " print(pipeline_api(\n", 51 | " 
text=\"some text\",\n", 52 | " file=fp,\n", 53 | " file_content_type=None,\n", 54 | " filename=\"temp-file.txt\"\n", 55 | " ))" 56 | ] 57 | } 58 | ], 59 | "metadata": { 60 | "kernelspec": { 61 | "display_name": "python3", 62 | "language": "python", 63 | "name": "python3" 64 | } 65 | }, 66 | "nbformat": 4, 67 | "nbformat_minor": 1 68 | } 69 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-file-2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text & File Processing Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pipeline-api\n", 17 | "import pandas as pd\n", 18 | "def pipeline_api(\n", 19 | " text,\n", 20 | " file=None,\n", 21 | " filename=None,\n", 22 | " file_content_type=None,\n", 23 | " response_type=\"application/json\",\n", 24 | " m_input2=[]\n", 25 | "):\n", 26 | " data = pd.DataFrame(data={\"silly_result\": [\n", 27 | " str(len(text if text else \"\")),\n", 28 | " str(text),\n", 29 | " str(len(file.read()) if file else None),\n", 30 | " str(filename),\n", 31 | " str(file_content_type),\n", 32 | " str(response_type),\n", 33 | " str(m_input2)\n", 34 | " ]})\n", 35 | " if response_type == \"text/csv\":\n", 36 | " return data.to_csv()\n", 37 | " else:\n", 38 | " text = \" : \".join(list(data[\"silly_result\"]))\n", 39 | " return {\"silly_result\": text}" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | "{'silly_result': \"9 : some text : 17 : temp-file.txt : None : application/json : ['input1', 'input2']\"}\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "import tempfile\n", 57 | "with tempfile.TemporaryFile() as fp:\n", 58 | " fp.write(b'This is some data')\n", 59 | " fp.seek(0)\n", 60 | " print(pipeline_api(\n", 61 | " text=\"some text\",\n", 62 | " file=fp,\n", 63 | " file_content_type=None,\n", 64 | " filename=\"temp-file.txt\",\n", 65 | " response_type=\"application/json\",\n", 66 | " m_input2=[\"input1\", \"input2\"]\n", 67 | " ))" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [] 76 | } 77 | ], 78 | "metadata": { 79 | "kernelspec": { 80 | "display_name": "python3", 81 | "language": "python", 82 | "name": "python3" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 1 87 | } 88 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-file-3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text & File Processing Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pipeline-api\n", 17 | "import pandas as pd\n", 18 | "def pipeline_api(\n", 19 | " text,\n", 20 | " file=None,\n", 21 | " filename=None,\n", 22 | " file_content_type=None,\n", 23 | " response_type=\"application/json\",\n", 24 | " response_schema=\"isd\"\n", 25 | "):\n", 26 | " 
data = pd.DataFrame(data={\"silly_result\": [\n", 27 | " str(len(text if text else \"\")),\n", 28 | " str(text),\n", 29 | " str(len(file.read()) if file else None),\n", 30 | " str(filename),\n", 31 | " str(file_content_type),\n", 32 | " str(response_type),\n", 33 | " str(response_schema)\n", 34 | " ]})\n", 35 | " if response_type == \"text/csv\":\n", 36 | " return data.to_csv()\n", 37 | " else:\n", 38 | " text = \" : \".join(list(data[\"silly_result\"]))\n", 39 | " return {\"silly_result\": text}" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | "{'silly_result': '9 : some text : 17 : temp-file.txt : None : application/json : isd'}\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "import tempfile\n", 57 | "with tempfile.TemporaryFile() as fp:\n", 58 | " fp.write(b'This is some data')\n", 59 | " fp.seek(0)\n", 60 | " print(pipeline_api(\n", 61 | " text=\"some text\",\n", 62 | " file=fp,\n", 63 | " file_content_type=None,\n", 64 | " filename=\"temp-file.txt\",\n", 65 | " response_type=\"application/json\",\n", 66 | " response_schema=\"isd\"\n", 67 | " ))" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [] 76 | } 77 | ], 78 | "metadata": { 79 | "kernelspec": { 80 | "display_name": "python3", 81 | "language": "python", 82 | "name": "python3" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 1 87 | } 88 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-file-4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text & File Processing Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pipeline-api\n", 17 | "import pandas as pd\n", 18 | "def pipeline_api(\n", 19 | " text,\n", 20 | " file=None,\n", 21 | " filename=None,\n", 22 | " file_content_type=None,\n", 23 | " response_type=\"application/json\",\n", 24 | " response_schema=\"isd\",\n", 25 | " m_input1=[],\n", 26 | " m_input2=[]\n", 27 | "):\n", 28 | " data = pd.DataFrame(data={\"silly_result\": [\n", 29 | " str(len(text if text else \"\")),\n", 30 | " str(text),\n", 31 | " str(len(file.read()) if file else None),\n", 32 | " str(filename),\n", 33 | " str(file_content_type),\n", 34 | " str(response_type),\n", 35 | " str(response_schema),\n", 36 | " str(m_input1),\n", 37 | " str(m_input2),\n", 38 | " ]})\n", 39 | " if response_type == \"text/csv\":\n", 40 | " return data.to_csv()\n", 41 | " else:\n", 42 | " text = \" : \".join(list(data[\"silly_result\"]))\n", 43 | " return {\"silly_result\": text}" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "{'silly_result': \"9 : some text : 17 : temp-file.txt : None : application/json : isd : ['input1'] : ['input2', 'input3']\"}\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "import tempfile\n", 61 | "with tempfile.TemporaryFile() as fp:\n", 62 | " fp.write(b'This is some data')\n", 63 | " fp.seek(0)\n", 64 | " print(pipeline_api(\n", 65 | " text=\"some text\",\n", 66 | 
" file=fp,\n", 67 | " file_content_type=None,\n", 68 | " filename=\"temp-file.txt\",\n", 69 | " response_type=\"application/json\",\n", 70 | " response_schema=\"isd\",\n", 71 | " m_input1=[\"input1\"],\n", 72 | " m_input2=[\"input2\", \"input3\"]\n", 73 | " ))" 74 | ] 75 | } 76 | ], 77 | "metadata": { 78 | "kernelspec": { 79 | "display_name": "python3", 80 | "language": "python", 81 | "name": "python3" 82 | } 83 | }, 84 | "nbformat": 4, 85 | "nbformat_minor": 1 86 | } 87 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/__init__.py -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/app.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | 7 | from fastapi import FastAPI, Request, status 8 | import logging 9 | import os 10 | 11 | from .process_file_1 import router as process_file_1_router 12 | from .process_file_2 import router as process_file_2_router 13 | from .process_file_3 import router as process_file_3_router 14 | from .process_file_4 import router as process_file_4_router 15 | from .process_file_5 import router as process_file_5_router 16 | from .process_text_1 import router as process_text_1_router 17 | from .process_text_2 import router as process_text_2_router 18 | from .process_text_3 import router as process_text_3_router 19 | from .process_text_4 import router as process_text_4_router 20 | from .process_text_file_1 import router as process_text_file_1_router 21 | from .process_text_file_2 import router as process_text_file_2_router 22 | from .process_text_file_3 import router as process_text_file_3_router 23 | from .process_text_file_4 import router as process_text_file_4_router 24 | 25 | 26 | app = FastAPI( 27 | title="Unstructured Pipeline API", 28 | description="""""", 29 | version="1.0.0", 30 | docs_url="/test-project/docs", 31 | openapi_url="/test-project/openapi.json", 32 | ) 33 | 34 | allowed_origins = os.environ.get("ALLOWED_ORIGINS", None) 35 | if allowed_origins: 36 | from fastapi.middleware.cors import CORSMiddleware 37 | 38 | app.add_middleware( 39 | CORSMiddleware, 40 | allow_origins=allowed_origins.split(","), 41 | allow_methods=["OPTIONS", "POST"], 42 | allow_headers=["Content-Type"], 43 | ) 44 | 45 | app.include_router(process_file_1_router) 46 | app.include_router(process_file_2_router) 47 | app.include_router(process_file_3_router) 48 | app.include_router(process_file_4_router) 49 | app.include_router(process_file_5_router) 50 | app.include_router(process_text_1_router) 51 | app.include_router(process_text_2_router) 52 | app.include_router(process_text_3_router) 53 | app.include_router(process_text_4_router) 54 | app.include_router(process_text_file_1_router) 55 | app.include_router(process_text_file_2_router) 56 | app.include_router(process_text_file_3_router) 57 | 
app.include_router(process_text_file_4_router) 58 | 59 | 60 | # Filter out /healthcheck noise 61 | class HealthCheckFilter(logging.Filter): 62 |     def filter(self, record: logging.LogRecord) -> bool: 63 |         return record.getMessage().find("/healthcheck") == -1 64 | 65 | 66 | # Filter out /metrics noise 67 | class MetricsCheckFilter(logging.Filter): 68 |     def filter(self, record: logging.LogRecord) -> bool: 69 |         return record.getMessage().find("/metrics") == -1 70 | 71 | 72 | logging.getLogger("uvicorn.access").addFilter(HealthCheckFilter()) 73 | logging.getLogger("uvicorn.access").addFilter(MetricsCheckFilter()) 74 | 75 | 76 | @app.get("/healthcheck", status_code=status.HTTP_200_OK, include_in_schema=False) 77 | def healthcheck(request: Request): 78 |     return {"healthcheck": "HEALTHCHECK STATUS: EVERYTHING OK!"} 79 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_1.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | import json 13 | from fastapi.responses import StreamingResponse 14 | from starlette.datastructures import Headers 15 | from starlette.types import Send 16 | from base64 import b64encode 17 | from typing import Optional, Mapping 18 | import secrets 19 | 20 | 21 | app = FastAPI() 22 | router = APIRouter() 23 | 24 | 25 | # pipeline-api 26 | 27 | # test that a duplicate import gets handled correctly, as this gets imported via the template as well 28 | 29 | # test accessing os in a #pipeline-api cell does not break things 30 | _ = os.environ 31 | 32 | 33 | def pipeline_api( 34 |     file, 35 |     filename=None, 36 |     file_content_type=None, 37 |     m_input2=[], 38 | ): 39 |     return { 40 |         "silly_result": " : ".join( 41 |             [str(len(file.read())), filename, file_content_type, str(m_input2)] 42 |         ) 43 |     } 44 | 45 | 46 | def get_validated_mimetype(file): 47 |     """ 48 |     Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 49 |     generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 50 |     return HTTP 400 for an invalid type. 51 |     """ 52 |     content_type = file.content_type 53 |     if not content_type or content_type == "application/octet-stream": 54 |         content_type = mimetypes.guess_type(str(file.filename))[0] 55 | 56 |         # Some filetypes missing for this library, just hardcode them for now 57 |         if not content_type: 58 |             if file.filename.endswith(".md"): 59 |                 content_type = "text/markdown" 60 |             elif file.filename.endswith(".msg"): 61 |                 content_type = "message/rfc822" 62 | 63 |     allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 64 |     if allowed_mimetypes_str is not None: 65 |         allowed_mimetypes = allowed_mimetypes_str.split(",") 66 | 67 |         if content_type not in allowed_mimetypes: 68 |             raise HTTPException( 69 |                 status_code=400, 70 |                 detail=( 71 |                     f"Unable to process {file.filename}: " 72 |                     f"File type {content_type} is not supported."
73 | ), 74 | ) 75 | 76 | return content_type 77 | 78 | 79 | class MultipartMixedResponse(StreamingResponse): 80 | CRLF = b"\r\n" 81 | 82 | def __init__(self, *args, content_type: str = None, **kwargs): 83 | super().__init__(*args, **kwargs) 84 | self.content_type = content_type 85 | 86 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 87 | super().init_headers(headers) 88 | self.boundary_value = secrets.token_hex(16) 89 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 90 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 91 | 92 | @property 93 | def boundary(self): 94 | return b"--" + self.boundary_value.encode() 95 | 96 | def _build_part_headers(self, headers: dict) -> bytes: 97 | header_bytes = b"" 98 | for header, value in headers.items(): 99 | header_bytes += f"{header}: {value}".encode() + self.CRLF 100 | return header_bytes 101 | 102 | def build_part(self, chunk: bytes) -> bytes: 103 | part = self.boundary + self.CRLF 104 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 105 | if self.content_type is not None: 106 | part_headers["Content-Type"] = self.content_type 107 | part += self._build_part_headers(part_headers) 108 | part += self.CRLF + chunk + self.CRLF 109 | return part 110 | 111 | async def stream_response(self, send: Send) -> None: 112 | await send( 113 | { 114 | "type": "http.response.start", 115 | "status": self.status_code, 116 | "headers": self.raw_headers, 117 | } 118 | ) 119 | async for chunk in self.body_iterator: 120 | if not isinstance(chunk, bytes): 121 | chunk = chunk.encode(self.charset) 122 | chunk = b64encode(chunk) 123 | await send( 124 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 125 | ) 126 | 127 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 128 | 129 | 130 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 131 | def return_content_type(filename): 132 | if gz_uncompressed_content_type: 133 | return gz_uncompressed_content_type 134 | else: 135 | return str(mimetypes.guess_type(filename)[0]) 136 | 137 | filename = str(file.filename) if file.filename else "" 138 | if filename.endswith(".gz"): 139 | filename = filename[:-3] 140 | 141 | gzip_file = gzip.open(file.file).read() 142 | return UploadFile( 143 | file=io.BytesIO(gzip_file), 144 | size=len(gzip_file), 145 | filename=filename, 146 | headers=Headers({"content-type": return_content_type(filename)}), 147 | ) 148 | 149 | 150 | @router.post("/test-project/v1/process-file-1") 151 | @router.post("/test-project/v1.2.3/process-file-1") 152 | def pipeline_1( 153 | request: Request, 154 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 155 | files: Union[List[UploadFile], None] = File(default=None), 156 | input2: List[str] = Form(default=[]), 157 | ): 158 | if files: 159 | for file_index in range(len(files)): 160 | if files[file_index].content_type == "application/gzip": 161 | files[file_index] = ungz_file(files[file_index], gz_uncompressed_content_type) 162 | 163 | content_type = request.headers.get("Accept") 164 | 165 | if isinstance(files, list) and len(files): 166 | if len(files) > 1: 167 | if content_type and content_type not in [ 168 | "*/*", 169 | "multipart/mixed", 170 | "application/json", 171 | "text/csv", 172 | ]: 173 | raise HTTPException( 174 | detail=( 175 | f"Conflict in media type {content_type}" 176 | ' with response type "multipart/mixed".\n' 177 | ), 178 | 
status_code=status.HTTP_406_NOT_ACCEPTABLE, 179 | ) 180 | 181 | def response_generator(is_multipart): 182 | for file in files: 183 | file_content_type = get_validated_mimetype(file) 184 | 185 | _file = file.file 186 | 187 | response = pipeline_api( 188 | _file, 189 | m_input2=input2, 190 | filename=file.filename, 191 | file_content_type=file_content_type, 192 | ) 193 | 194 | if is_multipart: 195 | if type(response) not in [str, bytes]: 196 | response = json.dumps(response) 197 | yield response 198 | 199 | if content_type == "multipart/mixed": 200 | return MultipartMixedResponse( 201 | response_generator(is_multipart=True), 202 | ) 203 | else: 204 | return ( 205 | list(response_generator(is_multipart=False))[0] 206 | if len(files) == 1 207 | else response_generator(is_multipart=False) 208 | ) 209 | else: 210 | raise HTTPException( 211 | detail='Request parameter "files" is required.\n', 212 | status_code=status.HTTP_400_BAD_REQUEST, 213 | ) 214 | 215 | 216 | app.include_router(router) 217 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_2.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | import json 13 | from fastapi.responses import StreamingResponse 14 | from starlette.datastructures import Headers 15 | from starlette.types import Send 16 | from base64 import b64encode 17 | from typing import Optional, Mapping 18 | import secrets 19 | 20 | 21 | app = FastAPI() 22 | router = APIRouter() 23 | 24 | 25 | # pipeline-api 26 | def pipeline_api(file): 27 | return {"silly_result": " : ".join([str(len(file.read()))])} 28 | 29 | 30 | def get_validated_mimetype(file): 31 | """ 32 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 33 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 34 | return HTTP 400 for an invalid type. 35 | """ 36 | content_type = file.content_type 37 | if not content_type or content_type == "application/octet-stream": 38 | content_type = mimetypes.guess_type(str(file.filename))[0] 39 | 40 | # Some filetypes missing for this library, just hardcode them for now 41 | if not content_type: 42 | if file.filename.endswith(".md"): 43 | content_type = "text/markdown" 44 | elif file.filename.endswith(".msg"): 45 | content_type = "message/rfc822" 46 | 47 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 48 | if allowed_mimetypes_str is not None: 49 | allowed_mimetypes = allowed_mimetypes_str.split(",") 50 | 51 | if content_type not in allowed_mimetypes: 52 | raise HTTPException( 53 | status_code=400, 54 | detail=( 55 | f"Unable to process {file.filename}: " 56 | f"File type {content_type} is not supported." 
57 | ), 58 | ) 59 | 60 | return content_type 61 | 62 | 63 | class MultipartMixedResponse(StreamingResponse): 64 | CRLF = b"\r\n" 65 | 66 | def __init__(self, *args, content_type: str = None, **kwargs): 67 | super().__init__(*args, **kwargs) 68 | self.content_type = content_type 69 | 70 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 71 | super().init_headers(headers) 72 | self.boundary_value = secrets.token_hex(16) 73 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 74 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 75 | 76 | @property 77 | def boundary(self): 78 | return b"--" + self.boundary_value.encode() 79 | 80 | def _build_part_headers(self, headers: dict) -> bytes: 81 | header_bytes = b"" 82 | for header, value in headers.items(): 83 | header_bytes += f"{header}: {value}".encode() + self.CRLF 84 | return header_bytes 85 | 86 | def build_part(self, chunk: bytes) -> bytes: 87 | part = self.boundary + self.CRLF 88 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 89 | if self.content_type is not None: 90 | part_headers["Content-Type"] = self.content_type 91 | part += self._build_part_headers(part_headers) 92 | part += self.CRLF + chunk + self.CRLF 93 | return part 94 | 95 | async def stream_response(self, send: Send) -> None: 96 | await send( 97 | { 98 | "type": "http.response.start", 99 | "status": self.status_code, 100 | "headers": self.raw_headers, 101 | } 102 | ) 103 | async for chunk in self.body_iterator: 104 | if not isinstance(chunk, bytes): 105 | chunk = chunk.encode(self.charset) 106 | chunk = b64encode(chunk) 107 | await send( 108 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 109 | ) 110 | 111 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 112 | 113 | 114 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 115 | def return_content_type(filename): 116 | if gz_uncompressed_content_type: 117 | return gz_uncompressed_content_type 118 | else: 119 | return str(mimetypes.guess_type(filename)[0]) 120 | 121 | filename = str(file.filename) if file.filename else "" 122 | if filename.endswith(".gz"): 123 | filename = filename[:-3] 124 | 125 | gzip_file = gzip.open(file.file).read() 126 | return UploadFile( 127 | file=io.BytesIO(gzip_file), 128 | size=len(gzip_file), 129 | filename=filename, 130 | headers=Headers({"content-type": return_content_type(filename)}), 131 | ) 132 | 133 | 134 | @router.post("/test-project/v1/process-file-2") 135 | @router.post("/test-project/v1.2.3/process-file-2") 136 | def pipeline_1( 137 | request: Request, 138 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 139 | files: Union[List[UploadFile], None] = File(default=None), 140 | ): 141 | if files: 142 | for file_index in range(len(files)): 143 | if files[file_index].content_type == "application/gzip": 144 | files[file_index] = ungz_file(files[file_index], gz_uncompressed_content_type) 145 | 146 | content_type = request.headers.get("Accept") 147 | 148 | if isinstance(files, list) and len(files): 149 | if len(files) > 1: 150 | if content_type and content_type not in [ 151 | "*/*", 152 | "multipart/mixed", 153 | "application/json", 154 | "text/csv", 155 | ]: 156 | raise HTTPException( 157 | detail=( 158 | f"Conflict in media type {content_type}" 159 | ' with response type "multipart/mixed".\n' 160 | ), 161 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 162 | ) 163 | 164 | def 
response_generator(is_multipart): 165 | for file in files: 166 | get_validated_mimetype(file) 167 | 168 | _file = file.file 169 | 170 | response = pipeline_api( 171 | _file, 172 | ) 173 | 174 | if is_multipart: 175 | if type(response) not in [str, bytes]: 176 | response = json.dumps(response) 177 | yield response 178 | 179 | if content_type == "multipart/mixed": 180 | return MultipartMixedResponse( 181 | response_generator(is_multipart=True), 182 | ) 183 | else: 184 | return ( 185 | list(response_generator(is_multipart=False))[0] 186 | if len(files) == 1 187 | else response_generator(is_multipart=False) 188 | ) 189 | else: 190 | raise HTTPException( 191 | detail='Request parameter "files" is required.\n', 192 | status_code=status.HTTP_400_BAD_REQUEST, 193 | ) 194 | 195 | 196 | app.include_router(router) 197 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_3.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | from fastapi.responses import PlainTextResponse 13 | import json 14 | from fastapi.responses import StreamingResponse 15 | from starlette.datastructures import Headers 16 | from starlette.types import Send 17 | from base64 import b64encode 18 | from typing import Optional, Mapping 19 | import secrets 20 | import pandas as pd 21 | 22 | 23 | app = FastAPI() 24 | router = APIRouter() 25 | 26 | 27 | def is_expected_response_type(media_type, response_type): 28 | if media_type == "application/json" and response_type not in [dict, list]: 29 | return True 30 | elif media_type == "text/csv" and response_type != str: 31 | return True 32 | else: 33 | return False 34 | 35 | 36 | # pipeline-api 37 | def pipeline_api(file, response_type="text/csv", response_schema="isd"): 38 | data = pd.DataFrame( 39 | data={"silly_result": [str(len(file.read())), str(response_type), str(response_schema)]} 40 | ) 41 | if response_type == "text/csv": 42 | return data.to_csv() 43 | else: 44 | text = " : ".join(list(data["silly_result"])) 45 | return {"silly_result": text} 46 | 47 | 48 | def get_validated_mimetype(file): 49 | """ 50 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 51 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 52 | return HTTP 400 for an invalid type. 
53 | """ 54 | content_type = file.content_type 55 | if not content_type or content_type == "application/octet-stream": 56 | content_type = mimetypes.guess_type(str(file.filename))[0] 57 | 58 | # Some filetypes missing for this library, just hardcode them for now 59 | if not content_type: 60 | if file.filename.endswith(".md"): 61 | content_type = "text/markdown" 62 | elif file.filename.endswith(".msg"): 63 | content_type = "message/rfc822" 64 | 65 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 66 | if allowed_mimetypes_str is not None: 67 | allowed_mimetypes = allowed_mimetypes_str.split(",") 68 | 69 | if content_type not in allowed_mimetypes: 70 | raise HTTPException( 71 | status_code=400, 72 | detail=( 73 | f"Unable to process {file.filename}: " 74 | f"File type {content_type} is not supported." 75 | ), 76 | ) 77 | 78 | return content_type 79 | 80 | 81 | class MultipartMixedResponse(StreamingResponse): 82 | CRLF = b"\r\n" 83 | 84 | def __init__(self, *args, content_type: str = None, **kwargs): 85 | super().__init__(*args, **kwargs) 86 | self.content_type = content_type 87 | 88 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 89 | super().init_headers(headers) 90 | self.boundary_value = secrets.token_hex(16) 91 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 92 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 93 | 94 | @property 95 | def boundary(self): 96 | return b"--" + self.boundary_value.encode() 97 | 98 | def _build_part_headers(self, headers: dict) -> bytes: 99 | header_bytes = b"" 100 | for header, value in headers.items(): 101 | header_bytes += f"{header}: {value}".encode() + self.CRLF 102 | return header_bytes 103 | 104 | def build_part(self, chunk: bytes) -> bytes: 105 | part = self.boundary + self.CRLF 106 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 107 | if self.content_type is not None: 108 | part_headers["Content-Type"] = self.content_type 109 | part += self._build_part_headers(part_headers) 110 | part += self.CRLF + chunk + self.CRLF 111 | return part 112 | 113 | async def stream_response(self, send: Send) -> None: 114 | await send( 115 | { 116 | "type": "http.response.start", 117 | "status": self.status_code, 118 | "headers": self.raw_headers, 119 | } 120 | ) 121 | async for chunk in self.body_iterator: 122 | if not isinstance(chunk, bytes): 123 | chunk = chunk.encode(self.charset) 124 | chunk = b64encode(chunk) 125 | await send( 126 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 127 | ) 128 | 129 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 130 | 131 | 132 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 133 | def return_content_type(filename): 134 | if gz_uncompressed_content_type: 135 | return gz_uncompressed_content_type 136 | else: 137 | return str(mimetypes.guess_type(filename)[0]) 138 | 139 | filename = str(file.filename) if file.filename else "" 140 | if filename.endswith(".gz"): 141 | filename = filename[:-3] 142 | 143 | gzip_file = gzip.open(file.file).read() 144 | return UploadFile( 145 | file=io.BytesIO(gzip_file), 146 | size=len(gzip_file), 147 | filename=filename, 148 | headers=Headers({"content-type": return_content_type(filename)}), 149 | ) 150 | 151 | 152 | @router.post("/test-project/v1/process-file-3") 153 | @router.post("/test-project/v1.2.3/process-file-3") 154 | def pipeline_1( 155 | request: 
Request, 156 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 157 | files: Union[List[UploadFile], None] = File(default=None), 158 | output_format: Union[str, None] = Form(default=None), 159 | output_schema: str = Form(default=None), 160 | ): 161 | if files: 162 | for file_index in range(len(files)): 163 | if files[file_index].content_type == "application/gzip": 164 | files[file_index] = ungz_file(files[file_index], gz_uncompressed_content_type) 165 | 166 | content_type = request.headers.get("Accept") 167 | 168 | default_response_type = output_format or "text/csv" 169 | if not content_type or content_type == "*/*" or content_type == "multipart/mixed": 170 | media_type = default_response_type 171 | else: 172 | media_type = content_type 173 | 174 | default_response_schema = output_schema or "isd" 175 | 176 | if isinstance(files, list) and len(files): 177 | if len(files) > 1: 178 | if content_type and content_type not in [ 179 | "*/*", 180 | "multipart/mixed", 181 | "application/json", 182 | "text/csv", 183 | ]: 184 | raise HTTPException( 185 | detail=( 186 | f"Conflict in media type {content_type}" 187 | ' with response type "multipart/mixed".\n' 188 | ), 189 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 190 | ) 191 | 192 | def response_generator(is_multipart): 193 | for file in files: 194 | get_validated_mimetype(file) 195 | 196 | _file = file.file 197 | 198 | response = pipeline_api( 199 | _file, 200 | response_type=media_type, 201 | response_schema=default_response_schema, 202 | ) 203 | 204 | if is_expected_response_type(media_type, type(response)): 205 | raise HTTPException( 206 | detail=( 207 | f"Conflict in media type {media_type}" 208 | f" with response type {type(response)}.\n" 209 | ), 210 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 211 | ) 212 | 213 | valid_response_types = ["application/json", "text/csv", "*/*", "multipart/mixed"] 214 | if media_type in valid_response_types: 215 | if is_multipart: 216 | if type(response) not in [str, bytes]: 217 | response = json.dumps(response) 218 | elif media_type == "text/csv": 219 | response = PlainTextResponse(response) 220 | yield response 221 | else: 222 | raise HTTPException( 223 | detail=f"Unsupported media type {media_type}.\n", 224 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 225 | ) 226 | 227 | def join_responses(responses): 228 | if media_type != "text/csv": 229 | return responses 230 | data = pd.read_csv(io.BytesIO(responses[0].body)) 231 | if len(responses) > 1: 232 | for resp in responses[1:]: 233 | resp_data = pd.read_csv(io.BytesIO(resp.body)) 234 | data = data.merge(resp_data, how="outer") 235 | return PlainTextResponse(data.to_csv()) 236 | 237 | if content_type == "multipart/mixed": 238 | return MultipartMixedResponse( 239 | response_generator(is_multipart=True), content_type=media_type 240 | ) 241 | else: 242 | return ( 243 | list(response_generator(is_multipart=False))[0] 244 | if len(files) == 1 245 | else join_responses(list(response_generator(is_multipart=False))) 246 | ) 247 | else: 248 | raise HTTPException( 249 | detail='Request parameter "files" is required.\n', 250 | status_code=status.HTTP_400_BAD_REQUEST, 251 | ) 252 | 253 | 254 | app.include_router(router) 255 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_4.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | 
# THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | from fastapi.responses import PlainTextResponse 13 | import json 14 | from fastapi.responses import StreamingResponse 15 | from starlette.datastructures import Headers 16 | from starlette.types import Send 17 | from base64 import b64encode 18 | from typing import Optional, Mapping 19 | import secrets 20 | import pandas as pd 21 | 22 | 23 | app = FastAPI() 24 | router = APIRouter() 25 | 26 | 27 | def is_expected_response_type(media_type, response_type): 28 | if media_type == "application/json" and response_type not in [dict, list]: 29 | return True 30 | elif media_type == "text/csv" and response_type != str: 31 | return True 32 | else: 33 | return False 34 | 35 | 36 | # pipeline-api 37 | def pipeline_api( 38 | file, 39 | file_content_type=None, 40 | response_type="application/json", 41 | response_schema="labelstudio", 42 | m_input1=[], 43 | ): 44 | return { 45 | "silly_result": " : ".join( 46 | [ 47 | str(len(file.read())), 48 | str(file_content_type), 49 | str(response_type), 50 | str(response_schema), 51 | str(m_input1), 52 | ] 53 | ) 54 | } 55 | 56 | 57 | def get_validated_mimetype(file): 58 | """ 59 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 60 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 61 | return HTTP 400 for an invalid type. 62 | """ 63 | content_type = file.content_type 64 | if not content_type or content_type == "application/octet-stream": 65 | content_type = mimetypes.guess_type(str(file.filename))[0] 66 | 67 | # Some filetypes missing for this library, just hardcode them for now 68 | if not content_type: 69 | if file.filename.endswith(".md"): 70 | content_type = "text/markdown" 71 | elif file.filename.endswith(".msg"): 72 | content_type = "message/rfc822" 73 | 74 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 75 | if allowed_mimetypes_str is not None: 76 | allowed_mimetypes = allowed_mimetypes_str.split(",") 77 | 78 | if content_type not in allowed_mimetypes: 79 | raise HTTPException( 80 | status_code=400, 81 | detail=( 82 | f"Unable to process {file.filename}: " 83 | f"File type {content_type} is not supported." 
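# NOTE: hedged illustration, not part of the generated template -- with,
# say, UNSTRUCTURED_ALLOWED_MIMETYPES="application/pdf,text/plain" (a
# made-up value) exported in the environment, a .docx upload would fall
# through to this branch and the API would answer with HTTP 400.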
84 | ), 85 | ) 86 | 87 | return content_type 88 | 89 | 90 | class MultipartMixedResponse(StreamingResponse): 91 | CRLF = b"\r\n" 92 | 93 | def __init__(self, *args, content_type: str = None, **kwargs): 94 | super().__init__(*args, **kwargs) 95 | self.content_type = content_type 96 | 97 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 98 | super().init_headers(headers) 99 | self.boundary_value = secrets.token_hex(16) 100 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 101 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 102 | 103 | @property 104 | def boundary(self): 105 | return b"--" + self.boundary_value.encode() 106 | 107 | def _build_part_headers(self, headers: dict) -> bytes: 108 | header_bytes = b"" 109 | for header, value in headers.items(): 110 | header_bytes += f"{header}: {value}".encode() + self.CRLF 111 | return header_bytes 112 | 113 | def build_part(self, chunk: bytes) -> bytes: 114 | part = self.boundary + self.CRLF 115 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 116 | if self.content_type is not None: 117 | part_headers["Content-Type"] = self.content_type 118 | part += self._build_part_headers(part_headers) 119 | part += self.CRLF + chunk + self.CRLF 120 | return part 121 | 122 | async def stream_response(self, send: Send) -> None: 123 | await send( 124 | { 125 | "type": "http.response.start", 126 | "status": self.status_code, 127 | "headers": self.raw_headers, 128 | } 129 | ) 130 | async for chunk in self.body_iterator: 131 | if not isinstance(chunk, bytes): 132 | chunk = chunk.encode(self.charset) 133 | chunk = b64encode(chunk) 134 | await send( 135 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 136 | ) 137 | 138 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 139 | 140 | 141 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 142 | def return_content_type(filename): 143 | if gz_uncompressed_content_type: 144 | return gz_uncompressed_content_type 145 | else: 146 | return str(mimetypes.guess_type(filename)[0]) 147 | 148 | filename = str(file.filename) if file.filename else "" 149 | if filename.endswith(".gz"): 150 | filename = filename[:-3] 151 | 152 | gzip_file = gzip.open(file.file).read() 153 | return UploadFile( 154 | file=io.BytesIO(gzip_file), 155 | size=len(gzip_file), 156 | filename=filename, 157 | headers=Headers({"content-type": return_content_type(filename)}), 158 | ) 159 | 160 | 161 | @router.post("/test-project/v1/process-file-4") 162 | @router.post("/test-project/v1.2.3/process-file-4") 163 | def pipeline_1( 164 | request: Request, 165 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 166 | files: Union[List[UploadFile], None] = File(default=None), 167 | output_format: Union[str, None] = Form(default=None), 168 | output_schema: str = Form(default=None), 169 | input1: List[str] = Form(default=[]), 170 | ): 171 | if files: 172 | for file_index in range(len(files)): 173 | if files[file_index].content_type == "application/gzip": 174 | files[file_index] = ungz_file(files[file_index], gz_uncompressed_content_type) 175 | 176 | content_type = request.headers.get("Accept") 177 | 178 | default_response_type = output_format or "application/json" 179 | if not content_type or content_type == "*/*" or content_type == "multipart/mixed": 180 | media_type = default_response_type 181 | else: 182 | media_type = content_type 183 | 184 | 
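# NOTE: a hedged usage sketch -- host, port, and filename below are
# hypothetical, and repeated -F input1 fields accumulate into the
# List[str] form parameter:
#   curl -H "Accept: text/csv" \
#        -F files=@example.pdf -F input1=foo -F input1=bar \
#        http://localhost:8000/test-project/v1/process-file-4
# An explicit Accept header wins; "Accept: */*" (curl's default) falls
# back to the output_format form field, then to "application/json".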
default_response_schema = output_schema or "labelstudio" 185 | 186 | if isinstance(files, list) and len(files): 187 | if len(files) > 1: 188 | if content_type and content_type not in [ 189 | "*/*", 190 | "multipart/mixed", 191 | "application/json", 192 | "text/csv", 193 | ]: 194 | raise HTTPException( 195 | detail=( 196 | f"Conflict in media type {content_type}" 197 | ' with response type "multipart/mixed".\n' 198 | ), 199 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 200 | ) 201 | 202 | def response_generator(is_multipart): 203 | for file in files: 204 | file_content_type = get_validated_mimetype(file) 205 | 206 | _file = file.file 207 | 208 | response = pipeline_api( 209 | _file, 210 | m_input1=input1, 211 | response_type=media_type, 212 | response_schema=default_response_schema, 213 | file_content_type=file_content_type, 214 | ) 215 | 216 | if is_expected_response_type(media_type, type(response)): 217 | raise HTTPException( 218 | detail=( 219 | f"Conflict in media type {media_type}" 220 | f" with response type {type(response)}.\n" 221 | ), 222 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 223 | ) 224 | 225 | valid_response_types = ["application/json", "text/csv", "*/*", "multipart/mixed"] 226 | if media_type in valid_response_types: 227 | if is_multipart: 228 | if type(response) not in [str, bytes]: 229 | response = json.dumps(response) 230 | elif media_type == "text/csv": 231 | response = PlainTextResponse(response) 232 | yield response 233 | else: 234 | raise HTTPException( 235 | detail=f"Unsupported media type {media_type}.\n", 236 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 237 | ) 238 | 239 | def join_responses(responses): 240 | if media_type != "text/csv": 241 | return responses 242 | data = pd.read_csv(io.BytesIO(responses[0].body)) 243 | if len(responses) > 1: 244 | for resp in responses[1:]: 245 | resp_data = pd.read_csv(io.BytesIO(resp.body)) 246 | data = data.merge(resp_data, how="outer") 247 | return PlainTextResponse(data.to_csv()) 248 | 249 | if content_type == "multipart/mixed": 250 | return MultipartMixedResponse( 251 | response_generator(is_multipart=True), content_type=media_type 252 | ) 253 | else: 254 | return ( 255 | list(response_generator(is_multipart=False))[0] 256 | if len(files) == 1 257 | else join_responses(list(response_generator(is_multipart=False))) 258 | ) 259 | else: 260 | raise HTTPException( 261 | detail='Request parameter "files" is required.\n', 262 | status_code=status.HTTP_400_BAD_REQUEST, 263 | ) 264 | 265 | 266 | app.include_router(router) 267 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_5.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 
3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | from fastapi.responses import PlainTextResponse 13 | import json 14 | from fastapi.responses import StreamingResponse 15 | from starlette.datastructures import Headers 16 | from starlette.types import Send 17 | from base64 import b64encode 18 | from typing import Optional, Mapping 19 | import secrets 20 | import pandas as pd 21 | 22 | 23 | app = FastAPI() 24 | router = APIRouter() 25 | 26 | 27 | def is_expected_response_type(media_type, response_type): 28 | if media_type == "application/json" and response_type not in [dict, list]: 29 | return True 30 | elif media_type == "text/csv" and response_type != str: 31 | return True 32 | else: 33 | return False 34 | 35 | 36 | # pipeline-api 37 | 38 | 39 | def pipeline_api( 40 | file, 41 | file_content_type=None, 42 | response_type="application/json", 43 | response_schema="labelstudio", 44 | m_input1=[], 45 | m_input2=[], 46 | ): 47 | data = pd.DataFrame( 48 | data={ 49 | "silly_result": [ 50 | str(len(file.read())), 51 | str(file_content_type), 52 | str(response_type), 53 | str(response_schema), 54 | str(m_input1), 55 | str(m_input2), 56 | ] 57 | } 58 | ) 59 | if response_type == "text/csv": 60 | return data.to_csv() 61 | else: 62 | text = " : ".join(list(data["silly_result"])) 63 | return {"silly_result": text} 64 | 65 | 66 | def get_validated_mimetype(file): 67 | """ 68 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 69 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 70 | return HTTP 400 for an invalid type. 71 | """ 72 | content_type = file.content_type 73 | if not content_type or content_type == "application/octet-stream": 74 | content_type = mimetypes.guess_type(str(file.filename))[0] 75 | 76 | # Some filetypes missing for this library, just hardcode them for now 77 | if not content_type: 78 | if file.filename.endswith(".md"): 79 | content_type = "text/markdown" 80 | elif file.filename.endswith(".msg"): 81 | content_type = "message/rfc822" 82 | 83 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 84 | if allowed_mimetypes_str is not None: 85 | allowed_mimetypes = allowed_mimetypes_str.split(",") 86 | 87 | if content_type not in allowed_mimetypes: 88 | raise HTTPException( 89 | status_code=400, 90 | detail=( 91 | f"Unable to process {file.filename}: " 92 | f"File type {content_type} is not supported." 
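# NOTE: unlike the text pipelines, the file pipelines keep the validated
# type -- the route handler below forwards it to pipeline_api() as
# file_content_type.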
93 | ), 94 | ) 95 | 96 | return content_type 97 | 98 | 99 | class MultipartMixedResponse(StreamingResponse): 100 | CRLF = b"\r\n" 101 | 102 | def __init__(self, *args, content_type: str = None, **kwargs): 103 | super().__init__(*args, **kwargs) 104 | self.content_type = content_type 105 | 106 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 107 | super().init_headers(headers) 108 | self.boundary_value = secrets.token_hex(16) 109 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 110 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 111 | 112 | @property 113 | def boundary(self): 114 | return b"--" + self.boundary_value.encode() 115 | 116 | def _build_part_headers(self, headers: dict) -> bytes: 117 | header_bytes = b"" 118 | for header, value in headers.items(): 119 | header_bytes += f"{header}: {value}".encode() + self.CRLF 120 | return header_bytes 121 | 122 | def build_part(self, chunk: bytes) -> bytes: 123 | part = self.boundary + self.CRLF 124 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 125 | if self.content_type is not None: 126 | part_headers["Content-Type"] = self.content_type 127 | part += self._build_part_headers(part_headers) 128 | part += self.CRLF + chunk + self.CRLF 129 | return part 130 | 131 | async def stream_response(self, send: Send) -> None: 132 | await send( 133 | { 134 | "type": "http.response.start", 135 | "status": self.status_code, 136 | "headers": self.raw_headers, 137 | } 138 | ) 139 | async for chunk in self.body_iterator: 140 | if not isinstance(chunk, bytes): 141 | chunk = chunk.encode(self.charset) 142 | chunk = b64encode(chunk) 143 | await send( 144 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 145 | ) 146 | 147 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 148 | 149 | 150 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 151 | def return_content_type(filename): 152 | if gz_uncompressed_content_type: 153 | return gz_uncompressed_content_type 154 | else: 155 | return str(mimetypes.guess_type(filename)[0]) 156 | 157 | filename = str(file.filename) if file.filename else "" 158 | if filename.endswith(".gz"): 159 | filename = filename[:-3] 160 | 161 | gzip_file = gzip.open(file.file).read() 162 | return UploadFile( 163 | file=io.BytesIO(gzip_file), 164 | size=len(gzip_file), 165 | filename=filename, 166 | headers=Headers({"content-type": return_content_type(filename)}), 167 | ) 168 | 169 | 170 | @router.post("/test-project/v1/process-file-5") 171 | @router.post("/test-project/v1.2.3/process-file-5") 172 | def pipeline_1( 173 | request: Request, 174 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 175 | files: Union[List[UploadFile], None] = File(default=None), 176 | output_format: Union[str, None] = Form(default=None), 177 | output_schema: str = Form(default=None), 178 | input1: List[str] = Form(default=[]), 179 | input2: List[str] = Form(default=[]), 180 | ): 181 | if files: 182 | for file_index in range(len(files)): 183 | if files[file_index].content_type == "application/gzip": 184 | files[file_index] = ungz_file(files[file_index], gz_uncompressed_content_type) 185 | 186 | content_type = request.headers.get("Accept") 187 | 188 | default_response_type = output_format or "application/json" 189 | if not content_type or content_type == "*/*" or content_type == "multipart/mixed": 190 | media_type = default_response_type 191 | 
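# NOTE: "multipart/mixed" is grouped with "*/*" here, so the parts of a
# multipart response still carry the default media type. Also worth
# knowing: is_expected_response_type() above returns True when the
# pipeline's return type does NOT suit media_type; the handler below
# relies on that inverted sense to raise 406.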
else: 192 | media_type = content_type 193 | 194 | default_response_schema = output_schema or "labelstudio" 195 | 196 | if isinstance(files, list) and len(files): 197 | if len(files) > 1: 198 | if content_type and content_type not in [ 199 | "*/*", 200 | "multipart/mixed", 201 | "application/json", 202 | "text/csv", 203 | ]: 204 | raise HTTPException( 205 | detail=( 206 | f"Conflict in media type {content_type}" 207 | ' with response type "multipart/mixed".\n' 208 | ), 209 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 210 | ) 211 | 212 | def response_generator(is_multipart): 213 | for file in files: 214 | file_content_type = get_validated_mimetype(file) 215 | 216 | _file = file.file 217 | 218 | response = pipeline_api( 219 | _file, 220 | m_input1=input1, 221 | m_input2=input2, 222 | response_type=media_type, 223 | response_schema=default_response_schema, 224 | file_content_type=file_content_type, 225 | ) 226 | 227 | if is_expected_response_type(media_type, type(response)): 228 | raise HTTPException( 229 | detail=( 230 | f"Conflict in media type {media_type}" 231 | f" with response type {type(response)}.\n" 232 | ), 233 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 234 | ) 235 | 236 | valid_response_types = ["application/json", "text/csv", "*/*", "multipart/mixed"] 237 | if media_type in valid_response_types: 238 | if is_multipart: 239 | if type(response) not in [str, bytes]: 240 | response = json.dumps(response) 241 | elif media_type == "text/csv": 242 | response = PlainTextResponse(response) 243 | yield response 244 | else: 245 | raise HTTPException( 246 | detail=f"Unsupported media type {media_type}.\n", 247 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 248 | ) 249 | 250 | def join_responses(responses): 251 | if media_type != "text/csv": 252 | return responses 253 | data = pd.read_csv(io.BytesIO(responses[0].body)) 254 | if len(responses) > 1: 255 | for resp in responses[1:]: 256 | resp_data = pd.read_csv(io.BytesIO(resp.body)) 257 | data = data.merge(resp_data, how="outer") 258 | return PlainTextResponse(data.to_csv()) 259 | 260 | if content_type == "multipart/mixed": 261 | return MultipartMixedResponse( 262 | response_generator(is_multipart=True), content_type=media_type 263 | ) 264 | else: 265 | return ( 266 | list(response_generator(is_multipart=False))[0] 267 | if len(files) == 1 268 | else join_responses(list(response_generator(is_multipart=False))) 269 | ) 270 | else: 271 | raise HTTPException( 272 | detail='Request parameter "files" is required.\n', 273 | status_code=status.HTTP_400_BAD_REQUEST, 274 | ) 275 | 276 | 277 | app.include_router(router) 278 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_1.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 
3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | import json 13 | from fastapi.responses import StreamingResponse 14 | from starlette.datastructures import Headers 15 | from starlette.types import Send 16 | from base64 import b64encode 17 | from typing import Optional, Mapping 18 | import secrets 19 | 20 | 21 | app = FastAPI() 22 | router = APIRouter() 23 | 24 | 25 | # pipeline-api 26 | def pipeline_api( 27 | text, 28 | ): 29 | return {"silly_result": " : ".join([str(len(text)), text])} 30 | 31 | 32 | def get_validated_mimetype(file): 33 | """ 34 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 35 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 36 | return HTTP 400 for an invalid type. 37 | """ 38 | content_type = file.content_type 39 | if not content_type or content_type == "application/octet-stream": 40 | content_type = mimetypes.guess_type(str(file.filename))[0] 41 | 42 | # Some filetypes missing for this library, just hardcode them for now 43 | if not content_type: 44 | if file.filename.endswith(".md"): 45 | content_type = "text/markdown" 46 | elif file.filename.endswith(".msg"): 47 | content_type = "message/rfc822" 48 | 49 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 50 | if allowed_mimetypes_str is not None: 51 | allowed_mimetypes = allowed_mimetypes_str.split(",") 52 | 53 | if content_type not in allowed_mimetypes: 54 | raise HTTPException( 55 | status_code=400, 56 | detail=( 57 | f"Unable to process {file.filename}: " 58 | f"File type {content_type} is not supported." 
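# NOTE: in the text pipelines this function acts purely as a guard; the
# caller below invokes get_validated_mimetype(file) and discards the
# returned content type.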
59 | ), 60 | ) 61 | 62 | return content_type 63 | 64 | 65 | class MultipartMixedResponse(StreamingResponse): 66 | CRLF = b"\r\n" 67 | 68 | def __init__(self, *args, content_type: str = None, **kwargs): 69 | super().__init__(*args, **kwargs) 70 | self.content_type = content_type 71 | 72 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 73 | super().init_headers(headers) 74 | self.boundary_value = secrets.token_hex(16) 75 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 76 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 77 | 78 | @property 79 | def boundary(self): 80 | return b"--" + self.boundary_value.encode() 81 | 82 | def _build_part_headers(self, headers: dict) -> bytes: 83 | header_bytes = b"" 84 | for header, value in headers.items(): 85 | header_bytes += f"{header}: {value}".encode() + self.CRLF 86 | return header_bytes 87 | 88 | def build_part(self, chunk: bytes) -> bytes: 89 | part = self.boundary + self.CRLF 90 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 91 | if self.content_type is not None: 92 | part_headers["Content-Type"] = self.content_type 93 | part += self._build_part_headers(part_headers) 94 | part += self.CRLF + chunk + self.CRLF 95 | return part 96 | 97 | async def stream_response(self, send: Send) -> None: 98 | await send( 99 | { 100 | "type": "http.response.start", 101 | "status": self.status_code, 102 | "headers": self.raw_headers, 103 | } 104 | ) 105 | async for chunk in self.body_iterator: 106 | if not isinstance(chunk, bytes): 107 | chunk = chunk.encode(self.charset) 108 | chunk = b64encode(chunk) 109 | await send( 110 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 111 | ) 112 | 113 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 114 | 115 | 116 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 117 | def return_content_type(filename): 118 | if gz_uncompressed_content_type: 119 | return gz_uncompressed_content_type 120 | else: 121 | return str(mimetypes.guess_type(filename)[0]) 122 | 123 | filename = str(file.filename) if file.filename else "" 124 | if filename.endswith(".gz"): 125 | filename = filename[:-3] 126 | 127 | gzip_file = gzip.open(file.file).read() 128 | return UploadFile( 129 | file=io.BytesIO(gzip_file), 130 | size=len(gzip_file), 131 | filename=filename, 132 | headers=Headers({"content-type": return_content_type(filename)}), 133 | ) 134 | 135 | 136 | @router.post("/test-project/v1/process-text-1") 137 | @router.post("/test-project/v1.2.3/process-text-1") 138 | def pipeline_1( 139 | request: Request, 140 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 141 | text_files: Union[List[UploadFile], None] = File(default=None), 142 | ): 143 | if text_files: 144 | for file_index in range(len(text_files)): 145 | if text_files[file_index].content_type == "application/gzip": 146 | text_files[file_index] = ungz_file(text_files[file_index]) 147 | 148 | content_type = request.headers.get("Accept") 149 | 150 | if isinstance(text_files, list) and len(text_files): 151 | if len(text_files) > 1: 152 | if content_type and content_type not in [ 153 | "*/*", 154 | "multipart/mixed", 155 | "application/json", 156 | "text/csv", 157 | ]: 158 | raise HTTPException( 159 | detail=( 160 | f"Conflict in media type {content_type}" 161 | ' with response type "multipart/mixed".\n' 162 | ), 163 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 164 | ) 
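# NOTE: hedged client-side sketch -- when "Accept: multipart/mixed" is
# sent, each part of the response body is base64-encoded by
# MultipartMixedResponse above, so a client could decode it roughly like
# this (requests-toolbelt usage is an assumption, not something this
# project ships):
#   from base64 import b64decode
#   from requests_toolbelt.multipart.decoder import MultipartDecoder
#   decoder = MultipartDecoder(resp.content, resp.headers["content-type"])
#   parts = [b64decode(p.content) for p in decoder.parts]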
165 | 166 | def response_generator(is_multipart): 167 | for file in text_files: 168 | get_validated_mimetype(file) 169 | 170 | text = file.file.read().decode("utf-8") 171 | 172 | response = pipeline_api( 173 | text, 174 | ) 175 | 176 | if is_multipart: 177 | if type(response) not in [str, bytes]: 178 | response = json.dumps(response) 179 | yield response 180 | 181 | if content_type == "multipart/mixed": 182 | return MultipartMixedResponse( 183 | response_generator(is_multipart=True), 184 | ) 185 | else: 186 | return ( 187 | list(response_generator(is_multipart=False))[0] 188 | if len(text_files) == 1 189 | else response_generator(is_multipart=False) 190 | ) 191 | else: 192 | raise HTTPException( 193 | detail='Request parameter "text_files" is required.\n', 194 | status_code=status.HTTP_400_BAD_REQUEST, 195 | ) 196 | 197 | 198 | app.include_router(router) 199 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_2.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | import json 13 | from fastapi.responses import StreamingResponse 14 | from starlette.datastructures import Headers 15 | from starlette.types import Send 16 | from base64 import b64encode 17 | from typing import Optional, Mapping 18 | import secrets 19 | 20 | 21 | app = FastAPI() 22 | router = APIRouter() 23 | 24 | 25 | # pipeline-api 26 | def pipeline_api(text, m_input1=[], m_input2=[]): 27 | return {"silly_result": " : ".join([str(len(text)), text, str(m_input1), str(m_input2)])} 28 | 29 | 30 | def get_validated_mimetype(file): 31 | """ 32 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 33 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 34 | return HTTP 400 for an invalid type. 35 | """ 36 | content_type = file.content_type 37 | if not content_type or content_type == "application/octet-stream": 38 | content_type = mimetypes.guess_type(str(file.filename))[0] 39 | 40 | # Some filetypes missing for this library, just hardcode them for now 41 | if not content_type: 42 | if file.filename.endswith(".md"): 43 | content_type = "text/markdown" 44 | elif file.filename.endswith(".msg"): 45 | content_type = "message/rfc822" 46 | 47 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 48 | if allowed_mimetypes_str is not None: 49 | allowed_mimetypes = allowed_mimetypes_str.split(",") 50 | 51 | if content_type not in allowed_mimetypes: 52 | raise HTTPException( 53 | status_code=400, 54 | detail=( 55 | f"Unable to process {file.filename}: " 56 | f"File type {content_type} is not supported." 
57 | ), 58 | ) 59 | 60 | return content_type 61 | 62 | 63 | class MultipartMixedResponse(StreamingResponse): 64 | CRLF = b"\r\n" 65 | 66 | def __init__(self, *args, content_type: str = None, **kwargs): 67 | super().__init__(*args, **kwargs) 68 | self.content_type = content_type 69 | 70 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 71 | super().init_headers(headers) 72 | self.boundary_value = secrets.token_hex(16) 73 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 74 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 75 | 76 | @property 77 | def boundary(self): 78 | return b"--" + self.boundary_value.encode() 79 | 80 | def _build_part_headers(self, headers: dict) -> bytes: 81 | header_bytes = b"" 82 | for header, value in headers.items(): 83 | header_bytes += f"{header}: {value}".encode() + self.CRLF 84 | return header_bytes 85 | 86 | def build_part(self, chunk: bytes) -> bytes: 87 | part = self.boundary + self.CRLF 88 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 89 | if self.content_type is not None: 90 | part_headers["Content-Type"] = self.content_type 91 | part += self._build_part_headers(part_headers) 92 | part += self.CRLF + chunk + self.CRLF 93 | return part 94 | 95 | async def stream_response(self, send: Send) -> None: 96 | await send( 97 | { 98 | "type": "http.response.start", 99 | "status": self.status_code, 100 | "headers": self.raw_headers, 101 | } 102 | ) 103 | async for chunk in self.body_iterator: 104 | if not isinstance(chunk, bytes): 105 | chunk = chunk.encode(self.charset) 106 | chunk = b64encode(chunk) 107 | await send( 108 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 109 | ) 110 | 111 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 112 | 113 | 114 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 115 | def return_content_type(filename): 116 | if gz_uncompressed_content_type: 117 | return gz_uncompressed_content_type 118 | else: 119 | return str(mimetypes.guess_type(filename)[0]) 120 | 121 | filename = str(file.filename) if file.filename else "" 122 | if filename.endswith(".gz"): 123 | filename = filename[:-3] 124 | 125 | gzip_file = gzip.open(file.file).read() 126 | return UploadFile( 127 | file=io.BytesIO(gzip_file), 128 | size=len(gzip_file), 129 | filename=filename, 130 | headers=Headers({"content-type": return_content_type(filename)}), 131 | ) 132 | 133 | 134 | @router.post("/test-project/v1/process-text-2") 135 | @router.post("/test-project/v1.2.3/process-text-2") 136 | def pipeline_1( 137 | request: Request, 138 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 139 | text_files: Union[List[UploadFile], None] = File(default=None), 140 | input1: List[str] = Form(default=[]), 141 | input2: List[str] = Form(default=[]), 142 | ): 143 | if text_files: 144 | for file_index in range(len(text_files)): 145 | if text_files[file_index].content_type == "application/gzip": 146 | text_files[file_index] = ungz_file(text_files[file_index]) 147 | 148 | content_type = request.headers.get("Accept") 149 | 150 | if isinstance(text_files, list) and len(text_files): 151 | if len(text_files) > 1: 152 | if content_type and content_type not in [ 153 | "*/*", 154 | "multipart/mixed", 155 | "application/json", 156 | "text/csv", 157 | ]: 158 | raise HTTPException( 159 | detail=( 160 | f"Conflict in media type {content_type}" 161 | ' with response type 
"multipart/mixed".\n' 162 | ), 163 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 164 | ) 165 | 166 | def response_generator(is_multipart): 167 | for file in text_files: 168 | get_validated_mimetype(file) 169 | 170 | text = file.file.read().decode("utf-8") 171 | 172 | response = pipeline_api( 173 | text, 174 | m_input1=input1, 175 | m_input2=input2, 176 | ) 177 | 178 | if is_multipart: 179 | if type(response) not in [str, bytes]: 180 | response = json.dumps(response) 181 | yield response 182 | 183 | if content_type == "multipart/mixed": 184 | return MultipartMixedResponse( 185 | response_generator(is_multipart=True), 186 | ) 187 | else: 188 | return ( 189 | list(response_generator(is_multipart=False))[0] 190 | if len(text_files) == 1 191 | else response_generator(is_multipart=False) 192 | ) 193 | else: 194 | raise HTTPException( 195 | detail='Request parameter "text_files" is required.\n', 196 | status_code=status.HTTP_400_BAD_REQUEST, 197 | ) 198 | 199 | 200 | app.include_router(router) 201 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_3.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | from fastapi.responses import PlainTextResponse 13 | import json 14 | from fastapi.responses import StreamingResponse 15 | from starlette.datastructures import Headers 16 | from starlette.types import Send 17 | from base64 import b64encode 18 | from typing import Optional, Mapping 19 | import secrets 20 | import pandas as pd 21 | 22 | 23 | app = FastAPI() 24 | router = APIRouter() 25 | 26 | 27 | def is_expected_response_type(media_type, response_type): 28 | if media_type == "application/json" and response_type not in [dict, list]: 29 | return True 30 | elif media_type == "text/csv" and response_type != str: 31 | return True 32 | else: 33 | return False 34 | 35 | 36 | # pipeline-api 37 | def pipeline_api(text, response_type="text/csv"): 38 | data = pd.DataFrame(data={"silly_result": [str(len(text)), text, str(response_type)]}) 39 | if response_type == "text/csv": 40 | return data.to_csv() 41 | else: 42 | text = " : ".join(list(data["silly_result"])) 43 | return {"silly_result": text} 44 | 45 | 46 | def get_validated_mimetype(file): 47 | """ 48 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 49 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 50 | return HTTP 400 for an invalid type. 
51 | """ 52 | content_type = file.content_type 53 | if not content_type or content_type == "application/octet-stream": 54 | content_type = mimetypes.guess_type(str(file.filename))[0] 55 | 56 | # Some filetypes missing for this library, just hardcode them for now 57 | if not content_type: 58 | if file.filename.endswith(".md"): 59 | content_type = "text/markdown" 60 | elif file.filename.endswith(".msg"): 61 | content_type = "message/rfc822" 62 | 63 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 64 | if allowed_mimetypes_str is not None: 65 | allowed_mimetypes = allowed_mimetypes_str.split(",") 66 | 67 | if content_type not in allowed_mimetypes: 68 | raise HTTPException( 69 | status_code=400, 70 | detail=( 71 | f"Unable to process {file.filename}: " 72 | f"File type {content_type} is not supported." 73 | ), 74 | ) 75 | 76 | return content_type 77 | 78 | 79 | class MultipartMixedResponse(StreamingResponse): 80 | CRLF = b"\r\n" 81 | 82 | def __init__(self, *args, content_type: str = None, **kwargs): 83 | super().__init__(*args, **kwargs) 84 | self.content_type = content_type 85 | 86 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 87 | super().init_headers(headers) 88 | self.boundary_value = secrets.token_hex(16) 89 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 90 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 91 | 92 | @property 93 | def boundary(self): 94 | return b"--" + self.boundary_value.encode() 95 | 96 | def _build_part_headers(self, headers: dict) -> bytes: 97 | header_bytes = b"" 98 | for header, value in headers.items(): 99 | header_bytes += f"{header}: {value}".encode() + self.CRLF 100 | return header_bytes 101 | 102 | def build_part(self, chunk: bytes) -> bytes: 103 | part = self.boundary + self.CRLF 104 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 105 | if self.content_type is not None: 106 | part_headers["Content-Type"] = self.content_type 107 | part += self._build_part_headers(part_headers) 108 | part += self.CRLF + chunk + self.CRLF 109 | return part 110 | 111 | async def stream_response(self, send: Send) -> None: 112 | await send( 113 | { 114 | "type": "http.response.start", 115 | "status": self.status_code, 116 | "headers": self.raw_headers, 117 | } 118 | ) 119 | async for chunk in self.body_iterator: 120 | if not isinstance(chunk, bytes): 121 | chunk = chunk.encode(self.charset) 122 | chunk = b64encode(chunk) 123 | await send( 124 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 125 | ) 126 | 127 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 128 | 129 | 130 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 131 | def return_content_type(filename): 132 | if gz_uncompressed_content_type: 133 | return gz_uncompressed_content_type 134 | else: 135 | return str(mimetypes.guess_type(filename)[0]) 136 | 137 | filename = str(file.filename) if file.filename else "" 138 | if filename.endswith(".gz"): 139 | filename = filename[:-3] 140 | 141 | gzip_file = gzip.open(file.file).read() 142 | return UploadFile( 143 | file=io.BytesIO(gzip_file), 144 | size=len(gzip_file), 145 | filename=filename, 146 | headers=Headers({"content-type": return_content_type(filename)}), 147 | ) 148 | 149 | 150 | @router.post("/test-project/v1/process-text-3") 151 | @router.post("/test-project/v1.2.3/process-text-3") 152 | def pipeline_1( 153 | request: Request, 
154 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 155 | text_files: Union[List[UploadFile], None] = File(default=None), 156 | output_format: Union[str, None] = Form(default=None), 157 | ): 158 | if text_files: 159 | for file_index in range(len(text_files)): 160 | if text_files[file_index].content_type == "application/gzip": 161 | text_files[file_index] = ungz_file(text_files[file_index]) 162 | 163 | content_type = request.headers.get("Accept") 164 | 165 | default_response_type = output_format or "text/csv" 166 | if not content_type or content_type == "*/*" or content_type == "multipart/mixed": 167 | media_type = default_response_type 168 | else: 169 | media_type = content_type 170 | 171 | if isinstance(text_files, list) and len(text_files): 172 | if len(text_files) > 1: 173 | if content_type and content_type not in [ 174 | "*/*", 175 | "multipart/mixed", 176 | "application/json", 177 | "text/csv", 178 | ]: 179 | raise HTTPException( 180 | detail=( 181 | f"Conflict in media type {content_type}" 182 | ' with response type "multipart/mixed".\n' 183 | ), 184 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 185 | ) 186 | 187 | def response_generator(is_multipart): 188 | for file in text_files: 189 | get_validated_mimetype(file) 190 | 191 | text = file.file.read().decode("utf-8") 192 | 193 | response = pipeline_api( 194 | text, 195 | response_type=media_type, 196 | ) 197 | 198 | if is_expected_response_type(media_type, type(response)): 199 | raise HTTPException( 200 | detail=( 201 | f"Conflict in media type {media_type}" 202 | f" with response type {type(response)}.\n" 203 | ), 204 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 205 | ) 206 | 207 | valid_response_types = ["application/json", "text/csv", "*/*", "multipart/mixed"] 208 | if media_type in valid_response_types: 209 | if is_multipart: 210 | if type(response) not in [str, bytes]: 211 | response = json.dumps(response) 212 | elif media_type == "text/csv": 213 | response = PlainTextResponse(response) 214 | yield response 215 | else: 216 | raise HTTPException( 217 | detail=f"Unsupported media type {media_type}.\n", 218 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 219 | ) 220 | 221 | def join_responses(responses): 222 | if media_type != "text/csv": 223 | return responses 224 | data = pd.read_csv(io.BytesIO(responses[0].body)) 225 | if len(responses) > 1: 226 | for resp in responses[1:]: 227 | resp_data = pd.read_csv(io.BytesIO(resp.body)) 228 | data = data.merge(resp_data, how="outer") 229 | return PlainTextResponse(data.to_csv()) 230 | 231 | if content_type == "multipart/mixed": 232 | return MultipartMixedResponse( 233 | response_generator(is_multipart=True), content_type=media_type 234 | ) 235 | else: 236 | return ( 237 | list(response_generator(is_multipart=False))[0] 238 | if len(text_files) == 1 239 | else join_responses(list(response_generator(is_multipart=False))) 240 | ) 241 | else: 242 | raise HTTPException( 243 | detail='Request parameter "text_files" is required.\n', 244 | status_code=status.HTTP_400_BAD_REQUEST, 245 | ) 246 | 247 | 248 | app.include_router(router) 249 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_4.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 
3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | from fastapi.responses import PlainTextResponse 13 | import json 14 | from fastapi.responses import StreamingResponse 15 | from starlette.datastructures import Headers 16 | from starlette.types import Send 17 | from base64 import b64encode 18 | from typing import Optional, Mapping 19 | import secrets 20 | import pandas as pd 21 | 22 | 23 | app = FastAPI() 24 | router = APIRouter() 25 | 26 | 27 | def is_expected_response_type(media_type, response_type): 28 | if media_type == "application/json" and response_type not in [dict, list]: 29 | return True 30 | elif media_type == "text/csv" and response_type != str: 31 | return True 32 | else: 33 | return False 34 | 35 | 36 | # pipeline-api 37 | def pipeline_api( 38 | text, 39 | response_type="text/csv", 40 | response_schema="isd", 41 | ): 42 | data = pd.DataFrame( 43 | data={"silly_result": [str(len(text)), text, str(response_type), str(response_schema)]} 44 | ) 45 | if response_type == "text/csv": 46 | return data.to_csv() 47 | else: 48 | text = " : ".join(list(data["silly_result"])) 49 | return {"silly_result": text} 50 | 51 | 52 | def get_validated_mimetype(file): 53 | """ 54 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 55 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 56 | return HTTP 400 for an invalid type. 57 | """ 58 | content_type = file.content_type 59 | if not content_type or content_type == "application/octet-stream": 60 | content_type = mimetypes.guess_type(str(file.filename))[0] 61 | 62 | # Some filetypes missing for this library, just hardcode them for now 63 | if not content_type: 64 | if file.filename.endswith(".md"): 65 | content_type = "text/markdown" 66 | elif file.filename.endswith(".msg"): 67 | content_type = "message/rfc822" 68 | 69 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 70 | if allowed_mimetypes_str is not None: 71 | allowed_mimetypes = allowed_mimetypes_str.split(",") 72 | 73 | if content_type not in allowed_mimetypes: 74 | raise HTTPException( 75 | status_code=400, 76 | detail=( 77 | f"Unable to process {file.filename}: " 78 | f"File type {content_type} is not supported." 
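# NOTE: a 400 here flags an unsupported *upload* type; the 406 responses
# raised later in the route handler flag an unacceptable Accept header.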
79 | ), 80 | ) 81 | 82 | return content_type 83 | 84 | 85 | class MultipartMixedResponse(StreamingResponse): 86 | CRLF = b"\r\n" 87 | 88 | def __init__(self, *args, content_type: str = None, **kwargs): 89 | super().__init__(*args, **kwargs) 90 | self.content_type = content_type 91 | 92 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 93 | super().init_headers(headers) 94 | self.boundary_value = secrets.token_hex(16) 95 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 96 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 97 | 98 | @property 99 | def boundary(self): 100 | return b"--" + self.boundary_value.encode() 101 | 102 | def _build_part_headers(self, headers: dict) -> bytes: 103 | header_bytes = b"" 104 | for header, value in headers.items(): 105 | header_bytes += f"{header}: {value}".encode() + self.CRLF 106 | return header_bytes 107 | 108 | def build_part(self, chunk: bytes) -> bytes: 109 | part = self.boundary + self.CRLF 110 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 111 | if self.content_type is not None: 112 | part_headers["Content-Type"] = self.content_type 113 | part += self._build_part_headers(part_headers) 114 | part += self.CRLF + chunk + self.CRLF 115 | return part 116 | 117 | async def stream_response(self, send: Send) -> None: 118 | await send( 119 | { 120 | "type": "http.response.start", 121 | "status": self.status_code, 122 | "headers": self.raw_headers, 123 | } 124 | ) 125 | async for chunk in self.body_iterator: 126 | if not isinstance(chunk, bytes): 127 | chunk = chunk.encode(self.charset) 128 | chunk = b64encode(chunk) 129 | await send( 130 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 131 | ) 132 | 133 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 134 | 135 | 136 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 137 | def return_content_type(filename): 138 | if gz_uncompressed_content_type: 139 | return gz_uncompressed_content_type 140 | else: 141 | return str(mimetypes.guess_type(filename)[0]) 142 | 143 | filename = str(file.filename) if file.filename else "" 144 | if filename.endswith(".gz"): 145 | filename = filename[:-3] 146 | 147 | gzip_file = gzip.open(file.file).read() 148 | return UploadFile( 149 | file=io.BytesIO(gzip_file), 150 | size=len(gzip_file), 151 | filename=filename, 152 | headers=Headers({"content-type": return_content_type(filename)}), 153 | ) 154 | 155 | 156 | @router.post("/test-project/v1/process-text-4") 157 | @router.post("/test-project/v1.2.3/process-text-4") 158 | def pipeline_1( 159 | request: Request, 160 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 161 | text_files: Union[List[UploadFile], None] = File(default=None), 162 | output_format: Union[str, None] = Form(default=None), 163 | output_schema: str = Form(default=None), 164 | ): 165 | if text_files: 166 | for file_index in range(len(text_files)): 167 | if text_files[file_index].content_type == "application/gzip": 168 | text_files[file_index] = ungz_file(text_files[file_index]) 169 | 170 | content_type = request.headers.get("Accept") 171 | 172 | default_response_type = output_format or "text/csv" 173 | if not content_type or content_type == "*/*" or content_type == "multipart/mixed": 174 | media_type = default_response_type 175 | else: 176 | media_type = content_type 177 | 178 | default_response_schema = output_schema or "isd" 179 | 180 | if 
isinstance(text_files, list) and len(text_files): 181 | if len(text_files) > 1: 182 | if content_type and content_type not in [ 183 | "*/*", 184 | "multipart/mixed", 185 | "application/json", 186 | "text/csv", 187 | ]: 188 | raise HTTPException( 189 | detail=( 190 | f"Conflict in media type {content_type}" 191 | ' with response type "multipart/mixed".\n' 192 | ), 193 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 194 | ) 195 | 196 | def response_generator(is_multipart): 197 | for file in text_files: 198 | get_validated_mimetype(file) 199 | 200 | text = file.file.read().decode("utf-8") 201 | 202 | response = pipeline_api( 203 | text, 204 | response_type=media_type, 205 | response_schema=default_response_schema, 206 | ) 207 | 208 | if is_expected_response_type(media_type, type(response)): 209 | raise HTTPException( 210 | detail=( 211 | f"Conflict in media type {media_type}" 212 | f" with response type {type(response)}.\n" 213 | ), 214 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 215 | ) 216 | 217 | valid_response_types = ["application/json", "text/csv", "*/*", "multipart/mixed"] 218 | if media_type in valid_response_types: 219 | if is_multipart: 220 | if type(response) not in [str, bytes]: 221 | response = json.dumps(response) 222 | elif media_type == "text/csv": 223 | response = PlainTextResponse(response) 224 | yield response 225 | else: 226 | raise HTTPException( 227 | detail=f"Unsupported media type {media_type}.\n", 228 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 229 | ) 230 | 231 | def join_responses(responses): 232 | if media_type != "text/csv": 233 | return responses 234 | data = pd.read_csv(io.BytesIO(responses[0].body)) 235 | if len(responses) > 1: 236 | for resp in responses[1:]: 237 | resp_data = pd.read_csv(io.BytesIO(resp.body)) 238 | data = data.merge(resp_data, how="outer") 239 | return PlainTextResponse(data.to_csv()) 240 | 241 | if content_type == "multipart/mixed": 242 | return MultipartMixedResponse( 243 | response_generator(is_multipart=True), content_type=media_type 244 | ) 245 | else: 246 | return ( 247 | list(response_generator(is_multipart=False))[0] 248 | if len(text_files) == 1 249 | else join_responses(list(response_generator(is_multipart=False))) 250 | ) 251 | else: 252 | raise HTTPException( 253 | detail='Request parameter "text_files" is required.\n', 254 | status_code=status.HTTP_400_BAD_REQUEST, 255 | ) 256 | 257 | 258 | app.include_router(router) 259 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_1.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 
3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | import json 13 | from fastapi.responses import StreamingResponse 14 | from starlette.datastructures import Headers 15 | from starlette.types import Send 16 | from base64 import b64encode 17 | from typing import Optional, Mapping 18 | import secrets 19 | 20 | 21 | app = FastAPI() 22 | router = APIRouter() 23 | 24 | 25 | # pipeline-api 26 | def pipeline_api( 27 | text, 28 | file=None, 29 | filename=None, 30 | file_content_type=None, 31 | ): 32 | return { 33 | "silly_result": " : ".join( 34 | [ 35 | str(len(text if text else "")), 36 | str(text), 37 | str(len(file.read()) if file else None), 38 | str(filename), 39 | str(file_content_type), 40 | ] 41 | ) 42 | } 43 | 44 | 45 | def get_validated_mimetype(file): 46 | """ 47 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 48 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 49 | return HTTP 400 for an invalid type. 50 | """ 51 | content_type = file.content_type 52 | if not content_type or content_type == "application/octet-stream": 53 | content_type = mimetypes.guess_type(str(file.filename))[0] 54 | 55 | # Some filetypes missing for this library, just hardcode them for now 56 | if not content_type: 57 | if file.filename.endswith(".md"): 58 | content_type = "text/markdown" 59 | elif file.filename.endswith(".msg"): 60 | content_type = "message/rfc822" 61 | 62 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 63 | if allowed_mimetypes_str is not None: 64 | allowed_mimetypes = allowed_mimetypes_str.split(",") 65 | 66 | if content_type not in allowed_mimetypes: 67 | raise HTTPException( 68 | status_code=400, 69 | detail=( 70 | f"Unable to process {file.filename}: " 71 | f"File type {content_type} is not supported." 
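# NOTE: in this combined pipeline only the "files" uploads pass through
# this check; "text_files" are read below without mimetype validation.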
72 | ), 73 | ) 74 | 75 | return content_type 76 | 77 | 78 | class MultipartMixedResponse(StreamingResponse): 79 | CRLF = b"\r\n" 80 | 81 | def __init__(self, *args, content_type: str = None, **kwargs): 82 | super().__init__(*args, **kwargs) 83 | self.content_type = content_type 84 | 85 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 86 | super().init_headers(headers) 87 | self.boundary_value = secrets.token_hex(16) 88 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 89 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 90 | 91 | @property 92 | def boundary(self): 93 | return b"--" + self.boundary_value.encode() 94 | 95 | def _build_part_headers(self, headers: dict) -> bytes: 96 | header_bytes = b"" 97 | for header, value in headers.items(): 98 | header_bytes += f"{header}: {value}".encode() + self.CRLF 99 | return header_bytes 100 | 101 | def build_part(self, chunk: bytes) -> bytes: 102 | part = self.boundary + self.CRLF 103 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 104 | if self.content_type is not None: 105 | part_headers["Content-Type"] = self.content_type 106 | part += self._build_part_headers(part_headers) 107 | part += self.CRLF + chunk + self.CRLF 108 | return part 109 | 110 | async def stream_response(self, send: Send) -> None: 111 | await send( 112 | { 113 | "type": "http.response.start", 114 | "status": self.status_code, 115 | "headers": self.raw_headers, 116 | } 117 | ) 118 | async for chunk in self.body_iterator: 119 | if not isinstance(chunk, bytes): 120 | chunk = chunk.encode(self.charset) 121 | chunk = b64encode(chunk) 122 | await send( 123 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 124 | ) 125 | 126 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 127 | 128 | 129 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 130 | def return_content_type(filename): 131 | if gz_uncompressed_content_type: 132 | return gz_uncompressed_content_type 133 | else: 134 | return str(mimetypes.guess_type(filename)[0]) 135 | 136 | filename = str(file.filename) if file.filename else "" 137 | if filename.endswith(".gz"): 138 | filename = filename[:-3] 139 | 140 | gzip_file = gzip.open(file.file).read() 141 | return UploadFile( 142 | file=io.BytesIO(gzip_file), 143 | size=len(gzip_file), 144 | filename=filename, 145 | headers=Headers({"content-type": return_content_type(filename)}), 146 | ) 147 | 148 | 149 | @router.post("/test-project/v1/process-text-file-1") 150 | @router.post("/test-project/v1.2.3/process-text-file-1") 151 | def pipeline_1( 152 | request: Request, 153 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 154 | files: Union[List[UploadFile], None] = File(default=None), 155 | text_files: Union[List[UploadFile], None] = File(default=None), 156 | ): 157 | if files: 158 | for file_index in range(len(files)): 159 | if files[file_index].content_type == "application/gzip": 160 | files[file_index] = ungz_file(files[file_index], gz_uncompressed_content_type) 161 | 162 | if text_files: 163 | for file_index in range(len(text_files)): 164 | if text_files[file_index].content_type == "application/gzip": 165 | text_files[file_index] = ungz_file(text_files[file_index]) 166 | 167 | content_type = request.headers.get("Accept") 168 | 169 | has_text = isinstance(text_files, list) and len(text_files) 170 | has_files = isinstance(files, list) and len(files) 171 | if 
not has_text and not has_files: 172 | raise HTTPException( 173 | detail='One of the request parameters "text_files" or "files" is required.\n', 174 | status_code=status.HTTP_400_BAD_REQUEST, 175 | ) 176 | files_list: List = files or [] 177 | text_files_list: List = text_files or [] 178 | 179 | if len(files_list) or len(text_files_list): 180 | if all( 181 | [ 182 | content_type, 183 | content_type not in ["*/*", "multipart/mixed", "application/json", "text/csv"], 184 | len(files_list) + len(text_files_list) > 1, 185 | ] 186 | ): 187 | raise HTTPException( 188 | detail=( 189 | f"Conflict in media type {content_type}" 190 | ' with response type "multipart/mixed".\n' 191 | ), 192 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 193 | ) 194 | 195 | def response_generator(is_multipart): 196 | for text_file in text_files_list: 197 | text = text_file.file.read().decode("utf-8") 198 | 199 | response = pipeline_api( 200 | text=text, 201 | file=None, 202 | ) 203 | 204 | if is_multipart: 205 | if type(response) not in [str, bytes]: 206 | response = json.dumps(response) 207 | yield response 208 | 209 | for file in files_list: 210 | _file = file.file 211 | 212 | file_content_type = get_validated_mimetype(file) 213 | 214 | response = pipeline_api( 215 | text=None, 216 | file=_file, 217 | filename=file.filename, 218 | file_content_type=file_content_type, 219 | ) 220 | 221 | if is_multipart: 222 | if type(response) not in [str, bytes]: 223 | response = json.dumps(response) 224 | yield response 225 | 226 | if content_type == "multipart/mixed": 227 | return MultipartMixedResponse( 228 | response_generator(is_multipart=True), 229 | ) 230 | else: 231 | return ( 232 | list(response_generator(is_multipart=False))[0] 233 | if len(files_list + text_files_list) == 1 234 | else response_generator(is_multipart=False) 235 | ) 236 | else: 237 | raise HTTPException( 238 | detail='Request parameters "files" or "text_files" are required.\n', 239 | status_code=status.HTTP_400_BAD_REQUEST, 240 | ) 241 | 242 | 243 | app.include_router(router) 244 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/preprocessing-pipeline-family.yaml: -------------------------------------------------------------------------------- 1 | name: test-project 2 | version: 1.2.3 3 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/scripts/check-and-format-notebooks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from copy import deepcopy 5 | import difflib 6 | import json 7 | from pathlib import Path 8 | import sys 9 | from typing import List, Tuple, Union 10 | 11 | from nbdev import clean 12 | from nbconvert.preprocessors import ExecutePreprocessor 13 | import nbformat 14 | from unstructured_api_tools.pipelines.convert import read_notebook 15 | 16 | 17 | def process_nb(nb: nbformat.NotebookNode, working_dir: Union[str, Path]) -> nbformat.NotebookNode: 18 | """Execute cells in nb using working_dir as the working directory for imports, modifying the 19 | notebook in place (in memory).""" 20 | ep = ExecutePreprocessor(timeout=600) 21 | ep.preprocess(nb, {"metadata": {"path": working_dir}}) 22 | return nb 23 | 24 | 25 | def nb_paths(root_path: Union[str, Path]) -> List[Path]: 26 | """Fetches all .ipynb filenames that belong to subdirectories of root_path (1 level deep) with 27 | 'notebooks' in the 
name.""" 28 | root_path = Path(root_path) 29 | return [ 30 | fn 31 | for dir in root_path.iterdir() 32 | # NOTE(alan): Search only in paths with 'notebooks' in the title such as pipeline-notebooks 33 | # and exploration-notebooks 34 | if "notebooks" in dir.stem and dir.is_dir() 35 | for fn in dir.iterdir() 36 | if fn.suffix == ".ipynb" 37 | ] 38 | 39 | 40 | def to_results_str(fns: List[Path], nonmatching_nbs: List[Path]) -> Tuple[str, str]: 41 | """Given files that were checked and list of files that would be changed, produces a summary of 42 | changes as well as a list of files to be changed""" 43 | unchanged = len(fns) - len(nonmatching_nbs) 44 | results = [] 45 | if nonmatching_nbs: 46 | results.append( 47 | f"{len(nonmatching_nbs)} " 48 | f"{'file' if len(nonmatching_nbs) == 1 else 'files'} " 49 | f"{'would be ' if check else ''}changed" 50 | ) 51 | if unchanged: 52 | results.append( 53 | f"{unchanged} " 54 | f"{'file' if unchanged == 1 else 'files'} " 55 | f"{'would be ' if check else ''}left unchanged" 56 | ) 57 | summary_str = ", ".join(results) + ".\n" 58 | if nonmatching_nbs: 59 | details_str = ( 60 | f"The following notebooks {'would have been' if check else 'were'} " 61 | "changed when executed and cleaned:\n* " + "\n* ".join(nonmatching_nbs) + "\n" 62 | ) 63 | else: 64 | details_str = "" 65 | 66 | return summary_str, details_str 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser() 71 | parser.add_argument( 72 | "--check", 73 | default=False, 74 | action="store_true", 75 | help="Check notebook format without making changes. Return code 0 means formatting would " 76 | "produce no changes. Return code 1 means some files would be changed.", 77 | ) 78 | parser.add_argument( 79 | "notebooks", 80 | metavar="notebook", 81 | nargs="*", 82 | help="Path(s) to notebook(s) to format (or check). 
If you don't pass any paths, " 83 | "notebooks in any subfolders with 'notebooks' in the name will be processed.", 84 | default=[], 85 | ) 86 | args = parser.parse_args() 87 | check = args.check 88 | notebooks = args.notebooks 89 | 90 | root_path = Path(__file__).parent.parent 91 | nonmatching_nbs = [] 92 | fns = notebooks if notebooks else nb_paths(root_path) 93 | for fn in fns: 94 | nb = read_notebook(fn) 95 | modified_nb = deepcopy(nb) 96 | process_nb(modified_nb, root_path) 97 | clean.clean_nb(modified_nb, allowed_cell_metadata_keys=["tags"]) 98 | if nb != modified_nb: 99 | nonmatching_nbs.append(str(fn)) 100 | nb_json = json.dumps(nb.dict(), indent=2, sort_keys=True) 101 | modified_nb_json = json.dumps(modified_nb.dict(), indent=2, sort_keys=True) 102 | sys.stderr.write(f"The following diff shows the modifications made to {fn}\n") 103 | sys.stderr.writelines( 104 | ( 105 | difflib.unified_diff( 106 | nb_json.splitlines(keepends=True), 107 | modified_nb_json.splitlines(keepends=True), 108 | ) 109 | ) 110 | ) 111 | if not check: 112 | nbformat.write(modified_nb, fn) 113 | 114 | summary_str, details_str = to_results_str(fns, nonmatching_nbs) 115 | print(summary_str) 116 | if check: 117 | sys.stderr.write(details_str) 118 | if nonmatching_nbs: 119 | sys.exit(1) 120 | else: 121 | print(details_str) 122 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/scripts/test-doc-pipeline-apis-consistent.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -eu -o pipefail 4 | 5 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 6 | cd "$SCRIPT_DIR"/.. 7 | 8 | PIPELINE_OUTPUT_DIR=tmp-api-check-output-$RANDOM 9 | FILE_INDICATING_FAILURE="$PIPELINE_OUTPUT_DIR"-has-failures 10 | mkdir -p $PIPELINE_OUTPUT_DIR 11 | touch $PIPELINE_OUTPUT_DIR/__init__.py 12 | 13 | function tmp_pipeline_comp_cleanup () { 14 | cd "$SCRIPT_DIR"/.. 15 | rm -f "$FILE_INDICATING_FAILURE" 16 | if [[ "$1" -eq 0 ]]; then 17 | rm -rf $PIPELINE_OUTPUT_DIR 18 | fi 19 | exit "$1" 20 | } 21 | 22 | # Now in project root 23 | cd ../.. 24 | 25 | PYTHONPATH=. PIPELINE_FAMILY_CONFIG=test_unstructured_api_tools/pipeline-test-project/preprocessing-pipeline-family.yaml \ 26 | python3 ./unstructured_api_tools/cli.py convert-pipeline-notebooks \ 27 | --input-directory ./test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks \ 28 | --output-directory ./test_unstructured_api_tools/pipeline-test-project/"$PIPELINE_OUTPUT_DIR" 29 | 30 | # Back in the test project 31 | cd - 32 | 33 | NUM_PIPELINE_API_FILES_GENERATED=$(find "$PIPELINE_OUTPUT_DIR" -name "*.py" | wc -l) 34 | 35 | if [[ "$NUM_PIPELINE_API_FILES_GENERATED" -eq 0 ]]; then 36 | echo "No pipelines were created by unstructured_api_tools convert-pipeline-notebooks" 37 | tmp_pipeline_comp_cleanup 1 38 | fi 39 | 40 | NUM_EXISTING_PIPELINE_API_FILES=$(find "$PACKAGE_NAME"/api -name "*.py" | wc -l) 41 | 42 | if [[ "$NUM_PIPELINE_API_FILES_GENERATED" -gt "$NUM_EXISTING_PIPELINE_API_FILES" ]]; then 43 | echo "More pipeline api files were autogenerated than appear in the ${PACKAGE_NAME}/api" 44 | tmp_pipeline_comp_cleanup 1 45 | elif [[ "$NUM_PIPELINE_API_FILES_GENERATED" -lt "$NUM_EXISTING_PIPELINE_API_FILES" ]]; then 46 | echo "Fewer pipeline api files were autogenerated than appear in the ${PACKAGE_NAME}/api" 47 | tmp_pipeline_comp_cleanup 1 48 | fi 49 | 50 | cd "$PACKAGE_NAME"/api 51 | find . 
-name "*.py" -print0 | while IFS= read -r -d '' pipeline_file; do 52 | set +o pipefail 53 | if ! diff -u "$pipeline_file" ../../"$PIPELINE_OUTPUT_DIR/$pipeline_file"; then 54 | touch "../../$FILE_INDICTATING_FAILURE" 55 | fi 56 | set -o pipefail 57 | done 58 | cd - 59 | 60 | if [ -r "$FILE_INDICTATING_FAILURE" ]; then 61 | echo 62 | echo "Autogenerated pipeline api file(s) do not match existing versions, see above for diff's" 63 | echo " or run: diff -ru ${PACKAGE_NAME}/api/ ${PIPELINE_OUTPUT_DIR}/" 64 | tmp_pipeline_comp_cleanup 1 65 | fi 66 | tmp_pipeline_comp_cleanup 0 67 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipelines/test_api_conventions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import yaml 4 | 5 | import unstructured_api_tools.pipelines.api_conventions as conventions 6 | 7 | 8 | @pytest.fixture 9 | def sample_config(): 10 | return {"version": "0.2.1", "name": "sec_filings"} 11 | 12 | 13 | @pytest.mark.parametrize( 14 | # NOTE(yuming): Test cases ref: https://regex101.com/r/Ly7O1x/3/ 15 | "invalid_semver_string", 16 | [ 17 | "1", 18 | "1.2", 19 | "1.2.3-0123", 20 | "1.2.3-0123.0123", 21 | "1.1.2+.123", 22 | "+invalid", 23 | "-invalid", 24 | "-invalid+invalid", 25 | "-invalid.01", 26 | "alpha", 27 | "alpha.beta", 28 | "alpha.beta.1", 29 | "alpha.1", 30 | "alpha+beta", 31 | "alpha_beta", 32 | "alpha.", 33 | "alpha..", 34 | "beta", 35 | "1.0.0-alpha_beta", 36 | "-alpha.", 37 | "1.0.0-alpha..", 38 | "1.0.0-alpha..1", 39 | "1.0.0-alpha...1", 40 | "1.0.0-alpha....1", 41 | "1.0.0-alpha.....1", 42 | "1.0.0-alpha......1", 43 | "1.0.0-alpha.......1", 44 | "01.1.1", 45 | "1.01.1", 46 | "1.1.01", 47 | "1.2", 48 | "1.2.3.DEV", 49 | "1.2-SNAPSHOT", 50 | "1.2.31.2.3----RC-SNAPSHOT.12.09.1--..12+788", 51 | "1.2-RC-SNAPSHOT", 52 | "-1.0.3-gamma+b7718", 53 | "+justmeta", 54 | "9.8.7+meta+meta", 55 | "9.8.7-whatever+meta+meta", 56 | "9999999999999.999999999999999999.999999999----RC-SNAPSHOT.12.09.1---------..12", 57 | ], 58 | ) 59 | def test_raise_for_invalid_semver_string(invalid_semver_string): 60 | with pytest.raises(ValueError): 61 | conventions.raise_for_invalid_semver_string(invalid_semver_string) 62 | 63 | 64 | @pytest.mark.parametrize( 65 | # NOTE(yuming): Test cases ref: https://regex101.com/r/Ly7O1x/3/ 66 | "valid_semver_string", 67 | [ 68 | "0.0.4", 69 | "1.2.3", 70 | "10.20.30", 71 | "1.1.2-prerelease+meta", 72 | "1.1.2+meta", 73 | "1.1.2+meta-valid", 74 | "1.0.0-alpha", 75 | "1.0.0-beta", 76 | "1.0.0-alpha.beta", 77 | "1.0.0-alpha.beta.1", 78 | "1.0.0-alpha.1", 79 | "1.0.0-alpha0.valid", 80 | "1.0.0-alpha.0valid", 81 | "1.0.0-alpha-a.b-c-somethinglong+build.1-aef.1-its-okay", 82 | "1.0.0-rc.1+build.1", 83 | "2.0.0-rc.1+build.123", 84 | "1.2.3-beta", 85 | "10.2.3-DEV-SNAPSHOT", 86 | "1.2.3-SNAPSHOT-123", 87 | "1.0.0", 88 | "2.0.0", 89 | "1.1.7", 90 | "2.0.0+build.1848", 91 | "2.0.1-alpha.1227", 92 | "1.0.0-alpha+beta", 93 | "1.2.3----RC-SNAPSHOT.12.9.1--.12+788", 94 | "1.2.3----R-S.12.9.1--.12+meta", 95 | "1.2.3----RC-SNAPSHOT.12.9.1--.12", 96 | "1.0.0+0.build.1-rc.10000aaa-kk-0.1", 97 | "99999999999999999999999.999999999999999999.99999999999999999", 98 | "1.0.0-0A.is.legal", 99 | ], 100 | ) 101 | def test_pass_for_valid_semver_string(valid_semver_string): 102 | try: 103 | conventions.raise_for_invalid_semver_string(valid_semver_string) 104 | except ValueError: 105 | assert False, f"{valid_semver_string} raised an exception." 
106 | 107 | 108 | def test_get_pipeline_path(): 109 | path = conventions.get_pipeline_path( 110 | filename="risk_narrative.py", pipeline_family="sec_filings", semver="0.2.1" 111 | ) 112 | assert path == "/sec-filings/v0.2.1/risk-narrative" 113 | 114 | 115 | def test_get_short_pipeline_path(): 116 | path = conventions.get_pipeline_path( 117 | filename="risk_narrative.py", 118 | pipeline_family="sec_filings", 119 | semver="0.2.1", 120 | shorter=True, 121 | ) 122 | 123 | assert path == "/sec-filings/v0/risk-narrative" 124 | 125 | 126 | def test_get_pipeline_path_raises_if_either_not_specified(): 127 | with pytest.raises(ValueError): 128 | conventions.get_pipeline_path( 129 | filename="risk_narrative.py", pipeline_family="sec_filings", semver=None 130 | ) 131 | 132 | with pytest.raises(ValueError): 133 | conventions.get_pipeline_path( 134 | filename="risk_narrative.py", pipeline_family=None, semver="0.2.1" 135 | ) 136 | 137 | 138 | def test_get_pipeline_path_reads_from_file(tmpdir, sample_config): 139 | filename = os.path.join(tmpdir.dirname, "pipeline-family.yaml") 140 | with open(filename, "w") as f: 141 | yaml.dump(sample_config, f) 142 | 143 | path = conventions.get_pipeline_path(filename="risk_narrative.py", config_filename=filename) 144 | assert path == "/sec-filings/v0.2.1/risk-narrative" 145 | 146 | 147 | def test_pipeline_config_reads_from_file(tmpdir, sample_config): 148 | filename = os.path.join(tmpdir.dirname, "pipeline-family.yaml") 149 | with open(filename, "w") as f: 150 | yaml.dump(sample_config, f) 151 | 152 | config = conventions.PipelineConfig(filename=filename) 153 | assert config.name == "sec_filings" 154 | assert config.version == "0.2.1" 155 | 156 | 157 | def test_pipeline_config_reads_from_env(tmpdir, monkeypatch, sample_config): 158 | filename = os.path.join(tmpdir.dirname, "pipeline-family.yaml") 159 | with open(filename, "w") as f: 160 | yaml.dump(sample_config, f) 161 | 162 | monkeypatch.setenv("PIPELINE_FAMILY_CONFIG", filename) 163 | 164 | config = conventions.PipelineConfig(filename=None) 165 | assert config.name == "sec_filings" 166 | 167 | 168 | def test_pipeline_config_raises_with_missing_file(tmpdir, monkeypatch, sample_config): 169 | # NOTE(robinson) - Will default to looking for ${PWD}/preprocessing-pipeline-family.yaml, 170 | # which does not exist 171 | with pytest.raises(FileNotFoundError): 172 | conventions.PipelineConfig(filename=None) 173 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipelines/test_lint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import re 4 | from unittest.mock import patch 5 | 6 | import unstructured_api_tools.pipelines.lint as lint 7 | 8 | 9 | class MockPopen: 10 | def __init__(self, *args, **kwargs): 11 | pass 12 | 13 | def communicate(self, *args, **kwargs): 14 | raise ValueError("Squawk!") 15 | 16 | 17 | def test_run_lint_cmd_cleans_up_on_exception(monkeypatch): 18 | monkeypatch.setattr(lint, "Popen", MockPopen) 19 | with patch.object(os, "unlink", return_value=None) as mock_unlink: 20 | with pytest.raises(ValueError): 21 | lint._run_lint_cmd(["fake"], "fake.py", re.compile("[A-Z]")) 22 | 23 | mock_unlink.assert_called_once() 24 | 25 | 26 | def test_flake8(): 27 | file_text = """# A test file 28 | 29 | def hello_world(): 30 | pass 31 | """ 32 | assert lint.check_flake8(file_text) is True 33 | 34 | 35 | def test_flake8_passes_with_unused_import(): 36 | file_text = """# A test file 37 | import os 38 | 39 
| 40 | def hello_world(): 41 | pass 42 | """ 43 | assert lint.check_flake8(file_text) is True 44 | 45 | 46 | def test_flake8_raises_with_bad_lint(): 47 | file_text = """# A test file 48 | 49 | def hello_world() : 50 | pass""" 51 | with pytest.raises(lint.LintError): 52 | lint.check_flake8(file_text) 53 | 54 | 55 | def test_format_black(): 56 | file_text = """# A test file 57 | 58 | def hello_world() : 59 | pass 60 | """ 61 | formatted_text = lint.format_black(file_text) 62 | 63 | assert ( 64 | formatted_text 65 | == """# A test file 66 | 67 | 68 | def hello_world(): 69 | pass 70 | """ 71 | ) 72 | 73 | 74 | def test_validate_flake8_ignore(): 75 | assert lint.validate_flake8_ignore("E405, F401") is True 76 | 77 | 78 | def test_validate_flake8_ignore_bad_input(): 79 | with pytest.raises(ValueError): 80 | lint.validate_flake8_ignore("NOT A REAL CODE") 81 | 82 | 83 | def test_mypy(): 84 | file_text = """# A test file 85 | 86 | def hello_world(text: str) -> str: 87 | return text 88 | """ 89 | assert lint.check_mypy(file_text) is True 90 | 91 | 92 | def test_mypy_raises_with_bad_type(): 93 | file_text = """# A test file 94 | 95 | def hello_world(text: str) -> str: 96 | return int(text) 97 | """ 98 | with pytest.raises(lint.LintError): 99 | lint.check_mypy(file_text) 100 | 101 | 102 | def test_check_black(): 103 | file_text = """# A test file 104 | 105 | 106 | def hello_world(): 107 | pass 108 | """ 109 | assert lint.check_black(file_text) is True 110 | 111 | 112 | def test_check_black_raises_with_bad_format(): 113 | file_text = """# A test file 114 | 115 | 116 | def hello_world() : 117 | pass 118 | """ 119 | with pytest.raises(lint.LintError): 120 | lint.check_black(file_text) 121 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/test_cli.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pytest 4 | 5 | from click.testing import CliRunner 6 | from nbformat import NotebookNode 7 | 8 | import unstructured_api_tools.cli as cli 9 | 10 | 11 | @pytest.fixture 12 | def sample_notebook(): 13 | return NotebookNode( 14 | { 15 | "cells": [ 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "id": "768fa8c6", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": "# pipeline-api\nimport random", # noqa: E501 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "id": "64f6386b", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": "def function_not_to_include():\n pass", 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "id": "45988caf", 35 | "metadata": {}, 36 | "source": "# pipeline-api", 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "id": "c8e0cad6", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": "# pipeline-api\ndef pipeline_api(text: str):\n sec_document = 'not a real document'\n risk_narrative = sec_document[0:5]\n return risk_narrative", # noqa: E501 45 | }, 46 | ], 47 | "metadata": { 48 | "kernelspec": { 49 | "display_name": "Python 3 (ipykernel)", 50 | "language": "python", 51 | "name": "python3", 52 | }, 53 | "language_info": { 54 | "codemirror_mode": {"name": "ipython", "version": 3}, 55 | "file_extension": ".py", 56 | "mimetype": "text/x-python", 57 | "name": "python", 58 | "nbconvert_exporter": "python", 59 | "pygments_lexer": "ipython3", 60 | "version": "3.8.13", 61 | }, 62 | }, 63 | "nbformat": 4, 64 | "nbformat_minor": 5, 65 | } 66 | ) 67 | 68 | 69 | def 
test_convert_pipeline_notebooks(sample_notebook, tmpdir): 70 | for i in range(5): 71 | filename = os.path.join(tmpdir.dirname, f"pipeline-this-is-a-test-{i}.ipynb") 72 | with open(filename, "w") as f: 73 | json.dump(sample_notebook, f, indent=4) 74 | 75 | runner = CliRunner() 76 | result = runner.invoke( 77 | cli.cli, 78 | [ 79 | "convert-pipeline-notebooks", 80 | "--input-directory", 81 | tmpdir.dirname, 82 | "--output-directory", 83 | tmpdir.dirname, 84 | "--pipeline-family", 85 | "fake-family-name", 86 | "--semver", 87 | "2.1.1", 88 | ], 89 | ) 90 | assert result.exit_code == 0 91 | 92 | files = os.listdir(tmpdir.dirname) 93 | for i in range(5): 94 | assert f"this_is_a_test_{i}.py" in files 95 | assert "app.py" in files 96 | 97 | 98 | def test_convert_pipeline_notebooks_passing_flake8_ignore(sample_notebook, tmpdir): 99 | for i in range(5): 100 | filename = os.path.join(tmpdir.dirname, f"pipeline-this-is-a-test-{i}.ipynb") 101 | with open(filename, "w") as f: 102 | json.dump(sample_notebook, f, indent=4) 103 | 104 | runner = CliRunner() 105 | result = runner.invoke( 106 | cli.cli, 107 | [ 108 | "convert-pipeline-notebooks", 109 | "--input-directory", 110 | tmpdir.dirname, 111 | "--output-directory", 112 | tmpdir.dirname, 113 | "--pipeline-family", 114 | "fake-family-name", 115 | "--semver", 116 | "2.1.1", 117 | "--flake8-ignore", 118 | "E402, F401", 119 | ], 120 | ) 121 | assert result.exit_code == 0 122 | 123 | files = os.listdir(tmpdir.dirname) 124 | for i in range(5): 125 | assert f"this_is_a_test_{i}.py" in files 126 | assert "app.py" in files 127 | -------------------------------------------------------------------------------- /unstructured_api_tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/unstructured_api_tools/__init__.py -------------------------------------------------------------------------------- /unstructured_api_tools/__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.10.11" # pragma: no cover 2 | -------------------------------------------------------------------------------- /unstructured_api_tools/cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional 3 | 4 | import click 5 | 6 | from unstructured_api_tools.pipelines.convert import convert_notebook_files_to_api 7 | from unstructured_api_tools.pipelines.lint import ( 8 | FLAKE8_DEFAULT_OPTS, 9 | validate_flake8_ignore, 10 | ) 11 | 12 | 13 | @click.group() 14 | def cli(): 15 | pass 16 | 17 | 18 | @cli.command() 19 | @click.option("--input-directory") 20 | @click.option("--output-directory") 21 | @click.option("--pipeline-family") 22 | @click.option("--semver") 23 | @click.option("--config-filename") 24 | @click.option("--flake8-ignore") 25 | def convert_pipeline_notebooks( 26 | input_directory: str, 27 | output_directory: str, 28 | pipeline_family: Optional[str] = None, 29 | semver: Optional[str] = None, 30 | config_filename: Optional[str] = None, 31 | flake8_ignore: Optional[str] = None, 32 | ): 33 | """Convert a pipeline notebook to a Python script. 
The conversion script will retain 34 | any cell that includes # pipeline-api at the top.""" 35 | notebook_filenames = sorted([f for f in os.listdir(input_directory) if f.endswith(".ipynb")]) 36 | 37 | if flake8_ignore: 38 | validate_flake8_ignore(flake8_ignore) 39 | # NOTE(robinson) - Not making line length configurable because setting it to 40 | # 100 allows flake8 to be consistent with black 41 | flake8_opts = ["--max-line-length", "100", "--ignore", flake8_ignore] 42 | else: 43 | flake8_opts = FLAKE8_DEFAULT_OPTS 44 | 45 | convert_notebook_files_to_api( 46 | notebook_filenames, 47 | input_directory, 48 | output_directory, 49 | pipeline_family=pipeline_family, 50 | semver=semver, 51 | config_filename=config_filename, 52 | flake8_opts=flake8_opts, 53 | ) 54 | 55 | 56 | if __name__ == "__main__": 57 | cli() # pragma: nocover 58 | -------------------------------------------------------------------------------- /unstructured_api_tools/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/unstructured_api_tools/pipelines/__init__.py -------------------------------------------------------------------------------- /unstructured_api_tools/pipelines/api_conventions.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import os 3 | from typing import Optional 4 | import yaml 5 | import re 6 | 7 | 8 | def get_config(filename: Optional[str] = None): 9 | if filename is None: 10 | default = os.path.join(os.getcwd(), "preprocessing-pipeline-family.yaml") 11 | filename = os.environ.get("PIPELINE_FAMILY_CONFIG", default) 12 | 13 | if not os.path.exists(filename): 14 | raise FileNotFoundError( 15 | f"A pipeline family config was not found at {filename}. " 16 | "The config class looks for the config in the following " 17 | "order:\n" 18 | " 1. The filename parameter\n" 19 | " 2. The PIPELINE_FAMILY_CONFIG environment variable\n" 20 | ' 3. "${PWD}"/preprocessing-pipeline-family.yaml' 21 | ) 22 | 23 | with open(filename, "r") as f: 24 | config = yaml.safe_load(f) 25 | 26 | return config 27 | 28 | 29 | @dataclass 30 | class PipelineConfig: 31 | name: str 32 | version: str 33 | description: str 34 | long_description: str 35 | filename: str 36 | 37 | def __init__(self, filename: Optional[str] = None): 38 | """Parses pipeline family metadata from the preprocessing-pipeline-family.yaml file. If no 39 | filename is passed, falls back to the PIPELINE_FAMILY_CONFIG environment variable and 40 | otherwise looks for preprocessing-pipeline-family.yaml in the working directory.""" 41 | config = get_config(filename) 42 | 43 | self.name = config["name"] 44 | self.version = config["version"] 45 | self.description = config.get("description", "Unstructured Pipeline API") 46 | self.long_description = config.get("long_description", "") 47 | 48 | 49 | def raise_for_invalid_semver_string(semver: str): 50 | """Raise an error if the semver string is invalid.""" 51 | # NOTE(yuming): Suggested regular expression (RegEx) to check a semver string 52 | # ref: https://semver.org/#is-there-a-suggested-regular-expression 53 | # -regex-to-check-a-semver-string 54 | valid_semver_pattern = r"""^(?P<major>0|[1-9]\d*)\.(?P<minor>0|[1-9]\d*)\. 55 | (?P<patch>0|[1-9]\d*)(?:-(?P<prerelease>(?:0|[1-9]\d*|\d*[a-zA-Z-] 56 | [0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))? 
57 | (?:\+(?P<buildmetadata>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$""" 58 | valid_semver_re = re.compile(valid_semver_pattern, re.VERBOSE) 59 | 60 | if not re.match(valid_semver_re, semver): 61 | raise ValueError(f"{semver} is not a valid semver string.") 62 | 63 | 64 | def get_pipeline_path( 65 | filename: str, 66 | pipeline_family: Optional[str] = None, 67 | semver: Optional[str] = None, 68 | config_filename: Optional[str] = None, 69 | shorter: Optional[bool] = False, 70 | ) -> str: 71 | """Builds the pipeline path according to the conventions outlined in the architecture docs. 72 | ref: https://github.com/Unstructured-IO/ 73 | docs-and-arch/blob/main/Pipelines-and-APIs.md#api-specification 74 | """ 75 | if any([pipeline_family, semver]) and not all([pipeline_family, semver]): 76 | raise ValueError( 77 | "If either pipeline_family or semver is specified, the other must be " 78 | "specified as well." 79 | ) 80 | 81 | if not any([pipeline_family, semver]): 82 | config = PipelineConfig(filename=config_filename) 83 | pipeline_family = config.name 84 | semver = config.version 85 | else: 86 | # NOTE(robinson) - Explicit type casting if the variables are passed. Otherwise 87 | # mypy gets cranky because Optional[str] implies they could be None. 88 | pipeline_family = str(pipeline_family) 89 | semver = str(semver) 90 | 91 | raise_for_invalid_semver_string(semver) 92 | 93 | if shorter: 94 | semver = semver.split(".")[0] 95 | 96 | pipeline_family = pipeline_family.replace("_", "-") 97 | 98 | filepath = filename.split("/") 99 | # NOTE(robinson) - Converts something like "sec_filings.py" to "sec-filings" 100 | pipeline_name = filepath[-1].replace("_", "-").replace(".py", "") 101 | 102 | return f"/{pipeline_family}/v{semver}/{pipeline_name}" 103 | 104 | 105 | def get_api_name_from_config(filename: Optional[str] = None): 106 | try: 107 | return get_config(filename).get("name", None) 108 | except FileNotFoundError: 109 | return None 110 | -------------------------------------------------------------------------------- /unstructured_api_tools/pipelines/lint.py: -------------------------------------------------------------------------------- 1 | """Tools for linting and autoformatting generated API files.""" 2 | import os 3 | import re 4 | from subprocess import PIPE, Popen 5 | import tempfile 6 | from typing import List 7 | from autoflake import ( 8 | check, 9 | filter_unused_import, 10 | SAFE_IMPORTS, 11 | unused_import_module_name, 12 | filter_useless_pass, 13 | ) 14 | import pyflakes.api 15 | import pyflakes.messages 16 | import pyflakes.reporter 17 | import io 18 | import collections 19 | 20 | from black import format_str, FileMode 21 | from autoflake import fix_code 22 | 23 | # NOTE(robinson) - F401 is for unused imports 24 | FLAKE8_DEFAULT_OPTS: List[str] = ["--max-line-length", "100", "--ignore", "F401"] 25 | FLAKE8_PREFIX_RE = re.compile(r".+:\d+:\d+:\s") 26 | FLAKE8_ERROR_CODE_RE = re.compile(r"([A-Z]\d{3},?\s?)+") 27 | 28 | MYPY_PREFIX_RE = re.compile(r".+:\d+:\s") 29 | 30 | 31 | class LintError(RuntimeError): 32 | pass 33 | 34 | 35 | def _create_tempfile(file_text: str): 36 | tmp = tempfile.NamedTemporaryFile(delete=False) 37 | tmp.write(file_text.encode()) 38 | tmp.close() 39 | return tmp 40 | 41 | 42 | def _create_file_for_user_debugging(content: str, filename: str): 43 | """Creates a file in the user's current working directory to help debug lint errors.""" 44 | with open(filename, "w+") as f: 45 | f.write(content) 46 | 47 | 48 | def _run_lint_cmd(cmd: List[str], filename: str, prefix_re: re.Pattern): 49 | """Runs 
a subprocess with the specified lint command and raises a LintError 50 | if the file does not pass.""" 51 | try: 52 | process = Popen(cmd, stdout=PIPE, stderr=PIPE) 53 | stdout, _ = process.communicate() 54 | except Exception as e: 55 | # NOTE(robinson) - Catching the error ensures we clean up the temp file 56 | os.unlink(filename) # NOTE(robinson) - Removes the temporary file 57 | raise e 58 | 59 | os.unlink(filename) # NOTE(robinson) - Removes the temporary file 60 | if process.returncode != 0: 61 | err = prefix_re.sub("", stdout.decode("utf-8")) 62 | raise LintError("\n\n" + err) 63 | 64 | return True 65 | 66 | 67 | def check_flake8(file_text: str, opts: List[str] = FLAKE8_DEFAULT_OPTS) -> bool: 68 | """Runs flake8 on the text. Raises an exception if the file does 69 | not pass linting. Uses subprocess because per the Flake8 docs, Flake8 70 | does not have a public Python API. 71 | ref: https://flake8.pycqa.org/en/latest/user/python-api.html#public-python-api""" 72 | tmp = _create_tempfile(file_text) 73 | cmd = ["flake8", tmp.name] + opts 74 | try: 75 | _run_lint_cmd(cmd, tmp.name, FLAKE8_PREFIX_RE) 76 | except Exception as e: 77 | debug_file = "tmp-flake8-check-pipeline-api.py" 78 | _create_file_for_user_debugging(file_text, debug_file) 79 | cmd[1] = debug_file 80 | raise LintError("run the following to debug: \n" f"{' '.join(cmd)}") from e 81 | return True 82 | 83 | 84 | def validate_flake8_ignore(flake8_ignore: str) -> bool: 85 | """Validates the Flake8 error codes passed to the --flake8-ignore CLI flag.""" 86 | if FLAKE8_ERROR_CODE_RE.match(flake8_ignore) is None: 87 | raise ValueError(f"{flake8_ignore} is an invalid argument for the --flake8-ignore flag.") 88 | return True 89 | 90 | 91 | def check_mypy(file_text: str) -> bool: 92 | """Runs mypy type checking on the file text.""" 93 | tmp = _create_tempfile(file_text) 94 | cmd = ["mypy", tmp.name, "--ignore-missing-imports", "--implicit-optional"] 95 | try: 96 | _run_lint_cmd(cmd, tmp.name, MYPY_PREFIX_RE) 97 | except Exception as e: 98 | debug_file = "tmp-mypy-check-pipeline-api.py" 99 | _create_file_for_user_debugging(file_text, debug_file) 100 | cmd[1] = debug_file 101 | raise LintError("run the following to debug: \n" f"{' '.join(cmd)}") from e 102 | return True 103 | 104 | 105 | def check_black(file_text: str) -> bool: 106 | """Checks if a file needs to be reformatted with black.""" 107 | passes = format_black(file_text) == file_text 108 | if not passes: 109 | raise LintError("File text needs to be reformatted with black.") 110 | return passes 111 | 112 | 113 | def format_black(file_text: str) -> str: 114 | """Auto-formats a file using black.""" 115 | return format_str(file_text, mode=FileMode(line_length=100)) 116 | 117 | 118 | def format_autoflake(file_text: str) -> str: 119 | return fix_code( 120 | source=file_text, 121 | remove_unused_variables=True, 122 | remove_all_unused_imports=True, 123 | expand_star_imports=True, 124 | ) 125 | 126 | 127 | """ 128 | Autoflake only takes into account unused imports by checking for pyflakes.messages.UnusedImport 129 | but does not handle duplicate imports which come out as pyflakes.messages.RedefinedWhileUnused 130 | from pyflakes. 
The following code is an extension of autoflake to take duplicate 131 | imports into account 132 | """ 133 | 134 | 135 | def duplicate_import_line_numbers(messages): 136 | """Yield line numbers of duplicate (redefined) imports.""" 137 | for message in messages: 138 | if isinstance(message, pyflakes.messages.RedefinedWhileUnused): 139 | yield message.lineno 140 | 141 | 142 | def _remove_duplicate_imports(text: str): 143 | messages = check(text) 144 | marked_import_line_numbers = frozenset( 145 | duplicate_import_line_numbers(messages), 146 | ) 147 | marked_unused_module = collections.defaultdict(lambda: []) 148 | for line_number, module_name in unused_import_module_name(messages): 149 | marked_unused_module[line_number].append(module_name) 150 | sio = io.StringIO(text) 151 | previous_line = "" 152 | result = None 153 | for line_number, line in enumerate(sio.readlines(), start=1): 154 | if line_number in marked_import_line_numbers: 155 | result = filter_unused_import( 156 | line, 157 | unused_module=marked_unused_module[line_number], 158 | remove_all_unused_imports=True, 159 | imports=SAFE_IMPORTS, 160 | previous_line=previous_line, 161 | ) 162 | else: 163 | result = line 164 | yield result 165 | previous_line = line 166 | 167 | 168 | def remove_duplicate_imports(text: str) -> str: 169 | return "".join(filter_useless_pass("".join(_remove_duplicate_imports(text)))) 170 | -------------------------------------------------------------------------------- /unstructured_api_tools/pipelines/templates/pipeline_app.txt: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | 7 | from fastapi import FastAPI, Request, status 8 | import logging 9 | import os 10 | 11 | {% for module in module_names -%} 12 | from .{{ module }} import router as {{module}}_router 13 | {% endfor %} 14 | 15 | app = FastAPI( 16 | title="{{ title }}", 17 | description="""{{ description }}""", 18 | version="{{ version or '1.0.0' }}", 19 | docs_url="{{ '/' ~ version_name ~ '/docs' if version_name else '/docs' }}", 20 | openapi_url="{{ '/' ~ version_name ~ '/openapi.json' if version_name else '/openapi.json' }}" 21 | ) 22 | 23 | allowed_origins = os.environ.get("ALLOWED_ORIGINS", None) 24 | if allowed_origins: 25 | from fastapi.middleware.cors import CORSMiddleware 26 | app.add_middleware( 27 | CORSMiddleware, 28 | allow_origins=allowed_origins.split(","), 29 | allow_methods=["OPTIONS", "POST"], 30 | allow_headers=["Content-Type"] 31 | ) 32 | 33 | {% for module in module_names -%} 34 | app.include_router({{ module }}_router) 35 | {% endfor %} 36 | 37 | # Filter out /healthcheck noise 38 | class HealthCheckFilter(logging.Filter): 39 | def filter(self, record: logging.LogRecord) -> bool: 40 | return record.getMessage().find("/healthcheck") == -1 41 | 42 | # Filter out /metrics noise 43 | class MetricsCheckFilter(logging.Filter): 44 | def filter(self, record: logging.LogRecord) -> bool: 45 | return record.getMessage().find("/metrics") == -1 46 | 47 | logging.getLogger("uvicorn.access").addFilter(HealthCheckFilter()) 48 | logging.getLogger("uvicorn.access").addFilter(MetricsCheckFilter()) 49 | 50 | @app.get("/healthcheck", status_code=status.HTTP_200_OK, include_in_schema=False) 51 | def healthcheck(request: Request): 52 | return {"healthcheck": "HEALTHCHECK STATUS: 
EVERYTHING OK!"} 53 | --------------------------------------------------------------------------------
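
The routing conventions exercised by the test suite above can be seen end to end in a small, self-contained sketch. The example below is not part of the repo; it simply reuses the exact inputs and expected routes from `test_get_pipeline_path` and `test_get_short_pipeline_path`:

```python
# Minimal sketch (reusing values from the test suite above): api_conventions
# turns a pipeline module filename plus family metadata into a versioned route.
from unstructured_api_tools.pipelines.api_conventions import get_pipeline_path

# Underscores become dashes and the ".py" suffix is dropped from the filename.
path = get_pipeline_path(
    filename="risk_narrative.py", pipeline_family="sec_filings", semver="0.2.1"
)
assert path == "/sec-filings/v0.2.1/risk-narrative"

# shorter=True keeps only the major component of the semver in the route.
short_path = get_pipeline_path(
    filename="risk_narrative.py",
    pipeline_family="sec_filings",
    semver="0.2.1",
    shorter=True,
)
assert short_path == "/sec-filings/v0/risk-narrative"
```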
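
For the generated FastAPI modules themselves, a hedged client sketch may help: the route and the multipart field names come from the process-text-file pipeline shown earlier, while the host, port, and use of `requests` are assumptions (`requests` is not a dependency of this project).

```python
# Hedged usage sketch: POST a text file to the generated route shown above.
# Assumes the test-project app is served locally, e.g. with
#   uvicorn prepline_test_project.api.app:app --port 8000
# and that the `requests` package is installed (an assumption; it is not
# listed in this repo's requirements).
import requests

url = "http://localhost:8000/test-project/v1/process-text-file-1"

with open("text_file.txt", "rb") as f:
    # The endpoint accepts multipart fields named "files" and/or "text_files";
    # with a single input and a JSON-compatible Accept header it returns one
    # JSON result produced by pipeline_api.
    response = requests.post(
        url, files={"text_files": ("text_file.txt", f, "text/plain")}
    )

response.raise_for_status()
print(response.json())  # e.g. {"silly_result": "..."}
```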