├── .coveragerc ├── .github └── workflows │ ├── ci.yml │ └── codeql-analysis.yml ├── .gitignore ├── CHANGELOG.md ├── Dockerfile ├── LICENSE.md ├── MANIFEST.in ├── Makefile ├── README.md ├── img └── unstructured_logo.png ├── requirements ├── base.txt ├── test.in └── test.txt ├── scripts ├── docker-build.sh ├── shellcheck.sh └── version-sync.sh ├── setup.cfg ├── setup.py ├── test_unstructured_api_tools ├── api │ ├── fixtures │ │ ├── example.jpg │ │ ├── example.jpg.gz │ │ ├── fake-email.msg │ │ ├── fake.docx │ │ ├── fake.docx.gz │ │ ├── markdown.md │ │ ├── spring-weather.html.json │ │ ├── text_file.txt │ │ ├── text_file.txt.gz │ │ ├── text_file_2.txt │ │ └── text_file_2.txt.gz │ ├── functions_and_variables.py │ ├── test_docs.py │ ├── test_file_apis.py │ ├── test_file_text_apis.py │ └── test_text_apis.py ├── pipeline-test-project │ ├── README.md │ ├── pipeline-notebooks │ │ ├── pipeline-process-file-1.ipynb │ │ ├── pipeline-process-file-2.ipynb │ │ ├── pipeline-process-file-3.ipynb │ │ ├── pipeline-process-file-4.ipynb │ │ ├── pipeline-process-file-5.ipynb │ │ ├── pipeline-process-text-1.ipynb │ │ ├── pipeline-process-text-2.ipynb │ │ ├── pipeline-process-text-3.ipynb │ │ ├── pipeline-process-text-4.ipynb │ │ ├── pipeline-process-text-file-1.ipynb │ │ ├── pipeline-process-text-file-2.ipynb │ │ ├── pipeline-process-text-file-3.ipynb │ │ └── pipeline-process-text-file-4.ipynb │ ├── prepline_test_project │ │ └── api │ │ │ ├── __init__.py │ │ │ ├── app.py │ │ │ ├── process_file_1.py │ │ │ ├── process_file_2.py │ │ │ ├── process_file_3.py │ │ │ ├── process_file_4.py │ │ │ ├── process_file_5.py │ │ │ ├── process_text_1.py │ │ │ ├── process_text_2.py │ │ │ ├── process_text_3.py │ │ │ ├── process_text_4.py │ │ │ ├── process_text_file_1.py │ │ │ ├── process_text_file_2.py │ │ │ ├── process_text_file_3.py │ │ │ └── process_text_file_4.py │ ├── preprocessing-pipeline-family.yaml │ └── scripts │ │ ├── check-and-format-notebooks.py │ │ └── test-doc-pipeline-apis-consistent.sh ├── pipelines │ ├── test_api_conventions.py │ ├── test_convert.py │ └── test_lint.py └── test_cli.py └── unstructured_api_tools ├── __init__.py ├── __version__.py ├── cli.py └── pipelines ├── __init__.py ├── api_conventions.py ├── convert.py ├── lint.py └── templates ├── pipeline_api.txt └── pipeline_app.txt /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = *.txt 3 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | # NOTE(robinson) - We are limiting when we run CI to avoid exceeding our 2,000 min/month limit. 5 | # We can switch to running on push if we make this repo public or are fine with 6 | # paying for CI minutes. 
7 | push: 8 | branches: [ main ] 9 | pull_request: 10 | branches: [ main ] 11 | 12 | env: 13 | PYTHON_VERSION: 3.8 14 | 15 | jobs: 16 | setup: 17 | strategy: 18 | matrix: 19 | python-version: ["3.8", "3.9", "3.10"] 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: actions/checkout@v3 23 | - uses: actions/cache@v3 24 | id: virtualenv-cache 25 | with: 26 | path: | 27 | .venv 28 | key: ${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements/*.txt') }} 29 | - name: Set up Python ${{ matrix.python-version }} 30 | uses: actions/setup-python@v4 31 | with: 32 | python-version: ${{ matrix.python-version }} 33 | - name: Setup virtual environment (no cache hit) 34 | if: steps.virtualenv-cache.outputs.cache-hit != 'true' 35 | run: | 36 | python${{ matrix.python-version }} -m venv .venv 37 | source .venv/bin/activate 38 | make install-ci 39 | 40 | lint: 41 | runs-on: ubuntu-latest 42 | needs: setup 43 | steps: 44 | - uses: actions/checkout@v3 45 | - uses: actions/cache@v3 46 | id: virtualenv-cache 47 | with: 48 | path: .venv 49 | key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }} 50 | - name: Lint 51 | run: | 52 | source .venv/bin/activate 53 | make check 54 | 55 | shellcheck: 56 | runs-on: ubuntu-latest 57 | steps: 58 | - uses: actions/checkout@v2 59 | - name: ShellCheck 60 | uses: ludeeus/action-shellcheck@master 61 | 62 | test_api_consistency: 63 | strategy: 64 | matrix: 65 | python-version: ["3.8", "3.9", "3.10"] 66 | runs-on: ubuntu-latest 67 | needs: [setup, lint] 68 | steps: 69 | - uses: actions/checkout@v3 70 | - uses: actions/cache@v3 71 | id: virtualenv-cache 72 | with: 73 | path: | 74 | .venv 75 | key: ${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements/*.txt') }} 76 | - name: API Consistency 77 | run: | 78 | source .venv/bin/activate 79 | make api-check-test 80 | 81 | test: 82 | strategy: 83 | matrix: 84 | python-version: [ "3.8", "3.9", "3.10" ] 85 | runs-on: ubuntu-latest 86 | needs: test_api_consistency 87 | steps: 88 | - uses: actions/checkout@v3 89 | - uses: actions/cache@v3 90 | id: virtualenv-cache 91 | with: 92 | path: | 93 | .venv 94 | key: ${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements/*.txt') }} 95 | - name: Test 96 | run: | 97 | source .venv/bin/activate 98 | make test 99 | make check-coverage 100 | 101 | changelog: 102 | runs-on: ubuntu-latest 103 | steps: 104 | - if: github.ref != 'refs/heads/main' 105 | uses: dorny/paths-filter@v2 106 | id: changes 107 | with: 108 | filters: | 109 | src: 110 | - 'unstructured_api_tools/**' 111 | - if: steps.changes.outputs.src == 'true' && github.ref != 'refs/heads/main' 112 | uses: dangoslen/changelog-enforcer@v3 113 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "main" ] 20 | schedule: 21 | - cron: '35 10 * * 3' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v3 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v2 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | 52 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 53 | # queries: security-extended,security-and-quality 54 | 55 | 56 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 57 | # If this step fails, then you should remove it and run the build manually (see below) 58 | - name: Autobuild 59 | uses: github/codeql-action/autobuild@v2 60 | 61 | # ℹ️ Command-line programs to run using the OS shell. 62 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 63 | 64 | # If the Autobuild fails above, remove it and uncomment the following three lines. 65 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 66 | 67 | # - run: | 68 | # echo "Run, Build Application using script" 69 | # ./location_of_script_within_repo/buildscript.sh 70 | 71 | - name: Perform CodeQL Analysis 72 | uses: github/codeql-action/analyze@v2 73 | with: 74 | category: "/language:${{matrix.language}}" 75 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tmp* 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Pycharm 80 | .idea/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | 136 | # API test project test files 137 | /test_unstructured_api_tools/pipeline-test-project/tmp* -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 0.10.11 2 | 3 | * Fix use of the metrics filter for the logger 4 | 5 | # 0.10.10 6 | 7 | * Filter out metrics endpoint requests from logs 8 | 9 | # 0.10.9 10 | 11 | * Fix output formatting for csv responses 12 | 13 | # 0.10.8 14 | 15 | * Add autoflake and duplicate import removal to linting steps 16 | 17 | # 0.10.7 18 | 19 | * Add support for passing request into pipeline 20 | 21 | # 0.10.6 22 | 23 | * Fix ENV variable processing for CORS 24 | 25 | # 0.10.5 26 | 27 | * Add optional CORS to the API 28 | 29 | # 0.10.4 30 | 31 | * Add filter on /healthcheck logs 32 | 33 | # 0.10.3 34 | 35 | * Add support for json and msg file types 36 | 37 | # 0.10.2 38 | 39 | * Set black line length to 100 40 | 41 | # 0.10.1 42 | 43 | * Add ability to request one file as multipart/form-data 44 | 45 | # 0.10.0 46 | 47 | * Update templates for generated API. 48 | * Improve code for accepting gzip files. 
49 | 50 | # 0.9.4 51 | 52 | * Add dynamic openapi_url to match docs_url 53 | 54 | # 0.9.3 55 | 56 | * Removed /healthcheck endpoint from docs 57 | * Add fix for handling content type sent as None 58 | 59 | # 0.9.2 60 | 61 | * Add content_type to error message for unsupported file types 62 | 63 | # 0.9.1 64 | 65 | * Allow references to standard imports in pipeline cells 66 | * Removed unused /healthcheck endpoints 67 | 68 | # 0.9.0 69 | 70 | * Add support for gzip compressed files 71 | 72 | # 0.8.1 73 | 74 | * Removed async/await from endpoints. 75 | * Refactored template for generating endpoints with shorter semver. 76 | 77 | # 0.8.0 78 | 79 | * Add duplicate routes with semver major version 80 | 81 | # 0.7.0 82 | 83 | * Add dynamic docs_url 84 | 85 | # 0.6.0 86 | 87 | * Add file type validation via `UNSTRUCTURED_ALLOWED_MIMETYPES` 88 | 89 | # 0.5.0 90 | 91 | * Removed rate limit and slow API from project. Updated templates and tests. 92 | 93 | # 0.4.9 94 | 95 | * Bug fix: Generated code now consistent across operating systems 96 | 97 | # 0.4.8 98 | 99 | * Add ability to return JSON responses for multiple text_files 100 | 101 | # 0.4.7 102 | 103 | * Notebook conversion organizes module level imports at the top of the file 104 | * Allow for FastAPI metadata to be read from the config file 105 | * Add `__init__.py` to API module and add a default version for FastAPI. 106 | 107 | # 0.4.6 108 | 109 | * Add support for `response_schema` parameter in Pipeline API functions. 110 | 111 | # 0.4.5 112 | 113 | * Fix bug: get the `response_type` value before its first use in the template 114 | 115 | # 0.4.4 116 | 117 | * Implement generation of an app-level FastAPI module. 118 | 119 | # 0.4.3 120 | 121 | * Updates `mypy` type checking code to use `--implicit-optional` 122 | 123 | ## 0.4.2 124 | 125 | * Add types-ujson dependency 126 | 127 | ## 0.4.1 128 | 129 | * Implement feature to allow accepting multiple binary files to the autogenerated pipeline APIs. 130 | 131 | ## 0.4.0 132 | 133 | * Implement feature to allow accepting multiple text files to the autogenerated pipeline APIs. 
134 | 135 | ## 0.3.1 136 | 137 | * Removed the ratelimit on healthchecks 138 | * Dependency bumps 139 | 140 | ## 0.3.0 141 | 142 | * Add the ability to pass Accept MIME type headers to pipeline API's 143 | * Dependency bumps 144 | 145 | ## 0.2.0 146 | 147 | * Initial release of unstructured-api-tools 148 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:experimental 2 | FROM quay.io/unstructured-io/base-images:rocky8.7-2 as base 3 | 4 | RUN yum install -y make 5 | 6 | ARG PIP_VERSION 7 | 8 | # Set up environment 9 | ENV HOME /home/ 10 | WORKDIR ${HOME} 11 | RUN mkdir ${HOME}/.ssh && chmod go-rwx ${HOME}/.ssh \ 12 | && ssh-keyscan -t rsa github.com >> /home/.ssh/known_hosts 13 | ENV PYTHONPATH="${PYTHONPATH}:${HOME}" 14 | ENV PATH="/home/usr/.local/bin:${PATH}" 15 | 16 | FROM base as deps 17 | # Copy and install Unstructured 18 | COPY requirements requirements 19 | 20 | RUN python3.8 -m pip install pip==${PIP_VERSION} && \ 21 | dnf -y groupinstall "Development Tools" && \ 22 | pip install --no-cache -r requirements/base.txt && \ 23 | pip install --no-cache -r requirements/test.txt && \ 24 | dnf -y groupremove "Development Tools" && \ 25 | dnf clean all 26 | 27 | FROM deps as code 28 | COPY Makefile Makefile 29 | 30 | CMD ["/bin/bash"] 31 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2022 Unstructured Technologies, Inc 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include unstructured_api_tools/pipelines/templates/*.txt 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PACKAGE_NAME := unstructured_api_tools 2 | PIP_VERSION := 22.2.1 3 | CURRENT_DIR := $(shell pwd) 4 | 5 | 6 | .PHONY: help 7 | help: Makefile 8 | @sed -n 's/^\(## \)\([a-zA-Z]\)/\2/p' $< 9 | 10 | 11 | ########### 12 | # Install # 13 | ########### 14 | 15 | ## install: installs all base and test requirements 16 | .PHONY: install 17 | install: install-base install-test 18 | 19 | .PHONY: install-ci 20 | install-ci: install 21 | 22 | .PHONY: install-base 23 | install-base: 24 | python3 -m pip install pip==${PIP_VERSION} 25 | pip install -r requirements/base.txt 26 | 27 | .PHONY: install-test 28 | install-test: 29 | pip install -r requirements/test.txt 30 | 31 | ## pip-compile: compiles all base and test requirements 32 | .PHONY: pip-compile 33 | pip-compile: 34 | # NOTE(crag): you have to manually install pip-tools for now to run this. 35 | # There is a better way to do this with a pinned pip-compile version and a venv. 36 | bash -c "pip-compile -h >/dev/null || { echo please run \'pip install pip-tools\' and then rerun this command; exit 1; }" 37 | pip-compile --upgrade -o requirements/base.txt 38 | pip-compile --upgrade -o requirements/test.txt requirements/base.txt requirements/test.in 39 | 40 | ## install-project-local: install unstructured_api_tools into your local python environment 41 | .PHONY: install-project-local 42 | install-project-local: install 43 | # MAYBE TODO: fail if already exists? 44 | pip install -e . 45 | 46 | ## uninstall-project-local: uninstall unstructured_api_tools from your local python environment 47 | .PHONY: uninstall-project-local 48 | uninstall-project-local: 49 | pip uninstall ${PACKAGE_NAME} 50 | 51 | ################# 52 | # Test and Lint # 53 | ################# 54 | 55 | ## run-jupyter-test-notebooks: starts jupyter, allows execution of test notebooks 56 | .PHONY: run-jupyter-test-notebooks 57 | run-jupyter-test-notebooks: 58 | PYTHONPATH=$(realpath .)/test_unstructured_api_tools/pipeline-test-project/ JUPYTER_PATH=$(realpath .)/test_unstructured_api_tools/pipeline-test-project/ jupyter-notebook --NotebookApp.token='' --NotebookApp.password='' 59 | 60 | ## tidy-test-notebooks: execute notebooks and remove metadata 61 | .PHONY: tidy-test-notebooks 62 | tidy-test-notebooks: 63 | PYTHONPATH=. ./test_unstructured_api_tools/pipeline-test-project/scripts/check-and-format-notebooks.py 64 | 65 | ## generate-test-api: generates FastAPIs under ./test_unstructured_api_tools/pipeline-test-project 66 | .PHONY: generate-test-api 67 | generate-test-api: 68 | # generates FastAPI API's from notebooks in the test project ./test_unstructured_api_tools/pipeline-test-project 69 | PYTHONPATH=. 
PIPELINE_FAMILY_CONFIG=test_unstructured_api_tools/pipeline-test-project/preprocessing-pipeline-family.yaml \ 70 | python3 ./unstructured_api_tools/cli.py convert-pipeline-notebooks \ 71 | --input-directory ./test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks \ 72 | --output-directory ./test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api 73 | 74 | 75 | ## api-check-test: verifies auto-generated pipeline APIs match the existing ones 76 | .PHONY: api-check-test 77 | api-check-test: 78 | PYTHONPATH=. PACKAGE_NAME=prepline_test_project ./test_unstructured_api_tools/pipeline-test-project/scripts/test-doc-pipeline-apis-consistent.sh 79 | 80 | 81 | ## test: runs all unittests 82 | .PHONY: test 83 | test: 84 | PYTHONPATH=.:./test_unstructured_api_tools/pipeline-test-project pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov=prepline_test_project --cov-report term-missing -vvv 85 | 86 | ## check: runs linters (includes tests) 87 | .PHONY: check 88 | check: check-src check-tests check-version 89 | 90 | ## check-src: runs linters (source only, no tests) 91 | .PHONY: check-src 92 | check-src: 93 | black --line-length 100 ${PACKAGE_NAME} --check 94 | flake8 ${PACKAGE_NAME} 95 | mypy ${PACKAGE_NAME} --ignore-missing-imports --install-types --non-interactive 96 | autoflake --remove-unused-variables --remove-duplicate-keys --expand-star-imports \ 97 | --remove-all-unused-imports -cd -r ${PACKAGE_NAME} test_${PACKAGE_NAME} \ 98 | --exclude test_${PACKAGE_NAME}/pipeline-test-project 99 | 100 | 101 | .PHONY: check-tests 102 | check-tests: 103 | black --line-length 100 test_${PACKAGE_NAME} --check --exclude test_${PACKAGE_NAME}/pipeline-test-project 104 | flake8 test_${PACKAGE_NAME} --exclude test_${PACKAGE_NAME}/pipeline-test-project/prepline_test_project/api 105 | 106 | ## check-scripts: run shellcheck 107 | .PHONY: check-scripts 108 | check-scripts: 109 | # Fail if any of these files have warnings 110 | scripts/shellcheck.sh 111 | 112 | ## check-version: run check to ensure version in CHANGELOG.md matches version in package 113 | .PHONY: check-version 114 | check-version: 115 | # Fail if syncing version would produce changes 116 | scripts/version-sync.sh -c \ 117 | -f ${PACKAGE_NAME}/__version__.py semver 118 | 119 | ## tidy: run black 120 | .PHONY: tidy 121 | tidy: tidy-black tidy-autoflake 122 | 123 | tidy-autoflake: 124 | autoflake --remove-unused-variables --remove-duplicate-keys --expand-star-imports \ 125 | --remove-all-unused-imports -i -r ${PACKAGE_NAME} test_${PACKAGE_NAME} \ 126 | --exclude test_${PACKAGE_NAME}/pipeline-test-project 127 | 128 | 129 | tidy-black: 130 | black --line-length 100 ${PACKAGE_NAME} 131 | black --line-length 100 test_${PACKAGE_NAME} --exclude test_${PACKAGE_NAME}/pipeline-test-project 132 | 133 | 134 | ## version-sync: update __version__.py with most recent version from CHANGELOG.md 135 | .PHONY: version-sync 136 | version-sync: 137 | scripts/version-sync.sh \ 138 | -f ${PACKAGE_NAME}/__version__.py semver 139 | 140 | .PHONY: check-coverage 141 | check-coverage: 142 | # TODO(crag): add coverage check for test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/ 143 | coverage report --fail-under=95 144 | 145 | ########## 146 | # Docker # 147 | ########## 148 | 149 | # Docker targets are provided for convenience only and are not required in a standard development environment 150 | 151 | DOCKER_IMAGE ?= unstructured-api-tools:dev 152 | 153 | .PHONY: docker-build 154 | docker-build: 155 | 
PIP_VERSION=${PIP_VERSION} DOCKER_IMAGE_NAME=${DOCKER_IMAGE} ./scripts/docker-build.sh 156 | 157 | .PHONY: docker-start-bash 158 | docker-start-bash: 159 | docker run -ti --rm ${DOCKER_IMAGE} 160 | 161 | .PHONY: docker-test 162 | docker-test: docker-build 163 | docker run --rm \ 164 | -v ${CURRENT_DIR}/test_unstructured_api_tools:/home/test_unstructured_api_tools \ 165 | -v ${CURRENT_DIR}/unstructured_api_tools:/home/unstructured_api_tools \ 166 | $(DOCKER_IMAGE) \ 167 | bash -c "make test" 168 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | <h3 align="center"><img src="img/unstructured_logo.png"></h3>
 7 | <h3 align="center"><p>Open-Source Pre-Processing Tools for Unstructured Data</p></h3>

11 | 12 | 13 | The `unstructured_api_tools` library includes utilities for converting pipeline notebooks into 14 | REST API applications. `unstructured_api_tools` is intended for use in conjunction with 15 | pipeline repos. See [`pipeline-sec-filings`](https://github.com/Unstructured-IO/pipeline-sec-filings) 16 | for an example of a repo that uses `unstructured_api_tools`. 17 | 18 | ## Installation 19 | 20 | To install the library, run `pip install unstructured_api_tools`. 21 | 22 | ## Developer Quick Start 23 | 24 | * Using `pyenv` to manage virtualenvs is recommended 25 | * Mac install instructions (see [here](https://github.com/Unstructured-IO/community#mac--homebrew) for more detail): 26 | * `brew install pyenv-virtualenv` 27 | * `pyenv install 3.8.15` 28 | * Linux instructions are available [here](https://github.com/Unstructured-IO/community#linux). 29 | 30 | * Create a virtualenv to work in and activate it, e.g. for one named `unstructured_api_tools`: 32 | 
 `pyenv virtualenv 3.8.15 unstructured_api_tools`
33 | `pyenv activate unstructured_api_tools` 34 | 35 | * Run `make install-project-local` 36 | 37 | ## Usage 38 | 39 | Use the CLI command to convert pipeline notebooks to scripts, for example: 40 | 41 | ```bash 42 | unstructured_api_tools convert-pipeline-notebooks \ 43 | --input-directory pipeline-family-sec-filings/pipeline-notebooks \ 44 | --output-directory pipeline-family-sec-filings/prepline_sec_filings/api \ 45 | --pipeline-family sec-filings \ 46 | --semver 0.2.1 47 | ``` 48 | 49 | If you do not provide the `pipeline-family` and `semver` arguments, those values are parsed from 50 | `preprocessing-pipeline-family.yaml`. You can provide the `preprocessing-pipeline-family.yaml` file 51 | explicitly with `--config-filename` or the `PIPELINE_FAMILY_CONFIG` environment variable. If neither 52 | of those is specified, the fallback is to use the `preprocessing-pipeline-family.yaml` file in the 53 | current working directory. 54 | 55 | The API file undergoes `black`, `flake8` and `mypy` checks after being generated. If you want 56 | `flake8` to ignore specific errors, you can specify them through the CLI with 57 | `--flake8-ignore F401, E402`. 58 | See the [`flake8` docs](https://flake8.pycqa.org/en/latest/user/error-codes.html#error-violation-codes) 59 | for a full list of error codes. 60 | 61 | ### Conversion from `pipeline_api` to FastAPI 62 | 63 | The command described in [**Usage**](#usage) generates a FastAPI API route for each `pipeline_api` 64 | function defined in the notebook. The signature of the `pipeline_api` method determines what 65 | parameters the generated FastAPI accepts. 66 | 67 | For plain text uploads, the first argument must always be 68 | `text`, which receives the contents of the posted file; per the CHANGELOG, support for multiple text files and for binary files has since been added as well. 69 | 70 | In addition, any number of string array parameters may be specified. Any kwarg beginning with 71 | `m_` indicates a multi-value string parameter that is accepted by the FastAPI API. 72 | 73 | For example, in a notebook containing: 74 | 75 | def pipeline_api(text, m_subject=[], m_name=[]): 76 | 77 | `text` represents the content of a file posted to the FastAPI API, and the `m_subject` and `m_name` 78 | keyword args represent optional parameters that may be posted to the API as well, both allowing 79 | multiple string parameters. A `curl` request against such an API could look like this: 80 | 81 | curl -X 'POST' \ 82 | 'https://<host>/<pipeline-family>/<semver>/<api-name>' \ 83 | -H 'accept: application/json' \ 84 | -H 'Content-Type: multipart/form-data' \ 85 | -F 'file=@file-to-process.txt' \ 86 | -F 'subject=art' \ 87 | -F 'subject=history' \ 88 | -F 'subject=math' \ 89 | -F 'name=feynman' 90 | 91 | In addition, you can specify the response type if `pipeline_api` can support both "application/json" 92 | and "text/csv" as return types. 93 | 94 | For example, in a notebook containing a kwarg `response_type`: 95 | 96 | def pipeline_api(text, response_type="text/csv", m_subject=[], m_name=[]): 97 | 98 | The consumer of the API may then specify "text/csv" as the requested response content type with the usual 99 | HTTP Accept header, e.g. `Accept: application/json` or `Accept: text/csv`. A minimal example combining these conventions is sketched in the example section below. 100 | 101 | ## Security Policy 102 | 103 | See our [security policy](https://github.com/Unstructured-IO/unstructured-api-tools/security/policy) for 104 | information on how to report security vulnerabilities. 
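## Example `pipeline_api`

As referenced in the conversion section above, here is a minimal sketch of a notebook `pipeline_api` function that combines the documented conventions: `text` as the first argument, an optional `response_type`, and `m_`-prefixed multi-value kwargs. The function body is a toy subject counter invented for illustration and is not part of this library; only the signature conventions come from the documentation above.

```python
from typing import List, Union


def pipeline_api(
    text: str,
    response_type: str = "application/json",  # may also be requested as "text/csv"
    m_subject: List[str] = [],  # posted as repeated `subject` form fields
    m_name: List[str] = [],  # posted as repeated `name` form fields
) -> Union[dict, str]:
    """Toy pipeline: count how many lines of the posted file mention each subject."""
    lines = text.splitlines()
    rows = [
        {"subject": subject, "mentions": sum(subject in line for line in lines)}
        for subject in m_subject
    ]
    if response_type == "text/csv":
        # Returned as-is when the client sends `Accept: text/csv`.
        return "subject,mentions\n" + "\n".join(
            f"{row['subject']},{row['mentions']}" for row in rows
        )
    return {"names": m_name, "rows": rows}
```

Running `convert-pipeline-notebooks` against a notebook defining a function like this generates a FastAPI route that accepts a file upload plus repeated `subject` and `name` form fields, as in the `curl` example above.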
105 | 106 | ## Learn more 107 | 108 | | Section | Description | 109 | |-|-| 110 | | [Company Website](https://unstructured.io) | Unstructured.io product and company info | 111 | -------------------------------------------------------------------------------- /img/unstructured_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/img/unstructured_logo.png -------------------------------------------------------------------------------- /requirements/base.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.8 3 | # by the following command: 4 | # 5 | # pip-compile --output-file=requirements/base.txt 6 | # 7 | anyio==3.6.2 8 | # via 9 | # starlette 10 | # watchfiles 11 | attrs==22.2.0 12 | # via jsonschema 13 | autoflake==2.1.1 14 | # via unstructured-api-tools (setup.py) 15 | beautifulsoup4==4.12.1 16 | # via nbconvert 17 | bleach==6.0.0 18 | # via nbconvert 19 | click==8.1.3 20 | # via 21 | # unstructured-api-tools (setup.py) 22 | # uvicorn 23 | defusedxml==0.7.1 24 | # via nbconvert 25 | fastapi==0.95.0 26 | # via unstructured-api-tools (setup.py) 27 | fastjsonschema==2.16.3 28 | # via nbformat 29 | h11==0.14.0 30 | # via uvicorn 31 | httptools==0.5.0 32 | # via uvicorn 33 | idna==3.4 34 | # via anyio 35 | importlib-metadata==6.1.0 36 | # via 37 | # jupyter-client 38 | # nbconvert 39 | importlib-resources==5.12.0 40 | # via jsonschema 41 | jinja2==3.1.2 42 | # via 43 | # nbconvert 44 | # unstructured-api-tools (setup.py) 45 | jsonschema==4.17.3 46 | # via nbformat 47 | jupyter-client==8.1.0 48 | # via nbclient 49 | jupyter-core==5.3.0 50 | # via 51 | # jupyter-client 52 | # nbclient 53 | # nbconvert 54 | # nbformat 55 | jupyterlab-pygments==0.2.2 56 | # via nbconvert 57 | markupsafe==2.1.2 58 | # via 59 | # jinja2 60 | # nbconvert 61 | mistune==2.0.5 62 | # via nbconvert 63 | mypy==1.2.0 64 | # via unstructured-api-tools (setup.py) 65 | mypy-extensions==1.0.0 66 | # via mypy 67 | nbclient==0.7.3 68 | # via nbconvert 69 | nbconvert==7.3.0 70 | # via unstructured-api-tools (setup.py) 71 | nbformat==5.8.0 72 | # via 73 | # nbclient 74 | # nbconvert 75 | numpy==1.24.3 76 | # via pandas 77 | packaging==23.0 78 | # via nbconvert 79 | pandas==2.0.2 80 | # via unstructured-api-tools (setup.py) 81 | pandocfilters==1.5.0 82 | # via nbconvert 83 | pkgutil-resolve-name==1.3.10 84 | # via jsonschema 85 | platformdirs==3.2.0 86 | # via jupyter-core 87 | pydantic==1.10.7 88 | # via fastapi 89 | pyflakes==3.0.1 90 | # via autoflake 91 | pygments==2.14.0 92 | # via nbconvert 93 | pyrsistent==0.19.3 94 | # via jsonschema 95 | python-dateutil==2.8.2 96 | # via 97 | # jupyter-client 98 | # pandas 99 | python-dotenv==1.0.0 100 | # via uvicorn 101 | python-multipart==0.0.6 102 | # via unstructured-api-tools (setup.py) 103 | pytz==2023.3 104 | # via pandas 105 | pyyaml==6.0 106 | # via uvicorn 107 | pyzmq==25.0.2 108 | # via jupyter-client 109 | six==1.16.0 110 | # via 111 | # bleach 112 | # python-dateutil 113 | sniffio==1.3.0 114 | # via anyio 115 | soupsieve==2.4 116 | # via beautifulsoup4 117 | starlette==0.26.1 118 | # via fastapi 119 | tinycss2==1.2.1 120 | # via nbconvert 121 | tomli==2.0.1 122 | # via 123 | # autoflake 124 | # mypy 125 | tornado==6.2 126 | # via jupyter-client 127 | traitlets==5.9.0 128 | # via 129 | # jupyter-client 130 | # jupyter-core 131 
| # nbclient 132 | # nbconvert 133 | # nbformat 134 | types-requests==2.28.11.17 135 | # via unstructured-api-tools (setup.py) 136 | types-ujson==5.7.0.1 137 | # via unstructured-api-tools (setup.py) 138 | types-urllib3==1.26.25.10 139 | # via types-requests 140 | typing-extensions==4.5.0 141 | # via 142 | # mypy 143 | # pydantic 144 | # starlette 145 | tzdata==2023.3 146 | # via pandas 147 | uvicorn[standard]==0.21.1 148 | # via unstructured-api-tools (setup.py) 149 | uvloop==0.17.0 150 | # via uvicorn 151 | watchfiles==0.19.0 152 | # via uvicorn 153 | webencodings==0.5.1 154 | # via 155 | # bleach 156 | # tinycss2 157 | websockets==11.0.1 158 | # via uvicorn 159 | zipp==3.15.0 160 | # via 161 | # importlib-metadata 162 | # importlib-resources 163 | -------------------------------------------------------------------------------- /requirements/test.in: -------------------------------------------------------------------------------- 1 | black>=22.3.0 2 | coverage 3 | flake8 4 | httpx 5 | # NOTE(robinson) - Pinning version due to the NotOneFoundException crash described here. 6 | # ref: https://github.com/ipython/ipython/issues/13598 7 | ipython>=8.9.0 8 | pytest-cov 9 | # NOTE(mrobinson) - requests is needed for the fastapi test client 10 | requests 11 | requests_toolbelt 12 | nbdev 13 | pytest-mock 14 | -------------------------------------------------------------------------------- /requirements/test.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.8 3 | # by the following command: 4 | # 5 | # pip-compile --output-file=requirements/test.txt requirements/base.txt requirements/test.in 6 | # 7 | anyio==3.6.2 8 | # via 9 | # -r requirements/base.txt 10 | # httpcore 11 | # starlette 12 | # watchfiles 13 | asttokens==2.2.1 14 | # via 15 | # nbdev 16 | # stack-data 17 | astunparse==1.6.3 18 | # via nbdev 19 | attrs==22.2.0 20 | # via 21 | # -r requirements/base.txt 22 | # jsonschema 23 | # pytest 24 | backcall==0.2.0 25 | # via ipython 26 | beautifulsoup4==4.12.1 27 | # via 28 | # -r requirements/base.txt 29 | # nbconvert 30 | black==23.3.0 31 | # via -r requirements/test.in 32 | bleach==6.0.0 33 | # via 34 | # -r requirements/base.txt 35 | # nbconvert 36 | certifi==2022.12.7 37 | # via 38 | # httpcore 39 | # httpx 40 | # requests 41 | charset-normalizer==3.1.0 42 | # via requests 43 | click==8.1.3 44 | # via 45 | # -r requirements/base.txt 46 | # black 47 | # uvicorn 48 | coverage[toml]==7.2.3 49 | # via 50 | # -r requirements/test.in 51 | # pytest-cov 52 | decorator==5.1.1 53 | # via ipython 54 | defusedxml==0.7.1 55 | # via 56 | # -r requirements/base.txt 57 | # nbconvert 58 | exceptiongroup==1.1.1 59 | # via pytest 60 | execnb==0.1.5 61 | # via nbdev 62 | executing==1.2.0 63 | # via stack-data 64 | fastapi==0.95.0 65 | # via -r requirements/base.txt 66 | fastcore==1.5.29 67 | # via 68 | # execnb 69 | # ghapi 70 | # nbdev 71 | fastjsonschema==2.16.3 72 | # via 73 | # -r requirements/base.txt 74 | # nbformat 75 | flake8==6.0.0 76 | # via -r requirements/test.in 77 | ghapi==1.0.3 78 | # via nbdev 79 | h11==0.14.0 80 | # via 81 | # -r requirements/base.txt 82 | # httpcore 83 | # uvicorn 84 | httpcore==0.16.3 85 | # via httpx 86 | httptools==0.5.0 87 | # via 88 | # -r requirements/base.txt 89 | # uvicorn 90 | httpx==0.23.3 91 | # via -r requirements/test.in 92 | idna==3.4 93 | # via 94 | # -r requirements/base.txt 95 | # anyio 96 | # requests 97 | # rfc3986 98 | importlib-metadata==6.1.0 99 | 
# via 100 | # -r requirements/base.txt 101 | # jupyter-client 102 | # nbconvert 103 | importlib-resources==5.12.0 104 | # via 105 | # -r requirements/base.txt 106 | # jsonschema 107 | iniconfig==2.0.0 108 | # via pytest 109 | ipython==8.12.0 110 | # via 111 | # -r requirements/test.in 112 | # execnb 113 | jedi==0.18.2 114 | # via ipython 115 | jinja2==3.1.2 116 | # via 117 | # -r requirements/base.txt 118 | # nbconvert 119 | jsonschema==4.17.3 120 | # via 121 | # -r requirements/base.txt 122 | # nbformat 123 | jupyter-client==8.1.0 124 | # via 125 | # -r requirements/base.txt 126 | # nbclient 127 | jupyter-core==5.3.0 128 | # via 129 | # -r requirements/base.txt 130 | # jupyter-client 131 | # nbclient 132 | # nbconvert 133 | # nbformat 134 | jupyterlab-pygments==0.2.2 135 | # via 136 | # -r requirements/base.txt 137 | # nbconvert 138 | markupsafe==2.1.2 139 | # via 140 | # -r requirements/base.txt 141 | # jinja2 142 | # nbconvert 143 | matplotlib-inline==0.1.6 144 | # via ipython 145 | mccabe==0.7.0 146 | # via flake8 147 | mistune==2.0.5 148 | # via 149 | # -r requirements/base.txt 150 | # nbconvert 151 | mypy==1.2.0 152 | # via -r requirements/base.txt 153 | mypy-extensions==1.0.0 154 | # via 155 | # -r requirements/base.txt 156 | # black 157 | # mypy 158 | nbclient==0.7.3 159 | # via 160 | # -r requirements/base.txt 161 | # nbconvert 162 | nbconvert==7.3.0 163 | # via -r requirements/base.txt 164 | nbdev==2.3.12 165 | # via -r requirements/test.in 166 | nbformat==5.8.0 167 | # via 168 | # -r requirements/base.txt 169 | # nbclient 170 | # nbconvert 171 | packaging==23.0 172 | # via 173 | # -r requirements/base.txt 174 | # black 175 | # fastcore 176 | # ghapi 177 | # nbconvert 178 | # pytest 179 | pandocfilters==1.5.0 180 | # via 181 | # -r requirements/base.txt 182 | # nbconvert 183 | parso==0.8.3 184 | # via jedi 185 | pathspec==0.11.1 186 | # via black 187 | pexpect==4.8.0 188 | # via ipython 189 | pickleshare==0.7.5 190 | # via ipython 191 | pkgutil-resolve-name==1.3.10 192 | # via 193 | # -r requirements/base.txt 194 | # jsonschema 195 | platformdirs==3.2.0 196 | # via 197 | # -r requirements/base.txt 198 | # black 199 | # jupyter-core 200 | pluggy==1.0.0 201 | # via pytest 202 | prompt-toolkit==3.0.38 203 | # via ipython 204 | ptyprocess==0.7.0 205 | # via pexpect 206 | pure-eval==0.2.2 207 | # via stack-data 208 | pycodestyle==2.10.0 209 | # via flake8 210 | pydantic==1.10.7 211 | # via 212 | # -r requirements/base.txt 213 | # fastapi 214 | pyflakes==3.0.1 215 | # via flake8 216 | pygments==2.14.0 217 | # via 218 | # -r requirements/base.txt 219 | # ipython 220 | # nbconvert 221 | pyrsistent==0.19.3 222 | # via 223 | # -r requirements/base.txt 224 | # jsonschema 225 | pytest==7.2.2 226 | # via 227 | # pytest-cov 228 | # pytest-mock 229 | pytest-cov==4.0.0 230 | # via -r requirements/test.in 231 | pytest-mock==3.10.0 232 | # via -r requirements/test.in 233 | python-dateutil==2.8.2 234 | # via 235 | # -r requirements/base.txt 236 | # jupyter-client 237 | python-dotenv==1.0.0 238 | # via 239 | # -r requirements/base.txt 240 | # uvicorn 241 | python-multipart==0.0.6 242 | # via -r requirements/base.txt 243 | pyyaml==6.0 244 | # via 245 | # -r requirements/base.txt 246 | # nbdev 247 | # uvicorn 248 | pyzmq==25.0.2 249 | # via 250 | # -r requirements/base.txt 251 | # jupyter-client 252 | requests==2.28.2 253 | # via 254 | # -r requirements/test.in 255 | # requests-toolbelt 256 | requests-toolbelt==0.10.1 257 | # via -r requirements/test.in 258 | rfc3986[idna2008]==1.5.0 259 | # via httpx 
260 | six==1.16.0 261 | # via 262 | # -r requirements/base.txt 263 | # asttokens 264 | # astunparse 265 | # bleach 266 | # python-dateutil 267 | sniffio==1.3.0 268 | # via 269 | # -r requirements/base.txt 270 | # anyio 271 | # httpcore 272 | # httpx 273 | soupsieve==2.4 274 | # via 275 | # -r requirements/base.txt 276 | # beautifulsoup4 277 | stack-data==0.6.2 278 | # via ipython 279 | starlette==0.26.1 280 | # via 281 | # -r requirements/base.txt 282 | # fastapi 283 | tinycss2==1.2.1 284 | # via 285 | # -r requirements/base.txt 286 | # nbconvert 287 | tomli==2.0.1 288 | # via 289 | # -r requirements/base.txt 290 | # black 291 | # coverage 292 | # mypy 293 | # pytest 294 | tornado==6.2 295 | # via 296 | # -r requirements/base.txt 297 | # jupyter-client 298 | traitlets==5.9.0 299 | # via 300 | # -r requirements/base.txt 301 | # ipython 302 | # jupyter-client 303 | # jupyter-core 304 | # matplotlib-inline 305 | # nbclient 306 | # nbconvert 307 | # nbformat 308 | types-requests==2.28.11.17 309 | # via -r requirements/base.txt 310 | types-ujson==5.7.0.1 311 | # via -r requirements/base.txt 312 | types-urllib3==1.26.25.10 313 | # via 314 | # -r requirements/base.txt 315 | # types-requests 316 | typing-extensions==4.5.0 317 | # via 318 | # -r requirements/base.txt 319 | # black 320 | # ipython 321 | # mypy 322 | # pydantic 323 | # starlette 324 | urllib3==1.26.15 325 | # via requests 326 | uvicorn[standard]==0.21.1 327 | # via -r requirements/base.txt 328 | uvloop==0.17.0 329 | # via 330 | # -r requirements/base.txt 331 | # uvicorn 332 | watchdog==3.0.0 333 | # via nbdev 334 | watchfiles==0.19.0 335 | # via 336 | # -r requirements/base.txt 337 | # uvicorn 338 | wcwidth==0.2.6 339 | # via prompt-toolkit 340 | webencodings==0.5.1 341 | # via 342 | # -r requirements/base.txt 343 | # bleach 344 | # tinycss2 345 | websockets==11.0.1 346 | # via 347 | # -r requirements/base.txt 348 | # uvicorn 349 | wheel==0.40.0 350 | # via astunparse 351 | zipp==3.15.0 352 | # via 353 | # -r requirements/base.txt 354 | # importlib-metadata 355 | # importlib-resources 356 | 357 | # The following packages are considered to be unsafe in a requirements file: 358 | # pip 359 | -------------------------------------------------------------------------------- /scripts/docker-build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | DOCKER_REPOSITORY="${DOCKER_REPOSITORY:-quay.io/unstructured-io/unstructured}" 5 | PIP_VERSION="${PIP_VERSION:-23.1.2}" 6 | DOCKER_IMAGE="${DOCKER_IMAGE:-unstructured-api-tools:dev}" 7 | 8 | DOCKER_BUILD_CMD=(docker buildx build --load -f Dockerfile \ 9 | --build-arg PIP_VERSION="$PIP_VERSION" \ 10 | --build-arg BUILDKIT_INLINE_CACHE=1 \ 11 | --progress plain \ 12 | --cache-from "$DOCKER_REPOSITORY":latest \ 13 | -t "$DOCKER_IMAGE" .) 
14 | 15 | # only build for specific platform if DOCKER_BUILD_PLATFORM is set 16 | if [ -n "${DOCKER_BUILD_PLATFORM:-}" ]; then 17 | DOCKER_BUILD_CMD+=("--platform=$DOCKER_BUILD_PLATFORM") 18 | fi 19 | 20 | DOCKER_BUILDKIT=1 "${DOCKER_BUILD_CMD[@]}" 21 | -------------------------------------------------------------------------------- /scripts/shellcheck.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | find scripts -name "*.sh" -exec shellcheck {} + 4 | 5 | -------------------------------------------------------------------------------- /scripts/version-sync.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | function usage { 3 | echo "Usage: $(basename "$0") [-c] -f FILE_TO_CHANGE REPLACEMENT_FORMAT [-f FILE_TO_CHANGE REPLACEMENT_FORMAT ...]" 2>&1 4 | echo 'Synchronize files to latest version in source file' 5 | echo ' -s Specifies source file for version (default is CHANGELOG.md)' 6 | echo ' -f Specifies a file to change and the format for searching and replacing versions' 7 | echo ' FILE_TO_CHANGE is the file to be updated/checked for updates' 8 | echo ' REPLACEMENT_FORMAT is one of (semver, release, api-release)' 9 | echo ' semver indicates to look for a full semver version and replace with the latest full version' 10 | echo ' release indicates to look for a release semver version (x.x.x) and replace with the latest release version' 11 | echo ' api-release indicates to look for a release semver version in the context of an api route and replace with the latest release version' 12 | echo ' -c Compare versions and output proposed changes without changing anything.' 13 | } 14 | 15 | function getopts-extra () { 16 | declare i=1 17 | # if the next argument is not an option, then append it to array OPTARG 18 | while [[ ${OPTIND} -le $# && ${!OPTIND:0:1} != '-' ]]; do 19 | OPTARG[i]=${!OPTIND} 20 | i+=1 21 | OPTIND+=1 22 | done 23 | } 24 | 25 | # Parse input options 26 | declare CHECK=0 27 | declare SOURCE_FILE="CHANGELOG.md" 28 | declare -a FILES_TO_CHECK=() 29 | declare -a REPLACEMENT_FORMATS=() 30 | declare args 31 | declare OPTIND OPTARG opt 32 | while getopts ":hcs:f:" opt; do 33 | case $opt in 34 | h) 35 | usage 36 | exit 0 37 | ;; 38 | c) 39 | CHECK=1 40 | ;; 41 | s) 42 | SOURCE_FILE="$OPTARG" 43 | ;; 44 | f) 45 | getopts-extra "$@" 46 | args=( "${OPTARG[@]}" ) 47 | # validate length of args, should be 2 48 | if [ ${#args[@]} -eq 2 ]; then 49 | FILES_TO_CHECK+=( "${args[0]}" ) 50 | REPLACEMENT_FORMATS+=( "${args[1]}" ) 51 | else 52 | echo "Exactly 2 arguments must follow -f option." >&2 53 | exit 1 54 | fi 55 | ;; 56 | \?) 57 | echo "Invalid option: -$OPTARG." >&2 58 | usage 59 | exit 1 60 | ;; 61 | esac 62 | done 63 | 64 | # Parse REPLACEMENT_FORMATS 65 | RE_SEMVER_FULL="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-((0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?" 66 | RE_RELEASE="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)" 67 | RE_API_RELEASE="v(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)" 68 | # Pull out semver appearing earliest in SOURCE_FILE. 
69 | LAST_VERSION=$(grep -o -m 1 -E "${RE_SEMVER_FULL}" "$SOURCE_FILE") 70 | LAST_RELEASE=$(grep -o -m 1 -E "${RE_RELEASE}($|[^-+])" "$SOURCE_FILE" | grep -o -m 1 -E "${RE_RELEASE}") 71 | LAST_API_RELEASE="v$(grep -o -m 1 -E "${RE_RELEASE}($|[^-+])$" "$SOURCE_FILE" | grep -o -m 1 -E "${RE_RELEASE}")" 72 | declare -a RE_SEMVERS=() 73 | declare -a UPDATED_VERSIONS=() 74 | for i in "${!REPLACEMENT_FORMATS[@]}"; do 75 | REPLACEMENT_FORMAT=${REPLACEMENT_FORMATS[$i]} 76 | case $REPLACEMENT_FORMAT in 77 | semver) 78 | RE_SEMVERS+=( "$RE_SEMVER_FULL" ) 79 | UPDATED_VERSIONS+=( "$LAST_VERSION" ) 80 | ;; 81 | release) 82 | RE_SEMVERS+=( "$RE_RELEASE" ) 83 | UPDATED_VERSIONS+=( "$LAST_RELEASE" ) 84 | ;; 85 | api-release) 86 | RE_SEMVERS+=( "$RE_API_RELEASE" ) 87 | UPDATED_VERSIONS+=( "$LAST_API_RELEASE" ) 88 | ;; 89 | *) 90 | echo "Invalid replacement format: \"${REPLACEMENT_FORMAT}\". Use semver, release, or api-release" >&2 91 | exit 1 92 | ;; 93 | esac 94 | done 95 | 96 | if [ -z "$LAST_VERSION" ]; 97 | then 98 | # No match to semver regex in SOURCE_FILE, so no version to go from. 99 | printf "Error: Unable to find latest version from %s.\n" "$SOURCE_FILE" 100 | exit 1 101 | fi 102 | 103 | # Search files in FILES_TO_CHECK and change (or get diffs) 104 | declare FAILED_CHECK=0 105 | 106 | for i in "${!FILES_TO_CHECK[@]}"; do 107 | FILE_TO_CHANGE=${FILES_TO_CHECK[$i]} 108 | RE_SEMVER=${RE_SEMVERS[$i]} 109 | UPDATED_VERSION=${UPDATED_VERSIONS[$i]} 110 | FILE_VERSION=$(grep -o -m 1 -E "${RE_SEMVER}" "$FILE_TO_CHANGE") 111 | if [ -z "$FILE_VERSION" ]; 112 | then 113 | # No match to semver regex in VERSIONFILE, so nothing to replace 114 | printf "Error: No semver version found in file %s.\n" "$FILE_TO_CHANGE" 115 | exit 1 116 | else 117 | # Replace semver in VERSIONFILE with semver obtained from SOURCE_FILE 118 | TMPFILE=$(mktemp /tmp/new_version.XXXXXX) 119 | # Check sed version, exit if version < 4.3 120 | if ! sed --version > /dev/null 2>&1; then 121 | CURRENT_VERSION=1.archaic 122 | else 123 | CURRENT_VERSION=$(sed --version | head -n1 | cut -d" " -f4) 124 | fi 125 | REQUIRED_VERSION="4.3" 126 | if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$CURRENT_VERSION" | sort -V | head -n1)" != "$REQUIRED_VERSION" ]; then 127 | echo "sed version must be >= ${REQUIRED_VERSION}" && exit 1 128 | fi 129 | sed -E -r "s/$RE_SEMVER/$UPDATED_VERSION/" "$FILE_TO_CHANGE" > "$TMPFILE" 130 | if [ $CHECK == 1 ]; 131 | then 132 | DIFF=$(diff "$FILE_TO_CHANGE" "$TMPFILE" ) 133 | if [ -z "$DIFF" ]; 134 | then 135 | printf "version sync would make no changes to %s.\n" "$FILE_TO_CHANGE" 136 | rm "$TMPFILE" 137 | else 138 | FAILED_CHECK=1 139 | printf "version sync would make the following changes to %s:\n%s\n" "$FILE_TO_CHANGE" "$DIFF" 140 | rm "$TMPFILE" 141 | fi 142 | else 143 | cp "$TMPFILE" "$FILE_TO_CHANGE" 144 | rm "$TMPFILE" 145 | fi 146 | fi 147 | done 148 | 149 | # Exit with code determined by whether changes were needed in a check. 
150 | if [ ${FAILED_CHECK} -ne 0 ]; then 151 | exit 1 152 | else 153 | exit 0 154 | fi 155 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | license_files = LICENSE.md 3 | 4 | [flake8] 5 | max-line-length = 100 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | setup.py 3 | 4 | unstructured_api_tools - Utilities to manage APIs from notebooks 5 | 6 | Copyright 2022 Unstructured Technologies, Inc. 7 | 8 | Licensed under the Apache License, Version 2.0 (the "License"); 9 | you may not use this file except in compliance with the License. 10 | You may obtain a copy of the License at 11 | 12 | http://www.apache.org/licenses/LICENSE-2.0 13 | 14 | Unless required by applicable law or agreed to in writing, software 15 | distributed under the License is distributed on an "AS IS" BASIS, 16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | See the License for the specific language governing permissions and 18 | limitations under the License. 19 | """ 20 | 21 | from setuptools import setup, find_packages 22 | 23 | from unstructured_api_tools.__version__ import __version__ 24 | 25 | setup( 26 | name="unstructured_api_tools", 27 | description="A library that prepares raw documents for downstream ML tasks.", 28 | long_description=open("README.md", "r", encoding="utf-8").read(), 29 | long_description_content_type="text/markdown", 30 | keywords="NLP PDF HTML CV XML parsing preprocessing", 31 | url="https://github.com/Unstructured-IO/unstructured-api-tools", 32 | python_requires=">=3.8.0", 33 | classifiers=[ 34 | "Development Status :: 4 - Beta", 35 | "Intended Audience :: Developers", 36 | "Intended Audience :: Education", 37 | "Intended Audience :: Science/Research", 38 | "License :: OSI Approved :: Apache Software License", 39 | "Operating System :: OS Independent", 40 | "Programming Language :: Python :: 3", 41 | "Programming Language :: Python :: 3.8", 42 | "Programming Language :: Python :: 3.9", 43 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 44 | ], 45 | author="Unstructured Technologies", 46 | author_email="mrobinson@unstructuredai.io", 47 | license="Apache-2.0", 48 | packages=find_packages(), 49 | include_package_data=True, 50 | version=__version__, 51 | entry_points={ 52 | "console_scripts": "unstructured_api_tools=unstructured_api_tools.cli:cli" 53 | }, 54 | install_requires=[ 55 | "click>=8.1", 56 | "fastapi", 57 | "Jinja2", 58 | "mypy>=0.99", 59 | "nbconvert", 60 | "python-multipart", 61 | "pandas", 62 | "types-requests", 63 | "types-ujson", 64 | "uvicorn[standard]", 65 | "autoflake" 66 | ], 67 | extras_require={}, 68 | ) 69 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/fixtures/example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/example.jpg -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/fixtures/example.jpg.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/example.jpg.gz -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/fixtures/fake-email.msg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/fake-email.msg -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/fixtures/fake.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/fake.docx -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/fixtures/fake.docx.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/fake.docx.gz -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/fixtures/markdown.md: -------------------------------------------------------------------------------- 1 | # Test markdown file 2 | 3 | This is the test markdown file. 100% code coverage is what I aim for. -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/fixtures/text_file.txt: -------------------------------------------------------------------------------- 1 | this is the test text file -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/fixtures/text_file.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/text_file.txt.gz -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/fixtures/text_file_2.txt: -------------------------------------------------------------------------------- 1 | this is another test text file -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/fixtures/text_file_2.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/text_file_2.txt.gz -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/functions_and_variables.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | FILE_DOCX = "fake.docx" 4 | FILE_IMAGE = "example.jpg" 5 | FILE_TXT_1 = "text_file.txt" 6 | FILE_TXT_2 = "text_file_2.txt" 7 | FILE_MARKDOWN = "markdown.md" 8 | FILE_MSG = "fake-email.msg" 9 | FILE_JSON = "spring-weather.html.json" 10 | 11 | GZIP_FILE_DOCX = "fake.docx.gz" 12 | GZIP_FILE_IMAGE = "example.jpg.gz" 13 | GZIP_FILE_TXT_1 = "text_file.txt.gz" 14 | 
GZIP_FILE_TXT_2 = "text_file_2.txt.gz" 15 | 16 | FILENAME_LENGTHS = { 17 | FILE_DOCX: 36602, 18 | GZIP_FILE_DOCX: 36602, 19 | FILE_IMAGE: 32764, 20 | GZIP_FILE_IMAGE: 32764, 21 | FILE_TXT_1: 26, 22 | GZIP_FILE_TXT_1: 26, 23 | FILE_TXT_2: 30, 24 | GZIP_FILE_TXT_2: 30, 25 | FILE_MARKDOWN: 91, 26 | FILE_MSG: 11776, 27 | FILE_JSON: 13151, 28 | } 29 | FILENAME_FORMATS = { 30 | FILE_DOCX: "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 31 | FILE_IMAGE: "image/jpeg", 32 | FILE_TXT_1: "text/plain", 33 | FILE_TXT_2: "text/plain", 34 | GZIP_FILE_DOCX: "application/gzip", 35 | GZIP_FILE_IMAGE: "application/gzip", 36 | GZIP_FILE_TXT_1: "application/gzip", 37 | GZIP_FILE_TXT_2: "application/gzip", 38 | FILE_MARKDOWN: "text/markdown", 39 | FILE_MSG: "message/rfc822", 40 | FILE_JSON: "application/json", 41 | "octet_stream": "application/octet-stream", 42 | } 43 | 44 | P_INPUT_1_SINGLE = {"input1": ["hi"]} 45 | P_INPUT_1_MULTI = {"input1": ["hi", "water is better than ice"]} 46 | P_INPUT_1_EMPTY = {"input1": []} 47 | P_INPUT_2_SINGLE = {"input2": ["hello"]} 48 | P_INPUT_2_MULTI = {"input2": ["hello", "earth is better than mars"]} 49 | P_INPUT_2_EMPTY = {"input2": []} 50 | P_INPUT_1_AND_2_MULTI = {"input2": ["hello", "earth is better than mars"], "input1": ["hi"]} 51 | 52 | JSON = "application/json" 53 | MIXED = "multipart/mixed" 54 | TEXT_CSV = "text/csv" 55 | INVALID = "invalid" 56 | 57 | RESPONSE_SCHEMA_ISD = {"output_schema": "isd"} 58 | RESPONSE_SCHEMA_LABELSTUDIO = {"output_schema": "labelstudio"} 59 | 60 | 61 | def convert_files_for_api(files, use_octet_stream_type=False): 62 | files_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures") 63 | return [ 64 | ( 65 | "files", 66 | ( 67 | test_file, 68 | open(os.path.join(files_path, test_file), "rb"), 69 | FILENAME_FORMATS["octet_stream" if use_octet_stream_type else test_file], 70 | ), 71 | ) 72 | for test_file in files 73 | ] 74 | 75 | 76 | def convert_text_files_for_api(files, use_octet_stream_type=False): 77 | files_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures") 78 | return [ 79 | ( 80 | "text_files", 81 | ( 82 | test_file, 83 | open(os.path.join(files_path, test_file), "rb"), 84 | FILENAME_FORMATS["octet_stream" if use_octet_stream_type else test_file], 85 | ), 86 | ) 87 | for test_file in files 88 | ] 89 | 90 | 91 | def generate_header_kwargs(value=None): 92 | return ( 93 | { 94 | "headers": { 95 | "Accept": value, 96 | } 97 | } 98 | if value 99 | else {} 100 | ) 101 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/api/test_docs.py: -------------------------------------------------------------------------------- 1 | from starlette.testclient import TestClient 2 | from prepline_test_project.api.app import app 3 | 4 | DOCS_ROUTE = "/test-project/docs" 5 | OPENAPI_ROUTE = "/test-project/openapi.json" 6 | HEALTHCHECK_ROUTE = "/healthcheck" 7 | 8 | client = TestClient(app) 9 | 10 | 11 | def test_openapi(): 12 | response = client.get(OPENAPI_ROUTE) 13 | assert response.status_code == 200 14 | 15 | 16 | def test_docs(): 17 | response = client.get(DOCS_ROUTE) 18 | assert response.status_code == 200 19 | 20 | 21 | def test_healthcheck(): 22 | response = client.get(HEALTHCHECK_ROUTE) 23 | assert response.status_code == 200 24 | assert response.json() == {"healthcheck": "HEALTHCHECK STATUS: EVERYTHING OK!"} 25 | -------------------------------------------------------------------------------- 
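The fixture helpers in functions_and_variables.py above return the ("files", (filename, fileobj, content_type)) tuples that the requests-style multipart API expects, so a test can feed them straight into the same TestClient pattern used in test_docs.py. A minimal sketch, assuming the module is importable under the test_unstructured_api_tools.api package and using the /test-project/v1/process-file-1 route generated later in this dump:

    from starlette.testclient import TestClient
    from prepline_test_project.api.app import app
    from test_unstructured_api_tools.api.functions_and_variables import (
        FILE_DOCX,
        P_INPUT_2_SINGLE,
        convert_files_for_api,
        generate_header_kwargs,
    )

    client = TestClient(app)
    # Upload one .docx fixture as multipart/form-data, send input2 as a form
    # field, and request a JSON response via the Accept header.
    response = client.post(
        "/test-project/v1/process-file-1",
        files=convert_files_for_api([FILE_DOCX]),
        data=P_INPUT_2_SINGLE,
        **generate_header_kwargs("application/json"),
    )
    assert response.status_code == 200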
/test_unstructured_api_tools/pipeline-test-project/README.md: -------------------------------------------------------------------------------- 1 | This directory is the base of a barebones preprocessing-pipeline project 2 | used for the generation of FastAPI apps, which are then used as test fixtures. 3 | 4 | It includes notebooks under pipeline-notebooks/ as is normally the case 5 | for pipeline projects. APIs are generated and checked by the Makefile 6 | in the root of the unstructured-api-tools repo. 7 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-file-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 |  "cells": [ 3 |   { 4 |    "cell_type": "markdown", 5 |    "id": "3931743a", 6 |    "metadata": {}, 7 |    "source": [ 8 |     "# File Processing Pipeline" 9 |    ] 10 |   }, 11 |   { 12 |    "cell_type": "code", 13 |    "execution_count": null, 14 |    "id": "d83dab2a", 15 |    "metadata": {}, 16 |    "outputs": [], 17 |    "source": [ 18 |     "import os" 19 |    ] 20 |   }, 21 |   { 22 |    "cell_type": "code", 23 |    "execution_count": null, 24 |    "id": "7cb5e00b", 25 |    "metadata": {}, 26 |    "outputs": [], 27 |    "source": [ 28 |     "# pipeline-api\n", 29 |     "\n", 30 |     "# test that a duplicate import gets handled correctly, as this gets imported via the template as well\n", 31 |     "import json\n", 32 |     "\n", 33 |     "# test accessing os in a #pipeline-api cell does not break things\n", 34 |     "_ = os.environ\n", 35 |     "\n", 36 |     "def pipeline_api(\n", 37 |     "    file,\n", 38 |     "    filename=None,\n", 39 |     "    file_content_type=None,\n", 40 |     "    m_input2=[],\n", 41 |     "):\n", 42 |     "    return {\"silly_result\": ' : '.join([str(len(file.read())),\n", 43 |     "                                        filename,\n", 44 |     "                                        file_content_type,\n", 45 |     "                                        str(m_input2)])}" 46 |    ] 47 |   }, 48 |   { 49 |    "cell_type": "code", 50 |    "execution_count": null, 51 |    "id": "65911889", 52 |    "metadata": {}, 53 |    "outputs": [ 54 |     { 55 |      "name": "stdout", 56 |      "output_type": "stream", 57 |      "text": [ 58 |       "{'silly_result': \"17 : temp-file.txt : text/plain : ['my', 'inputs']\"}\n" 59 |      ] 60 |     } 61 |    ], 62 |    "source": [ 63 |     "import tempfile\n", 64 |     "with tempfile.TemporaryFile() as fp:\n", 65 |     "    fp.write(b'This is some data')\n", 66 |     "    fp.seek(0)\n", 67 |     "    print(\n", 68 |     "        pipeline_api(\n", 69 |     "            fp,\n", 70 |     "            filename=\"temp-file.txt\",\n", 71 |     "            file_content_type=\"text/plain\",\n", 72 |     "            m_input2=[\"my\",\"inputs\"]\n", 73 |     "        )\n", 74 |     "    )" 75 |    ] 76 |   }, 77 |   { 78 |    "cell_type": "code", 79 |    "execution_count": null, 80 |    "id": "edce40fa", 81 |    "metadata": {}, 82 |    "outputs": [], 83 |    "source": [] 84 |   } 85 |  ], 86 |  "metadata": { 87 |   "kernelspec": { 88 |    "display_name": "python3", 89 |    "language": "python", 90 |    "name": "python3" 91 |   } 92 |  }, 93 |  "nbformat": 4, 94 |  "nbformat_minor": 5 95 | } 96 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-file-2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 |  "cells": [ 3 |   { 4 |    "cell_type": "markdown", 5 |    "metadata": {}, 6 |    "source": [ 7 |     "# File Processing Pipeline" 8 |    ] 9 |   }, 10 |   { 11 |    "cell_type": "code", 12 |    "execution_count": null, 13 |    "metadata": {}, 14 |    "outputs": [], 15 |    "source": [ 16 |     "# pipeline-api\n", 17 |     "def pipeline_api(\n", 18 |     "    file\n", 19 |     "):\n", 20 |     "    return {\"silly_result\": ' : 
'.join([str(len(file.read()))])}" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "{'silly_result': '17'}\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "import tempfile\n", 38 | "with tempfile.TemporaryFile() as fp:\n", 39 | " fp.write(b'This is some data')\n", 40 | " fp.seek(0)\n", 41 | " print(pipeline_api(fp))" 42 | ] 43 | } 44 | ], 45 | "metadata": { 46 | "kernelspec": { 47 | "display_name": "python3", 48 | "language": "python", 49 | "name": "python3" 50 | } 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 0 54 | } 55 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-file-3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# File Processing Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pipeline-api\n", 17 | "import pandas as pd\n", 18 | "def pipeline_api(\n", 19 | " file, response_type=\"text/csv\", response_schema=\"isd\"\n", 20 | "):\n", 21 | " data = pd.DataFrame(data={\"silly_result\": [str(len(file.read())), str(response_type), str(response_schema)]})\n", 22 | " if response_type == \"text/csv\":\n", 23 | " return data.to_csv()\n", 24 | " else:\n", 25 | " text = \" : \".join(list(data[\"silly_result\"]))\n", 26 | " return {\"silly_result\": text}" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | ",silly_result\n", 39 | "0,17\n", 40 | "1,text/csv\n", 41 | "2,isd\n", 42 | "\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "import tempfile\n", 48 | "with tempfile.TemporaryFile() as fp:\n", 49 | " fp.write(b'This is some data')\n", 50 | " fp.seek(0)\n", 51 | " print(pipeline_api(file=fp, response_type=\"text/csv\", response_schema=\"isd\"))" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [] 60 | } 61 | ], 62 | "metadata": { 63 | "kernelspec": { 64 | "display_name": "python3", 65 | "language": "python", 66 | "name": "python3" 67 | } 68 | }, 69 | "nbformat": 4, 70 | "nbformat_minor": 1 71 | } 72 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-file-4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# File Processing Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pipeline-api\n", 17 | "def pipeline_api(\n", 18 | " file,\n", 19 | " file_content_type=None,\n", 20 | " response_type=\"application/json\",\n", 21 | " response_schema=\"labelstudio\",\n", 22 | " m_input1=[]\n", 23 | "):\n", 24 | " return {\"silly_result\": ' : '.join([\n", 25 | " str(len(file.read())),\n", 26 | " str(file_content_type),\n", 27 | " str(response_type),\n", 28 | " str(response_schema),\n", 29 | " str(m_input1)\n", 30 | " ])}" 31 
| ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "{'silly_result': \"17 : None : application/json : isd : ['input1', 'input2']\"}\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "import tempfile\n", 48 | "with tempfile.TemporaryFile() as fp:\n", 49 | " fp.write(b'This is some data')\n", 50 | " fp.seek(0)\n", 51 | " print(\n", 52 | " pipeline_api(\n", 53 | " fp,\n", 54 | " None,\n", 55 | " \"application/json\",\n", 56 | " \"isd\",\n", 57 | " [\"input1\", \"input2\"]\n", 58 | " )\n", 59 | " )" 60 | ] 61 | } 62 | ], 63 | "metadata": { 64 | "kernelspec": { 65 | "display_name": "python3", 66 | "language": "python", 67 | "name": "python3" 68 | } 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 0 72 | } 73 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-file-5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# File Processing Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pipeline-api\n", 17 | "import pandas as pd\n", 18 | "\n", 19 | "def pipeline_api(\n", 20 | " file,\n", 21 | " file_content_type=None,\n", 22 | " response_type=\"application/json\",\n", 23 | " response_schema=\"labelstudio\",\n", 24 | " m_input1=[],\n", 25 | " m_input2=[],\n", 26 | "):\n", 27 | " data = pd.DataFrame(data={\"silly_result\": [\n", 28 | " str(len(file.read())),\n", 29 | " str(file_content_type),\n", 30 | " str(response_type),\n", 31 | " str(response_schema),\n", 32 | " str(m_input1),\n", 33 | " str(m_input2),\n", 34 | " ]})\n", 35 | " if response_type == \"text/csv\":\n", 36 | " return data.to_csv()\n", 37 | " else:\n", 38 | " text = \" : \".join(list(data[\"silly_result\"]))\n", 39 | " return {\"silly_result\": text}" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | "{'silly_result': \"17 : None : application/json : isd : ['input1', 'input2'] : ['m_input2']\"}\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "import tempfile\n", 57 | "with tempfile.TemporaryFile() as fp:\n", 58 | " fp.write(b'This is some data')\n", 59 | " fp.seek(0)\n", 60 | " print(\n", 61 | " pipeline_api(\n", 62 | " fp,\n", 63 | " None,\n", 64 | " \"application/json\",\n", 65 | " \"isd\",\n", 66 | " [\"input1\", \"input2\"],\n", 67 | " [\"m_input2\"]\n", 68 | " )\n", 69 | " )" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "python3", 83 | "language": "python", 84 | "name": "python3" 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 1 89 | } 90 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text Processing Pipeline" 8 | ] 
9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pipeline-api\n", 17 | "def pipeline_api(\n", 18 | " text,\n", 19 | "):\n", 20 | " return {\"silly_result\": ' : '.join([str(len(text)), text])}" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "{'silly_result': '9 : some text'}\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "print(pipeline_api(\"some text\"))" 38 | ] 39 | } 40 | ], 41 | "metadata": { 42 | "kernelspec": { 43 | "display_name": "python3", 44 | "language": "python", 45 | "name": "python3" 46 | } 47 | }, 48 | "nbformat": 4, 49 | "nbformat_minor": 1 50 | } 51 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text Processing Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pipeline-api\n", 17 | "def pipeline_api(\n", 18 | " text,\n", 19 | " m_input1=[],\n", 20 | " m_input2=[]\n", 21 | "):\n", 22 | " return {\"silly_result\": ' : '.join([\n", 23 | " str(len(text)),\n", 24 | " text,\n", 25 | " str(m_input1),\n", 26 | " str(m_input2)\n", 27 | " ])}" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "{'silly_result': \"9 : some text : ['first_input'] : ['last', 'input']\"}\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "print(pipeline_api(\"some text\", m_input1=[\"first_input\"], m_input2=[\"last\", \"input\"]))" 45 | ] 46 | } 47 | ], 48 | "metadata": { 49 | "kernelspec": { 50 | "display_name": "python3", 51 | "language": "python", 52 | "name": "python3" 53 | } 54 | }, 55 | "nbformat": 4, 56 | "nbformat_minor": 1 57 | } 58 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "bafce76f", 6 | "metadata": {}, 7 | "source": [ 8 | "# Text Processing Pipeline" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "2524a9a4", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# pipeline-api\n", 19 | "import pandas as pd\n", 20 | "def pipeline_api(\n", 21 | " text,\n", 22 | " response_type=\"text/csv\"\n", 23 | "):\n", 24 | " data = pd.DataFrame(data={\"silly_result\": [str(len(text)), text, str(response_type)]})\n", 25 | " if response_type == \"text/csv\":\n", 26 | " return data.to_csv()\n", 27 | " else:\n", 28 | " text = \" : \".join(list(data[\"silly_result\"]))\n", 29 | " return {\"silly_result\": text}" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "id": "6a876bdf", 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | ",silly_result\n", 43 | "0,9\n", 44 | "1,some text\n", 45 | 
"2,text/csv\n", 46 | "\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "print(pipeline_api(\"some text\", \"text/csv\"))" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "83f27184", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [] 61 | } 62 | ], 63 | "metadata": { 64 | "kernelspec": { 65 | "display_name": "python3", 66 | "language": "python", 67 | "name": "python3" 68 | } 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 5 72 | } 73 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text Processing Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pipeline-api\n", 17 | "import pandas as pd\n", 18 | "def pipeline_api(\n", 19 | " text,\n", 20 | " response_type=\"text/csv\",\n", 21 | " response_schema=\"isd\",\n", 22 | "):\n", 23 | " data = pd.DataFrame(data={\"silly_result\": [str(len(text)), text, str(response_type), str(response_schema)]})\n", 24 | " if response_type == \"text/csv\":\n", 25 | " return data.to_csv()\n", 26 | " else:\n", 27 | " text = \" : \".join(list(data[\"silly_result\"]))\n", 28 | " return {\"silly_result\": text}" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | ",silly_result\n", 41 | "0,9\n", 42 | "1,some text\n", 43 | "2,text/csv\n", 44 | "3,isd\n", 45 | "\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "print(pipeline_api(\"some text\", \"text/csv\", \"isd\"))" 51 | ] 52 | } 53 | ], 54 | "metadata": { 55 | "kernelspec": { 56 | "display_name": "python3", 57 | "language": "python", 58 | "name": "python3" 59 | } 60 | }, 61 | "nbformat": 4, 62 | "nbformat_minor": 1 63 | } 64 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-file-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text & File Processing Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pipeline-api\n", 17 | "def pipeline_api(\n", 18 | " text,\n", 19 | " file=None,\n", 20 | " filename=None,\n", 21 | " file_content_type=None,\n", 22 | "):\n", 23 | " return {\"silly_result\": ' : '.join([\n", 24 | " str(len(text if text else \"\")),\n", 25 | " str(text),\n", 26 | " str(len(file.read()) if file else None),\n", 27 | " str(filename),\n", 28 | " str(file_content_type),\n", 29 | " ])}" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "{'silly_result': '9 : some text : 17 : temp-file.txt : None'}\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "import tempfile\n", 47 | "with tempfile.TemporaryFile() as fp:\n", 48 | " fp.write(b'This is some data')\n", 49 | " fp.seek(0)\n", 50 | " print(pipeline_api(\n", 51 | " 
text=\"some text\",\n", 52 | " file=fp,\n", 53 | " file_content_type=None,\n", 54 | " filename=\"temp-file.txt\"\n", 55 | " ))" 56 | ] 57 | } 58 | ], 59 | "metadata": { 60 | "kernelspec": { 61 | "display_name": "python3", 62 | "language": "python", 63 | "name": "python3" 64 | } 65 | }, 66 | "nbformat": 4, 67 | "nbformat_minor": 1 68 | } 69 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-file-2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text & File Processing Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pipeline-api\n", 17 | "import pandas as pd\n", 18 | "def pipeline_api(\n", 19 | " text,\n", 20 | " file=None,\n", 21 | " filename=None,\n", 22 | " file_content_type=None,\n", 23 | " response_type=\"application/json\",\n", 24 | " m_input2=[]\n", 25 | "):\n", 26 | " data = pd.DataFrame(data={\"silly_result\": [\n", 27 | " str(len(text if text else \"\")),\n", 28 | " str(text),\n", 29 | " str(len(file.read()) if file else None),\n", 30 | " str(filename),\n", 31 | " str(file_content_type),\n", 32 | " str(response_type),\n", 33 | " str(m_input2)\n", 34 | " ]})\n", 35 | " if response_type == \"text/csv\":\n", 36 | " return data.to_csv()\n", 37 | " else:\n", 38 | " text = \" : \".join(list(data[\"silly_result\"]))\n", 39 | " return {\"silly_result\": text}" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | "{'silly_result': \"9 : some text : 17 : temp-file.txt : None : application/json : ['input1', 'input2']\"}\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "import tempfile\n", 57 | "with tempfile.TemporaryFile() as fp:\n", 58 | " fp.write(b'This is some data')\n", 59 | " fp.seek(0)\n", 60 | " print(pipeline_api(\n", 61 | " text=\"some text\",\n", 62 | " file=fp,\n", 63 | " file_content_type=None,\n", 64 | " filename=\"temp-file.txt\",\n", 65 | " response_type=\"application/json\",\n", 66 | " m_input2=[\"input1\", \"input2\"]\n", 67 | " ))" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [] 76 | } 77 | ], 78 | "metadata": { 79 | "kernelspec": { 80 | "display_name": "python3", 81 | "language": "python", 82 | "name": "python3" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 1 87 | } 88 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-file-3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text & File Processing Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pipeline-api\n", 17 | "import pandas as pd\n", 18 | "def pipeline_api(\n", 19 | " text,\n", 20 | " file=None,\n", 21 | " filename=None,\n", 22 | " file_content_type=None,\n", 23 | " response_type=\"application/json\",\n", 24 | " response_schema=\"isd\"\n", 25 | "):\n", 26 | " 
data = pd.DataFrame(data={\"silly_result\": [\n", 27 | " str(len(text if text else \"\")),\n", 28 | " str(text),\n", 29 | " str(len(file.read()) if file else None),\n", 30 | " str(filename),\n", 31 | " str(file_content_type),\n", 32 | " str(response_type),\n", 33 | " str(response_schema)\n", 34 | " ]})\n", 35 | " if response_type == \"text/csv\":\n", 36 | " return data.to_csv()\n", 37 | " else:\n", 38 | " text = \" : \".join(list(data[\"silly_result\"]))\n", 39 | " return {\"silly_result\": text}" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | "{'silly_result': '9 : some text : 17 : temp-file.txt : None : application/json : isd'}\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "import tempfile\n", 57 | "with tempfile.TemporaryFile() as fp:\n", 58 | " fp.write(b'This is some data')\n", 59 | " fp.seek(0)\n", 60 | " print(pipeline_api(\n", 61 | " text=\"some text\",\n", 62 | " file=fp,\n", 63 | " file_content_type=None,\n", 64 | " filename=\"temp-file.txt\",\n", 65 | " response_type=\"application/json\",\n", 66 | " response_schema=\"isd\"\n", 67 | " ))" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [] 76 | } 77 | ], 78 | "metadata": { 79 | "kernelspec": { 80 | "display_name": "python3", 81 | "language": "python", 82 | "name": "python3" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 1 87 | } 88 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-file-4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text & File Processing Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# pipeline-api\n", 17 | "import pandas as pd\n", 18 | "def pipeline_api(\n", 19 | " text,\n", 20 | " file=None,\n", 21 | " filename=None,\n", 22 | " file_content_type=None,\n", 23 | " response_type=\"application/json\",\n", 24 | " response_schema=\"isd\",\n", 25 | " m_input1=[],\n", 26 | " m_input2=[]\n", 27 | "):\n", 28 | " data = pd.DataFrame(data={\"silly_result\": [\n", 29 | " str(len(text if text else \"\")),\n", 30 | " str(text),\n", 31 | " str(len(file.read()) if file else None),\n", 32 | " str(filename),\n", 33 | " str(file_content_type),\n", 34 | " str(response_type),\n", 35 | " str(response_schema),\n", 36 | " str(m_input1),\n", 37 | " str(m_input2),\n", 38 | " ]})\n", 39 | " if response_type == \"text/csv\":\n", 40 | " return data.to_csv()\n", 41 | " else:\n", 42 | " text = \" : \".join(list(data[\"silly_result\"]))\n", 43 | " return {\"silly_result\": text}" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "{'silly_result': \"9 : some text : 17 : temp-file.txt : None : application/json : isd : ['input1'] : ['input2', 'input3']\"}\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "import tempfile\n", 61 | "with tempfile.TemporaryFile() as fp:\n", 62 | " fp.write(b'This is some data')\n", 63 | " fp.seek(0)\n", 64 | " print(pipeline_api(\n", 65 | " text=\"some text\",\n", 66 | 
" file=fp,\n", 67 | " file_content_type=None,\n", 68 | " filename=\"temp-file.txt\",\n", 69 | " response_type=\"application/json\",\n", 70 | " response_schema=\"isd\",\n", 71 | " m_input1=[\"input1\"],\n", 72 | " m_input2=[\"input2\", \"input3\"]\n", 73 | " ))" 74 | ] 75 | } 76 | ], 77 | "metadata": { 78 | "kernelspec": { 79 | "display_name": "python3", 80 | "language": "python", 81 | "name": "python3" 82 | } 83 | }, 84 | "nbformat": 4, 85 | "nbformat_minor": 1 86 | } 87 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/__init__.py -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/app.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | 7 | from fastapi import FastAPI, Request, status 8 | import logging 9 | import os 10 | 11 | from .process_file_1 import router as process_file_1_router 12 | from .process_file_2 import router as process_file_2_router 13 | from .process_file_3 import router as process_file_3_router 14 | from .process_file_4 import router as process_file_4_router 15 | from .process_file_5 import router as process_file_5_router 16 | from .process_text_1 import router as process_text_1_router 17 | from .process_text_2 import router as process_text_2_router 18 | from .process_text_3 import router as process_text_3_router 19 | from .process_text_4 import router as process_text_4_router 20 | from .process_text_file_1 import router as process_text_file_1_router 21 | from .process_text_file_2 import router as process_text_file_2_router 22 | from .process_text_file_3 import router as process_text_file_3_router 23 | from .process_text_file_4 import router as process_text_file_4_router 24 | 25 | 26 | app = FastAPI( 27 | title="Unstructured Pipeline API", 28 | description="""""", 29 | version="1.0.0", 30 | docs_url="/test-project/docs", 31 | openapi_url="/test-project/openapi.json", 32 | ) 33 | 34 | allowed_origins = os.environ.get("ALLOWED_ORIGINS", None) 35 | if allowed_origins: 36 | from fastapi.middleware.cors import CORSMiddleware 37 | 38 | app.add_middleware( 39 | CORSMiddleware, 40 | allow_origins=allowed_origins.split(","), 41 | allow_methods=["OPTIONS", "POST"], 42 | allow_headers=["Content-Type"], 43 | ) 44 | 45 | app.include_router(process_file_1_router) 46 | app.include_router(process_file_2_router) 47 | app.include_router(process_file_3_router) 48 | app.include_router(process_file_4_router) 49 | app.include_router(process_file_5_router) 50 | app.include_router(process_text_1_router) 51 | app.include_router(process_text_2_router) 52 | app.include_router(process_text_3_router) 53 | app.include_router(process_text_4_router) 54 | app.include_router(process_text_file_1_router) 55 | app.include_router(process_text_file_2_router) 56 | app.include_router(process_text_file_3_router) 57 | 
app.include_router(process_text_file_4_router) 58 | 59 | 60 | # Filter out /healthcheck noise 61 | class HealthCheckFilter(logging.Filter): 62 |     def filter(self, record: logging.LogRecord) -> bool: 63 |         return record.getMessage().find("/healthcheck") == -1 64 | 65 | 66 | # Filter out /metrics noise 67 | class MetricsCheckFilter(logging.Filter): 68 |     def filter(self, record: logging.LogRecord) -> bool: 69 |         return record.getMessage().find("/metrics") == -1 70 | 71 | 72 | logging.getLogger("uvicorn.access").addFilter(HealthCheckFilter()) 73 | logging.getLogger("uvicorn.access").addFilter(MetricsCheckFilter()) 74 | 75 | 76 | @app.get("/healthcheck", status_code=status.HTTP_200_OK, include_in_schema=False) 77 | def healthcheck(request: Request): 78 |     return {"healthcheck": "HEALTHCHECK STATUS: EVERYTHING OK!"} 79 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_1.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | import json 13 | from fastapi.responses import StreamingResponse 14 | from starlette.datastructures import Headers 15 | from starlette.types import Send 16 | from base64 import b64encode 17 | from typing import Optional, Mapping 18 | import secrets 19 | 20 | 21 | app = FastAPI() 22 | router = APIRouter() 23 | 24 | 25 | # pipeline-api 26 | 27 | # test that a duplicate import gets handled correctly, as this gets imported via the template as well 28 | 29 | # test accessing os in a #pipeline-api cell does not break things 30 | _ = os.environ 31 | 32 | 33 | def pipeline_api( 34 |     file, 35 |     filename=None, 36 |     file_content_type=None, 37 |     m_input2=[], 38 | ): 39 |     return { 40 |         "silly_result": " : ".join( 41 |             [str(len(file.read())), filename, file_content_type, str(m_input2)] 42 |         ) 43 |     } 44 | 45 | 46 | def get_validated_mimetype(file): 47 |     """ 48 |     Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 49 |     generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 50 |     return HTTP 400 for an invalid type. 51 |     """ 52 |     content_type = file.content_type 53 |     if not content_type or content_type == "application/octet-stream": 54 |         content_type = mimetypes.guess_type(str(file.filename))[0] 55 | 56 |         # Some filetypes missing for this library, just hardcode them for now 57 |         if not content_type: 58 |             if file.filename.endswith(".md"): 59 |                 content_type = "text/markdown" 60 |             elif file.filename.endswith(".msg"): 61 |                 content_type = "message/rfc822" 62 | 63 |     allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 64 |     if allowed_mimetypes_str is not None: 65 |         allowed_mimetypes = allowed_mimetypes_str.split(",") 66 | 67 |         if content_type not in allowed_mimetypes: 68 |             raise HTTPException( 69 |                 status_code=400, 70 |                 detail=( 71 |                     f"Unable to process {file.filename}: " 72 |                     f"File type {content_type} is not supported."
73 | ), 74 | ) 75 | 76 | return content_type 77 | 78 | 79 | class MultipartMixedResponse(StreamingResponse): 80 | CRLF = b"\r\n" 81 | 82 | def __init__(self, *args, content_type: str = None, **kwargs): 83 | super().__init__(*args, **kwargs) 84 | self.content_type = content_type 85 | 86 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 87 | super().init_headers(headers) 88 | self.boundary_value = secrets.token_hex(16) 89 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 90 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 91 | 92 | @property 93 | def boundary(self): 94 | return b"--" + self.boundary_value.encode() 95 | 96 | def _build_part_headers(self, headers: dict) -> bytes: 97 | header_bytes = b"" 98 | for header, value in headers.items(): 99 | header_bytes += f"{header}: {value}".encode() + self.CRLF 100 | return header_bytes 101 | 102 | def build_part(self, chunk: bytes) -> bytes: 103 | part = self.boundary + self.CRLF 104 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 105 | if self.content_type is not None: 106 | part_headers["Content-Type"] = self.content_type 107 | part += self._build_part_headers(part_headers) 108 | part += self.CRLF + chunk + self.CRLF 109 | return part 110 | 111 | async def stream_response(self, send: Send) -> None: 112 | await send( 113 | { 114 | "type": "http.response.start", 115 | "status": self.status_code, 116 | "headers": self.raw_headers, 117 | } 118 | ) 119 | async for chunk in self.body_iterator: 120 | if not isinstance(chunk, bytes): 121 | chunk = chunk.encode(self.charset) 122 | chunk = b64encode(chunk) 123 | await send( 124 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 125 | ) 126 | 127 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 128 | 129 | 130 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 131 | def return_content_type(filename): 132 | if gz_uncompressed_content_type: 133 | return gz_uncompressed_content_type 134 | else: 135 | return str(mimetypes.guess_type(filename)[0]) 136 | 137 | filename = str(file.filename) if file.filename else "" 138 | if filename.endswith(".gz"): 139 | filename = filename[:-3] 140 | 141 | gzip_file = gzip.open(file.file).read() 142 | return UploadFile( 143 | file=io.BytesIO(gzip_file), 144 | size=len(gzip_file), 145 | filename=filename, 146 | headers=Headers({"content-type": return_content_type(filename)}), 147 | ) 148 | 149 | 150 | @router.post("/test-project/v1/process-file-1") 151 | @router.post("/test-project/v1.2.3/process-file-1") 152 | def pipeline_1( 153 | request: Request, 154 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 155 | files: Union[List[UploadFile], None] = File(default=None), 156 | input2: List[str] = Form(default=[]), 157 | ): 158 | if files: 159 | for file_index in range(len(files)): 160 | if files[file_index].content_type == "application/gzip": 161 | files[file_index] = ungz_file(files[file_index], gz_uncompressed_content_type) 162 | 163 | content_type = request.headers.get("Accept") 164 | 165 | if isinstance(files, list) and len(files): 166 | if len(files) > 1: 167 | if content_type and content_type not in [ 168 | "*/*", 169 | "multipart/mixed", 170 | "application/json", 171 | "text/csv", 172 | ]: 173 | raise HTTPException( 174 | detail=( 175 | f"Conflict in media type {content_type}" 176 | ' with response type "multipart/mixed".\n' 177 | ), 178 | 
status_code=status.HTTP_406_NOT_ACCEPTABLE, 179 | ) 180 | 181 | def response_generator(is_multipart): 182 | for file in files: 183 | file_content_type = get_validated_mimetype(file) 184 | 185 | _file = file.file 186 | 187 | response = pipeline_api( 188 | _file, 189 | m_input2=input2, 190 | filename=file.filename, 191 | file_content_type=file_content_type, 192 | ) 193 | 194 | if is_multipart: 195 | if type(response) not in [str, bytes]: 196 | response = json.dumps(response) 197 | yield response 198 | 199 | if content_type == "multipart/mixed": 200 | return MultipartMixedResponse( 201 | response_generator(is_multipart=True), 202 | ) 203 | else: 204 | return ( 205 | list(response_generator(is_multipart=False))[0] 206 | if len(files) == 1 207 | else response_generator(is_multipart=False) 208 | ) 209 | else: 210 | raise HTTPException( 211 | detail='Request parameter "files" is required.\n', 212 | status_code=status.HTTP_400_BAD_REQUEST, 213 | ) 214 | 215 | 216 | app.include_router(router) 217 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_2.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | import json 13 | from fastapi.responses import StreamingResponse 14 | from starlette.datastructures import Headers 15 | from starlette.types import Send 16 | from base64 import b64encode 17 | from typing import Optional, Mapping 18 | import secrets 19 | 20 | 21 | app = FastAPI() 22 | router = APIRouter() 23 | 24 | 25 | # pipeline-api 26 | def pipeline_api(file): 27 | return {"silly_result": " : ".join([str(len(file.read()))])} 28 | 29 | 30 | def get_validated_mimetype(file): 31 | """ 32 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 33 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 34 | return HTTP 400 for an invalid type. 35 | """ 36 | content_type = file.content_type 37 | if not content_type or content_type == "application/octet-stream": 38 | content_type = mimetypes.guess_type(str(file.filename))[0] 39 | 40 | # Some filetypes missing for this library, just hardcode them for now 41 | if not content_type: 42 | if file.filename.endswith(".md"): 43 | content_type = "text/markdown" 44 | elif file.filename.endswith(".msg"): 45 | content_type = "message/rfc822" 46 | 47 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 48 | if allowed_mimetypes_str is not None: 49 | allowed_mimetypes = allowed_mimetypes_str.split(",") 50 | 51 | if content_type not in allowed_mimetypes: 52 | raise HTTPException( 53 | status_code=400, 54 | detail=( 55 | f"Unable to process {file.filename}: " 56 | f"File type {content_type} is not supported." 
57 | ), 58 | ) 59 | 60 | return content_type 61 | 62 | 63 | class MultipartMixedResponse(StreamingResponse): 64 | CRLF = b"\r\n" 65 | 66 | def __init__(self, *args, content_type: str = None, **kwargs): 67 | super().__init__(*args, **kwargs) 68 | self.content_type = content_type 69 | 70 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 71 | super().init_headers(headers) 72 | self.boundary_value = secrets.token_hex(16) 73 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 74 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 75 | 76 | @property 77 | def boundary(self): 78 | return b"--" + self.boundary_value.encode() 79 | 80 | def _build_part_headers(self, headers: dict) -> bytes: 81 | header_bytes = b"" 82 | for header, value in headers.items(): 83 | header_bytes += f"{header}: {value}".encode() + self.CRLF 84 | return header_bytes 85 | 86 | def build_part(self, chunk: bytes) -> bytes: 87 | part = self.boundary + self.CRLF 88 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 89 | if self.content_type is not None: 90 | part_headers["Content-Type"] = self.content_type 91 | part += self._build_part_headers(part_headers) 92 | part += self.CRLF + chunk + self.CRLF 93 | return part 94 | 95 | async def stream_response(self, send: Send) -> None: 96 | await send( 97 | { 98 | "type": "http.response.start", 99 | "status": self.status_code, 100 | "headers": self.raw_headers, 101 | } 102 | ) 103 | async for chunk in self.body_iterator: 104 | if not isinstance(chunk, bytes): 105 | chunk = chunk.encode(self.charset) 106 | chunk = b64encode(chunk) 107 | await send( 108 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 109 | ) 110 | 111 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 112 | 113 | 114 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 115 | def return_content_type(filename): 116 | if gz_uncompressed_content_type: 117 | return gz_uncompressed_content_type 118 | else: 119 | return str(mimetypes.guess_type(filename)[0]) 120 | 121 | filename = str(file.filename) if file.filename else "" 122 | if filename.endswith(".gz"): 123 | filename = filename[:-3] 124 | 125 | gzip_file = gzip.open(file.file).read() 126 | return UploadFile( 127 | file=io.BytesIO(gzip_file), 128 | size=len(gzip_file), 129 | filename=filename, 130 | headers=Headers({"content-type": return_content_type(filename)}), 131 | ) 132 | 133 | 134 | @router.post("/test-project/v1/process-file-2") 135 | @router.post("/test-project/v1.2.3/process-file-2") 136 | def pipeline_1( 137 | request: Request, 138 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 139 | files: Union[List[UploadFile], None] = File(default=None), 140 | ): 141 | if files: 142 | for file_index in range(len(files)): 143 | if files[file_index].content_type == "application/gzip": 144 | files[file_index] = ungz_file(files[file_index], gz_uncompressed_content_type) 145 | 146 | content_type = request.headers.get("Accept") 147 | 148 | if isinstance(files, list) and len(files): 149 | if len(files) > 1: 150 | if content_type and content_type not in [ 151 | "*/*", 152 | "multipart/mixed", 153 | "application/json", 154 | "text/csv", 155 | ]: 156 | raise HTTPException( 157 | detail=( 158 | f"Conflict in media type {content_type}" 159 | ' with response type "multipart/mixed".\n' 160 | ), 161 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 162 | ) 163 | 164 | def 
response_generator(is_multipart): 165 | for file in files: 166 | get_validated_mimetype(file) 167 | 168 | _file = file.file 169 | 170 | response = pipeline_api( 171 | _file, 172 | ) 173 | 174 | if is_multipart: 175 | if type(response) not in [str, bytes]: 176 | response = json.dumps(response) 177 | yield response 178 | 179 | if content_type == "multipart/mixed": 180 | return MultipartMixedResponse( 181 | response_generator(is_multipart=True), 182 | ) 183 | else: 184 | return ( 185 | list(response_generator(is_multipart=False))[0] 186 | if len(files) == 1 187 | else response_generator(is_multipart=False) 188 | ) 189 | else: 190 | raise HTTPException( 191 | detail='Request parameter "files" is required.\n', 192 | status_code=status.HTTP_400_BAD_REQUEST, 193 | ) 194 | 195 | 196 | app.include_router(router) 197 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_3.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | from fastapi.responses import PlainTextResponse 13 | import json 14 | from fastapi.responses import StreamingResponse 15 | from starlette.datastructures import Headers 16 | from starlette.types import Send 17 | from base64 import b64encode 18 | from typing import Optional, Mapping 19 | import secrets 20 | import pandas as pd 21 | 22 | 23 | app = FastAPI() 24 | router = APIRouter() 25 | 26 | 27 | def is_expected_response_type(media_type, response_type): 28 | if media_type == "application/json" and response_type not in [dict, list]: 29 | return True 30 | elif media_type == "text/csv" and response_type != str: 31 | return True 32 | else: 33 | return False 34 | 35 | 36 | # pipeline-api 37 | def pipeline_api(file, response_type="text/csv", response_schema="isd"): 38 | data = pd.DataFrame( 39 | data={"silly_result": [str(len(file.read())), str(response_type), str(response_schema)]} 40 | ) 41 | if response_type == "text/csv": 42 | return data.to_csv() 43 | else: 44 | text = " : ".join(list(data["silly_result"])) 45 | return {"silly_result": text} 46 | 47 | 48 | def get_validated_mimetype(file): 49 | """ 50 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 51 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 52 | return HTTP 400 for an invalid type. 
53 | """ 54 | content_type = file.content_type 55 | if not content_type or content_type == "application/octet-stream": 56 | content_type = mimetypes.guess_type(str(file.filename))[0] 57 | 58 | # Some filetypes missing for this library, just hardcode them for now 59 | if not content_type: 60 | if file.filename.endswith(".md"): 61 | content_type = "text/markdown" 62 | elif file.filename.endswith(".msg"): 63 | content_type = "message/rfc822" 64 | 65 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 66 | if allowed_mimetypes_str is not None: 67 | allowed_mimetypes = allowed_mimetypes_str.split(",") 68 | 69 | if content_type not in allowed_mimetypes: 70 | raise HTTPException( 71 | status_code=400, 72 | detail=( 73 | f"Unable to process {file.filename}: " 74 | f"File type {content_type} is not supported." 75 | ), 76 | ) 77 | 78 | return content_type 79 | 80 | 81 | class MultipartMixedResponse(StreamingResponse): 82 | CRLF = b"\r\n" 83 | 84 | def __init__(self, *args, content_type: str = None, **kwargs): 85 | super().__init__(*args, **kwargs) 86 | self.content_type = content_type 87 | 88 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 89 | super().init_headers(headers) 90 | self.boundary_value = secrets.token_hex(16) 91 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 92 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 93 | 94 | @property 95 | def boundary(self): 96 | return b"--" + self.boundary_value.encode() 97 | 98 | def _build_part_headers(self, headers: dict) -> bytes: 99 | header_bytes = b"" 100 | for header, value in headers.items(): 101 | header_bytes += f"{header}: {value}".encode() + self.CRLF 102 | return header_bytes 103 | 104 | def build_part(self, chunk: bytes) -> bytes: 105 | part = self.boundary + self.CRLF 106 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 107 | if self.content_type is not None: 108 | part_headers["Content-Type"] = self.content_type 109 | part += self._build_part_headers(part_headers) 110 | part += self.CRLF + chunk + self.CRLF 111 | return part 112 | 113 | async def stream_response(self, send: Send) -> None: 114 | await send( 115 | { 116 | "type": "http.response.start", 117 | "status": self.status_code, 118 | "headers": self.raw_headers, 119 | } 120 | ) 121 | async for chunk in self.body_iterator: 122 | if not isinstance(chunk, bytes): 123 | chunk = chunk.encode(self.charset) 124 | chunk = b64encode(chunk) 125 | await send( 126 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 127 | ) 128 | 129 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 130 | 131 | 132 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 133 | def return_content_type(filename): 134 | if gz_uncompressed_content_type: 135 | return gz_uncompressed_content_type 136 | else: 137 | return str(mimetypes.guess_type(filename)[0]) 138 | 139 | filename = str(file.filename) if file.filename else "" 140 | if filename.endswith(".gz"): 141 | filename = filename[:-3] 142 | 143 | gzip_file = gzip.open(file.file).read() 144 | return UploadFile( 145 | file=io.BytesIO(gzip_file), 146 | size=len(gzip_file), 147 | filename=filename, 148 | headers=Headers({"content-type": return_content_type(filename)}), 149 | ) 150 | 151 | 152 | @router.post("/test-project/v1/process-file-3") 153 | @router.post("/test-project/v1.2.3/process-file-3") 154 | def pipeline_1( 155 | request: 
Request, 156 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 157 | files: Union[List[UploadFile], None] = File(default=None), 158 | output_format: Union[str, None] = Form(default=None), 159 | output_schema: str = Form(default=None), 160 | ): 161 | if files: 162 | for file_index in range(len(files)): 163 | if files[file_index].content_type == "application/gzip": 164 | files[file_index] = ungz_file(files[file_index], gz_uncompressed_content_type) 165 | 166 | content_type = request.headers.get("Accept") 167 | 168 | default_response_type = output_format or "text/csv" 169 | if not content_type or content_type == "*/*" or content_type == "multipart/mixed": 170 | media_type = default_response_type 171 | else: 172 | media_type = content_type 173 | 174 | default_response_schema = output_schema or "isd" 175 | 176 | if isinstance(files, list) and len(files): 177 | if len(files) > 1: 178 | if content_type and content_type not in [ 179 | "*/*", 180 | "multipart/mixed", 181 | "application/json", 182 | "text/csv", 183 | ]: 184 | raise HTTPException( 185 | detail=( 186 | f"Conflict in media type {content_type}" 187 | ' with response type "multipart/mixed".\n' 188 | ), 189 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 190 | ) 191 | 192 | def response_generator(is_multipart): 193 | for file in files: 194 | get_validated_mimetype(file) 195 | 196 | _file = file.file 197 | 198 | response = pipeline_api( 199 | _file, 200 | response_type=media_type, 201 | response_schema=default_response_schema, 202 | ) 203 | 204 | if is_expected_response_type(media_type, type(response)): 205 | raise HTTPException( 206 | detail=( 207 | f"Conflict in media type {media_type}" 208 | f" with response type {type(response)}.\n" 209 | ), 210 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 211 | ) 212 | 213 | valid_response_types = ["application/json", "text/csv", "*/*", "multipart/mixed"] 214 | if media_type in valid_response_types: 215 | if is_multipart: 216 | if type(response) not in [str, bytes]: 217 | response = json.dumps(response) 218 | elif media_type == "text/csv": 219 | response = PlainTextResponse(response) 220 | yield response 221 | else: 222 | raise HTTPException( 223 | detail=f"Unsupported media type {media_type}.\n", 224 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 225 | ) 226 | 227 | def join_responses(responses): 228 | if media_type != "text/csv": 229 | return responses 230 | data = pd.read_csv(io.BytesIO(responses[0].body)) 231 | if len(responses) > 1: 232 | for resp in responses[1:]: 233 | resp_data = pd.read_csv(io.BytesIO(resp.body)) 234 | data = data.merge(resp_data, how="outer") 235 | return PlainTextResponse(data.to_csv()) 236 | 237 | if content_type == "multipart/mixed": 238 | return MultipartMixedResponse( 239 | response_generator(is_multipart=True), content_type=media_type 240 | ) 241 | else: 242 | return ( 243 | list(response_generator(is_multipart=False))[0] 244 | if len(files) == 1 245 | else join_responses(list(response_generator(is_multipart=False))) 246 | ) 247 | else: 248 | raise HTTPException( 249 | detail='Request parameter "files" is required.\n', 250 | status_code=status.HTTP_400_BAD_REQUEST, 251 | ) 252 | 253 | 254 | app.include_router(router) 255 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_4.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | 
# THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | from fastapi.responses import PlainTextResponse 13 | import json 14 | from fastapi.responses import StreamingResponse 15 | from starlette.datastructures import Headers 16 | from starlette.types import Send 17 | from base64 import b64encode 18 | from typing import Optional, Mapping 19 | import secrets 20 | import pandas as pd 21 | 22 | 23 | app = FastAPI() 24 | router = APIRouter() 25 | 26 | 27 | def is_expected_response_type(media_type, response_type): 28 | if media_type == "application/json" and response_type not in [dict, list]: 29 | return True 30 | elif media_type == "text/csv" and response_type != str: 31 | return True 32 | else: 33 | return False 34 | 35 | 36 | # pipeline-api 37 | def pipeline_api( 38 | file, 39 | file_content_type=None, 40 | response_type="application/json", 41 | response_schema="labelstudio", 42 | m_input1=[], 43 | ): 44 | return { 45 | "silly_result": " : ".join( 46 | [ 47 | str(len(file.read())), 48 | str(file_content_type), 49 | str(response_type), 50 | str(response_schema), 51 | str(m_input1), 52 | ] 53 | ) 54 | } 55 | 56 | 57 | def get_validated_mimetype(file): 58 | """ 59 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 60 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 61 | return HTTP 400 for an invalid type. 62 | """ 63 | content_type = file.content_type 64 | if not content_type or content_type == "application/octet-stream": 65 | content_type = mimetypes.guess_type(str(file.filename))[0] 66 | 67 | # Some filetypes missing for this library, just hardcode them for now 68 | if not content_type: 69 | if file.filename.endswith(".md"): 70 | content_type = "text/markdown" 71 | elif file.filename.endswith(".msg"): 72 | content_type = "message/rfc822" 73 | 74 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 75 | if allowed_mimetypes_str is not None: 76 | allowed_mimetypes = allowed_mimetypes_str.split(",") 77 | 78 | if content_type not in allowed_mimetypes: 79 | raise HTTPException( 80 | status_code=400, 81 | detail=( 82 | f"Unable to process {file.filename}: " 83 | f"File type {content_type} is not supported." 
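# NOTE: hedged illustration, not part of the generated template -- with,
# say, UNSTRUCTURED_ALLOWED_MIMETYPES="application/pdf,text/plain" (a
# made-up value) exported in the environment, a .docx upload would fall
# through to this branch and the API would answer with HTTP 400.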
84 | ), 85 | ) 86 | 87 | return content_type 88 | 89 | 90 | class MultipartMixedResponse(StreamingResponse): 91 | CRLF = b"\r\n" 92 | 93 | def __init__(self, *args, content_type: str = None, **kwargs): 94 | super().__init__(*args, **kwargs) 95 | self.content_type = content_type 96 | 97 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 98 | super().init_headers(headers) 99 | self.boundary_value = secrets.token_hex(16) 100 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 101 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 102 | 103 | @property 104 | def boundary(self): 105 | return b"--" + self.boundary_value.encode() 106 | 107 | def _build_part_headers(self, headers: dict) -> bytes: 108 | header_bytes = b"" 109 | for header, value in headers.items(): 110 | header_bytes += f"{header}: {value}".encode() + self.CRLF 111 | return header_bytes 112 | 113 | def build_part(self, chunk: bytes) -> bytes: 114 | part = self.boundary + self.CRLF 115 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 116 | if self.content_type is not None: 117 | part_headers["Content-Type"] = self.content_type 118 | part += self._build_part_headers(part_headers) 119 | part += self.CRLF + chunk + self.CRLF 120 | return part 121 | 122 | async def stream_response(self, send: Send) -> None: 123 | await send( 124 | { 125 | "type": "http.response.start", 126 | "status": self.status_code, 127 | "headers": self.raw_headers, 128 | } 129 | ) 130 | async for chunk in self.body_iterator: 131 | if not isinstance(chunk, bytes): 132 | chunk = chunk.encode(self.charset) 133 | chunk = b64encode(chunk) 134 | await send( 135 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 136 | ) 137 | 138 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 139 | 140 | 141 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 142 | def return_content_type(filename): 143 | if gz_uncompressed_content_type: 144 | return gz_uncompressed_content_type 145 | else: 146 | return str(mimetypes.guess_type(filename)[0]) 147 | 148 | filename = str(file.filename) if file.filename else "" 149 | if filename.endswith(".gz"): 150 | filename = filename[:-3] 151 | 152 | gzip_file = gzip.open(file.file).read() 153 | return UploadFile( 154 | file=io.BytesIO(gzip_file), 155 | size=len(gzip_file), 156 | filename=filename, 157 | headers=Headers({"content-type": return_content_type(filename)}), 158 | ) 159 | 160 | 161 | @router.post("/test-project/v1/process-file-4") 162 | @router.post("/test-project/v1.2.3/process-file-4") 163 | def pipeline_1( 164 | request: Request, 165 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 166 | files: Union[List[UploadFile], None] = File(default=None), 167 | output_format: Union[str, None] = Form(default=None), 168 | output_schema: str = Form(default=None), 169 | input1: List[str] = Form(default=[]), 170 | ): 171 | if files: 172 | for file_index in range(len(files)): 173 | if files[file_index].content_type == "application/gzip": 174 | files[file_index] = ungz_file(files[file_index], gz_uncompressed_content_type) 175 | 176 | content_type = request.headers.get("Accept") 177 | 178 | default_response_type = output_format or "application/json" 179 | if not content_type or content_type == "*/*" or content_type == "multipart/mixed": 180 | media_type = default_response_type 181 | else: 182 | media_type = content_type 183 | 184 | 
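# NOTE: a hedged usage sketch -- host, port, and filename below are
# hypothetical, and repeated -F input1 fields accumulate into the
# List[str] form parameter:
#   curl -H "Accept: text/csv" \
#        -F files=@example.pdf -F input1=foo -F input1=bar \
#        http://localhost:8000/test-project/v1/process-file-4
# An explicit Accept header wins; "Accept: */*" (curl's default) falls
# back to the output_format form field, then to "application/json".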
default_response_schema = output_schema or "labelstudio" 185 | 186 | if isinstance(files, list) and len(files): 187 | if len(files) > 1: 188 | if content_type and content_type not in [ 189 | "*/*", 190 | "multipart/mixed", 191 | "application/json", 192 | "text/csv", 193 | ]: 194 | raise HTTPException( 195 | detail=( 196 | f"Conflict in media type {content_type}" 197 | ' with response type "multipart/mixed".\n' 198 | ), 199 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 200 | ) 201 | 202 | def response_generator(is_multipart): 203 | for file in files: 204 | file_content_type = get_validated_mimetype(file) 205 | 206 | _file = file.file 207 | 208 | response = pipeline_api( 209 | _file, 210 | m_input1=input1, 211 | response_type=media_type, 212 | response_schema=default_response_schema, 213 | file_content_type=file_content_type, 214 | ) 215 | 216 | if is_expected_response_type(media_type, type(response)): 217 | raise HTTPException( 218 | detail=( 219 | f"Conflict in media type {media_type}" 220 | f" with response type {type(response)}.\n" 221 | ), 222 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 223 | ) 224 | 225 | valid_response_types = ["application/json", "text/csv", "*/*", "multipart/mixed"] 226 | if media_type in valid_response_types: 227 | if is_multipart: 228 | if type(response) not in [str, bytes]: 229 | response = json.dumps(response) 230 | elif media_type == "text/csv": 231 | response = PlainTextResponse(response) 232 | yield response 233 | else: 234 | raise HTTPException( 235 | detail=f"Unsupported media type {media_type}.\n", 236 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 237 | ) 238 | 239 | def join_responses(responses): 240 | if media_type != "text/csv": 241 | return responses 242 | data = pd.read_csv(io.BytesIO(responses[0].body)) 243 | if len(responses) > 1: 244 | for resp in responses[1:]: 245 | resp_data = pd.read_csv(io.BytesIO(resp.body)) 246 | data = data.merge(resp_data, how="outer") 247 | return PlainTextResponse(data.to_csv()) 248 | 249 | if content_type == "multipart/mixed": 250 | return MultipartMixedResponse( 251 | response_generator(is_multipart=True), content_type=media_type 252 | ) 253 | else: 254 | return ( 255 | list(response_generator(is_multipart=False))[0] 256 | if len(files) == 1 257 | else join_responses(list(response_generator(is_multipart=False))) 258 | ) 259 | else: 260 | raise HTTPException( 261 | detail='Request parameter "files" is required.\n', 262 | status_code=status.HTTP_400_BAD_REQUEST, 263 | ) 264 | 265 | 266 | app.include_router(router) 267 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_5.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 
3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | from fastapi.responses import PlainTextResponse 13 | import json 14 | from fastapi.responses import StreamingResponse 15 | from starlette.datastructures import Headers 16 | from starlette.types import Send 17 | from base64 import b64encode 18 | from typing import Optional, Mapping 19 | import secrets 20 | import pandas as pd 21 | 22 | 23 | app = FastAPI() 24 | router = APIRouter() 25 | 26 | 27 | def is_expected_response_type(media_type, response_type): 28 | if media_type == "application/json" and response_type not in [dict, list]: 29 | return True 30 | elif media_type == "text/csv" and response_type != str: 31 | return True 32 | else: 33 | return False 34 | 35 | 36 | # pipeline-api 37 | 38 | 39 | def pipeline_api( 40 | file, 41 | file_content_type=None, 42 | response_type="application/json", 43 | response_schema="labelstudio", 44 | m_input1=[], 45 | m_input2=[], 46 | ): 47 | data = pd.DataFrame( 48 | data={ 49 | "silly_result": [ 50 | str(len(file.read())), 51 | str(file_content_type), 52 | str(response_type), 53 | str(response_schema), 54 | str(m_input1), 55 | str(m_input2), 56 | ] 57 | } 58 | ) 59 | if response_type == "text/csv": 60 | return data.to_csv() 61 | else: 62 | text = " : ".join(list(data["silly_result"])) 63 | return {"silly_result": text} 64 | 65 | 66 | def get_validated_mimetype(file): 67 | """ 68 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 69 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 70 | return HTTP 400 for an invalid type. 71 | """ 72 | content_type = file.content_type 73 | if not content_type or content_type == "application/octet-stream": 74 | content_type = mimetypes.guess_type(str(file.filename))[0] 75 | 76 | # Some filetypes missing for this library, just hardcode them for now 77 | if not content_type: 78 | if file.filename.endswith(".md"): 79 | content_type = "text/markdown" 80 | elif file.filename.endswith(".msg"): 81 | content_type = "message/rfc822" 82 | 83 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 84 | if allowed_mimetypes_str is not None: 85 | allowed_mimetypes = allowed_mimetypes_str.split(",") 86 | 87 | if content_type not in allowed_mimetypes: 88 | raise HTTPException( 89 | status_code=400, 90 | detail=( 91 | f"Unable to process {file.filename}: " 92 | f"File type {content_type} is not supported." 
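# NOTE: unlike the text pipelines, the file pipelines keep the validated
# type -- the route handler below forwards it to pipeline_api() as
# file_content_type.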
93 | ), 94 | ) 95 | 96 | return content_type 97 | 98 | 99 | class MultipartMixedResponse(StreamingResponse): 100 | CRLF = b"\r\n" 101 | 102 | def __init__(self, *args, content_type: str = None, **kwargs): 103 | super().__init__(*args, **kwargs) 104 | self.content_type = content_type 105 | 106 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 107 | super().init_headers(headers) 108 | self.boundary_value = secrets.token_hex(16) 109 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 110 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 111 | 112 | @property 113 | def boundary(self): 114 | return b"--" + self.boundary_value.encode() 115 | 116 | def _build_part_headers(self, headers: dict) -> bytes: 117 | header_bytes = b"" 118 | for header, value in headers.items(): 119 | header_bytes += f"{header}: {value}".encode() + self.CRLF 120 | return header_bytes 121 | 122 | def build_part(self, chunk: bytes) -> bytes: 123 | part = self.boundary + self.CRLF 124 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 125 | if self.content_type is not None: 126 | part_headers["Content-Type"] = self.content_type 127 | part += self._build_part_headers(part_headers) 128 | part += self.CRLF + chunk + self.CRLF 129 | return part 130 | 131 | async def stream_response(self, send: Send) -> None: 132 | await send( 133 | { 134 | "type": "http.response.start", 135 | "status": self.status_code, 136 | "headers": self.raw_headers, 137 | } 138 | ) 139 | async for chunk in self.body_iterator: 140 | if not isinstance(chunk, bytes): 141 | chunk = chunk.encode(self.charset) 142 | chunk = b64encode(chunk) 143 | await send( 144 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 145 | ) 146 | 147 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 148 | 149 | 150 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 151 | def return_content_type(filename): 152 | if gz_uncompressed_content_type: 153 | return gz_uncompressed_content_type 154 | else: 155 | return str(mimetypes.guess_type(filename)[0]) 156 | 157 | filename = str(file.filename) if file.filename else "" 158 | if filename.endswith(".gz"): 159 | filename = filename[:-3] 160 | 161 | gzip_file = gzip.open(file.file).read() 162 | return UploadFile( 163 | file=io.BytesIO(gzip_file), 164 | size=len(gzip_file), 165 | filename=filename, 166 | headers=Headers({"content-type": return_content_type(filename)}), 167 | ) 168 | 169 | 170 | @router.post("/test-project/v1/process-file-5") 171 | @router.post("/test-project/v1.2.3/process-file-5") 172 | def pipeline_1( 173 | request: Request, 174 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 175 | files: Union[List[UploadFile], None] = File(default=None), 176 | output_format: Union[str, None] = Form(default=None), 177 | output_schema: str = Form(default=None), 178 | input1: List[str] = Form(default=[]), 179 | input2: List[str] = Form(default=[]), 180 | ): 181 | if files: 182 | for file_index in range(len(files)): 183 | if files[file_index].content_type == "application/gzip": 184 | files[file_index] = ungz_file(files[file_index], gz_uncompressed_content_type) 185 | 186 | content_type = request.headers.get("Accept") 187 | 188 | default_response_type = output_format or "application/json" 189 | if not content_type or content_type == "*/*" or content_type == "multipart/mixed": 190 | media_type = default_response_type 191 | 
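# NOTE: "multipart/mixed" is grouped with "*/*" here, so the parts of a
# multipart response still carry the default media type. Also worth
# knowing: is_expected_response_type() above returns True when the
# pipeline's return type does NOT suit media_type; the handler below
# relies on that inverted sense to raise 406.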
else: 192 | media_type = content_type 193 | 194 | default_response_schema = output_schema or "labelstudio" 195 | 196 | if isinstance(files, list) and len(files): 197 | if len(files) > 1: 198 | if content_type and content_type not in [ 199 | "*/*", 200 | "multipart/mixed", 201 | "application/json", 202 | "text/csv", 203 | ]: 204 | raise HTTPException( 205 | detail=( 206 | f"Conflict in media type {content_type}" 207 | ' with response type "multipart/mixed".\n' 208 | ), 209 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 210 | ) 211 | 212 | def response_generator(is_multipart): 213 | for file in files: 214 | file_content_type = get_validated_mimetype(file) 215 | 216 | _file = file.file 217 | 218 | response = pipeline_api( 219 | _file, 220 | m_input1=input1, 221 | m_input2=input2, 222 | response_type=media_type, 223 | response_schema=default_response_schema, 224 | file_content_type=file_content_type, 225 | ) 226 | 227 | if is_expected_response_type(media_type, type(response)): 228 | raise HTTPException( 229 | detail=( 230 | f"Conflict in media type {media_type}" 231 | f" with response type {type(response)}.\n" 232 | ), 233 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 234 | ) 235 | 236 | valid_response_types = ["application/json", "text/csv", "*/*", "multipart/mixed"] 237 | if media_type in valid_response_types: 238 | if is_multipart: 239 | if type(response) not in [str, bytes]: 240 | response = json.dumps(response) 241 | elif media_type == "text/csv": 242 | response = PlainTextResponse(response) 243 | yield response 244 | else: 245 | raise HTTPException( 246 | detail=f"Unsupported media type {media_type}.\n", 247 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 248 | ) 249 | 250 | def join_responses(responses): 251 | if media_type != "text/csv": 252 | return responses 253 | data = pd.read_csv(io.BytesIO(responses[0].body)) 254 | if len(responses) > 1: 255 | for resp in responses[1:]: 256 | resp_data = pd.read_csv(io.BytesIO(resp.body)) 257 | data = data.merge(resp_data, how="outer") 258 | return PlainTextResponse(data.to_csv()) 259 | 260 | if content_type == "multipart/mixed": 261 | return MultipartMixedResponse( 262 | response_generator(is_multipart=True), content_type=media_type 263 | ) 264 | else: 265 | return ( 266 | list(response_generator(is_multipart=False))[0] 267 | if len(files) == 1 268 | else join_responses(list(response_generator(is_multipart=False))) 269 | ) 270 | else: 271 | raise HTTPException( 272 | detail='Request parameter "files" is required.\n', 273 | status_code=status.HTTP_400_BAD_REQUEST, 274 | ) 275 | 276 | 277 | app.include_router(router) 278 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_1.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 
3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | import json 13 | from fastapi.responses import StreamingResponse 14 | from starlette.datastructures import Headers 15 | from starlette.types import Send 16 | from base64 import b64encode 17 | from typing import Optional, Mapping 18 | import secrets 19 | 20 | 21 | app = FastAPI() 22 | router = APIRouter() 23 | 24 | 25 | # pipeline-api 26 | def pipeline_api( 27 | text, 28 | ): 29 | return {"silly_result": " : ".join([str(len(text)), text])} 30 | 31 | 32 | def get_validated_mimetype(file): 33 | """ 34 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 35 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 36 | return HTTP 400 for an invalid type. 37 | """ 38 | content_type = file.content_type 39 | if not content_type or content_type == "application/octet-stream": 40 | content_type = mimetypes.guess_type(str(file.filename))[0] 41 | 42 | # Some filetypes missing for this library, just hardcode them for now 43 | if not content_type: 44 | if file.filename.endswith(".md"): 45 | content_type = "text/markdown" 46 | elif file.filename.endswith(".msg"): 47 | content_type = "message/rfc822" 48 | 49 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 50 | if allowed_mimetypes_str is not None: 51 | allowed_mimetypes = allowed_mimetypes_str.split(",") 52 | 53 | if content_type not in allowed_mimetypes: 54 | raise HTTPException( 55 | status_code=400, 56 | detail=( 57 | f"Unable to process {file.filename}: " 58 | f"File type {content_type} is not supported." 
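# NOTE: in the text pipelines this function acts purely as a guard; the
# caller below invokes get_validated_mimetype(file) and discards the
# returned content type.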
59 | ), 60 | ) 61 | 62 | return content_type 63 | 64 | 65 | class MultipartMixedResponse(StreamingResponse): 66 | CRLF = b"\r\n" 67 | 68 | def __init__(self, *args, content_type: str = None, **kwargs): 69 | super().__init__(*args, **kwargs) 70 | self.content_type = content_type 71 | 72 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 73 | super().init_headers(headers) 74 | self.boundary_value = secrets.token_hex(16) 75 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 76 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 77 | 78 | @property 79 | def boundary(self): 80 | return b"--" + self.boundary_value.encode() 81 | 82 | def _build_part_headers(self, headers: dict) -> bytes: 83 | header_bytes = b"" 84 | for header, value in headers.items(): 85 | header_bytes += f"{header}: {value}".encode() + self.CRLF 86 | return header_bytes 87 | 88 | def build_part(self, chunk: bytes) -> bytes: 89 | part = self.boundary + self.CRLF 90 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 91 | if self.content_type is not None: 92 | part_headers["Content-Type"] = self.content_type 93 | part += self._build_part_headers(part_headers) 94 | part += self.CRLF + chunk + self.CRLF 95 | return part 96 | 97 | async def stream_response(self, send: Send) -> None: 98 | await send( 99 | { 100 | "type": "http.response.start", 101 | "status": self.status_code, 102 | "headers": self.raw_headers, 103 | } 104 | ) 105 | async for chunk in self.body_iterator: 106 | if not isinstance(chunk, bytes): 107 | chunk = chunk.encode(self.charset) 108 | chunk = b64encode(chunk) 109 | await send( 110 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 111 | ) 112 | 113 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 114 | 115 | 116 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 117 | def return_content_type(filename): 118 | if gz_uncompressed_content_type: 119 | return gz_uncompressed_content_type 120 | else: 121 | return str(mimetypes.guess_type(filename)[0]) 122 | 123 | filename = str(file.filename) if file.filename else "" 124 | if filename.endswith(".gz"): 125 | filename = filename[:-3] 126 | 127 | gzip_file = gzip.open(file.file).read() 128 | return UploadFile( 129 | file=io.BytesIO(gzip_file), 130 | size=len(gzip_file), 131 | filename=filename, 132 | headers=Headers({"content-type": return_content_type(filename)}), 133 | ) 134 | 135 | 136 | @router.post("/test-project/v1/process-text-1") 137 | @router.post("/test-project/v1.2.3/process-text-1") 138 | def pipeline_1( 139 | request: Request, 140 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 141 | text_files: Union[List[UploadFile], None] = File(default=None), 142 | ): 143 | if text_files: 144 | for file_index in range(len(text_files)): 145 | if text_files[file_index].content_type == "application/gzip": 146 | text_files[file_index] = ungz_file(text_files[file_index]) 147 | 148 | content_type = request.headers.get("Accept") 149 | 150 | if isinstance(text_files, list) and len(text_files): 151 | if len(text_files) > 1: 152 | if content_type and content_type not in [ 153 | "*/*", 154 | "multipart/mixed", 155 | "application/json", 156 | "text/csv", 157 | ]: 158 | raise HTTPException( 159 | detail=( 160 | f"Conflict in media type {content_type}" 161 | ' with response type "multipart/mixed".\n' 162 | ), 163 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 164 | ) 
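# NOTE: hedged client-side sketch -- when "Accept: multipart/mixed" is
# sent, each part of the response body is base64-encoded by
# MultipartMixedResponse above, so a client could decode it roughly like
# this (requests-toolbelt usage is an assumption, not something this
# project ships):
#   from base64 import b64decode
#   from requests_toolbelt.multipart.decoder import MultipartDecoder
#   decoder = MultipartDecoder(resp.content, resp.headers["content-type"])
#   parts = [b64decode(p.content) for p in decoder.parts]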
165 | 166 | def response_generator(is_multipart): 167 | for file in text_files: 168 | get_validated_mimetype(file) 169 | 170 | text = file.file.read().decode("utf-8") 171 | 172 | response = pipeline_api( 173 | text, 174 | ) 175 | 176 | if is_multipart: 177 | if type(response) not in [str, bytes]: 178 | response = json.dumps(response) 179 | yield response 180 | 181 | if content_type == "multipart/mixed": 182 | return MultipartMixedResponse( 183 | response_generator(is_multipart=True), 184 | ) 185 | else: 186 | return ( 187 | list(response_generator(is_multipart=False))[0] 188 | if len(text_files) == 1 189 | else response_generator(is_multipart=False) 190 | ) 191 | else: 192 | raise HTTPException( 193 | detail='Request parameter "text_files" is required.\n', 194 | status_code=status.HTTP_400_BAD_REQUEST, 195 | ) 196 | 197 | 198 | app.include_router(router) 199 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_2.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | import json 13 | from fastapi.responses import StreamingResponse 14 | from starlette.datastructures import Headers 15 | from starlette.types import Send 16 | from base64 import b64encode 17 | from typing import Optional, Mapping 18 | import secrets 19 | 20 | 21 | app = FastAPI() 22 | router = APIRouter() 23 | 24 | 25 | # pipeline-api 26 | def pipeline_api(text, m_input1=[], m_input2=[]): 27 | return {"silly_result": " : ".join([str(len(text)), text, str(m_input1), str(m_input2)])} 28 | 29 | 30 | def get_validated_mimetype(file): 31 | """ 32 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 33 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 34 | return HTTP 400 for an invalid type. 35 | """ 36 | content_type = file.content_type 37 | if not content_type or content_type == "application/octet-stream": 38 | content_type = mimetypes.guess_type(str(file.filename))[0] 39 | 40 | # Some filetypes missing for this library, just hardcode them for now 41 | if not content_type: 42 | if file.filename.endswith(".md"): 43 | content_type = "text/markdown" 44 | elif file.filename.endswith(".msg"): 45 | content_type = "message/rfc822" 46 | 47 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 48 | if allowed_mimetypes_str is not None: 49 | allowed_mimetypes = allowed_mimetypes_str.split(",") 50 | 51 | if content_type not in allowed_mimetypes: 52 | raise HTTPException( 53 | status_code=400, 54 | detail=( 55 | f"Unable to process {file.filename}: " 56 | f"File type {content_type} is not supported." 
57 | ), 58 | ) 59 | 60 | return content_type 61 | 62 | 63 | class MultipartMixedResponse(StreamingResponse): 64 | CRLF = b"\r\n" 65 | 66 | def __init__(self, *args, content_type: str = None, **kwargs): 67 | super().__init__(*args, **kwargs) 68 | self.content_type = content_type 69 | 70 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 71 | super().init_headers(headers) 72 | self.boundary_value = secrets.token_hex(16) 73 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 74 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 75 | 76 | @property 77 | def boundary(self): 78 | return b"--" + self.boundary_value.encode() 79 | 80 | def _build_part_headers(self, headers: dict) -> bytes: 81 | header_bytes = b"" 82 | for header, value in headers.items(): 83 | header_bytes += f"{header}: {value}".encode() + self.CRLF 84 | return header_bytes 85 | 86 | def build_part(self, chunk: bytes) -> bytes: 87 | part = self.boundary + self.CRLF 88 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 89 | if self.content_type is not None: 90 | part_headers["Content-Type"] = self.content_type 91 | part += self._build_part_headers(part_headers) 92 | part += self.CRLF + chunk + self.CRLF 93 | return part 94 | 95 | async def stream_response(self, send: Send) -> None: 96 | await send( 97 | { 98 | "type": "http.response.start", 99 | "status": self.status_code, 100 | "headers": self.raw_headers, 101 | } 102 | ) 103 | async for chunk in self.body_iterator: 104 | if not isinstance(chunk, bytes): 105 | chunk = chunk.encode(self.charset) 106 | chunk = b64encode(chunk) 107 | await send( 108 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 109 | ) 110 | 111 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 112 | 113 | 114 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 115 | def return_content_type(filename): 116 | if gz_uncompressed_content_type: 117 | return gz_uncompressed_content_type 118 | else: 119 | return str(mimetypes.guess_type(filename)[0]) 120 | 121 | filename = str(file.filename) if file.filename else "" 122 | if filename.endswith(".gz"): 123 | filename = filename[:-3] 124 | 125 | gzip_file = gzip.open(file.file).read() 126 | return UploadFile( 127 | file=io.BytesIO(gzip_file), 128 | size=len(gzip_file), 129 | filename=filename, 130 | headers=Headers({"content-type": return_content_type(filename)}), 131 | ) 132 | 133 | 134 | @router.post("/test-project/v1/process-text-2") 135 | @router.post("/test-project/v1.2.3/process-text-2") 136 | def pipeline_1( 137 | request: Request, 138 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 139 | text_files: Union[List[UploadFile], None] = File(default=None), 140 | input1: List[str] = Form(default=[]), 141 | input2: List[str] = Form(default=[]), 142 | ): 143 | if text_files: 144 | for file_index in range(len(text_files)): 145 | if text_files[file_index].content_type == "application/gzip": 146 | text_files[file_index] = ungz_file(text_files[file_index]) 147 | 148 | content_type = request.headers.get("Accept") 149 | 150 | if isinstance(text_files, list) and len(text_files): 151 | if len(text_files) > 1: 152 | if content_type and content_type not in [ 153 | "*/*", 154 | "multipart/mixed", 155 | "application/json", 156 | "text/csv", 157 | ]: 158 | raise HTTPException( 159 | detail=( 160 | f"Conflict in media type {content_type}" 161 | ' with response type 
"multipart/mixed".\n' 162 | ), 163 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 164 | ) 165 | 166 | def response_generator(is_multipart): 167 | for file in text_files: 168 | get_validated_mimetype(file) 169 | 170 | text = file.file.read().decode("utf-8") 171 | 172 | response = pipeline_api( 173 | text, 174 | m_input1=input1, 175 | m_input2=input2, 176 | ) 177 | 178 | if is_multipart: 179 | if type(response) not in [str, bytes]: 180 | response = json.dumps(response) 181 | yield response 182 | 183 | if content_type == "multipart/mixed": 184 | return MultipartMixedResponse( 185 | response_generator(is_multipart=True), 186 | ) 187 | else: 188 | return ( 189 | list(response_generator(is_multipart=False))[0] 190 | if len(text_files) == 1 191 | else response_generator(is_multipart=False) 192 | ) 193 | else: 194 | raise HTTPException( 195 | detail='Request parameter "text_files" is required.\n', 196 | status_code=status.HTTP_400_BAD_REQUEST, 197 | ) 198 | 199 | 200 | app.include_router(router) 201 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_3.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | from fastapi.responses import PlainTextResponse 13 | import json 14 | from fastapi.responses import StreamingResponse 15 | from starlette.datastructures import Headers 16 | from starlette.types import Send 17 | from base64 import b64encode 18 | from typing import Optional, Mapping 19 | import secrets 20 | import pandas as pd 21 | 22 | 23 | app = FastAPI() 24 | router = APIRouter() 25 | 26 | 27 | def is_expected_response_type(media_type, response_type): 28 | if media_type == "application/json" and response_type not in [dict, list]: 29 | return True 30 | elif media_type == "text/csv" and response_type != str: 31 | return True 32 | else: 33 | return False 34 | 35 | 36 | # pipeline-api 37 | def pipeline_api(text, response_type="text/csv"): 38 | data = pd.DataFrame(data={"silly_result": [str(len(text)), text, str(response_type)]}) 39 | if response_type == "text/csv": 40 | return data.to_csv() 41 | else: 42 | text = " : ".join(list(data["silly_result"])) 43 | return {"silly_result": text} 44 | 45 | 46 | def get_validated_mimetype(file): 47 | """ 48 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 49 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 50 | return HTTP 400 for an invalid type. 
51 | """ 52 | content_type = file.content_type 53 | if not content_type or content_type == "application/octet-stream": 54 | content_type = mimetypes.guess_type(str(file.filename))[0] 55 | 56 | # Some filetypes missing for this library, just hardcode them for now 57 | if not content_type: 58 | if file.filename.endswith(".md"): 59 | content_type = "text/markdown" 60 | elif file.filename.endswith(".msg"): 61 | content_type = "message/rfc822" 62 | 63 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 64 | if allowed_mimetypes_str is not None: 65 | allowed_mimetypes = allowed_mimetypes_str.split(",") 66 | 67 | if content_type not in allowed_mimetypes: 68 | raise HTTPException( 69 | status_code=400, 70 | detail=( 71 | f"Unable to process {file.filename}: " 72 | f"File type {content_type} is not supported." 73 | ), 74 | ) 75 | 76 | return content_type 77 | 78 | 79 | class MultipartMixedResponse(StreamingResponse): 80 | CRLF = b"\r\n" 81 | 82 | def __init__(self, *args, content_type: str = None, **kwargs): 83 | super().__init__(*args, **kwargs) 84 | self.content_type = content_type 85 | 86 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 87 | super().init_headers(headers) 88 | self.boundary_value = secrets.token_hex(16) 89 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 90 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 91 | 92 | @property 93 | def boundary(self): 94 | return b"--" + self.boundary_value.encode() 95 | 96 | def _build_part_headers(self, headers: dict) -> bytes: 97 | header_bytes = b"" 98 | for header, value in headers.items(): 99 | header_bytes += f"{header}: {value}".encode() + self.CRLF 100 | return header_bytes 101 | 102 | def build_part(self, chunk: bytes) -> bytes: 103 | part = self.boundary + self.CRLF 104 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 105 | if self.content_type is not None: 106 | part_headers["Content-Type"] = self.content_type 107 | part += self._build_part_headers(part_headers) 108 | part += self.CRLF + chunk + self.CRLF 109 | return part 110 | 111 | async def stream_response(self, send: Send) -> None: 112 | await send( 113 | { 114 | "type": "http.response.start", 115 | "status": self.status_code, 116 | "headers": self.raw_headers, 117 | } 118 | ) 119 | async for chunk in self.body_iterator: 120 | if not isinstance(chunk, bytes): 121 | chunk = chunk.encode(self.charset) 122 | chunk = b64encode(chunk) 123 | await send( 124 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 125 | ) 126 | 127 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 128 | 129 | 130 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 131 | def return_content_type(filename): 132 | if gz_uncompressed_content_type: 133 | return gz_uncompressed_content_type 134 | else: 135 | return str(mimetypes.guess_type(filename)[0]) 136 | 137 | filename = str(file.filename) if file.filename else "" 138 | if filename.endswith(".gz"): 139 | filename = filename[:-3] 140 | 141 | gzip_file = gzip.open(file.file).read() 142 | return UploadFile( 143 | file=io.BytesIO(gzip_file), 144 | size=len(gzip_file), 145 | filename=filename, 146 | headers=Headers({"content-type": return_content_type(filename)}), 147 | ) 148 | 149 | 150 | @router.post("/test-project/v1/process-text-3") 151 | @router.post("/test-project/v1.2.3/process-text-3") 152 | def pipeline_1( 153 | request: Request, 
154 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 155 | text_files: Union[List[UploadFile], None] = File(default=None), 156 | output_format: Union[str, None] = Form(default=None), 157 | ): 158 | if text_files: 159 | for file_index in range(len(text_files)): 160 | if text_files[file_index].content_type == "application/gzip": 161 | text_files[file_index] = ungz_file(text_files[file_index]) 162 | 163 | content_type = request.headers.get("Accept") 164 | 165 | default_response_type = output_format or "text/csv" 166 | if not content_type or content_type == "*/*" or content_type == "multipart/mixed": 167 | media_type = default_response_type 168 | else: 169 | media_type = content_type 170 | 171 | if isinstance(text_files, list) and len(text_files): 172 | if len(text_files) > 1: 173 | if content_type and content_type not in [ 174 | "*/*", 175 | "multipart/mixed", 176 | "application/json", 177 | "text/csv", 178 | ]: 179 | raise HTTPException( 180 | detail=( 181 | f"Conflict in media type {content_type}" 182 | ' with response type "multipart/mixed".\n' 183 | ), 184 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 185 | ) 186 | 187 | def response_generator(is_multipart): 188 | for file in text_files: 189 | get_validated_mimetype(file) 190 | 191 | text = file.file.read().decode("utf-8") 192 | 193 | response = pipeline_api( 194 | text, 195 | response_type=media_type, 196 | ) 197 | 198 | if is_expected_response_type(media_type, type(response)): 199 | raise HTTPException( 200 | detail=( 201 | f"Conflict in media type {media_type}" 202 | f" with response type {type(response)}.\n" 203 | ), 204 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 205 | ) 206 | 207 | valid_response_types = ["application/json", "text/csv", "*/*", "multipart/mixed"] 208 | if media_type in valid_response_types: 209 | if is_multipart: 210 | if type(response) not in [str, bytes]: 211 | response = json.dumps(response) 212 | elif media_type == "text/csv": 213 | response = PlainTextResponse(response) 214 | yield response 215 | else: 216 | raise HTTPException( 217 | detail=f"Unsupported media type {media_type}.\n", 218 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 219 | ) 220 | 221 | def join_responses(responses): 222 | if media_type != "text/csv": 223 | return responses 224 | data = pd.read_csv(io.BytesIO(responses[0].body)) 225 | if len(responses) > 1: 226 | for resp in responses[1:]: 227 | resp_data = pd.read_csv(io.BytesIO(resp.body)) 228 | data = data.merge(resp_data, how="outer") 229 | return PlainTextResponse(data.to_csv()) 230 | 231 | if content_type == "multipart/mixed": 232 | return MultipartMixedResponse( 233 | response_generator(is_multipart=True), content_type=media_type 234 | ) 235 | else: 236 | return ( 237 | list(response_generator(is_multipart=False))[0] 238 | if len(text_files) == 1 239 | else join_responses(list(response_generator(is_multipart=False))) 240 | ) 241 | else: 242 | raise HTTPException( 243 | detail='Request parameter "text_files" is required.\n', 244 | status_code=status.HTTP_400_BAD_REQUEST, 245 | ) 246 | 247 | 248 | app.include_router(router) 249 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_4.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 
3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | from fastapi.responses import PlainTextResponse 13 | import json 14 | from fastapi.responses import StreamingResponse 15 | from starlette.datastructures import Headers 16 | from starlette.types import Send 17 | from base64 import b64encode 18 | from typing import Optional, Mapping 19 | import secrets 20 | import pandas as pd 21 | 22 | 23 | app = FastAPI() 24 | router = APIRouter() 25 | 26 | 27 | def is_expected_response_type(media_type, response_type): 28 | if media_type == "application/json" and response_type not in [dict, list]: 29 | return True 30 | elif media_type == "text/csv" and response_type != str: 31 | return True 32 | else: 33 | return False 34 | 35 | 36 | # pipeline-api 37 | def pipeline_api( 38 | text, 39 | response_type="text/csv", 40 | response_schema="isd", 41 | ): 42 | data = pd.DataFrame( 43 | data={"silly_result": [str(len(text)), text, str(response_type), str(response_schema)]} 44 | ) 45 | if response_type == "text/csv": 46 | return data.to_csv() 47 | else: 48 | text = " : ".join(list(data["silly_result"])) 49 | return {"silly_result": text} 50 | 51 | 52 | def get_validated_mimetype(file): 53 | """ 54 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 55 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 56 | return HTTP 400 for an invalid type. 57 | """ 58 | content_type = file.content_type 59 | if not content_type or content_type == "application/octet-stream": 60 | content_type = mimetypes.guess_type(str(file.filename))[0] 61 | 62 | # Some filetypes missing for this library, just hardcode them for now 63 | if not content_type: 64 | if file.filename.endswith(".md"): 65 | content_type = "text/markdown" 66 | elif file.filename.endswith(".msg"): 67 | content_type = "message/rfc822" 68 | 69 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 70 | if allowed_mimetypes_str is not None: 71 | allowed_mimetypes = allowed_mimetypes_str.split(",") 72 | 73 | if content_type not in allowed_mimetypes: 74 | raise HTTPException( 75 | status_code=400, 76 | detail=( 77 | f"Unable to process {file.filename}: " 78 | f"File type {content_type} is not supported." 
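# NOTE: a 400 here flags an unsupported *upload* type; the 406 responses
# raised later in the route handler flag an unacceptable Accept header.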
79 | ), 80 | ) 81 | 82 | return content_type 83 | 84 | 85 | class MultipartMixedResponse(StreamingResponse): 86 | CRLF = b"\r\n" 87 | 88 | def __init__(self, *args, content_type: str = None, **kwargs): 89 | super().__init__(*args, **kwargs) 90 | self.content_type = content_type 91 | 92 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 93 | super().init_headers(headers) 94 | self.boundary_value = secrets.token_hex(16) 95 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 96 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 97 | 98 | @property 99 | def boundary(self): 100 | return b"--" + self.boundary_value.encode() 101 | 102 | def _build_part_headers(self, headers: dict) -> bytes: 103 | header_bytes = b"" 104 | for header, value in headers.items(): 105 | header_bytes += f"{header}: {value}".encode() + self.CRLF 106 | return header_bytes 107 | 108 | def build_part(self, chunk: bytes) -> bytes: 109 | part = self.boundary + self.CRLF 110 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 111 | if self.content_type is not None: 112 | part_headers["Content-Type"] = self.content_type 113 | part += self._build_part_headers(part_headers) 114 | part += self.CRLF + chunk + self.CRLF 115 | return part 116 | 117 | async def stream_response(self, send: Send) -> None: 118 | await send( 119 | { 120 | "type": "http.response.start", 121 | "status": self.status_code, 122 | "headers": self.raw_headers, 123 | } 124 | ) 125 | async for chunk in self.body_iterator: 126 | if not isinstance(chunk, bytes): 127 | chunk = chunk.encode(self.charset) 128 | chunk = b64encode(chunk) 129 | await send( 130 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 131 | ) 132 | 133 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 134 | 135 | 136 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 137 | def return_content_type(filename): 138 | if gz_uncompressed_content_type: 139 | return gz_uncompressed_content_type 140 | else: 141 | return str(mimetypes.guess_type(filename)[0]) 142 | 143 | filename = str(file.filename) if file.filename else "" 144 | if filename.endswith(".gz"): 145 | filename = filename[:-3] 146 | 147 | gzip_file = gzip.open(file.file).read() 148 | return UploadFile( 149 | file=io.BytesIO(gzip_file), 150 | size=len(gzip_file), 151 | filename=filename, 152 | headers=Headers({"content-type": return_content_type(filename)}), 153 | ) 154 | 155 | 156 | @router.post("/test-project/v1/process-text-4") 157 | @router.post("/test-project/v1.2.3/process-text-4") 158 | def pipeline_1( 159 | request: Request, 160 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 161 | text_files: Union[List[UploadFile], None] = File(default=None), 162 | output_format: Union[str, None] = Form(default=None), 163 | output_schema: str = Form(default=None), 164 | ): 165 | if text_files: 166 | for file_index in range(len(text_files)): 167 | if text_files[file_index].content_type == "application/gzip": 168 | text_files[file_index] = ungz_file(text_files[file_index]) 169 | 170 | content_type = request.headers.get("Accept") 171 | 172 | default_response_type = output_format or "text/csv" 173 | if not content_type or content_type == "*/*" or content_type == "multipart/mixed": 174 | media_type = default_response_type 175 | else: 176 | media_type = content_type 177 | 178 | default_response_schema = output_schema or "isd" 179 | 180 | if 
isinstance(text_files, list) and len(text_files): 181 | if len(text_files) > 1: 182 | if content_type and content_type not in [ 183 | "*/*", 184 | "multipart/mixed", 185 | "application/json", 186 | "text/csv", 187 | ]: 188 | raise HTTPException( 189 | detail=( 190 | f"Conflict in media type {content_type}" 191 | ' with response type "multipart/mixed".\n' 192 | ), 193 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 194 | ) 195 | 196 | def response_generator(is_multipart): 197 | for file in text_files: 198 | get_validated_mimetype(file) 199 | 200 | text = file.file.read().decode("utf-8") 201 | 202 | response = pipeline_api( 203 | text, 204 | response_type=media_type, 205 | response_schema=default_response_schema, 206 | ) 207 | 208 | if is_expected_response_type(media_type, type(response)): 209 | raise HTTPException( 210 | detail=( 211 | f"Conflict in media type {media_type}" 212 | f" with response type {type(response)}.\n" 213 | ), 214 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 215 | ) 216 | 217 | valid_response_types = ["application/json", "text/csv", "*/*", "multipart/mixed"] 218 | if media_type in valid_response_types: 219 | if is_multipart: 220 | if type(response) not in [str, bytes]: 221 | response = json.dumps(response) 222 | elif media_type == "text/csv": 223 | response = PlainTextResponse(response) 224 | yield response 225 | else: 226 | raise HTTPException( 227 | detail=f"Unsupported media type {media_type}.\n", 228 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 229 | ) 230 | 231 | def join_responses(responses): 232 | if media_type != "text/csv": 233 | return responses 234 | data = pd.read_csv(io.BytesIO(responses[0].body)) 235 | if len(responses) > 1: 236 | for resp in responses[1:]: 237 | resp_data = pd.read_csv(io.BytesIO(resp.body)) 238 | data = data.merge(resp_data, how="outer") 239 | return PlainTextResponse(data.to_csv()) 240 | 241 | if content_type == "multipart/mixed": 242 | return MultipartMixedResponse( 243 | response_generator(is_multipart=True), content_type=media_type 244 | ) 245 | else: 246 | return ( 247 | list(response_generator(is_multipart=False))[0] 248 | if len(text_files) == 1 249 | else join_responses(list(response_generator(is_multipart=False))) 250 | ) 251 | else: 252 | raise HTTPException( 253 | detail='Request parameter "text_files" is required.\n', 254 | status_code=status.HTTP_400_BAD_REQUEST, 255 | ) 256 | 257 | 258 | app.include_router(router) 259 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_1.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 
3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import io 7 | import os 8 | import gzip 9 | import mimetypes 10 | from typing import List, Union 11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException 12 | import json 13 | from fastapi.responses import StreamingResponse 14 | from starlette.datastructures import Headers 15 | from starlette.types import Send 16 | from base64 import b64encode 17 | from typing import Optional, Mapping 18 | import secrets 19 | 20 | 21 | app = FastAPI() 22 | router = APIRouter() 23 | 24 | 25 | # pipeline-api 26 | def pipeline_api( 27 | text, 28 | file=None, 29 | filename=None, 30 | file_content_type=None, 31 | ): 32 | return { 33 | "silly_result": " : ".join( 34 | [ 35 | str(len(text if text else "")), 36 | str(text), 37 | str(len(file.read()) if file else None), 38 | str(filename), 39 | str(file_content_type), 40 | ] 41 | ) 42 | } 43 | 44 | 45 | def get_validated_mimetype(file): 46 | """ 47 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too 48 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and 49 | return HTTP 400 for an invalid type. 50 | """ 51 | content_type = file.content_type 52 | if not content_type or content_type == "application/octet-stream": 53 | content_type = mimetypes.guess_type(str(file.filename))[0] 54 | 55 | # Some filetypes missing for this library, just hardcode them for now 56 | if not content_type: 57 | if file.filename.endswith(".md"): 58 | content_type = "text/markdown" 59 | elif file.filename.endswith(".msg"): 60 | content_type = "message/rfc822" 61 | 62 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") 63 | if allowed_mimetypes_str is not None: 64 | allowed_mimetypes = allowed_mimetypes_str.split(",") 65 | 66 | if content_type not in allowed_mimetypes: 67 | raise HTTPException( 68 | status_code=400, 69 | detail=( 70 | f"Unable to process {file.filename}: " 71 | f"File type {content_type} is not supported." 
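# NOTE: in this combined pipeline only the "files" uploads pass through
# this check; "text_files" are read below without mimetype validation.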
72 | ), 73 | ) 74 | 75 | return content_type 76 | 77 | 78 | class MultipartMixedResponse(StreamingResponse): 79 | CRLF = b"\r\n" 80 | 81 | def __init__(self, *args, content_type: str = None, **kwargs): 82 | super().__init__(*args, **kwargs) 83 | self.content_type = content_type 84 | 85 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 86 | super().init_headers(headers) 87 | self.boundary_value = secrets.token_hex(16) 88 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 89 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 90 | 91 | @property 92 | def boundary(self): 93 | return b"--" + self.boundary_value.encode() 94 | 95 | def _build_part_headers(self, headers: dict) -> bytes: 96 | header_bytes = b"" 97 | for header, value in headers.items(): 98 | header_bytes += f"{header}: {value}".encode() + self.CRLF 99 | return header_bytes 100 | 101 | def build_part(self, chunk: bytes) -> bytes: 102 | part = self.boundary + self.CRLF 103 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"} 104 | if self.content_type is not None: 105 | part_headers["Content-Type"] = self.content_type 106 | part += self._build_part_headers(part_headers) 107 | part += self.CRLF + chunk + self.CRLF 108 | return part 109 | 110 | async def stream_response(self, send: Send) -> None: 111 | await send( 112 | { 113 | "type": "http.response.start", 114 | "status": self.status_code, 115 | "headers": self.raw_headers, 116 | } 117 | ) 118 | async for chunk in self.body_iterator: 119 | if not isinstance(chunk, bytes): 120 | chunk = chunk.encode(self.charset) 121 | chunk = b64encode(chunk) 122 | await send( 123 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True} 124 | ) 125 | 126 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 127 | 128 | 129 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: 130 | def return_content_type(filename): 131 | if gz_uncompressed_content_type: 132 | return gz_uncompressed_content_type 133 | else: 134 | return str(mimetypes.guess_type(filename)[0]) 135 | 136 | filename = str(file.filename) if file.filename else "" 137 | if filename.endswith(".gz"): 138 | filename = filename[:-3] 139 | 140 | gzip_file = gzip.open(file.file).read() 141 | return UploadFile( 142 | file=io.BytesIO(gzip_file), 143 | size=len(gzip_file), 144 | filename=filename, 145 | headers=Headers({"content-type": return_content_type(filename)}), 146 | ) 147 | 148 | 149 | @router.post("/test-project/v1/process-text-file-1") 150 | @router.post("/test-project/v1.2.3/process-text-file-1") 151 | def pipeline_1( 152 | request: Request, 153 | gz_uncompressed_content_type: Optional[str] = Form(default=None), 154 | files: Union[List[UploadFile], None] = File(default=None), 155 | text_files: Union[List[UploadFile], None] = File(default=None), 156 | ): 157 | if files: 158 | for file_index in range(len(files)): 159 | if files[file_index].content_type == "application/gzip": 160 | files[file_index] = ungz_file(files[file_index], gz_uncompressed_content_type) 161 | 162 | if text_files: 163 | for file_index in range(len(text_files)): 164 | if text_files[file_index].content_type == "application/gzip": 165 | text_files[file_index] = ungz_file(text_files[file_index]) 166 | 167 | content_type = request.headers.get("Accept") 168 | 169 | has_text = isinstance(text_files, list) and len(text_files) 170 | has_files = isinstance(files, list) and len(files) 171 | if 
not has_text and not has_files: 172 | raise HTTPException( 173 | detail='One of the request parameters "text_files" or "files" is required.\n', 174 | status_code=status.HTTP_400_BAD_REQUEST, 175 | ) 176 | files_list: List = files or [] 177 | text_files_list: List = text_files or [] 178 | 179 | if len(files_list) or len(text_files_list): 180 | if all( 181 | [ 182 | content_type, 183 | content_type not in ["*/*", "multipart/mixed", "application/json", "text/csv"], 184 | len(files_list) + len(text_files_list) > 1, 185 | ] 186 | ): 187 | raise HTTPException( 188 | detail=( 189 | f"Conflict in media type {content_type}" 190 | ' with response type "multipart/mixed".\n' 191 | ), 192 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 193 | ) 194 | 195 | def response_generator(is_multipart): 196 | for text_file in text_files_list: 197 | text = text_file.file.read().decode("utf-8") 198 | 199 | response = pipeline_api( 200 | text=text, 201 | file=None, 202 | ) 203 | 204 | if is_multipart: 205 | if type(response) not in [str, bytes]: 206 | response = json.dumps(response) 207 | yield response 208 | 209 | for file in files_list: 210 | _file = file.file 211 | 212 | file_content_type = get_validated_mimetype(file) 213 | 214 | response = pipeline_api( 215 | text=None, 216 | file=_file, 217 | filename=file.filename, 218 | file_content_type=file_content_type, 219 | ) 220 | 221 | if is_multipart: 222 | if type(response) not in [str, bytes]: 223 | response = json.dumps(response) 224 | yield response 225 | 226 | if content_type == "multipart/mixed": 227 | return MultipartMixedResponse( 228 | response_generator(is_multipart=True), 229 | ) 230 | else: 231 | return ( 232 | list(response_generator(is_multipart=False))[0] 233 | if len(files_list + text_files_list) == 1 234 | else response_generator(is_multipart=False) 235 | ) 236 | else: 237 | raise HTTPException( 238 | detail='Request parameters "files" or "text_files" are required.\n', 239 | status_code=status.HTTP_400_BAD_REQUEST, 240 | ) 241 | 242 | 243 | app.include_router(router) 244 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/preprocessing-pipeline-family.yaml: -------------------------------------------------------------------------------- 1 | name: test-project 2 | version: 1.2.3 3 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/scripts/check-and-format-notebooks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from copy import deepcopy 5 | import difflib 6 | import json 7 | from pathlib import Path 8 | import sys 9 | from typing import List, Tuple, Union 10 | 11 | from nbdev import clean 12 | from nbconvert.preprocessors import ExecutePreprocessor 13 | import nbformat 14 | from unstructured_api_tools.pipelines.convert import read_notebook 15 | 16 | 17 | def process_nb(nb: nbformat.NotebookNode, working_dir: Union[str, Path]) -> nbformat.NotebookNode: 18 | """Execute cells in nb using working_dir as the working directory for imports, modifying the 19 | notebook in place (in memory).""" 20 | ep = ExecutePreprocessor(timeout=600) 21 | ep.preprocess(nb, {"metadata": {"path": working_dir}}) 22 | return nb 23 | 24 | 25 | def nb_paths(root_path: Union[str, Path]) -> List[Path]: 26 | """Fetches all .ipynb filenames that belong to subdirectories of root_path (1 level deep) with 27 | 'notebooks' in the 
name.""" 28 | root_path = Path(root_path) 29 | return [ 30 | fn 31 | for dir in root_path.iterdir() 32 | # NOTE(alan): Search only in paths with 'notebooks' in the title such as pipeline-notebooks 33 | # and exploration-notebooks 34 | if "notebooks" in dir.stem and dir.is_dir() 35 | for fn in dir.iterdir() 36 | if fn.suffix == ".ipynb" 37 | ] 38 | 39 | 40 | def to_results_str(fns: List[Path], nonmatching_nbs: List[Path]) -> Tuple[str, str]: 41 | """Given files that were checked and list of files that would be changed, produces a summary of 42 | changes as well as a list of files to be changed""" 43 | unchanged = len(fns) - len(nonmatching_nbs) 44 | results = [] 45 | if nonmatching_nbs: 46 | results.append( 47 | f"{len(nonmatching_nbs)} " 48 | f"{'file' if len(nonmatching_nbs) == 1 else 'files'} " 49 | f"{'would be ' if check else ''}changed" 50 | ) 51 | if unchanged: 52 | results.append( 53 | f"{unchanged} " 54 | f"{'file' if unchanged == 1 else 'files'} " 55 | f"{'would be ' if check else ''}left unchanged" 56 | ) 57 | summary_str = ", ".join(results) + ".\n" 58 | if nonmatching_nbs: 59 | details_str = ( 60 | f"The following notebooks {'would have been' if check else 'were'} " 61 | "changed when executed and cleaned:\n* " + "\n* ".join(nonmatching_nbs) + "\n" 62 | ) 63 | else: 64 | details_str = "" 65 | 66 | return summary_str, details_str 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser() 71 | parser.add_argument( 72 | "--check", 73 | default=False, 74 | action="store_true", 75 | help="Check notebook format without making changes. Return code 0 means formatting would " 76 | "produce no changes. Return code 1 means some files would be changed.", 77 | ) 78 | parser.add_argument( 79 | "notebooks", 80 | metavar="notebook", 81 | nargs="*", 82 | help="Path(s) to notebook(s) to format (or check). 
If you don't pass any paths, " 83 | "notebooks in any subfolders with 'notebooks' in the name will be processed.", 84 | default=[], 85 | ) 86 | args = parser.parse_args() 87 | check = args.check 88 | notebooks = args.notebooks 89 | 90 | root_path = Path(__file__).parent.parent 91 | nonmatching_nbs = [] 92 | fns = notebooks if notebooks else nb_paths(root_path) 93 | for fn in fns: 94 | nb = read_notebook(fn) 95 | modified_nb = deepcopy(nb) 96 | process_nb(modified_nb, root_path) 97 | clean.clean_nb(modified_nb, allowed_cell_metadata_keys=["tags"]) 98 | if nb != modified_nb: 99 | nonmatching_nbs.append(str(fn)) 100 | nb_json = json.dumps(nb.dict(), indent=2, sort_keys=True) 101 | modified_nb_json = json.dumps(modified_nb.dict(), indent=2, sort_keys=True) 102 | sys.stderr.write(f"The following diff shows the modifications made to {fn}\n") 103 | sys.stderr.writelines( 104 | ( 105 | difflib.unified_diff( 106 | nb_json.splitlines(keepends=True), 107 | modified_nb_json.splitlines(keepends=True), 108 | ) 109 | ) 110 | ) 111 | if not check: 112 | nbformat.write(modified_nb, fn) 113 | 114 | summary_str, details_str = to_results_str(fns, nonmatching_nbs) 115 | print(summary_str) 116 | if check: 117 | sys.stderr.write(details_str) 118 | if nonmatching_nbs: 119 | sys.exit(1) 120 | else: 121 | print(details_str) 122 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipeline-test-project/scripts/test-doc-pipeline-apis-consistent.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -eu -o pipefail 4 | 5 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 6 | cd "$SCRIPT_DIR"/.. 7 | 8 | PIPELINE_OUTPUT_DIR=tmp-api-check-output-$RANDOM 9 | FILE_INDICATING_FAILURE="$PIPELINE_OUTPUT_DIR"-has-failures 10 | mkdir -p $PIPELINE_OUTPUT_DIR 11 | touch $PIPELINE_OUTPUT_DIR/__init__.py 12 | 13 | function tmp_pipeline_comp_cleanup () { 14 | cd "$SCRIPT_DIR"/.. 15 | rm -f "$FILE_INDICATING_FAILURE" 16 | if [[ "$1" -eq 0 ]]; then 17 | rm -rf $PIPELINE_OUTPUT_DIR 18 | fi 19 | exit "$1" 20 | } 21 | 22 | # Now in project root 23 | cd ../.. 24 | 25 | PYTHONPATH=. PIPELINE_FAMILY_CONFIG=test_unstructured_api_tools/pipeline-test-project/preprocessing-pipeline-family.yaml \ 26 | python3 ./unstructured_api_tools/cli.py convert-pipeline-notebooks \ 27 | --input-directory ./test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks \ 28 | --output-directory ./test_unstructured_api_tools/pipeline-test-project/"$PIPELINE_OUTPUT_DIR" 29 | 30 | # Back in the test project 31 | cd - 32 | 33 | NUM_PIPELINE_API_FILES_GENERATED=$(find "$PIPELINE_OUTPUT_DIR" -name "*.py" | wc -l) 34 | 35 | if [[ "$NUM_PIPELINE_API_FILES_GENERATED" -eq 0 ]]; then 36 | echo "No pipelines were created by unstructured_api_tools convert-pipeline-notebooks" 37 | tmp_pipeline_comp_cleanup 1 38 | fi 39 | 40 | NUM_EXISTING_PIPELINE_API_FILES=$(find "$PACKAGE_NAME"/api -name "*.py" | wc -l) 41 | 42 | if [[ "$NUM_PIPELINE_API_FILES_GENERATED" -gt "$NUM_EXISTING_PIPELINE_API_FILES" ]]; then 43 | echo "More pipeline api files were autogenerated than appear in the ${PACKAGE_NAME}/api" 44 | tmp_pipeline_comp_cleanup 1 45 | elif [[ "$NUM_PIPELINE_API_FILES_GENERATED" -lt "$NUM_EXISTING_PIPELINE_API_FILES" ]]; then 46 | echo "Fewer pipeline api files were autogenerated than appear in the ${PACKAGE_NAME}/api" 47 | tmp_pipeline_comp_cleanup 1 48 | fi 49 | 50 | cd "$PACKAGE_NAME"/api 51 | find . 
-name "*.py" -print0 | while IFS= read -r -d '' pipeline_file; do 52 | set +o pipefail 53 | if ! diff -u "$pipeline_file" ../../"$PIPELINE_OUTPUT_DIR/$pipeline_file"; then 54 | touch "../../$FILE_INDICTATING_FAILURE" 55 | fi 56 | set -o pipefail 57 | done 58 | cd - 59 | 60 | if [ -r "$FILE_INDICTATING_FAILURE" ]; then 61 | echo 62 | echo "Autogenerated pipeline api file(s) do not match existing versions, see above for diff's" 63 | echo " or run: diff -ru ${PACKAGE_NAME}/api/ ${PIPELINE_OUTPUT_DIR}/" 64 | tmp_pipeline_comp_cleanup 1 65 | fi 66 | tmp_pipeline_comp_cleanup 0 67 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipelines/test_api_conventions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import yaml 4 | 5 | import unstructured_api_tools.pipelines.api_conventions as conventions 6 | 7 | 8 | @pytest.fixture 9 | def sample_config(): 10 | return {"version": "0.2.1", "name": "sec_filings"} 11 | 12 | 13 | @pytest.mark.parametrize( 14 | # NOTE(yuming): Test cases ref: https://regex101.com/r/Ly7O1x/3/ 15 | "invalid_semver_string", 16 | [ 17 | "1", 18 | "1.2", 19 | "1.2.3-0123", 20 | "1.2.3-0123.0123", 21 | "1.1.2+.123", 22 | "+invalid", 23 | "-invalid", 24 | "-invalid+invalid", 25 | "-invalid.01", 26 | "alpha", 27 | "alpha.beta", 28 | "alpha.beta.1", 29 | "alpha.1", 30 | "alpha+beta", 31 | "alpha_beta", 32 | "alpha.", 33 | "alpha..", 34 | "beta", 35 | "1.0.0-alpha_beta", 36 | "-alpha.", 37 | "1.0.0-alpha..", 38 | "1.0.0-alpha..1", 39 | "1.0.0-alpha...1", 40 | "1.0.0-alpha....1", 41 | "1.0.0-alpha.....1", 42 | "1.0.0-alpha......1", 43 | "1.0.0-alpha.......1", 44 | "01.1.1", 45 | "1.01.1", 46 | "1.1.01", 47 | "1.2", 48 | "1.2.3.DEV", 49 | "1.2-SNAPSHOT", 50 | "1.2.31.2.3----RC-SNAPSHOT.12.09.1--..12+788", 51 | "1.2-RC-SNAPSHOT", 52 | "-1.0.3-gamma+b7718", 53 | "+justmeta", 54 | "9.8.7+meta+meta", 55 | "9.8.7-whatever+meta+meta", 56 | "9999999999999.999999999999999999.999999999----RC-SNAPSHOT.12.09.1---------..12", 57 | ], 58 | ) 59 | def test_raise_for_invalid_semver_string(invalid_semver_string): 60 | with pytest.raises(ValueError): 61 | conventions.raise_for_invalid_semver_string(invalid_semver_string) 62 | 63 | 64 | @pytest.mark.parametrize( 65 | # NOTE(yuming): Test cases ref: https://regex101.com/r/Ly7O1x/3/ 66 | "valid_semver_string", 67 | [ 68 | "0.0.4", 69 | "1.2.3", 70 | "10.20.30", 71 | "1.1.2-prerelease+meta", 72 | "1.1.2+meta", 73 | "1.1.2+meta-valid", 74 | "1.0.0-alpha", 75 | "1.0.0-beta", 76 | "1.0.0-alpha.beta", 77 | "1.0.0-alpha.beta.1", 78 | "1.0.0-alpha.1", 79 | "1.0.0-alpha0.valid", 80 | "1.0.0-alpha.0valid", 81 | "1.0.0-alpha-a.b-c-somethinglong+build.1-aef.1-its-okay", 82 | "1.0.0-rc.1+build.1", 83 | "2.0.0-rc.1+build.123", 84 | "1.2.3-beta", 85 | "10.2.3-DEV-SNAPSHOT", 86 | "1.2.3-SNAPSHOT-123", 87 | "1.0.0", 88 | "2.0.0", 89 | "1.1.7", 90 | "2.0.0+build.1848", 91 | "2.0.1-alpha.1227", 92 | "1.0.0-alpha+beta", 93 | "1.2.3----RC-SNAPSHOT.12.9.1--.12+788", 94 | "1.2.3----R-S.12.9.1--.12+meta", 95 | "1.2.3----RC-SNAPSHOT.12.9.1--.12", 96 | "1.0.0+0.build.1-rc.10000aaa-kk-0.1", 97 | "99999999999999999999999.999999999999999999.99999999999999999", 98 | "1.0.0-0A.is.legal", 99 | ], 100 | ) 101 | def test_pass_for_valid_semver_string(valid_semver_string): 102 | try: 103 | conventions.raise_for_invalid_semver_string(valid_semver_string) 104 | except ValueError: 105 | assert False, f"{valid_semver_string} raised an exception." 
106 | 107 | 108 | def test_get_pipeline_path(): 109 | path = conventions.get_pipeline_path( 110 | filename="risk_narrative.py", pipeline_family="sec_filings", semver="0.2.1" 111 | ) 112 | assert path == "/sec-filings/v0.2.1/risk-narrative" 113 | 114 | 115 | def test_get_short_pipeline_path(): 116 | path = conventions.get_pipeline_path( 117 | filename="risk_narrative.py", 118 | pipeline_family="sec_filings", 119 | semver="0.2.1", 120 | shorter=True, 121 | ) 122 | 123 | assert path == "/sec-filings/v0/risk-narrative" 124 | 125 | 126 | def test_get_pipeline_path_raises_if_either_not_specified(): 127 | with pytest.raises(ValueError): 128 | conventions.get_pipeline_path( 129 | filename="risk_narrative.py", pipeline_family="sec_filings", semver=None 130 | ) 131 | 132 | with pytest.raises(ValueError): 133 | conventions.get_pipeline_path( 134 | filename="risk_narrative.py", pipeline_family=None, semver="0.2.1" 135 | ) 136 | 137 | 138 | def test_get_pipeline_path_reads_from_file(tmpdir, sample_config): 139 | filename = os.path.join(tmpdir.dirname, "pipeline-family.yaml") 140 | with open(filename, "w") as f: 141 | yaml.dump(sample_config, f) 142 | 143 | path = conventions.get_pipeline_path(filename="risk_narrative.py", config_filename=filename) 144 | assert path == "/sec-filings/v0.2.1/risk-narrative" 145 | 146 | 147 | def test_pipeline_config_reads_from_file(tmpdir, sample_config): 148 | filename = os.path.join(tmpdir.dirname, "pipeline-family.yaml") 149 | with open(filename, "w") as f: 150 | yaml.dump(sample_config, f) 151 | 152 | config = conventions.PipelineConfig(filename=filename) 153 | assert config.name == "sec_filings" 154 | assert config.version == "0.2.1" 155 | 156 | 157 | def test_pipeline_config_reads_from_env(tmpdir, monkeypatch, sample_config): 158 | filename = os.path.join(tmpdir.dirname, "pipeline-family.yaml") 159 | with open(filename, "w") as f: 160 | yaml.dump(sample_config, f) 161 | 162 | monkeypatch.setenv("PIPELINE_FAMILY_CONFIG", filename) 163 | 164 | config = conventions.PipelineConfig(filename=None) 165 | assert config.name == "sec_filings" 166 | 167 | 168 | def test_pipeline_config_raises_with_missing_file(tmpdir, monkeypatch, sample_config): 169 | # NOTE(robinson) - Will default to looking for ${PWD}/preprocessing-pipeline-family.yaml, 170 | # which does not exist 171 | with pytest.raises(FileNotFoundError): 172 | conventions.PipelineConfig(filename=None) 173 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/pipelines/test_lint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import re 4 | from unittest.mock import patch 5 | 6 | import unstructured_api_tools.pipelines.lint as lint 7 | 8 | 9 | class MockPopen: 10 | def __init__(self, *args, **kwargs): 11 | pass 12 | 13 | def communicate(self, *args, **kwargs): 14 | raise ValueError("Squawk!") 15 | 16 | 17 | def test_run_lint_cmd_cleans_up_on_exception(monkeypatch): 18 | monkeypatch.setattr(lint, "Popen", MockPopen) 19 | with patch.object(os, "unlink", return_value=None) as mock_unlink: 20 | with pytest.raises(ValueError): 21 | lint._run_lint_cmd(["fake"], "fake.py", re.compile("[A-Z]")) 22 | 23 | mock_unlink.assert_called_once() 24 | 25 | 26 | def test_flake8(): 27 | file_text = """# A test file 28 | 29 | def hello_world(): 30 | pass 31 | """ 32 | assert lint.check_flake8(file_text) is True 33 | 34 | 35 | def test_flake8_passes_with_unused_import(): 36 | file_text = """# A test file 37 | import os 38 | 39 
| 40 | def hello_world(): 41 | pass 42 | """ 43 | assert lint.check_flake8(file_text) is True 44 | 45 | 46 | def test_flake8_raises_with_bad_lint(): 47 | file_text = """# A test file 48 | 49 | def hello_world() : 50 | pass""" 51 | with pytest.raises(lint.LintError): 52 | lint.check_flake8(file_text) 53 | 54 | 55 | def test_format_black(): 56 | file_text = """# A test file 57 | 58 | def hello_world() : 59 | pass 60 | """ 61 | formatted_text = lint.format_black(file_text) 62 | 63 | assert ( 64 | formatted_text 65 | == """# A test file 66 | 67 | 68 | def hello_world(): 69 | pass 70 | """ 71 | ) 72 | 73 | 74 | def test_validate_flake8_ignore(): 75 | assert lint.validate_flake8_ignore("E405, F401") is True 76 | 77 | 78 | def test_validate_flake8_ignore_bad_input(): 79 | with pytest.raises(ValueError): 80 | lint.validate_flake8_ignore("NOT A REAL CODE") 81 | 82 | 83 | def test_mypy(): 84 | file_text = """# A test file 85 | 86 | def hello_world(text: str) -> str: 87 | return text 88 | """ 89 | assert lint.check_mypy(file_text) is True 90 | 91 | 92 | def test_mypy_raises_with_bad_type(): 93 | file_text = """# A test file 94 | 95 | def hello_world(text: str) -> str: 96 | return int(text) 97 | """ 98 | with pytest.raises(lint.LintError): 99 | lint.check_mypy(file_text) 100 | 101 | 102 | def test_check_black(): 103 | file_text = """# A test file 104 | 105 | 106 | def hello_world(): 107 | pass 108 | """ 109 | assert lint.check_black(file_text) is True 110 | 111 | 112 | def test_check_black_raises_with_bad_format(): 113 | file_text = """# A test file 114 | 115 | 116 | def hello_world() : 117 | pass 118 | """ 119 | with pytest.raises(lint.LintError): 120 | lint.check_black(file_text) 121 | -------------------------------------------------------------------------------- /test_unstructured_api_tools/test_cli.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import pytest 4 | 5 | from click.testing import CliRunner 6 | from nbformat import NotebookNode 7 | 8 | import unstructured_api_tools.cli as cli 9 | 10 | 11 | @pytest.fixture 12 | def sample_notebook(): 13 | return NotebookNode( 14 | { 15 | "cells": [ 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "id": "768fa8c6", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": "# pipeline-api\nimport random", # noqa: E501 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "id": "64f6386b", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": "def function_not_to_include():\n pass", 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "id": "45988caf", 35 | "metadata": {}, 36 | "source": "# pipeline-api", 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "id": "c8e0cad6", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": "# pipeline-api\ndef pipeline_api(text: str):\n sec_document = 'not a real document'\n risk_narrative = sec_document[0:5]\n return risk_narrative", # noqa: E501 45 | }, 46 | ], 47 | "metadata": { 48 | "kernelspec": { 49 | "display_name": "Python 3 (ipykernel)", 50 | "language": "python", 51 | "name": "python3", 52 | }, 53 | "language_info": { 54 | "codemirror_mode": {"name": "ipython", "version": 3}, 55 | "file_extension": ".py", 56 | "mimetype": "text/x-python", 57 | "name": "python", 58 | "nbconvert_exporter": "python", 59 | "pygments_lexer": "ipython3", 60 | "version": "3.8.13", 61 | }, 62 | }, 63 | "nbformat": 4, 64 | "nbformat_minor": 5, 65 | } 66 | ) 67 | 68 | 69 | def 
test_convert_pipeline_notebooks(sample_notebook, tmpdir): 70 | for i in range(5): 71 | filename = os.path.join(tmpdir.dirname, f"pipeline-this-is-a-test-{i}.ipynb") 72 | with open(filename, "w") as f: 73 | json.dump(sample_notebook, f, indent=4) 74 | 75 | runner = CliRunner() 76 | result = runner.invoke( 77 | cli.cli, 78 | [ 79 | "convert-pipeline-notebooks", 80 | "--input-directory", 81 | tmpdir.dirname, 82 | "--output-directory", 83 | tmpdir.dirname, 84 | "--pipeline-family", 85 | "fake-family-name", 86 | "--semver", 87 | "2.1.1", 88 | ], 89 | ) 90 | assert result.exit_code == 0 91 | 92 | files = os.listdir(tmpdir.dirname) 93 | for i in range(5): 94 | assert f"this_is_a_test_{i}.py" in files 95 | assert "app.py" in files 96 | 97 | 98 | def test_convert_pipeline_notebooks_passing_flake8_ignore(sample_notebook, tmpdir): 99 | for i in range(5): 100 | filename = os.path.join(tmpdir.dirname, f"pipeline-this-is-a-test-{i}.ipynb") 101 | with open(filename, "w") as f: 102 | json.dump(sample_notebook, f, indent=4) 103 | 104 | runner = CliRunner() 105 | result = runner.invoke( 106 | cli.cli, 107 | [ 108 | "convert-pipeline-notebooks", 109 | "--input-directory", 110 | tmpdir.dirname, 111 | "--output-directory", 112 | tmpdir.dirname, 113 | "--pipeline-family", 114 | "fake-family-name", 115 | "--semver", 116 | "2.1.1", 117 | "--flake8-ignore", 118 | "E402, F401", 119 | ], 120 | ) 121 | assert result.exit_code == 0 122 | 123 | files = os.listdir(tmpdir.dirname) 124 | for i in range(5): 125 | assert f"this_is_a_test_{i}.py" in files 126 | assert "app.py" in files 127 | -------------------------------------------------------------------------------- /unstructured_api_tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/unstructured_api_tools/__init__.py -------------------------------------------------------------------------------- /unstructured_api_tools/__version__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.10.11" # pragma: no cover 2 | -------------------------------------------------------------------------------- /unstructured_api_tools/cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional 3 | 4 | import click 5 | 6 | from unstructured_api_tools.pipelines.convert import convert_notebook_files_to_api 7 | from unstructured_api_tools.pipelines.lint import ( 8 | FLAKE8_DEFAULT_OPTS, 9 | validate_flake8_ignore, 10 | ) 11 | 12 | 13 | @click.group() 14 | def cli(): 15 | pass 16 | 17 | 18 | @cli.command() 19 | @click.option("--input-directory") 20 | @click.option("--output-directory") 21 | @click.option("--pipeline-family") 22 | @click.option("--semver") 23 | @click.option("--config-filename") 24 | @click.option("--flake8-ignore") 25 | def convert_pipeline_notebooks( 26 | input_directory: str, 27 | output_directory: str, 28 | pipeline_family: Optional[str] = None, 29 | semver: Optional[str] = None, 30 | config_filename: Optional[str] = None, 31 | flake8_ignore: Optional[str] = None, 32 | ): 33 | """Convert a pipeline notebook to a Python script. 
The conversion script will retain 34 | any cell that includes # pipeline-api at the top.""" 35 | notebook_filenames = sorted([f for f in os.listdir(input_directory) if f.endswith(".ipynb")]) 36 | 37 | if flake8_ignore: 38 | validate_flake8_ignore(flake8_ignore) 39 | # NOTE(robinson) - Not making line length configurable because setting it to 40 | # 100 allows flake8 to be consistent with black 41 | flake8_opts = ["--max-line-length", "100", "--ignore", flake8_ignore] 42 | else: 43 | flake8_opts = FLAKE8_DEFAULT_OPTS 44 | 45 | convert_notebook_files_to_api( 46 | notebook_filenames, 47 | input_directory, 48 | output_directory, 49 | pipeline_family=pipeline_family, 50 | semver=semver, 51 | config_filename=config_filename, 52 | flake8_opts=flake8_opts, 53 | ) 54 | 55 | 56 | if __name__ == "__main__": 57 | cli() # pragma: nocover 58 | -------------------------------------------------------------------------------- /unstructured_api_tools/pipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/unstructured_api_tools/pipelines/__init__.py -------------------------------------------------------------------------------- /unstructured_api_tools/pipelines/api_conventions.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import os 3 | from typing import Optional 4 | import yaml 5 | import re 6 | 7 | 8 | def get_config(filename: Optional[str] = None): 9 | if filename is None: 10 | default = os.path.join(os.getcwd(), "preprocessing-pipeline-family.yaml") 11 | filename = os.environ.get("PIPELINE_FAMILY_CONFIG", default) 12 | 13 | if not os.path.exists(filename): 14 | raise FileNotFoundError( 15 | f"A pipeline family config was not found at {filename}. " 16 | "The config class looks for the config in the following " 17 | "order:\n" 18 | " 1. The filename parameter\n" 19 | " 2. The PIPELINE_FAMILY_CONFIG environment variable\n" 20 | ' 3. "${PWD}"/preprocessing-pipeline-family.yaml' 21 | ) 22 | 23 | with open(filename, "r") as f: 24 | config = yaml.safe_load(f) 25 | 26 | return config 27 | 28 | 29 | @dataclass 30 | class PipelineConfig: 31 | name: str 32 | version: str 33 | description: str 34 | long_description: str 35 | filename: str 36 | 37 | def __init__(self, filename: Optional[str] = None): 38 | """Parses pipeline family metadata from the preprocessing-pipeline-family.yaml file. If no 39 | filename is passed, falls back to the PIPELINE_FAMILY_CONFIG environment variable and 40 | otherwise looks for preprocessing-pipeline-family.yaml in the working directory.""" 41 | config = get_config(filename) 42 | 43 | self.name = config["name"] 44 | self.version = config["version"] 45 | self.description = config.get("description", "Unstructured Pipeline API") 46 | self.long_description = config.get("long_description", "") 47 | 48 | 49 | def raise_for_invalid_semver_string(semver: str): 50 | """Raise an error if the semver string is invalid.""" 51 | # NOTE(yuming): Suggested regular expression (RegEx) to check a semver string 52 | # ref: https://semver.org/#is-there-a-suggested-regular-expression 53 | # -regex-to-check-a-semver-string 54 | valid_semver_pattern = r"""^(?P<major>0|[1-9]\d*)\.(?P<minor>0|[1-9]\d*)\. 55 | (?P<patch>0|[1-9]\d*)(?:-(?P<prerelease>(?:0|[1-9]\d*|\d*[a-zA-Z-] 56 | [0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))? 
57 | (?:\+(?P<buildmetadata>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$""" 58 | valid_semver_re = re.compile(valid_semver_pattern, re.VERBOSE) 59 | 60 | if not re.match(valid_semver_re, semver): 61 | raise ValueError(f"{semver} is not a valid semver string.") 62 | 63 | 64 | def get_pipeline_path( 65 | filename: str, 66 | pipeline_family: Optional[str] = None, 67 | semver: Optional[str] = None, 68 | config_filename: Optional[str] = None, 69 | shorter: Optional[bool] = False, 70 | ) -> str: 71 | """Builds the pipeline path according to the conventions outlined in the architecture docs. 72 | ref: https://github.com/Unstructured-IO/ 73 | docs-and-arch/blob/main/Pipelines-and-APIs.md#api-specification 74 | """ 75 | if any([pipeline_family, semver]) and not all([pipeline_family, semver]): 76 | raise ValueError( 77 | "If either pipeline_family or semver is specified, the other must be " 78 | "specified as well." 79 | ) 80 | 81 | if not any([pipeline_family, semver]): 82 | config = PipelineConfig(filename=config_filename) 83 | pipeline_family = config.name 84 | semver = config.version 85 | else: 86 | # NOTE(robinson) - Explicit type casting if the variables are passed. Otherwise 87 | # mypy gets cranky because Optional[str] implies they could be None. 88 | pipeline_family = str(pipeline_family) 89 | semver = str(semver) 90 | 91 | raise_for_invalid_semver_string(semver) 92 | 93 | if shorter: 94 | semver = semver.split(".")[0] 95 | 96 | pipeline_family = pipeline_family.replace("_", "-") 97 | 98 | filepath = filename.split("/") 99 | # NOTE(robinson) - Converts something like "sec_filings.py" to "sec-filings" 100 | pipeline_name = filepath[-1].replace("_", "-").replace(".py", "") 101 | 102 | return f"/{pipeline_family}/v{semver}/{pipeline_name}" 103 | 104 | 105 | def get_api_name_from_config(filename: Optional[str] = None): 106 | try: 107 | return get_config(filename).get("name", None) 108 | except FileNotFoundError: 109 | return None 110 | -------------------------------------------------------------------------------- /unstructured_api_tools/pipelines/lint.py: -------------------------------------------------------------------------------- 1 | """Tools for linting and autoformatting generated API files.""" 2 | import os 3 | import re 4 | from subprocess import PIPE, Popen 5 | import tempfile 6 | from typing import List 7 | from autoflake import ( 8 | check, 9 | filter_unused_import, 10 | SAFE_IMPORTS, 11 | unused_import_module_name, 12 | filter_useless_pass, 13 | ) 14 | import pyflakes.api 15 | import pyflakes.messages 16 | import pyflakes.reporter 17 | import io 18 | import collections 19 | 20 | from black import format_str, FileMode 21 | from autoflake import fix_code 22 | 23 | # NOTE(robinson) - F401 is for unused imports 24 | FLAKE8_DEFAULT_OPTS: List[str] = ["--max-line-length", "100", "--ignore", "F401"] 25 | FLAKE8_PREFIX_RE = re.compile(r".+:\d+:\d+:\s") 26 | FLAKE8_ERROR_CODE_RE = re.compile(r"([A-Z]\d{3},?\s?)+") 27 | 28 | MYPY_PREFIX_RE = re.compile(r".+:\d+:\s") 29 | 30 | 31 | class LintError(RuntimeError): 32 | pass 33 | 34 | 35 | def _create_tempfile(file_text: str): 36 | tmp = tempfile.NamedTemporaryFile(delete=False) 37 | tmp.write(file_text.encode()) 38 | tmp.close() 39 | return tmp 40 | 41 | 42 | def _create_file_for_user_debugging(content: str, filename: str): 43 | """Creates a file in the user's current working directory to help debug lint errors.""" 44 | with open(filename, "w+") as f: 45 | f.write(content) 46 | 47 | 48 | def _run_lint_cmd(cmd: List[str], filename: str, prefix_re: re.Pattern): 49 | """Runs 
a subprocess with the specified lint command and raises a LintError 50 | if the file does not pass.""" 51 | try: 52 | process = Popen(cmd, stdout=PIPE, stderr=PIPE) 53 | stdout, _ = process.communicate() 54 | except Exception as e: 55 | # NOTE(robinson) - Catching the error ensures we clean up the temp file 56 | os.unlink(filename) # NOTE(robinson) - Removes the temporary file 57 | raise e 58 | 59 | os.unlink(filename) # NOTE(robinson) - Removes the temporary file 60 | if process.returncode != 0: 61 | err = prefix_re.sub("", stdout.decode("utf-8")) 62 | raise LintError("\n\n" + err) 63 | 64 | return True 65 | 66 | 67 | def check_flake8(file_text: str, opts: List[str] = FLAKE8_DEFAULT_OPTS) -> bool: 68 | """Runs flake8 on the text. Raises an exception if the file does 69 | not pass linting. Uses subprocess because per the Flake8 docs, Flake8 70 | does not have a public Python API. 71 | ref: https://flake8.pycqa.org/en/latest/user/python-api.html#public-python-api""" 72 | tmp = _create_tempfile(file_text) 73 | cmd = ["flake8", tmp.name] + opts 74 | try: 75 | _run_lint_cmd(cmd, tmp.name, FLAKE8_PREFIX_RE) 76 | except Exception as e: 77 | debug_file = "tmp-flake8-check-pipeline-api.py" 78 | _create_file_for_user_debugging(file_text, debug_file) 79 | cmd[1] = debug_file 80 | raise LintError("run the following to debug: \n" f"{' '.join(cmd)}") from e 81 | return True 82 | 83 | 84 | def validate_flake8_ignore(flake8_ignore: str) -> bool: 85 | """Validates the Flake8 error codes passed to the --flake8-ignore CLI flag.""" 86 | if FLAKE8_ERROR_CODE_RE.match(flake8_ignore) is None: 87 | raise ValueError(f"{flake8_ignore} is an invalid argument for the --flake8-ignore flag.") 88 | return True 89 | 90 | 91 | def check_mypy(file_text: str) -> bool: 92 | """Runs mypy type checking on the file text.""" 93 | tmp = _create_tempfile(file_text) 94 | cmd = ["mypy", tmp.name, "--ignore-missing-imports", "--implicit-optional"] 95 | try: 96 | _run_lint_cmd(cmd, tmp.name, MYPY_PREFIX_RE) 97 | except Exception as e: 98 | debug_file = "tmp-mypy-check-pipeline-api.py" 99 | _create_file_for_user_debugging(file_text, debug_file) 100 | cmd[1] = debug_file 101 | raise LintError("run the following to debug: \n" f"{' '.join(cmd)}") from e 102 | return True 103 | 104 | 105 | def check_black(file_text: str) -> bool: 106 | """Checks if a file needs to be reformatted with black.""" 107 | passes = format_black(file_text) == file_text 108 | if not passes: 109 | raise LintError("File text needs to be reformatted with black.") 110 | return passes 111 | 112 | 113 | def format_black(file_text: str) -> str: 114 | """Auto-formats a file using black.""" 115 | return format_str(file_text, mode=FileMode(line_length=100)) 116 | 117 | 118 | def format_autoflake(file_text: str) -> str: 119 | return fix_code( 120 | source=file_text, 121 | remove_unused_variables=True, 122 | remove_all_unused_imports=True, 123 | expand_star_imports=True, 124 | ) 125 | 126 | 127 | """ 128 | Autoflake only takes into account unused imports by checking for pyflakes.messages.UnusedImport 129 | but does not handle duplicate imports which come out as pyflakes.messages.RedefinedWhileUnused 130 | from pyflakes. 
The following code is an extension of autoflake to take duplicate 131 | imports into account 132 | """ 133 | 134 | 135 | def duplicate_import_line_numbers(messages): 136 | """Yield line numbers of duplicate (redefined) imports.""" 137 | for message in messages: 138 | if isinstance(message, pyflakes.messages.RedefinedWhileUnused): 139 | yield message.lineno 140 | 141 | 142 | def _remove_duplicate_imports(text: str): 143 | messages = check(text) 144 | marked_import_line_numbers = frozenset( 145 | duplicate_import_line_numbers(messages), 146 | ) 147 | marked_unused_module = collections.defaultdict(lambda: []) 148 | for line_number, module_name in unused_import_module_name(messages): 149 | marked_unused_module[line_number].append(module_name) 150 | sio = io.StringIO(text) 151 | previous_line = "" 152 | result = None 153 | for line_number, line in enumerate(sio.readlines(), start=1): 154 | if line_number in marked_import_line_numbers: 155 | result = filter_unused_import( 156 | line, 157 | unused_module=marked_unused_module[line_number], 158 | remove_all_unused_imports=True, 159 | imports=SAFE_IMPORTS, 160 | previous_line=previous_line, 161 | ) 162 | else: 163 | result = line 164 | yield result 165 | previous_line = line 166 | 167 | 168 | def remove_duplicate_imports(text: str) -> str: 169 | return "".join(filter_useless_pass("".join(_remove_duplicate_imports(text)))) 170 | -------------------------------------------------------------------------------- /unstructured_api_tools/pipelines/templates/pipeline_app.txt: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | 7 | from fastapi import FastAPI, Request, status 8 | import logging 9 | import os 10 | 11 | {% for module in module_names -%} 12 | from .{{ module }} import router as {{module}}_router 13 | {% endfor %} 14 | 15 | app = FastAPI( 16 | title="{{ title }}", 17 | description="""{{ description }}""", 18 | version="{{ version or '1.0.0' }}", 19 | docs_url="{{ '/' ~ version_name ~ '/docs' if version_name else '/docs' }}", 20 | openapi_url="{{ '/' ~ version_name ~ '/openapi.json' if version_name else '/openapi.json' }}" 21 | ) 22 | 23 | allowed_origins = os.environ.get("ALLOWED_ORIGINS", None) 24 | if allowed_origins: 25 | from fastapi.middleware.cors import CORSMiddleware 26 | app.add_middleware( 27 | CORSMiddleware, 28 | allow_origins=allowed_origins.split(","), 29 | allow_methods=["OPTIONS", "POST"], 30 | allow_headers=["Content-Type"] 31 | ) 32 | 33 | {% for module in module_names -%} 34 | app.include_router({{ module }}_router) 35 | {% endfor %} 36 | 37 | # Filter out /healthcheck noise 38 | class HealthCheckFilter(logging.Filter): 39 | def filter(self, record: logging.LogRecord) -> bool: 40 | return record.getMessage().find("/healthcheck") == -1 41 | 42 | # Filter out /metrics noise 43 | class MetricsCheckFilter(logging.Filter): 44 | def filter(self, record: logging.LogRecord) -> bool: 45 | return record.getMessage().find("/metrics") == -1 46 | 47 | logging.getLogger("uvicorn.access").addFilter(HealthCheckFilter()) 48 | logging.getLogger("uvicorn.access").addFilter(MetricsCheckFilter()) 49 | 50 | @app.get("/healthcheck", status_code=status.HTTP_200_OK, include_in_schema=False) 51 | def healthcheck(request: Request): 52 | return {"healthcheck": "HEALTHCHECK STATUS: 
EVERYTHING OK!"} 53 | --------------------------------------------------------------------------------
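
The routing conventions exercised by the test suite above can be seen end to end in a small, self-contained sketch. The example below is not part of the repo; it simply reuses the exact inputs and expected routes from `test_get_pipeline_path` and `test_get_short_pipeline_path`:

```python
# Minimal sketch (reusing values from the test suite above): api_conventions
# turns a pipeline module filename plus family metadata into a versioned route.
from unstructured_api_tools.pipelines.api_conventions import get_pipeline_path

# Underscores become dashes and the ".py" suffix is dropped from the filename.
path = get_pipeline_path(
    filename="risk_narrative.py", pipeline_family="sec_filings", semver="0.2.1"
)
assert path == "/sec-filings/v0.2.1/risk-narrative"

# shorter=True keeps only the major component of the semver in the route.
short_path = get_pipeline_path(
    filename="risk_narrative.py",
    pipeline_family="sec_filings",
    semver="0.2.1",
    shorter=True,
)
assert short_path == "/sec-filings/v0/risk-narrative"
```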
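
For the generated FastAPI modules themselves, a hedged client sketch may help: the route and the multipart field names come from the process-text-file pipeline shown earlier, while the host, port, and use of `requests` are assumptions (`requests` is not a dependency of this project).

```python
# Hedged usage sketch: POST a text file to the generated route shown above.
# Assumes the test-project app is served locally, e.g. with
#   uvicorn prepline_test_project.api.app:app --port 8000
# and that the `requests` package is installed (an assumption; it is not
# listed in this repo's requirements).
import requests

url = "http://localhost:8000/test-project/v1/process-text-file-1"

with open("text_file.txt", "rb") as f:
    # The endpoint accepts multipart fields named "files" and/or "text_files";
    # with a single input and a JSON-compatible Accept header it returns one
    # JSON result produced by pipeline_api.
    response = requests.post(
        url, files={"text_files": ("text_file.txt", f, "text/plain")}
    )

response.raise_for_status()
print(response.json())  # e.g. {"silly_result": "..."}
```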