├── .coveragerc
├── .github
│   └── workflows
│       ├── ci.yml
│       └── codeql-analysis.yml
├── .gitignore
├── CHANGELOG.md
├── Dockerfile
├── LICENSE.md
├── MANIFEST.in
├── Makefile
├── README.md
├── img
│   └── unstructured_logo.png
├── requirements
│   ├── base.txt
│   ├── test.in
│   └── test.txt
├── scripts
│   ├── docker-build.sh
│   ├── shellcheck.sh
│   └── version-sync.sh
├── setup.cfg
├── setup.py
├── test_unstructured_api_tools
│   ├── api
│   │   ├── fixtures
│   │   │   ├── example.jpg
│   │   │   ├── example.jpg.gz
│   │   │   ├── fake-email.msg
│   │   │   ├── fake.docx
│   │   │   ├── fake.docx.gz
│   │   │   ├── markdown.md
│   │   │   ├── spring-weather.html.json
│   │   │   ├── text_file.txt
│   │   │   ├── text_file.txt.gz
│   │   │   ├── text_file_2.txt
│   │   │   └── text_file_2.txt.gz
│   │   ├── functions_and_variables.py
│   │   ├── test_docs.py
│   │   ├── test_file_apis.py
│   │   ├── test_file_text_apis.py
│   │   └── test_text_apis.py
│   ├── pipeline-test-project
│   │   ├── README.md
│   │   ├── pipeline-notebooks
│   │   │   ├── pipeline-process-file-1.ipynb
│   │   │   ├── pipeline-process-file-2.ipynb
│   │   │   ├── pipeline-process-file-3.ipynb
│   │   │   ├── pipeline-process-file-4.ipynb
│   │   │   ├── pipeline-process-file-5.ipynb
│   │   │   ├── pipeline-process-text-1.ipynb
│   │   │   ├── pipeline-process-text-2.ipynb
│   │   │   ├── pipeline-process-text-3.ipynb
│   │   │   ├── pipeline-process-text-4.ipynb
│   │   │   ├── pipeline-process-text-file-1.ipynb
│   │   │   ├── pipeline-process-text-file-2.ipynb
│   │   │   ├── pipeline-process-text-file-3.ipynb
│   │   │   └── pipeline-process-text-file-4.ipynb
│   │   ├── prepline_test_project
│   │   │   └── api
│   │   │       ├── __init__.py
│   │   │       ├── app.py
│   │   │       ├── process_file_1.py
│   │   │       ├── process_file_2.py
│   │   │       ├── process_file_3.py
│   │   │       ├── process_file_4.py
│   │   │       ├── process_file_5.py
│   │   │       ├── process_text_1.py
│   │   │       ├── process_text_2.py
│   │   │       ├── process_text_3.py
│   │   │       ├── process_text_4.py
│   │   │       ├── process_text_file_1.py
│   │   │       ├── process_text_file_2.py
│   │   │       ├── process_text_file_3.py
│   │   │       └── process_text_file_4.py
│   │   ├── preprocessing-pipeline-family.yaml
│   │   └── scripts
│   │       ├── check-and-format-notebooks.py
│   │       └── test-doc-pipeline-apis-consistent.sh
│   ├── pipelines
│   │   ├── test_api_conventions.py
│   │   ├── test_convert.py
│   │   └── test_lint.py
│   └── test_cli.py
└── unstructured_api_tools
    ├── __init__.py
    ├── __version__.py
    ├── cli.py
    └── pipelines
        ├── __init__.py
        ├── api_conventions.py
        ├── convert.py
        ├── lint.py
        └── templates
            ├── pipeline_api.txt
            └── pipeline_app.txt
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit = *.txt
3 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 |   # NOTE(robinson) - We are limiting when we run CI to avoid exceeding our 2,000 min/month limit.
5 |   # We can switch to running on push if we make this repo public or are fine with
6 |   # paying for CI minutes.
7 |   push:
8 |     branches: [ main ]
9 |   pull_request:
10 |     branches: [ main ]
11 |
12 | env:
13 |   PYTHON_VERSION: 3.8
14 |
15 | jobs:
16 |   setup:
17 |     strategy:
18 |       matrix:
19 |         python-version: ["3.8", "3.9", "3.10"]
20 |     runs-on: ubuntu-latest
21 |     steps:
22 |       - uses: actions/checkout@v3
23 |       - uses: actions/cache@v3
24 |         id: virtualenv-cache
25 |         with:
26 |           path: |
27 |             .venv
28 |           key: ${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements/*.txt') }}
29 |       - name: Set up Python ${{ matrix.python-version }}
30 |         uses: actions/setup-python@v4
31 |         with:
32 |           python-version: ${{ matrix.python-version }}
33 |       - name: Setup virtual environment (no cache hit)
34 |         if: steps.virtualenv-cache.outputs.cache-hit != 'true'
35 |         run: |
36 |           python${{ matrix.python-version }} -m venv .venv
37 |           source .venv/bin/activate
38 |           make install-ci
39 |
40 |   lint:
41 |     runs-on: ubuntu-latest
42 |     needs: setup
43 |     steps:
44 |       - uses: actions/checkout@v3
45 |       - uses: actions/cache@v3
46 |         id: virtualenv-cache
47 |         with:
48 |           path: .venv
49 |           key: ${{ runner.os }}-${{ env.PYTHON_VERSION }}-${{ hashFiles('requirements/*.txt') }}
50 |       - name: Lint
51 |         run: |
52 |           source .venv/bin/activate
53 |           make check
54 |
55 |   shellcheck:
56 |     runs-on: ubuntu-latest
57 |     steps:
58 |       - uses: actions/checkout@v2
59 |       - name: ShellCheck
60 |         uses: ludeeus/action-shellcheck@master
61 |
62 |   test_api_consistency:
63 |     strategy:
64 |       matrix:
65 |         python-version: ["3.8", "3.9", "3.10"]
66 |     runs-on: ubuntu-latest
67 |     needs: [setup, lint]
68 |     steps:
69 |       - uses: actions/checkout@v3
70 |       - uses: actions/cache@v3
71 |         id: virtualenv-cache
72 |         with:
73 |           path: |
74 |             .venv
75 |           key: ${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements/*.txt') }}
76 |       - name: API Consistency
77 |         run: |
78 |           source .venv/bin/activate
79 |           make api-check-test
80 |
81 |   test:
82 |     strategy:
83 |       matrix:
84 |         python-version: [ "3.8", "3.9", "3.10" ]
85 |     runs-on: ubuntu-latest
86 |     needs: test_api_consistency
87 |     steps:
88 |       - uses: actions/checkout@v3
89 |       - uses: actions/cache@v3
90 |         id: virtualenv-cache
91 |         with:
92 |           path: |
93 |             .venv
94 |           key: ${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements/*.txt') }}
95 |       - name: Test
96 |         run: |
97 |           source .venv/bin/activate
98 |           make test
99 |           make check-coverage
100 |
101 |   changelog:
102 |     runs-on: ubuntu-latest
103 |     steps:
104 |       - if: github.ref != 'refs/heads/main'
105 |         uses: dorny/paths-filter@v2
106 |         id: changes
107 |         with:
108 |           filters: |
109 |             src:
110 |               - 'unstructured_api_tools/**'
111 |       - if: steps.changes.outputs.src == 'true' && github.ref != 'refs/heads/main'
112 |         uses: dangoslen/changelog-enforcer@v3
113 |
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 |
14 | on:
15 |   push:
16 |     branches: [ "main" ]
17 |   pull_request:
18 |     # The branches below must be a subset of the branches above
19 |     branches: [ "main" ]
20 |   schedule:
21 |     - cron: '35 10 * * 3'
22 |
23 | jobs:
24 |   analyze:
25 |     name: Analyze
26 |     runs-on: ubuntu-latest
27 |     permissions:
28 |       actions: read
29 |       contents: read
30 |       security-events: write
31 |
32 |     strategy:
33 |       fail-fast: false
34 |       matrix:
35 |         language: [ 'python' ]
36 |         # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
37 |         # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
38 |
39 |     steps:
40 |       - name: Checkout repository
41 |         uses: actions/checkout@v3
42 |
43 |       # Initializes the CodeQL tools for scanning.
44 |       - name: Initialize CodeQL
45 |         uses: github/codeql-action/init@v2
46 |         with:
47 |           languages: ${{ matrix.language }}
48 |           # If you wish to specify custom queries, you can do so here or in a config file.
49 |           # By default, queries listed here will override any specified in a config file.
50 |           # Prefix the list here with "+" to use these queries and those in the config file.
51 |
52 |           # For details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
53 |           # queries: security-extended,security-and-quality
54 |
55 |
56 |       # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
57 |       # If this step fails, then you should remove it and run the build manually (see below).
58 |       - name: Autobuild
59 |         uses: github/codeql-action/autobuild@v2
60 |
61 |       # ℹ️ Command-line programs to run using the OS shell.
62 |       # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
63 |
64 |       # If the Autobuild step fails above, remove it and uncomment the following three lines,
65 |       # then modify them (or add more) to build your code; refer to the EXAMPLE below for guidance.
66 |
67 |       # - run: |
68 |       #     echo "Run, Build Application using script"
69 |       #     ./location_of_script_within_repo/buildscript.sh
70 |
71 |       - name: Perform CodeQL Analysis
72 |         uses: github/codeql-action/analyze@v2
73 |         with:
74 |           category: "/language:${{matrix.language}}"
75 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | tmp*
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | pip-wheel-metadata/
26 | share/python-wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .nox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | *.py,cover
53 | .hypothesis/
54 | .pytest_cache/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 | db.sqlite3-journal
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | target/
78 |
79 | # Pycharm
80 | .idea/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # IPython
86 | profile_default/
87 | ipython_config.py
88 |
89 | # pyenv
90 | .python-version
91 |
92 | # pipenv
93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
96 | # install all needed dependencies.
97 | #Pipfile.lock
98 |
99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
100 | __pypackages__/
101 |
102 | # Celery stuff
103 | celerybeat-schedule
104 | celerybeat.pid
105 |
106 | # SageMath parsed files
107 | *.sage.py
108 |
109 | # Environments
110 | .env
111 | .venv
112 | env/
113 | venv/
114 | ENV/
115 | env.bak/
116 | venv.bak/
117 |
118 | # Spyder project settings
119 | .spyderproject
120 | .spyproject
121 |
122 | # Rope project settings
123 | .ropeproject
124 |
125 | # mkdocs documentation
126 | /site
127 |
128 | # mypy
129 | .mypy_cache/
130 | .dmypy.json
131 | dmypy.json
132 |
133 | # Pyre type checker
134 | .pyre/
135 |
136 | # API test project test files
137 | /test_unstructured_api_tools/pipeline-test-project/tmp*
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # 0.10.11
2 |
3 | * Fix using metrics filter for logger
4 |
5 | # 0.10.10
6 |
7 | * Filter out metrics endpoint requests from logs
8 |
9 | # 0.10.9
10 |
11 | * Fix output formatting for csv responses
12 |
13 | # 0.10.8
14 |
15 | * Add autoflake and duplicate import removal to linting steps
16 |
17 | # 0.10.7
18 |
19 | * Add support for passing request into pipeline
20 |
21 | # 0.10.6
22 |
23 | * Fix ENV variable processing for CORS
24 |
25 | # 0.10.5
26 |
27 | * Add optional CORS to api
28 |
29 | # 0.10.4
30 |
31 | * Add filter on /healthcheck logs
32 |
33 | # 0.10.3
34 |
35 | * Add support for json and msg file types
36 |
37 | # 0.10.2
38 |
39 | * Set black line length to 100
40 |
41 | # 0.10.1
42 |
43 | * Add Ability to request one file as multipart/form data
44 |
45 | # 0.10.0
46 |
47 | * Update templates for generated API.
48 | * Improve code for accepting gzip files.
49 |
50 | # 0.9.4
51 |
52 | * Add dynamic openapi_url to match docs_url
53 |
54 | # 0.9.3
55 |
56 | * Removed /healthcheck endpoint from docs
57 | * Add fix for handling content type sent as None
58 |
59 | # 0.9.2
60 |
61 | * Add content_type to error message for unsupported file types
62 |
63 | # 0.9.1
64 |
65 | * Allow references to standard imports in pipeline cells
66 | * Removed unused /healthcheck endpoints
67 |
68 | # 0.9.0
69 |
70 | * Add supporting gzip compressed files
71 |
72 | # 0.8.1
73 |
74 | * Removed async/await from endpoints.
75 | * Refactored template for generating endpoints with shorter semver.
76 |
77 | # 0.8.0
78 |
79 | * Add duplicate routes with semver major version
80 |
81 | # 0.7.0
82 |
83 | * Add dynamic docs_url
84 |
85 | # 0.6.0
86 |
87 | * Add file type validation via `UNSTRUCTURED_ALLOWED_MIMETYPES`
88 |
89 | # 0.5.0
90 |
91 | * Removed rate limit and slow api from project. Updated templates and tests.
92 |
93 | # 0.4.9
94 |
95 | * Bug fix: Generated code now consistent across Operating Systems
96 |
97 | # 0.4.8
98 |
99 | * Add ability to return JSON responses for multiple text_files
100 |
101 | # 0.4.7
102 |
103 | * Notebook conversion organizes module level imports at the top of the file
104 | * Allow for FastAPI metadata to be read from the config file
105 | * Add `__init__.py` to API module and add a default version for FastAPI.
106 |
107 | # 0.4.6
108 |
109 | * Add support for `response_schema` parameter in Pipeline API functions.
110 |
111 | # 0.4.5
112 |
113 | * fix bug to get `response_type` value before first call of it in template
114 |
115 | # 0.4.4
116 |
117 | * Implement generation of an app-level FastAPI module.
118 |
119 | # 0.4.3
120 |
121 | * Updates `mypy` type checking code to use `--implicit-optional`
122 |
123 | ## 0.4.2
124 |
125 | * Add types-ujson dependency
126 |
127 | ## 0.4.1
128 |
129 | * Implement feature to allow accepting multiple binary files to the autogenerated pipeline APIs.
130 |
131 | ## 0.4.0
132 |
133 | * Implement feature to allow accepting multiple text files to the autogenerated pipeline APIs.
134 |
135 | ## 0.3.1
136 |
137 | * Removed the ratelimit on healthchecks
138 | * Dependency bumps
139 |
140 | ## 0.3.0
141 |
142 | * Add the ability to pass Accept MIME type headers to pipeline API's
143 | * Dependency bumps
144 |
145 | ## 0.2.0
146 |
147 | * Initial release of unstructured-api-tools
148 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # syntax=docker/dockerfile:experimental
2 | FROM quay.io/unstructured-io/base-images:rocky8.7-2 as base
3 |
4 | RUN yum install -y make
5 |
6 | ARG PIP_VERSION
7 |
8 | # Set up environment
9 | ENV HOME /home/
10 | WORKDIR ${HOME}
11 | RUN mkdir ${HOME}/.ssh && chmod go-rwx ${HOME}/.ssh \
12 |   && ssh-keyscan -t rsa github.com >> /home/.ssh/known_hosts
13 | ENV PYTHONPATH="${PYTHONPATH}:${HOME}"
14 | ENV PATH="/home/usr/.local/bin:${PATH}"
15 |
16 | FROM base as deps
17 | # Copy and install Unstructured
18 | COPY requirements requirements
19 |
20 | RUN python3.8 -m pip install pip==${PIP_VERSION} && \
21 |   dnf -y groupinstall "Development Tools" && \
22 |   pip install --no-cache -r requirements/base.txt && \
23 |   pip install --no-cache -r requirements/test.txt && \
24 |   dnf -y groupremove "Development Tools" && \
25 |   dnf clean all
26 |
27 | FROM deps as code
28 | COPY Makefile Makefile
29 |
30 | CMD ["/bin/bash"]
31 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2022 Unstructured Technologies, Inc
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include unstructured_api_tools/pipelines/templates/*.txt
2 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | PACKAGE_NAME := unstructured_api_tools
2 | PIP_VERSION := 22.2.1
3 | CURRENT_DIR := $(shell pwd)
4 |
5 |
6 | .PHONY: help
7 | help: Makefile
8 | 	@sed -n 's/^\(## \)\([a-zA-Z]\)/\2/p' $<
9 |
10 |
11 | ###########
12 | # Install #
13 | ###########
14 |
15 | ## install: installs all base and test requirements
16 | .PHONY: install
17 | install: install-base install-test
18 |
19 | .PHONY: install-ci
20 | install-ci: install
21 |
22 | .PHONY: install-base
23 | install-base:
24 | 	python3 -m pip install pip==${PIP_VERSION}
25 | 	pip install -r requirements/base.txt
26 |
27 | .PHONY: install-test
28 | install-test:
29 | 	pip install -r requirements/test.txt
30 |
31 | ## pip-compile: compiles all base and test requirements
32 | .PHONY: pip-compile
33 | pip-compile:
34 | 	# NOTE(crag): you have to manually install pip-tools for now to run this.
35 | 	# There is a better way to do this with a pinned pip-compile version and a venv.
36 | 	bash -c "pip-compile -h >/dev/null || { echo please run \'pip install pip-tools\' and then rerun this command; exit 1; }"
37 | 	pip-compile --upgrade -o requirements/base.txt
38 | 	pip-compile --upgrade -o requirements/test.txt requirements/base.txt requirements/test.in
39 |
40 | ## install-project-local: install unstructured_api_tools into your local python environment
41 | .PHONY: install-project-local
42 | install-project-local: install
43 | 	# MAYBE TODO: fail if already exists?
44 | 	pip install -e .
45 |
46 | ## uninstall-project-local: uninstall unstructured_api_tools from your local python environment
47 | .PHONY: uninstall-project-local
48 | uninstall-project-local:
49 | 	pip uninstall ${PACKAGE_NAME}
50 |
51 | #################
52 | # Test and Lint #
53 | #################
54 |
55 | ## run-jupyter-test-notebooks: starts jupyter, allows execution of test notebooks
56 | .PHONY: run-jupyter-test-notebooks
57 | run-jupyter-test-notebooks:
58 | 	PYTHONPATH=$(realpath .)/test_unstructured_api_tools/pipeline-test-project/ JUPYTER_PATH=$(realpath .)/test_unstructured_api_tools/pipeline-test-project/ jupyter-notebook --NotebookApp.token='' --NotebookApp.password=''
59 |
60 | ## tidy-test-notebooks: execute notebooks and remove metadata
61 | .PHONY: tidy-test-notebooks
62 | tidy-test-notebooks:
63 | 	PYTHONPATH=. ./test_unstructured_api_tools/pipeline-test-project/scripts/check-and-format-notebooks.py
64 |
65 | ## generate-test-api: generates FastAPIs under ./test_unstructured_api_tools/pipeline-test-project
66 | .PHONY: generate-test-api
67 | generate-test-api:
68 | 	# generates FastAPI APIs from notebooks in the test project ./test_unstructured_api_tools/pipeline-test-project
69 | 	PYTHONPATH=. PIPELINE_FAMILY_CONFIG=test_unstructured_api_tools/pipeline-test-project/preprocessing-pipeline-family.yaml \
70 | 		python3 ./unstructured_api_tools/cli.py convert-pipeline-notebooks \
71 | 		--input-directory ./test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks \
72 | 		--output-directory ./test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api
73 |
74 |
75 | ## api-check-test: verifies auto-generated pipeline APIs match the existing ones
76 | .PHONY: api-check-test
77 | api-check-test:
78 | 	PYTHONPATH=. PACKAGE_NAME=prepline_test_project ./test_unstructured_api_tools/pipeline-test-project/scripts/test-doc-pipeline-apis-consistent.sh
79 |
80 |
81 | ## test: runs all unittests
82 | .PHONY: test
83 | test:
84 | 	PYTHONPATH=.:./test_unstructured_api_tools/pipeline-test-project pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov=prepline_test_project --cov-report term-missing -vvv
85 |
86 | ## check: runs linters (includes tests)
87 | .PHONY: check
88 | check: check-src check-tests check-version
89 |
90 | ## check-src: runs linters (source only, no tests)
91 | .PHONY: check-src
92 | check-src:
93 | 	black --line-length 100 ${PACKAGE_NAME} --check
94 | 	flake8 ${PACKAGE_NAME}
95 | 	mypy ${PACKAGE_NAME} --ignore-missing-imports --install-types --non-interactive
96 | 	autoflake --remove-unused-variables --remove-duplicate-keys --expand-star-imports \
97 | 		--remove-all-unused-imports -cd -r ${PACKAGE_NAME} test_${PACKAGE_NAME} \
98 | 		--exclude test_${PACKAGE_NAME}/pipeline-test-project
99 |
100 |
101 | .PHONY: check-tests
102 | check-tests:
103 | 	black --line-length 100 test_${PACKAGE_NAME} --check --exclude test_${PACKAGE_NAME}/pipeline-test-project
104 | 	flake8 test_${PACKAGE_NAME} --exclude test_${PACKAGE_NAME}/pipeline-test-project/prepline_test_project/api
105 |
106 | ## check-scripts: run shellcheck
107 | .PHONY: check-scripts
108 | check-scripts:
109 | 	# Fail if any of these files have warnings
110 | 	scripts/shellcheck.sh
111 |
112 | ## check-version: run check to ensure version in CHANGELOG.md matches version in package
113 | .PHONY: check-version
114 | check-version:
115 | 	# Fail if syncing version would produce changes
116 | 	scripts/version-sync.sh -c \
117 | 		-f ${PACKAGE_NAME}/__version__.py semver
118 |
119 | ## tidy: run black
120 | .PHONY: tidy
121 | tidy: tidy-black tidy-autoflake
122 |
123 | tidy-autoflake:
124 | 	autoflake --remove-unused-variables --remove-duplicate-keys --expand-star-imports \
125 | 		--remove-all-unused-imports -i -r ${PACKAGE_NAME} test_${PACKAGE_NAME} \
126 | 		--exclude test_${PACKAGE_NAME}/pipeline-test-project
127 |
128 |
129 | tidy-black:
130 | 	black --line-length 100 ${PACKAGE_NAME}
131 | 	black --line-length 100 test_${PACKAGE_NAME} --exclude test_${PACKAGE_NAME}/pipeline-test-project
132 |
133 |
134 | ## version-sync: update __version__.py with most recent version from CHANGELOG.md
135 | .PHONY: version-sync
136 | version-sync:
137 | 	scripts/version-sync.sh \
138 | 		-f ${PACKAGE_NAME}/__version__.py semver
139 |
140 | .PHONY: check-coverage
141 | check-coverage:
142 | 	# TODO(crag): add coverage check for test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/
143 | 	coverage report --fail-under=95
144 |
145 | ##########
146 | # Docker #
147 | ##########
148 |
149 | # Docker targets are provided for convenience only and are not required in a standard development environment
150 |
151 | DOCKER_IMAGE ?= unstructured-api-tools:dev
152 |
153 | .PHONY: docker-build
154 | docker-build:
155 | 	PIP_VERSION=${PIP_VERSION} DOCKER_IMAGE_NAME=${DOCKER_IMAGE} ./scripts/docker-build.sh
156 |
157 | .PHONY: docker-start-bash
158 | docker-start-bash:
159 | 	docker run -ti --rm ${DOCKER_IMAGE}
160 |
161 | .PHONY: docker-test
162 | docker-test: docker-build
163 | 	docker run --rm \
164 | 		-v ${CURRENT_DIR}/test_unstructured_api_tools:/home/test_unstructured_api_tools \
165 | 		-v ${CURRENT_DIR}/unstructured_api_tools:/home/unstructured_api_tools \
166 | 		$(DOCKER_IMAGE) \
167 | 		bash -c "make test"
168 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | Open-Source Pre-Processing Tools for Unstructured Data
3 |
13 | The `unstructured_api_tools` library includes utilities for converting pipeline notebooks into
14 | REST API applications. `unstructured_api_tools` is intended for use in conjunction with
15 | pipeline repos. See [`pipeline-sec-filings`](https://github.com/Unstructured-IO/pipeline-sec-filings)
16 | for an example of a repo that uses `unstructured_api_tools`.
17 |
18 | ## Installation
19 |
20 | To install the library, run `pip install unstructured_api_tools`.
21 |
22 | ## Developer Quick Start
23 |
24 | * Using `pyenv` to manage virtualenvs is recommended
25 |   * Mac install instructions. See [here](https://github.com/Unstructured-IO/community#mac--homebrew) for more detailed instructions.
26 |     * `brew install pyenv-virtualenv`
27 |     * `pyenv install 3.8.15`
28 |   * Linux instructions are available [here](https://github.com/Unstructured-IO/community#linux).
29 |
30 | * Create a virtualenv to work in and activate it, e.g. for one named `unstructured_api_tools`:
31 |
32 |   `pyenv virtualenv 3.8.15 unstructured_api_tools`
33 |   `pyenv activate unstructured_api_tools`
34 |
35 | * Run `make install-project-local`
36 |
37 | ## Usage
38 |
39 | Use the CLI command to convert pipeline notebooks to scripts, for example:
40 |
41 | ```bash
42 | unstructured_api_tools convert-pipeline-notebooks \
43 |   --input-directory pipeline-family-sec-filings/pipeline-notebooks \
44 |   --output-directory pipeline-family-sec-filings/prepline_sec_filings/api \
45 |   --pipeline-family sec-filings \
46 |   --semver 0.2.1
47 | ```
48 |
49 | If you do not provide the `pipeline-family` and `semver` arguments, those values are parsed from
50 | `preprocessing-pipeline-family.yaml`. You can provide the `preprocessing-pipeline-family.yaml` file
51 | explicitly with `--config-filename` or the `PIPELINE_FAMILY_CONFIG` environment variable. If neither
52 | of those is specified, the fallback is to use the `preprocessing-pipeline-family.yaml` file in the
53 | current working directory.
54 |
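For example, to point the converter at a config file outside the current working directory, you
can set the environment variable when invoking the CLI (the paths in this sketch are hypothetical):

```bash
PIPELINE_FAMILY_CONFIG=configs/preprocessing-pipeline-family.yaml \
  unstructured_api_tools convert-pipeline-notebooks \
  --input-directory pipeline-notebooks \
  --output-directory prepline_example/api
```
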
55 | The API file undergoes `black`, `flake8` and `mypy` checks after being generated. If you want
56 | `flake8` to ignore specific errors, you can specify them through the CLI with
57 | `--flake8-ignore F401, E402`.
58 | See the [`flake8` docs](https://flake8.pycqa.org/en/latest/user/error-codes.html#error-violation-codes)
59 | for a full list of error codes.
60 |
61 | ### Conversion from `pipeline_api` to FastAPI
62 |
63 | The command described in [**Usage**](#usage) generates a FastAPI API route for each `pipeline_api`
64 | function defined in the notebook. The signature of the `pipeline_api` method determines what
65 | parameters the generated FastAPI accepts.
66 |
67 | Currently, only plain text file uploads are supported, and as such the first argument must always be
68 | `text`, but support for multiple files and binary files is coming soon!
69 |
70 | In addition, any number of string array parameters may be specified. Any kwarg beginning with
71 | `m_` indicates a multi-value string parameter that is accepted by the FastAPI API.
72 |
73 | For example, in a notebook containing:
74 |
75 |     def pipeline_api(text, m_subject=[], m_name=[]):
76 |
77 | `text` represents the content of a file posted to the FastAPI API, and the `m_subject` and `m_name`
78 | keyword args represent optional parameters that may be posted to the API as well, both allowing
79 | multiple string parameters. A `curl` request against such an API could look like this:
80 |
81 |     curl -X 'POST' \
82 |       'https://<hostname>/<pipeline-family>/<semver>/<api-name>' \
83 |       -H 'accept: application/json' \
84 |       -H 'Content-Type: multipart/form-data' \
85 |       -F 'file=@file-to-process.txt' \
86 |       -F 'subject=art' \
87 |       -F 'subject=history' \
88 |       -F 'subject=math' \
89 |       -F 'name=feynman'
90 |
91 | In addition, you can specify the response type if `pipeline_api` can support both "application/json"
92 | and "text/csv" as return types.
93 |
94 | For example, in a notebook containing a kwarg `response_type`:
95 |
96 |     def pipeline_api(text, response_type="text/csv", m_subject=[], m_name=[]):
97 |
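As a rough illustration, such a notebook cell could branch on `response_type` as in the following
sketch (the function body and field names are hypothetical; only the signature follows the
convention above):

    def pipeline_api(text, response_type="text/csv", m_subject=[], m_name=[]):
        # Hypothetical body: build one row per requested subject
        rows = [{"subject": subject, "text": text} for subject in m_subject]
        if response_type == "text/csv":
            import pandas as pd  # assumes pandas is available in the notebook environment
            return pd.DataFrame(rows).to_csv(index=False)
        return rows  # serialized as JSON by the generated route
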
98 | The consumer of the API may then specify "text/csv" as the requested response content type with the usual
99 | HTTP Accept header, e.g. `Accept: application/json` or `Accept: text/csv`.
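For instance, a caller requesting CSV output from the example API above might send (using the same
hypothetical endpoint placeholders as before):

    curl -X 'POST' \
      'https://<hostname>/<pipeline-family>/<semver>/<api-name>' \
      -H 'Accept: text/csv' \
      -H 'Content-Type: multipart/form-data' \
      -F 'file=@file-to-process.txt'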
100 |
101 | ## Security Policy
102 |
103 | See our [security policy](https://github.com/Unstructured-IO/unstructured-api-tools/security/policy) for
104 | information on how to report security vulnerabilities.
105 |
106 | ## Learn more
107 |
108 | | Section | Description |
109 | |-|-|
110 | | [Company Website](https://unstructured.io) | Unstructured.io product and company info |
111 |
--------------------------------------------------------------------------------
/img/unstructured_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/img/unstructured_logo.png
--------------------------------------------------------------------------------
/requirements/base.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This file is autogenerated by pip-compile with Python 3.8
3 | # by the following command:
4 | #
5 | # pip-compile --output-file=requirements/base.txt
6 | #
7 | anyio==3.6.2
8 | # via
9 | # starlette
10 | # watchfiles
11 | attrs==22.2.0
12 | # via jsonschema
13 | autoflake==2.1.1
14 | # via unstructured-api-tools (setup.py)
15 | beautifulsoup4==4.12.1
16 | # via nbconvert
17 | bleach==6.0.0
18 | # via nbconvert
19 | click==8.1.3
20 | # via
21 | # unstructured-api-tools (setup.py)
22 | # uvicorn
23 | defusedxml==0.7.1
24 | # via nbconvert
25 | fastapi==0.95.0
26 | # via unstructured-api-tools (setup.py)
27 | fastjsonschema==2.16.3
28 | # via nbformat
29 | h11==0.14.0
30 | # via uvicorn
31 | httptools==0.5.0
32 | # via uvicorn
33 | idna==3.4
34 | # via anyio
35 | importlib-metadata==6.1.0
36 | # via
37 | # jupyter-client
38 | # nbconvert
39 | importlib-resources==5.12.0
40 | # via jsonschema
41 | jinja2==3.1.2
42 | # via
43 | # nbconvert
44 | # unstructured-api-tools (setup.py)
45 | jsonschema==4.17.3
46 | # via nbformat
47 | jupyter-client==8.1.0
48 | # via nbclient
49 | jupyter-core==5.3.0
50 | # via
51 | # jupyter-client
52 | # nbclient
53 | # nbconvert
54 | # nbformat
55 | jupyterlab-pygments==0.2.2
56 | # via nbconvert
57 | markupsafe==2.1.2
58 | # via
59 | # jinja2
60 | # nbconvert
61 | mistune==2.0.5
62 | # via nbconvert
63 | mypy==1.2.0
64 | # via unstructured-api-tools (setup.py)
65 | mypy-extensions==1.0.0
66 | # via mypy
67 | nbclient==0.7.3
68 | # via nbconvert
69 | nbconvert==7.3.0
70 | # via unstructured-api-tools (setup.py)
71 | nbformat==5.8.0
72 | # via
73 | # nbclient
74 | # nbconvert
75 | numpy==1.24.3
76 | # via pandas
77 | packaging==23.0
78 | # via nbconvert
79 | pandas==2.0.2
80 | # via unstructured-api-tools (setup.py)
81 | pandocfilters==1.5.0
82 | # via nbconvert
83 | pkgutil-resolve-name==1.3.10
84 | # via jsonschema
85 | platformdirs==3.2.0
86 | # via jupyter-core
87 | pydantic==1.10.7
88 | # via fastapi
89 | pyflakes==3.0.1
90 | # via autoflake
91 | pygments==2.14.0
92 | # via nbconvert
93 | pyrsistent==0.19.3
94 | # via jsonschema
95 | python-dateutil==2.8.2
96 | # via
97 | # jupyter-client
98 | # pandas
99 | python-dotenv==1.0.0
100 | # via uvicorn
101 | python-multipart==0.0.6
102 | # via unstructured-api-tools (setup.py)
103 | pytz==2023.3
104 | # via pandas
105 | pyyaml==6.0
106 | # via uvicorn
107 | pyzmq==25.0.2
108 | # via jupyter-client
109 | six==1.16.0
110 | # via
111 | # bleach
112 | # python-dateutil
113 | sniffio==1.3.0
114 | # via anyio
115 | soupsieve==2.4
116 | # via beautifulsoup4
117 | starlette==0.26.1
118 | # via fastapi
119 | tinycss2==1.2.1
120 | # via nbconvert
121 | tomli==2.0.1
122 | # via
123 | # autoflake
124 | # mypy
125 | tornado==6.2
126 | # via jupyter-client
127 | traitlets==5.9.0
128 | # via
129 | # jupyter-client
130 | # jupyter-core
131 | # nbclient
132 | # nbconvert
133 | # nbformat
134 | types-requests==2.28.11.17
135 | # via unstructured-api-tools (setup.py)
136 | types-ujson==5.7.0.1
137 | # via unstructured-api-tools (setup.py)
138 | types-urllib3==1.26.25.10
139 | # via types-requests
140 | typing-extensions==4.5.0
141 | # via
142 | # mypy
143 | # pydantic
144 | # starlette
145 | tzdata==2023.3
146 | # via pandas
147 | uvicorn[standard]==0.21.1
148 | # via unstructured-api-tools (setup.py)
149 | uvloop==0.17.0
150 | # via uvicorn
151 | watchfiles==0.19.0
152 | # via uvicorn
153 | webencodings==0.5.1
154 | # via
155 | # bleach
156 | # tinycss2
157 | websockets==11.0.1
158 | # via uvicorn
159 | zipp==3.15.0
160 | # via
161 | # importlib-metadata
162 | # importlib-resources
163 |
--------------------------------------------------------------------------------
/requirements/test.in:
--------------------------------------------------------------------------------
1 | black>=22.3.0
2 | coverage
3 | flake8
4 | httpx
5 | # NOTE(robinson) - Pinning version due to the NotOneFoundException crash described here.
6 | # ref: https://github.com/ipython/ipython/issues/13598
7 | ipython>=8.9.0
8 | pytest-cov
9 | # NOTE(mrobinson) - requests is needed for the fastapi test client
10 | requests
11 | requests_toolbelt
12 | nbdev
13 | pytest-mock
14 |
--------------------------------------------------------------------------------
/requirements/test.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This file is autogenerated by pip-compile with Python 3.8
3 | # by the following command:
4 | #
5 | # pip-compile --output-file=requirements/test.txt requirements/base.txt requirements/test.in
6 | #
7 | anyio==3.6.2
8 | # via
9 | # -r requirements/base.txt
10 | # httpcore
11 | # starlette
12 | # watchfiles
13 | asttokens==2.2.1
14 | # via
15 | # nbdev
16 | # stack-data
17 | astunparse==1.6.3
18 | # via nbdev
19 | attrs==22.2.0
20 | # via
21 | # -r requirements/base.txt
22 | # jsonschema
23 | # pytest
24 | backcall==0.2.0
25 | # via ipython
26 | beautifulsoup4==4.12.1
27 | # via
28 | # -r requirements/base.txt
29 | # nbconvert
30 | black==23.3.0
31 | # via -r requirements/test.in
32 | bleach==6.0.0
33 | # via
34 | # -r requirements/base.txt
35 | # nbconvert
36 | certifi==2022.12.7
37 | # via
38 | # httpcore
39 | # httpx
40 | # requests
41 | charset-normalizer==3.1.0
42 | # via requests
43 | click==8.1.3
44 | # via
45 | # -r requirements/base.txt
46 | # black
47 | # uvicorn
48 | coverage[toml]==7.2.3
49 | # via
50 | # -r requirements/test.in
51 | # pytest-cov
52 | decorator==5.1.1
53 | # via ipython
54 | defusedxml==0.7.1
55 | # via
56 | # -r requirements/base.txt
57 | # nbconvert
58 | exceptiongroup==1.1.1
59 | # via pytest
60 | execnb==0.1.5
61 | # via nbdev
62 | executing==1.2.0
63 | # via stack-data
64 | fastapi==0.95.0
65 | # via -r requirements/base.txt
66 | fastcore==1.5.29
67 | # via
68 | # execnb
69 | # ghapi
70 | # nbdev
71 | fastjsonschema==2.16.3
72 | # via
73 | # -r requirements/base.txt
74 | # nbformat
75 | flake8==6.0.0
76 | # via -r requirements/test.in
77 | ghapi==1.0.3
78 | # via nbdev
79 | h11==0.14.0
80 | # via
81 | # -r requirements/base.txt
82 | # httpcore
83 | # uvicorn
84 | httpcore==0.16.3
85 | # via httpx
86 | httptools==0.5.0
87 | # via
88 | # -r requirements/base.txt
89 | # uvicorn
90 | httpx==0.23.3
91 | # via -r requirements/test.in
92 | idna==3.4
93 | # via
94 | # -r requirements/base.txt
95 | # anyio
96 | # requests
97 | # rfc3986
98 | importlib-metadata==6.1.0
99 | # via
100 | # -r requirements/base.txt
101 | # jupyter-client
102 | # nbconvert
103 | importlib-resources==5.12.0
104 | # via
105 | # -r requirements/base.txt
106 | # jsonschema
107 | iniconfig==2.0.0
108 | # via pytest
109 | ipython==8.12.0
110 | # via
111 | # -r requirements/test.in
112 | # execnb
113 | jedi==0.18.2
114 | # via ipython
115 | jinja2==3.1.2
116 | # via
117 | # -r requirements/base.txt
118 | # nbconvert
119 | jsonschema==4.17.3
120 | # via
121 | # -r requirements/base.txt
122 | # nbformat
123 | jupyter-client==8.1.0
124 | # via
125 | # -r requirements/base.txt
126 | # nbclient
127 | jupyter-core==5.3.0
128 | # via
129 | # -r requirements/base.txt
130 | # jupyter-client
131 | # nbclient
132 | # nbconvert
133 | # nbformat
134 | jupyterlab-pygments==0.2.2
135 | # via
136 | # -r requirements/base.txt
137 | # nbconvert
138 | markupsafe==2.1.2
139 | # via
140 | # -r requirements/base.txt
141 | # jinja2
142 | # nbconvert
143 | matplotlib-inline==0.1.6
144 | # via ipython
145 | mccabe==0.7.0
146 | # via flake8
147 | mistune==2.0.5
148 | # via
149 | # -r requirements/base.txt
150 | # nbconvert
151 | mypy==1.2.0
152 | # via -r requirements/base.txt
153 | mypy-extensions==1.0.0
154 | # via
155 | # -r requirements/base.txt
156 | # black
157 | # mypy
158 | nbclient==0.7.3
159 | # via
160 | # -r requirements/base.txt
161 | # nbconvert
162 | nbconvert==7.3.0
163 | # via -r requirements/base.txt
164 | nbdev==2.3.12
165 | # via -r requirements/test.in
166 | nbformat==5.8.0
167 | # via
168 | # -r requirements/base.txt
169 | # nbclient
170 | # nbconvert
171 | packaging==23.0
172 | # via
173 | # -r requirements/base.txt
174 | # black
175 | # fastcore
176 | # ghapi
177 | # nbconvert
178 | # pytest
179 | pandocfilters==1.5.0
180 | # via
181 | # -r requirements/base.txt
182 | # nbconvert
183 | parso==0.8.3
184 | # via jedi
185 | pathspec==0.11.1
186 | # via black
187 | pexpect==4.8.0
188 | # via ipython
189 | pickleshare==0.7.5
190 | # via ipython
191 | pkgutil-resolve-name==1.3.10
192 | # via
193 | # -r requirements/base.txt
194 | # jsonschema
195 | platformdirs==3.2.0
196 | # via
197 | # -r requirements/base.txt
198 | # black
199 | # jupyter-core
200 | pluggy==1.0.0
201 | # via pytest
202 | prompt-toolkit==3.0.38
203 | # via ipython
204 | ptyprocess==0.7.0
205 | # via pexpect
206 | pure-eval==0.2.2
207 | # via stack-data
208 | pycodestyle==2.10.0
209 | # via flake8
210 | pydantic==1.10.7
211 | # via
212 | # -r requirements/base.txt
213 | # fastapi
214 | pyflakes==3.0.1
215 | # via flake8
216 | pygments==2.14.0
217 | # via
218 | # -r requirements/base.txt
219 | # ipython
220 | # nbconvert
221 | pyrsistent==0.19.3
222 | # via
223 | # -r requirements/base.txt
224 | # jsonschema
225 | pytest==7.2.2
226 | # via
227 | # pytest-cov
228 | # pytest-mock
229 | pytest-cov==4.0.0
230 | # via -r requirements/test.in
231 | pytest-mock==3.10.0
232 | # via -r requirements/test.in
233 | python-dateutil==2.8.2
234 | # via
235 | # -r requirements/base.txt
236 | # jupyter-client
237 | python-dotenv==1.0.0
238 | # via
239 | # -r requirements/base.txt
240 | # uvicorn
241 | python-multipart==0.0.6
242 | # via -r requirements/base.txt
243 | pyyaml==6.0
244 | # via
245 | # -r requirements/base.txt
246 | # nbdev
247 | # uvicorn
248 | pyzmq==25.0.2
249 | # via
250 | # -r requirements/base.txt
251 | # jupyter-client
252 | requests==2.28.2
253 | # via
254 | # -r requirements/test.in
255 | # requests-toolbelt
256 | requests-toolbelt==0.10.1
257 | # via -r requirements/test.in
258 | rfc3986[idna2008]==1.5.0
259 | # via httpx
260 | six==1.16.0
261 | # via
262 | # -r requirements/base.txt
263 | # asttokens
264 | # astunparse
265 | # bleach
266 | # python-dateutil
267 | sniffio==1.3.0
268 | # via
269 | # -r requirements/base.txt
270 | # anyio
271 | # httpcore
272 | # httpx
273 | soupsieve==2.4
274 | # via
275 | # -r requirements/base.txt
276 | # beautifulsoup4
277 | stack-data==0.6.2
278 | # via ipython
279 | starlette==0.26.1
280 | # via
281 | # -r requirements/base.txt
282 | # fastapi
283 | tinycss2==1.2.1
284 | # via
285 | # -r requirements/base.txt
286 | # nbconvert
287 | tomli==2.0.1
288 | # via
289 | # -r requirements/base.txt
290 | # black
291 | # coverage
292 | # mypy
293 | # pytest
294 | tornado==6.2
295 | # via
296 | # -r requirements/base.txt
297 | # jupyter-client
298 | traitlets==5.9.0
299 | # via
300 | # -r requirements/base.txt
301 | # ipython
302 | # jupyter-client
303 | # jupyter-core
304 | # matplotlib-inline
305 | # nbclient
306 | # nbconvert
307 | # nbformat
308 | types-requests==2.28.11.17
309 | # via -r requirements/base.txt
310 | types-ujson==5.7.0.1
311 | # via -r requirements/base.txt
312 | types-urllib3==1.26.25.10
313 | # via
314 | # -r requirements/base.txt
315 | # types-requests
316 | typing-extensions==4.5.0
317 | # via
318 | # -r requirements/base.txt
319 | # black
320 | # ipython
321 | # mypy
322 | # pydantic
323 | # starlette
324 | urllib3==1.26.15
325 | # via requests
326 | uvicorn[standard]==0.21.1
327 | # via -r requirements/base.txt
328 | uvloop==0.17.0
329 | # via
330 | # -r requirements/base.txt
331 | # uvicorn
332 | watchdog==3.0.0
333 | # via nbdev
334 | watchfiles==0.19.0
335 | # via
336 | # -r requirements/base.txt
337 | # uvicorn
338 | wcwidth==0.2.6
339 | # via prompt-toolkit
340 | webencodings==0.5.1
341 | # via
342 | # -r requirements/base.txt
343 | # bleach
344 | # tinycss2
345 | websockets==11.0.1
346 | # via
347 | # -r requirements/base.txt
348 | # uvicorn
349 | wheel==0.40.0
350 | # via astunparse
351 | zipp==3.15.0
352 | # via
353 | # -r requirements/base.txt
354 | # importlib-metadata
355 | # importlib-resources
356 |
357 | # The following packages are considered to be unsafe in a requirements file:
358 | # pip
359 |
--------------------------------------------------------------------------------
/scripts/docker-build.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -euo pipefail
4 | DOCKER_REPOSITORY="${DOCKER_REPOSITORY:-quay.io/unstructured-io/unstructured}"
5 | PIP_VERSION="${PIP_VERSION:-23.1.2}"
6 | DOCKER_IMAGE="${DOCKER_IMAGE:-unstructured-api-tools:dev}"
7 |
8 | DOCKER_BUILD_CMD=(docker buildx build --load -f Dockerfile \
9 |   --build-arg PIP_VERSION="$PIP_VERSION" \
10 |   --build-arg BUILDKIT_INLINE_CACHE=1 \
11 |   --progress plain \
12 |   --cache-from "$DOCKER_REPOSITORY":latest \
13 |   -t "$DOCKER_IMAGE" .)
14 |
15 | # only build for specific platform if DOCKER_BUILD_PLATFORM is set
16 | if [ -n "${DOCKER_BUILD_PLATFORM:-}" ]; then
17 |   DOCKER_BUILD_CMD+=("--platform=$DOCKER_BUILD_PLATFORM")
18 | fi
19 |
20 | DOCKER_BUILDKIT=1 "${DOCKER_BUILD_CMD[@]}"
21 |
--------------------------------------------------------------------------------
/scripts/shellcheck.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | find scripts -name "*.sh" -exec shellcheck {} +
4 |
5 |
--------------------------------------------------------------------------------
/scripts/version-sync.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | function usage {
3 | echo "Usage: $(basename "$0") [-c] -f FILE_TO_CHANGE REPLACEMENT_FORMAT [-f FILE_TO_CHANGE REPLACEMENT_FORMAT ...]" 2>&1
4 | echo 'Synchronize files to latest version in source file'
5 | echo ' -s Specifies source file for version (default is CHANGELOG.md)'
6 | echo ' -f Specifies a file to change and the format for searching and replacing versions'
7 | echo ' FILE_TO_CHANGE is the file to be updated/checked for updates'
8 | echo ' REPLACEMENT_FORMAT is one of (semver, release, api-release)'
9 | echo ' semver indicates to look for a full semver version and replace with the latest full version'
10 | echo ' release indicates to look for a release semver version (x.x.x) and replace with the latest release version'
11 | echo ' api-release indicates to look for a release semver version in the context of an api route and replace with the latest release version'
12 | echo ' -c Compare versions and output proposed changes without changing anything.'
13 | }
14 |
15 | function getopts-extra () {
16 | declare i=1
17 | # if the next argument is not an option, then append it to array OPTARG
18 | while [[ ${OPTIND} -le $# && ${!OPTIND:0:1} != '-' ]]; do
19 | OPTARG[i]=${!OPTIND}
20 | i+=1
21 | OPTIND+=1
22 | done
23 | }
24 |
25 | # Parse input options
26 | declare CHECK=0
27 | declare SOURCE_FILE="CHANGELOG.md"
28 | declare -a FILES_TO_CHECK=()
29 | declare -a REPLACEMENT_FORMATS=()
30 | declare args
31 | declare OPTIND OPTARG opt
32 | while getopts ":hcs:f:" opt; do
33 | case $opt in
34 | h)
35 | usage
36 | exit 0
37 | ;;
38 | c)
39 | CHECK=1
40 | ;;
41 | s)
42 | SOURCE_FILE="$OPTARG"
43 | ;;
44 | f)
45 | getopts-extra "$@"
46 | args=( "${OPTARG[@]}" )
47 | # validate length of args, should be 2
48 | if [ ${#args[@]} -eq 2 ]; then
49 | FILES_TO_CHECK+=( "${args[0]}" )
50 | REPLACEMENT_FORMATS+=( "${args[1]}" )
51 | else
52 | echo "Exactly 2 arguments must follow -f option." >&2
53 | exit 1
54 | fi
55 | ;;
56 | \?)
57 | echo "Invalid option: -$OPTARG." >&2
58 | usage
59 | exit 1
60 | ;;
61 | esac
62 | done
63 |
64 | # Parse REPLACEMENT_FORMATS
65 | RE_SEMVER_FULL="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-((0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?"
66 | RE_RELEASE="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)"
67 | RE_API_RELEASE="v(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)"
68 | # Pull out semver appearing earliest in SOURCE_FILE.
69 | LAST_VERSION=$(grep -o -m 1 -E "${RE_SEMVER_FULL}" "$SOURCE_FILE")
70 | LAST_RELEASE=$(grep -o -m 1 -E "${RE_RELEASE}($|[^-+])" "$SOURCE_FILE" | grep -o -m 1 -E "${RE_RELEASE}")
71 | LAST_API_RELEASE="v$(grep -o -m 1 -E "${RE_RELEASE}($|[^-+])$" "$SOURCE_FILE" | grep -o -m 1 -E "${RE_RELEASE}")"
72 | declare -a RE_SEMVERS=()
73 | declare -a UPDATED_VERSIONS=()
74 | for i in "${!REPLACEMENT_FORMATS[@]}"; do
75 | REPLACEMENT_FORMAT=${REPLACEMENT_FORMATS[$i]}
76 | case $REPLACEMENT_FORMAT in
77 | semver)
78 | RE_SEMVERS+=( "$RE_SEMVER_FULL" )
79 | UPDATED_VERSIONS+=( "$LAST_VERSION" )
80 | ;;
81 | release)
82 | RE_SEMVERS+=( "$RE_RELEASE" )
83 | UPDATED_VERSIONS+=( "$LAST_RELEASE" )
84 | ;;
85 | api-release)
86 | RE_SEMVERS+=( "$RE_API_RELEASE" )
87 | UPDATED_VERSIONS+=( "$LAST_API_RELEASE" )
88 | ;;
89 | *)
90 | echo "Invalid replacement format: \"${REPLACEMENT_FORMAT}\". Use semver, release, or api-release" >&2
91 | exit 1
92 | ;;
93 | esac
94 | done
95 |
96 | if [ -z "$LAST_VERSION" ];
97 | then
98 | # No match to semver regex in SOURCE_FILE, so no version to go from.
99 | printf "Error: Unable to find latest version from %s.\n" "$SOURCE_FILE"
100 | exit 1
101 | fi
102 |
103 | # Search files in FILES_TO_CHECK and change (or get diffs)
104 | declare FAILED_CHECK=0
105 |
106 | for i in "${!FILES_TO_CHECK[@]}"; do
107 | FILE_TO_CHANGE=${FILES_TO_CHECK[$i]}
108 | RE_SEMVER=${RE_SEMVERS[$i]}
109 | UPDATED_VERSION=${UPDATED_VERSIONS[$i]}
110 | FILE_VERSION=$(grep -o -m 1 -E "${RE_SEMVER}" "$FILE_TO_CHANGE")
111 | if [ -z "$FILE_VERSION" ];
112 | then
113 |         # No match to semver regex in FILE_TO_CHANGE, so nothing to replace
114 | printf "Error: No semver version found in file %s.\n" "$FILE_TO_CHANGE"
115 | exit 1
116 | else
117 |         # Replace semver in FILE_TO_CHANGE with semver obtained from SOURCE_FILE
118 | TMPFILE=$(mktemp /tmp/new_version.XXXXXX)
119 | # Check sed version, exit if version < 4.3
120 | if ! sed --version > /dev/null 2>&1; then
121 | CURRENT_VERSION=1.archaic
122 | else
123 | CURRENT_VERSION=$(sed --version | head -n1 | cut -d" " -f4)
124 | fi
125 | REQUIRED_VERSION="4.3"
126 | if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$CURRENT_VERSION" | sort -V | head -n1)" != "$REQUIRED_VERSION" ]; then
127 | echo "sed version must be >= ${REQUIRED_VERSION}" && exit 1
128 | fi
129 |         sed -E "s/$RE_SEMVER/$UPDATED_VERSION/" "$FILE_TO_CHANGE" > "$TMPFILE"
130 | if [ $CHECK == 1 ];
131 | then
132 | DIFF=$(diff "$FILE_TO_CHANGE" "$TMPFILE" )
133 | if [ -z "$DIFF" ];
134 | then
135 | printf "version sync would make no changes to %s.\n" "$FILE_TO_CHANGE"
136 | rm "$TMPFILE"
137 | else
138 | FAILED_CHECK=1
139 | printf "version sync would make the following changes to %s:\n%s\n" "$FILE_TO_CHANGE" "$DIFF"
140 | rm "$TMPFILE"
141 | fi
142 | else
143 | cp "$TMPFILE" "$FILE_TO_CHANGE"
144 | rm "$TMPFILE"
145 | fi
146 | fi
147 | done
148 |
149 | # Exit with code determined by whether changes were needed in a check.
150 | if [ ${FAILED_CHECK} -ne 0 ]; then
151 | exit 1
152 | else
153 | exit 0
154 | fi
155 |
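156 | # Illustrative invocations (sketch only; the file arguments this repo actually
157 | # passes live elsewhere, e.g. in the Makefile):
158 | #   check only:  ./scripts/version-sync.sh -c -f unstructured_api_tools/__version__.py semver
159 | #   sync:        ./scripts/version-sync.sh -f unstructured_api_tools/__version__.py semver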
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | license_files = LICENSE.md
3 |
4 | [flake8]
5 | max-line-length = 100
6 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | """
2 | setup.py
3 |
4 | unstructured_api_tools - Utilities to manage APIs from notebooks
5 |
6 | Copyright 2022 Unstructured Technologies, Inc.
7 |
8 | Licensed under the Apache License, Version 2.0 (the "License");
9 | you may not use this file except in compliance with the License.
10 | You may obtain a copy of the License at
11 |
12 | http://www.apache.org/licenses/LICENSE-2.0
13 |
14 | Unless required by applicable law or agreed to in writing, software
15 | distributed under the License is distributed on an "AS IS" BASIS,
16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | See the License for the specific language governing permissions and
18 | limitations under the License.
19 | """
20 |
21 | from setuptools import setup, find_packages
22 |
23 | from unstructured_api_tools.__version__ import __version__
24 |
25 | setup(
26 | name="unstructured_api_tools",
27 |     description="Utilities for generating FastAPI applications from pipeline notebooks.",
28 | long_description=open("README.md", "r", encoding="utf-8").read(),
29 | long_description_content_type="text/markdown",
30 | keywords="NLP PDF HTML CV XML parsing preprocessing",
31 | url="https://github.com/Unstructured-IO/unstructured-api-tools",
32 | python_requires=">=3.8.0",
33 | classifiers=[
34 | "Development Status :: 4 - Beta",
35 | "Intended Audience :: Developers",
36 | "Intended Audience :: Education",
37 | "Intended Audience :: Science/Research",
38 | "License :: OSI Approved :: Apache Software License",
39 | "Operating System :: OS Independent",
40 | "Programming Language :: Python :: 3",
41 | "Programming Language :: Python :: 3.8",
42 | "Programming Language :: Python :: 3.9",
43 | "Topic :: Scientific/Engineering :: Artificial Intelligence",
44 | ],
45 | author="Unstructured Technologies",
46 | author_email="mrobinson@unstructuredai.io",
47 | license="Apache-2.0",
48 | packages=find_packages(),
49 | include_package_data=True,
50 | version=__version__,
51 | entry_points={
52 | "console_scripts": "unstructured_api_tools=unstructured_api_tools.cli:cli"
53 | },
54 | install_requires=[
55 | "click>=8.1",
56 | "fastapi",
57 | "Jinja2",
58 | "mypy>=0.99",
59 | "nbconvert",
60 | "python-multipart",
61 | "pandas",
62 | "types-requests",
63 | "types-ujson",
64 | "uvicorn[standard]",
65 | "autoflake"
66 | ],
67 | extras_require={},
68 | )
69 |
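70 | # Once installed, the console script registered above exposes the CLI entry point;
71 | # e.g. `unstructured_api_tools --help` lists the available commands.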
--------------------------------------------------------------------------------
/test_unstructured_api_tools/api/fixtures/example.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/example.jpg
--------------------------------------------------------------------------------
/test_unstructured_api_tools/api/fixtures/example.jpg.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/example.jpg.gz
--------------------------------------------------------------------------------
/test_unstructured_api_tools/api/fixtures/fake-email.msg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/fake-email.msg
--------------------------------------------------------------------------------
/test_unstructured_api_tools/api/fixtures/fake.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/fake.docx
--------------------------------------------------------------------------------
/test_unstructured_api_tools/api/fixtures/fake.docx.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/fake.docx.gz
--------------------------------------------------------------------------------
/test_unstructured_api_tools/api/fixtures/markdown.md:
--------------------------------------------------------------------------------
1 | # Test markdown file
2 |
3 | This is the test markdown file. 100% code coverage is what I aim for.
--------------------------------------------------------------------------------
/test_unstructured_api_tools/api/fixtures/text_file.txt:
--------------------------------------------------------------------------------
1 | this is the test text file
--------------------------------------------------------------------------------
/test_unstructured_api_tools/api/fixtures/text_file.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/text_file.txt.gz
--------------------------------------------------------------------------------
/test_unstructured_api_tools/api/fixtures/text_file_2.txt:
--------------------------------------------------------------------------------
1 | this is another test text file
--------------------------------------------------------------------------------
/test_unstructured_api_tools/api/fixtures/text_file_2.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/api/fixtures/text_file_2.txt.gz
--------------------------------------------------------------------------------
/test_unstructured_api_tools/api/functions_and_variables.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | FILE_DOCX = "fake.docx"
4 | FILE_IMAGE = "example.jpg"
5 | FILE_TXT_1 = "text_file.txt"
6 | FILE_TXT_2 = "text_file_2.txt"
7 | FILE_MARKDOWN = "markdown.md"
8 | FILE_MSG = "fake-email.msg"
9 | FILE_JSON = "spring-weather.html.json"
10 |
11 | GZIP_FILE_DOCX = "fake.docx.gz"
12 | GZIP_FILE_IMAGE = "example.jpg.gz"
13 | GZIP_FILE_TXT_1 = "text_file.txt.gz"
14 | GZIP_FILE_TXT_2 = "text_file_2.txt.gz"
15 |
16 | FILENAME_LENGTHS = {
17 | FILE_DOCX: 36602,
18 | GZIP_FILE_DOCX: 36602,
19 | FILE_IMAGE: 32764,
20 | GZIP_FILE_IMAGE: 32764,
21 | FILE_TXT_1: 26,
22 | GZIP_FILE_TXT_1: 26,
23 | FILE_TXT_2: 30,
24 | GZIP_FILE_TXT_2: 30,
25 | FILE_MARKDOWN: 91,
26 | FILE_MSG: 11776,
27 | FILE_JSON: 13151,
28 | }
29 | FILENAME_FORMATS = {
30 | FILE_DOCX: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
31 | FILE_IMAGE: "image/jpeg",
32 | FILE_TXT_1: "text/plain",
33 | FILE_TXT_2: "text/plain",
34 | GZIP_FILE_DOCX: "application/gzip",
35 | GZIP_FILE_IMAGE: "application/gzip",
36 | GZIP_FILE_TXT_1: "application/gzip",
37 | GZIP_FILE_TXT_2: "application/gzip",
38 | FILE_MARKDOWN: "text/markdown",
39 | FILE_MSG: "message/rfc822",
40 | FILE_JSON: "application/json",
41 | "octet_stream": "application/octet-stream",
42 | }
43 |
44 | P_INPUT_1_SINGLE = {"input1": ["hi"]}
45 | P_INPUT_1_MULTI = {"input1": ["hi", "water is better than ice"]}
46 | P_INPUT_1_EMPTY = {"input1": []}
47 | P_INPUT_2_SINGLE = {"input2": ["hello"]}
48 | P_INPUT_2_MULTI = {"input2": ["hello", "earth is better than mars"]}
49 | P_INPUT_2_EMPTY = {"input2": []}
50 | P_INPUT_1_AND_2_MULTI = {"input2": ["hello", "earth is better than mars"], "input1": ["hi"]}
51 |
52 | JSON = "application/json"
53 | MIXED = "multipart/mixed"
54 | TEXT_CSV = "text/csv"
55 | INVALID = "invalid"
56 |
57 | RESPONSE_SCHEMA_ISD = {"output_schema": "isd"}
58 | RESPONSE_SCHEMA_LABELSTUDIO = {"output_schema": "labelstudio"}
59 |
60 |
61 | def convert_files_for_api(files, use_octet_stream_type=False):
62 | files_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures")
63 | return [
64 | (
65 | "files",
66 | (
67 | test_file,
68 | open(os.path.join(files_path, test_file), "rb"),
69 | FILENAME_FORMATS["octet_stream" if use_octet_stream_type else test_file],
70 | ),
71 | )
72 | for test_file in files
73 | ]
74 |
75 |
76 | def convert_text_files_for_api(files, use_octet_stream_type=False):
77 | files_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures")
78 | return [
79 | (
80 | "text_files",
81 | (
82 | test_file,
83 | open(os.path.join(files_path, test_file), "rb"),
84 | FILENAME_FORMATS["octet_stream" if use_octet_stream_type else test_file],
85 | ),
86 | )
87 | for test_file in files
88 | ]
89 |
90 |
91 | def generate_header_kwargs(value=None):
92 | return (
93 | {
94 | "headers": {
95 | "Accept": value,
96 | }
97 | }
98 | if value
99 | else {}
100 | )
101 |
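102 |
103 |
104 | # A minimal sketch of how these helpers compose in a request, assuming a
105 | # TestClient bound to the generated app (the route below is defined in
106 | # prepline_test_project/api/process_file_1.py):
107 | #     client.post(
108 | #         "/test-project/v1/process-file-1",
109 | #         files=convert_files_for_api([FILE_DOCX]),
110 | #         **generate_header_kwargs(JSON),
111 | #     )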
--------------------------------------------------------------------------------
/test_unstructured_api_tools/api/test_docs.py:
--------------------------------------------------------------------------------
1 | from starlette.testclient import TestClient
2 | from prepline_test_project.api.app import app
3 |
4 | DOCS_ROUTE = "/test-project/docs"
5 | OPENAPI_ROUTE = "/test-project/openapi.json"
6 | HEALTHCHECK_ROUTE = "/healthcheck"
7 |
8 | client = TestClient(app)
9 |
10 |
11 | def test_openapi():
12 | response = client.get(OPENAPI_ROUTE)
13 | assert response.status_code == 200
14 |
15 |
16 | def test_docs():
17 | response = client.get(DOCS_ROUTE)
18 | assert response.status_code == 200
19 |
20 |
21 | def test_healthcheck():
22 | response = client.get(HEALTHCHECK_ROUTE)
23 | assert response.status_code == 200
24 | assert response.json() == {"healthcheck": "HEALTHCHECK STATUS: EVERYTHING OK!"}
25 |
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipeline-test-project/README.md:
--------------------------------------------------------------------------------
1 | This directory is the base of a barebones preprocessing-pipeline project
2 | used for the generation of FastAPI apps, which are then used as test fixtures.
3 |
4 | It includes notebooks under pipeline-notebooks/ as is normally the case
5 | for pipeline projects. APIs are generated and checked by the Makefile
6 | in the root of the unstructured-api-tools repo.
7 |
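8 | As a rough sketch (the authoritative invocation lives in that Makefile, and the
9 | flags shown here are indicative rather than verbatim), generation looks like:
10 |
11 |     unstructured_api_tools convert-pipeline-notebooks \
12 |         --input-directory pipeline-notebooks \
13 |         --output-directory prepline_test_project/api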
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-file-1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "3931743a",
6 | "metadata": {},
7 | "source": [
8 | "# File Processing Pipeline"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "id": "d83dab2a",
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import os"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "id": "7cb5e00b",
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "# pipeline-api\n",
29 | "\n",
30 |     "# test that a duplicate import gets handled correctly as this gets imported via the template as well\n",
31 | "import json\n",
32 | "\n",
33 | "# test accessing os in a #pipeline-api cell does not break things\n",
34 | "_ = os.environ\n",
35 | "\n",
36 | "def pipeline_api(\n",
37 | " file,\n",
38 | " filename=None,\n",
39 | " file_content_type=None,\n",
40 | " m_input2=[],\n",
41 | "):\n",
42 | " return {\"silly_result\": ' : '.join([str(len(file.read())),\n",
43 | " filename,\n",
44 | " file_content_type,\n",
45 | " str(m_input2)])}"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "id": "65911889",
52 | "metadata": {},
53 | "outputs": [
54 | {
55 | "name": "stdout",
56 | "output_type": "stream",
57 | "text": [
58 | "{'silly_result': \"17 : temp-file.txt : text/plain : ['my', 'inputs']\"}\n"
59 | ]
60 | }
61 | ],
62 | "source": [
63 | "import tempfile\n",
64 | "with tempfile.TemporaryFile() as fp:\n",
65 | " fp.write(b'This is some data')\n",
66 | " fp.seek(0)\n",
67 | " print(\n",
68 | " pipeline_api(\n",
69 | " fp,\n",
70 | " filename=\"temp-file.txt\",\n",
71 | " file_content_type=\"text/plain\",\n",
72 | " m_input2=[\"my\",\"inputs\"]\n",
73 | " )\n",
74 | " )"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "id": "edce40fa",
81 | "metadata": {},
82 | "outputs": [],
83 | "source": []
84 | }
85 | ],
86 | "metadata": {
87 | "kernelspec": {
88 | "display_name": "python3",
89 | "language": "python",
90 | "name": "python3"
91 | }
92 | },
93 | "nbformat": 4,
94 | "nbformat_minor": 5
95 | }
96 |
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-file-2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# File Processing Pipeline"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# pipeline-api\n",
17 | "def pipeline_api(\n",
18 | " file\n",
19 | "):\n",
20 | " return {\"silly_result\": ' : '.join([str(len(file.read()))])}"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "name": "stdout",
30 | "output_type": "stream",
31 | "text": [
32 | "{'silly_result': '17'}\n"
33 | ]
34 | }
35 | ],
36 | "source": [
37 | "import tempfile\n",
38 | "with tempfile.TemporaryFile() as fp:\n",
39 | " fp.write(b'This is some data')\n",
40 | " fp.seek(0)\n",
41 | " print(pipeline_api(fp))"
42 | ]
43 | }
44 | ],
45 | "metadata": {
46 | "kernelspec": {
47 | "display_name": "python3",
48 | "language": "python",
49 | "name": "python3"
50 | }
51 | },
52 | "nbformat": 4,
53 | "nbformat_minor": 0
54 | }
55 |
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-file-3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# File Processing Pipeline"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# pipeline-api\n",
17 | "import pandas as pd\n",
18 | "def pipeline_api(\n",
19 | " file, response_type=\"text/csv\", response_schema=\"isd\"\n",
20 | "):\n",
21 | " data = pd.DataFrame(data={\"silly_result\": [str(len(file.read())), str(response_type), str(response_schema)]})\n",
22 | " if response_type == \"text/csv\":\n",
23 | " return data.to_csv()\n",
24 | " else:\n",
25 | " text = \" : \".join(list(data[\"silly_result\"]))\n",
26 | " return {\"silly_result\": text}"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {},
33 | "outputs": [
34 | {
35 | "name": "stdout",
36 | "output_type": "stream",
37 | "text": [
38 | ",silly_result\n",
39 | "0,17\n",
40 | "1,text/csv\n",
41 | "2,isd\n",
42 | "\n"
43 | ]
44 | }
45 | ],
46 | "source": [
47 | "import tempfile\n",
48 | "with tempfile.TemporaryFile() as fp:\n",
49 | " fp.write(b'This is some data')\n",
50 | " fp.seek(0)\n",
51 | " print(pipeline_api(file=fp, response_type=\"text/csv\", response_schema=\"isd\"))"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": []
60 | }
61 | ],
62 | "metadata": {
63 | "kernelspec": {
64 | "display_name": "python3",
65 | "language": "python",
66 | "name": "python3"
67 | }
68 | },
69 | "nbformat": 4,
70 | "nbformat_minor": 1
71 | }
72 |
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-file-4.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# File Processing Pipeline"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# pipeline-api\n",
17 | "def pipeline_api(\n",
18 | " file,\n",
19 | " file_content_type=None,\n",
20 | " response_type=\"application/json\",\n",
21 | " response_schema=\"labelstudio\",\n",
22 | " m_input1=[]\n",
23 | "):\n",
24 | " return {\"silly_result\": ' : '.join([\n",
25 | " str(len(file.read())),\n",
26 | " str(file_content_type),\n",
27 | " str(response_type),\n",
28 | " str(response_schema),\n",
29 | " str(m_input1)\n",
30 | " ])}"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "name": "stdout",
40 | "output_type": "stream",
41 | "text": [
42 | "{'silly_result': \"17 : None : application/json : isd : ['input1', 'input2']\"}\n"
43 | ]
44 | }
45 | ],
46 | "source": [
47 | "import tempfile\n",
48 | "with tempfile.TemporaryFile() as fp:\n",
49 | " fp.write(b'This is some data')\n",
50 | " fp.seek(0)\n",
51 | " print(\n",
52 | " pipeline_api(\n",
53 | " fp,\n",
54 | " None,\n",
55 | " \"application/json\",\n",
56 | " \"isd\",\n",
57 | " [\"input1\", \"input2\"]\n",
58 | " )\n",
59 | " )"
60 | ]
61 | }
62 | ],
63 | "metadata": {
64 | "kernelspec": {
65 | "display_name": "python3",
66 | "language": "python",
67 | "name": "python3"
68 | }
69 | },
70 | "nbformat": 4,
71 | "nbformat_minor": 0
72 | }
73 |
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-file-5.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# File Processing Pipeline"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# pipeline-api\n",
17 | "import pandas as pd\n",
18 | "\n",
19 | "def pipeline_api(\n",
20 | " file,\n",
21 | " file_content_type=None,\n",
22 | " response_type=\"application/json\",\n",
23 | " response_schema=\"labelstudio\",\n",
24 | " m_input1=[],\n",
25 | " m_input2=[],\n",
26 | "):\n",
27 | " data = pd.DataFrame(data={\"silly_result\": [\n",
28 | " str(len(file.read())),\n",
29 | " str(file_content_type),\n",
30 | " str(response_type),\n",
31 | " str(response_schema),\n",
32 | " str(m_input1),\n",
33 | " str(m_input2),\n",
34 | " ]})\n",
35 | " if response_type == \"text/csv\":\n",
36 | " return data.to_csv()\n",
37 | " else:\n",
38 | " text = \" : \".join(list(data[\"silly_result\"]))\n",
39 | " return {\"silly_result\": text}"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "name": "stdout",
49 | "output_type": "stream",
50 | "text": [
51 | "{'silly_result': \"17 : None : application/json : isd : ['input1', 'input2'] : ['m_input2']\"}\n"
52 | ]
53 | }
54 | ],
55 | "source": [
56 | "import tempfile\n",
57 | "with tempfile.TemporaryFile() as fp:\n",
58 | " fp.write(b'This is some data')\n",
59 | " fp.seek(0)\n",
60 | " print(\n",
61 | " pipeline_api(\n",
62 | " fp,\n",
63 | " None,\n",
64 | " \"application/json\",\n",
65 | " \"isd\",\n",
66 | " [\"input1\", \"input2\"],\n",
67 | " [\"m_input2\"]\n",
68 | " )\n",
69 | " )"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": []
78 | }
79 | ],
80 | "metadata": {
81 | "kernelspec": {
82 | "display_name": "python3",
83 | "language": "python",
84 | "name": "python3"
85 | }
86 | },
87 | "nbformat": 4,
88 | "nbformat_minor": 1
89 | }
90 |
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Text Processing Pipeline"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# pipeline-api\n",
17 | "def pipeline_api(\n",
18 | " text,\n",
19 | "):\n",
20 | " return {\"silly_result\": ' : '.join([str(len(text)), text])}"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "name": "stdout",
30 | "output_type": "stream",
31 | "text": [
32 | "{'silly_result': '9 : some text'}\n"
33 | ]
34 | }
35 | ],
36 | "source": [
37 | "print(pipeline_api(\"some text\"))"
38 | ]
39 | }
40 | ],
41 | "metadata": {
42 | "kernelspec": {
43 | "display_name": "python3",
44 | "language": "python",
45 | "name": "python3"
46 | }
47 | },
48 | "nbformat": 4,
49 | "nbformat_minor": 1
50 | }
51 |
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Text Processing Pipeline"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# pipeline-api\n",
17 | "def pipeline_api(\n",
18 | " text,\n",
19 | " m_input1=[],\n",
20 | " m_input2=[]\n",
21 | "):\n",
22 | " return {\"silly_result\": ' : '.join([\n",
23 | " str(len(text)),\n",
24 | " text,\n",
25 | " str(m_input1),\n",
26 | " str(m_input2)\n",
27 | " ])}"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "{'silly_result': \"9 : some text : ['first_input'] : ['last', 'input']\"}\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "print(pipeline_api(\"some text\", m_input1=[\"first_input\"], m_input2=[\"last\", \"input\"]))"
45 | ]
46 | }
47 | ],
48 | "metadata": {
49 | "kernelspec": {
50 | "display_name": "python3",
51 | "language": "python",
52 | "name": "python3"
53 | }
54 | },
55 | "nbformat": 4,
56 | "nbformat_minor": 1
57 | }
58 |
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "bafce76f",
6 | "metadata": {},
7 | "source": [
8 | "# Text Processing Pipeline"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "id": "2524a9a4",
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# pipeline-api\n",
19 | "import pandas as pd\n",
20 | "def pipeline_api(\n",
21 | " text,\n",
22 | " response_type=\"text/csv\"\n",
23 | "):\n",
24 | " data = pd.DataFrame(data={\"silly_result\": [str(len(text)), text, str(response_type)]})\n",
25 | " if response_type == \"text/csv\":\n",
26 | " return data.to_csv()\n",
27 | " else:\n",
28 | " text = \" : \".join(list(data[\"silly_result\"]))\n",
29 | " return {\"silly_result\": text}"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "id": "6a876bdf",
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "name": "stdout",
40 | "output_type": "stream",
41 | "text": [
42 | ",silly_result\n",
43 | "0,9\n",
44 | "1,some text\n",
45 | "2,text/csv\n",
46 | "\n"
47 | ]
48 | }
49 | ],
50 | "source": [
51 | "print(pipeline_api(\"some text\", \"text/csv\"))"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "id": "83f27184",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": []
61 | }
62 | ],
63 | "metadata": {
64 | "kernelspec": {
65 | "display_name": "python3",
66 | "language": "python",
67 | "name": "python3"
68 | }
69 | },
70 | "nbformat": 4,
71 | "nbformat_minor": 5
72 | }
73 |
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-4.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Text Processing Pipeline"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# pipeline-api\n",
17 | "import pandas as pd\n",
18 | "def pipeline_api(\n",
19 | " text,\n",
20 | " response_type=\"text/csv\",\n",
21 | " response_schema=\"isd\",\n",
22 | "):\n",
23 | " data = pd.DataFrame(data={\"silly_result\": [str(len(text)), text, str(response_type), str(response_schema)]})\n",
24 | " if response_type == \"text/csv\":\n",
25 | " return data.to_csv()\n",
26 | " else:\n",
27 | " text = \" : \".join(list(data[\"silly_result\"]))\n",
28 | " return {\"silly_result\": text}"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {},
35 | "outputs": [
36 | {
37 | "name": "stdout",
38 | "output_type": "stream",
39 | "text": [
40 | ",silly_result\n",
41 | "0,9\n",
42 | "1,some text\n",
43 | "2,text/csv\n",
44 | "3,isd\n",
45 | "\n"
46 | ]
47 | }
48 | ],
49 | "source": [
50 | "print(pipeline_api(\"some text\", \"text/csv\", \"isd\"))"
51 | ]
52 | }
53 | ],
54 | "metadata": {
55 | "kernelspec": {
56 | "display_name": "python3",
57 | "language": "python",
58 | "name": "python3"
59 | }
60 | },
61 | "nbformat": 4,
62 | "nbformat_minor": 1
63 | }
64 |
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-file-1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Text & File Processing Pipeline"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# pipeline-api\n",
17 | "def pipeline_api(\n",
18 | " text,\n",
19 | " file=None,\n",
20 | " filename=None,\n",
21 | " file_content_type=None,\n",
22 | "):\n",
23 | " return {\"silly_result\": ' : '.join([\n",
24 | " str(len(text if text else \"\")),\n",
25 | " str(text),\n",
26 | " str(len(file.read()) if file else None),\n",
27 | " str(filename),\n",
28 | " str(file_content_type),\n",
29 | " ])}"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "metadata": {},
36 | "outputs": [
37 | {
38 | "name": "stdout",
39 | "output_type": "stream",
40 | "text": [
41 | "{'silly_result': '9 : some text : 17 : temp-file.txt : None'}\n"
42 | ]
43 | }
44 | ],
45 | "source": [
46 | "import tempfile\n",
47 | "with tempfile.TemporaryFile() as fp:\n",
48 | " fp.write(b'This is some data')\n",
49 | " fp.seek(0)\n",
50 | " print(pipeline_api(\n",
51 | " text=\"some text\",\n",
52 | " file=fp,\n",
53 | " file_content_type=None,\n",
54 | " filename=\"temp-file.txt\"\n",
55 | " ))"
56 | ]
57 | }
58 | ],
59 | "metadata": {
60 | "kernelspec": {
61 | "display_name": "python3",
62 | "language": "python",
63 | "name": "python3"
64 | }
65 | },
66 | "nbformat": 4,
67 | "nbformat_minor": 1
68 | }
69 |
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-file-2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Text & File Processing Pipeline"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# pipeline-api\n",
17 | "import pandas as pd\n",
18 | "def pipeline_api(\n",
19 | " text,\n",
20 | " file=None,\n",
21 | " filename=None,\n",
22 | " file_content_type=None,\n",
23 | " response_type=\"application/json\",\n",
24 | " m_input2=[]\n",
25 | "):\n",
26 | " data = pd.DataFrame(data={\"silly_result\": [\n",
27 | " str(len(text if text else \"\")),\n",
28 | " str(text),\n",
29 | " str(len(file.read()) if file else None),\n",
30 | " str(filename),\n",
31 | " str(file_content_type),\n",
32 | " str(response_type),\n",
33 | " str(m_input2)\n",
34 | " ]})\n",
35 | " if response_type == \"text/csv\":\n",
36 | " return data.to_csv()\n",
37 | " else:\n",
38 | " text = \" : \".join(list(data[\"silly_result\"]))\n",
39 | " return {\"silly_result\": text}"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "name": "stdout",
49 | "output_type": "stream",
50 | "text": [
51 | "{'silly_result': \"9 : some text : 17 : temp-file.txt : None : application/json : ['input1', 'input2']\"}\n"
52 | ]
53 | }
54 | ],
55 | "source": [
56 | "import tempfile\n",
57 | "with tempfile.TemporaryFile() as fp:\n",
58 | " fp.write(b'This is some data')\n",
59 | " fp.seek(0)\n",
60 | " print(pipeline_api(\n",
61 | " text=\"some text\",\n",
62 | " file=fp,\n",
63 | " file_content_type=None,\n",
64 | " filename=\"temp-file.txt\",\n",
65 | " response_type=\"application/json\",\n",
66 | " m_input2=[\"input1\", \"input2\"]\n",
67 | " ))"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": []
76 | }
77 | ],
78 | "metadata": {
79 | "kernelspec": {
80 | "display_name": "python3",
81 | "language": "python",
82 | "name": "python3"
83 | }
84 | },
85 | "nbformat": 4,
86 | "nbformat_minor": 1
87 | }
88 |
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-file-3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Text & File Processing Pipeline"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# pipeline-api\n",
17 | "import pandas as pd\n",
18 | "def pipeline_api(\n",
19 | " text,\n",
20 | " file=None,\n",
21 | " filename=None,\n",
22 | " file_content_type=None,\n",
23 | " response_type=\"application/json\",\n",
24 | " response_schema=\"isd\"\n",
25 | "):\n",
26 | " data = pd.DataFrame(data={\"silly_result\": [\n",
27 | " str(len(text if text else \"\")),\n",
28 | " str(text),\n",
29 | " str(len(file.read()) if file else None),\n",
30 | " str(filename),\n",
31 | " str(file_content_type),\n",
32 | " str(response_type),\n",
33 | " str(response_schema)\n",
34 | " ]})\n",
35 | " if response_type == \"text/csv\":\n",
36 | " return data.to_csv()\n",
37 | " else:\n",
38 | " text = \" : \".join(list(data[\"silly_result\"]))\n",
39 | " return {\"silly_result\": text}"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "name": "stdout",
49 | "output_type": "stream",
50 | "text": [
51 | "{'silly_result': '9 : some text : 17 : temp-file.txt : None : application/json : isd'}\n"
52 | ]
53 | }
54 | ],
55 | "source": [
56 | "import tempfile\n",
57 | "with tempfile.TemporaryFile() as fp:\n",
58 | " fp.write(b'This is some data')\n",
59 | " fp.seek(0)\n",
60 | " print(pipeline_api(\n",
61 | " text=\"some text\",\n",
62 | " file=fp,\n",
63 | " file_content_type=None,\n",
64 | " filename=\"temp-file.txt\",\n",
65 | " response_type=\"application/json\",\n",
66 | " response_schema=\"isd\"\n",
67 | " ))"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": []
76 | }
77 | ],
78 | "metadata": {
79 | "kernelspec": {
80 | "display_name": "python3",
81 | "language": "python",
82 | "name": "python3"
83 | }
84 | },
85 | "nbformat": 4,
86 | "nbformat_minor": 1
87 | }
88 |
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks/pipeline-process-text-file-4.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Text & File Processing Pipeline"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# pipeline-api\n",
17 | "import pandas as pd\n",
18 | "def pipeline_api(\n",
19 | " text,\n",
20 | " file=None,\n",
21 | " filename=None,\n",
22 | " file_content_type=None,\n",
23 | " response_type=\"application/json\",\n",
24 | " response_schema=\"isd\",\n",
25 | " m_input1=[],\n",
26 | " m_input2=[]\n",
27 | "):\n",
28 | " data = pd.DataFrame(data={\"silly_result\": [\n",
29 | " str(len(text if text else \"\")),\n",
30 | " str(text),\n",
31 | " str(len(file.read()) if file else None),\n",
32 | " str(filename),\n",
33 | " str(file_content_type),\n",
34 | " str(response_type),\n",
35 | " str(response_schema),\n",
36 | " str(m_input1),\n",
37 | " str(m_input2),\n",
38 | " ]})\n",
39 | " if response_type == \"text/csv\":\n",
40 | " return data.to_csv()\n",
41 | " else:\n",
42 | " text = \" : \".join(list(data[\"silly_result\"]))\n",
43 | " return {\"silly_result\": text}"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "name": "stdout",
53 | "output_type": "stream",
54 | "text": [
55 | "{'silly_result': \"9 : some text : 17 : temp-file.txt : None : application/json : isd : ['input1'] : ['input2', 'input3']\"}\n"
56 | ]
57 | }
58 | ],
59 | "source": [
60 | "import tempfile\n",
61 | "with tempfile.TemporaryFile() as fp:\n",
62 | " fp.write(b'This is some data')\n",
63 | " fp.seek(0)\n",
64 | " print(pipeline_api(\n",
65 | " text=\"some text\",\n",
66 | " file=fp,\n",
67 | " file_content_type=None,\n",
68 | " filename=\"temp-file.txt\",\n",
69 | " response_type=\"application/json\",\n",
70 | " response_schema=\"isd\",\n",
71 | " m_input1=[\"input1\"],\n",
72 | " m_input2=[\"input2\", \"input3\"]\n",
73 | " ))"
74 | ]
75 | }
76 | ],
77 | "metadata": {
78 | "kernelspec": {
79 | "display_name": "python3",
80 | "language": "python",
81 | "name": "python3"
82 | }
83 | },
84 | "nbformat": 4,
85 | "nbformat_minor": 1
86 | }
87 |
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/__init__.py
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/app.py:
--------------------------------------------------------------------------------
1 | #####################################################################
2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS.
3 | # DO NOT MODIFY DIRECTLY
4 | #####################################################################
5 |
6 |
7 | from fastapi import FastAPI, Request, status
8 | import logging
9 | import os
10 |
11 | from .process_file_1 import router as process_file_1_router
12 | from .process_file_2 import router as process_file_2_router
13 | from .process_file_3 import router as process_file_3_router
14 | from .process_file_4 import router as process_file_4_router
15 | from .process_file_5 import router as process_file_5_router
16 | from .process_text_1 import router as process_text_1_router
17 | from .process_text_2 import router as process_text_2_router
18 | from .process_text_3 import router as process_text_3_router
19 | from .process_text_4 import router as process_text_4_router
20 | from .process_text_file_1 import router as process_text_file_1_router
21 | from .process_text_file_2 import router as process_text_file_2_router
22 | from .process_text_file_3 import router as process_text_file_3_router
23 | from .process_text_file_4 import router as process_text_file_4_router
24 |
25 |
26 | app = FastAPI(
27 | title="Unstructured Pipeline API",
28 | description="""""",
29 | version="1.0.0",
30 | docs_url="/test-project/docs",
31 | openapi_url="/test-project/openapi.json",
32 | )
33 |
34 | allowed_origins = os.environ.get("ALLOWED_ORIGINS", None)
35 | if allowed_origins:
36 | from fastapi.middleware.cors import CORSMiddleware
37 |
38 | app.add_middleware(
39 | CORSMiddleware,
40 | allow_origins=allowed_origins.split(","),
41 | allow_methods=["OPTIONS", "POST"],
42 | allow_headers=["Content-Type"],
43 | )
44 |
45 | app.include_router(process_file_1_router)
46 | app.include_router(process_file_2_router)
47 | app.include_router(process_file_3_router)
48 | app.include_router(process_file_4_router)
49 | app.include_router(process_file_5_router)
50 | app.include_router(process_text_1_router)
51 | app.include_router(process_text_2_router)
52 | app.include_router(process_text_3_router)
53 | app.include_router(process_text_4_router)
54 | app.include_router(process_text_file_1_router)
55 | app.include_router(process_text_file_2_router)
56 | app.include_router(process_text_file_3_router)
57 | app.include_router(process_text_file_4_router)
58 |
59 |
60 | # Filter out /healthcheck noise
61 | class HealthCheckFilter(logging.Filter):
62 | def filter(self, record: logging.LogRecord) -> bool:
63 | return record.getMessage().find("/healthcheck") == -1
64 |
65 |
66 | # Filter out /metrics noise
67 | class MetricsCheckFilter(logging.Filter):
68 | def filter(self, record: logging.LogRecord) -> bool:
69 | return record.getMessage().find("/metrics") == -1
70 |
71 |
72 | logging.getLogger("uvicorn.access").addFilter(HealthCheckFilter())
73 | logging.getLogger("uvicorn.access").addFilter(MetricsCheckFilter())
74 |
75 |
76 | @app.get("/healthcheck", status_code=status.HTTP_200_OK, include_in_schema=False)
77 | def healthcheck(request: Request):
78 | return {"healthcheck": "HEALTHCHECK STATUS: EVERYTHING OK!"}
79 |
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_1.py:
--------------------------------------------------------------------------------
1 | #####################################################################
2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS.
3 | # DO NOT MODIFY DIRECTLY
4 | #####################################################################
5 |
6 | import io
7 | import os
8 | import gzip
9 | import mimetypes
10 | from typing import List, Union
11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException
12 | import json
13 | from fastapi.responses import StreamingResponse
14 | from starlette.datastructures import Headers
15 | from starlette.types import Send
16 | from base64 import b64encode
17 | from typing import Optional, Mapping
18 | import secrets
19 |
20 |
21 | app = FastAPI()
22 | router = APIRouter()
23 |
24 |
25 | # pipeline-api
26 |
27 | # test that a duplicate import gets handled correctly as this gets imported via the template as well
28 |
29 | # test accessing os in a #pipeline-api cell does not break things
30 | _ = os.environ
31 |
32 |
33 | def pipeline_api(
34 | file,
35 | filename=None,
36 | file_content_type=None,
37 | m_input2=[],
38 | ):
39 | return {
40 | "silly_result": " : ".join(
41 | [str(len(file.read())), filename, file_content_type, str(m_input2)]
42 | )
43 | }
44 |
45 |
46 | def get_validated_mimetype(file):
47 | """
48 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too
49 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and
50 | return HTTP 400 for an invalid type.
51 | """
52 | content_type = file.content_type
53 | if not content_type or content_type == "application/octet-stream":
54 | content_type = mimetypes.guess_type(str(file.filename))[0]
55 |
56 |     # Some filetypes are missing from the mimetypes library, so hardcode them for now
57 | if not content_type:
58 | if file.filename.endswith(".md"):
59 | content_type = "text/markdown"
60 | elif file.filename.endswith(".msg"):
61 | content_type = "message/rfc822"
62 |
63 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
64 | if allowed_mimetypes_str is not None:
65 | allowed_mimetypes = allowed_mimetypes_str.split(",")
66 |
67 | if content_type not in allowed_mimetypes:
68 | raise HTTPException(
69 | status_code=400,
70 | detail=(
71 | f"Unable to process {file.filename}: "
72 | f"File type {content_type} is not supported."
73 | ),
74 | )
75 |
76 | return content_type
77 |
78 |
79 | class MultipartMixedResponse(StreamingResponse):
80 | CRLF = b"\r\n"
81 |
82 | def __init__(self, *args, content_type: str = None, **kwargs):
83 | super().__init__(*args, **kwargs)
84 | self.content_type = content_type
85 |
86 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None:
87 | super().init_headers(headers)
88 | self.boundary_value = secrets.token_hex(16)
89 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"'
90 | self.raw_headers.append((b"content-type", content_type.encode("latin-1")))
91 |
92 | @property
93 | def boundary(self):
94 | return b"--" + self.boundary_value.encode()
95 |
96 | def _build_part_headers(self, headers: dict) -> bytes:
97 | header_bytes = b""
98 | for header, value in headers.items():
99 | header_bytes += f"{header}: {value}".encode() + self.CRLF
100 | return header_bytes
101 |
102 | def build_part(self, chunk: bytes) -> bytes:
103 | part = self.boundary + self.CRLF
104 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"}
105 | if self.content_type is not None:
106 | part_headers["Content-Type"] = self.content_type
107 | part += self._build_part_headers(part_headers)
108 | part += self.CRLF + chunk + self.CRLF
109 | return part
110 |
111 | async def stream_response(self, send: Send) -> None:
112 | await send(
113 | {
114 | "type": "http.response.start",
115 | "status": self.status_code,
116 | "headers": self.raw_headers,
117 | }
118 | )
119 | async for chunk in self.body_iterator:
120 | if not isinstance(chunk, bytes):
121 | chunk = chunk.encode(self.charset)
122 | chunk = b64encode(chunk)
123 | await send(
124 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True}
125 | )
126 |
127 | await send({"type": "http.response.body", "body": b"", "more_body": False})
128 |
129 |
130 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile:
131 | def return_content_type(filename):
132 | if gz_uncompressed_content_type:
133 | return gz_uncompressed_content_type
134 | else:
135 | return str(mimetypes.guess_type(filename)[0])
136 |
137 | filename = str(file.filename) if file.filename else ""
138 | if filename.endswith(".gz"):
139 | filename = filename[:-3]
140 |
141 | gzip_file = gzip.open(file.file).read()
142 | return UploadFile(
143 | file=io.BytesIO(gzip_file),
144 | size=len(gzip_file),
145 | filename=filename,
146 | headers=Headers({"content-type": return_content_type(filename)}),
147 | )
148 |
149 |
150 | @router.post("/test-project/v1/process-file-1")
151 | @router.post("/test-project/v1.2.3/process-file-1")
152 | def pipeline_1(
153 | request: Request,
154 | gz_uncompressed_content_type: Optional[str] = Form(default=None),
155 | files: Union[List[UploadFile], None] = File(default=None),
156 | input2: List[str] = Form(default=[]),
157 | ):
158 | if files:
159 | for file_index in range(len(files)):
160 | if files[file_index].content_type == "application/gzip":
161 | files[file_index] = ungz_file(files[file_index], gz_uncompressed_content_type)
162 |
163 | content_type = request.headers.get("Accept")
164 |
165 | if isinstance(files, list) and len(files):
166 | if len(files) > 1:
167 | if content_type and content_type not in [
168 | "*/*",
169 | "multipart/mixed",
170 | "application/json",
171 | "text/csv",
172 | ]:
173 | raise HTTPException(
174 | detail=(
175 | f"Conflict in media type {content_type}"
176 | ' with response type "multipart/mixed".\n'
177 | ),
178 | status_code=status.HTTP_406_NOT_ACCEPTABLE,
179 | )
180 |
181 | def response_generator(is_multipart):
182 | for file in files:
183 | file_content_type = get_validated_mimetype(file)
184 |
185 | _file = file.file
186 |
187 | response = pipeline_api(
188 | _file,
189 | m_input2=input2,
190 | filename=file.filename,
191 | file_content_type=file_content_type,
192 | )
193 |
194 | if is_multipart:
195 | if type(response) not in [str, bytes]:
196 | response = json.dumps(response)
197 | yield response
198 |
199 | if content_type == "multipart/mixed":
200 | return MultipartMixedResponse(
201 | response_generator(is_multipart=True),
202 | )
203 | else:
204 | return (
205 | list(response_generator(is_multipart=False))[0]
206 | if len(files) == 1
207 | else response_generator(is_multipart=False)
208 | )
209 | else:
210 | raise HTTPException(
211 | detail='Request parameter "files" is required.\n',
212 | status_code=status.HTTP_400_BAD_REQUEST,
213 | )
214 |
215 |
216 | app.include_router(router)
217 |
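218 | # Illustrative request against the route above (a sketch: host, port, and the
219 | # fixture path are assumptions, not taken from this file):
220 | #   curl -X POST http://localhost:8000/test-project/v1/process-file-1 \
221 | #     -H "Accept: application/json" \
222 | #     -F "files=@test_unstructured_api_tools/api/fixtures/fake.docx" \
223 | #     -F "input2=hello" -F "input2=world"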
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_2.py:
--------------------------------------------------------------------------------
1 | #####################################################################
2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS.
3 | # DO NOT MODIFY DIRECTLY
4 | #####################################################################
5 |
6 | import io
7 | import os
8 | import gzip
9 | import mimetypes
10 | from typing import List, Union
11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException
12 | import json
13 | from fastapi.responses import StreamingResponse
14 | from starlette.datastructures import Headers
15 | from starlette.types import Send
16 | from base64 import b64encode
17 | from typing import Optional, Mapping
18 | import secrets
19 |
20 |
21 | app = FastAPI()
22 | router = APIRouter()
23 |
24 |
25 | # pipeline-api
26 | def pipeline_api(file):
27 | return {"silly_result": " : ".join([str(len(file.read()))])}
28 |
29 |
30 | def get_validated_mimetype(file):
31 | """
32 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too
33 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and
34 | return HTTP 400 for an invalid type.
35 | """
36 | content_type = file.content_type
37 | if not content_type or content_type == "application/octet-stream":
38 | content_type = mimetypes.guess_type(str(file.filename))[0]
39 |
40 |     # Some filetypes are missing from the mimetypes library, so hardcode them for now
41 | if not content_type:
42 | if file.filename.endswith(".md"):
43 | content_type = "text/markdown"
44 | elif file.filename.endswith(".msg"):
45 | content_type = "message/rfc822"
46 |
47 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
48 | if allowed_mimetypes_str is not None:
49 | allowed_mimetypes = allowed_mimetypes_str.split(",")
50 |
51 | if content_type not in allowed_mimetypes:
52 | raise HTTPException(
53 | status_code=400,
54 | detail=(
55 | f"Unable to process {file.filename}: "
56 | f"File type {content_type} is not supported."
57 | ),
58 | )
59 |
60 | return content_type
61 |
62 |
63 | class MultipartMixedResponse(StreamingResponse):
64 | CRLF = b"\r\n"
65 |
66 | def __init__(self, *args, content_type: str = None, **kwargs):
67 | super().__init__(*args, **kwargs)
68 | self.content_type = content_type
69 |
70 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None:
71 | super().init_headers(headers)
72 | self.boundary_value = secrets.token_hex(16)
73 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"'
74 | self.raw_headers.append((b"content-type", content_type.encode("latin-1")))
75 |
76 | @property
77 | def boundary(self):
78 | return b"--" + self.boundary_value.encode()
79 |
80 | def _build_part_headers(self, headers: dict) -> bytes:
81 | header_bytes = b""
82 | for header, value in headers.items():
83 | header_bytes += f"{header}: {value}".encode() + self.CRLF
84 | return header_bytes
85 |
86 | def build_part(self, chunk: bytes) -> bytes:
87 | part = self.boundary + self.CRLF
88 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"}
89 | if self.content_type is not None:
90 | part_headers["Content-Type"] = self.content_type
91 | part += self._build_part_headers(part_headers)
92 | part += self.CRLF + chunk + self.CRLF
93 | return part
94 |
95 | async def stream_response(self, send: Send) -> None:
96 | await send(
97 | {
98 | "type": "http.response.start",
99 | "status": self.status_code,
100 | "headers": self.raw_headers,
101 | }
102 | )
103 | async for chunk in self.body_iterator:
104 | if not isinstance(chunk, bytes):
105 | chunk = chunk.encode(self.charset)
106 | chunk = b64encode(chunk)
107 | await send(
108 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True}
109 | )
110 |
111 | await send({"type": "http.response.body", "body": b"", "more_body": False})
112 |
113 |
114 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile:
115 | def return_content_type(filename):
116 | if gz_uncompressed_content_type:
117 | return gz_uncompressed_content_type
118 | else:
119 | return str(mimetypes.guess_type(filename)[0])
120 |
121 | filename = str(file.filename) if file.filename else ""
122 | if filename.endswith(".gz"):
123 | filename = filename[:-3]
124 |
125 | gzip_file = gzip.open(file.file).read()
126 | return UploadFile(
127 | file=io.BytesIO(gzip_file),
128 | size=len(gzip_file),
129 | filename=filename,
130 | headers=Headers({"content-type": return_content_type(filename)}),
131 | )
132 |
133 |
134 | @router.post("/test-project/v1/process-file-2")
135 | @router.post("/test-project/v1.2.3/process-file-2")
136 | def pipeline_1(
137 | request: Request,
138 | gz_uncompressed_content_type: Optional[str] = Form(default=None),
139 | files: Union[List[UploadFile], None] = File(default=None),
140 | ):
141 | if files:
142 | for file_index in range(len(files)):
143 | if files[file_index].content_type == "application/gzip":
144 | files[file_index] = ungz_file(files[file_index], gz_uncompressed_content_type)
145 |
146 | content_type = request.headers.get("Accept")
147 |
148 | if isinstance(files, list) and len(files):
149 | if len(files) > 1:
150 | if content_type and content_type not in [
151 | "*/*",
152 | "multipart/mixed",
153 | "application/json",
154 | "text/csv",
155 | ]:
156 | raise HTTPException(
157 | detail=(
158 | f"Conflict in media type {content_type}"
159 | ' with response type "multipart/mixed".\n'
160 | ),
161 | status_code=status.HTTP_406_NOT_ACCEPTABLE,
162 | )
163 |
164 | def response_generator(is_multipart):
165 | for file in files:
166 | get_validated_mimetype(file)
167 |
168 | _file = file.file
169 |
170 | response = pipeline_api(
171 | _file,
172 | )
173 |
174 | if is_multipart:
175 | if type(response) not in [str, bytes]:
176 | response = json.dumps(response)
177 | yield response
178 |
179 | if content_type == "multipart/mixed":
180 | return MultipartMixedResponse(
181 | response_generator(is_multipart=True),
182 | )
183 | else:
184 | return (
185 | list(response_generator(is_multipart=False))[0]
186 | if len(files) == 1
187 | else response_generator(is_multipart=False)
188 | )
189 | else:
190 | raise HTTPException(
191 | detail='Request parameter "files" is required.\n',
192 | status_code=status.HTTP_400_BAD_REQUEST,
193 | )
194 |
195 |
196 | app.include_router(router)
197 |
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_3.py:
--------------------------------------------------------------------------------
1 | #####################################################################
2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS.
3 | # DO NOT MODIFY DIRECTLY
4 | #####################################################################
5 |
6 | import io
7 | import os
8 | import gzip
9 | import mimetypes
10 | from typing import List, Union
11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException
12 | from fastapi.responses import PlainTextResponse
13 | import json
14 | from fastapi.responses import StreamingResponse
15 | from starlette.datastructures import Headers
16 | from starlette.types import Send
17 | from base64 import b64encode
18 | from typing import Optional, Mapping
19 | import secrets
20 | import pandas as pd
21 |
22 |
23 | app = FastAPI()
24 | router = APIRouter()
25 |
26 |
27 | def is_expected_response_type(media_type, response_type):
28 | if media_type == "application/json" and response_type not in [dict, list]:
29 | return True
30 | elif media_type == "text/csv" and response_type != str:
31 | return True
32 | else:
33 | return False
34 |
35 |
36 | # pipeline-api
37 | def pipeline_api(file, response_type="text/csv", response_schema="isd"):
38 | data = pd.DataFrame(
39 | data={"silly_result": [str(len(file.read())), str(response_type), str(response_schema)]}
40 | )
41 | if response_type == "text/csv":
42 | return data.to_csv()
43 | else:
44 | text = " : ".join(list(data["silly_result"]))
45 | return {"silly_result": text}
46 |
47 |
48 | def get_validated_mimetype(file):
49 | """
50 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too
51 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and
52 | raise an HTTP 400 error for an invalid type.
53 | """
54 | content_type = file.content_type
55 | if not content_type or content_type == "application/octet-stream":
56 | content_type = mimetypes.guess_type(str(file.filename))[0]
57 |
58 | # Some filetypes are missing from this library, so hardcode them for now
59 | if not content_type:
60 | if file.filename.endswith(".md"):
61 | content_type = "text/markdown"
62 | elif file.filename.endswith(".msg"):
63 | content_type = "message/rfc822"
64 |
65 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
66 | if allowed_mimetypes_str is not None:
67 | allowed_mimetypes = allowed_mimetypes_str.split(",")
68 |
69 | if content_type not in allowed_mimetypes:
70 | raise HTTPException(
71 | status_code=400,
72 | detail=(
73 | f"Unable to process {file.filename}: "
74 | f"File type {content_type} is not supported."
75 | ),
76 | )
77 |
78 | return content_type
79 |
80 |
81 | class MultipartMixedResponse(StreamingResponse):
82 | CRLF = b"\r\n"
83 |
84 | def __init__(self, *args, content_type: str = None, **kwargs):
85 | super().__init__(*args, **kwargs)
86 | self.content_type = content_type
87 |
88 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None:
89 | super().init_headers(headers)
90 | self.boundary_value = secrets.token_hex(16)
91 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"'
92 | self.raw_headers.append((b"content-type", content_type.encode("latin-1")))
93 |
94 | @property
95 | def boundary(self):
96 | return b"--" + self.boundary_value.encode()
97 |
98 | def _build_part_headers(self, headers: dict) -> bytes:
99 | header_bytes = b""
100 | for header, value in headers.items():
101 | header_bytes += f"{header}: {value}".encode() + self.CRLF
102 | return header_bytes
103 |
104 | def build_part(self, chunk: bytes) -> bytes:
105 | part = self.boundary + self.CRLF
106 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"}
107 | if self.content_type is not None:
108 | part_headers["Content-Type"] = self.content_type
109 | part += self._build_part_headers(part_headers)
110 | part += self.CRLF + chunk + self.CRLF
111 | return part
112 |
113 | async def stream_response(self, send: Send) -> None:
114 | await send(
115 | {
116 | "type": "http.response.start",
117 | "status": self.status_code,
118 | "headers": self.raw_headers,
119 | }
120 | )
121 | async for chunk in self.body_iterator:
122 | if not isinstance(chunk, bytes):
123 | chunk = chunk.encode(self.charset)
124 | chunk = b64encode(chunk)
125 | await send(
126 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True}
127 | )
128 |
129 | await send({"type": "http.response.body", "body": b"", "more_body": False})
130 |
131 |
132 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile:
133 | def return_content_type(filename):
134 | if gz_uncompressed_content_type:
135 | return gz_uncompressed_content_type
136 | else:
137 | return str(mimetypes.guess_type(filename)[0])
138 |
139 | filename = str(file.filename) if file.filename else ""
140 | if filename.endswith(".gz"):
141 | filename = filename[:-3]
142 |
143 | gzip_file = gzip.open(file.file).read()
144 | return UploadFile(
145 | file=io.BytesIO(gzip_file),
146 | size=len(gzip_file),
147 | filename=filename,
148 | headers=Headers({"content-type": return_content_type(filename)}),
149 | )
150 |
151 |
152 | @router.post("/test-project/v1/process-file-3")
153 | @router.post("/test-project/v1.2.3/process-file-3")
154 | def pipeline_1(
155 | request: Request,
156 | gz_uncompressed_content_type: Optional[str] = Form(default=None),
157 | files: Union[List[UploadFile], None] = File(default=None),
158 | output_format: Union[str, None] = Form(default=None),
159 | output_schema: str = Form(default=None),
160 | ):
161 | if files:
162 | for file_index in range(len(files)):
163 | if files[file_index].content_type == "application/gzip":
164 | files[file_index] = ungz_file(files[file_index], gz_uncompressed_content_type)
165 |
166 | content_type = request.headers.get("Accept")
167 |
168 | default_response_type = output_format or "text/csv"
169 | if not content_type or content_type == "*/*" or content_type == "multipart/mixed":
170 | media_type = default_response_type
171 | else:
172 | media_type = content_type
173 |
174 | default_response_schema = output_schema or "isd"
175 |
176 | if isinstance(files, list) and len(files):
177 | if len(files) > 1:
178 | if content_type and content_type not in [
179 | "*/*",
180 | "multipart/mixed",
181 | "application/json",
182 | "text/csv",
183 | ]:
184 | raise HTTPException(
185 | detail=(
186 | f"Conflict in media type {content_type}"
187 | ' with response type "multipart/mixed".\n'
188 | ),
189 | status_code=status.HTTP_406_NOT_ACCEPTABLE,
190 | )
191 |
192 | def response_generator(is_multipart):
193 | for file in files:
194 | get_validated_mimetype(file)
195 |
196 | _file = file.file
197 |
198 | response = pipeline_api(
199 | _file,
200 | response_type=media_type,
201 | response_schema=default_response_schema,
202 | )
203 |
204 | if is_expected_response_type(media_type, type(response)):
205 | raise HTTPException(
206 | detail=(
207 | f"Conflict in media type {media_type}"
208 | f" with response type {type(response)}.\n"
209 | ),
210 | status_code=status.HTTP_406_NOT_ACCEPTABLE,
211 | )
212 |
213 | valid_response_types = ["application/json", "text/csv", "*/*", "multipart/mixed"]
214 | if media_type in valid_response_types:
215 | if is_multipart:
216 | if type(response) not in [str, bytes]:
217 | response = json.dumps(response)
218 | elif media_type == "text/csv":
219 | response = PlainTextResponse(response)
220 | yield response
221 | else:
222 | raise HTTPException(
223 | detail=f"Unsupported media type {media_type}.\n",
224 | status_code=status.HTTP_406_NOT_ACCEPTABLE,
225 | )
226 |
227 | def join_responses(responses):
228 | if media_type != "text/csv":
229 | return responses
230 | data = pd.read_csv(io.BytesIO(responses[0].body))
231 | if len(responses) > 1:
232 | for resp in responses[1:]:
233 | resp_data = pd.read_csv(io.BytesIO(resp.body))
234 | data = data.merge(resp_data, how="outer")
235 | return PlainTextResponse(data.to_csv())
236 |
237 | if content_type == "multipart/mixed":
238 | return MultipartMixedResponse(
239 | response_generator(is_multipart=True), content_type=media_type
240 | )
241 | else:
242 | return (
243 | list(response_generator(is_multipart=False))[0]
244 | if len(files) == 1
245 | else join_responses(list(response_generator(is_multipart=False)))
246 | )
247 | else:
248 | raise HTTPException(
249 | detail='Request parameter "files" is required.\n',
250 | status_code=status.HTTP_400_BAD_REQUEST,
251 | )
252 |
253 |
254 | app.include_router(router)
255 |
--------------------------------------------------------------------------------
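process_file_3.py adds content negotiation: a concrete Accept header wins, while an empty, */*, or multipart/mixed Accept falls back to the output_format form field (default text/csv); per-file CSV results are then outer-merged by join_responses. A sketch of a two-file CSV request under the same requests/localhost assumptions:

import requests  # assumed client-side dependency

resp = requests.post(
    "http://localhost:8000/test-project/v1/process-file-3",  # assumed address
    files=[
        ("files", ("a.txt", b"first file", "text/plain")),
        ("files", ("b.txt", b"second file", "text/plain")),
    ],
    data={"output_format": "text/csv", "output_schema": "isd"},
    headers={"Accept": "text/csv"},
)
print(resp.text)  # one merged CSV, produced by join_responses

--------------------------------------------------------------------------------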
/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_4.py:
--------------------------------------------------------------------------------
1 | #####################################################################
2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS.
3 | # DO NOT MODIFY DIRECTLY
4 | #####################################################################
5 |
6 | import io
7 | import os
8 | import gzip
9 | import mimetypes
10 | from typing import List, Union
11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException
12 | from fastapi.responses import PlainTextResponse
13 | import json
14 | from fastapi.responses import StreamingResponse
15 | from starlette.datastructures import Headers
16 | from starlette.types import Send
17 | from base64 import b64encode
18 | from typing import Optional, Mapping
19 | import secrets
20 | import pandas as pd
21 |
22 |
23 | app = FastAPI()
24 | router = APIRouter()
25 |
26 |
27 | def is_expected_response_type(media_type, response_type):
28 | if media_type == "application/json" and response_type not in [dict, list]:
29 | return True
30 | elif media_type == "text/csv" and response_type != str:
31 | return True
32 | else:
33 | return False
34 |
35 |
36 | # pipeline-api
37 | def pipeline_api(
38 | file,
39 | file_content_type=None,
40 | response_type="application/json",
41 | response_schema="labelstudio",
42 | m_input1=[],
43 | ):
44 | return {
45 | "silly_result": " : ".join(
46 | [
47 | str(len(file.read())),
48 | str(file_content_type),
49 | str(response_type),
50 | str(response_schema),
51 | str(m_input1),
52 | ]
53 | )
54 | }
55 |
56 |
57 | def get_validated_mimetype(file):
58 | """
59 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too
60 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and
61 | raise an HTTP 400 error for an invalid type.
62 | """
63 | content_type = file.content_type
64 | if not content_type or content_type == "application/octet-stream":
65 | content_type = mimetypes.guess_type(str(file.filename))[0]
66 |
67 | # Some filetypes are missing from this library, so hardcode them for now
68 | if not content_type:
69 | if file.filename.endswith(".md"):
70 | content_type = "text/markdown"
71 | elif file.filename.endswith(".msg"):
72 | content_type = "message/rfc822"
73 |
74 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
75 | if allowed_mimetypes_str is not None:
76 | allowed_mimetypes = allowed_mimetypes_str.split(",")
77 |
78 | if content_type not in allowed_mimetypes:
79 | raise HTTPException(
80 | status_code=400,
81 | detail=(
82 | f"Unable to process {file.filename}: "
83 | f"File type {content_type} is not supported."
84 | ),
85 | )
86 |
87 | return content_type
88 |
89 |
90 | class MultipartMixedResponse(StreamingResponse):
91 | CRLF = b"\r\n"
92 |
93 | def __init__(self, *args, content_type: str = None, **kwargs):
94 | super().__init__(*args, **kwargs)
95 | self.content_type = content_type
96 |
97 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None:
98 | super().init_headers(headers)
99 | self.boundary_value = secrets.token_hex(16)
100 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"'
101 | self.raw_headers.append((b"content-type", content_type.encode("latin-1")))
102 |
103 | @property
104 | def boundary(self):
105 | return b"--" + self.boundary_value.encode()
106 |
107 | def _build_part_headers(self, headers: dict) -> bytes:
108 | header_bytes = b""
109 | for header, value in headers.items():
110 | header_bytes += f"{header}: {value}".encode() + self.CRLF
111 | return header_bytes
112 |
113 | def build_part(self, chunk: bytes) -> bytes:
114 | part = self.boundary + self.CRLF
115 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"}
116 | if self.content_type is not None:
117 | part_headers["Content-Type"] = self.content_type
118 | part += self._build_part_headers(part_headers)
119 | part += self.CRLF + chunk + self.CRLF
120 | return part
121 |
122 | async def stream_response(self, send: Send) -> None:
123 | await send(
124 | {
125 | "type": "http.response.start",
126 | "status": self.status_code,
127 | "headers": self.raw_headers,
128 | }
129 | )
130 | async for chunk in self.body_iterator:
131 | if not isinstance(chunk, bytes):
132 | chunk = chunk.encode(self.charset)
133 | chunk = b64encode(chunk)
134 | await send(
135 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True}
136 | )
137 |
138 | await send({"type": "http.response.body", "body": b"", "more_body": False})
139 |
140 |
141 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile:
142 | def return_content_type(filename):
143 | if gz_uncompressed_content_type:
144 | return gz_uncompressed_content_type
145 | else:
146 | return str(mimetypes.guess_type(filename)[0])
147 |
148 | filename = str(file.filename) if file.filename else ""
149 | if filename.endswith(".gz"):
150 | filename = filename[:-3]
151 |
152 | gzip_file = gzip.open(file.file).read()
153 | return UploadFile(
154 | file=io.BytesIO(gzip_file),
155 | size=len(gzip_file),
156 | filename=filename,
157 | headers=Headers({"content-type": return_content_type(filename)}),
158 | )
159 |
160 |
161 | @router.post("/test-project/v1/process-file-4")
162 | @router.post("/test-project/v1.2.3/process-file-4")
163 | def pipeline_1(
164 | request: Request,
165 | gz_uncompressed_content_type: Optional[str] = Form(default=None),
166 | files: Union[List[UploadFile], None] = File(default=None),
167 | output_format: Union[str, None] = Form(default=None),
168 | output_schema: str = Form(default=None),
169 | input1: List[str] = Form(default=[]),
170 | ):
171 | if files:
172 | for file_index in range(len(files)):
173 | if files[file_index].content_type == "application/gzip":
174 | files[file_index] = ungz_file(files[file_index], gz_uncompressed_content_type)
175 |
176 | content_type = request.headers.get("Accept")
177 |
178 | default_response_type = output_format or "application/json"
179 | if not content_type or content_type == "*/*" or content_type == "multipart/mixed":
180 | media_type = default_response_type
181 | else:
182 | media_type = content_type
183 |
184 | default_response_schema = output_schema or "labelstudio"
185 |
186 | if isinstance(files, list) and len(files):
187 | if len(files) > 1:
188 | if content_type and content_type not in [
189 | "*/*",
190 | "multipart/mixed",
191 | "application/json",
192 | "text/csv",
193 | ]:
194 | raise HTTPException(
195 | detail=(
196 | f"Conflict in media type {content_type}"
197 | ' with response type "multipart/mixed".\n'
198 | ),
199 | status_code=status.HTTP_406_NOT_ACCEPTABLE,
200 | )
201 |
202 | def response_generator(is_multipart):
203 | for file in files:
204 | file_content_type = get_validated_mimetype(file)
205 |
206 | _file = file.file
207 |
208 | response = pipeline_api(
209 | _file,
210 | m_input1=input1,
211 | response_type=media_type,
212 | response_schema=default_response_schema,
213 | file_content_type=file_content_type,
214 | )
215 |
216 | if is_expected_response_type(media_type, type(response)):
217 | raise HTTPException(
218 | detail=(
219 | f"Conflict in media type {media_type}"
220 | f" with response type {type(response)}.\n"
221 | ),
222 | status_code=status.HTTP_406_NOT_ACCEPTABLE,
223 | )
224 |
225 | valid_response_types = ["application/json", "text/csv", "*/*", "multipart/mixed"]
226 | if media_type in valid_response_types:
227 | if is_multipart:
228 | if type(response) not in [str, bytes]:
229 | response = json.dumps(response)
230 | elif media_type == "text/csv":
231 | response = PlainTextResponse(response)
232 | yield response
233 | else:
234 | raise HTTPException(
235 | detail=f"Unsupported media type {media_type}.\n",
236 | status_code=status.HTTP_406_NOT_ACCEPTABLE,
237 | )
238 |
239 | def join_responses(responses):
240 | if media_type != "text/csv":
241 | return responses
242 | data = pd.read_csv(io.BytesIO(responses[0].body))
243 | if len(responses) > 1:
244 | for resp in responses[1:]:
245 | resp_data = pd.read_csv(io.BytesIO(resp.body))
246 | data = data.merge(resp_data, how="outer")
247 | return PlainTextResponse(data.to_csv())
248 |
249 | if content_type == "multipart/mixed":
250 | return MultipartMixedResponse(
251 | response_generator(is_multipart=True), content_type=media_type
252 | )
253 | else:
254 | return (
255 | list(response_generator(is_multipart=False))[0]
256 | if len(files) == 1
257 | else join_responses(list(response_generator(is_multipart=False)))
258 | )
259 | else:
260 | raise HTTPException(
261 | detail='Request parameter "files" is required.\n',
262 | status_code=status.HTTP_400_BAD_REQUEST,
263 | )
264 |
265 |
266 | app.include_router(router)
267 |
--------------------------------------------------------------------------------
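process_file_4.py forwards the list-valued input1 form field to pipeline_api as m_input1; repeating the form key is what produces the list. A sketch, again assuming requests and a local server:

import requests  # assumed client-side dependency

resp = requests.post(
    "http://localhost:8000/test-project/v1/process-file-4",  # assumed address
    files={"files": ("a.txt", b"some text", "text/plain")},
    # repeating the key populates input1: List[str] on the FastAPI side
    data=[("input1", "alpha"), ("input1", "beta")],
)
print(resp.text)  # m_input1 arrives as ['alpha', 'beta']

--------------------------------------------------------------------------------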
/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_file_5.py:
--------------------------------------------------------------------------------
1 | #####################################################################
2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS.
3 | # DO NOT MODIFY DIRECTLY
4 | #####################################################################
5 |
6 | import io
7 | import os
8 | import gzip
9 | import mimetypes
10 | from typing import List, Union
11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException
12 | from fastapi.responses import PlainTextResponse
13 | import json
14 | from fastapi.responses import StreamingResponse
15 | from starlette.datastructures import Headers
16 | from starlette.types import Send
17 | from base64 import b64encode
18 | from typing import Optional, Mapping
19 | import secrets
20 | import pandas as pd
21 |
22 |
23 | app = FastAPI()
24 | router = APIRouter()
25 |
26 |
27 | def is_expected_response_type(media_type, response_type):
28 | if media_type == "application/json" and response_type not in [dict, list]:
29 | return True
30 | elif media_type == "text/csv" and response_type != str:
31 | return True
32 | else:
33 | return False
34 |
35 |
36 | # pipeline-api
37 |
38 |
39 | def pipeline_api(
40 | file,
41 | file_content_type=None,
42 | response_type="application/json",
43 | response_schema="labelstudio",
44 | m_input1=[],
45 | m_input2=[],
46 | ):
47 | data = pd.DataFrame(
48 | data={
49 | "silly_result": [
50 | str(len(file.read())),
51 | str(file_content_type),
52 | str(response_type),
53 | str(response_schema),
54 | str(m_input1),
55 | str(m_input2),
56 | ]
57 | }
58 | )
59 | if response_type == "text/csv":
60 | return data.to_csv()
61 | else:
62 | text = " : ".join(list(data["silly_result"]))
63 | return {"silly_result": text}
64 |
65 |
66 | def get_validated_mimetype(file):
67 | """
68 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too
69 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and
70 | raise an HTTP 400 error for an invalid type.
71 | """
72 | content_type = file.content_type
73 | if not content_type or content_type == "application/octet-stream":
74 | content_type = mimetypes.guess_type(str(file.filename))[0]
75 |
76 | # Some filetypes are missing from this library, so hardcode them for now
77 | if not content_type:
78 | if file.filename.endswith(".md"):
79 | content_type = "text/markdown"
80 | elif file.filename.endswith(".msg"):
81 | content_type = "message/rfc822"
82 |
83 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
84 | if allowed_mimetypes_str is not None:
85 | allowed_mimetypes = allowed_mimetypes_str.split(",")
86 |
87 | if content_type not in allowed_mimetypes:
88 | raise HTTPException(
89 | status_code=400,
90 | detail=(
91 | f"Unable to process {file.filename}: "
92 | f"File type {content_type} is not supported."
93 | ),
94 | )
95 |
96 | return content_type
97 |
98 |
99 | class MultipartMixedResponse(StreamingResponse):
100 | CRLF = b"\r\n"
101 |
102 | def __init__(self, *args, content_type: str = None, **kwargs):
103 | super().__init__(*args, **kwargs)
104 | self.content_type = content_type
105 |
106 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None:
107 | super().init_headers(headers)
108 | self.boundary_value = secrets.token_hex(16)
109 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"'
110 | self.raw_headers.append((b"content-type", content_type.encode("latin-1")))
111 |
112 | @property
113 | def boundary(self):
114 | return b"--" + self.boundary_value.encode()
115 |
116 | def _build_part_headers(self, headers: dict) -> bytes:
117 | header_bytes = b""
118 | for header, value in headers.items():
119 | header_bytes += f"{header}: {value}".encode() + self.CRLF
120 | return header_bytes
121 |
122 | def build_part(self, chunk: bytes) -> bytes:
123 | part = self.boundary + self.CRLF
124 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"}
125 | if self.content_type is not None:
126 | part_headers["Content-Type"] = self.content_type
127 | part += self._build_part_headers(part_headers)
128 | part += self.CRLF + chunk + self.CRLF
129 | return part
130 |
131 | async def stream_response(self, send: Send) -> None:
132 | await send(
133 | {
134 | "type": "http.response.start",
135 | "status": self.status_code,
136 | "headers": self.raw_headers,
137 | }
138 | )
139 | async for chunk in self.body_iterator:
140 | if not isinstance(chunk, bytes):
141 | chunk = chunk.encode(self.charset)
142 | chunk = b64encode(chunk)
143 | await send(
144 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True}
145 | )
146 |
147 | await send({"type": "http.response.body", "body": b"", "more_body": False})
148 |
149 |
150 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile:
151 | def return_content_type(filename):
152 | if gz_uncompressed_content_type:
153 | return gz_uncompressed_content_type
154 | else:
155 | return str(mimetypes.guess_type(filename)[0])
156 |
157 | filename = str(file.filename) if file.filename else ""
158 | if filename.endswith(".gz"):
159 | filename = filename[:-3]
160 |
161 | gzip_file = gzip.open(file.file).read()
162 | return UploadFile(
163 | file=io.BytesIO(gzip_file),
164 | size=len(gzip_file),
165 | filename=filename,
166 | headers=Headers({"content-type": return_content_type(filename)}),
167 | )
168 |
169 |
170 | @router.post("/test-project/v1/process-file-5")
171 | @router.post("/test-project/v1.2.3/process-file-5")
172 | def pipeline_1(
173 | request: Request,
174 | gz_uncompressed_content_type: Optional[str] = Form(default=None),
175 | files: Union[List[UploadFile], None] = File(default=None),
176 | output_format: Union[str, None] = Form(default=None),
177 | output_schema: str = Form(default=None),
178 | input1: List[str] = Form(default=[]),
179 | input2: List[str] = Form(default=[]),
180 | ):
181 | if files:
182 | for file_index in range(len(files)):
183 | if files[file_index].content_type == "application/gzip":
184 | files[file_index] = ungz_file(files[file_index], gz_uncompressed_content_type)
185 |
186 | content_type = request.headers.get("Accept")
187 |
188 | default_response_type = output_format or "application/json"
189 | if not content_type or content_type == "*/*" or content_type == "multipart/mixed":
190 | media_type = default_response_type
191 | else:
192 | media_type = content_type
193 |
194 | default_response_schema = output_schema or "labelstudio"
195 |
196 | if isinstance(files, list) and len(files):
197 | if len(files) > 1:
198 | if content_type and content_type not in [
199 | "*/*",
200 | "multipart/mixed",
201 | "application/json",
202 | "text/csv",
203 | ]:
204 | raise HTTPException(
205 | detail=(
206 | f"Conflict in media type {content_type}"
207 | ' with response type "multipart/mixed".\n'
208 | ),
209 | status_code=status.HTTP_406_NOT_ACCEPTABLE,
210 | )
211 |
212 | def response_generator(is_multipart):
213 | for file in files:
214 | file_content_type = get_validated_mimetype(file)
215 |
216 | _file = file.file
217 |
218 | response = pipeline_api(
219 | _file,
220 | m_input1=input1,
221 | m_input2=input2,
222 | response_type=media_type,
223 | response_schema=default_response_schema,
224 | file_content_type=file_content_type,
225 | )
226 |
227 | if is_expected_response_type(media_type, type(response)):
228 | raise HTTPException(
229 | detail=(
230 | f"Conflict in media type {media_type}"
231 | f" with response type {type(response)}.\n"
232 | ),
233 | status_code=status.HTTP_406_NOT_ACCEPTABLE,
234 | )
235 |
236 | valid_response_types = ["application/json", "text/csv", "*/*", "multipart/mixed"]
237 | if media_type in valid_response_types:
238 | if is_multipart:
239 | if type(response) not in [str, bytes]:
240 | response = json.dumps(response)
241 | elif media_type == "text/csv":
242 | response = PlainTextResponse(response)
243 | yield response
244 | else:
245 | raise HTTPException(
246 | detail=f"Unsupported media type {media_type}.\n",
247 | status_code=status.HTTP_406_NOT_ACCEPTABLE,
248 | )
249 |
250 | def join_responses(responses):
251 | if media_type != "text/csv":
252 | return responses
253 | data = pd.read_csv(io.BytesIO(responses[0].body))
254 | if len(responses) > 1:
255 | for resp in responses[1:]:
256 | resp_data = pd.read_csv(io.BytesIO(resp.body))
257 | data = data.merge(resp_data, how="outer")
258 | return PlainTextResponse(data.to_csv())
259 |
260 | if content_type == "multipart/mixed":
261 | return MultipartMixedResponse(
262 | response_generator(is_multipart=True), content_type=media_type
263 | )
264 | else:
265 | return (
266 | list(response_generator(is_multipart=False))[0]
267 | if len(files) == 1
268 | else join_responses(list(response_generator(is_multipart=False)))
269 | )
270 | else:
271 | raise HTTPException(
272 | detail='Request parameter "files" is required.\n',
273 | status_code=status.HTTP_400_BAD_REQUEST,
274 | )
275 |
276 |
277 | app.include_router(router)
278 |
--------------------------------------------------------------------------------
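Each part emitted by MultipartMixedResponse is base64-encoded and framed by the boundary advertised in the response's content-type header; note the implementation above never writes a closing --boundary-- marker. A rough client-side decoder sketch under the same requests/localhost assumptions:

import base64
import re

import requests  # assumed client-side dependency

resp = requests.post(
    "http://localhost:8000/test-project/v1/process-file-5",  # assumed address
    files=[
        ("files", ("a.txt", b"one", "text/plain")),
        ("files", ("b.txt", b"two", "text/plain")),
    ],
    headers={"Accept": "multipart/mixed"},
)
boundary = re.search(r'boundary="([^"]+)"', resp.headers["content-type"]).group(1)
for part in resp.content.split(b"--" + boundary.encode()):
    pieces = part.split(b"\r\n\r\n", 1)  # part headers, blank line, base64 body
    if len(pieces) == 2 and pieces[1].strip():
        print(base64.b64decode(pieces[1].strip()))

--------------------------------------------------------------------------------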
/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_1.py:
--------------------------------------------------------------------------------
1 | #####################################################################
2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS.
3 | # DO NOT MODIFY DIRECTLY
4 | #####################################################################
5 |
6 | import io
7 | import os
8 | import gzip
9 | import mimetypes
10 | from typing import List, Union
11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException
12 | import json
13 | from fastapi.responses import StreamingResponse
14 | from starlette.datastructures import Headers
15 | from starlette.types import Send
16 | from base64 import b64encode
17 | from typing import Optional, Mapping
18 | import secrets
19 |
20 |
21 | app = FastAPI()
22 | router = APIRouter()
23 |
24 |
25 | # pipeline-api
26 | def pipeline_api(
27 | text,
28 | ):
29 | return {"silly_result": " : ".join([str(len(text)), text])}
30 |
31 |
32 | def get_validated_mimetype(file):
33 | """
34 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too
35 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and
36 | raise an HTTP 400 error for an invalid type.
37 | """
38 | content_type = file.content_type
39 | if not content_type or content_type == "application/octet-stream":
40 | content_type = mimetypes.guess_type(str(file.filename))[0]
41 |
42 | # Some filetypes are missing from this library, so hardcode them for now
43 | if not content_type:
44 | if file.filename.endswith(".md"):
45 | content_type = "text/markdown"
46 | elif file.filename.endswith(".msg"):
47 | content_type = "message/rfc822"
48 |
49 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
50 | if allowed_mimetypes_str is not None:
51 | allowed_mimetypes = allowed_mimetypes_str.split(",")
52 |
53 | if content_type not in allowed_mimetypes:
54 | raise HTTPException(
55 | status_code=400,
56 | detail=(
57 | f"Unable to process {file.filename}: "
58 | f"File type {content_type} is not supported."
59 | ),
60 | )
61 |
62 | return content_type
63 |
64 |
65 | class MultipartMixedResponse(StreamingResponse):
66 | CRLF = b"\r\n"
67 |
68 | def __init__(self, *args, content_type: str = None, **kwargs):
69 | super().__init__(*args, **kwargs)
70 | self.content_type = content_type
71 |
72 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None:
73 | super().init_headers(headers)
74 | self.boundary_value = secrets.token_hex(16)
75 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"'
76 | self.raw_headers.append((b"content-type", content_type.encode("latin-1")))
77 |
78 | @property
79 | def boundary(self):
80 | return b"--" + self.boundary_value.encode()
81 |
82 | def _build_part_headers(self, headers: dict) -> bytes:
83 | header_bytes = b""
84 | for header, value in headers.items():
85 | header_bytes += f"{header}: {value}".encode() + self.CRLF
86 | return header_bytes
87 |
88 | def build_part(self, chunk: bytes) -> bytes:
89 | part = self.boundary + self.CRLF
90 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"}
91 | if self.content_type is not None:
92 | part_headers["Content-Type"] = self.content_type
93 | part += self._build_part_headers(part_headers)
94 | part += self.CRLF + chunk + self.CRLF
95 | return part
96 |
97 | async def stream_response(self, send: Send) -> None:
98 | await send(
99 | {
100 | "type": "http.response.start",
101 | "status": self.status_code,
102 | "headers": self.raw_headers,
103 | }
104 | )
105 | async for chunk in self.body_iterator:
106 | if not isinstance(chunk, bytes):
107 | chunk = chunk.encode(self.charset)
108 | chunk = b64encode(chunk)
109 | await send(
110 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True}
111 | )
112 |
113 | await send({"type": "http.response.body", "body": b"", "more_body": False})
114 |
115 |
116 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile:
117 | def return_content_type(filename):
118 | if gz_uncompressed_content_type:
119 | return gz_uncompressed_content_type
120 | else:
121 | return str(mimetypes.guess_type(filename)[0])
122 |
123 | filename = str(file.filename) if file.filename else ""
124 | if filename.endswith(".gz"):
125 | filename = filename[:-3]
126 |
127 | gzip_file = gzip.open(file.file).read()
128 | return UploadFile(
129 | file=io.BytesIO(gzip_file),
130 | size=len(gzip_file),
131 | filename=filename,
132 | headers=Headers({"content-type": return_content_type(filename)}),
133 | )
134 |
135 |
136 | @router.post("/test-project/v1/process-text-1")
137 | @router.post("/test-project/v1.2.3/process-text-1")
138 | def pipeline_1(
139 | request: Request,
140 | gz_uncompressed_content_type: Optional[str] = Form(default=None),
141 | text_files: Union[List[UploadFile], None] = File(default=None),
142 | ):
143 | if text_files:
144 | for file_index in range(len(text_files)):
145 | if text_files[file_index].content_type == "application/gzip":
146 | text_files[file_index] = ungz_file(text_files[file_index])
147 |
148 | content_type = request.headers.get("Accept")
149 |
150 | if isinstance(text_files, list) and len(text_files):
151 | if len(text_files) > 1:
152 | if content_type and content_type not in [
153 | "*/*",
154 | "multipart/mixed",
155 | "application/json",
156 | "text/csv",
157 | ]:
158 | raise HTTPException(
159 | detail=(
160 | f"Conflict in media type {content_type}"
161 | ' with response type "multipart/mixed".\n'
162 | ),
163 | status_code=status.HTTP_406_NOT_ACCEPTABLE,
164 | )
165 |
166 | def response_generator(is_multipart):
167 | for file in text_files:
168 | get_validated_mimetype(file)
169 |
170 | text = file.file.read().decode("utf-8")
171 |
172 | response = pipeline_api(
173 | text,
174 | )
175 |
176 | if is_multipart:
177 | if type(response) not in [str, bytes]:
178 | response = json.dumps(response)
179 | yield response
180 |
181 | if content_type == "multipart/mixed":
182 | return MultipartMixedResponse(
183 | response_generator(is_multipart=True),
184 | )
185 | else:
186 | return (
187 | list(response_generator(is_multipart=False))[0]
188 | if len(text_files) == 1
189 | else response_generator(is_multipart=False)
190 | )
191 | else:
192 | raise HTTPException(
193 | detail='Request parameter "text_files" is required.\n',
194 | status_code=status.HTTP_400_BAD_REQUEST,
195 | )
196 |
197 |
198 | app.include_router(router)
199 |
--------------------------------------------------------------------------------
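Because every generated module builds its own app, a route can also be exercised in-process with FastAPI's TestClient (which requires httpx); the sketch below assumes prepline_test_project is importable, e.g. with pipeline-test-project on sys.path:

from fastapi.testclient import TestClient  # requires httpx

from prepline_test_project.api.process_text_1 import app

client = TestClient(app)
resp = client.post(
    "/test-project/v1/process-text-1",
    files={"text_files": ("note.txt", b"hello", "text/plain")},
)
assert resp.status_code == 200
print(resp.json())  # {'silly_result': '5 : hello'}

--------------------------------------------------------------------------------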
/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_2.py:
--------------------------------------------------------------------------------
1 | #####################################################################
2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS.
3 | # DO NOT MODIFY DIRECTLY
4 | #####################################################################
5 |
6 | import io
7 | import os
8 | import gzip
9 | import mimetypes
10 | from typing import List, Union
11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException
12 | import json
13 | from fastapi.responses import StreamingResponse
14 | from starlette.datastructures import Headers
15 | from starlette.types import Send
16 | from base64 import b64encode
17 | from typing import Optional, Mapping
18 | import secrets
19 |
20 |
21 | app = FastAPI()
22 | router = APIRouter()
23 |
24 |
25 | # pipeline-api
26 | def pipeline_api(text, m_input1=[], m_input2=[]):
27 | return {"silly_result": " : ".join([str(len(text)), text, str(m_input1), str(m_input2)])}
28 |
29 |
30 | def get_validated_mimetype(file):
31 | """
32 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too
33 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and
34 | raise an HTTP 400 error for an invalid type.
35 | """
36 | content_type = file.content_type
37 | if not content_type or content_type == "application/octet-stream":
38 | content_type = mimetypes.guess_type(str(file.filename))[0]
39 |
40 | # Some filetypes are missing from this library, so hardcode them for now
41 | if not content_type:
42 | if file.filename.endswith(".md"):
43 | content_type = "text/markdown"
44 | elif file.filename.endswith(".msg"):
45 | content_type = "message/rfc822"
46 |
47 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
48 | if allowed_mimetypes_str is not None:
49 | allowed_mimetypes = allowed_mimetypes_str.split(",")
50 |
51 | if content_type not in allowed_mimetypes:
52 | raise HTTPException(
53 | status_code=400,
54 | detail=(
55 | f"Unable to process {file.filename}: "
56 | f"File type {content_type} is not supported."
57 | ),
58 | )
59 |
60 | return content_type
61 |
62 |
63 | class MultipartMixedResponse(StreamingResponse):
64 | CRLF = b"\r\n"
65 |
66 | def __init__(self, *args, content_type: str = None, **kwargs):
67 | super().__init__(*args, **kwargs)
68 | self.content_type = content_type
69 |
70 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None:
71 | super().init_headers(headers)
72 | self.boundary_value = secrets.token_hex(16)
73 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"'
74 | self.raw_headers.append((b"content-type", content_type.encode("latin-1")))
75 |
76 | @property
77 | def boundary(self):
78 | return b"--" + self.boundary_value.encode()
79 |
80 | def _build_part_headers(self, headers: dict) -> bytes:
81 | header_bytes = b""
82 | for header, value in headers.items():
83 | header_bytes += f"{header}: {value}".encode() + self.CRLF
84 | return header_bytes
85 |
86 | def build_part(self, chunk: bytes) -> bytes:
87 | part = self.boundary + self.CRLF
88 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"}
89 | if self.content_type is not None:
90 | part_headers["Content-Type"] = self.content_type
91 | part += self._build_part_headers(part_headers)
92 | part += self.CRLF + chunk + self.CRLF
93 | return part
94 |
95 | async def stream_response(self, send: Send) -> None:
96 | await send(
97 | {
98 | "type": "http.response.start",
99 | "status": self.status_code,
100 | "headers": self.raw_headers,
101 | }
102 | )
103 | async for chunk in self.body_iterator:
104 | if not isinstance(chunk, bytes):
105 | chunk = chunk.encode(self.charset)
106 | chunk = b64encode(chunk)
107 | await send(
108 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True}
109 | )
110 |
111 | await send({"type": "http.response.body", "body": b"", "more_body": False})
112 |
113 |
114 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile:
115 | def return_content_type(filename):
116 | if gz_uncompressed_content_type:
117 | return gz_uncompressed_content_type
118 | else:
119 | return str(mimetypes.guess_type(filename)[0])
120 |
121 | filename = str(file.filename) if file.filename else ""
122 | if filename.endswith(".gz"):
123 | filename = filename[:-3]
124 |
125 | gzip_file = gzip.open(file.file).read()
126 | return UploadFile(
127 | file=io.BytesIO(gzip_file),
128 | size=len(gzip_file),
129 | filename=filename,
130 | headers=Headers({"content-type": return_content_type(filename)}),
131 | )
132 |
133 |
134 | @router.post("/test-project/v1/process-text-2")
135 | @router.post("/test-project/v1.2.3/process-text-2")
136 | def pipeline_1(
137 | request: Request,
138 | gz_uncompressed_content_type: Optional[str] = Form(default=None),
139 | text_files: Union[List[UploadFile], None] = File(default=None),
140 | input1: List[str] = Form(default=[]),
141 | input2: List[str] = Form(default=[]),
142 | ):
143 | if text_files:
144 | for file_index in range(len(text_files)):
145 | if text_files[file_index].content_type == "application/gzip":
146 | text_files[file_index] = ungz_file(text_files[file_index])
147 |
148 | content_type = request.headers.get("Accept")
149 |
150 | if isinstance(text_files, list) and len(text_files):
151 | if len(text_files) > 1:
152 | if content_type and content_type not in [
153 | "*/*",
154 | "multipart/mixed",
155 | "application/json",
156 | "text/csv",
157 | ]:
158 | raise HTTPException(
159 | detail=(
160 | f"Conflict in media type {content_type}"
161 | ' with response type "multipart/mixed".\n'
162 | ),
163 | status_code=status.HTTP_406_NOT_ACCEPTABLE,
164 | )
165 |
166 | def response_generator(is_multipart):
167 | for file in text_files:
168 | get_validated_mimetype(file)
169 |
170 | text = file.file.read().decode("utf-8")
171 |
172 | response = pipeline_api(
173 | text,
174 | m_input1=input1,
175 | m_input2=input2,
176 | )
177 |
178 | if is_multipart:
179 | if type(response) not in [str, bytes]:
180 | response = json.dumps(response)
181 | yield response
182 |
183 | if content_type == "multipart/mixed":
184 | return MultipartMixedResponse(
185 | response_generator(is_multipart=True),
186 | )
187 | else:
188 | return (
189 | list(response_generator(is_multipart=False))[0]
190 | if len(text_files) == 1
191 | else response_generator(is_multipart=False)
192 | )
193 | else:
194 | raise HTTPException(
195 | detail='Request parameter "text_files" is required.\n',
196 | status_code=status.HTTP_400_BAD_REQUEST,
197 | )
198 |
199 |
200 | app.include_router(router)
201 |
--------------------------------------------------------------------------------
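get_validated_mimetype reads UNSTRUCTURED_ALLOWED_MIMETYPES on every request, so the allow-list can be toggled at runtime; an upload whose resolved type is not listed yields an HTTP 400. A sketch under the same in-process assumptions as above:

import os

from fastapi.testclient import TestClient  # requires httpx

from prepline_test_project.api.process_text_2 import app

os.environ["UNSTRUCTURED_ALLOWED_MIMETYPES"] = "application/pdf"
client = TestClient(app)
resp = client.post(
    "/test-project/v1/process-text-2",
    files={"text_files": ("note.txt", b"hello", "text/plain")},
)
print(resp.status_code, resp.json())  # 400, text/plain is not on the allow-list

--------------------------------------------------------------------------------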
/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_3.py:
--------------------------------------------------------------------------------
1 | #####################################################################
2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS.
3 | # DO NOT MODIFY DIRECTLY
4 | #####################################################################
5 |
6 | import io
7 | import os
8 | import gzip
9 | import mimetypes
10 | from typing import List, Union
11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException
12 | from fastapi.responses import PlainTextResponse
13 | import json
14 | from fastapi.responses import StreamingResponse
15 | from starlette.datastructures import Headers
16 | from starlette.types import Send
17 | from base64 import b64encode
18 | from typing import Optional, Mapping
19 | import secrets
20 | import pandas as pd
21 |
22 |
23 | app = FastAPI()
24 | router = APIRouter()
25 |
26 |
27 | def is_expected_response_type(media_type, response_type):
28 | if media_type == "application/json" and response_type not in [dict, list]:
29 | return True
30 | elif media_type == "text/csv" and response_type != str:
31 | return True
32 | else:
33 | return False
34 |
35 |
36 | # pipeline-api
37 | def pipeline_api(text, response_type="text/csv"):
38 | data = pd.DataFrame(data={"silly_result": [str(len(text)), text, str(response_type)]})
39 | if response_type == "text/csv":
40 | return data.to_csv()
41 | else:
42 | text = " : ".join(list(data["silly_result"]))
43 | return {"silly_result": text}
44 |
45 |
46 | def get_validated_mimetype(file):
47 | """
48 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too
49 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and
50 | raise an HTTP 400 error for an invalid type.
51 | """
52 | content_type = file.content_type
53 | if not content_type or content_type == "application/octet-stream":
54 | content_type = mimetypes.guess_type(str(file.filename))[0]
55 |
56 | # Some filetypes are missing from this library, so hardcode them for now
57 | if not content_type:
58 | if file.filename.endswith(".md"):
59 | content_type = "text/markdown"
60 | elif file.filename.endswith(".msg"):
61 | content_type = "message/rfc822"
62 |
63 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
64 | if allowed_mimetypes_str is not None:
65 | allowed_mimetypes = allowed_mimetypes_str.split(",")
66 |
67 | if content_type not in allowed_mimetypes:
68 | raise HTTPException(
69 | status_code=400,
70 | detail=(
71 | f"Unable to process {file.filename}: "
72 | f"File type {content_type} is not supported."
73 | ),
74 | )
75 |
76 | return content_type
77 |
78 |
79 | class MultipartMixedResponse(StreamingResponse):
80 | CRLF = b"\r\n"
81 |
82 | def __init__(self, *args, content_type: str = None, **kwargs):
83 | super().__init__(*args, **kwargs)
84 | self.content_type = content_type
85 |
86 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None:
87 | super().init_headers(headers)
88 | self.boundary_value = secrets.token_hex(16)
89 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"'
90 | self.raw_headers.append((b"content-type", content_type.encode("latin-1")))
91 |
92 | @property
93 | def boundary(self):
94 | return b"--" + self.boundary_value.encode()
95 |
96 | def _build_part_headers(self, headers: dict) -> bytes:
97 | header_bytes = b""
98 | for header, value in headers.items():
99 | header_bytes += f"{header}: {value}".encode() + self.CRLF
100 | return header_bytes
101 |
102 | def build_part(self, chunk: bytes) -> bytes:
103 | part = self.boundary + self.CRLF
104 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"}
105 | if self.content_type is not None:
106 | part_headers["Content-Type"] = self.content_type
107 | part += self._build_part_headers(part_headers)
108 | part += self.CRLF + chunk + self.CRLF
109 | return part
110 |
111 | async def stream_response(self, send: Send) -> None:
112 | await send(
113 | {
114 | "type": "http.response.start",
115 | "status": self.status_code,
116 | "headers": self.raw_headers,
117 | }
118 | )
119 | async for chunk in self.body_iterator:
120 | if not isinstance(chunk, bytes):
121 | chunk = chunk.encode(self.charset)
122 | chunk = b64encode(chunk)
123 | await send(
124 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True}
125 | )
126 |
127 | await send({"type": "http.response.body", "body": b"", "more_body": False})
128 |
129 |
130 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile:
131 | def return_content_type(filename):
132 | if gz_uncompressed_content_type:
133 | return gz_uncompressed_content_type
134 | else:
135 | return str(mimetypes.guess_type(filename)[0])
136 |
137 | filename = str(file.filename) if file.filename else ""
138 | if filename.endswith(".gz"):
139 | filename = filename[:-3]
140 |
141 | gzip_file = gzip.open(file.file).read()
142 | return UploadFile(
143 | file=io.BytesIO(gzip_file),
144 | size=len(gzip_file),
145 | filename=filename,
146 | headers=Headers({"content-type": return_content_type(filename)}),
147 | )
148 |
149 |
150 | @router.post("/test-project/v1/process-text-3")
151 | @router.post("/test-project/v1.2.3/process-text-3")
152 | def pipeline_1(
153 | request: Request,
154 | gz_uncompressed_content_type: Optional[str] = Form(default=None),
155 | text_files: Union[List[UploadFile], None] = File(default=None),
156 | output_format: Union[str, None] = Form(default=None),
157 | ):
158 | if text_files:
159 | for file_index in range(len(text_files)):
160 | if text_files[file_index].content_type == "application/gzip":
161 | text_files[file_index] = ungz_file(text_files[file_index])
162 |
163 | content_type = request.headers.get("Accept")
164 |
165 | default_response_type = output_format or "text/csv"
166 | if not content_type or content_type == "*/*" or content_type == "multipart/mixed":
167 | media_type = default_response_type
168 | else:
169 | media_type = content_type
170 |
171 | if isinstance(text_files, list) and len(text_files):
172 | if len(text_files) > 1:
173 | if content_type and content_type not in [
174 | "*/*",
175 | "multipart/mixed",
176 | "application/json",
177 | "text/csv",
178 | ]:
179 | raise HTTPException(
180 | detail=(
181 | f"Conflict in media type {content_type}"
182 | ' with response type "multipart/mixed".\n'
183 | ),
184 | status_code=status.HTTP_406_NOT_ACCEPTABLE,
185 | )
186 |
187 | def response_generator(is_multipart):
188 | for file in text_files:
189 | get_validated_mimetype(file)
190 |
191 | text = file.file.read().decode("utf-8")
192 |
193 | response = pipeline_api(
194 | text,
195 | response_type=media_type,
196 | )
197 |
198 | if is_expected_response_type(media_type, type(response)):
199 | raise HTTPException(
200 | detail=(
201 | f"Conflict in media type {media_type}"
202 | f" with response type {type(response)}.\n"
203 | ),
204 | status_code=status.HTTP_406_NOT_ACCEPTABLE,
205 | )
206 |
207 | valid_response_types = ["application/json", "text/csv", "*/*", "multipart/mixed"]
208 | if media_type in valid_response_types:
209 | if is_multipart:
210 | if type(response) not in [str, bytes]:
211 | response = json.dumps(response)
212 | elif media_type == "text/csv":
213 | response = PlainTextResponse(response)
214 | yield response
215 | else:
216 | raise HTTPException(
217 | detail=f"Unsupported media type {media_type}.\n",
218 | status_code=status.HTTP_406_NOT_ACCEPTABLE,
219 | )
220 |
221 | def join_responses(responses):
222 | if media_type != "text/csv":
223 | return responses
224 | data = pd.read_csv(io.BytesIO(responses[0].body))
225 | if len(responses) > 1:
226 | for resp in responses[1:]:
227 | resp_data = pd.read_csv(io.BytesIO(resp.body))
228 | data = data.merge(resp_data, how="outer")
229 | return PlainTextResponse(data.to_csv())
230 |
231 | if content_type == "multipart/mixed":
232 | return MultipartMixedResponse(
233 | response_generator(is_multipart=True), content_type=media_type
234 | )
235 | else:
236 | return (
237 | list(response_generator(is_multipart=False))[0]
238 | if len(text_files) == 1
239 | else join_responses(list(response_generator(is_multipart=False)))
240 | )
241 | else:
242 | raise HTTPException(
243 | detail='Request parameter "text_files" is required.\n',
244 | status_code=status.HTTP_400_BAD_REQUEST,
245 | )
246 |
247 |
248 | app.include_router(router)
249 |
--------------------------------------------------------------------------------
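With Accept unset or */*, media_type falls back to output_format, which steers pipeline_api between its CSV and JSON branches. A sketch, same in-process assumptions:

from fastapi.testclient import TestClient  # requires httpx

from prepline_test_project.api.process_text_3 import app

client = TestClient(app)
csv_resp = client.post(
    "/test-project/v1/process-text-3",
    files={"text_files": ("note.txt", b"hi", "text/plain")},
)
json_resp = client.post(
    "/test-project/v1/process-text-3",
    files={"text_files": ("note.txt", b"hi", "text/plain")},
    data={"output_format": "application/json"},
)
print(csv_resp.text)     # CSV from DataFrame.to_csv()
print(json_resp.json())  # {'silly_result': '2 : hi : application/json'}

--------------------------------------------------------------------------------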
/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_4.py:
--------------------------------------------------------------------------------
1 | #####################################################################
2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS.
3 | # DO NOT MODIFY DIRECTLY
4 | #####################################################################
5 |
6 | import io
7 | import os
8 | import gzip
9 | import mimetypes
10 | from typing import List, Union
11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException
12 | from fastapi.responses import PlainTextResponse
13 | import json
14 | from fastapi.responses import StreamingResponse
15 | from starlette.datastructures import Headers
16 | from starlette.types import Send
17 | from base64 import b64encode
18 | from typing import Optional, Mapping
19 | import secrets
20 | import pandas as pd
21 |
22 |
23 | app = FastAPI()
24 | router = APIRouter()
25 |
26 |
27 | def is_expected_response_type(media_type, response_type):
28 | if media_type == "application/json" and response_type not in [dict, list]:
29 | return True
30 | elif media_type == "text/csv" and response_type != str:
31 | return True
32 | else:
33 | return False
34 |
35 |
36 | # pipeline-api
37 | def pipeline_api(
38 | text,
39 | response_type="text/csv",
40 | response_schema="isd",
41 | ):
42 | data = pd.DataFrame(
43 | data={"silly_result": [str(len(text)), text, str(response_type), str(response_schema)]}
44 | )
45 | if response_type == "text/csv":
46 | return data.to_csv()
47 | else:
48 | text = " : ".join(list(data["silly_result"]))
49 | return {"silly_result": text}
50 |
51 |
52 | def get_validated_mimetype(file):
53 | """
54 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too
55 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and
56 | raise an HTTP 400 error for an invalid type.
57 | """
58 | content_type = file.content_type
59 | if not content_type or content_type == "application/octet-stream":
60 | content_type = mimetypes.guess_type(str(file.filename))[0]
61 |
62 | # Some filetypes are missing from this library, so hardcode them for now
63 | if not content_type:
64 | if file.filename.endswith(".md"):
65 | content_type = "text/markdown"
66 | elif file.filename.endswith(".msg"):
67 | content_type = "message/rfc822"
68 |
69 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
70 | if allowed_mimetypes_str is not None:
71 | allowed_mimetypes = allowed_mimetypes_str.split(",")
72 |
73 | if content_type not in allowed_mimetypes:
74 | raise HTTPException(
75 | status_code=400,
76 | detail=(
77 | f"Unable to process {file.filename}: "
78 | f"File type {content_type} is not supported."
79 | ),
80 | )
81 |
82 | return content_type
83 |
84 |
85 | class MultipartMixedResponse(StreamingResponse):
86 | CRLF = b"\r\n"
87 |
88 | def __init__(self, *args, content_type: str = None, **kwargs):
89 | super().__init__(*args, **kwargs)
90 | self.content_type = content_type
91 |
92 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None:
93 | super().init_headers(headers)
94 | self.boundary_value = secrets.token_hex(16)
95 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"'
96 | self.raw_headers.append((b"content-type", content_type.encode("latin-1")))
97 |
98 | @property
99 | def boundary(self):
100 | return b"--" + self.boundary_value.encode()
101 |
102 | def _build_part_headers(self, headers: dict) -> bytes:
103 | header_bytes = b""
104 | for header, value in headers.items():
105 | header_bytes += f"{header}: {value}".encode() + self.CRLF
106 | return header_bytes
107 |
108 | def build_part(self, chunk: bytes) -> bytes:
109 | part = self.boundary + self.CRLF
110 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"}
111 | if self.content_type is not None:
112 | part_headers["Content-Type"] = self.content_type
113 | part += self._build_part_headers(part_headers)
114 | part += self.CRLF + chunk + self.CRLF
115 | return part
116 |
117 | async def stream_response(self, send: Send) -> None:
118 | await send(
119 | {
120 | "type": "http.response.start",
121 | "status": self.status_code,
122 | "headers": self.raw_headers,
123 | }
124 | )
125 | async for chunk in self.body_iterator:
126 | if not isinstance(chunk, bytes):
127 | chunk = chunk.encode(self.charset)
128 | chunk = b64encode(chunk)
129 | await send(
130 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True}
131 | )
132 |
133 | await send({"type": "http.response.body", "body": b"", "more_body": False})
134 |
135 |
136 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile:
137 | def return_content_type(filename):
138 | if gz_uncompressed_content_type:
139 | return gz_uncompressed_content_type
140 | else:
141 | return str(mimetypes.guess_type(filename)[0])
142 |
143 | filename = str(file.filename) if file.filename else ""
144 | if filename.endswith(".gz"):
145 | filename = filename[:-3]
146 |
147 | gzip_file = gzip.open(file.file).read()
148 | return UploadFile(
149 | file=io.BytesIO(gzip_file),
150 | size=len(gzip_file),
151 | filename=filename,
152 | headers=Headers({"content-type": return_content_type(filename)}),
153 | )
154 |
155 |
156 | @router.post("/test-project/v1/process-text-4")
157 | @router.post("/test-project/v1.2.3/process-text-4")
158 | def pipeline_1(
159 | request: Request,
160 | gz_uncompressed_content_type: Optional[str] = Form(default=None),
161 | text_files: Union[List[UploadFile], None] = File(default=None),
162 | output_format: Union[str, None] = Form(default=None),
163 | output_schema: str = Form(default=None),
164 | ):
165 | if text_files:
166 | for file_index in range(len(text_files)):
167 | if text_files[file_index].content_type == "application/gzip":
168 | text_files[file_index] = ungz_file(text_files[file_index])
169 |
170 | content_type = request.headers.get("Accept")
171 |
172 | default_response_type = output_format or "text/csv"
173 | if not content_type or content_type == "*/*" or content_type == "multipart/mixed":
174 | media_type = default_response_type
175 | else:
176 | media_type = content_type
177 |
178 | default_response_schema = output_schema or "isd"
179 |
180 | if isinstance(text_files, list) and len(text_files):
181 | if len(text_files) > 1:
182 | if content_type and content_type not in [
183 | "*/*",
184 | "multipart/mixed",
185 | "application/json",
186 | "text/csv",
187 | ]:
188 | raise HTTPException(
189 | detail=(
190 | f"Conflict in media type {content_type}"
191 | ' with response type "multipart/mixed".\n'
192 | ),
193 | status_code=status.HTTP_406_NOT_ACCEPTABLE,
194 | )
195 |
196 | def response_generator(is_multipart):
197 | for file in text_files:
198 | get_validated_mimetype(file)
199 |
200 | text = file.file.read().decode("utf-8")
201 |
202 | response = pipeline_api(
203 | text,
204 | response_type=media_type,
205 | response_schema=default_response_schema,
206 | )
207 |
208 | if is_expected_response_type(media_type, type(response)):
209 | raise HTTPException(
210 | detail=(
211 | f"Conflict in media type {media_type}"
212 | f" with response type {type(response)}.\n"
213 | ),
214 | status_code=status.HTTP_406_NOT_ACCEPTABLE,
215 | )
216 |
217 | valid_response_types = ["application/json", "text/csv", "*/*", "multipart/mixed"]
218 | if media_type in valid_response_types:
219 | if is_multipart:
220 | if type(response) not in [str, bytes]:
221 | response = json.dumps(response)
222 | elif media_type == "text/csv":
223 | response = PlainTextResponse(response)
224 | yield response
225 | else:
226 | raise HTTPException(
227 | detail=f"Unsupported media type {media_type}.\n",
228 | status_code=status.HTTP_406_NOT_ACCEPTABLE,
229 | )
230 |
231 | def join_responses(responses):
232 | if media_type != "text/csv":
233 | return responses
234 | data = pd.read_csv(io.BytesIO(responses[0].body))
235 | if len(responses) > 1:
236 | for resp in responses[1:]:
237 | resp_data = pd.read_csv(io.BytesIO(resp.body))
238 | data = data.merge(resp_data, how="outer")
239 | return PlainTextResponse(data.to_csv())
240 |
241 | if content_type == "multipart/mixed":
242 | return MultipartMixedResponse(
243 | response_generator(is_multipart=True), content_type=media_type
244 | )
245 | else:
246 | return (
247 | list(response_generator(is_multipart=False))[0]
248 | if len(text_files) == 1
249 | else join_responses(list(response_generator(is_multipart=False)))
250 | )
251 | else:
252 | raise HTTPException(
253 | detail='Request parameter "text_files" is required.\n',
254 | status_code=status.HTTP_400_BAD_REQUEST,
255 | )
256 |
257 |
258 | app.include_router(router)
259 |
--------------------------------------------------------------------------------
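
Note: the join_responses helper above combines per-file CSV payloads with a pandas outer merge. A minimal standalone sketch of that behavior, with invented CSV fragments:

# Standalone sketch of the outer merge performed by join_responses above;
# the CSV fragments here are invented for illustration.
import io

import pandas as pd

csv_a = "element,page\nTitle,1\n"
csv_b = "element,page\nNarrativeText,2\n"

data = pd.read_csv(io.StringIO(csv_a))
data = data.merge(pd.read_csv(io.StringIO(csv_b)), how="outer")
print(data.to_csv(index=False))
# element,page
# Title,1
# NarrativeText,2
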
/test_unstructured_api_tools/pipeline-test-project/prepline_test_project/api/process_text_file_1.py:
--------------------------------------------------------------------------------
1 | #####################################################################
2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS.
3 | # DO NOT MODIFY DIRECTLY
4 | #####################################################################
5 |
6 | import io
7 | import os
8 | import gzip
9 | import mimetypes
10 | from typing import List, Union
11 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter, HTTPException
12 | import json
13 | from fastapi.responses import StreamingResponse
14 | from starlette.datastructures import Headers
15 | from starlette.types import Send
16 | from base64 import b64encode
17 | from typing import Optional, Mapping
18 | import secrets
19 |
20 |
21 | app = FastAPI()
22 | router = APIRouter()
23 |
24 |
25 | # pipeline-api
26 | def pipeline_api(
27 | text,
28 | file=None,
29 | filename=None,
30 | file_content_type=None,
31 | ):
32 | return {
33 | "silly_result": " : ".join(
34 | [
35 | str(len(text if text else "")),
36 | str(text),
37 | str(len(file.read()) if file else None),
38 | str(filename),
39 | str(file_content_type),
40 | ]
41 | )
42 | }
43 |
44 |
45 | def get_validated_mimetype(file):
46 | """
47 | Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too
48 | generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and
49 | return HTTP 400 for an invalid type.
50 | """
51 | content_type = file.content_type
52 | if not content_type or content_type == "application/octet-stream":
53 | content_type = mimetypes.guess_type(str(file.filename))[0]
54 |
55 | # Some filetypes missing for this library, just hardcode them for now
56 | if not content_type:
57 | if file.filename.endswith(".md"):
58 | content_type = "text/markdown"
59 | elif file.filename.endswith(".msg"):
60 | content_type = "message/rfc822"
61 |
62 | allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
63 | if allowed_mimetypes_str is not None:
64 | allowed_mimetypes = allowed_mimetypes_str.split(",")
65 |
66 | if content_type not in allowed_mimetypes:
67 | raise HTTPException(
68 | status_code=400,
69 | detail=(
70 | f"Unable to process {file.filename}: "
71 | f"File type {content_type} is not supported."
72 | ),
73 | )
74 |
75 | return content_type
76 |
77 |
78 | class MultipartMixedResponse(StreamingResponse):
79 | CRLF = b"\r\n"
80 |
81 | def __init__(self, *args, content_type: str = None, **kwargs):
82 | super().__init__(*args, **kwargs)
83 | self.content_type = content_type
84 |
85 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None:
86 | super().init_headers(headers)
87 | self.boundary_value = secrets.token_hex(16)
88 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"'
89 | self.raw_headers.append((b"content-type", content_type.encode("latin-1")))
90 |
91 | @property
92 | def boundary(self):
93 | return b"--" + self.boundary_value.encode()
94 |
95 | def _build_part_headers(self, headers: dict) -> bytes:
96 | header_bytes = b""
97 | for header, value in headers.items():
98 | header_bytes += f"{header}: {value}".encode() + self.CRLF
99 | return header_bytes
100 |
101 | def build_part(self, chunk: bytes) -> bytes:
102 | part = self.boundary + self.CRLF
103 | part_headers = {"Content-Length": len(chunk), "Content-Transfer-Encoding": "base64"}
104 | if self.content_type is not None:
105 | part_headers["Content-Type"] = self.content_type
106 | part += self._build_part_headers(part_headers)
107 | part += self.CRLF + chunk + self.CRLF
108 | return part
109 |
110 | async def stream_response(self, send: Send) -> None:
111 | await send(
112 | {
113 | "type": "http.response.start",
114 | "status": self.status_code,
115 | "headers": self.raw_headers,
116 | }
117 | )
118 | async for chunk in self.body_iterator:
119 | if not isinstance(chunk, bytes):
120 | chunk = chunk.encode(self.charset)
121 | chunk = b64encode(chunk)
122 | await send(
123 | {"type": "http.response.body", "body": self.build_part(chunk), "more_body": True}
124 | )
125 |
126 | await send({"type": "http.response.body", "body": b"", "more_body": False})
127 |
128 |
129 | def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile:
130 | def return_content_type(filename):
131 | if gz_uncompressed_content_type:
132 | return gz_uncompressed_content_type
133 | else:
134 | return str(mimetypes.guess_type(filename)[0])
135 |
136 | filename = str(file.filename) if file.filename else ""
137 | if filename.endswith(".gz"):
138 | filename = filename[:-3]
139 |
140 | gzip_file = gzip.open(file.file).read()
141 | return UploadFile(
142 | file=io.BytesIO(gzip_file),
143 | size=len(gzip_file),
144 | filename=filename,
145 | headers=Headers({"content-type": return_content_type(filename)}),
146 | )
147 |
148 |
149 | @router.post("/test-project/v1/process-text-file-1")
150 | @router.post("/test-project/v1.2.3/process-text-file-1")
151 | def pipeline_1(
152 | request: Request,
153 | gz_uncompressed_content_type: Optional[str] = Form(default=None),
154 | files: Union[List[UploadFile], None] = File(default=None),
155 | text_files: Union[List[UploadFile], None] = File(default=None),
156 | ):
157 | if files:
158 | for file_index in range(len(files)):
159 | if files[file_index].content_type == "application/gzip":
160 | files[file_index] = ungz_file(files[file_index], gz_uncompressed_content_type)
161 |
162 | if text_files:
163 | for file_index in range(len(text_files)):
164 | if text_files[file_index].content_type == "application/gzip":
165 | text_files[file_index] = ungz_file(text_files[file_index])
166 |
167 | content_type = request.headers.get("Accept")
168 |
169 | has_text = isinstance(text_files, list) and len(text_files)
170 | has_files = isinstance(files, list) and len(files)
171 | if not has_text and not has_files:
172 | raise HTTPException(
173 | detail='One of the request parameters "text_files" or "files" is required.\n',
174 | status_code=status.HTTP_400_BAD_REQUEST,
175 | )
176 | files_list: List = files or []
177 | text_files_list: List = text_files or []
178 |
179 | if len(files_list) or len(text_files_list):
180 | if all(
181 | [
182 | content_type,
183 | content_type not in ["*/*", "multipart/mixed", "application/json", "text/csv"],
184 | len(files_list) + len(text_files_list) > 1,
185 | ]
186 | ):
187 | raise HTTPException(
188 | detail=(
189 | f"Conflict in media type {content_type}"
190 | ' with response type "multipart/mixed".\n'
191 | ),
192 | status_code=status.HTTP_406_NOT_ACCEPTABLE,
193 | )
194 |
195 | def response_generator(is_multipart):
196 | for text_file in text_files_list:
197 | text = text_file.file.read().decode("utf-8")
198 |
199 | response = pipeline_api(
200 | text=text,
201 | file=None,
202 | )
203 |
204 | if is_multipart:
205 | if type(response) not in [str, bytes]:
206 | response = json.dumps(response)
207 | yield response
208 |
209 | for file in files_list:
210 | _file = file.file
211 |
212 | file_content_type = get_validated_mimetype(file)
213 |
214 | response = pipeline_api(
215 | text=None,
216 | file=_file,
217 | filename=file.filename,
218 | file_content_type=file_content_type,
219 | )
220 |
221 | if is_multipart:
222 | if type(response) not in [str, bytes]:
223 | response = json.dumps(response)
224 | yield response
225 |
226 | if content_type == "multipart/mixed":
227 | return MultipartMixedResponse(
228 | response_generator(is_multipart=True),
229 | )
230 | else:
231 | return (
232 | list(response_generator(is_multipart=False))[0]
233 | if len(files_list + text_files_list) == 1
234 | else response_generator(is_multipart=False)
235 | )
236 | else:
237 | raise HTTPException(
238 | detail='Request parameters "files" or "text_files" are required.\n',
239 | status_code=status.HTTP_400_BAD_REQUEST,
240 | )
241 |
242 |
243 | app.include_router(router)
244 |
--------------------------------------------------------------------------------
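
Note: for a sense of how the generated route above behaves, here is a minimal smoke-test sketch using FastAPI's TestClient (assumes the test project package is importable and the test client's httpx dependency is installed; the payloads are invented):

# Minimal smoke test for the generated process-text-file-1 route above.
import gzip

from fastapi.testclient import TestClient

from prepline_test_project.api.process_text_file_1 import app

client = TestClient(app)

# A single plain-text upload comes back as a bare JSON response.
resp = client.post(
    "/test-project/v1/process-text-file-1",
    files=[("text_files", ("example.txt", b"hello world", "text/plain"))],
)
assert resp.status_code == 200
# resp.json() == {"silly_result": "11 : hello world : None : None : None"}

# An application/gzip upload is transparently decompressed by ungz_file above.
resp = client.post(
    "/test-project/v1/process-text-file-1",
    files=[("text_files", ("example.txt.gz", gzip.compress(b"hello world"), "application/gzip"))],
)
assert resp.status_code == 200
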
/test_unstructured_api_tools/pipeline-test-project/preprocessing-pipeline-family.yaml:
--------------------------------------------------------------------------------
1 | name: test-project
2 | version: 1.2.3
3 |
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipeline-test-project/scripts/check-and-format-notebooks.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 | from copy import deepcopy
5 | import difflib
6 | import json
7 | from pathlib import Path
8 | import sys
9 | from typing import List, Tuple, Union
10 |
11 | from nbdev import clean
12 | from nbconvert.preprocessors import ExecutePreprocessor
13 | import nbformat
14 | from unstructured_api_tools.pipelines.convert import read_notebook
15 |
16 |
17 | def process_nb(nb: nbformat.NotebookNode, working_dir: Union[str, Path]) -> nbformat.NotebookNode:
18 | """Execute cells in nb using working_dir as the working directory for imports, modifying the
19 | notebook in place (in memory)."""
20 | ep = ExecutePreprocessor(timeout=600)
21 | ep.preprocess(nb, {"metadata": {"path": working_dir}})
22 | return nb
23 |
24 |
25 | def nb_paths(root_path: Union[str, Path]) -> List[Path]:
26 | """Fetches all .ipynb filenames that belong to subdirectories of root_path (1 level deep) with
27 | 'notebooks' in the name."""
28 | root_path = Path(root_path)
29 | return [
30 | fn
31 | for dir in root_path.iterdir()
32 | # NOTE(alan): Search only in paths with 'notebooks' in the title such as pipeline-notebooks
33 | # and exploration-notebooks
34 | if "notebooks" in dir.stem and dir.is_dir()
35 | for fn in dir.iterdir()
36 | if fn.suffix == ".ipynb"
37 | ]
38 |
39 |
40 | def to_results_str(fns: List[Path], nonmatching_nbs: List[Path]) -> Tuple[str, str]:
41 | """Given files that were checked and list of files that would be changed, produces a summary of
42 | changes as well as a list of files to be changed"""
43 | unchanged = len(fns) - len(nonmatching_nbs)
44 | results = []
45 | if nonmatching_nbs:
46 | results.append(
47 | f"{len(nonmatching_nbs)} "
48 | f"{'file' if len(nonmatching_nbs) == 1 else 'files'} "
49 | f"{'would be ' if check else ''}changed"
50 | )
51 | if unchanged:
52 | results.append(
53 | f"{unchanged} "
54 | f"{'file' if unchanged == 1 else 'files'} "
55 | f"{'would be ' if check else ''}left unchanged"
56 | )
57 | summary_str = ", ".join(results) + ".\n"
58 | if nonmatching_nbs:
59 | details_str = (
60 | f"The following notebooks {'would have been' if check else 'were'} "
61 | "changed when executed and cleaned:\n* " + "\n* ".join(nonmatching_nbs) + "\n"
62 | )
63 | else:
64 | details_str = ""
65 |
66 | return summary_str, details_str
67 |
68 |
69 | if __name__ == "__main__":
70 | parser = argparse.ArgumentParser()
71 | parser.add_argument(
72 | "--check",
73 | default=False,
74 | action="store_true",
75 | help="Check notebook format without making changes. Return code 0 means formatting would "
76 | "produce no changes. Return code 1 means some files would be changed.",
77 | )
78 | parser.add_argument(
79 | "notebooks",
80 | metavar="notebook",
81 | nargs="*",
82 | help="Path(s) to notebook(s) to format (or check). If you don't pass any paths, "
83 | "notebooks in any subfolders with 'notebooks' in the name will be processed.",
84 | default=[],
85 | )
86 | args = parser.parse_args()
87 | check = args.check
88 | notebooks = args.notebooks
89 |
90 | root_path = Path(__file__).parent.parent
91 | nonmatching_nbs = []
92 | fns = notebooks if notebooks else nb_paths(root_path)
93 | for fn in fns:
94 | nb = read_notebook(fn)
95 | modified_nb = deepcopy(nb)
96 | process_nb(modified_nb, root_path)
97 | clean.clean_nb(modified_nb, allowed_cell_metadata_keys=["tags"])
98 | if nb != modified_nb:
99 | nonmatching_nbs.append(str(fn))
100 | nb_json = json.dumps(nb.dict(), indent=2, sort_keys=True)
101 | modified_nb_json = json.dumps(modified_nb.dict(), indent=2, sort_keys=True)
102 | sys.stderr.write(f"The following diff shows the modifications made to {fn}\n")
103 | sys.stderr.writelines(
104 | (
105 | difflib.unified_diff(
106 | nb_json.splitlines(keepends=True),
107 | modified_nb_json.splitlines(keepends=True),
108 | )
109 | )
110 | )
111 | if not check:
112 | nbformat.write(modified_nb, fn)
113 |
114 | summary_str, details_str = to_results_str(fns, nonmatching_nbs)
115 | print(summary_str)
116 | if check:
117 | sys.stderr.write(details_str)
118 | if nonmatching_nbs:
119 | sys.exit(1)
120 | else:
121 | print(details_str)
122 |
--------------------------------------------------------------------------------
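
Note: condensed, the round trip the script above performs on each notebook looks roughly like this sketch (the notebook path is invented, and nbformat.read stands in for the repo's read_notebook helper):

# Rough sketch of the execute-and-clean round trip above.
from copy import deepcopy

import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
from nbdev import clean

nb = nbformat.read("pipeline-notebooks/pipeline-process-text-1.ipynb", as_version=4)
executed = deepcopy(nb)
ExecutePreprocessor(timeout=600).preprocess(executed, {"metadata": {"path": "."}})
clean.clean_nb(executed, allowed_cell_metadata_keys=["tags"])
print("clean" if nb == executed else "would be changed")
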
/test_unstructured_api_tools/pipeline-test-project/scripts/test-doc-pipeline-apis-consistent.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -eu -o pipefail
4 |
5 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
6 | cd "$SCRIPT_DIR"/..
7 | PACKAGE_NAME=prepline_test_project  # package dir containing the generated api modules
8 | PIPELINE_OUTPUT_DIR=tmp-api-check-output-$RANDOM
9 | FILE_INDICATING_FAILURE="$PIPELINE_OUTPUT_DIR"-has-failures
10 | mkdir -p $PIPELINE_OUTPUT_DIR
11 | touch $PIPELINE_OUTPUT_DIR/__init__.py
12 |
13 | function tmp_pipeline_comp_cleanup () {
14 | cd "$SCRIPT_DIR"/..
15 |     rm -f "$FILE_INDICATING_FAILURE"
16 | if [[ "$1" -eq 0 ]]; then
17 | rm -rf $PIPELINE_OUTPUT_DIR
18 | fi
19 | exit "$1"
20 | }
21 |
22 | # Now in project root
23 | cd ../..
24 |
25 | PYTHONPATH=. PIPELINE_FAMILY_CONFIG=test_unstructured_api_tools/pipeline-test-project/preprocessing-pipeline-family.yaml \
26 | python3 ./unstructured_api_tools/cli.py convert-pipeline-notebooks \
27 | --input-directory ./test_unstructured_api_tools/pipeline-test-project/pipeline-notebooks \
28 | --output-directory ./test_unstructured_api_tools/pipeline-test-project/"$PIPELINE_OUTPUT_DIR"
29 |
30 | # Back in the test project
31 | cd -
32 |
33 | NUM_PIPELINE_API_FILES_GENERATED=$(find "$PIPELINE_OUTPUT_DIR" -name "*.py" | wc -l)
34 |
35 | if [[ "$NUM_PIPELINE_API_FILES_GENERATED" -eq 0 ]]; then
36 | echo "No pipelines where created by unstructured_api_tools convert-pipeline-notebooks"
37 | tmp_pipeline_comp_cleanup 1
38 | fi
39 |
40 | NUM_EXISTING_PIPELINE_API_FILES=$(find "$PACKAGE_NAME"/api -name "*.py" | wc -l)
41 |
42 | if [[ "$NUM_PIPELINE_API_FILES_GENERATED" -gt "$NUM_EXISTING_PIPELINE_API_FILES" ]]; then
43 | echo "More pipeline api files were autogenerated than appear in the ${PACKAGE_NAME}/api"
44 | tmp_pipeline_comp_cleanup 1
45 | elif [[ "$NUM_PIPELINE_API_FILES_GENERATED" -lt "$NUM_EXISTING_PIPELINE_API_FILES" ]]; then
46 | echo "Fewer pipeline api files were autogenerated than appear in the ${PACKAGE_NAME}/api"
47 | tmp_pipeline_comp_cleanup 1
48 | fi
49 |
50 | cd "$PACKAGE_NAME"/api
51 | find . -name "*.py" -print0 | while IFS= read -r -d '' pipeline_file; do
52 | set +o pipefail
53 | if ! diff -u "$pipeline_file" ../../"$PIPELINE_OUTPUT_DIR/$pipeline_file"; then
54 |         touch "../../$FILE_INDICATING_FAILURE"
55 | fi
56 | set -o pipefail
57 | done
58 | cd -
59 |
60 | if [ -r "$FILE_INDICATING_FAILURE" ]; then
61 | echo
62 | echo "Autogenerated pipeline api file(s) do not match existing versions, see above for diff's"
63 | echo " or run: diff -ru ${PACKAGE_NAME}/api/ ${PIPELINE_OUTPUT_DIR}/"
64 | tmp_pipeline_comp_cleanup 1
65 | fi
66 | tmp_pipeline_comp_cleanup 0
67 |
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipelines/test_api_conventions.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | import yaml
4 |
5 | import unstructured_api_tools.pipelines.api_conventions as conventions
6 |
7 |
8 | @pytest.fixture
9 | def sample_config():
10 | return {"version": "0.2.1", "name": "sec_filings"}
11 |
12 |
13 | @pytest.mark.parametrize(
14 | # NOTE(yuming): Test cases ref: https://regex101.com/r/Ly7O1x/3/
15 | "invalid_semver_string",
16 | [
17 | "1",
18 | "1.2",
19 | "1.2.3-0123",
20 | "1.2.3-0123.0123",
21 | "1.1.2+.123",
22 | "+invalid",
23 | "-invalid",
24 | "-invalid+invalid",
25 | "-invalid.01",
26 | "alpha",
27 | "alpha.beta",
28 | "alpha.beta.1",
29 | "alpha.1",
30 | "alpha+beta",
31 | "alpha_beta",
32 | "alpha.",
33 | "alpha..",
34 | "beta",
35 | "1.0.0-alpha_beta",
36 | "-alpha.",
37 | "1.0.0-alpha..",
38 | "1.0.0-alpha..1",
39 | "1.0.0-alpha...1",
40 | "1.0.0-alpha....1",
41 | "1.0.0-alpha.....1",
42 | "1.0.0-alpha......1",
43 | "1.0.0-alpha.......1",
44 | "01.1.1",
45 | "1.01.1",
46 | "1.1.01",
47 | "1.2",
48 | "1.2.3.DEV",
49 | "1.2-SNAPSHOT",
50 | "1.2.31.2.3----RC-SNAPSHOT.12.09.1--..12+788",
51 | "1.2-RC-SNAPSHOT",
52 | "-1.0.3-gamma+b7718",
53 | "+justmeta",
54 | "9.8.7+meta+meta",
55 | "9.8.7-whatever+meta+meta",
56 | "9999999999999.999999999999999999.999999999----RC-SNAPSHOT.12.09.1---------..12",
57 | ],
58 | )
59 | def test_raise_for_invalid_semver_string(invalid_semver_string):
60 | with pytest.raises(ValueError):
61 | conventions.raise_for_invalid_semver_string(invalid_semver_string)
62 |
63 |
64 | @pytest.mark.parametrize(
65 | # NOTE(yuming): Test cases ref: https://regex101.com/r/Ly7O1x/3/
66 | "valid_semver_string",
67 | [
68 | "0.0.4",
69 | "1.2.3",
70 | "10.20.30",
71 | "1.1.2-prerelease+meta",
72 | "1.1.2+meta",
73 | "1.1.2+meta-valid",
74 | "1.0.0-alpha",
75 | "1.0.0-beta",
76 | "1.0.0-alpha.beta",
77 | "1.0.0-alpha.beta.1",
78 | "1.0.0-alpha.1",
79 | "1.0.0-alpha0.valid",
80 | "1.0.0-alpha.0valid",
81 | "1.0.0-alpha-a.b-c-somethinglong+build.1-aef.1-its-okay",
82 | "1.0.0-rc.1+build.1",
83 | "2.0.0-rc.1+build.123",
84 | "1.2.3-beta",
85 | "10.2.3-DEV-SNAPSHOT",
86 | "1.2.3-SNAPSHOT-123",
87 | "1.0.0",
88 | "2.0.0",
89 | "1.1.7",
90 | "2.0.0+build.1848",
91 | "2.0.1-alpha.1227",
92 | "1.0.0-alpha+beta",
93 | "1.2.3----RC-SNAPSHOT.12.9.1--.12+788",
94 | "1.2.3----R-S.12.9.1--.12+meta",
95 | "1.2.3----RC-SNAPSHOT.12.9.1--.12",
96 | "1.0.0+0.build.1-rc.10000aaa-kk-0.1",
97 | "99999999999999999999999.999999999999999999.99999999999999999",
98 | "1.0.0-0A.is.legal",
99 | ],
100 | )
101 | def test_pass_for_valid_semver_string(valid_semver_string):
102 | try:
103 | conventions.raise_for_invalid_semver_string(valid_semver_string)
104 | except ValueError:
105 | assert False, f"{valid_semver_string} raised an exception."
106 |
107 |
108 | def test_get_pipeline_path():
109 | path = conventions.get_pipeline_path(
110 | filename="risk_narrative.py", pipeline_family="sec_filings", semver="0.2.1"
111 | )
112 | assert path == "/sec-filings/v0.2.1/risk-narrative"
113 |
114 |
115 | def test_get_short_pipeline_path():
116 | path = conventions.get_pipeline_path(
117 | filename="risk_narrative.py",
118 | pipeline_family="sec_filings",
119 | semver="0.2.1",
120 | shorter=True,
121 | )
122 |
123 | assert path == "/sec-filings/v0/risk-narrative"
124 |
125 |
126 | def test_get_pipeline_path_raises_if_either_not_specified():
127 | with pytest.raises(ValueError):
128 | conventions.get_pipeline_path(
129 | filename="risk_narrative.py", pipeline_family="sec_filings", semver=None
130 | )
131 |
132 | with pytest.raises(ValueError):
133 | conventions.get_pipeline_path(
134 | filename="risk_narrative.py", pipeline_family=None, semver="0.2.1"
135 | )
136 |
137 |
138 | def test_get_pipeline_path_reads_from_file(tmpdir, sample_config):
139 | filename = os.path.join(tmpdir.dirname, "pipeline-family.yaml")
140 | with open(filename, "w") as f:
141 | yaml.dump(sample_config, f)
142 |
143 | path = conventions.get_pipeline_path(filename="risk_narrative.py", config_filename=filename)
144 | assert path == "/sec-filings/v0.2.1/risk-narrative"
145 |
146 |
147 | def test_pipeline_config_reads_from_file(tmpdir, sample_config):
148 | filename = os.path.join(tmpdir.dirname, "pipeline-family.yaml")
149 | with open(filename, "w") as f:
150 | yaml.dump(sample_config, f)
151 |
152 | config = conventions.PipelineConfig(filename=filename)
153 | assert config.name == "sec_filings"
154 | assert config.version == "0.2.1"
155 |
156 |
157 | def test_pipeline_config_reads_from_env(tmpdir, monkeypatch, sample_config):
158 | filename = os.path.join(tmpdir.dirname, "pipeline-family.yaml")
159 | with open(filename, "w") as f:
160 | yaml.dump(sample_config, f)
161 |
162 | monkeypatch.setenv("PIPELINE_FAMILY_CONFIG", filename)
163 |
164 | config = conventions.PipelineConfig(filename=None)
165 | assert config.name == "sec_filings"
166 |
167 |
168 | def test_pipeline_config_raises_with_missing_file(tmpdir, monkeypatch, sample_config):
169 |     # NOTE(robinson) - Will default to looking for ${PWD}/preprocessing-pipeline-family.yaml,
170 |     # which does not exist
171 | with pytest.raises(FileNotFoundError):
172 | conventions.PipelineConfig(filename=None)
173 |
--------------------------------------------------------------------------------
/test_unstructured_api_tools/pipelines/test_lint.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | import re
4 | from unittest.mock import patch
5 |
6 | import unstructured_api_tools.pipelines.lint as lint
7 |
8 |
9 | class MockPopen:
10 | def __init__(self, *args, **kwargs):
11 | pass
12 |
13 | def communicate(self, *args, **kwargs):
14 | raise ValueError("Squawk!")
15 |
16 |
17 | def test_run_lint_cmd_cleans_up_on_exception(monkeypatch):
18 | monkeypatch.setattr(lint, "Popen", MockPopen)
19 | with patch.object(os, "unlink", return_value=None) as mock_unlink:
20 | with pytest.raises(ValueError):
21 | lint._run_lint_cmd(["fake"], "fake.py", re.compile("[A-Z]"))
22 |
23 | mock_unlink.assert_called_once()
24 |
25 |
26 | def test_flake8():
27 | file_text = """# A test file
28 |
29 | def hello_world():
30 | pass
31 | """
32 | assert lint.check_flake8(file_text) is True
33 |
34 |
35 | def test_flake8_passes_with_unused_import():
36 | file_text = """# A test file
37 | import os
38 |
39 |
40 | def hello_world():
41 | pass
42 | """
43 | assert lint.check_flake8(file_text) is True
44 |
45 |
46 | def test_flake8_raises_with_bad_lint():
47 | file_text = """# A test file
48 |
49 | def hello_world() :
50 | pass"""
51 | with pytest.raises(lint.LintError):
52 | lint.check_flake8(file_text)
53 |
54 |
55 | def test_format_black():
56 | file_text = """# A test file
57 |
58 | def hello_world() :
59 | pass
60 | """
61 | formatted_text = lint.format_black(file_text)
62 |
63 | assert (
64 | formatted_text
65 | == """# A test file
66 |
67 |
68 | def hello_world():
69 | pass
70 | """
71 | )
72 |
73 |
74 | def test_validate_flake8_ignore():
75 |     assert lint.validate_flake8_ignore("E405, F401") is True
76 |
77 |
78 | def test_validate_flake8_ignore_bad_input():
79 | with pytest.raises(ValueError):
80 | lint.validate_flake8_ignore("NOT A REAL CODE")
81 |
82 |
83 | def test_mypy():
84 | file_text = """# A test file
85 |
86 | def hello_world(text: str) -> str:
87 | return text
88 | """
89 | assert lint.check_mypy(file_text) is True
90 |
91 |
92 | def test_mypy_raises_with_bad_type():
93 | file_text = """# A test file
94 |
95 | def hello_world(text: str) -> str:
96 | return int(text)
97 | """
98 | with pytest.raises(lint.LintError):
99 | lint.check_mypy(file_text)
100 |
101 |
102 | def test_check_black():
103 | file_text = """# A test file
104 |
105 |
106 | def hello_world():
107 | pass
108 | """
109 | assert lint.check_black(file_text) is True
110 |
111 |
112 | def test_check_black_raises_with_bad_format():
113 | file_text = """# A test file
114 |
115 |
116 | def hello_world() :
117 | pass
118 | """
119 | with pytest.raises(lint.LintError):
120 | lint.check_black(file_text)
121 |
--------------------------------------------------------------------------------
/test_unstructured_api_tools/test_cli.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import pytest
4 |
5 | from click.testing import CliRunner
6 | from nbformat import NotebookNode
7 |
8 | import unstructured_api_tools.cli as cli
9 |
10 |
11 | @pytest.fixture
12 | def sample_notebook():
13 | return NotebookNode(
14 | {
15 | "cells": [
16 | {
17 | "cell_type": "code",
18 | "execution_count": 1,
19 | "id": "768fa8c6",
20 | "metadata": {},
21 | "outputs": [],
22 | "source": "# pipeline-api\nimport random", # noqa: E501
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 1,
27 | "id": "64f6386b",
28 | "metadata": {},
29 | "outputs": [],
30 | "source": "def function_not_to_include():\n pass",
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "id": "45988caf",
35 | "metadata": {},
36 | "source": "# pipeline-api",
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 2,
41 | "id": "c8e0cad6",
42 | "metadata": {},
43 | "outputs": [],
44 | "source": "# pipeline-api\ndef pipeline_api(text: str):\n sec_document = 'not a real document'\n risk_narrative = sec_document[0:5]\n return risk_narrative", # noqa: E501
45 | },
46 | ],
47 | "metadata": {
48 | "kernelspec": {
49 | "display_name": "Python 3 (ipykernel)",
50 | "language": "python",
51 | "name": "python3",
52 | },
53 | "language_info": {
54 | "codemirror_mode": {"name": "ipython", "version": 3},
55 | "file_extension": ".py",
56 | "mimetype": "text/x-python",
57 | "name": "python",
58 | "nbconvert_exporter": "python",
59 | "pygments_lexer": "ipython3",
60 | "version": "3.8.13",
61 | },
62 | },
63 | "nbformat": 4,
64 | "nbformat_minor": 5,
65 | }
66 | )
67 |
68 |
69 | def test_convert_pipeline_notebooks(sample_notebook, tmpdir):
70 | for i in range(5):
71 | filename = os.path.join(tmpdir.dirname, f"pipeline-this-is-a-test-{i}.ipynb")
72 | with open(filename, "w") as f:
73 | json.dump(sample_notebook, f, indent=4)
74 |
75 | runner = CliRunner()
76 | result = runner.invoke(
77 | cli.cli,
78 | [
79 | "convert-pipeline-notebooks",
80 | "--input-directory",
81 | tmpdir.dirname,
82 | "--output-directory",
83 | tmpdir.dirname,
84 | "--pipeline-family",
85 | "fake-family-name",
86 | "--semver",
87 | "2.1.1",
88 | ],
89 | )
90 | assert result.exit_code == 0
91 |
92 | files = os.listdir(tmpdir.dirname)
93 | for i in range(5):
94 | assert f"this_is_a_test_{i}.py" in files
95 | assert "app.py" in files
96 |
97 |
98 | def test_convert_pipeline_notebooks_passing_flake8_ignore(sample_notebook, tmpdir):
99 | for i in range(5):
100 | filename = os.path.join(tmpdir.dirname, f"pipeline-this-is-a-test-{i}.ipynb")
101 | with open(filename, "w") as f:
102 | json.dump(sample_notebook, f, indent=4)
103 |
104 | runner = CliRunner()
105 | result = runner.invoke(
106 | cli.cli,
107 | [
108 | "convert-pipeline-notebooks",
109 | "--input-directory",
110 | tmpdir.dirname,
111 | "--output-directory",
112 | tmpdir.dirname,
113 | "--pipeline-family",
114 | "fake-family-name",
115 | "--semver",
116 | "2.1.1",
117 | "--flake8-ignore",
118 | "E402, F401",
119 | ],
120 | )
121 | assert result.exit_code == 0
122 |
123 | files = os.listdir(tmpdir.dirname)
124 | for i in range(5):
125 | assert f"this_is_a_test_{i}.py" in files
126 | assert "app.py" in files
127 |
--------------------------------------------------------------------------------
/unstructured_api_tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/unstructured_api_tools/__init__.py
--------------------------------------------------------------------------------
/unstructured_api_tools/__version__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.10.11" # pragma: no cover
2 |
--------------------------------------------------------------------------------
/unstructured_api_tools/cli.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Optional
3 |
4 | import click
5 |
6 | from unstructured_api_tools.pipelines.convert import convert_notebook_files_to_api
7 | from unstructured_api_tools.pipelines.lint import (
8 | FLAKE8_DEFAULT_OPTS,
9 | validate_flake8_ignore,
10 | )
11 |
12 |
13 | @click.group()
14 | def cli():
15 | pass
16 |
17 |
18 | @cli.command()
19 | @click.option("--input-directory")
20 | @click.option("--output-directory")
21 | @click.option("--pipeline-family")
22 | @click.option("--semver")
23 | @click.option("--config-filename")
24 | @click.option("--flake8-ignore")
25 | def convert_pipeline_notebooks(
26 | input_directory: str,
27 | output_directory: str,
28 | pipeline_family: Optional[str] = None,
29 | semver: Optional[str] = None,
30 | config_filename: Optional[str] = None,
31 | flake8_ignore: Optional[str] = None,
32 | ):
33 | """Convert a pipeline notebook to a Python script. The conversion script will retain
34 | any cell that includes # pipeline-api at the top."""
35 | notebook_filenames = sorted([f for f in os.listdir(input_directory) if f.endswith(".ipynb")])
36 |
37 | if flake8_ignore:
38 | validate_flake8_ignore(flake8_ignore)
39 | # NOTE(robinson) - Not making line length configurable because setting it to
40 | # 100 allows flake8 to be consistent with black
41 | flake8_opts = ["--max-line-length", "100", "--ignore", flake8_ignore]
42 | else:
43 | flake8_opts = FLAKE8_DEFAULT_OPTS
44 |
45 | convert_notebook_files_to_api(
46 | notebook_filenames,
47 | input_directory,
48 | output_directory,
49 | pipeline_family=pipeline_family,
50 | semver=semver,
51 | config_filename=config_filename,
52 | flake8_opts=flake8_opts,
53 | )
54 |
55 |
56 | if __name__ == "__main__":
57 |     cli()  # pragma: no cover
58 |
--------------------------------------------------------------------------------
/unstructured_api_tools/pipelines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/unstructured-api-tools/634370e9591dff36830b225d20d5b4995a61e24a/unstructured_api_tools/pipelines/__init__.py
--------------------------------------------------------------------------------
/unstructured_api_tools/pipelines/api_conventions.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | import os
3 | from typing import Optional
4 | import yaml
5 | import re
6 |
7 |
8 | def get_config(filename: Optional[str] = None):
9 | if filename is None:
10 | default = os.path.join(os.getcwd(), "preprocessing-pipeline-family.yaml")
11 | filename = os.environ.get("PIPELINE_FAMILY_CONFIG", default)
12 |
13 | if not os.path.exists(filename):
14 | raise FileNotFoundError(
15 | f"A pipeline family config was not found at {filename}."
16 | "The config class looks for the config in the following "
17 | "order:\n"
18 | " 1. The filename parameter\n"
19 | " 2. The PIPELINE_FAMILY_CONFIG environment variable\n"
20 | ' 3. "${PWD}"/pipeline-family.yaml'
21 | )
22 |
23 | with open(filename, "r") as f:
24 | config = yaml.safe_load(f)
25 |
26 | return config
27 |
28 |
29 | @dataclass
30 | class PipelineConfig:
31 | name: str
32 | version: str
33 | description: str
34 | long_description: str
35 | filename: str
36 |
37 | def __init__(self, filename: Optional[str] = None):
38 | """Parses pipeline family metadata from the pipeline-family.yaml file. If no
39 | filename is passed, reverts to the PIPELINE_FAMILY_CONFIG environment variable.
40 | Otherwise, looks for pipeline-family.yaml in the working directory."""
41 | config = get_config(filename)
42 |
43 | self.name = config["name"]
44 | self.version = config["version"]
45 | self.description = config.get("description", "Unstructured Pipeline API")
46 | self.long_description = config.get("long_description", "")
47 |
48 |
49 | def raise_for_invalid_semver_string(semver: str):
50 | """Raise an error if the semver string is invalid."""
51 | # NOTE(yuming): Suggested regular expression (RegEx) to check a semver string
52 | # ref: https://semver.org/#is-there-a-suggested-regular-expression
53 | # -regex-to-check-a-semver-string
54 |     valid_semver_pattern = r"""^(?P<major>0|[1-9]\d*)\.(?P<minor>0|[1-9]\d*)\.
55 |         (?P<patch>0|[1-9]\d*)(?:-(?P<prerelease>(?:0|[1-9]\d*|\d*[a-zA-Z-]
56 |         [0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?
57 |         (?:\+(?P<buildmetadata>[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$"""
58 | valid_semver_re = re.compile(valid_semver_pattern, re.VERBOSE)
59 |
60 | if not re.match(valid_semver_re, semver):
61 | raise ValueError("Semver string must be a valid string.")
62 |
63 |
64 | def get_pipeline_path(
65 | filename: str,
66 | pipeline_family: Optional[str] = None,
67 | semver: Optional[str] = None,
68 | config_filename: Optional[str] = None,
69 | shorter: Optional[bool] = False,
70 | ) -> str:
71 | """Builds the pipeline path according to the conventions outlined in the architecture docs.
72 | ref: https://github.com/Unstructured-IO/
73 | docs-and-arch/blob/main/Pipelines-and-APIs.md#api-specification
74 | """
75 | if any([pipeline_family, semver]) and not all([pipeline_family, semver]):
76 | raise ValueError(
77 | "If either pipeline_family or semver is specified, the other must be "
78 | "specified as well."
79 | )
80 |
81 | if not any([pipeline_family, semver]):
82 | config = PipelineConfig(filename=config_filename)
83 | pipeline_family = config.name
84 | semver = config.version
85 | else:
86 | # NOTE(robinson) - Explicit type casting if the variables are passed. Otherwise
87 | # mypy gets cranky because Optional[str] implies they could be None.
88 | pipeline_family = str(pipeline_family)
89 | semver = str(semver)
90 |
91 | raise_for_invalid_semver_string(semver)
92 |
93 | if shorter:
94 | semver = semver.split(".")[0]
95 |
96 | pipeline_family = pipeline_family.replace("_", "-")
97 |
98 | filepath = filename.split("/")
99 | # NOTE(robinson) - Converts something like "sec_filings.py" to "sec-filings"
100 | pipeline_name = filepath[-1].replace("_", "-").replace(".py", "")
101 |
102 | return f"/{pipeline_family}/v{semver}/{pipeline_name}"
103 |
104 |
105 | def get_api_name_from_config(filename: Optional[str] = None):
106 | try:
107 | return get_config(filename).get("name", None)
108 | except FileNotFoundError:
109 | return None
110 |
--------------------------------------------------------------------------------
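
Note: worked examples of the conventions above; the values mirror the tests in this repo:

# Worked examples of the path conventions implemented above.
from unstructured_api_tools.pipelines.api_conventions import (
    get_pipeline_path,
    raise_for_invalid_semver_string,
)

raise_for_invalid_semver_string("0.2.1")  # passes silently for a valid semver

path = get_pipeline_path(
    filename="risk_narrative.py", pipeline_family="sec_filings", semver="0.2.1"
)
print(path)  # /sec-filings/v0.2.1/risk-narrative

short = get_pipeline_path(
    filename="risk_narrative.py", pipeline_family="sec_filings", semver="0.2.1", shorter=True
)
print(short)  # /sec-filings/v0/risk-narrative
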
/unstructured_api_tools/pipelines/lint.py:
--------------------------------------------------------------------------------
1 | """Tools for linting and autoformatting generated API files."""
2 | import os
3 | import re
4 | from subprocess import PIPE, Popen
5 | import tempfile
6 | from typing import List
7 | from autoflake import (
8 | check,
9 | filter_unused_import,
10 | SAFE_IMPORTS,
11 | unused_import_module_name,
12 | filter_useless_pass,
13 | )
14 | import pyflakes.api
15 | import pyflakes.messages
16 | import pyflakes.reporter
17 | import io
18 | import collections
19 |
20 | from black import format_str, FileMode
21 | from autoflake import fix_code
22 |
23 | # NOTE(robinson) - F401 is for unused imports
24 | FLAKE8_DEFAULT_OPTS: List[str] = ["--max-line-length", "100", "--ignore", "F401"]
25 | FLAKE8_PREFIX_RE = re.compile(r".+:\d+:\d+:\s")
26 | FLAKE8_ERROR_CODE_RE = re.compile(r"([A-Z]\d{3},?\s?)+")
27 |
28 | MYPY_PREFIX_RE = re.compile(r".+:\d+:\s")
29 |
30 |
31 | class LintError(RuntimeError):
32 | pass
33 |
34 |
35 | def _create_tempfile(file_text: str):
36 | tmp = tempfile.NamedTemporaryFile(delete=False)
37 | tmp.write(file_text.encode())
38 | tmp.close()
39 | return tmp
40 |
41 |
42 | def _create_file_for_user_debugging(content: str, filename: str):
43 | """Creates file in user's current working to facilitate debugging lint errors."""
44 | with open(filename, "w+") as f:
45 | f.write(content)
46 |
47 |
48 | def _run_lint_cmd(cmd: List[str], filename: str, prefix_re: re.Pattern):
49 | """Runs a subprocess with the specified lint command and raises a LintError
50 | if the file does not pass."""
51 | try:
52 | process = Popen(cmd, stdout=PIPE, stderr=PIPE)
53 | stdout, _ = process.communicate()
54 | except Exception as e:
55 | # NOTE(robinson) - Catching the error ensures we clean up the temp file
56 | os.unlink(filename) # NOTE(robinson) - Removes the temporary file
57 | raise e
58 |
59 | os.unlink(filename) # NOTE(robinson) - Removes the temporary file
60 | if process.returncode != 0:
61 | err = prefix_re.sub("", stdout.decode("utf-8"))
62 | raise LintError("\n\n" + err)
63 |
64 | return True
65 |
66 |
67 | def check_flake8(file_text: str, opts: List[str] = FLAKE8_DEFAULT_OPTS) -> bool:
68 | """Runs flake8 on the text. Raises and exception if the file does
69 | not pass linting. Uses subprocess because per the Flake8 docs, Flake8
70 | does not have a public Python API.
71 | ref: https://flake8.pycqa.org/en/latest/user/python-api.html#public-python-api"""
72 | tmp = _create_tempfile(file_text)
73 | cmd = ["flake8", tmp.name] + opts
74 | try:
75 |         _run_lint_cmd(cmd, tmp.name, FLAKE8_PREFIX_RE)
76 | except Exception as e:
77 | debug_file = "tmp-flake8-check-pipeline-api.py"
78 | _create_file_for_user_debugging(file_text, debug_file)
79 | cmd[1] = debug_file
80 | raise LintError("run the following to debug: \n" f"{' '.join(cmd)}") from e
81 | return True
82 |
83 |
84 | def validate_flake8_ignore(flake8_ignore: str) -> bool:
85 | """Validates the CLI argument for Flake8 errors. For CLI input validation."""
86 | if FLAKE8_ERROR_CODE_RE.match(flake8_ignore) is None:
87 | raise ValueError(f"{flake8_ignore} is an invalid argument for the --flake8-ignore flag.")
88 | return True
89 |
90 |
91 | def check_mypy(file_text: str) -> bool:
92 | """Runs mypy type checking on the file text."""
93 | tmp = _create_tempfile(file_text)
94 | cmd = ["mypy", tmp.name, "--ignore-missing-imports", "--implicit-optional"]
95 | try:
96 | _run_lint_cmd(cmd, tmp.name, MYPY_PREFIX_RE)
97 | except Exception as e:
98 | debug_file = "tmp-myp-check-pipeline-api.py"
99 | _create_file_for_user_debugging(file_text, debug_file)
100 | cmd[1] = debug_file
101 | raise LintError("run the following to debug: \n" f"{' '.join(cmd)}") from e
102 | return True
103 |
104 |
105 | def check_black(file_text: str) -> bool:
106 | """Checks if a file needs to be reformatted with black."""
107 | passes = format_black(file_text) == file_text
108 | if not passes:
109 | raise LintError("File text needs to be reformatted with black.")
110 | return passes
111 |
112 |
113 | def format_black(file_text: str) -> str:
114 | """Auto-formats a file using black."""
115 | return format_str(file_text, mode=FileMode(line_length=100))
116 |
117 |
118 | def format_autoflake(file_text: str) -> str:
119 | return fix_code(
120 | source=file_text,
121 | remove_unused_variables=True,
122 | remove_all_unused_imports=True,
123 | expand_star_imports=True,
124 | )
125 |
126 |
127 | """
128 | Autoflake only takes into account unused imports by checking for pyflakes.messages.UnusedImport
129 | but does not handle duplicate imports which come out as pyflakes.messages.RedefinedWhileUnused
130 | from pyflakes. The following code is an extension of autoflake to take duplicate
131 | imports into account
132 | """
133 |
134 |
135 | def duplicate_import_line_numbers(messages):
136 | """Yield line numbers of unused imports."""
137 | for message in messages:
138 | if isinstance(message, pyflakes.messages.RedefinedWhileUnused):
139 | yield message.lineno
140 |
141 |
142 | def _remove_duplicate_imports(text: str):
143 | messages = check(text)
144 | marked_import_line_numbers = frozenset(
145 | duplicate_import_line_numbers(messages),
146 | )
147 | marked_unused_module = collections.defaultdict(lambda: [])
148 | for line_number, module_name in unused_import_module_name(messages):
149 | marked_unused_module[line_number].append(module_name)
150 | sio = io.StringIO(text)
151 | previous_line = ""
152 | result = None
153 | for line_number, line in enumerate(sio.readlines(), start=1):
154 | if line_number in marked_import_line_numbers:
155 | result = filter_unused_import(
156 | line,
157 | unused_module=marked_unused_module[line_number],
158 | remove_all_unused_imports=True,
159 | imports=SAFE_IMPORTS,
160 | previous_line=previous_line,
161 | )
162 | else:
163 | result = line
164 | yield result
165 | previous_line = line
166 |
167 |
168 | def remove_duplicate_imports(text: str) -> str:
169 | return "".join(filter_useless_pass("".join(_remove_duplicate_imports(text))))
170 |
--------------------------------------------------------------------------------
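
Note: a quick tour of the formatting helpers above, with invented inputs (the format_black output mirrors the tests in this repo; the exact remove_duplicate_imports result can vary with the installed autoflake/pyflakes versions):

# Invented inputs demonstrating the formatting helpers above.
from unstructured_api_tools.pipelines import lint

messy = "# A test file\n\ndef hello_world() :\n    pass\n"
print(lint.format_black(messy))
# Prints the black-formatted text with normalized spacing and blank lines.

duplicated = "import io\nimport io\n\nprint(io.__name__)\n"
print(lint.remove_duplicate_imports(duplicated))
# The second `import io`, which pyflakes flags as RedefinedWhileUnused, is filtered out.
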
/unstructured_api_tools/pipelines/templates/pipeline_app.txt:
--------------------------------------------------------------------------------
1 | #####################################################################
2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS.
3 | # DO NOT MODIFY DIRECTLY
4 | #####################################################################
5 |
6 |
7 | from fastapi import FastAPI, Request, status
8 | import logging
9 | import os
10 |
11 | {% for module in module_names -%}
12 | from .{{ module }} import router as {{module}}_router
13 | {% endfor %}
14 |
15 | app = FastAPI(
16 | title="{{ title }}",
17 | description="""{{ description }}""",
18 | version="{{ version or '1.0.0' }}",
19 | docs_url="{{ '/' ~ version_name ~ '/docs' if version_name else '/docs' }}",
20 | openapi_url="{{ '/' ~ version_name ~ '/openapi.json' if version_name else '/openapi.json' }}"
21 | )
22 |
23 | allowed_origins = os.environ.get("ALLOWED_ORIGINS", None)
24 | if allowed_origins:
25 | from fastapi.middleware.cors import CORSMiddleware
26 | app.add_middleware(
27 | CORSMiddleware,
28 | allow_origins=allowed_origins.split(","),
29 | allow_methods=["OPTIONS", "POST"],
30 | allow_headers=["Content-Type"]
31 | )
32 |
33 | {% for module in module_names -%}
34 | app.include_router({{ module }}_router)
35 | {% endfor %}
36 |
37 | # Filter out /healthcheck noise
38 | class HealthCheckFilter(logging.Filter):
39 | def filter(self, record: logging.LogRecord) -> bool:
40 | return record.getMessage().find("/healthcheck") == -1
41 |
42 | # Filter out /metrics noise
43 | class MetricsCheckFilter(logging.Filter):
44 | def filter(self, record: logging.LogRecord) -> bool:
45 | return record.getMessage().find("/metrics") == -1
46 |
47 | logging.getLogger("uvicorn.access").addFilter(HealthCheckFilter())
48 | logging.getLogger("uvicorn.access").addFilter(MetricsCheckFilter())
49 |
50 | @app.get("/healthcheck", status_code=status.HTTP_200_OK, include_in_schema=False)
51 | def healthcheck(request: Request):
52 | return {"healthcheck": "HEALTHCHECK STATUS: EVERYTHING OK!"}
53 |
--------------------------------------------------------------------------------
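
Note: a minimal sketch of rendering the template above directly with jinja2 (the converter wires this up internally; the context values here are invented):

# Minimal jinja2 rendering sketch for pipeline_app.txt; context values are invented.
from jinja2 import Template

with open("unstructured_api_tools/pipelines/templates/pipeline_app.txt") as f:
    template = Template(f.read())

print(
    template.render(
        module_names=["process_text_1"],
        title="Test Project",
        description="Pipeline API",
        version="1.2.3",
        version_name="v1.2.3",
    )
)
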