├── .github ├── dependabot.yml └── workflows │ ├── publish.yml │ └── test.yml ├── .gitignore ├── LICENSE ├── README.md ├── datasette_extract ├── __init__.py ├── static │ ├── extract.css │ └── heic2any-0.0.4.min.js └── templates │ ├── _extract_base_styles.html │ ├── _extract_drop_handler.html │ ├── extract.html │ ├── extract_create_table.html │ ├── extract_progress.html │ └── extract_to_table.html ├── pyproject.toml └── tests ├── cassettes └── test_web │ └── test_extract_flow.yaml ├── conftest.py └── test_web.py /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: pip 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | groups: 8 | python-packages: 9 | patterns: 10 | - "*" 11 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | test: 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v5 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | cache: pip 23 | cache-dependency-path: pyproject.toml 24 | - name: Install dependencies 25 | run: | 26 | pip install '.[test]' 27 | - name: Run tests 28 | run: | 29 | python -m pytest 30 | deploy: 31 | runs-on: ubuntu-latest 32 | needs: [test] 33 | environment: release 34 | permissions: 35 | id-token: write 36 | steps: 37 | - uses: actions/checkout@v4 38 | - name: Set up Python 39 | uses: actions/setup-python@v5 40 | with: 41 | python-version: "3.12" 42 | cache: pip 43 | cache-dependency-path: pyproject.toml 44 | - name: Install dependencies 45 | run: | 46 | pip install setuptools wheel build 47 | - name: Build 48 | run: | 49 | python -m build 50 | - name: Publish 51 | uses: pypa/gh-action-pypi-publish@release/v1 52 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: [push, pull_request] 4 | 5 | permissions: 6 | contents: read 7 | 8 | jobs: 9 | test: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | matrix: 13 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | cache: pip 21 | cache-dependency-path: pyproject.toml 22 | - name: Install dependencies 23 | run: | 24 | pip install '.[test]' 25 | - name: Run tests 26 | run: | 27 | python -m pytest 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .venv 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | venv 6 | .eggs 7 | .pytest_cache 8 | *.egg-info 9 | .DS_Store 10 | .idea/ 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | 
http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # datasette-extract 2 | 3 | [![PyPI](https://img.shields.io/pypi/v/datasette-extract.svg)](https://pypi.org/project/datasette-extract/) 4 | [![Changelog](https://img.shields.io/github/v/release/datasette/datasette-extract?include_prereleases&label=changelog)](https://github.com/datasette/datasette-extract/releases) 5 | [![Tests](https://github.com/datasette/datasette-extract/workflows/Test/badge.svg)](https://github.com/datasette/datasette-extract/actions?query=workflow%3ATest) 6 | [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/datasette/datasette-extract/blob/main/LICENSE) 7 | 8 | Import unstructured data (text and images) into structured tables 9 | 10 | ## Installation 11 | 12 | Install this plugin in the same environment as [Datasette](https://datasette.io/). 13 | ```bash 14 | datasette install datasette-extract 15 | ``` 16 | 17 | ## Configuration 18 | 19 | This plugin requires an [OpenAI API key](https://platform.openai.com/api-keys). 20 | 21 | You can set this using the `DATASETTE_SECRETS_OPENAI_API_KEY` environment variable, or you can configure the [datasette-secrets](https://github.com/datasette/datasette-secrets) plugin to allow users to enter their own API key and save it, encrypted, in their database. 22 | 23 | Here's how to start using Datasette with that environment variable: 24 | 25 | ```bash 26 | DATASETTE_SECRETS_OPENAI_API_KEY="xxx" datasette data.db --root --create 27 | # Now click or command-click the URL containing .../-/auth-token?token=... 28 | ``` 29 | - Replace `xxx` with your OpenAI API key 30 | - The `--root` flag causes Datasette to output a link that will sign you in as root 31 | - The `--create` flag will create the `data.db` SQLite database file if it does not exist 32 | 33 | If you are using other models from plugins, consult those LLM plugins' documentation for how to configure their API keys, if they need one. 34 | 35 | By default, all async LLM models that support schemas will be offered as options to the user.
You can restrict that to a subset of models using the `models` setting: 36 | 37 | ```yaml 38 | plugins: 39 | datasette-extract: 40 | models: 41 | - openai/gpt-4.1-nano 42 | ``` 43 | If you only list a single model, users will not get an option to select the model when they use the extraction tool. 44 | 45 | ## Usage 46 | 47 | This plugin provides the following features: 48 | 49 | - In the database action cog menu, select "Create table with AI extracted data" to create a new table with data extracted from text or an image 50 | - In the table action cog menu, select "Extract data into this table with AI" to extract data into an existing table 51 | 52 | When creating a table you can specify the column names and types, and provide an optional hint (like "YYYY-MM-DD" for dates) to influence how the data should be extracted. 53 | 54 | When populating an existing table you can provide hints and select which columns should be populated. 55 | 56 | Text input can be pasted directly into the textarea. 57 | 58 | Drag and drop a PDF or text file onto the textarea to populate it with the contents of that file. PDF files will have their text extracted, but only if the file contains text as opposed to scanned images. 59 | 60 | Drag and drop a single image onto the textarea - or select it with the image file input box - to process an image. 61 | 62 | ## Permissions 63 | 64 | Users must have the `datasette-extract` permission to use this tool. 65 | 66 | In order to create tables they also need the `create-table` permission. 67 | 68 | To insert rows into an existing table they need `insert-row`. 69 | 70 | Run this to grant those permissions to the root user: 71 | ```bash 72 | datasette . --root \ 73 | -s permissions.insert-row.id root \ 74 | -s permissions.create-table.id root \ 75 | -s permissions.datasette-extract.id root 76 | ``` 77 | 78 | ## Development 79 | 80 | To set up this plugin locally, first check out the code. Then create a new virtual environment: 81 | ```bash 82 | cd datasette-extract 83 | python3 -m venv venv 84 | source venv/bin/activate 85 | ``` 86 | Now install the dependencies and test dependencies: 87 | ```bash 88 | pip install -e '.[test]' 89 | ``` 90 | To run the tests: 91 | ```bash 92 | pytest 93 | ``` 94 | One option to run this in development is to use this recipe: 95 | ```bash 96 | DATASETTE_SECRETS_OPENAI_API_KEY="$(llm keys get openai)" \ 97 | datasette .
--root --secret 1 \ 98 | -s permissions.insert-row.id root \ 99 | -s permissions.create-table.id root \ 100 | -s permissions.datasette-extract.id root \ 101 | -s plugins.datasette-extract.models '["openai/gpt-4.1-nano", "openai/gpt-4.1-mini"]' \ 102 | --internal internal.db --reload 103 | ``` -------------------------------------------------------------------------------- /datasette_extract/__init__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import llm 3 | from datasette import hookimpl, Response, NotFound, Permission, Forbidden 4 | from datasette_secrets import Secret, get_secret 5 | from datetime import datetime, timezone 6 | from sqlite_utils import Database 7 | from starlette.requests import Request as StarletteRequest 8 | import ijson 9 | import json 10 | import ulid 11 | import urllib 12 | 13 | 14 | @hookimpl 15 | def register_permissions(datasette): 16 | return [ 17 | Permission( 18 | name="datasette-extract", 19 | abbr=None, 20 | description="Use the extract tool to populate tables", 21 | takes_database=False, 22 | takes_resource=False, 23 | default=False, 24 | ) 25 | ] 26 | 27 | 28 | @hookimpl 29 | def register_secrets(): 30 | return [ 31 | Secret( 32 | name="OPENAI_API_KEY", 33 | obtain_label="Get an OpenAI API key", 34 | obtain_url="https://platform.openai.com/api-keys", 35 | ), 36 | ] 37 | 38 | 39 | @hookimpl 40 | def permission_allowed(action, actor): 41 | if action == "datasette-extract" and actor and actor.get("id") == "root": 42 | return True 43 | 44 | 45 | def get_config(datasette): 46 | return datasette.plugin_config("datasette-extract") or {} 47 | 48 | 49 | async def can_extract(datasette, actor, database_name, to_table=None): 50 | if actor is None: 51 | return False 52 | reply_from_that = await datasette.permission_allowed(actor, "datasette-extract") 53 | if not reply_from_that: 54 | return False 55 | if not to_table: 56 | # Need create-table for database 57 | can_create_table = await datasette.permission_allowed( 58 | actor, "create-table", resource=database_name 59 | ) 60 | if not can_create_table: 61 | return False 62 | return True 63 | else: 64 | # Need insert-row for that table 65 | return await datasette.permission_allowed( 66 | actor, "insert-row", resource=(database_name, to_table) 67 | ) 68 | 69 | 70 | def image_is_provided(image): 71 | # UploadFile(filename='', size=0, headers=Headers... 
72 | return bool(image.size) 73 | 74 | 75 | async def extract_create_table(datasette, request, scope, receive): 76 | database = request.url_vars["database"] 77 | try: 78 | datasette.get_database(database) 79 | except KeyError: 80 | raise NotFound("Database '{}' does not exist".format(database)) 81 | 82 | if not await can_extract(datasette, request.actor, database): 83 | raise Forbidden("Permission denied to extract data") 84 | 85 | if request.method == "POST": 86 | starlette_request = StarletteRequest(scope, receive) 87 | post_vars = await starlette_request.form() 88 | content = (post_vars.get("content") or "").strip() 89 | image = post_vars.get("image") or "" 90 | instructions = post_vars.get("instructions") or "" 91 | if not content and not image_is_provided(image) and not instructions: 92 | return Response.text("No content provided", status=400) 93 | table = post_vars.get("table") 94 | if not table: 95 | return Response.text("No table provided", status=400) 96 | 97 | model_id = post_vars["model"] 98 | 99 | properties = {} 100 | # Build the properties out of name_0 upwards, only if populated 101 | for key, value in post_vars.items(): 102 | if key.startswith("name_") and value.strip(): 103 | index = int(key.split("_")[1]) 104 | type_ = post_vars.get("type_{}".format(index)) 105 | hint = post_vars.get("hint_{}".format(index)) 106 | properties[value] = { 107 | "type": type_, 108 | } 109 | if hint: 110 | properties[value]["description"] = hint 111 | 112 | return await extract_to_table_post( 113 | datasette, 114 | request, 115 | model_id, 116 | instructions, 117 | content, 118 | image, 119 | database, 120 | table, 121 | properties, 122 | ) 123 | 124 | fields = [] 125 | if "_fields" in request.args: 126 | try: 127 | fields = [ 128 | field 129 | for field in json.loads(request.args["_fields"]) 130 | if isinstance(field, dict) and isinstance(field.get("index"), int) 131 | ] 132 | except (json.JSONDecodeError, TypeError): 133 | fields = [] 134 | if not fields: 135 | fields = [{"index": i} for i in range(10)] 136 | 137 | models = [ 138 | {"id": model.model_id, "name": str(model)} 139 | for model in llm.get_async_models() 140 | if model.supports_schema 141 | ] 142 | 143 | config = get_config(datasette) 144 | if config.get("models"): 145 | models = [model for model in models if model["id"] in config["models"]] 146 | 147 | return Response.html( 148 | await datasette.render_template( 149 | "extract_create_table.html", 150 | { 151 | "database": database, 152 | "fields": fields, 153 | "models": models, 154 | }, 155 | request=request, 156 | ) 157 | ) 158 | 159 | 160 | async def extract_to_table(datasette, request, scope, receive): 161 | database = request.url_vars["database"] 162 | table = request.url_vars["table"] 163 | # Do they exist? 
164 | try: 165 | db = datasette.get_database(database) 166 | except KeyError: 167 | raise NotFound("Database '{}' does not exist".format(database)) 168 | 169 | if not await can_extract(datasette, request.actor, database, table): 170 | raise Forbidden("Permission denied to extract data") 171 | 172 | tables = await db.table_names() 173 | if table not in tables: 174 | raise NotFound("Table '{}' does not exist".format(table)) 175 | 176 | schema = await db.execute_fn(lambda conn: Database(conn)[table].columns_dict) 177 | 178 | if request.method == "POST": 179 | starlette_request = StarletteRequest(scope, receive) 180 | post_vars = await starlette_request.form() 181 | 182 | # We only use columns that have their use_{colname} set 183 | use_columns = [ 184 | key[len("use_") :] 185 | for key, value in post_vars.items() 186 | if key.startswith("use_") and value 187 | ] 188 | 189 | # Grab all of the hints 190 | column_hints = { 191 | key[len("hint_") :]: value.strip() 192 | for key, value in post_vars.items() 193 | if key.startswith("hint_") and value.strip() 194 | } 195 | # Turn schema into a properties dict 196 | properties = {} 197 | for name, type_ in schema.items(): 198 | if name in use_columns: 199 | properties[name] = {"type": get_type(type_)} 200 | description = column_hints.get(name) or "" 201 | if description: 202 | properties[name]["description"] = description 203 | 204 | image = post_vars.get("image") or "" 205 | instructions = post_vars.get("instructions") or "" 206 | content = (post_vars.get("content") or "").strip() 207 | model_id = post_vars["model"] 208 | return await extract_to_table_post( 209 | datasette, 210 | request, 211 | model_id, 212 | instructions, 213 | content, 214 | image, 215 | database, 216 | table, 217 | properties, 218 | ) 219 | 220 | # GET request logic starts here 221 | # Restore properties from previous run, if possible 222 | previous_runs = [] 223 | if await db.table_exists("_datasette_extract"): 224 | previous_runs = [ 225 | dict(row) 226 | for row in ( 227 | await db.execute( 228 | """ 229 | select id, database_name, table_name, created, properties, instructions, completed, error, num_items 230 | from _datasette_extract 231 | where database_name = :database_name and table_name = :table_name 232 | order by id desc limit 20 233 | """, 234 | {"database_name": database, "table_name": table}, 235 | ) 236 | ).rows 237 | ] 238 | 239 | columns = [ 240 | {"name": name, "type": value, "hint": "", "checked": True} 241 | for name, value in schema.items() 242 | ] 243 | 244 | instructions = "" 245 | 246 | # If there are previous runs, use the properties from the last one to update columns 247 | if previous_runs: 248 | properties = json.loads(previous_runs[0]["properties"]) 249 | for column in columns: 250 | column_name = column["name"] 251 | column["checked"] = column_name in properties 252 | column["hint"] = (properties.get(column_name) or {}).get( 253 | "description" 254 | ) or "" 255 | instructions = previous_runs[0]["instructions"] or "" 256 | 257 | duplicate_url = ( 258 | datasette.urls.database(database) 259 | + "/-/extract?" 
260 | + urllib.parse.urlencode( 261 | { 262 | "_fields": json.dumps( 263 | [ 264 | { 265 | "index": i, 266 | "name": col["name"], 267 | "type": col["type"].__name__, 268 | "hint": col["hint"], 269 | } 270 | for i, col in enumerate(columns) 271 | ] 272 | ) 273 | } 274 | ) 275 | ) 276 | 277 | # Fetch models for the template (copied from extract_create_table) 278 | models = [ 279 | {"id": model.model_id, "name": str(model)} 280 | for model in llm.get_async_models() 281 | if model.supports_schema 282 | ] 283 | config = get_config(datasette) 284 | if config.get("models"): 285 | models = [model for model in models if model["id"] in config["models"]] 286 | 287 | return Response.html( 288 | await datasette.render_template( 289 | "extract_to_table.html", 290 | { 291 | "database": database, 292 | "table": table, 293 | "schema": schema, 294 | "columns": columns, 295 | "instructions": instructions, 296 | "duplicate_url": duplicate_url, 297 | "previous_runs": previous_runs, 298 | "models": models, 299 | }, 300 | request=request, 301 | ) 302 | ) 303 | 304 | 305 | async def extract_table_task( 306 | datasette, 307 | model_id, 308 | database, 309 | table, 310 | properties, 311 | instructions, 312 | content, 313 | image, 314 | task_id, 315 | ): 316 | # This task runs in the background and writes to the table as it extracts rows 317 | events = ijson.sendable_list() 318 | coro = ijson.items_coro(events, "items.item", use_float=True) 319 | seen_events = set() 320 | items = [] 321 | 322 | datasette._extract_tasks = getattr(datasette, "_extract_tasks", None) or {} 323 | task_info = { 324 | "items": items, 325 | "database": database, 326 | "model": model_id, 327 | "table": table, 328 | "instructions": instructions, 329 | "properties": properties, 330 | "error": None, 331 | "done": False, 332 | } 333 | datasette._extract_tasks[task_id] = task_info 334 | 335 | # We record tasks to the _datasette_extract table, mainly so we can reuse 336 | # property definitions later on 337 | def start_write(conn): 338 | with conn: 339 | db = Database(conn) 340 | db["_datasette_extract"].insert( 341 | { 342 | "id": task_id, 343 | "database_name": database, 344 | "table_name": table, 345 | "created": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"), 346 | "model": model_id, 347 | "instructions": instructions.strip() or None, 348 | "properties": json.dumps(properties), 349 | "completed": None, 350 | "error": None, 351 | "num_items": 0, 352 | }, 353 | pk="id", 354 | alter=True, 355 | column_order=( # Define order explicitly 356 | "id", 357 | "database_name", 358 | "table_name", 359 | "created", 360 | "model", 361 | "instructions", 362 | "properties", 363 | "completed", 364 | "error", 365 | "num_items", 366 | ), 367 | ) 368 | 369 | db = datasette.get_database(database) 370 | 371 | # Ensure table exists before writing 372 | await db.execute_write_fn( 373 | lambda conn: Database(conn)["_datasette_extract"].create( 374 | { 375 | "id": str, 376 | "database_name": str, 377 | "table_name": str, 378 | "created": str, 379 | "model": str, 380 | "instructions": str, 381 | "properties": str, 382 | "completed": str, 383 | "error": str, 384 | "num_items": int, 385 | }, 386 | pk="id", 387 | if_not_exists=True, 388 | ) 389 | ) 390 | 391 | await db.execute_write_fn(start_write) 392 | 393 | def make_row_writer(row): 394 | def _write(conn): 395 | with conn: 396 | db = Database(conn) 397 | db[table].insert(row) 398 | 399 | return _write 400 | 401 | error = None 402 | 403 | try: 404 | model = llm.get_async_model(model_id) 405 | kwargs = {} 406 | 
if instructions: 407 | kwargs["system"] = instructions 408 | if image_is_provided(image): 409 | image_bytes = await image.read() 410 | kwargs["attachments"] = [llm.Attachment(content=image_bytes)] 411 | if content: 412 | kwargs["prompt"] = content 413 | 414 | kwargs["schema"] = { 415 | "type": "object", 416 | "description": "Extract data", 417 | "properties": { 418 | "items": { 419 | "type": "array", 420 | "items": { 421 | "type": "object", 422 | "properties": properties, 423 | "required": list(properties.keys()), 424 | }, 425 | } 426 | }, 427 | "required": ["items"], 428 | } 429 | 430 | async for chunk in model.prompt(**kwargs): 431 | if chunk: 432 | coro.send(chunk.encode("utf-8")) 433 | if events: 434 | # Any we have not seen yet? 435 | unseen_events = [ 436 | e for e in events if json.dumps(e) not in seen_events 437 | ] 438 | if unseen_events: 439 | for event in unseen_events: 440 | event = remove_null_bytes(event) 441 | seen_events.add(json.dumps(event)) 442 | items.append(event) 443 | await db.execute_write_fn(make_row_writer(event)) 444 | 445 | except Exception as ex: 446 | task_info["error"] = str(ex) 447 | error = str(ex) 448 | finally: 449 | task_info["done"] = True 450 | 451 | def end_write(conn): 452 | with conn: 453 | db = Database(conn) 454 | db["_datasette_extract"].update( 455 | task_id, 456 | { 457 | "completed": datetime.now(timezone.utc).strftime( 458 | "%Y-%m-%d %H:%M:%S" 459 | ), 460 | "num_items": len(items), 461 | "error": error, 462 | }, 463 | ) 464 | 465 | await db.execute_write_fn(end_write) 466 | 467 | 468 | async def extract_to_table_post( 469 | datasette, 470 | request, 471 | model_id, 472 | instructions, 473 | content, 474 | image, 475 | database, 476 | table, 477 | properties, 478 | ): 479 | # Here we go! 480 | if not content and not image_is_provided(image) and not instructions: 481 | return Response.text("No content provided", status=400) 482 | 483 | task_id = str(ulid.ULID()) 484 | 485 | asyncio.create_task( 486 | extract_table_task( 487 | datasette, 488 | model_id, 489 | database, 490 | table, 491 | properties, 492 | instructions, 493 | content, 494 | image, 495 | task_id, 496 | ) 497 | ) 498 | return Response.redirect( 499 | datasette.urls.path("/-/extract/progress/{}".format(task_id)) 500 | ) 501 | 502 | 503 | def get_task_info(datasette, task_id): 504 | extract_tasks = getattr(datasette, "_extract_tasks", None) or {} 505 | return extract_tasks.get(task_id) 506 | 507 | 508 | async def extract_progress(datasette, request): 509 | task_info = get_task_info(datasette, request.url_vars["task_id"]) 510 | if not task_info: 511 | return Response.text("Task not found", status=404) 512 | return Response.html( 513 | await datasette.render_template( 514 | "extract_progress.html", 515 | { 516 | "task": task_info, 517 | "table_url": datasette.urls.table( 518 | task_info["database"], task_info["table"] 519 | ), 520 | }, 521 | request=request, 522 | ) 523 | ) 524 | 525 | 526 | async def extract_progress_json(datasette, request): 527 | task_info = get_task_info(datasette, request.url_vars["task_id"]) 528 | if not task_info: 529 | return Response.json({"ok": False, "error": "Task not found"}, status=404) 530 | return Response.json(task_info) 531 | 532 | 533 | @hookimpl 534 | def register_routes(): 535 | return [ 536 | (r"^/(?P[^/]+)/-/extract$", extract_create_table), 537 | (r"^/(?P[^/]+)/(?P[^/]+)/-/extract$", extract_to_table), 538 | (r"^/-/extract/progress/(?P\w+)$", extract_progress), 539 | (r"^/-/extract/progress/(?P\w+)\.json$", extract_progress_json), 540 | ] 541 
| 542 | 543 | def get_type(type_): 544 | if type_ is int: 545 | return "integer" 546 | elif type_ is float: 547 | return "number" 548 | else: 549 | return "string" 550 | 551 | 552 | @hookimpl 553 | def database_actions(datasette, actor, database): 554 | async def inner(): 555 | if not await get_secret(datasette, "OPENAI_API_KEY"): 556 | return 557 | if not await can_extract(datasette, actor, database): 558 | return 559 | return [ 560 | { 561 | "href": datasette.urls.database(database) + "/-/extract", 562 | "label": "Create table with AI extracted data", 563 | "description": "Paste in text or an image to extract structured data", 564 | } 565 | ] 566 | 567 | return inner 568 | 569 | 570 | @hookimpl 571 | def table_actions(datasette, actor, database, table): 572 | async def inner(): 573 | if not await get_secret(datasette, "OPENAI_API_KEY"): 574 | return 575 | if not await can_extract(datasette, actor, database, table): 576 | return 577 | return [ 578 | { 579 | "href": datasette.urls.table(database, table) + "/-/extract", 580 | "label": "Extract data into this table with AI", 581 | "description": "Paste in text or an image to extract structured data", 582 | } 583 | ] 584 | 585 | return inner 586 | 587 | 588 | def remove_null_bytes(data: dict) -> dict: 589 | """ 590 | Recursively removes null bytes (u0000) from string values in a dictionary with JSON semantics. 591 | """ 592 | if isinstance(data, dict): 593 | return {key: remove_null_bytes(value) for key, value in data.items()} 594 | elif isinstance(data, list): 595 | return [remove_null_bytes(item) for item in data] 596 | elif isinstance(data, str): 597 | return data.replace("\u0000", "") 598 | else: 599 | return data 600 | -------------------------------------------------------------------------------- /datasette_extract/static/extract.css: -------------------------------------------------------------------------------- 1 | form.extract-form label { 2 | width: auto; 3 | padding-top: 0.3em; 4 | } 5 | textarea.drag-over { 6 | background-color: pink; 7 | } -------------------------------------------------------------------------------- /datasette_extract/templates/_extract_base_styles.html: -------------------------------------------------------------------------------- 1 | 118 | -------------------------------------------------------------------------------- /datasette_extract/templates/_extract_drop_handler.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datasette_extract/templates/extract.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block title %}Extract{% endblock %} 4 | 5 | {% block extra_head %} 6 | 7 | {% endblock %} 8 | 9 | {% block content %} 10 |

Extract

11 | {% endblock %} 12 | -------------------------------------------------------------------------------- /datasette_extract/templates/extract_create_table.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block title %}Extract data and create a new table{% endblock %} 4 | 5 | {% block extra_head %} 6 | 7 | {% include "_extract_base_styles.html" %} 8 | 90 | {% endblock %} 91 | 92 | {% block content %} 93 |
94 |
95 | {# Moved h1 inside form to be styled by .extract-form h1 #} 96 |

Extract data to create a new table in {{ database }}

97 |
98 | 99 | 100 | 101 |
102 | {% if models|length > 1 %} 103 |
104 | 105 | 110 |
111 | {% else %} 112 | 113 | {% endif %} 114 | 115 |
116 |

Define Columns

117 |
118 | {% for field in fields %} 119 |
120 |
121 | 122 | 123 |
124 |
125 | 126 | 131 |
132 |
133 | 134 | 135 |
136 |
137 | {% endfor %} 138 |
139 | 142 |
143 | 144 |

Enter Your Data

145 |
146 | 147 | {# Height set via CSS rule #} 148 |
149 | 150 |
{# style="display: none;" handled by CSS #} 151 | Processing... This may take a moment. 152 |
153 | 154 |
{# Using standard file-upload class #} 155 | 156 | 157 |
158 | 159 |
{# Added form-group wrapper for consistency #} 160 | 161 | 162 |
163 | 164 |
165 | 166 |
167 | 168 |
169 | 170 | {% include "_extract_drop_handler.html" %} 171 | 172 | 203 | {% endblock %} 204 | -------------------------------------------------------------------------------- /datasette_extract/templates/extract_progress.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block title %}Extract progress{% endblock %} 4 | 5 | {% block extra_head %} 6 | 7 | {% endblock %} 8 | 9 | {% block content %} 10 |

Extract progress

11 | 12 |

Extracting to table {{ task.database }}/{{ task.table }}:

13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 |

 75 | 
 76 | 
106 | 
107 | {% endblock %}
108 | 
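The progress template above is backed by the JSON endpoint registered as `extract_progress_json` in `datasette_extract/__init__.py`, which returns the in-memory task record (including its `done`, `error`, `items`, `database` and `table` fields). Below is a minimal sketch of polling that endpoint from a script, assuming a Datasette instance running locally on port 8001; the base URL and `TASK_ID` value are illustrative placeholders, not values taken from this repository.

```python
# Polling sketch for the datasette-extract progress endpoint.
# Assumptions: Datasette is running locally on port 8001 and TASK_ID is a
# placeholder for a real task ULID taken from a /-/extract/progress/... URL.
import json
import time
import urllib.request

BASE_URL = "http://127.0.0.1:8001"
TASK_ID = "01JEXAMPLETASKIDPLACEHOLDER"


def poll_extract_progress(base_url, task_id, interval=1.0):
    """Fetch /-/extract/progress/<task_id>.json until the task reports done."""
    url = "{}/-/extract/progress/{}.json".format(base_url, task_id)
    while True:
        with urllib.request.urlopen(url) as response:
            info = json.loads(response.read().decode("utf-8"))
        if info.get("done"):
            return info
        time.sleep(interval)


if __name__ == "__main__":
    result = poll_extract_progress(BASE_URL, TASK_ID)
    if result.get("error"):
        print("Extraction failed:", result["error"])
    else:
        print(
            "Extracted {} row(s) into {}/{}".format(
                len(result.get("items", [])), result["database"], result["table"]
            )
        )
```

Because the task record lives only in the `datasette._extract_tasks` dictionary, this kind of polling only works against the same running Datasette process that started the extraction.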


--------------------------------------------------------------------------------
/datasette_extract/templates/extract_to_table.html:
--------------------------------------------------------------------------------
  1 | {% extends "base.html" %}
  2 | 
  3 | {% block title %}Extract{% endblock %}
  4 | 
  5 | {% block extra_head %}
  6 | 
  7 | {% include "_extract_base_styles.html" %}
  8 | 
 39 | {% endblock %}
 40 | 
 41 | {% block content %}
 42 | 
43 |
44 |

Extract data into {{ database }} / {{ table }}

45 | 46 | 47 |

Select columns to populate with extracted data:

48 |
49 | {% for column in columns %} 50 | 51 | 54 | 57 | 60 | {# Closing TR tag was missing, added it #} 61 | {% endfor %} 62 |
52 | 53 | 55 | 56 | 58 | 59 |
63 | 64 | {% if models|length > 1 %} 65 |
66 | 67 | 72 |
73 | {% else %} 74 | {# Ensure models list is not empty before accessing index 0 #} 75 | {% if models %} 76 | 77 | {% else %} 78 |

Error: No suitable AI models found or configured.

79 | {% endif %} 80 | {% endif %} 81 | 82 |
83 | 84 | {# Height set via CSS rule #} 85 |
86 | 87 |
{# Use standard div, display:none handled by CSS #} 88 | Processing... This may take a moment. 89 |
90 | 91 |
{# Use standard file-upload structure #} 92 | 93 | 94 |
95 | 96 | 97 |
98 | 99 | {# Height set via CSS rule #} 100 |
101 | 102 | {# Only show submit if models are available #} 103 | {% if models %} 104 |
105 | 106 |
107 | {% endif %} 108 | 109 | {# End extract-container #} 110 | 111 |

Duplicate these columns to a new table

{# Keep outside the card #} 112 | 113 | {% include "_extract_drop_handler.html" %} 114 | 115 | {% if previous_runs %} 116 |

Previous extraction tasks

117 |
{# Changed overflow to overflow-x #} 118 | 119 | 120 | 121 | 122 | 123 | 124 | {# Added Model column #} 125 | 126 | 127 | 128 | 129 | 130 | 131 | {# Added tbody #} 132 | {% for run in previous_runs %} 133 | 134 | 135 | 136 | 137 | {# Display model used #} 138 | {# Truncate long properties #} 139 | {# Truncate long instructions #} 140 | 141 | 142 | 143 | {% endfor %} 144 | 145 |
ID Created Completed Model Properties Instructions Error Items
{{ run.id }}{{ run.created }}{{ run.completed or "" }}{{ run.model or "" }}{{ run.properties }}{{ run.instructions or "" }}{{ run.error or "" }}{{ run.num_items }}
146 |
147 | {% endif %} 148 | 149 | {% endblock %} 150 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "datasette-extract" 3 | version = "0.1a10" 4 | description = "Import unstructured data (text and images) into structured tables" 5 | readme = "README.md" 6 | authors = [{name = "Simon Willison"}] 7 | license = {text = "Apache-2.0"} 8 | classifiers=[ 9 | "Framework :: Datasette", 10 | "License :: OSI Approved :: Apache Software License" 11 | ] 12 | requires-python = ">=3.9" 13 | dependencies = [ 14 | "datasette>=1.0a12", 15 | "datasette-secrets>=0.1a2", 16 | "llm>=0.24", 17 | "llm-openai-plugin", 18 | "sqlite-utils", 19 | "openai>=1.0", 20 | "ijson", 21 | "python-ulid", 22 | "starlette", 23 | ] 24 | 25 | [project.urls] 26 | Homepage = "https://github.com/datasette/datasette-extract" 27 | Changelog = "https://github.com/datasette/datasette-extract/releases" 28 | Issues = "https://github.com/datasette/datasette-extract/issues" 29 | CI = "https://github.com/datasette/datasette-extract/actions" 30 | 31 | [project.entry-points.datasette] 32 | extract = "datasette_extract" 33 | 34 | [project.optional-dependencies] 35 | test = ["pytest", "pytest-asyncio", "pytest-recording"] 36 | 37 | [tool.setuptools.packages.find] 38 | where = ["."] 39 | 40 | [tool.pytest.ini_options] 41 | asyncio_mode = "strict" 42 | asyncio_default_fixture_loop_scope = "function" 43 | 44 | [tool.setuptools.package-data] 45 | datasette_extract = ["static/*", "templates/*"] 46 | -------------------------------------------------------------------------------- /tests/cassettes/test_web/test_extract_flow.yaml: -------------------------------------------------------------------------------- 1 | interactions: 2 | - request: 3 | body: '{"messages": [{"role": "user", "content": "Sergei is 4, Cynthia is 7"}], 4 | "model": "gpt-4o", "max_tokens": 4096, "stream": true, "tool_choice": {"type": 5 | "function", "function": {"name": "extract_data"}}, "tools": [{"type": "function", 6 | "function": {"name": "extract_data", "description": "Extract data matching this 7 | schema", "parameters": {"type": "object", "properties": {"items": {"type": "array", 8 | "items": {"type": "object", "properties": {"name": {"type": "string"}, "age": 9 | {"type": "integer"}}, "required": ["name", "age"]}}}, "required": ["items"]}}}]}' 10 | headers: 11 | accept: 12 | - application/json 13 | accept-encoding: 14 | - gzip, deflate 15 | connection: 16 | - keep-alive 17 | content-length: 18 | - '569' 19 | content-type: 20 | - application/json 21 | host: 22 | - api.openai.com 23 | user-agent: 24 | - AsyncOpenAI/Python 1.12.0 25 | x-stainless-arch: 26 | - arm64 27 | x-stainless-async: 28 | - async:asyncio 29 | x-stainless-lang: 30 | - python 31 | x-stainless-os: 32 | - MacOS 33 | x-stainless-package-version: 34 | - 1.12.0 35 | x-stainless-runtime: 36 | - CPython 37 | x-stainless-runtime-version: 38 | - 3.8.17 39 | method: POST 40 | uri: https://api.openai.com/v1/chat/completions 41 | response: 42 | body: 43 | string: 'data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"role":"assistant","content":null,"tool_calls":[{"index":0,"id":"call_kA2wo7WDHmwUJusBFs4ygmES","type":"function","function":{"name":"extract_data","arguments":""}}]},"logprobs":null,"finish_reason":null}]} 
44 | 45 | 46 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"{\""}}]},"logprobs":null,"finish_reason":null}]} 47 | 48 | 49 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"items"}}]},"logprobs":null,"finish_reason":null}]} 50 | 51 | 52 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"\":[{\""}}]},"logprobs":null,"finish_reason":null}]} 53 | 54 | 55 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"name"}}]},"logprobs":null,"finish_reason":null}]} 56 | 57 | 58 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"\":\""}}]},"logprobs":null,"finish_reason":null}]} 59 | 60 | 61 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"S"}}]},"logprobs":null,"finish_reason":null}]} 62 | 63 | 64 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"erge"}}]},"logprobs":null,"finish_reason":null}]} 65 | 66 | 67 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"i"}}]},"logprobs":null,"finish_reason":null}]} 68 | 69 | 70 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"\",\""}}]},"logprobs":null,"finish_reason":null}]} 71 | 72 | 73 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"age"}}]},"logprobs":null,"finish_reason":null}]} 74 | 75 | 76 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"\":"}}]},"logprobs":null,"finish_reason":null}]} 77 | 78 | 79 | data: 
{"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"4"}}]},"logprobs":null,"finish_reason":null}]} 80 | 81 | 82 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"},{\""}}]},"logprobs":null,"finish_reason":null}]} 83 | 84 | 85 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"name"}}]},"logprobs":null,"finish_reason":null}]} 86 | 87 | 88 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"\":\""}}]},"logprobs":null,"finish_reason":null}]} 89 | 90 | 91 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"C"}}]},"logprobs":null,"finish_reason":null}]} 92 | 93 | 94 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"ynthia"}}]},"logprobs":null,"finish_reason":null}]} 95 | 96 | 97 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"\",\""}}]},"logprobs":null,"finish_reason":null}]} 98 | 99 | 100 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"age"}}]},"logprobs":null,"finish_reason":null}]} 101 | 102 | 103 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"\":"}}]},"logprobs":null,"finish_reason":null}]} 104 | 105 | 106 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"7"}}]},"logprobs":null,"finish_reason":null}]} 107 | 108 | 109 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"}"}}]},"logprobs":null,"finish_reason":null}]} 110 | 111 | 112 | data: 
{"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"function":{"arguments":"]}"}}]},"logprobs":null,"finish_reason":null}]} 113 | 114 | 115 | data: {"id":"chatcmpl-90FBRO8oMAOcaiHBSwaLA5FwysMd8","object":"chat.completion.chunk","created":1709845393,"model":"gpt-4-0125-preview","system_fingerprint":"fp_00ceb2df5b","choices":[{"index":0,"delta":{},"logprobs":null,"finish_reason":"stop"}]} 116 | 117 | 118 | data: [DONE] 119 | 120 | 121 | ' 122 | headers: 123 | CF-Cache-Status: 124 | - DYNAMIC 125 | CF-RAY: 126 | - 860d87ee78fe05b0-IAD 127 | Cache-Control: 128 | - no-cache, must-revalidate 129 | Connection: 130 | - keep-alive 131 | Content-Type: 132 | - text/event-stream 133 | Date: 134 | - Thu, 07 Mar 2024 21:03:14 GMT 135 | Server: 136 | - cloudflare 137 | Set-Cookie: 138 | - __cf_bm=J9MngQT.keaLbZCJXAiEkPvdBVr9C3SKbpnpL1jYuek-1709845394-1.0.1.1-9hKD3I36Z49B36OQzt7XhnyECVieGuDaBVW_mSTIhiMazMbj3CJxO_rRtNpJbCfyBc7C_Je.uKjNPRjOjiKo1A; 139 | path=/; expires=Thu, 07-Mar-24 21:33:14 GMT; domain=.api.openai.com; HttpOnly; 140 | Secure; SameSite=None 141 | - _cfuvid=eX.4qosxEGBGBdTBOKtw3AypbsoulKJDrzp8Fuf6nlk-1709845394774-0.0.1.1-604800000; 142 | path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None 143 | Transfer-Encoding: 144 | - chunked 145 | access-control-allow-origin: 146 | - '*' 147 | alt-svc: 148 | - h3=":443"; ma=86400 149 | openai-model: 150 | - gpt-4-0125-preview 151 | openai-organization: 152 | - user-r3e61fpak04cbaokp5buoae4 153 | openai-processing-ms: 154 | - '833' 155 | openai-version: 156 | - '2020-10-01' 157 | strict-transport-security: 158 | - max-age=15724800; includeSubDomains 159 | x-ratelimit-limit-requests: 160 | - '5000' 161 | x-ratelimit-limit-tokens: 162 | - '600000' 163 | x-ratelimit-remaining-requests: 164 | - '4999' 165 | x-ratelimit-remaining-tokens: 166 | - '595896' 167 | x-ratelimit-reset-requests: 168 | - 12ms 169 | x-ratelimit-reset-tokens: 170 | - 410ms 171 | x-request-id: 172 | - req_6e17deaf81e80f3ae585170854be7863 173 | status: 174 | code: 200 175 | message: OK 176 | - request: 177 | body: '{"input":[{"role":"system","content":"Be nice"},{"role":"user","content":"Sergei 178 | is 4, Cynthia is 7"}],"model":"gpt-4.1-mini","stream":true,"text":{"format":{"type":"json_schema","name":"output","schema":{"type":"object","description":"Extract 179 | data","properties":{"items":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string"},"age":{"type":"integer"}},"required":["name","age"],"additionalProperties":false}}},"required":["items"],"additionalProperties":false}}}}' 180 | headers: 181 | accept: 182 | - application/json 183 | accept-encoding: 184 | - gzip, deflate 185 | connection: 186 | - keep-alive 187 | content-length: 188 | - '489' 189 | content-type: 190 | - application/json 191 | host: 192 | - api.openai.com 193 | user-agent: 194 | - AsyncOpenAI/Python 1.74.0 195 | x-stainless-arch: 196 | - arm64 197 | x-stainless-async: 198 | - async:asyncio 199 | x-stainless-lang: 200 | - python 201 | x-stainless-os: 202 | - MacOS 203 | x-stainless-package-version: 204 | - 1.74.0 205 | x-stainless-read-timeout: 206 | - '600' 207 | x-stainless-retry-count: 208 | - '0' 209 | x-stainless-runtime: 210 | - CPython 211 | x-stainless-runtime-version: 212 | - 3.10.10 213 | method: POST 214 | uri: https://api.openai.com/v1/responses 215 | response: 216 | body: 217 | string: 'event: 
response.created 218 | 219 | data: {"type":"response.created","response":{"id":"resp_67ff31a30314819199fd35c1f89e2fca062d29d0a5a0a6e9","object":"response","created_at":1744777635,"status":"in_progress","error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"model":"gpt-4.1-mini-2025-04-14","output":[],"parallel_tool_calls":true,"previous_response_id":null,"reasoning":{"effort":null,"summary":null},"store":true,"temperature":1.0,"text":{"format":{"type":"json_schema","description":null,"name":"output","schema":{"type":"object","description":"Extract 220 | data","properties":{"items":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string"},"age":{"type":"integer"}},"required":["name","age"],"additionalProperties":false}}},"required":["items"],"additionalProperties":false},"strict":true}},"tool_choice":"auto","tools":[],"top_p":1.0,"truncation":"disabled","usage":null,"user":null,"metadata":{}}} 221 | 222 | 223 | event: response.in_progress 224 | 225 | data: {"type":"response.in_progress","response":{"id":"resp_67ff31a30314819199fd35c1f89e2fca062d29d0a5a0a6e9","object":"response","created_at":1744777635,"status":"in_progress","error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"model":"gpt-4.1-mini-2025-04-14","output":[],"parallel_tool_calls":true,"previous_response_id":null,"reasoning":{"effort":null,"summary":null},"store":true,"temperature":1.0,"text":{"format":{"type":"json_schema","description":null,"name":"output","schema":{"type":"object","description":"Extract 226 | data","properties":{"items":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string"},"age":{"type":"integer"}},"required":["name","age"],"additionalProperties":false}}},"required":["items"],"additionalProperties":false},"strict":true}},"tool_choice":"auto","tools":[],"top_p":1.0,"truncation":"disabled","usage":null,"user":null,"metadata":{}}} 227 | 228 | 229 | event: response.output_item.added 230 | 231 | data: {"type":"response.output_item.added","output_index":0,"item":{"id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","type":"message","status":"in_progress","content":[],"role":"assistant"}} 232 | 233 | 234 | event: response.content_part.added 235 | 236 | data: {"type":"response.content_part.added","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"part":{"type":"output_text","annotations":[],"text":""}} 237 | 238 | 239 | event: response.output_text.delta 240 | 241 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"{\""} 242 | 243 | 244 | event: response.output_text.delta 245 | 246 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"items"} 247 | 248 | 249 | event: response.output_text.delta 250 | 251 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"\":["} 252 | 253 | 254 | event: response.output_text.delta 255 | 256 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"{\""} 257 | 258 | 259 | event: response.output_text.delta 260 | 261 | data: 
{"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"name"} 262 | 263 | 264 | event: response.output_text.delta 265 | 266 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"\":\""} 267 | 268 | 269 | event: response.output_text.delta 270 | 271 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"Ser"} 272 | 273 | 274 | event: response.output_text.delta 275 | 276 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"ge"} 277 | 278 | 279 | event: response.output_text.delta 280 | 281 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"i"} 282 | 283 | 284 | event: response.output_text.delta 285 | 286 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"\",\""} 287 | 288 | 289 | event: response.output_text.delta 290 | 291 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"age"} 292 | 293 | 294 | event: response.output_text.delta 295 | 296 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"\":"} 297 | 298 | 299 | event: response.output_text.delta 300 | 301 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"4"} 302 | 303 | 304 | event: response.output_text.delta 305 | 306 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"},{\""} 307 | 308 | 309 | event: response.output_text.delta 310 | 311 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"name"} 312 | 313 | 314 | event: response.output_text.delta 315 | 316 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"\":\""} 317 | 318 | 319 | event: response.output_text.delta 320 | 321 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"C"} 322 | 323 | 324 | event: response.output_text.delta 325 | 326 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"ynthia"} 327 | 328 | 329 | event: response.output_text.delta 330 | 331 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"\",\""} 332 | 333 | 334 | event: response.output_text.delta 335 | 336 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"age"} 337 | 338 | 339 | event: response.output_text.delta 340 | 341 | 
data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"\":"} 342 | 343 | 344 | event: response.output_text.delta 345 | 346 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"7"} 347 | 348 | 349 | event: response.output_text.delta 350 | 351 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"}"} 352 | 353 | 354 | event: response.output_text.delta 355 | 356 | data: {"type":"response.output_text.delta","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"delta":"]}"} 357 | 358 | 359 | event: response.output_text.done 360 | 361 | data: {"type":"response.output_text.done","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"text":"{\"items\":[{\"name\":\"Sergei\",\"age\":4},{\"name\":\"Cynthia\",\"age\":7}]}"} 362 | 363 | 364 | event: response.content_part.done 365 | 366 | data: {"type":"response.content_part.done","item_id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","output_index":0,"content_index":0,"part":{"type":"output_text","annotations":[],"text":"{\"items\":[{\"name\":\"Sergei\",\"age\":4},{\"name\":\"Cynthia\",\"age\":7}]}"}} 367 | 368 | 369 | event: response.output_item.done 370 | 371 | data: {"type":"response.output_item.done","output_index":0,"item":{"id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","type":"message","status":"completed","content":[{"type":"output_text","annotations":[],"text":"{\"items\":[{\"name\":\"Sergei\",\"age\":4},{\"name\":\"Cynthia\",\"age\":7}]}"}],"role":"assistant"}} 372 | 373 | 374 | event: response.completed 375 | 376 | data: {"type":"response.completed","response":{"id":"resp_67ff31a30314819199fd35c1f89e2fca062d29d0a5a0a6e9","object":"response","created_at":1744777635,"status":"completed","error":null,"incomplete_details":null,"instructions":null,"max_output_tokens":null,"model":"gpt-4.1-mini-2025-04-14","output":[{"id":"msg_67ff31a53bb88191b23f3098ffa5242f062d29d0a5a0a6e9","type":"message","status":"completed","content":[{"type":"output_text","annotations":[],"text":"{\"items\":[{\"name\":\"Sergei\",\"age\":4},{\"name\":\"Cynthia\",\"age\":7}]}"}],"role":"assistant"}],"parallel_tool_calls":true,"previous_response_id":null,"reasoning":{"effort":null,"summary":null},"store":true,"temperature":1.0,"text":{"format":{"type":"json_schema","description":null,"name":"output","schema":{"type":"object","description":"Extract 377 | data","properties":{"items":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string"},"age":{"type":"integer"}},"required":["name","age"],"additionalProperties":false}}},"required":["items"],"additionalProperties":false},"strict":true}},"tool_choice":"auto","tools":[],"top_p":1.0,"truncation":"disabled","usage":{"input_tokens":73,"input_tokens_details":{"cached_tokens":0},"output_tokens":25,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":98},"user":null,"metadata":{}}} 378 | 379 | 380 | ' 381 | headers: 382 | CF-RAY: 383 | - 9310edd79bc9db7e-LAX 384 | Connection: 385 | - keep-alive 386 | Content-Type: 387 | - text/event-stream; charset=utf-8 388 | Date: 389 | - Wed, 16 Apr 2025 04:27:15 GMT 390 | Server: 391 | - cloudflare 392 | Set-Cookie: 393 | - 
__cf_bm=QqD39ZsS976cvij2g1toXhPldYk2peAIADruVybqrQM-1744777635-1.0.1.1-YWDq3yAcD4X5IKjmBesi_zmFNZaPsKCodDhQMgUjxuAxCHEgf2ZBwv2CWF.aVYRy3x8k9nNunn15cxLgclZLedEH27z16W_TTc8kMC9wgtk; 394 | path=/; expires=Wed, 16-Apr-25 04:57:15 GMT; domain=.api.openai.com; HttpOnly; 395 | Secure; SameSite=None 396 | - _cfuvid=tO3rNWwFHMJiF5YHSIZshHhTXq9sIQEuv21iSTfLqWA-1744777635115-0.0.1.1-604800000; 397 | path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None 398 | Transfer-Encoding: 399 | - chunked 400 | X-Content-Type-Options: 401 | - nosniff 402 | alt-svc: 403 | - h3=":443"; ma=86400 404 | cf-cache-status: 405 | - DYNAMIC 406 | openai-organization: 407 | - user-r3e61fpak04cbaokp5buoae4 408 | openai-processing-ms: 409 | - '78' 410 | openai-version: 411 | - '2020-10-01' 412 | strict-transport-security: 413 | - max-age=31536000; includeSubDomains; preload 414 | x-request-id: 415 | - req_afabaf55be7fdbf5d4255ad0989c36e8 416 | status: 417 | code: 200 418 | message: OK 419 | version: 1 420 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture(autouse=True) 5 | def mock_api_key(monkeypatch): 6 | monkeypatch.setenv("DATASETTE_SECRETS_OPENAI_API_KEY", "mock-api-key") 7 | monkeypatch.setenv("OPENAI_API_KEY", "mock-api-key") 8 | 9 | 10 | @pytest.fixture(scope="module") 11 | def vcr_config(): 12 | return {"filter_headers": ["authorization"]} 13 | -------------------------------------------------------------------------------- /tests/test_web.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from datasette.app import Datasette 3 | from datasette_extract import remove_null_bytes 4 | import json 5 | import pytest 6 | import urllib 7 | 8 | 9 | @pytest.mark.vcr(ignore_localhost=True) 10 | @pytest.mark.asyncio 11 | async def test_extract_flow(): 12 | ds = Datasette() 13 | ds.add_memory_database("data") 14 | cookies = {"ds_actor": ds.client.actor_cookie({"id": "root"})} 15 | response = await ds.client.get("/data/-/extract", cookies=cookies) 16 | assert response.status_code == 200 17 | assert "
<h1>Extract data to create a new table in data</h1>
" in response.text 18 | csrftoken = response.cookies["ds_csrftoken"] 19 | cookies["ds_csrftoken"] = csrftoken 20 | # Now submit a POST, then wait 30s 21 | post_response = await ds.client.post( 22 | "/data/-/extract", 23 | data={ 24 | "table": "ages", 25 | "content": "Sergei is 4, Cynthia is 7", 26 | "csrftoken": csrftoken, 27 | "name_0": "name", 28 | "type_0": "string", 29 | "name_1": "age", 30 | "type_1": "integer", 31 | "instructions": "Be nice", 32 | "model": "openai/gpt-4.1-mini", 33 | }, 34 | files={ 35 | # Send an empty image too 36 | "image": b"" 37 | }, 38 | cookies=cookies, 39 | ) 40 | assert post_response.status_code == 302 41 | redirect_url = post_response.headers["location"] 42 | assert redirect_url.startswith("/-/extract/progress/") 43 | task_id = redirect_url.split("/")[-1] 44 | poll_url = redirect_url + ".json" 45 | # Wait a moment for ds._extract_tasks to be populated 46 | await asyncio.sleep(0.5) 47 | assert task_id in ds._extract_tasks 48 | # Now we poll for completion 49 | data = None 50 | while True: 51 | poll_response = await ds.client.get(poll_url) 52 | data = poll_response.json() 53 | if data["done"]: 54 | break 55 | await asyncio.sleep(1) 56 | 57 | assert data == { 58 | "items": [{"name": "Sergei", "age": 4}, {"name": "Cynthia", "age": 7}], 59 | "database": "data", 60 | "model": "openai/gpt-4.1-mini", 61 | "table": "ages", 62 | "instructions": "Be nice", 63 | "properties": {"name": {"type": "string"}, "age": {"type": "integer"}}, 64 | "error": None, 65 | "done": True, 66 | } 67 | 68 | 69 | @pytest.mark.asyncio 70 | @pytest.mark.parametrize( 71 | "actor,path,should_allow", 72 | ( 73 | ("root", "/test/-/extract", True), 74 | ("root", "/test/foo/-/extract", True), 75 | ("allowed_all", "/test/-/extract", True), 76 | ("allowed_all", "/test/foo/-/extract", True), 77 | ("no_extract", "/test/-/extract", False), 78 | ("no_extract", "/test/foo/-/extract", False), 79 | ("no_insert", "/test/-/extract", True), 80 | ("no_insert", "/test/foo/-/extract", False), 81 | ("no_create", "/test/-/extract", False), 82 | ("no_create", "/test/foo/-/extract", True), 83 | ), 84 | ) 85 | async def test_permissions(actor, path, should_allow): 86 | ds = Datasette( 87 | config={ 88 | "permissions": { 89 | "insert-row": {"id": ["allowed_all", "no_create"]}, 90 | "create-table": {"id": ["allowed_all", "no_extract", "no_insert"]}, 91 | "datasette-extract": { 92 | "id": ["allowed_all", "no_insert", "no_create", "root"] 93 | }, 94 | } 95 | } 96 | ) 97 | db = ds.add_memory_database("test") 98 | await db.execute_write("create table if not exists foo (id integer primary key)") 99 | cookies = {"ds_actor": ds.client.actor_cookie({"id": actor})} 100 | response = await ds.client.get(path, cookies=cookies) 101 | if should_allow: 102 | assert response.status_code == 200 103 | else: 104 | assert response.status_code == 403 105 | 106 | # Also check if the action items were visible 107 | if path == "/test/-/extract": 108 | fetch_path = "/test" 109 | else: 110 | fetch_path = "/test/foo" 111 | html = (await ds.client.get(fetch_path, cookies=cookies)).text 112 | fragment = f'