├── pytest.ini
├── .gitignore
├── datasette_upload_csvs
│   ├── templates
│   │   ├── upload_csv_done.html
│   │   └── upload_csv.html
│   └── __init__.py
├── .github
│   └── workflows
│       ├── test.yml
│       └── publish.yml
├── README.md
├── setup.py
├── tests
│   └── test_datasette_upload_csvs.py
└── LICENSE
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | asyncio_mode = strict
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .venv
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | venv
6 | .eggs
7 | .pytest_cache
8 | *.egg-info
9 | .DS_Store
10 | .vscode
11 | *.db
--------------------------------------------------------------------------------
/datasette_upload_csvs/templates/upload_csv_done.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | 
3 | {% block title %}Upload in progress{% endblock %}
4 | 
5 | {% block content %}
6 | <h1>Upload in progress</h1>
7 | 
8 | <p>Importing rows into
9 |     <a href="{{ table_url }}">{{ table }}</a>
10 | </p>
11 | {% endblock %}
12 | 
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Test
2 | 
3 | on: [push]
4 | 
5 | jobs:
6 |   test:
7 |     runs-on: ubuntu-latest
8 |     strategy:
9 |       matrix:
10 |         python-version: ["3.7", "3.8", "3.9", "3.10"]
11 |     steps:
12 |     - uses: actions/checkout@v2
13 |     - name: Set up Python ${{ matrix.python-version }}
14 |       uses: actions/setup-python@v2
15 |       with:
16 |         python-version: ${{ matrix.python-version }}
17 |     - uses: actions/cache@v2
18 |       name: Configure pip caching
19 |       with:
20 |         path: ~/.cache/pip
21 |         key: ${{ runner.os }}-pip-${{ hashFiles('**/setup.py') }}
22 |         restore-keys: |
23 |           ${{ runner.os }}-pip-
24 |     - name: Install dependencies
25 |       run: |
26 |         pip install -e '.[test]'
27 |     - name: Run tests
28 |       run: |
29 |         pytest
30 | 
31 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # datasette-upload-csvs
2 | 
3 | [PyPI](https://pypi.org/project/datasette-upload-csvs/)
4 | [Changelog](https://github.com/simonw/datasette-upload-csvs/releases)
5 | [Tests](https://github.com/simonw/datasette-upload-csvs/actions?query=workflow%3ATest)
6 | [License](https://github.com/simonw/datasette-upload-csvs/blob/main/LICENSE)
7 | 
8 | Datasette plugin for uploading CSV files and converting them to database tables
9 | 
10 | ## Installation
11 | 
12 |     datasette install datasette-upload-csvs
13 | 
14 | ## Usage
15 | 
16 | The plugin adds an interface at `/-/upload-csvs` for uploading a CSV file and using it to create a new database table.
17 | 
18 | By default only [the root actor](https://datasette.readthedocs.io/en/stable/authentication.html#using-the-root-actor) can access the page - so you'll need to run Datasette with the `--root` option and click on the link shown in the terminal to sign in and access the page.
19 | 
20 | The `upload-csvs` permission governs access. You can use permission plugins such as [datasette-permissions-sql](https://github.com/simonw/datasette-permissions-sql) to grant additional access to the write interface, as sketched below.
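The permission check goes through Datasette's standard `permission_allowed` hook, so a one-off plugin can also grant it. Here is a minimal sketch - the `is_staff` actor property is a hypothetical example of something your authentication layer might set, not a Datasette built-in:

```python
from datasette import hookimpl


@hookimpl
def permission_allowed(actor, action):
    # Allow any signed-in actor carrying a truthy "is_staff" flag
    # (hypothetical - supplied by whatever auth plugin you use)
    if action == "upload-csvs" and actor and actor.get("is_staff"):
        return True
```

Save it as e.g. `plugins/allow_staff_uploads.py` and start Datasette with `--plugins-dir=plugins` to load it.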
21 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | import os
3 | 
4 | VERSION = "0.8.2"
5 | 
6 | 
7 | def get_long_description():
8 |     with open(
9 |         os.path.join(os.path.dirname(os.path.abspath(__file__)), "README.md"),
10 |         encoding="utf8",
11 |     ) as fp:
12 |         return fp.read()
13 | 
14 | 
15 | setup(
16 |     name="datasette-upload-csvs",
17 |     description="Datasette plugin for uploading CSV files and converting them to database tables",
18 |     long_description=get_long_description(),
19 |     long_description_content_type="text/markdown",
20 |     author="Simon Willison",
21 |     url="https://datasette.io/plugins/datasette-upload-csvs",
22 |     project_urls={
23 |         "Issues": "https://github.com/simonw/datasette-upload-csvs/issues",
24 |         "CI": "https://github.com/simonw/datasette-upload-csvs/actions",
25 |         "Changelog": "https://github.com/simonw/datasette-upload-csvs/releases",
26 |     },
27 |     license="Apache License, Version 2.0",
28 |     version=VERSION,
29 |     packages=["datasette_upload_csvs"],
30 |     entry_points={"datasette": ["upload_csvs = datasette_upload_csvs"]},
31 |     python_requires=">=3.7",
32 |     install_requires=[
33 |         "datasette>=0.61",
34 |         "asgi-csrf>=0.7",
35 |         "starlette",
36 |         "aiofiles",
37 |         "python-multipart",
38 |         "charset-normalizer",
39 |         "sqlite-utils",
40 |     ],
41 |     extras_require={
42 |         "test": ["pytest", "pytest-asyncio", "asgiref", "httpx", "asgi-lifespan"]
43 |     },
44 |     package_data={"datasette_upload_csvs": ["templates/*.html"]},
45 | )
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish Python Package
2 | 
3 | on:
4 |   release:
5 |     types: [created]
6 | 
7 | jobs:
8 |   test:
9 |     runs-on: ubuntu-latest
10 |     strategy:
11 |       matrix:
12 |         python-version: ["3.7", "3.8", "3.9", "3.10"]
13 |     steps:
14 |     - uses: actions/checkout@v2
15 |     - name: Set up Python ${{ matrix.python-version }}
16 |       uses: actions/setup-python@v2
17 |       with:
18 |         python-version: ${{ matrix.python-version }}
19 |     - uses: actions/cache@v2
20 |       name: Configure pip caching
21 |       with:
22 |         path: ~/.cache/pip
23 |         key: ${{ runner.os }}-pip-${{ hashFiles('**/setup.py') }}
24 |         restore-keys: |
25 |           ${{ runner.os }}-pip-
26 |     - name: Install dependencies
27 |       run: |
28 |         pip install -e '.[test]'
29 |     - name: Run tests
30 |       run: |
31 |         pytest
32 |   deploy:
33 |     runs-on: ubuntu-latest
34 |     needs: [test]
35 |     steps:
36 |     - uses: actions/checkout@v2
37 |     - name: Set up Python
38 |       uses: actions/setup-python@v2
39 |       with:
40 |         python-version: "3.10"
41 |     - uses: actions/cache@v2
42 |       name: Configure pip caching
43 |       with:
44 |         path: ~/.cache/pip
45 |         key: ${{ runner.os }}-publish-pip-${{ hashFiles('**/setup.py') }}
46 |         restore-keys: |
47 |           ${{ runner.os }}-publish-pip-
48 |     - name: Install dependencies
49 |       run: |
50 |         pip install setuptools wheel twine build
51 |     - name: Publish
52 |       env:
53 |         TWINE_USERNAME: __token__
54 |         TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
55 |       run: |
56 |         python -m build
57 |         twine upload dist/*
58 | 
59 | 
--------------------------------------------------------------------------------
/datasette_upload_csvs/templates/upload_csv.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | 
}}"{% endblock %} 4 | 5 | {% block extra_head %} 6 | {{ super() }} 7 | 46 | {% endblock %} 47 | 48 | {% block content %} 49 |A table will be created in database "{{ database_name }}".
51 | 65 | 66 | 67 | 198 | {% endblock %} 199 | -------------------------------------------------------------------------------- /datasette_upload_csvs/__init__.py: -------------------------------------------------------------------------------- 1 | from datasette import hookimpl 2 | from datasette.utils.asgi import Response, Forbidden 3 | from charset_normalizer import detect 4 | from starlette.requests import Request 5 | from urllib.parse import quote_plus 6 | import csv as csv_std 7 | import codecs 8 | import datetime 9 | import io 10 | import os 11 | import sqlite_utils 12 | from sqlite_utils.utils import TypeTracker 13 | import uuid 14 | 15 | 16 | @hookimpl 17 | def permission_allowed(actor, action): 18 | if action == "upload-csvs" and actor and actor.get("id") == "root": 19 | return True 20 | 21 | 22 | @hookimpl 23 | def register_routes(): 24 | return [ 25 | (r"^/-/upload-csvs$", upload_csvs), 26 | (r"^/-/upload-csv$", lambda: Response.redirect("/-/upload-csvs")), 27 | ] 28 | 29 | 30 | @hookimpl 31 | def menu_links(datasette, actor): 32 | async def inner(): 33 | if await datasette.permission_allowed( 34 | actor, "upload-csvs", default=False 35 | ) and any( 36 | db.is_mutable and db.name not in ("_memory", "_internal") 37 | for db in datasette.databases.values() 38 | ): 39 | return [ 40 | {"href": datasette.urls.path("/-/upload-csvs"), "label": "Upload CSVs"}, 41 | ] 42 | 43 | return inner 44 | 45 | 46 | async def upload_csvs(scope, receive, datasette, request): 47 | if not await datasette.permission_allowed( 48 | request.actor, "upload-csvs", default=False 49 | ): 50 | raise Forbidden("Permission denied for upload-csvs") 51 | 52 | num_bytes_to_detect_with = 2048 * 1024 53 | # ?_num_bytes= can over-ride this, used by the tests 54 | if request.args.get("_num_bytes_to_detect_with"): 55 | num_bytes_to_detect_with = int(request.args["_num_bytes_to_detect_with"]) 56 | 57 | # For the moment just use the first database that's not immutable 58 | dbs = [ 59 | db 60 | for db in datasette.databases.values() 61 | if db.is_mutable and db.name not in ("_internal", "_memory") 62 | ] 63 | if not dbs: 64 | raise Forbidden("No mutable databases available") 65 | db = dbs[0] 66 | 67 | # We need the ds_request to pass to render_template for CSRF tokens 68 | ds_request = request 69 | 70 | # We use the Starlette request object to handle file uploads 71 | starlette_request = Request(scope, receive) 72 | if starlette_request.method != "POST": 73 | return Response.html( 74 | await datasette.render_template( 75 | "upload_csv.html", {"database_name": db.name}, request=ds_request 76 | ) 77 | ) 78 | 79 | formdata = await starlette_request.form() 80 | csv = formdata["csv"] 81 | # csv.file is a SpooledTemporaryFile. 
82 |     table_name = formdata.get("table")
83 |     if not table_name:
84 |         table_name = csv.filename
85 |         if table_name.endswith(".csv"):
86 |             table_name = table_name[:-4]
87 | 
88 |     # If the table already exists, add a suffix
89 |     suffix = 2
90 |     base_table_name = table_name
91 |     while await db.table_exists(table_name):
92 |         table_name = "{}_{}".format(base_table_name, suffix)
93 |         suffix += 1
94 | 
95 |     total_size = get_temporary_file_size(csv.file)
96 |     task_id = str(uuid.uuid4())
97 | 
98 |     # Use the first 2MB to detect the character encoding
99 |     first_bytes = csv.file.read(num_bytes_to_detect_with)
100 |     csv.file.seek(0)
101 |     encoding = detect(first_bytes)["encoding"]
102 | 
103 |     # latin-1 is a superset of ascii, and less likely to hit errors
104 |     # https://github.com/simonw/datasette-upload-csvs/issues/25
105 |     if encoding == "ascii":
106 |         encoding = "latin-1"
107 | 
108 |     def insert_initial_record(conn):
109 |         database = sqlite_utils.Database(conn)
110 |         database["_csv_progress_"].insert(
111 |             {
112 |                 "id": task_id,
113 |                 "table_name": table_name,
114 |                 "bytes_todo": total_size,
115 |                 "bytes_done": 0,
116 |                 "rows_done": 0,
117 |                 "started": str(datetime.datetime.utcnow()),
118 |                 "completed": None,
119 |                 "error": None,
120 |             },
121 |             pk="id",
122 |             alter=True,
123 |         )
124 | 
125 |     await db.execute_write_fn(insert_initial_record)
126 | 
127 |     def insert_docs(database):
128 |         reader = csv_std.reader(codecs.iterdecode(csv.file, encoding))
129 |         headers = next(reader)
130 | 
131 |         tracker = TypeTracker()
132 | 
133 |         docs = tracker.wrap(dict(zip(headers, row)) for row in reader)
134 | 
135 |         i = 0
136 | 
137 |         def docs_with_progress():
138 |             nonlocal i
139 |             for doc in docs:
140 |                 i += 1
141 |                 yield doc
142 |                 if i % 10 == 0:
143 |                     database["_csv_progress_"].update(
144 |                         task_id,
145 |                         {
146 |                             "rows_done": i,
147 |                             "bytes_done": csv.file.tell(),
148 |                         },
149 |                     )
150 | 
151 |         database[table_name].insert_all(
152 |             docs_with_progress(), alter=True, batch_size=100
153 |         )
154 |         database["_csv_progress_"].update(
155 |             task_id,
156 |             {
157 |                 "rows_done": i,
158 |                 "bytes_done": total_size,
159 |                 "completed": str(datetime.datetime.utcnow()),
160 |             },
161 |         )
162 |         # Transform columns to detected types
163 |         database[table_name].transform(types=tracker.types)
164 |         return database[table_name].count
165 | 
166 |     def insert_docs_catch_errors(conn):
167 |         database = sqlite_utils.Database(conn)
168 |         try:
169 |             insert_docs(database)
170 |         except Exception as error:
171 |             database["_csv_progress_"].update(
172 |                 task_id,
173 |                 {"error": str(error)},
174 |             )
175 | 
176 |     await db.execute_write_fn(insert_docs_catch_errors, block=False)
177 | 
178 |     if formdata.get("xhr"):
179 |         return Response.json(
180 |             {
181 |                 "url": datasette.urls.table(db.name, table_name),
182 |                 "database_path": quote_plus(db.name),
183 |                 "task_id": task_id,
184 |                 "bytes_todo": total_size,
185 |             }
186 |         )
187 | 
188 |     return Response.html(
189 |         await datasette.render_template(
190 |             "upload_csv_done.html",
191 |             {
192 |                 "database": db.name,
193 |                 "table": table_name,
194 |                 "table_url": datasette.urls.table(db.name, table_name),
195 |             },
196 |         )
197 |     )
198 | 
199 | 
200 | def get_temporary_file_size(file):
201 |     if isinstance(file._file, (io.BytesIO, io.StringIO)):
202 |         return len(file._file.getvalue())
203 |     try:
204 |         return os.fstat(file._file.fileno()).st_size
205 |     except Exception:
206 |         raise
207 | 
--------------------------------------------------------------------------------
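A note on the type handling above: `insert_docs` necessarily inserts every CSV value as a string, and `TypeTracker` observes the rows as they stream past so the columns can be converted afterwards with `transform(types=tracker.types)`. A minimal standalone sketch of that sqlite-utils pattern - the `demo` table and its rows are invented for illustration:

```python
import sqlite_utils
from sqlite_utils.utils import TypeTracker

db = sqlite_utils.Database(memory=True)
tracker = TypeTracker()
rows = [{"id": "1", "score": "4.5"}, {"id": "2", "score": "3.25"}]

# wrap() yields the rows unchanged; the tracker just records what
# each column's string values could be parsed as
db["demo"].insert_all(tracker.wrap(rows))

# Convert each column to the narrowest type every value satisfied
db["demo"].transform(types=tracker.types)
print(tracker.types)  # e.g. {'id': 'integer', 'score': 'float'}
```

Doing the conversion once at the end is what lets the upload stream large files without buffering typed rows in memory.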
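Because the import runs via `execute_write_fn(..., block=False)`, the HTTP response returns before the rows are in, and clients follow along by re-reading the `_csv_progress_` row (the upload page's JavaScript, which did not survive in this listing, polls it by `task_id`). A rough equivalent in Python, assuming direct access to the database file - `poll_progress` is a hypothetical helper, not part of the plugin:

```python
import sqlite_utils


def poll_progress(db_path, task_id):
    # Columns written by insert_initial_record above:
    # bytes_todo, bytes_done, rows_done, completed, error
    row = sqlite_utils.Database(db_path)["_csv_progress_"].get(task_id)
    if row["error"]:
        raise RuntimeError(row["error"])
    todo = row["bytes_todo"]
    percent = 100.0 * row["bytes_done"] / todo if todo else 100.0
    return row["completed"] is not None, percent
```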
/tests/test_datasette_upload_csvs.py:
--------------------------------------------------------------------------------
1 | from datasette.app import Datasette
2 | from datasette.utils import tilde_encode
3 | import asyncio
4 | from asgi_lifespan import LifespanManager
5 | import json
6 | from unittest.mock import ANY
7 | import pytest
8 | import httpx
9 | import sqlite_utils
10 | 
11 | 
12 | @pytest.mark.asyncio
13 | async def test_lifespan():
14 |     ds = Datasette([], memory=True)
15 |     app = ds.app()
16 |     async with LifespanManager(app):
17 |         async with httpx.AsyncClient(app=app) as client:
18 |             response = await client.get("http://localhost/")
19 |             assert 200 == response.status_code
20 | 
21 | 
22 | @pytest.mark.asyncio
23 | async def test_redirect():
24 |     datasette = Datasette([], memory=True)
25 |     async with httpx.AsyncClient(app=datasette.app()) as client:
26 |         response = await client.get("http://localhost/-/upload-csv")
27 |         assert response.status_code == 302
28 |         assert response.headers["location"] == "/-/upload-csvs"
29 | 
30 | 
31 | @pytest.mark.asyncio
32 | @pytest.mark.parametrize("auth", [True, False])
33 | @pytest.mark.parametrize("has_database", [True, False])
34 | async def test_menu(tmpdir, auth, has_database):
35 |     path = str(tmpdir / "data.db")
36 |     db = sqlite_utils.Database(path)
37 |     db.vacuum()
38 |     ds = Datasette([path] if has_database else [], memory=True)
39 |     app = ds.app()
40 |     async with LifespanManager(app):
41 |         async with httpx.AsyncClient(app=app) as client:
42 |             cookies = {}
43 |             if auth:
44 |                 cookies = {"ds_actor": ds.sign({"a": {"id": "root"}}, "actor")}
45 |             response = await client.get("http://localhost/", cookies=cookies)
46 |             assert response.status_code == 200
47 |             should_allow = False
48 |             if auth and has_database:
49 |                 assert "/-/upload-csvs" in response.text
50 |                 should_allow = True
51 |             else:
52 |                 assert "/-/upload-csvs" not in response.text
53 |                 should_allow = False
54 |             assert (
55 |                 (
56 |                     await client.get("http://localhost/-/upload-csvs", cookies=cookies)
57 |                 ).status_code
58 |                 == (
59 |                     200 if should_allow else 403
60 |                 )
61 |             )
62 | 
63 | 
64 | SIMPLE = b"name,age\nCleo,5\nPancakes,4"
65 | SIMPLE_EXPECTED = [{"name": "Cleo", "age": 5}, {"name": "Pancakes", "age": 4}]
66 | NOT_UTF8 = (
67 |     b"IncidentNumber,DateTimeOfCall,CalYear,FinYear,TypeOfIncident,PumpCount,PumpHoursTotal,HourlyNotionalCost(\xa3),IncidentNotionalCost(\xa3)\r\n"
68 |     b"139091,01/01/2009 03:01,2009,2008/09,Special Service,1,2,2.55,5.10\r\n"
69 |     b"275091,01/01/2009 08:51,2009,2008/09,Special Service,1,1,2.55,2.55"
70 | )
71 | NOT_UTF8_EXPECTED = [
72 |     {
73 |         "IncidentNumber": 139091,
74 |         "DateTimeOfCall": "01/01/2009 03:01",
75 |         "CalYear": 2009,
76 |         "FinYear": "2008/09",
77 |         "TypeOfIncident": "Special Service",
78 |         "PumpCount": 1,
79 |         "PumpHoursTotal": 2,
80 |         "HourlyNotionalCost(£)": 2.55,
81 |         "IncidentNotionalCost(£)": 5.10,
82 |     },
83 |     {
84 |         "IncidentNumber": 275091,
85 |         "DateTimeOfCall": "01/01/2009 08:51",
86 |         "CalYear": 2009,
87 |         "FinYear": "2008/09",
88 |         "TypeOfIncident": "Special Service",
89 |         "PumpCount": 1,
90 |         "PumpHoursTotal": 1,
91 |         "HourlyNotionalCost(£)": 2.55,
92 |         "IncidentNotionalCost(£)": 2.55,
93 |     },
94 | ]
95 | LATIN1_AFTER_FIRST_2KB = ("just_one_column\n" + "aabbcc\n" * 1048 + "a.b.é").encode(
96 |     "latin-1"
97 | )
98 | 
99 | 
100 | @pytest.mark.asyncio
101 | @pytest.mark.parametrize(
102 |     "filename,content,expected_table,expected_rows",
103 |     (
104 |         ("dogs.csv", SIMPLE, "dogs", SIMPLE_EXPECTED),
105 |         (
106 |             "weird ~ filename here.csv.csv",
107 |             SIMPLE,
108 |             "weird ~ filename here.csv",
109 |             SIMPLE_EXPECTED,
110 |         ),
111 |         ("not-utf8.csv", NOT_UTF8, "not-utf8", NOT_UTF8_EXPECTED),
112 |         ("latin1-after-x.csv", "LATIN1_AFTER_FIRST_2KB", "latin1-after-x", ANY),
113 |         # This table already exists
114 |         ("already_exists.csv", SIMPLE, "already_exists_2", SIMPLE_EXPECTED),
115 |     ),
116 | )
117 | @pytest.mark.parametrize("use_xhr", (True, False))
118 | async def test_upload(
119 |     tmpdir, filename, content, expected_table, expected_rows, use_xhr
120 | ):
121 |     expected_url = "/data/{}".format(tilde_encode(expected_table))
122 |     path = str(tmpdir / "data.db")
123 |     db = sqlite_utils.Database(path)
124 |     db.vacuum()
125 |     db.enable_wal()
126 |     db["already_exists"].insert({"id": 1})
127 |     binary_content = content
128 |     # Trick to avoid the long LATIN1 string being part of the pytest rendered test name:
129 |     if content == "LATIN1_AFTER_FIRST_2KB":
130 |         binary_content = LATIN1_AFTER_FIRST_2KB
131 | 
132 |     db["hello"].insert({"hello": "world"})
133 | 
134 |     datasette = Datasette([path])
135 | 
136 |     cookies = {"ds_actor": datasette.sign({"a": {"id": "root"}}, "actor")}
137 | 
138 |     # First test the upload page exists
139 |     async with httpx.AsyncClient(app=datasette.app()) as client:
140 |         response = await client.get("http://localhost/-/upload-csvs", cookies=cookies)
141 |         assert 200 == response.status_code
142 |         assert (
143 |             '