├── .github
│   └── workflows
│       ├── publish.yml
│       └── test.yml
├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── README.md
├── csvs_to_sqlite
│   ├── __init__.py
│   ├── cli.py
│   └── utils.py
├── setup.cfg
├── setup.py
└── tests
    ├── test_csvs_to_sqlite.py
    └── test_utils.py

/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish Python Package
2 |
3 | on:
4 |   release:
5 |     types: [created]
6 |
7 | permissions:
8 |   contents: read
9 |
10 | jobs:
11 |   test:
12 |     runs-on: ubuntu-latest
13 |     strategy:
14 |       matrix:
15 |         python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
16 |     steps:
17 |     - uses: actions/checkout@v4
18 |     - name: Set up Python ${{ matrix.python-version }}
19 |       uses: actions/setup-python@v5
20 |       with:
21 |         python-version: ${{ matrix.python-version }}
22 |         cache: pip
23 |         cache-dependency-path: setup.py
24 |     - name: Install dependencies
25 |       run: |
26 |         pip install -e '.[test]'
27 |     - name: Run tests
28 |       run: |
29 |         python -m pytest
30 |   deploy:
31 |     runs-on: ubuntu-latest
32 |     needs: [test]
33 |     environment: release
34 |     permissions:
35 |       id-token: write
36 |     steps:
37 |     - uses: actions/checkout@v4
38 |     - name: Set up Python
39 |       uses: actions/setup-python@v5
40 |       with:
41 |         python-version: "3.13"
42 |         cache: pip
43 |         cache-dependency-path: setup.py
44 |     - name: Install dependencies
45 |       run: |
46 |         pip install setuptools wheel build
47 |     - name: Build
48 |       run: |
49 |         python -m build
50 |     - name: Publish
51 |       uses: pypa/gh-action-pypi-publish@release/v1
52 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Test
2 |
3 | on: [push, pull_request]
4 |
5 | permissions:
6 |   contents: read
7 |
8 | jobs:
9 |   test:
10 |     runs-on: ubuntu-latest
11 |     strategy:
12 |       matrix:
13 |         python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
14 |     steps:
15 |     - uses: actions/checkout@v4
16 |     - name: Set up Python ${{ matrix.python-version }}
17 |       uses: actions/setup-python@v5
18 |       with:
19 |         python-version: ${{ matrix.python-version }}
20 |         cache: pip
21 |         cache-dependency-path: setup.py
22 |     - name: Install dependencies
23 |       run: |
24 |         pip install -e '.[test]'
25 |     - name: Run tests
26 |       run: |
27 |         python -m pytest
28 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | venv
2 | *.egg-info
3 | *.pyc
4 | .pytest_cache/
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | Changelog
2 | =========
3 |
4 | 0.9 (2019-01-16)
5 | ----------------
6 | - Support for loading CSVs directly from URLs, thanks @betatim - #38
7 | - New `-pk`/`--primary-key` options, closes #22
8 | - Create FTS index for extracted column values
9 | - Added `--no-fulltext-fks` option, closes #32
10 | - Now using black for code formatting
11 | - Bumped versions of dependencies
12 |
13 | 0.8.1 (2018-04-24)
14 | ------------------
15 | - Updated README and CHANGELOG, tweaked `--help` output
16 |
17 | 0.8 (2018-04-24)
18 | ----------------
19 | - `-d` and `-df` options for specifying date/datetime columns, closes #33
20 | - Maintain lookup tables in SQLite, refs #17
21 | - `--index` option to specify which columns to index, closes #24
22 | - Test confirming `--shape`, `--filename-column` and `-c` work together #25
23 | - Use `usecols` when loading CSV if `--shape` specified
24 | - `--filename-column` is now compatible with `--shape`, closes #10
25 | - `--no-index-fks` option
26 |
27 |   By default, csvs-to-sqlite creates an index for every foreign key column that is
28 |   added using the `--extract-column` option.
29 |
30 |   For large tables, this can dramatically increase the size of the resulting
31 |   database file on disk. The new `--no-index-fks` option allows you to disable
32 |   this feature to save on file size.
33 |
34 |   Refs #24 which will allow you to explicitly list which columns SHOULD have
35 |   an index created.
36 | - Added `--filename-column` option, refs #10
37 | - Fixes for Python 2, refs #25
38 | - Implemented new `--shape` option - refs #25
39 | - `--table` option for specifying table to write to, refs #10
40 | - Updated README to cover `--skip-errors`, refs #20
41 | - Add `--skip-errors` option (#20) [Jani Monoses]
42 | - Less verbosity (#19) [Jani Monoses]
43 |
44 |   Only log `extract_columns` info when that option is passed.
45 | - Add option for field quoting behaviour (#15) [Jani Monoses]
46 |
47 |
48 | 0.7 (2017-11-25)
49 | ----------------
50 | - Add `-s` option to specify input field separator (#13) [Jani Monoses]
51 |
52 |
53 | 0.6.1 (2017-11-24)
54 | ------------------
55 | - `-f` and `-c` now work for a single table with multiple columns.
56 |
57 |   Fixes #12
58 |
59 | 0.6 (2017-11-24)
60 | ----------------
61 | - `--fts` and `--extract-column` now cooperate.
62 |
63 |   If you extract a column and then specify that same column in the `--fts` list,
64 |   `csvs-to-sqlite` now uses the original value of that column in the index.
65 |
66 |   Example using CSV from https://data.sfgov.org/City-Infrastructure/Street-Tree-List/tkzw-k3nq
67 |
68 |       csvs-to-sqlite Street_Tree_List.csv trees-fts.db \
69 |           -c qLegalStatus -c qSpecies -c qSiteInfo \
70 |           -c PlantType -c qCaretaker -c qCareAssistant \
71 |           -f qLegalStatus -f qSpecies -f qAddress \
72 |           -f qSiteInfo -f PlantType -f qCaretaker \
73 |           -f qCareAssistant -f PermitNotes
74 |
75 |   Closes #9
76 | - Added `--fts` option for setting up SQLite full-text search.
77 |
78 |   The `--fts` option will create a corresponding SQLite FTS virtual table, using
79 |   the best available version of the FTS module.
80 |
81 |   https://sqlite.org/fts5.html
82 |   https://www.sqlite.org/fts3.html
83 |
84 |   Usage:
85 |
86 |       csvs-to-sqlite my-csv.csv output.db -f column1 -f column2
87 |
88 |   Example generated with this option: https://sf-trees-search.now.sh/
89 |
90 |   Example search: https://sf-trees-search.now.sh/sf-trees-search-a899b92?sql=select+*+from+Street_Tree_List+where+rowid+in+%28select+rowid+from+Street_Tree_List_fts+where+Street_Tree_List_fts+match+%27grove+london+dpw%27%29%0D%0A
91 |
92 |   Will be used in https://github.com/simonw/datasette/issues/131
93 | - Handle column names with spaces in them.
94 | - Added `csvs-to-sqlite --version` option.
95 |
96 |   Using http://click.pocoo.org/5/api/#click.version_option
97 |
98 |
99 | 0.5 (2017-11-19)
100 | ----------------
101 | - Release 0.5.
102 | - Foreign key extraction for a mix of integers and NaN now works.
103 |
104 |   Similar issue to a8ab5248f4a - when we extracted a column that included a
105 |   mixture of both integers and NaNs things went a bit weird.
106 | - Added test for column extraction.
107 | - Fixed bug with accidentally hard-coded column.
108 |
109 |
110 | 0.4 (2017-11-19)
111 | ----------------
112 | - Release 0.4.
113 | - Automatically deploy tags as PyPI releases.
114 |
115 |   https://docs.travis-ci.com/user/deployment/pypi/
116 | - Fixed tests for Python 2.
117 | - Ensure columns of ints + NaNs map to SQLite INTEGER.
118 |
119 |   Pandas does a good job of figuring out which SQLite column types should be
120 |   used for a DataFrame - with one exception: due to a limitation of NumPy it
121 |   treats columns containing a mixture of integers and NaN (blank values) as
122 |   being of type float64, which means they end up as REAL columns in SQLite.
123 |
124 |   http://pandas.pydata.org/pandas-docs/stable/gotchas.html#support-for-integer-na
125 |
126 |   To fix this, we now check to see if a float64 column actually consists solely
127 |   of NaN and integer-valued floats (checked using v.is_integer() in Python). If
128 |   that is the case, we override the column type to be INTEGER instead.
129 | - Use miniconda to speed up Travis CI builds (#8)
130 |
131 |   Using Travis CI configuration code copied from https://github.com/EducationalTestingService/skll/blob/87b071743ba7cf0b1063c7265005d43b172b5d91/.travis.yml
132 |
133 |   Which is itself an updated version of the pattern described in http://dan-blanchard.roughdraft.io/7045057-quicker-travis-builds-that-rely-on-numpy-and-scipy-using-miniconda
134 |
135 |   I had to switch to running `pytest` directly, because `python setup.py test` was still trying to install a pandas package that involved compiling everything from scratch (which is why Travis CI builds were taking around 15 minutes).
136 | - Don't include an `index` column - rely on SQLite rowid instead.
137 |
138 |
139 | 0.3 (2017-11-17)
140 | ----------------
141 | - Added `--extract-column` to README.
142 |
143 |   Also updated the `--help` output and added a Travis CI badge.
144 | - Configure Travis CI.
145 |
146 |   Also made it so `python setup.py test` runs the tests.
147 | - Mechanism for converting columns into separate tables.
148 |
149 |   Let's say you have a CSV file that looks like this:
150 |
151 |       county,precinct,office,district,party,candidate,votes
152 |       Clark,1,President,,REP,John R. Kasich,5
153 |       Clark,2,President,,REP,John R. Kasich,0
154 |       Clark,3,President,,REP,John R. Kasich,7
155 |
156 |   (Real example from https://github.com/openelections/openelections-data-sd/blob/master/2016/20160607__sd__primary__clark__precinct.csv )
157 |
158 |   You can now convert selected columns into separate lookup tables using the new
159 |   `--extract-column` option (shortname: `-c`) - for example:
160 |
161 |       csvs-to-sqlite openelections-data-*/*.csv \
162 |           -c county:County:name \
163 |           -c precinct:Precinct:name \
164 |           -c office -c district -c party -c candidate \
165 |           openelections.db
166 |
167 |   The format is as follows:
168 |
169 |       column_name:optional_table_name:optional_table_value_column_name
170 |
171 |   If you just specify the column name e.g. `-c office`, the following table will
172 |   be created:
173 |
174 |       CREATE TABLE "office" (
175 |           "id" INTEGER PRIMARY KEY,
176 |           "value" TEXT
177 |       );
178 |
179 |   If you specify all three options, e.g.
`-c precinct:Precinct:name` the table 180 | will look like this: 181 | 182 | CREATE TABLE "Precinct" ( 183 | "id" INTEGER PRIMARY KEY, 184 | "name" TEXT 185 | ); 186 | 187 | The original tables will be created like this: 188 | 189 | CREATE TABLE "ca__primary__san_francisco__precinct" ( 190 | "county" INTEGER, 191 | "precinct" INTEGER, 192 | "office" INTEGER, 193 | "district" INTEGER, 194 | "party" INTEGER, 195 | "candidate" INTEGER, 196 | "votes" INTEGER, 197 | FOREIGN KEY (county) REFERENCES County(id), 198 | FOREIGN KEY (party) REFERENCES party(id), 199 | FOREIGN KEY (precinct) REFERENCES Precinct(id), 200 | FOREIGN KEY (office) REFERENCES office(id), 201 | FOREIGN KEY (candidate) REFERENCES candidate(id) 202 | ); 203 | 204 | They will be populated with IDs that reference the new derived tables. 205 | 206 | Closes #2 207 | - Can now add new tables to existing database. 208 | 209 | And the new `--replace-tables` option allows you to tell it to replace existing 210 | tables rather than quitting with an error. 211 | 212 | Closes #1 213 | - Fixed compatibility with Python 3. 214 | - Badge links to PyPI. 215 | - Create LICENSE. 216 | - Create README.md. 217 | - Initial release. 218 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # csvs-to-sqlite
2 |
3 | [![PyPI](https://img.shields.io/pypi/v/csvs-to-sqlite.svg)](https://pypi.org/project/csvs-to-sqlite/)
4 | [![Changelog](https://img.shields.io/github/v/release/simonw/csvs-to-sqlite?include_prereleases&label=changelog)](https://github.com/simonw/csvs-to-sqlite/releases)
5 | [![Tests](https://github.com/simonw/csvs-to-sqlite/workflows/Test/badge.svg)](https://github.com/simonw/csvs-to-sqlite/actions?query=workflow%3ATest)
6 | [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/simonw/csvs-to-sqlite/blob/main/LICENSE)
7 |
8 | Convert CSV files into a SQLite database. Browse and publish that SQLite database with [Datasette](https://github.com/simonw/datasette).
9 |
10 | > [!NOTE]
11 | > This tool is **infrequently maintained**. For most cases I suggest [using sqlite-utils](https://sqlite-utils.datasette.io/en/stable/cli.html#inserting-csv-or-tsv-data) to import CSV and TSV into SQLite instead.
12 |
13 | Basic usage:
14 | ```bash
15 | csvs-to-sqlite myfile.csv mydatabase.db
16 | ```
17 | This will create a new SQLite database called `mydatabase.db` containing a
18 | single table, `myfile`, containing the CSV content.
19 |
20 | You can provide multiple CSV files:
21 | ```bash
22 | csvs-to-sqlite one.csv two.csv bundle.db
23 | ```
24 | The `bundle.db` database will contain two tables, `one` and `two`.
25 |
26 | This means you can use wildcards:
27 | ```bash
28 | csvs-to-sqlite ~/Downloads/*.csv my-downloads.db
29 | ```
30 | If you pass a path to one or more directories, the script will recursively
31 | search those directories for CSV files and create tables for each one.
32 | ```bash
33 | csvs-to-sqlite ~/path/to/directory all-my-csvs.db
34 | ```
35 | ## Handling TSV (tab-separated values)
36 |
37 | You can use the `-s` option to specify a different delimiter. If you want
38 | to use a tab character you'll need to apply shell escaping like so:
39 | ```bash
40 | csvs-to-sqlite my-file.tsv my-file.db -s $'\t'
41 | ```
42 | ## Refactoring columns into separate lookup tables
43 |
44 | Let's say you have a CSV file that looks like this:
45 | ```csv
46 | county,precinct,office,district,party,candidate,votes
47 | Clark,1,President,,REP,John R. Kasich,5
48 | Clark,2,President,,REP,John R. Kasich,0
49 | Clark,3,President,,REP,John R. Kasich,7
50 | ```
51 | ([Real example taken from the Open Elections project](https://github.com/openelections/openelections-data-sd/blob/master/2016/20160607__sd__primary__clark__precinct.csv))
52 |
53 | You can convert selected columns into separate lookup tables using the
54 | `--extract-column` option (short name: `-c`) - for example:
55 | ```bash
56 | csvs-to-sqlite openelections-data-*/*.csv \
57 |   -c county:County:name \
58 |   -c precinct:Precinct:name \
59 |   -c office -c district -c party -c candidate \
60 |   openelections.db
61 | ```
62 | The format is as follows:
63 | ```
64 | column_name:optional_table_name:optional_table_value_column_name
65 | ```
66 | If you just specify the column name e.g. `-c office`, the following table will
67 | be created:
68 | ```sql
69 | CREATE TABLE "office" (
70 |     "id" INTEGER PRIMARY KEY,
71 |     "value" TEXT
72 | );
73 | ```
74 | If you specify all three options, e.g.
`-c precinct:Precinct:name` the table 75 | will look like this: 76 | ```sql 77 | CREATE TABLE "Precinct" ( 78 | "id" INTEGER PRIMARY KEY, 79 | "name" TEXT 80 | ); 81 | ``` 82 | The original tables will be created like this: 83 | ```sql 84 | CREATE TABLE "ca__primary__san_francisco__precinct" ( 85 | "county" INTEGER, 86 | "precinct" INTEGER, 87 | "office" INTEGER, 88 | "district" INTEGER, 89 | "party" INTEGER, 90 | "candidate" INTEGER, 91 | "votes" INTEGER, 92 | FOREIGN KEY (county) REFERENCES County(id), 93 | FOREIGN KEY (party) REFERENCES party(id), 94 | FOREIGN KEY (precinct) REFERENCES Precinct(id), 95 | FOREIGN KEY (office) REFERENCES office(id), 96 | FOREIGN KEY (candidate) REFERENCES candidate(id) 97 | ); 98 | ``` 99 | They will be populated with IDs that reference the new derived tables. 100 | 101 | ## Installation 102 | 103 | ```bash 104 | pip install csvs-to-sqlite 105 | ``` 106 | 107 | `csvs-to-sqlite` now requires Python 3. If you are running Python 2 you can install the last version to support Python 2: 108 | ```bash 109 | pip install csvs-to-sqlite==0.9.2 110 | ``` 111 | 112 | ## csvs-to-sqlite --help 113 | 114 | 125 | ``` 126 | Usage: csvs-to-sqlite [OPTIONS] PATHS... DBNAME 127 | 128 | PATHS: paths to individual .csv files or to directories containing .csvs 129 | 130 | DBNAME: name of the SQLite database file to create 131 | 132 | Options: 133 | -s, --separator TEXT Field separator in input .csv 134 | -q, --quoting INTEGER Control field quoting behavior per csv.QUOTE_* 135 | constants. Use one of QUOTE_MINIMAL (0), 136 | QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or 137 | QUOTE_NONE (3). 138 | --skip-errors Skip lines with too many fields instead of 139 | stopping the import 140 | --replace-tables Replace tables if they already exist 141 | -t, --table TEXT Table to use (instead of using CSV filename) 142 | -c, --extract-column TEXT One or more columns to 'extract' into a 143 | separate lookup table. If you pass a simple 144 | column name that column will be replaced with 145 | integer foreign key references to a new table 146 | of that name. You can customize the name of 147 | the table like so: state:States:state_name 148 | 149 | This will pull unique values from the 'state' 150 | column and use them to populate a new 'States' 151 | table, with an id column primary key and a 152 | state_name column containing the strings from 153 | the original column. 154 | -d, --date TEXT One or more columns to parse into ISO 155 | formatted dates 156 | -dt, --datetime TEXT One or more columns to parse into ISO 157 | formatted datetimes 158 | -df, --datetime-format TEXT One or more custom date format strings to try 159 | when parsing dates/datetimes 160 | -pk, --primary-key TEXT One or more columns to use as the primary key 161 | -f, --fts TEXT One or more columns to use to populate a full- 162 | text index 163 | -i, --index TEXT Add index on this column (or a compound index 164 | with -i col1,col2) 165 | --shape TEXT Custom shape for the DB table - format is 166 | csvcol:dbcol(TYPE),... 167 | --filename-column TEXT Add a column with this name and populate with 168 | CSV file name 169 | --fixed-column ... Populate column with a fixed string 170 | --fixed-column-int ... 171 | Populate column with a fixed integer 172 | --fixed-column-float ... 
173 | Populate column with a fixed float 174 | --no-index-fks Skip adding index to foreign key columns 175 | created using --extract-column (default is to 176 | add them) 177 | --no-fulltext-fks Skip adding full-text index on values 178 | extracted using --extract-column (default is 179 | to add them) 180 | --just-strings Import all columns as text strings by default 181 | (and, if specified, still obey --shape, 182 | --date/datetime, and --datetime-format) 183 | --version Show the version and exit. 184 | --help Show this message and exit. 185 | 186 | ``` 187 | 188 | -------------------------------------------------------------------------------- /csvs_to_sqlite/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simonw/csvs-to-sqlite/33b79ed80a688ceae4486ebdcca28ae689f54c03/csvs_to_sqlite/__init__.py -------------------------------------------------------------------------------- /csvs_to_sqlite/cli.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import click 4 | from .utils import ( 5 | LoadCsvError, 6 | LookupTable, 7 | PathOrURL, 8 | add_index, 9 | apply_dates_and_datetimes, 10 | apply_shape, 11 | best_fts_version, 12 | csvs_from_paths, 13 | generate_and_populate_fts, 14 | load_csv, 15 | refactor_dataframes, 16 | table_exists, 17 | drop_table, 18 | to_sql_with_foreign_keys, 19 | ) 20 | import os 21 | import sqlite3 22 | 23 | 24 | @click.command() 25 | @click.argument("paths", type=PathOrURL(exists=True), nargs=-1, required=True) 26 | @click.argument("dbname", nargs=1) 27 | @click.option("--separator", "-s", default=",", help="Field separator in input .csv") 28 | @click.option( 29 | "--quoting", 30 | "-q", 31 | default=0, 32 | help="Control field quoting behavior per csv.QUOTE_* constants. Use one of QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).", 33 | ) 34 | @click.option( 35 | "--skip-errors", 36 | is_flag=True, 37 | help="Skip lines with too many fields instead of stopping the import", 38 | ) 39 | @click.option( 40 | "--replace-tables", is_flag=True, help="Replace tables if they already exist" 41 | ) 42 | @click.option( 43 | "--table", "-t", help="Table to use (instead of using CSV filename)", default=None 44 | ) 45 | @click.option( 46 | "--extract-column", 47 | "-c", 48 | multiple=True, 49 | help=( 50 | "One or more columns to 'extract' into a separate lookup table. " 51 | "If you pass a simple column name that column will be replaced " 52 | "with integer foreign key references to a new table of that " 53 | "name. You can customize the name of the table like so:\n" 54 | " state:States:state_name\n\n" 55 | "This will pull unique values from the 'state' column and use " 56 | "them to populate a new 'States' table, with an id column " 57 | "primary key and a state_name column containing the strings " 58 | "from the original column." 
59 | ), 60 | ) 61 | @click.option( 62 | "--date", 63 | "-d", 64 | multiple=True, 65 | help=("One or more columns to parse into ISO formatted dates"), 66 | ) 67 | @click.option( 68 | "--datetime", 69 | "-dt", 70 | multiple=True, 71 | help=("One or more columns to parse into ISO formatted datetimes"), 72 | ) 73 | @click.option( 74 | "--datetime-format", 75 | "-df", 76 | multiple=True, 77 | help=("One or more custom date format strings to try when parsing dates/datetimes"), 78 | ) 79 | @click.option( 80 | "--primary-key", 81 | "-pk", 82 | multiple=True, 83 | help=("One or more columns to use as the primary key"), 84 | ) 85 | @click.option( 86 | "--fts", 87 | "-f", 88 | multiple=True, 89 | help=("One or more columns to use to populate a full-text index"), 90 | ) 91 | @click.option( 92 | "--index", 93 | "-i", 94 | multiple=True, 95 | help=("Add index on this column (or a compound index with -i col1,col2)"), 96 | ) 97 | @click.option( 98 | "--shape", 99 | help="Custom shape for the DB table - format is csvcol:dbcol(TYPE),...", 100 | default=None, 101 | ) 102 | @click.option( 103 | "--filename-column", 104 | help="Add a column with this name and populate with CSV file name", 105 | default=None, 106 | ) 107 | @click.option( 108 | "fixed_columns", 109 | "--fixed-column", 110 | type=(str, str), 111 | multiple=True, 112 | help="Populate column with a fixed string", 113 | default=None, 114 | ) 115 | @click.option( 116 | "fixed_columns_int", 117 | "--fixed-column-int", 118 | type=(str, int), 119 | multiple=True, 120 | help="Populate column with a fixed integer", 121 | default=None, 122 | ) 123 | @click.option( 124 | "fixed_columns_float", 125 | "--fixed-column-float", 126 | type=(str, float), 127 | multiple=True, 128 | help="Populate column with a fixed float", 129 | default=None, 130 | ) 131 | @click.option( 132 | "--no-index-fks", 133 | "no_index_fks", 134 | is_flag=True, 135 | help="Skip adding index to foreign key columns created using --extract-column (default is to add them)", 136 | ) 137 | @click.option( 138 | "--no-fulltext-fks", 139 | "no_fulltext_fks", 140 | is_flag=True, 141 | help="Skip adding full-text index on values extracted using --extract-column (default is to add them)", 142 | ) 143 | @click.option( 144 | "--just-strings", 145 | is_flag=True, 146 | help="Import all columns as text strings by default (and, if specified, still obey --shape, --date/datetime, and --datetime-format)", 147 | ) 148 | @click.version_option() 149 | def cli( 150 | paths, 151 | dbname, 152 | separator, 153 | quoting, 154 | skip_errors, 155 | replace_tables, 156 | table, 157 | extract_column, 158 | date, 159 | datetime, 160 | datetime_format, 161 | primary_key, 162 | fts, 163 | index, 164 | shape, 165 | filename_column, 166 | fixed_columns, 167 | fixed_columns_int, 168 | fixed_columns_float, 169 | no_index_fks, 170 | no_fulltext_fks, 171 | just_strings, 172 | ): 173 | """ 174 | PATHS: paths to individual .csv files or to directories containing .csvs 175 | 176 | DBNAME: name of the SQLite database file to create 177 | """ 178 | # make plural for more readable code: 179 | extract_columns = extract_column 180 | del extract_column 181 | 182 | if extract_columns: 183 | click.echo("extract_columns={}".format(extract_columns)) 184 | if dbname.endswith(".csv"): 185 | raise click.BadParameter("dbname must not end with .csv") 186 | if "." 
not in dbname: 187 | dbname += ".db" 188 | 189 | db_existed = os.path.exists(dbname) 190 | 191 | conn = sqlite3.connect(dbname) 192 | 193 | dataframes = [] 194 | csvs = csvs_from_paths(paths) 195 | sql_type_overrides = None 196 | for name, path in csvs.items(): 197 | try: 198 | df = load_csv( 199 | path, separator, skip_errors, quoting, shape, just_strings=just_strings 200 | ) 201 | df.table_name = table or name 202 | if filename_column: 203 | df[filename_column] = name 204 | if shape: 205 | shape += ",{}".format(filename_column) 206 | if fixed_columns: 207 | for colname, value in fixed_columns: 208 | df[colname] = value 209 | if shape: 210 | shape += ",{}".format(colname) 211 | if fixed_columns_int: 212 | for colname, value in fixed_columns_int: 213 | df[colname] = value 214 | if shape: 215 | shape += ",{}".format(colname) 216 | if fixed_columns_float: 217 | for colname, value in fixed_columns_float: 218 | df[colname] = value 219 | if shape: 220 | shape += ",{}".format(colname) 221 | sql_type_overrides = apply_shape(df, shape) 222 | apply_dates_and_datetimes(df, date, datetime, datetime_format) 223 | dataframes.append(df) 224 | except LoadCsvError as e: 225 | click.echo("Could not load {}: {}".format(path, e), err=True) 226 | 227 | click.echo("Loaded {} dataframes".format(len(dataframes))) 228 | 229 | # Use extract_columns to build a column:(table,label) dictionary 230 | foreign_keys = {} 231 | for col in extract_columns: 232 | bits = col.split(":") 233 | if len(bits) == 3: 234 | foreign_keys[bits[0]] = (bits[1], bits[2]) 235 | elif len(bits) == 2: 236 | foreign_keys[bits[0]] = (bits[1], "value") 237 | else: 238 | foreign_keys[bits[0]] = (bits[0], "value") 239 | 240 | # Now we have loaded the dataframes, we can refactor them 241 | created_tables = {} 242 | refactored = refactor_dataframes( 243 | conn, dataframes, foreign_keys, not no_fulltext_fks 244 | ) 245 | for df in refactored: 246 | # This is a bit trickier because we need to 247 | # create the table with extra SQL for foreign keys 248 | if replace_tables and table_exists(conn, df.table_name): 249 | drop_table(conn, df.table_name) 250 | if table_exists(conn, df.table_name): 251 | df.to_sql(df.table_name, conn, if_exists="append", index=False) 252 | else: 253 | to_sql_with_foreign_keys( 254 | conn, 255 | df, 256 | df.table_name, 257 | foreign_keys, 258 | sql_type_overrides, 259 | primary_keys=primary_key, 260 | index_fks=not no_index_fks, 261 | ) 262 | created_tables[df.table_name] = df 263 | if index: 264 | for index_defn in index: 265 | add_index(conn, df.table_name, index_defn) 266 | 267 | # Create FTS tables 268 | if fts: 269 | fts_version = best_fts_version() 270 | if not fts_version: 271 | conn.close() 272 | raise click.BadParameter( 273 | "Your SQLite version does not support any variant of FTS" 274 | ) 275 | # Check that columns make sense 276 | for table, df in created_tables.items(): 277 | for fts_column in fts: 278 | if fts_column not in df.columns: 279 | raise click.BadParameter( 280 | 'FTS column "{}" does not exist'.format(fts_column) 281 | ) 282 | 283 | generate_and_populate_fts(conn, created_tables.keys(), fts, foreign_keys) 284 | 285 | conn.close() 286 | 287 | if db_existed: 288 | click.echo( 289 | "Added {} CSV file{} to {}".format( 290 | len(csvs), "" if len(csvs) == 1 else "s", dbname 291 | ) 292 | ) 293 | else: 294 | click.echo( 295 | "Created {} from {} CSV file{}".format( 296 | dbname, len(csvs), "" if len(csvs) == 1 else "s" 297 | ) 298 | ) 299 | 
--------------------------------------------------------------------------------
/csvs_to_sqlite/utils.py:
--------------------------------------------------------------------------------
1 | import dateparser
2 | import os
3 | import fnmatch
4 | import hashlib
5 | import lru
6 | import pandas as pd
7 | import numpy as np
8 | import re
9 | import six
10 | import sqlite3
11 |
12 | from six.moves.urllib.parse import urlparse
13 | from six.moves.urllib.parse import uses_relative, uses_netloc, uses_params
14 |
15 | import click
16 |
17 |
18 | class LoadCsvError(Exception):
19 |     pass
20 |
21 |
22 | def load_csv(
23 |     filepath,
24 |     separator,
25 |     skip_errors,
26 |     quoting,
27 |     shape,
28 |     encodings_to_try=("utf8", "latin-1"),
29 |     just_strings=False,
30 | ):
31 |     dtype = str if just_strings is True else None
32 |     usecols = None
33 |     if shape:
34 |         usecols = [defn["csv_name"] for defn in parse_shape(shape)]
35 |     try:
36 |         for encoding in encodings_to_try:
37 |             try:
38 |                 return pd.read_csv(
39 |                     filepath,
40 |                     sep=separator,
41 |                     quoting=quoting,
42 |                     on_bad_lines="skip" if skip_errors else "error",
43 |                     low_memory=True,
44 |                     encoding=encoding,
45 |                     usecols=usecols,
46 |                     dtype=dtype,
47 |                 )
48 |             except UnicodeDecodeError:
49 |                 continue
50 |             except pd.errors.ParserError as e:
51 |                 raise LoadCsvError(e)
52 |         # If we get here, we failed
53 |         raise LoadCsvError("All encodings failed")
54 |     except Exception as e:
55 |         raise LoadCsvError(e)
56 |
57 |
58 | def csvs_from_paths(paths):
59 |     csvs = {}
60 |
61 |     def add_item(filepath, full_path=None):
62 |         name = os.path.splitext(os.path.basename(filepath))[0]
63 |         if name in csvs:
64 |             i = 1
65 |             while True:
66 |                 name_plus_suffix = "{}-{}".format(name, i)
67 |                 if name_plus_suffix not in csvs:
68 |                     name = name_plus_suffix
69 |                     break
70 |                 else:
71 |                     i += 1
72 |         if full_path is None:
73 |             csvs[name] = filepath
74 |         else:
75 |             csvs[name] = full_path
76 |
77 |     for path in paths:
78 |         if os.path.isfile(path):
79 |             add_item(path)
80 |         elif _is_url(path):
81 |             add_item(urlparse(path).path, path)
82 |         elif os.path.isdir(path):
83 |             # Recursively seek out ALL csvs in directory
84 |             for root, dirnames, filenames in os.walk(path):
85 |                 for filename in fnmatch.filter(filenames, "*.csv"):
86 |                     relpath = os.path.relpath(root, path)
87 |                     namepath = os.path.join(relpath, os.path.splitext(filename)[0])
88 |                     csvs[namepath] = os.path.join(root, filename)
89 |
90 |     return csvs
91 |
92 |
93 | def _is_url(possible_url):
94 |     valid_schemes = set(uses_relative + uses_netloc + uses_params)
95 |     valid_schemes.discard("")
96 |
97 |     try:
98 |         return urlparse(possible_url).scheme in valid_schemes
99 |     except Exception:
100 |         return False
101 |
102 |
103 | class PathOrURL(click.Path):
104 |     """The PathOrURL type handles paths or URLs.
105 |
106 |     If the argument can be parsed as a URL, it will be treated as one.
107 |     Otherwise PathOrURL behaves like click.Path.
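    A sketch of how this is used in cli.py (the URL below is illustrative):

        @click.argument("paths", type=PathOrURL(exists=True), nargs=-1, required=True)

    With that declaration "https://example.com/data.csv" is returned
    unchanged, while "./data.csv" is validated by click.Path and rejected
    when exists=True and the file is missing.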
108 | """ 109 | 110 | def __init__( 111 | self, 112 | exists=False, 113 | file_okay=True, 114 | dir_okay=True, 115 | writable=False, 116 | readable=True, 117 | resolve_path=False, 118 | allow_dash=False, 119 | path_type=None, 120 | ): 121 | super(PathOrURL, self).__init__( 122 | exists=exists, 123 | file_okay=file_okay, 124 | dir_okay=dir_okay, 125 | writable=writable, 126 | readable=readable, 127 | resolve_path=resolve_path, 128 | allow_dash=allow_dash, 129 | path_type=path_type, 130 | ) 131 | 132 | def convert(self, value, param, ctx): 133 | if _is_url(value): 134 | return self.coerce_path_result(value) 135 | else: 136 | return super(PathOrURL, self).convert(value, param, ctx) 137 | 138 | 139 | class LookupTable: 140 | def __init__(self, conn, table_name, value_column, index_fts): 141 | self.conn = conn 142 | self.table_name = table_name 143 | self.value_column = value_column 144 | self.fts_table_name = "{table_name}_{value_column}_fts".format( 145 | table_name=table_name, value_column=value_column 146 | ) 147 | self.index_fts = index_fts 148 | self.cache = lru.LRUCacheDict(max_size=1000) 149 | self.ensure_table_exists() 150 | 151 | def ensure_table_exists(self): 152 | if not self.conn.execute( 153 | """ 154 | SELECT name 155 | FROM sqlite_master 156 | WHERE type='table' 157 | AND name=? 158 | """, 159 | (self.table_name,), 160 | ).fetchall(): 161 | create_sql = """ 162 | CREATE TABLE "{table_name}" ( 163 | "id" INTEGER PRIMARY KEY, 164 | "{value_column}" TEXT 165 | ); 166 | """.format( 167 | table_name=self.table_name, value_column=self.value_column 168 | ) 169 | self.conn.execute(create_sql) 170 | if self.index_fts: 171 | # Add a FTS index on the value_column 172 | self.conn.execute( 173 | """ 174 | CREATE VIRTUAL TABLE "{fts_table_name}" 175 | USING {fts_version} ({value_column}, content="{table_name}"); 176 | """.format( 177 | fts_version=best_fts_version(), 178 | fts_table_name=self.fts_table_name, 179 | table_name=self.table_name, 180 | value_column=self.value_column, 181 | ) 182 | ) 183 | 184 | def __repr__(self): 185 | return "<{}: {} rows>".format( 186 | self.table_name, 187 | self.conn.execute( 188 | 'select count(*) from "{}"'.format(self.table_name) 189 | ).fetchone()[0], 190 | ) 191 | 192 | def id_for_value(self, value): 193 | if pd.isnull(value): 194 | return None 195 | # value should be a string 196 | if not isinstance(value, six.string_types): 197 | if isinstance(value, float): 198 | value = "{0:g}".format(value) 199 | else: 200 | value = six.text_type(value) 201 | try: 202 | # First try our in-memory cache 203 | return self.cache[value] 204 | except KeyError: 205 | # Next try the database table 206 | sql = 'SELECT id FROM "{table_name}" WHERE "{value_column}"=?'.format( 207 | table_name=self.table_name, value_column=self.value_column 208 | ) 209 | result = self.conn.execute(sql, (value,)).fetchall() 210 | if result: 211 | id = result[0][0] 212 | else: 213 | # Not in DB! 
Insert it 214 | cursor = self.conn.cursor() 215 | cursor.execute( 216 | """ 217 | INSERT INTO "{table_name}" ("{value_column}") VALUES (?); 218 | """.format( 219 | table_name=self.table_name, value_column=self.value_column 220 | ), 221 | (value,), 222 | ) 223 | id = cursor.lastrowid 224 | if self.index_fts: 225 | # And update FTS index 226 | sql = """ 227 | INSERT INTO "{fts_table_name}" (rowid, "{value_column}") VALUES (?, ?); 228 | """.format( 229 | fts_table_name=self.fts_table_name, 230 | value_column=self.value_column, 231 | ) 232 | cursor.execute(sql, (id, value)) 233 | 234 | self.cache[value] = id 235 | return id 236 | 237 | 238 | def refactor_dataframes(conn, dataframes, foreign_keys, index_fts): 239 | lookup_tables = {} 240 | for column, (table_name, value_column) in foreign_keys.items(): 241 | # Now apply this to the dataframes 242 | for dataframe in dataframes: 243 | if column in dataframe.columns: 244 | lookup_table = lookup_tables.get(table_name) 245 | if lookup_table is None: 246 | lookup_table = LookupTable( 247 | conn=conn, 248 | table_name=table_name, 249 | value_column=value_column, 250 | index_fts=index_fts, 251 | ) 252 | lookup_tables[table_name] = lookup_table 253 | dataframe[column] = dataframe[column].apply(lookup_table.id_for_value) 254 | return dataframes 255 | 256 | 257 | def table_exists(conn, table): 258 | return conn.execute( 259 | """ 260 | select count(*) from sqlite_master 261 | where type="table" and name=? 262 | """, 263 | [table], 264 | ).fetchone()[0] 265 | 266 | 267 | def drop_table(conn, table): 268 | conn.execute("DROP TABLE [{}]".format(table)) 269 | 270 | 271 | def get_create_table_sql( 272 | table_name, df, index=True, sql_type_overrides=None, primary_keys=None 273 | ): 274 | # Create a temporary table with just the first row 275 | # We do this in memory because we just want to get the 276 | # CREATE TABLE statement 277 | # Returns (sql, columns) 278 | conn = sqlite3.connect(":memory:") 279 | # Before calling to_sql we need correct the dtypes that we will be using 280 | # to pick the right SQL column types. pandas mostly gets this right... 281 | # except for columns that contain a mixture of integers and Nones. These 282 | # will be incorrectly detected as being of DB type REAL when we want them 283 | # to be INTEGER instead. 284 | # http://pandas.pydata.org/pandas-docs/stable/gotchas.html#support-for-integer-na 285 | sql_type_overrides = sql_type_overrides or {} 286 | if isinstance(df, pd.DataFrame): 287 | columns_and_types = df.dtypes.items() 288 | elif isinstance(df, pd.Series): 289 | columns_and_types = [(df.name, df.dtype)] 290 | for column, dtype in columns_and_types: 291 | # Are any of these float columns? 
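        # (pandas stores an integer column containing NaN as float64, so CSV
        # values like 1,2,,4 arrive here as [1.0, 2.0, nan, 4.0]; the check
        # below detects that case so the column can be declared INTEGER)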
292 | if dtype in (np.float32, np.float64): 293 | # if every non-NaN value is an integer, switch to int 294 | if isinstance(df, pd.Series): 295 | series = df 296 | else: 297 | series = df[column] 298 | num_non_integer_floats = series.map( 299 | lambda v: not np.isnan(v) and not v.is_integer() 300 | ).sum() 301 | if num_non_integer_floats == 0: 302 | # Everything was NaN or an integer-float - switch type: 303 | sql_type_overrides[column] = "INTEGER" 304 | 305 | df[:1].to_sql(table_name, conn, index=index, dtype=sql_type_overrides) 306 | sql = conn.execute( 307 | "select sql from sqlite_master where name = ?", [table_name] 308 | ).fetchone()[0] 309 | columns = [ 310 | row[1] for row in conn.execute("PRAGMA table_info([{}])".format(table_name)) 311 | ] 312 | if primary_keys: 313 | # Rewrite SQL to add PRIMARY KEY (col1, col2) at end 314 | assert sql[-1] == ")" 315 | sql = sql[:-1] + " ,PRIMARY KEY ({cols})\n)".format( 316 | cols=", ".join("[{}]".format(col) for col in primary_keys) 317 | ) 318 | return sql, columns 319 | 320 | 321 | def to_sql_with_foreign_keys( 322 | conn, 323 | df, 324 | name, 325 | foreign_keys, 326 | sql_type_overrides=None, 327 | primary_keys=None, 328 | index_fks=False, 329 | ): 330 | create_sql, columns = get_create_table_sql( 331 | name, 332 | df, 333 | index=False, 334 | primary_keys=primary_keys, 335 | sql_type_overrides=sql_type_overrides, 336 | ) 337 | foreign_key_bits = [] 338 | index_bits = [] 339 | for column, (table, value_column) in foreign_keys.items(): 340 | if column in columns: 341 | foreign_key_bits.append( 342 | 'FOREIGN KEY ("{}") REFERENCES [{}](id)'.format(column, table) 343 | ) 344 | if index_fks: 345 | index_bits.append( 346 | # CREATE INDEX indexname ON table(column); 347 | 'CREATE INDEX ["{}_{}"] ON [{}]("{}");'.format( 348 | name, column, name, column 349 | ) 350 | ) 351 | 352 | foreign_key_sql = ",\n ".join(foreign_key_bits) 353 | if foreign_key_sql: 354 | create_sql = "{},\n{});".format(create_sql.strip().rstrip(")"), foreign_key_sql) 355 | if index_bits: 356 | create_sql += "\n" + "\n".join(index_bits) 357 | conn.executescript(create_sql) 358 | # Now that we have created the table, insert the rows: 359 | df.to_sql(df.table_name, conn, if_exists="append", index=False) 360 | 361 | 362 | def best_fts_version(): 363 | "Discovers the most advanced supported SQLite FTS version" 364 | conn = sqlite3.connect(":memory:") 365 | for fts in ("FTS5", "FTS4", "FTS3"): 366 | try: 367 | conn.execute("CREATE VIRTUAL TABLE v USING {} (t);".format(fts)) 368 | return fts 369 | except sqlite3.OperationalError: 370 | continue 371 | return None 372 | 373 | 374 | def generate_and_populate_fts(conn, created_tables, cols, foreign_keys): 375 | fts_version = best_fts_version() 376 | sql = [] 377 | fts_cols = ", ".join('"{}"'.format(c) for c in cols) 378 | for table in created_tables: 379 | sql.append( 380 | 'CREATE VIRTUAL TABLE "{content_table}_fts" USING {fts_version} ({cols}, content="{content_table}")'.format( 381 | cols=fts_cols, content_table=table, fts_version=fts_version 382 | ) 383 | ) 384 | if not foreign_keys: 385 | # Select is simple: 386 | select = "SELECT rowid, {cols} FROM [{content_table}]".format( 387 | cols=fts_cols, content_table=table 388 | ) 389 | else: 390 | # Select is complicated: 391 | # select 392 | # county, precinct, office.value, district.value, 393 | # party.value, candidate.value, votes 394 | # from content_table 395 | # left join office on content_table.office = office.id 396 | # left join district on content_table.district = district.id 
397 | # left join party on content_table.party = party.id 398 | # left join candidate on content_table.candidate = candidate.id 399 | # order by content_table.rowid 400 | select_cols = [] 401 | joins = [] 402 | table_seen_count = {} 403 | for col in cols: 404 | if col in foreign_keys: 405 | other_table, label_column = foreign_keys[col] 406 | seen_count = table_seen_count.get(other_table, 0) + 1 407 | table_seen_count[other_table] = seen_count 408 | alias = "" 409 | if seen_count > 1: 410 | alias = "table_alias_{}_{}".format( 411 | hashlib.md5(other_table.encode("utf8")).hexdigest(), 412 | seen_count, 413 | ) 414 | select_cols.append( 415 | '[{}]."{}"'.format(alias or other_table, label_column) 416 | ) 417 | joins.append( 418 | 'left join [{other_table}] {alias} on [{table}]."{column}" = [{alias_or_other_table}].id'.format( 419 | other_table=other_table, 420 | alias_or_other_table=alias or other_table, 421 | alias=alias, 422 | table=table, 423 | column=col, 424 | ) 425 | ) 426 | else: 427 | select_cols.append('"{}"'.format(col)) 428 | select = "SELECT [{content_table}].rowid, {select_cols} FROM [{content_table}] {joins}".format( 429 | select_cols=", ".join("{}".format(c) for c in select_cols), 430 | content_table=table, 431 | joins="\n".join(joins), 432 | ) 433 | sql.append( 434 | 'INSERT INTO "{content_table}_fts" (rowid, {cols}) {select}'.format( 435 | cols=fts_cols, content_table=table, select=select 436 | ) 437 | ) 438 | conn.executescript(";\n".join(sql)) 439 | 440 | 441 | type_re = re.compile(r"\((real|integer|text|blob|numeric)\)$", re.I) 442 | 443 | 444 | def parse_shape(shape): 445 | # Shape is format 'county:Cty,votes:Vts(REAL)' 446 | defs = [b.strip() for b in shape.split(",")] 447 | defns = [] 448 | for defn in defs: 449 | # Is there a type defined? 450 | type_override = None 451 | m = type_re.search(defn) 452 | if m: 453 | type_override = m.group(1) 454 | defn = type_re.sub("", defn) 455 | # In Python 2 type_override needs to be a bytestring 456 | if six.PY2: 457 | type_override = str(type_override) 458 | # Is this a rename? 
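        # (e.g. "votes:Vts" renames CSV column "votes" to DB column "Vts",
        # while a bare "county" keeps the same name for both)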
459 | if ":" in defn: 460 | csv_name, db_name = defn.split(":", 1) 461 | else: 462 | csv_name, db_name = defn, defn 463 | defns.append( 464 | {"csv_name": csv_name, "db_name": db_name, "type_override": type_override} 465 | ) 466 | return defns 467 | 468 | 469 | def apply_shape(df, shape): 470 | # Shape is format 'county:Cty,votes:Vts(REAL)' 471 | # Applies changes in place, returns dtype= arg for to_sql 472 | if not shape: 473 | return None 474 | defns = parse_shape(shape) 475 | # Drop any columns we don't want 476 | cols_to_keep = [d["csv_name"] for d in defns] 477 | cols_to_drop = [c for c in df.columns if c not in cols_to_keep] 478 | if cols_to_drop: 479 | df.drop(cols_to_drop, axis=1, inplace=True) 480 | # Apply column renames 481 | renames = { 482 | d["csv_name"]: d["db_name"] for d in defns if d["csv_name"] != d["db_name"] 483 | } 484 | if renames: 485 | df.rename(columns=renames, inplace=True) 486 | # Return type overrides, if any 487 | return {d["db_name"]: d["type_override"] for d in defns if d["type_override"]} 488 | 489 | 490 | def add_index(conn, table_name, index): 491 | columns_to_index = [b.strip() for b in index.split(",")] 492 | # Figure out columns in table so we can sanity check this 493 | cursor = conn.execute("select * from [{}] limit 0".format(table_name)) 494 | columns = [r[0] for r in cursor.description] 495 | if all([(c in columns) for c in columns_to_index]): 496 | sql = 'CREATE INDEX ["{}_{}"] ON [{}]("{}");'.format( 497 | table_name, 498 | "_".join(columns_to_index), 499 | table_name, 500 | '", "'.join(columns_to_index), 501 | ) 502 | conn.execute(sql) 503 | 504 | 505 | def apply_dates_and_datetimes(df, date_cols, datetime_cols, datetime_formats): 506 | def parse_datetime(datestring, force_date=False): 507 | if pd.isnull(datestring): 508 | return datestring 509 | dt = dateparser.parse(datestring, date_formats=datetime_formats) 510 | if force_date: 511 | return dt.date().isoformat() 512 | else: 513 | return dt.isoformat() 514 | 515 | for date_col in date_cols: 516 | df[date_col] = df[date_col].apply(lambda s: parse_datetime(s, force_date=True)) 517 | for datetime_col in datetime_cols: 518 | df[datetime_col] = df[datetime_col].apply(parse_datetime) 519 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | 4 | [bdist_wheel] 5 | universal=1 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import io 3 | import os 4 | 5 | VERSION = "1.3.1" 6 | 7 | 8 | def get_long_description(): 9 | with io.open( 10 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "README.md"), 11 | encoding="utf8", 12 | ) as fp: 13 | return fp.read() 14 | 15 | 16 | setup( 17 | name="csvs_to_sqlite", 18 | description="Convert CSV files into a SQLite database", 19 | long_description=get_long_description(), 20 | long_description_content_type="text/markdown", 21 | author="Simon Willison", 22 | version=VERSION, 23 | license="Apache License, Version 2.0", 24 | packages=find_packages(), 25 | install_requires=[ 26 | "click>=7.0", 27 | "dateparser>=1.0", 28 | "pandas>=1.0", 29 | "py-lru-cache~=0.1.4", 30 | "six", 31 | ], 32 | extras_require={"test": ["pytest", "cogapp"]}, 33 | entry_points=""" 34 | [console_scripts] 35 | csvs-to-sqlite=csvs_to_sqlite.cli:cli 36 | """, 
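    # The console_scripts entry point above is what creates the
    # `csvs-to-sqlite` command, wiring it to cli() in csvs_to_sqlite/cli.py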
37 |     url="https://github.com/simonw/csvs-to-sqlite", 38 |     classifiers=[ 39 |         "Intended Audience :: Developers", 40 |         "Intended Audience :: Science/Research", 41 |         "Intended Audience :: End Users/Desktop", 42 |         "Topic :: Database", 43 |         "License :: OSI Approved :: Apache Software License", 44 |         "Programming Language :: Python :: 3.9", 45 |         "Programming Language :: Python :: 3.10", 46 |         "Programming Language :: Python :: 3.11", 47 |         "Programming Language :: Python :: 3.12", 48 |         "Programming Language :: Python :: 3.13", 49 |     ], 50 | ) 51 | -------------------------------------------------------------------------------- /tests/test_csvs_to_sqlite.py: -------------------------------------------------------------------------------- 1 | from click.testing import CliRunner 2 | from csvs_to_sqlite import cli 3 | from six import string_types, text_type 4 | from cogapp import Cog 5 | import sys 6 | from io import StringIO 7 | import pathlib 8 | import sqlite3 9 | 10 | CSV = """county,precinct,office,district,party,candidate,votes 11 | Yolo,100001,President,,LIB,Gary Johnson,41 12 | Yolo,100001,President,,PAF,Gloria Estela La Riva,8 13 | Yolo,100001,Proposition 51,,,No,398 14 | Yolo,100001,Proposition 51,,,Yes,460 15 | Yolo,100001,State Assembly,7,DEM,Kevin McCarty,572 16 | Yolo,100001,State Assembly,7,REP,Ryan K. Brown,291""" 17 | 18 | CSV_MULTI = """film,actor_1,actor_2 19 | The Rock,Sean Connery,Nicolas Cage 20 | National Treasure,Nicolas Cage,Diane Kruger 21 | Troy,Diane Kruger,Orlando Bloom""" 22 | 23 | CSV_DATES = """headline,date,datetime 24 | First headline,3rd May 2017,10pm on April 4 1938 25 | Second headline,04/30/2005,5:45 10 December 2009""" 26 | 27 | CSV_DATES_CUSTOM_FORMAT = """headline,date 28 | Custom format,03/02/01""" 29 | 30 | CSV_CUSTOM_PRIMARY_KEYS = """pk1,pk2,name 31 | one,one,11 32 | one,two,12 33 | two,one,21""" 34 | 35 | CSV_STRINGS_AND_DATES = """name,gross,release_date 36 | Adaptation,22.5,6 of December in the year 2002 37 | Face/Off,245.7,19 of June in the year 1997 38 | The Rock,134.1,9 of June in the year 1996""" 39 | 40 | 41 | def test_flat(): 42 |     runner = CliRunner() 43 |     with runner.isolated_filesystem(): 44 |         open("test.csv", "w").write(CSV) 45 |         result = runner.invoke(cli.cli, ["test.csv", "test.db"]) 46 |         assert result.exit_code == 0 47 |         assert result.output.strip().endswith("Created test.db from 1 CSV file") 48 |         conn = sqlite3.connect("test.db") 49 |         assert [ 50 |             (0, "county", "TEXT", 0, None, 0), 51 |             (1, "precinct", "INTEGER", 0, None, 0), 52 |             (2, "office", "TEXT", 0, None, 0), 53 |             (3, "district", "INTEGER", 0, None, 0), 54 |             (4, "party", "TEXT", 0, None, 0), 55 |             (5, "candidate", "TEXT", 0, None, 0), 56 |             (6, "votes", "INTEGER", 0, None, 0), 57 |         ] == list(conn.execute("PRAGMA table_info(test)")) 58 |         rows = conn.execute("select * from test").fetchall() 59 |         assert [ 60 |             ("Yolo", 100001, "President", None, "LIB", "Gary Johnson", 41), 61 |             ("Yolo", 100001, "President", None, "PAF", "Gloria Estela La Riva", 8), 62 |             ("Yolo", 100001, "Proposition 51", None, None, "No", 398), 63 |             ("Yolo", 100001, "Proposition 51", None, None, "Yes", 460), 64 |             ("Yolo", 100001, "State Assembly", 7, "DEM", "Kevin McCarty", 572), 65 |             ("Yolo", 100001, "State Assembly", 7, "REP", "Ryan K.
Brown", 291), 66 | ] == rows 67 | last_row = rows[-1] 68 | for i, t in enumerate( 69 | (string_types, int, string_types, int, string_types, string_types, int) 70 | ): 71 | assert isinstance(last_row[i], t) 72 | 73 | 74 | def test_extract_columns(): 75 | runner = CliRunner() 76 | with runner.isolated_filesystem(): 77 | open("test.csv", "w").write(CSV) 78 | result = runner.invoke( 79 | cli.cli, 80 | "test.csv extracted.db -c office -c district -c party -c candidate".split(), 81 | ) 82 | assert result.exit_code == 0 83 | assert result.output.strip().endswith("Created extracted.db from 1 CSV file") 84 | conn = sqlite3.connect("extracted.db") 85 | assert [ 86 | (0, "county", "TEXT", 0, None, 0), 87 | (1, "precinct", "INTEGER", 0, None, 0), 88 | (2, "office", "INTEGER", 0, None, 0), 89 | (3, "district", "INTEGER", 0, None, 0), 90 | (4, "party", "INTEGER", 0, None, 0), 91 | (5, "candidate", "INTEGER", 0, None, 0), 92 | (6, "votes", "INTEGER", 0, None, 0), 93 | ] == list(conn.execute("PRAGMA table_info(test)")) 94 | rows = conn.execute( 95 | """ 96 | select 97 | county, precinct, office.value, district.value, 98 | party.value, candidate.value, votes 99 | from test 100 | left join office on test.office = office.id 101 | left join district on test.district = district.id 102 | left join party on test.party = party.id 103 | left join candidate on test.candidate = candidate.id 104 | order by test.rowid 105 | """ 106 | ).fetchall() 107 | assert [ 108 | ("Yolo", 100001, "President", None, "LIB", "Gary Johnson", 41), 109 | ("Yolo", 100001, "President", None, "PAF", "Gloria Estela La Riva", 8), 110 | ("Yolo", 100001, "Proposition 51", None, None, "No", 398), 111 | ("Yolo", 100001, "Proposition 51", None, None, "Yes", 460), 112 | ("Yolo", 100001, "State Assembly", "7", "DEM", "Kevin McCarty", 572), 113 | ("Yolo", 100001, "State Assembly", "7", "REP", "Ryan K. Brown", 291), 114 | ] == rows 115 | last_row = rows[-1] 116 | for i, t in enumerate( 117 | ( 118 | string_types, 119 | int, 120 | string_types, 121 | string_types, 122 | string_types, 123 | string_types, 124 | int, 125 | ) 126 | ): 127 | assert isinstance(last_row[i], t) 128 | 129 | # Check that the various foreign key tables have the right things in them 130 | assert [ 131 | (1, "President"), 132 | (2, "Proposition 51"), 133 | (3, "State Assembly"), 134 | ] == conn.execute("select * from office").fetchall() 135 | assert [(1, "7")] == conn.execute("select * from district").fetchall() 136 | assert [(1, "LIB"), (2, "PAF"), (3, "DEM"), (4, "REP")] == conn.execute( 137 | "select * from party" 138 | ).fetchall() 139 | assert [ 140 | (1, "Gary Johnson"), 141 | (2, "Gloria Estela La Riva"), 142 | (3, "No"), 143 | (4, "Yes"), 144 | (5, "Kevin McCarty"), 145 | (6, "Ryan K. 
Brown"), 146 | ] == conn.execute("select * from candidate").fetchall() 147 | 148 | # Check that a FTS index was created for each extracted table 149 | fts_tables = [ 150 | r[0] 151 | for r in conn.execute( 152 | """ 153 | select name from sqlite_master 154 | where type='table' and name like '%_fts' 155 | and sql like '%USING FTS%' 156 | """ 157 | ).fetchall() 158 | ] 159 | assert set(fts_tables) == { 160 | "office_value_fts", 161 | "district_value_fts", 162 | "party_value_fts", 163 | "candidate_value_fts", 164 | } 165 | 166 | 167 | def test_fts(): 168 | runner = CliRunner() 169 | with runner.isolated_filesystem(): 170 | open("test.csv", "w").write(CSV) 171 | result = runner.invoke( 172 | cli.cli, "test.csv fts.db -f office -f party -f candidate".split() 173 | ) 174 | assert result.exit_code == 0 175 | conn = sqlite3.connect("fts.db") 176 | assert ( 177 | [("Yolo", 100001, "President", "PAF", "Gloria Estela La Riva")] 178 | == conn.execute( 179 | """ 180 | select county, precinct, office, party, candidate 181 | from test 182 | where rowid in ( 183 | select rowid from test_fts 184 | where test_fts match 'president gloria' 185 | ) 186 | """ 187 | ).fetchall() 188 | ) 189 | 190 | 191 | def test_fts_error_on_missing_columns(): 192 | runner = CliRunner() 193 | with runner.isolated_filesystem(): 194 | open("test.csv", "w").write(CSV) 195 | result = runner.invoke(cli.cli, "test.csv fts.db -f badcolumn".split()) 196 | assert result.exit_code != 0 197 | assert result.output.strip().endswith('FTS column "badcolumn" does not exist') 198 | 199 | 200 | def test_fts_and_extract_columns(): 201 | runner = CliRunner() 202 | with runner.isolated_filesystem(): 203 | open("test.csv", "w").write(CSV) 204 | result = runner.invoke( 205 | cli.cli, 206 | ( 207 | "test.csv fts-extracted.db -c office -c party -c candidate " 208 | "-f party -f candidate" 209 | ).split(), 210 | ) 211 | assert result.exit_code == 0 212 | conn = sqlite3.connect("fts-extracted.db") 213 | assert ( 214 | [("Yolo", 100001, "President", "PAF", "Gloria Estela La Riva")] 215 | == conn.execute( 216 | """ 217 | select 218 | county, precinct, office.value, party.value, candidate.value 219 | from test 220 | left join office on test.office = office.id 221 | left join party on test.party = party.id 222 | left join candidate on test.candidate = candidate.id 223 | where test.rowid in ( 224 | select rowid from test_fts 225 | where test_fts match 'paf gloria' 226 | ) 227 | """ 228 | ).fetchall() 229 | ) 230 | 231 | 232 | def test_fts_one_column_multiple_aliases(): 233 | runner = CliRunner() 234 | with runner.isolated_filesystem(): 235 | open("test.csv", "w").write(CSV_MULTI) 236 | result = runner.invoke( 237 | cli.cli, 238 | ( 239 | "test.csv fts-extracted.db -c film " 240 | "-c actor_1:actors:name -c actor_2:actors:name " 241 | "-f film -f actor_1 -f actor_2" 242 | ).split(), 243 | ) 244 | assert result.exit_code == 0 245 | conn = sqlite3.connect("fts-extracted.db") 246 | assert ( 247 | [ 248 | ("The Rock", "Sean Connery", "Nicolas Cage"), 249 | ("National Treasure", "Nicolas Cage", "Diane Kruger"), 250 | ("Troy", "Diane Kruger", "Orlando Bloom"), 251 | ] 252 | == conn.execute( 253 | """ 254 | select 255 | film.value, a1.name, a2.name 256 | from test 257 | join film on test.film = film.id 258 | join actors a1 on test.actor_1 = a1.id 259 | join actors a2 on test.actor_2 = a2.id 260 | """ 261 | ).fetchall() 262 | ) 263 | assert ( 264 | [ 265 | ("National Treasure", "Nicolas Cage", "Diane Kruger"), 266 | ("Troy", "Diane Kruger", "Orlando Bloom"), 267 | ] 
268 | == conn.execute( 269 | """ 270 | select 271 | film.value, a1.name, a2.name 272 | from test 273 | join film on test.film = film.id 274 | join actors a1 on test.actor_1 = a1.id 275 | join actors a2 on test.actor_2 = a2.id 276 | where test.rowid in ( 277 | select rowid from [test_fts] where [test_fts] match 'kruger' 278 | ) 279 | """ 280 | ).fetchall() 281 | ) 282 | 283 | 284 | def test_shape(): 285 | runner = CliRunner() 286 | with runner.isolated_filesystem(): 287 | open("test.csv", "w").write(CSV) 288 | result = runner.invoke( 289 | cli.cli, 290 | ["test.csv", "test-reshaped.db", "--shape", "county:Cty,votes:Vts(REAL)"], 291 | ) 292 | assert result.exit_code == 0 293 | conn = sqlite3.connect("test-reshaped.db") 294 | # Check we only have Cty and Vts columns: 295 | assert [ 296 | (0, "Cty", "TEXT", 0, None, 0), 297 | (1, "Vts", "REAL", 0, None, 0), 298 | ] == conn.execute("PRAGMA table_info(test);").fetchall() 299 | # Now check that values are as expected: 300 | results = conn.execute( 301 | """ 302 | select Cty, Vts from test 303 | """ 304 | ).fetchall() 305 | assert [ 306 | ("Yolo", 41.0), 307 | ("Yolo", 8.0), 308 | ("Yolo", 398.0), 309 | ("Yolo", 460.0), 310 | ("Yolo", 572.0), 311 | ("Yolo", 291.0), 312 | ] == results 313 | for city, votes in results: 314 | assert isinstance(city, text_type) 315 | assert isinstance(votes, float) 316 | 317 | 318 | def test_filename_column(): 319 | runner = CliRunner() 320 | with runner.isolated_filesystem(): 321 | open("test1.csv", "w").write(CSV) 322 | open("test2.csv", "w").write(CSV_MULTI) 323 | result = runner.invoke( 324 | cli.cli, [".", "test-filename.db", "--filename-column", "source"] 325 | ) 326 | assert result.exit_code == 0 327 | conn = sqlite3.connect("test-filename.db") 328 | assert [("./test1",), ("./test2",)] == conn.execute( 329 | "select name from sqlite_master order by name" 330 | ).fetchall() 331 | # Check the source column has been added and populated 332 | assert [("Yolo", "Gary Johnson", 41, "./test1")] == conn.execute( 333 | "select county, candidate, votes, source from [./test1] limit 1" 334 | ).fetchall() 335 | assert [ 336 | ("The Rock", "Sean Connery", "Nicolas Cage", "./test2") 337 | ] == conn.execute( 338 | "select film, actor_1, actor_2, source from [./test2] limit 1" 339 | ).fetchall() 340 | 341 | 342 | def test_filename_column_with_shape(): 343 | runner = CliRunner() 344 | with runner.isolated_filesystem(): 345 | open("test.csv", "w").write(CSV) 346 | result = runner.invoke( 347 | cli.cli, 348 | [ 349 | "test.csv", 350 | "test.db", 351 | "--filename-column", 352 | "source", 353 | "--shape", 354 | "county:Cty,votes:Vts", 355 | ], 356 | ) 357 | assert result.exit_code == 0 358 | conn = sqlite3.connect("test.db") 359 | assert [("Yolo", 41, "test")] == conn.execute( 360 | "select Cty, Vts, source from test limit 1" 361 | ).fetchall() 362 | 363 | 364 | def test_fixed_column(): 365 | """ 366 | Tests that all three fixed_column options are handled correctly. 
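    Covers --fixed-column (text), --fixed-column-int and --fixed-column-float.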
367 | """ 368 | runner = CliRunner() 369 | with runner.isolated_filesystem(): 370 | open("test.csv", "w").write(CSV) 371 | result = runner.invoke( 372 | cli.cli, 373 | [ 374 | "test.csv", 375 | "test.db", 376 | "--fixed-column", 377 | "col1", 378 | "foo", 379 | "--fixed-column", 380 | "col2", 381 | "bar", 382 | "--fixed-column-int", 383 | "col3", 384 | "1", 385 | "--fixed-column-float", 386 | "col4", 387 | "1.1", 388 | ], 389 | ) 390 | assert result.exit_code == 0 391 | assert result.output.strip().endswith("Created test.db from 1 CSV file") 392 | conn = sqlite3.connect("test.db") 393 | assert [ 394 | (0, "county", "TEXT", 0, None, 0), 395 | (1, "precinct", "INTEGER", 0, None, 0), 396 | (2, "office", "TEXT", 0, None, 0), 397 | (3, "district", "INTEGER", 0, None, 0), 398 | (4, "party", "TEXT", 0, None, 0), 399 | (5, "candidate", "TEXT", 0, None, 0), 400 | (6, "votes", "INTEGER", 0, None, 0), 401 | (7, "col1", "TEXT", 0, None, 0), 402 | (8, "col2", "TEXT", 0, None, 0), 403 | (9, "col3", "INTEGER", 0, None, 0), 404 | (10, "col4", "REAL", 0, None, 0), 405 | ] == list(conn.execute("PRAGMA table_info(test)")) 406 | rows = conn.execute("select * from test").fetchall() 407 | assert [ 408 | ( 409 | "Yolo", 410 | 100001, 411 | "President", 412 | None, 413 | "LIB", 414 | "Gary Johnson", 415 | 41, 416 | "foo", 417 | "bar", 418 | 1, 419 | 1.1, 420 | ), 421 | ( 422 | "Yolo", 423 | 100001, 424 | "President", 425 | None, 426 | "PAF", 427 | "Gloria Estela La Riva", 428 | 8, 429 | "foo", 430 | "bar", 431 | 1, 432 | 1.1, 433 | ), 434 | ( 435 | "Yolo", 436 | 100001, 437 | "Proposition 51", 438 | None, 439 | None, 440 | "No", 441 | 398, 442 | "foo", 443 | "bar", 444 | 1, 445 | 1.1, 446 | ), 447 | ( 448 | "Yolo", 449 | 100001, 450 | "Proposition 51", 451 | None, 452 | None, 453 | "Yes", 454 | 460, 455 | "foo", 456 | "bar", 457 | 1, 458 | 1.1, 459 | ), 460 | ( 461 | "Yolo", 462 | 100001, 463 | "State Assembly", 464 | 7, 465 | "DEM", 466 | "Kevin McCarty", 467 | 572, 468 | "foo", 469 | "bar", 470 | 1, 471 | 1.1, 472 | ), 473 | ( 474 | "Yolo", 475 | 100001, 476 | "State Assembly", 477 | 7, 478 | "REP", 479 | "Ryan K. Brown", 480 | 291, 481 | "foo", 482 | "bar", 483 | 1, 484 | 1.1, 485 | ), 486 | ] == rows 487 | 488 | 489 | def test_fixed_column_with_shape(): 490 | """ 491 | Test that fixed_column works with shape. 
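    The fixed columns should come through alongside the reshaped Cty and Vts columns.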
492 | """ 493 | runner = CliRunner() 494 | with runner.isolated_filesystem(): 495 | open("test.csv", "w").write(CSV) 496 | result = runner.invoke( 497 | cli.cli, 498 | [ 499 | "test.csv", 500 | "test.db", 501 | "--fixed-column", 502 | "col1", 503 | "foo", 504 | "--fixed-column", 505 | "col2", 506 | "bar", 507 | "--shape", 508 | "county:Cty,votes:Vts", 509 | ], 510 | ) 511 | assert result.exit_code == 0 512 | conn = sqlite3.connect("test.db") 513 | assert [("Yolo", 41, "foo", "bar")] == conn.execute( 514 | "select Cty, Vts, col1, col2 from test limit 1" 515 | ).fetchall() 516 | 517 | 518 | def test_shape_with_extract_columns(): 519 | runner = CliRunner() 520 | with runner.isolated_filesystem(): 521 | open("test.csv", "w").write(CSV) 522 | result = runner.invoke( 523 | cli.cli, 524 | [ 525 | "test.csv", 526 | "test.db", 527 | "--filename-column", 528 | "Source", 529 | "--shape", 530 | "county:Cty,votes:Vts", 531 | "-c", 532 | "Cty", 533 | "-c", 534 | "Vts", 535 | "-c", 536 | "Source", 537 | ], 538 | ) 539 | assert result.exit_code == 0 540 | conn = sqlite3.connect("test.db") 541 | assert ( 542 | [("Yolo", "41", "test")] 543 | == conn.execute( 544 | """ 545 | select 546 | Cty.value, Vts.value, Source.value 547 | from test 548 | left join Cty on test.Cty = Cty.id 549 | left join Vts on test.Vts = Vts.id 550 | left join Source on test.Source = Source.id 551 | limit 1 552 | """ 553 | ).fetchall() 554 | ) 555 | 556 | 557 | def test_custom_indexes(): 558 | runner = CliRunner() 559 | with runner.isolated_filesystem(): 560 | open("test.csv", "w").write(CSV) 561 | result = runner.invoke( 562 | cli.cli, 563 | ["test.csv", "test.db", "--index", "county", "-i", "party,candidate"], 564 | ) 565 | assert result.exit_code == 0 566 | conn = sqlite3.connect("test.db") 567 | assert [ 568 | ('"test_county"', "test"), 569 | ('"test_party_candidate"', "test"), 570 | ] == conn.execute( 571 | 'select name, tbl_name from sqlite_master where type = "index" order by name' 572 | ).fetchall() 573 | 574 | 575 | def test_dates_and_datetimes(): 576 | runner = CliRunner() 577 | with runner.isolated_filesystem(): 578 | open("test.csv", "w").write(CSV_DATES) 579 | result = runner.invoke( 580 | cli.cli, ["test.csv", "test.db", "-d", "date", "-dt", "datetime"] 581 | ) 582 | assert result.exit_code == 0 583 | conn = sqlite3.connect("test.db") 584 | expected = [ 585 | ("First headline", "2017-05-03", "1938-04-04T22:00:00"), 586 | ("Second headline", "2005-04-30", "2009-12-10T05:45:00"), 587 | ] 588 | actual = conn.execute("select * from test").fetchall() 589 | assert expected == actual 590 | 591 | 592 | def test_dates_custom_formats(): 593 | runner = CliRunner() 594 | with runner.isolated_filesystem(): 595 | open("test.csv", "w").write(CSV_DATES_CUSTOM_FORMAT) 596 | result = runner.invoke( 597 | cli.cli, ["test.csv", "test.db", "-d", "date", "-df", "%y/%d/%m"] 598 | ) 599 | assert result.exit_code == 0 600 | conn = sqlite3.connect("test.db") 601 | # Input was 03/02/01 602 | expected = [("Custom format", "2003-01-02")] 603 | actual = conn.execute("select * from test").fetchall() 604 | assert expected == actual 605 | 606 | 607 | def test_extract_cols_no_fts(): 608 | runner = CliRunner() 609 | with runner.isolated_filesystem(): 610 | open("test.csv", "w").write(CSV) 611 | result = runner.invoke( 612 | cli.cli, 613 | ( 614 | "test.csv fts-extracted.db -c office -c party -c candidate " 615 | "-f party -f candidate --no-fulltext-fks" 616 | ).split(), 617 | ) 618 | assert result.exit_code == 0 619 | conn = 
sqlite3.connect("fts-extracted.db") 620 | assert ( 621 | [("test_fts",)] 622 | == conn.execute( 623 | """ 624 | select name from sqlite_master 625 | where type='table' and name like '%_fts' 626 | and sql like '%USING FTS%' 627 | """ 628 | ).fetchall() 629 | ) 630 | 631 | 632 | def test_custom_primary_keys(): 633 | runner = CliRunner() 634 | with runner.isolated_filesystem(): 635 | open("pks.csv", "w").write(CSV_CUSTOM_PRIMARY_KEYS) 636 | result = runner.invoke( 637 | cli.cli, ("pks.csv pks.db -pk pk1 --primary-key pk2").split() 638 | ) 639 | assert result.exit_code == 0 640 | conn = sqlite3.connect("pks.db") 641 | pks = [ 642 | r[1] for r in conn.execute('PRAGMA table_info("pks")').fetchall() if r[-1] 643 | ] 644 | assert ["pk1", "pk2"] == pks 645 | 646 | 647 | def test_just_strings_default(): 648 | """ 649 | Just like test_flat(), except all columns are strings 650 | """ 651 | runner = CliRunner() 652 | with runner.isolated_filesystem(): 653 | open("test.csv", "w").write(CSV) 654 | result = runner.invoke( 655 | cli.cli, 656 | "test.csv just-strings.db --just-strings".split(), 657 | ) 658 | assert result.exit_code == 0 659 | 660 | conn = sqlite3.connect("just-strings.db") 661 | assert [ 662 | (0, "county", "TEXT", 0, None, 0), 663 | (1, "precinct", "TEXT", 0, None, 0), 664 | (2, "office", "TEXT", 0, None, 0), 665 | (3, "district", "TEXT", 0, None, 0), 666 | (4, "party", "TEXT", 0, None, 0), 667 | (5, "candidate", "TEXT", 0, None, 0), 668 | (6, "votes", "TEXT", 0, None, 0), 669 | ] == list(conn.execute("PRAGMA table_info(test)")) 670 | rows = conn.execute("select * from test").fetchall() 671 | assert [ 672 | ("Yolo", "100001", "President", None, "LIB", "Gary Johnson", "41"), 673 | ("Yolo", "100001", "President", None, "PAF", "Gloria Estela La Riva", "8"), 674 | ("Yolo", "100001", "Proposition 51", None, None, "No", "398"), 675 | ("Yolo", "100001", "Proposition 51", None, None, "Yes", "460"), 676 | ("Yolo", "100001", "State Assembly", "7", "DEM", "Kevin McCarty", "572"), 677 | ("Yolo", "100001", "State Assembly", "7", "REP", "Ryan K. 
Brown", "291"), 678 | ] == rows 679 | last_row = rows[-1] 680 | for i, t in enumerate( 681 | ( 682 | string_types, 683 | string_types, 684 | string_types, 685 | string_types, 686 | string_types, 687 | string_types, 688 | string_types, 689 | ) 690 | ): 691 | assert isinstance(last_row[i], t) 692 | 693 | 694 | def test_just_strings_with_shape(): 695 | """ 696 | Make sure shape and just_strings play well together 697 | """ 698 | runner = CliRunner() 699 | with runner.isolated_filesystem(): 700 | open("test.csv", "w").write(CSV) 701 | result = runner.invoke( 702 | cli.cli, 703 | [ 704 | "test.csv", 705 | "test-reshaped-strings.db", 706 | "--just-strings", 707 | "--shape", 708 | "county:Cty,district:district,votes:Vts(REAL)", 709 | ], 710 | ) 711 | assert result.exit_code == 0 712 | conn = sqlite3.connect("test-reshaped-strings.db") 713 | # Check that Cty, Vts exist as defined, and so does votetxt: 714 | assert [ 715 | (0, "Cty", "TEXT", 0, None, 0), 716 | (1, "district", "TEXT", 0, None, 0), 717 | (2, "Vts", "REAL", 0, None, 0), 718 | ] == conn.execute("PRAGMA table_info(test);").fetchall() 719 | # Now check that values are as expected: 720 | results = conn.execute( 721 | """ 722 | select Cty, Vts, district from test 723 | """ 724 | ).fetchall() 725 | assert [ 726 | ("Yolo", 41.0, None), 727 | ("Yolo", 8.0, None), 728 | ("Yolo", 398.0, None), 729 | ("Yolo", 460.0, None), 730 | ("Yolo", 572.0, "7"), 731 | ("Yolo", 291.0, "7"), 732 | ] == results 733 | for city, votes, district in results: 734 | assert isinstance(city, text_type) 735 | assert isinstance(votes, float) 736 | assert isinstance(district, text_type) or district is None 737 | 738 | 739 | def test_just_strings_with_date_specified(): 740 | runner = CliRunner() 741 | with runner.isolated_filesystem(): 742 | open("nic_cages_greatest.csv", "w").write(CSV_STRINGS_AND_DATES) 743 | result = runner.invoke( 744 | cli.cli, 745 | [ 746 | "nic_cages_greatest.csv", 747 | "movies.db", 748 | "--date", 749 | "release_date", 750 | "--datetime-format", 751 | "%d of %B in the year %Y", 752 | "--just-strings", 753 | ], 754 | ) 755 | assert result.exit_code == 0 756 | conn = sqlite3.connect("movies.db") 757 | expected = [ 758 | ("Adaptation", "22.5", "2002-12-06"), 759 | ("Face/Off", "245.7", "1997-06-19"), 760 | ("The Rock", "134.1", "1996-06-09"), 761 | ] 762 | actual = conn.execute("select * from nic_cages_greatest").fetchall() 763 | assert expected == actual 764 | 765 | for name, gross, dt in actual: 766 | assert isinstance(gross, text_type) 767 | 768 | 769 | def test_if_cog_needs_to_be_run(): 770 | _stdout = sys.stdout 771 | sys.stdout = StringIO() 772 | readme = pathlib.Path(__file__).parent.parent / "README.md" 773 | result = Cog().main(["cog", str(readme)]) 774 | output = sys.stdout.getvalue() 775 | sys.stdout = _stdout 776 | assert ( 777 | output == readme.read_text() 778 | ), "Run 'cog -r README.md' to update help in README" 779 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from csvs_to_sqlite import utils 2 | import pytest 3 | import sqlite3 4 | import pandas as pd 5 | 6 | TEST_TABLES = """ 7 | CREATE TABLE foo ( 8 | id integer primary key, 9 | value text 10 | ); 11 | """ 12 | 13 | 14 | @pytest.mark.parametrize("table,expected", [("foo", True), ("bar", False)]) 15 | def test_table_exists(table, expected): 16 | conn = sqlite3.connect(":memory:") 17 | conn.executescript(TEST_TABLES) 18 | assert expected == 
utils.table_exists(conn, table) 19 | 20 | 21 | def test_get_create_table_sql(): 22 | df = pd.DataFrame([{"number": 1, "letter": "a"}]) 23 | sql, columns = utils.get_create_table_sql("hello", df) 24 | assert ( 25 | 'CREATE TABLE "hello" (\n' 26 | '"index" INTEGER,\n' 27 | ' "number" INTEGER,\n' 28 | ' "letter" TEXT\n' 29 | ")" 30 | ) == sql 31 | assert {"index", "letter", "number"} == set(columns) 32 | 33 | 34 | def test_refactor_dataframes(): 35 | df = pd.DataFrame( 36 | [ 37 | {"name": "Terry", "score": 0.5}, 38 | {"name": "Terry", "score": 0.8}, 39 | {"name": "Owen", "score": 0.7}, 40 | ] 41 | ) 42 | conn = sqlite3.connect(":memory:") 43 | output = utils.refactor_dataframes( 44 | conn, [df], {"name": ("People", "first_name")}, False 45 | ) 46 | assert 1 == len(output) 47 | dataframe = output[0] 48 | # There should be a 'People' table in sqlite 49 | assert [(1, "Terry"), (2, "Owen")] == conn.execute( 50 | "select id, first_name from People" 51 | ).fetchall() 52 | assert ( 53 | " name score\n" "0 1 0.5\n" "1 1 0.8\n" "2 2 0.7" 54 | ) == str(dataframe) 55 | --------------------------------------------------------------------------------
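A minimal sketch of the --shape helpers defined in csvs_to_sqlite/utils.py, assuming the package and its dependencies are installed; the sample row is illustrative, not taken from the test fixtures:

    import pandas as pd
    from csvs_to_sqlite import utils

    df = pd.DataFrame([{"county": "Yolo", "votes": 41, "office": "President"}])
    # apply_shape() mutates df in place and returns the dtype= overrides for to_sql()
    dtype = utils.apply_shape(df, "county:Cty,votes:Vts(REAL)")
    print(list(df.columns))  # ['Cty', 'Vts'] - 'office' dropped, the rest renamed
    print(dtype)             # {'Vts': 'REAL'}

This mirrors what the CLI does when --shape is passed: the reshaped DataFrame is written out with to_sql(), using the returned dictionary as its dtype= argument.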