├── .github
│   └── workflows
│       ├── publish.yml
│       └── test.yml
├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── README.md
├── csvs_to_sqlite
│   ├── __init__.py
│   ├── cli.py
│   └── utils.py
├── setup.cfg
├── setup.py
└── tests
    ├── test_csvs_to_sqlite.py
    └── test_utils.py

/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish Python Package
2 |
3 | on:
4 |   release:
5 |     types: [created]
6 |
7 | permissions:
8 |   contents: read
9 |
10 | jobs:
11 |   test:
12 |     runs-on: ubuntu-latest
13 |     strategy:
14 |       matrix:
15 |         python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
16 |     steps:
17 |     - uses: actions/checkout@v4
18 |     - name: Set up Python ${{ matrix.python-version }}
19 |       uses: actions/setup-python@v5
20 |       with:
21 |         python-version: ${{ matrix.python-version }}
22 |         cache: pip
23 |         cache-dependency-path: setup.py
24 |     - name: Install dependencies
25 |       run: |
26 |         pip install -e '.[test]'
27 |     - name: Run tests
28 |       run: |
29 |         python -m pytest
30 |   deploy:
31 |     runs-on: ubuntu-latest
32 |     needs: [test]
33 |     environment: release
34 |     permissions:
35 |       id-token: write
36 |     steps:
37 |     - uses: actions/checkout@v4
38 |     - name: Set up Python
39 |       uses: actions/setup-python@v5
40 |       with:
41 |         python-version: "3.13"
42 |         cache: pip
43 |         cache-dependency-path: setup.py
44 |     - name: Install dependencies
45 |       run: |
46 |         pip install setuptools wheel build
47 |     - name: Build
48 |       run: |
49 |         python -m build
50 |     - name: Publish
51 |       uses: pypa/gh-action-pypi-publish@release/v1
52 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Test
2 |
3 | on: [push, pull_request]
4 |
5 | permissions:
6 |   contents: read
7 |
8 | jobs:
9 |   test:
10 |     runs-on: ubuntu-latest
11 |     strategy:
12 |       matrix:
13 |         python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
14 |     steps:
15 |     - uses: actions/checkout@v4
16 |     - name: Set up Python ${{ matrix.python-version }}
17 |       uses: actions/setup-python@v5
18 |       with:
19 |         python-version: ${{ matrix.python-version }}
20 |         cache: pip
21 |         cache-dependency-path: setup.py
22 |     - name: Install dependencies
23 |       run: |
24 |         pip install -e '.[test]'
25 |     - name: Run tests
26 |       run: |
27 |         python -m pytest
28 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | venv
2 | *.egg-info
3 | *.pyc
4 | .pytest_cache/
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | Changelog
2 | =========
3 |
4 | 0.9 (2019-01-16)
5 | ----------------
6 | - Support for loading CSVs directly from URLs, thanks @betatim - #38
7 | - New `-pk`/`--primary-key` options, closes #22
8 | - Create FTS index for extracted column values
9 | - Added `--no-fulltext-fks` option, closes #32
10 | - Now using black for code formatting
11 | - Bumped versions of dependencies
12 |
13 | 0.8.1 (2018-04-24)
14 | ------------------
15 | - Updated README and CHANGELOG, tweaked `--help` output
16 |
17 | 0.8 (2018-04-24)
18 | ----------------
19 | - `-d` and `-df` options for specifying date/datetime columns, closes #33
20 | - Maintain lookup tables in SQLite, refs #17
21 | - `--index` option to specify which columns to index, closes #24
22 | - Test confirming `--shape`, `--filename-column` and `-c` work together #25
23 | - Use `usecols` when loading CSV if `--shape` specified
24 | - `--filename-column` is now compatible with `--shape`, closes #10
25 | - `--no-index-fks` option
26 |
27 |   By default, csvs-to-sqlite creates an index for every foreign key column that is
28 |   added using the `--extract-column` option.
29 |
30 |   For large tables, this can dramatically increase the size of the resulting
31 |   database file on disk. The new `--no-index-fks` option allows you to disable
32 |   this feature to save on file size.
33 |
34 |   Refs #24 which will allow you to explicitly list which columns SHOULD have
35 |   an index created.
36 | - Added `--filename-column` option, refs #10
37 | - Fixes for Python 2, refs #25
38 | - Implemented new `--shape` option - refs #25
39 | - `--table` option for specifying table to write to, refs #10
40 | - Updated README to cover `--skip-errors`, refs #20
41 | - Add `--skip-errors` option (#20) [Jani Monoses]
42 | - Less verbosity (#19) [Jani Monoses]
43 |
44 |   Only log `extract_columns` info when that option is passed.
45 | - Add option for field quoting behaviour (#15) [Jani Monoses]
46 |
47 |
48 | 0.7 (2017-11-25)
49 | ----------------
50 | - Add `-s` option to specify input field separator (#13) [Jani Monoses]
51 |
52 |
53 | 0.6.1 (2017-11-24)
54 | ------------------
55 | - `-f` and `-c` now work for a single table with multiple columns.
56 |
57 |   Fixes #12
58 |
59 | 0.6 (2017-11-24)
60 | ----------------
61 | - `--fts` and `--extract-column` now cooperate.
62 |
63 |   If you extract a column and then specify that same column in the `--fts` list,
64 |   `csvs-to-sqlite` now uses the original value of that column in the index.
65 |
66 |   Example using CSV from https://data.sfgov.org/City-Infrastructure/Street-Tree-List/tkzw-k3nq
67 |
68 |       csvs-to-sqlite Street_Tree_List.csv trees-fts.db \
69 |           -c qLegalStatus -c qSpecies -c qSiteInfo \
70 |           -c PlantType -c qCaretaker -c qCareAssistant \
71 |           -f qLegalStatus -f qSpecies -f qAddress \
72 |           -f qSiteInfo -f PlantType -f qCaretaker \
73 |           -f qCareAssistant -f PermitNotes
74 |
75 |   Closes #9
76 | - Added `--fts` option for setting up SQLite full-text search.
77 |
78 |   The `--fts` option will create a corresponding SQLite FTS virtual table, using
79 |   the best available version of the FTS module.
80 |
81 |   https://sqlite.org/fts5.html
82 |   https://www.sqlite.org/fts3.html
83 |
84 |   Usage:
85 |
86 |       csvs-to-sqlite my-csv.csv output.db -f column1 -f column2
87 |
88 |   Example generated with this option: https://sf-trees-search.now.sh/
89 |
90 |   Example search: https://sf-trees-search.now.sh/sf-trees-search-a899b92?sql=select+*+from+Street_Tree_List+where+rowid+in+%28select+rowid+from+Street_Tree_List_fts+where+Street_Tree_List_fts+match+%27grove+london+dpw%27%29%0D%0A
91 |
92 |   Will be used in https://github.com/simonw/datasette/issues/131
93 | - Handle column names with spaces in them.
94 | - Added `csvs-to-sqlite --version` option.
95 |
96 |   Using http://click.pocoo.org/5/api/#click.version_option
97 |
98 |
99 | 0.5 (2017-11-19)
100 | ----------------
101 | - Release 0.5.
102 | - Foreign key extraction for a mix of integers and NaN now works.
103 |
104 |   Similar issue to a8ab5248f4a - when we extracted a column that included a
105 |   mixture of both integers and NaNs things went a bit weird.
106 | - Added test for column extraction.
107 | - Fixed bug with accidentally hard-coded column.
108 |
109 |
110 | 0.4 (2017-11-19)
111 | ----------------
112 | - Release 0.4.
113 | - Automatically deploy tags as PyPI releases.
114 |
115 |   https://docs.travis-ci.com/user/deployment/pypi/
116 | - Fixed tests for Python 2.
117 | - Ensure columns of ints + NaNs map to SQLite INTEGER.
118 |
119 |   Pandas does a good job of figuring out which SQLite column types should be
120 |   used for a DataFrame - with one exception: due to a limitation of NumPy it
121 |   treats columns containing a mixture of integers and NaN (blank values) as
122 |   being of type float64, which means they end up as REAL columns in SQLite.
123 |
124 |   http://pandas.pydata.org/pandas-docs/stable/gotchas.html#support-for-integer-na
125 |
126 |   To fix this, we now check to see if a float64 column actually consists solely
127 |   of NaN and integer-valued floats (checked using v.is_integer() in Python). If
128 |   that is the case, we override the column type to be INTEGER instead.
129 | - Use miniconda to speed up Travis CI builds (#8)
130 |
131 |   Using Travis CI configuration code copied from https://github.com/EducationalTestingService/skll/blob/87b071743ba7cf0b1063c7265005d43b172b5d91/.travis.yml
132 |
133 |   Which is itself an updated version of the pattern described in http://dan-blanchard.roughdraft.io/7045057-quicker-travis-builds-that-rely-on-numpy-and-scipy-using-miniconda
134 |
135 |   I had to switch to running `pytest` directly, because `python setup.py test` was still trying to install a pandas package that involved compiling everything from scratch (which is why Travis CI builds were taking around 15 minutes).
136 | - Don't include an `index` column - rely on SQLite rowid instead.
137 |
138 |
139 | 0.3 (2017-11-17)
140 | ----------------
141 | - Added `--extract-column` to README.
142 |
143 |   Also updated the `--help` output and added a Travis CI badge.
144 | - Configure Travis CI.
145 |
146 |   Also made it so `python setup.py test` runs the tests.
147 | - Mechanism for converting columns into separate tables.
148 |
149 |   Let's say you have a CSV file that looks like this:
150 |
151 |       county,precinct,office,district,party,candidate,votes
152 |       Clark,1,President,,REP,John R. Kasich,5
153 |       Clark,2,President,,REP,John R. Kasich,0
154 |       Clark,3,President,,REP,John R. Kasich,7
155 |
156 |   (Real example from https://github.com/openelections/openelections-data-sd/blob/master/2016/20160607__sd__primary__clark__precinct.csv )
157 |
158 |   You can now convert selected columns into separate lookup tables using the new
159 |   `--extract-column` option (shortname: `-c`) - for example:
160 |
161 |       csvs-to-sqlite openelections-data-*/*.csv \
162 |           -c county:County:name \
163 |           -c precinct:Precinct:name \
164 |           -c office -c district -c party -c candidate \
165 |           openelections.db
166 |
167 |   The format is as follows:
168 |
169 |       column_name:optional_table_name:optional_table_value_column_name
170 |
171 |   If you just specify the column name e.g. `-c office`, the following table will
172 |   be created:
173 |
174 |       CREATE TABLE "office" (
175 |           "id" INTEGER PRIMARY KEY,
176 |           "value" TEXT
177 |       );
178 |
179 |   If you specify all three options, e.g.
`-c precinct:Precinct:name` the table 180 | will look like this: 181 | 182 | CREATE TABLE "Precinct" ( 183 | "id" INTEGER PRIMARY KEY, 184 | "name" TEXT 185 | ); 186 | 187 | The original tables will be created like this: 188 | 189 | CREATE TABLE "ca__primary__san_francisco__precinct" ( 190 | "county" INTEGER, 191 | "precinct" INTEGER, 192 | "office" INTEGER, 193 | "district" INTEGER, 194 | "party" INTEGER, 195 | "candidate" INTEGER, 196 | "votes" INTEGER, 197 | FOREIGN KEY (county) REFERENCES County(id), 198 | FOREIGN KEY (party) REFERENCES party(id), 199 | FOREIGN KEY (precinct) REFERENCES Precinct(id), 200 | FOREIGN KEY (office) REFERENCES office(id), 201 | FOREIGN KEY (candidate) REFERENCES candidate(id) 202 | ); 203 | 204 | They will be populated with IDs that reference the new derived tables. 205 | 206 | Closes #2 207 | - Can now add new tables to existing database. 208 | 209 | And the new `--replace-tables` option allows you to tell it to replace existing 210 | tables rather than quitting with an error. 211 | 212 | Closes #1 213 | - Fixed compatibility with Python 3. 214 | - Badge links to PyPI. 215 | - Create LICENSE. 216 | - Create README.md. 217 | - Initial release. 218 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # csvs-to-sqlite
2 |
3 | [![PyPI](https://img.shields.io/pypi/v/csvs-to-sqlite.svg)](https://pypi.org/project/csvs-to-sqlite/)
4 | [![Changelog](https://img.shields.io/github/v/release/simonw/csvs-to-sqlite?include_prereleases&label=changelog)](https://github.com/simonw/csvs-to-sqlite/releases)
5 | [![Tests](https://github.com/simonw/csvs-to-sqlite/workflows/Test/badge.svg)](https://github.com/simonw/csvs-to-sqlite/actions?query=workflow%3ATest)
6 | [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/simonw/csvs-to-sqlite/blob/main/LICENSE)
7 |
8 | Convert CSV files into a SQLite database. Browse and publish that SQLite database with [Datasette](https://github.com/simonw/datasette).
9 |
10 | > [!NOTE]
11 | > This tool is **infrequently maintained**. For most cases I suggest [using sqlite-utils](https://sqlite-utils.datasette.io/en/stable/cli.html#inserting-csv-or-tsv-data) to import CSV and TSV into SQLite instead.
12 |
13 | Basic usage:
14 | ```bash
15 | csvs-to-sqlite myfile.csv mydatabase.db
16 | ```
17 | This will create a new SQLite database called `mydatabase.db` containing a
18 | single table, `myfile`, containing the CSV content.
19 |
20 | You can provide multiple CSV files:
21 | ```bash
22 | csvs-to-sqlite one.csv two.csv bundle.db
23 | ```
24 | The `bundle.db` database will contain two tables, `one` and `two`.
25 |
26 | This means you can use wildcards:
27 | ```bash
28 | csvs-to-sqlite ~/Downloads/*.csv my-downloads.db
29 | ```
30 | If you pass a path to one or more directories, the script will recursively
31 | search those directories for CSV files and create tables for each one.
32 | ```bash
33 | csvs-to-sqlite ~/path/to/directory all-my-csvs.db
34 | ```
35 | ## Handling TSV (tab-separated values)
36 |
37 | You can use the `-s` option to specify a different delimiter. If you want
38 | to use a tab character you'll need to apply shell escaping like so:
39 | ```bash
40 | csvs-to-sqlite my-file.tsv my-file.db -s $'\t'
41 | ```
42 | ## Refactoring columns into separate lookup tables
43 |
44 | Let's say you have a CSV file that looks like this:
45 | ```csv
46 | county,precinct,office,district,party,candidate,votes
47 | Clark,1,President,,REP,John R. Kasich,5
48 | Clark,2,President,,REP,John R. Kasich,0
49 | Clark,3,President,,REP,John R. Kasich,7
50 | ```
51 | ([Real example taken from the Open Elections project](https://github.com/openelections/openelections-data-sd/blob/master/2016/20160607__sd__primary__clark__precinct.csv))
52 |
53 | You can convert selected columns into separate lookup tables using the
54 | `--extract-column` option (short name: `-c`) - for example:
55 | ```bash
56 | csvs-to-sqlite openelections-data-*/*.csv \
57 |   -c county:County:name \
58 |   -c precinct:Precinct:name \
59 |   -c office -c district -c party -c candidate \
60 |   openelections.db
61 | ```
62 | The format is as follows:
63 | ```
64 | column_name:optional_table_name:optional_table_value_column_name
65 | ```
66 | If you just specify the column name e.g. `-c office`, the following table will
67 | be created:
68 | ```sql
69 | CREATE TABLE "office" (
70 |     "id" INTEGER PRIMARY KEY,
71 |     "value" TEXT
72 | );
73 | ```
74 | If you specify all three options, e.g.
`-c precinct:Precinct:name` the table 75 | will look like this: 76 | ```sql 77 | CREATE TABLE "Precinct" ( 78 | "id" INTEGER PRIMARY KEY, 79 | "name" TEXT 80 | ); 81 | ``` 82 | The original tables will be created like this: 83 | ```sql 84 | CREATE TABLE "ca__primary__san_francisco__precinct" ( 85 | "county" INTEGER, 86 | "precinct" INTEGER, 87 | "office" INTEGER, 88 | "district" INTEGER, 89 | "party" INTEGER, 90 | "candidate" INTEGER, 91 | "votes" INTEGER, 92 | FOREIGN KEY (county) REFERENCES County(id), 93 | FOREIGN KEY (party) REFERENCES party(id), 94 | FOREIGN KEY (precinct) REFERENCES Precinct(id), 95 | FOREIGN KEY (office) REFERENCES office(id), 96 | FOREIGN KEY (candidate) REFERENCES candidate(id) 97 | ); 98 | ``` 99 | They will be populated with IDs that reference the new derived tables. 100 | 101 | ## Installation 102 | 103 | ```bash 104 | pip install csvs-to-sqlite 105 | ``` 106 | 107 | `csvs-to-sqlite` now requires Python 3. If you are running Python 2 you can install the last version to support Python 2: 108 | ```bash 109 | pip install csvs-to-sqlite==0.9.2 110 | ``` 111 | 112 | ## csvs-to-sqlite --help 113 | 114 | 125 | ``` 126 | Usage: csvs-to-sqlite [OPTIONS] PATHS... DBNAME 127 | 128 | PATHS: paths to individual .csv files or to directories containing .csvs 129 | 130 | DBNAME: name of the SQLite database file to create 131 | 132 | Options: 133 | -s, --separator TEXT Field separator in input .csv 134 | -q, --quoting INTEGER Control field quoting behavior per csv.QUOTE_* 135 | constants. Use one of QUOTE_MINIMAL (0), 136 | QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or 137 | QUOTE_NONE (3). 138 | --skip-errors Skip lines with too many fields instead of 139 | stopping the import 140 | --replace-tables Replace tables if they already exist 141 | -t, --table TEXT Table to use (instead of using CSV filename) 142 | -c, --extract-column TEXT One or more columns to 'extract' into a 143 | separate lookup table. If you pass a simple 144 | column name that column will be replaced with 145 | integer foreign key references to a new table 146 | of that name. You can customize the name of 147 | the table like so: state:States:state_name 148 | 149 | This will pull unique values from the 'state' 150 | column and use them to populate a new 'States' 151 | table, with an id column primary key and a 152 | state_name column containing the strings from 153 | the original column. 154 | -d, --date TEXT One or more columns to parse into ISO 155 | formatted dates 156 | -dt, --datetime TEXT One or more columns to parse into ISO 157 | formatted datetimes 158 | -df, --datetime-format TEXT One or more custom date format strings to try 159 | when parsing dates/datetimes 160 | -pk, --primary-key TEXT One or more columns to use as the primary key 161 | -f, --fts TEXT One or more columns to use to populate a full- 162 | text index 163 | -i, --index TEXT Add index on this column (or a compound index 164 | with -i col1,col2) 165 | --shape TEXT Custom shape for the DB table - format is 166 | csvcol:dbcol(TYPE),... 167 | --filename-column TEXT Add a column with this name and populate with 168 | CSV file name 169 | --fixed-column ... Populate column with a fixed string 170 | --fixed-column-int ... 171 | Populate column with a fixed integer 172 | --fixed-column-float ... 
173 | Populate column with a fixed float 174 | --no-index-fks Skip adding index to foreign key columns 175 | created using --extract-column (default is to 176 | add them) 177 | --no-fulltext-fks Skip adding full-text index on values 178 | extracted using --extract-column (default is 179 | to add them) 180 | --just-strings Import all columns as text strings by default 181 | (and, if specified, still obey --shape, 182 | --date/datetime, and --datetime-format) 183 | --version Show the version and exit. 184 | --help Show this message and exit. 185 | 186 | ``` 187 | 188 | -------------------------------------------------------------------------------- /csvs_to_sqlite/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/simonw/csvs-to-sqlite/33b79ed80a688ceae4486ebdcca28ae689f54c03/csvs_to_sqlite/__init__.py -------------------------------------------------------------------------------- /csvs_to_sqlite/cli.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import click 4 | from .utils import ( 5 | LoadCsvError, 6 | LookupTable, 7 | PathOrURL, 8 | add_index, 9 | apply_dates_and_datetimes, 10 | apply_shape, 11 | best_fts_version, 12 | csvs_from_paths, 13 | generate_and_populate_fts, 14 | load_csv, 15 | refactor_dataframes, 16 | table_exists, 17 | drop_table, 18 | to_sql_with_foreign_keys, 19 | ) 20 | import os 21 | import sqlite3 22 | 23 | 24 | @click.command() 25 | @click.argument("paths", type=PathOrURL(exists=True), nargs=-1, required=True) 26 | @click.argument("dbname", nargs=1) 27 | @click.option("--separator", "-s", default=",", help="Field separator in input .csv") 28 | @click.option( 29 | "--quoting", 30 | "-q", 31 | default=0, 32 | help="Control field quoting behavior per csv.QUOTE_* constants. Use one of QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).", 33 | ) 34 | @click.option( 35 | "--skip-errors", 36 | is_flag=True, 37 | help="Skip lines with too many fields instead of stopping the import", 38 | ) 39 | @click.option( 40 | "--replace-tables", is_flag=True, help="Replace tables if they already exist" 41 | ) 42 | @click.option( 43 | "--table", "-t", help="Table to use (instead of using CSV filename)", default=None 44 | ) 45 | @click.option( 46 | "--extract-column", 47 | "-c", 48 | multiple=True, 49 | help=( 50 | "One or more columns to 'extract' into a separate lookup table. " 51 | "If you pass a simple column name that column will be replaced " 52 | "with integer foreign key references to a new table of that " 53 | "name. You can customize the name of the table like so:\n" 54 | " state:States:state_name\n\n" 55 | "This will pull unique values from the 'state' column and use " 56 | "them to populate a new 'States' table, with an id column " 57 | "primary key and a state_name column containing the strings " 58 | "from the original column." 
59 | ), 60 | ) 61 | @click.option( 62 | "--date", 63 | "-d", 64 | multiple=True, 65 | help=("One or more columns to parse into ISO formatted dates"), 66 | ) 67 | @click.option( 68 | "--datetime", 69 | "-dt", 70 | multiple=True, 71 | help=("One or more columns to parse into ISO formatted datetimes"), 72 | ) 73 | @click.option( 74 | "--datetime-format", 75 | "-df", 76 | multiple=True, 77 | help=("One or more custom date format strings to try when parsing dates/datetimes"), 78 | ) 79 | @click.option( 80 | "--primary-key", 81 | "-pk", 82 | multiple=True, 83 | help=("One or more columns to use as the primary key"), 84 | ) 85 | @click.option( 86 | "--fts", 87 | "-f", 88 | multiple=True, 89 | help=("One or more columns to use to populate a full-text index"), 90 | ) 91 | @click.option( 92 | "--index", 93 | "-i", 94 | multiple=True, 95 | help=("Add index on this column (or a compound index with -i col1,col2)"), 96 | ) 97 | @click.option( 98 | "--shape", 99 | help="Custom shape for the DB table - format is csvcol:dbcol(TYPE),...", 100 | default=None, 101 | ) 102 | @click.option( 103 | "--filename-column", 104 | help="Add a column with this name and populate with CSV file name", 105 | default=None, 106 | ) 107 | @click.option( 108 | "fixed_columns", 109 | "--fixed-column", 110 | type=(str, str), 111 | multiple=True, 112 | help="Populate column with a fixed string", 113 | default=None, 114 | ) 115 | @click.option( 116 | "fixed_columns_int", 117 | "--fixed-column-int", 118 | type=(str, int), 119 | multiple=True, 120 | help="Populate column with a fixed integer", 121 | default=None, 122 | ) 123 | @click.option( 124 | "fixed_columns_float", 125 | "--fixed-column-float", 126 | type=(str, float), 127 | multiple=True, 128 | help="Populate column with a fixed float", 129 | default=None, 130 | ) 131 | @click.option( 132 | "--no-index-fks", 133 | "no_index_fks", 134 | is_flag=True, 135 | help="Skip adding index to foreign key columns created using --extract-column (default is to add them)", 136 | ) 137 | @click.option( 138 | "--no-fulltext-fks", 139 | "no_fulltext_fks", 140 | is_flag=True, 141 | help="Skip adding full-text index on values extracted using --extract-column (default is to add them)", 142 | ) 143 | @click.option( 144 | "--just-strings", 145 | is_flag=True, 146 | help="Import all columns as text strings by default (and, if specified, still obey --shape, --date/datetime, and --datetime-format)", 147 | ) 148 | @click.version_option() 149 | def cli( 150 | paths, 151 | dbname, 152 | separator, 153 | quoting, 154 | skip_errors, 155 | replace_tables, 156 | table, 157 | extract_column, 158 | date, 159 | datetime, 160 | datetime_format, 161 | primary_key, 162 | fts, 163 | index, 164 | shape, 165 | filename_column, 166 | fixed_columns, 167 | fixed_columns_int, 168 | fixed_columns_float, 169 | no_index_fks, 170 | no_fulltext_fks, 171 | just_strings, 172 | ): 173 | """ 174 | PATHS: paths to individual .csv files or to directories containing .csvs 175 | 176 | DBNAME: name of the SQLite database file to create 177 | """ 178 | # make plural for more readable code: 179 | extract_columns = extract_column 180 | del extract_column 181 | 182 | if extract_columns: 183 | click.echo("extract_columns={}".format(extract_columns)) 184 | if dbname.endswith(".csv"): 185 | raise click.BadParameter("dbname must not end with .csv") 186 | if "." 
not in dbname: 187 | dbname += ".db" 188 | 189 | db_existed = os.path.exists(dbname) 190 | 191 | conn = sqlite3.connect(dbname) 192 | 193 | dataframes = [] 194 | csvs = csvs_from_paths(paths) 195 | sql_type_overrides = None 196 | for name, path in csvs.items(): 197 | try: 198 | df = load_csv( 199 | path, separator, skip_errors, quoting, shape, just_strings=just_strings 200 | ) 201 | df.table_name = table or name 202 | if filename_column: 203 | df[filename_column] = name 204 | if shape: 205 | shape += ",{}".format(filename_column) 206 | if fixed_columns: 207 | for colname, value in fixed_columns: 208 | df[colname] = value 209 | if shape: 210 | shape += ",{}".format(colname) 211 | if fixed_columns_int: 212 | for colname, value in fixed_columns_int: 213 | df[colname] = value 214 | if shape: 215 | shape += ",{}".format(colname) 216 | if fixed_columns_float: 217 | for colname, value in fixed_columns_float: 218 | df[colname] = value 219 | if shape: 220 | shape += ",{}".format(colname) 221 | sql_type_overrides = apply_shape(df, shape) 222 | apply_dates_and_datetimes(df, date, datetime, datetime_format) 223 | dataframes.append(df) 224 | except LoadCsvError as e: 225 | click.echo("Could not load {}: {}".format(path, e), err=True) 226 | 227 | click.echo("Loaded {} dataframes".format(len(dataframes))) 228 | 229 | # Use extract_columns to build a column:(table,label) dictionary 230 | foreign_keys = {} 231 | for col in extract_columns: 232 | bits = col.split(":") 233 | if len(bits) == 3: 234 | foreign_keys[bits[0]] = (bits[1], bits[2]) 235 | elif len(bits) == 2: 236 | foreign_keys[bits[0]] = (bits[1], "value") 237 | else: 238 | foreign_keys[bits[0]] = (bits[0], "value") 239 | 240 | # Now we have loaded the dataframes, we can refactor them 241 | created_tables = {} 242 | refactored = refactor_dataframes( 243 | conn, dataframes, foreign_keys, not no_fulltext_fks 244 | ) 245 | for df in refactored: 246 | # This is a bit trickier because we need to 247 | # create the table with extra SQL for foreign keys 248 | if replace_tables and table_exists(conn, df.table_name): 249 | drop_table(conn, df.table_name) 250 | if table_exists(conn, df.table_name): 251 | df.to_sql(df.table_name, conn, if_exists="append", index=False) 252 | else: 253 | to_sql_with_foreign_keys( 254 | conn, 255 | df, 256 | df.table_name, 257 | foreign_keys, 258 | sql_type_overrides, 259 | primary_keys=primary_key, 260 | index_fks=not no_index_fks, 261 | ) 262 | created_tables[df.table_name] = df 263 | if index: 264 | for index_defn in index: 265 | add_index(conn, df.table_name, index_defn) 266 | 267 | # Create FTS tables 268 | if fts: 269 | fts_version = best_fts_version() 270 | if not fts_version: 271 | conn.close() 272 | raise click.BadParameter( 273 | "Your SQLite version does not support any variant of FTS" 274 | ) 275 | # Check that columns make sense 276 | for table, df in created_tables.items(): 277 | for fts_column in fts: 278 | if fts_column not in df.columns: 279 | raise click.BadParameter( 280 | 'FTS column "{}" does not exist'.format(fts_column) 281 | ) 282 | 283 | generate_and_populate_fts(conn, created_tables.keys(), fts, foreign_keys) 284 | 285 | conn.close() 286 | 287 | if db_existed: 288 | click.echo( 289 | "Added {} CSV file{} to {}".format( 290 | len(csvs), "" if len(csvs) == 1 else "s", dbname 291 | ) 292 | ) 293 | else: 294 | click.echo( 295 | "Created {} from {} CSV file{}".format( 296 | dbname, len(csvs), "" if len(csvs) == 1 else "s" 297 | ) 298 | ) 299 | 
--------------------------------------------------------------------------------
/csvs_to_sqlite/utils.py:
--------------------------------------------------------------------------------
1 | import dateparser
2 | import os
3 | import fnmatch
4 | import hashlib
5 | import lru
6 | import pandas as pd
7 | import numpy as np
8 | import re
9 | import six
10 | import sqlite3
11 |
12 | from six.moves.urllib.parse import urlparse
13 | from six.moves.urllib.parse import uses_relative, uses_netloc, uses_params
14 |
15 | import click
16 |
17 |
18 | class LoadCsvError(Exception):
19 |     pass
20 |
21 |
22 | def load_csv(
23 |     filepath,
24 |     separator,
25 |     skip_errors,
26 |     quoting,
27 |     shape,
28 |     encodings_to_try=("utf8", "latin-1"),
29 |     just_strings=False,
30 | ):
31 |     dtype = str if just_strings is True else None
32 |     usecols = None
33 |     if shape:
34 |         usecols = [defn["csv_name"] for defn in parse_shape(shape)]
35 |     try:
36 |         for encoding in encodings_to_try:
37 |             try:
38 |                 return pd.read_csv(
39 |                     filepath,
40 |                     sep=separator,
41 |                     quoting=quoting,
42 |                     on_bad_lines="skip" if skip_errors else "error",
43 |                     low_memory=True,
44 |                     encoding=encoding,
45 |                     usecols=usecols,
46 |                     dtype=dtype,
47 |                 )
48 |             except UnicodeDecodeError:
49 |                 continue
50 |             except pd.errors.ParserError as e:
51 |                 raise LoadCsvError(e)
52 |         # If we get here, we failed
53 |         raise LoadCsvError("All encodings failed")
54 |     except Exception as e:
55 |         raise LoadCsvError(e)
56 |
57 |
58 | def csvs_from_paths(paths):
59 |     csvs = {}
60 |
61 |     def add_item(filepath, full_path=None):
62 |         name = os.path.splitext(os.path.basename(filepath))[0]
63 |         if name in csvs:
64 |             i = 1
65 |             while True:
66 |                 name_plus_suffix = "{}-{}".format(name, i)
67 |                 if name_plus_suffix not in csvs:
68 |                     name = name_plus_suffix
69 |                     break
70 |                 else:
71 |                     i += 1
72 |         if full_path is None:
73 |             csvs[name] = filepath
74 |         else:
75 |             csvs[name] = full_path
76 |
77 |     for path in paths:
78 |         if os.path.isfile(path):
79 |             add_item(path)
80 |         elif _is_url(path):
81 |             add_item(urlparse(path).path, path)
82 |         elif os.path.isdir(path):
83 |             # Recursively seek out ALL csvs in directory
84 |             for root, dirnames, filenames in os.walk(path):
85 |                 for filename in fnmatch.filter(filenames, "*.csv"):
86 |                     relpath = os.path.relpath(root, path)
87 |                     namepath = os.path.join(relpath, os.path.splitext(filename)[0])
88 |                     csvs[namepath] = os.path.join(root, filename)
89 |
90 |     return csvs
91 |
92 |
93 | def _is_url(possible_url):
94 |     valid_schemes = set(uses_relative + uses_netloc + uses_params)
95 |     valid_schemes.discard("")
96 |
97 |     try:
98 |         return urlparse(possible_url).scheme in valid_schemes
99 |     except Exception:
100 |         return False
101 |
102 |
103 | class PathOrURL(click.Path):
104 |     """The PathOrURL type handles paths or URLs.
105 |
106 |     If the argument can be parsed as a URL, it will be treated as one.
107 |     Otherwise PathOrURL behaves like click.Path.
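    A sketch of how this is used in cli.py (the URL below is illustrative):

        @click.argument("paths", type=PathOrURL(exists=True), nargs=-1, required=True)

    With that declaration "https://example.com/data.csv" is returned
    unchanged, while "./data.csv" is validated by click.Path and rejected
    when exists=True and the file is missing.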
108 | """ 109 | 110 | def __init__( 111 | self, 112 | exists=False, 113 | file_okay=True, 114 | dir_okay=True, 115 | writable=False, 116 | readable=True, 117 | resolve_path=False, 118 | allow_dash=False, 119 | path_type=None, 120 | ): 121 | super(PathOrURL, self).__init__( 122 | exists=exists, 123 | file_okay=file_okay, 124 | dir_okay=dir_okay, 125 | writable=writable, 126 | readable=readable, 127 | resolve_path=resolve_path, 128 | allow_dash=allow_dash, 129 | path_type=path_type, 130 | ) 131 | 132 | def convert(self, value, param, ctx): 133 | if _is_url(value): 134 | return self.coerce_path_result(value) 135 | else: 136 | return super(PathOrURL, self).convert(value, param, ctx) 137 | 138 | 139 | class LookupTable: 140 | def __init__(self, conn, table_name, value_column, index_fts): 141 | self.conn = conn 142 | self.table_name = table_name 143 | self.value_column = value_column 144 | self.fts_table_name = "{table_name}_{value_column}_fts".format( 145 | table_name=table_name, value_column=value_column 146 | ) 147 | self.index_fts = index_fts 148 | self.cache = lru.LRUCacheDict(max_size=1000) 149 | self.ensure_table_exists() 150 | 151 | def ensure_table_exists(self): 152 | if not self.conn.execute( 153 | """ 154 | SELECT name 155 | FROM sqlite_master 156 | WHERE type='table' 157 | AND name=? 158 | """, 159 | (self.table_name,), 160 | ).fetchall(): 161 | create_sql = """ 162 | CREATE TABLE "{table_name}" ( 163 | "id" INTEGER PRIMARY KEY, 164 | "{value_column}" TEXT 165 | ); 166 | """.format( 167 | table_name=self.table_name, value_column=self.value_column 168 | ) 169 | self.conn.execute(create_sql) 170 | if self.index_fts: 171 | # Add a FTS index on the value_column 172 | self.conn.execute( 173 | """ 174 | CREATE VIRTUAL TABLE "{fts_table_name}" 175 | USING {fts_version} ({value_column}, content="{table_name}"); 176 | """.format( 177 | fts_version=best_fts_version(), 178 | fts_table_name=self.fts_table_name, 179 | table_name=self.table_name, 180 | value_column=self.value_column, 181 | ) 182 | ) 183 | 184 | def __repr__(self): 185 | return "<{}: {} rows>".format( 186 | self.table_name, 187 | self.conn.execute( 188 | 'select count(*) from "{}"'.format(self.table_name) 189 | ).fetchone()[0], 190 | ) 191 | 192 | def id_for_value(self, value): 193 | if pd.isnull(value): 194 | return None 195 | # value should be a string 196 | if not isinstance(value, six.string_types): 197 | if isinstance(value, float): 198 | value = "{0:g}".format(value) 199 | else: 200 | value = six.text_type(value) 201 | try: 202 | # First try our in-memory cache 203 | return self.cache[value] 204 | except KeyError: 205 | # Next try the database table 206 | sql = 'SELECT id FROM "{table_name}" WHERE "{value_column}"=?'.format( 207 | table_name=self.table_name, value_column=self.value_column 208 | ) 209 | result = self.conn.execute(sql, (value,)).fetchall() 210 | if result: 211 | id = result[0][0] 212 | else: 213 | # Not in DB! 
Insert it 214 | cursor = self.conn.cursor() 215 | cursor.execute( 216 | """ 217 | INSERT INTO "{table_name}" ("{value_column}") VALUES (?); 218 | """.format( 219 | table_name=self.table_name, value_column=self.value_column 220 | ), 221 | (value,), 222 | ) 223 | id = cursor.lastrowid 224 | if self.index_fts: 225 | # And update FTS index 226 | sql = """ 227 | INSERT INTO "{fts_table_name}" (rowid, "{value_column}") VALUES (?, ?); 228 | """.format( 229 | fts_table_name=self.fts_table_name, 230 | value_column=self.value_column, 231 | ) 232 | cursor.execute(sql, (id, value)) 233 | 234 | self.cache[value] = id 235 | return id 236 | 237 | 238 | def refactor_dataframes(conn, dataframes, foreign_keys, index_fts): 239 | lookup_tables = {} 240 | for column, (table_name, value_column) in foreign_keys.items(): 241 | # Now apply this to the dataframes 242 | for dataframe in dataframes: 243 | if column in dataframe.columns: 244 | lookup_table = lookup_tables.get(table_name) 245 | if lookup_table is None: 246 | lookup_table = LookupTable( 247 | conn=conn, 248 | table_name=table_name, 249 | value_column=value_column, 250 | index_fts=index_fts, 251 | ) 252 | lookup_tables[table_name] = lookup_table 253 | dataframe[column] = dataframe[column].apply(lookup_table.id_for_value) 254 | return dataframes 255 | 256 | 257 | def table_exists(conn, table): 258 | return conn.execute( 259 | """ 260 | select count(*) from sqlite_master 261 | where type="table" and name=? 262 | """, 263 | [table], 264 | ).fetchone()[0] 265 | 266 | 267 | def drop_table(conn, table): 268 | conn.execute("DROP TABLE [{}]".format(table)) 269 | 270 | 271 | def get_create_table_sql( 272 | table_name, df, index=True, sql_type_overrides=None, primary_keys=None 273 | ): 274 | # Create a temporary table with just the first row 275 | # We do this in memory because we just want to get the 276 | # CREATE TABLE statement 277 | # Returns (sql, columns) 278 | conn = sqlite3.connect(":memory:") 279 | # Before calling to_sql we need correct the dtypes that we will be using 280 | # to pick the right SQL column types. pandas mostly gets this right... 281 | # except for columns that contain a mixture of integers and Nones. These 282 | # will be incorrectly detected as being of DB type REAL when we want them 283 | # to be INTEGER instead. 284 | # http://pandas.pydata.org/pandas-docs/stable/gotchas.html#support-for-integer-na 285 | sql_type_overrides = sql_type_overrides or {} 286 | if isinstance(df, pd.DataFrame): 287 | columns_and_types = df.dtypes.items() 288 | elif isinstance(df, pd.Series): 289 | columns_and_types = [(df.name, df.dtype)] 290 | for column, dtype in columns_and_types: 291 | # Are any of these float columns? 
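        # (pandas stores an integer column containing NaN as float64, so CSV
        # values like 1,2,,4 arrive here as [1.0, 2.0, nan, 4.0]; the check
        # below detects that case so the column can be declared INTEGER)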
292 | if dtype in (np.float32, np.float64): 293 | # if every non-NaN value is an integer, switch to int 294 | if isinstance(df, pd.Series): 295 | series = df 296 | else: 297 | series = df[column] 298 | num_non_integer_floats = series.map( 299 | lambda v: not np.isnan(v) and not v.is_integer() 300 | ).sum() 301 | if num_non_integer_floats == 0: 302 | # Everything was NaN or an integer-float - switch type: 303 | sql_type_overrides[column] = "INTEGER" 304 | 305 | df[:1].to_sql(table_name, conn, index=index, dtype=sql_type_overrides) 306 | sql = conn.execute( 307 | "select sql from sqlite_master where name = ?", [table_name] 308 | ).fetchone()[0] 309 | columns = [ 310 | row[1] for row in conn.execute("PRAGMA table_info([{}])".format(table_name)) 311 | ] 312 | if primary_keys: 313 | # Rewrite SQL to add PRIMARY KEY (col1, col2) at end 314 | assert sql[-1] == ")" 315 | sql = sql[:-1] + " ,PRIMARY KEY ({cols})\n)".format( 316 | cols=", ".join("[{}]".format(col) for col in primary_keys) 317 | ) 318 | return sql, columns 319 | 320 | 321 | def to_sql_with_foreign_keys( 322 | conn, 323 | df, 324 | name, 325 | foreign_keys, 326 | sql_type_overrides=None, 327 | primary_keys=None, 328 | index_fks=False, 329 | ): 330 | create_sql, columns = get_create_table_sql( 331 | name, 332 | df, 333 | index=False, 334 | primary_keys=primary_keys, 335 | sql_type_overrides=sql_type_overrides, 336 | ) 337 | foreign_key_bits = [] 338 | index_bits = [] 339 | for column, (table, value_column) in foreign_keys.items(): 340 | if column in columns: 341 | foreign_key_bits.append( 342 | 'FOREIGN KEY ("{}") REFERENCES [{}](id)'.format(column, table) 343 | ) 344 | if index_fks: 345 | index_bits.append( 346 | # CREATE INDEX indexname ON table(column); 347 | 'CREATE INDEX ["{}_{}"] ON [{}]("{}");'.format( 348 | name, column, name, column 349 | ) 350 | ) 351 | 352 | foreign_key_sql = ",\n ".join(foreign_key_bits) 353 | if foreign_key_sql: 354 | create_sql = "{},\n{});".format(create_sql.strip().rstrip(")"), foreign_key_sql) 355 | if index_bits: 356 | create_sql += "\n" + "\n".join(index_bits) 357 | conn.executescript(create_sql) 358 | # Now that we have created the table, insert the rows: 359 | df.to_sql(df.table_name, conn, if_exists="append", index=False) 360 | 361 | 362 | def best_fts_version(): 363 | "Discovers the most advanced supported SQLite FTS version" 364 | conn = sqlite3.connect(":memory:") 365 | for fts in ("FTS5", "FTS4", "FTS3"): 366 | try: 367 | conn.execute("CREATE VIRTUAL TABLE v USING {} (t);".format(fts)) 368 | return fts 369 | except sqlite3.OperationalError: 370 | continue 371 | return None 372 | 373 | 374 | def generate_and_populate_fts(conn, created_tables, cols, foreign_keys): 375 | fts_version = best_fts_version() 376 | sql = [] 377 | fts_cols = ", ".join('"{}"'.format(c) for c in cols) 378 | for table in created_tables: 379 | sql.append( 380 | 'CREATE VIRTUAL TABLE "{content_table}_fts" USING {fts_version} ({cols}, content="{content_table}")'.format( 381 | cols=fts_cols, content_table=table, fts_version=fts_version 382 | ) 383 | ) 384 | if not foreign_keys: 385 | # Select is simple: 386 | select = "SELECT rowid, {cols} FROM [{content_table}]".format( 387 | cols=fts_cols, content_table=table 388 | ) 389 | else: 390 | # Select is complicated: 391 | # select 392 | # county, precinct, office.value, district.value, 393 | # party.value, candidate.value, votes 394 | # from content_table 395 | # left join office on content_table.office = office.id 396 | # left join district on content_table.district = district.id 
397 | # left join party on content_table.party = party.id 398 | # left join candidate on content_table.candidate = candidate.id 399 | # order by content_table.rowid 400 | select_cols = [] 401 | joins = [] 402 | table_seen_count = {} 403 | for col in cols: 404 | if col in foreign_keys: 405 | other_table, label_column = foreign_keys[col] 406 | seen_count = table_seen_count.get(other_table, 0) + 1 407 | table_seen_count[other_table] = seen_count 408 | alias = "" 409 | if seen_count > 1: 410 | alias = "table_alias_{}_{}".format( 411 | hashlib.md5(other_table.encode("utf8")).hexdigest(), 412 | seen_count, 413 | ) 414 | select_cols.append( 415 | '[{}]."{}"'.format(alias or other_table, label_column) 416 | ) 417 | joins.append( 418 | 'left join [{other_table}] {alias} on [{table}]."{column}" = [{alias_or_other_table}].id'.format( 419 | other_table=other_table, 420 | alias_or_other_table=alias or other_table, 421 | alias=alias, 422 | table=table, 423 | column=col, 424 | ) 425 | ) 426 | else: 427 | select_cols.append('"{}"'.format(col)) 428 | select = "SELECT [{content_table}].rowid, {select_cols} FROM [{content_table}] {joins}".format( 429 | select_cols=", ".join("{}".format(c) for c in select_cols), 430 | content_table=table, 431 | joins="\n".join(joins), 432 | ) 433 | sql.append( 434 | 'INSERT INTO "{content_table}_fts" (rowid, {cols}) {select}'.format( 435 | cols=fts_cols, content_table=table, select=select 436 | ) 437 | ) 438 | conn.executescript(";\n".join(sql)) 439 | 440 | 441 | type_re = re.compile(r"\((real|integer|text|blob|numeric)\)$", re.I) 442 | 443 | 444 | def parse_shape(shape): 445 | # Shape is format 'county:Cty,votes:Vts(REAL)' 446 | defs = [b.strip() for b in shape.split(",")] 447 | defns = [] 448 | for defn in defs: 449 | # Is there a type defined? 450 | type_override = None 451 | m = type_re.search(defn) 452 | if m: 453 | type_override = m.group(1) 454 | defn = type_re.sub("", defn) 455 | # In Python 2 type_override needs to be a bytestring 456 | if six.PY2: 457 | type_override = str(type_override) 458 | # Is this a rename? 
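        # (e.g. "votes:Vts" renames CSV column "votes" to DB column "Vts",
        # while a bare "county" keeps the same name for both)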
459 | if ":" in defn: 460 | csv_name, db_name = defn.split(":", 1) 461 | else: 462 | csv_name, db_name = defn, defn 463 | defns.append( 464 | {"csv_name": csv_name, "db_name": db_name, "type_override": type_override} 465 | ) 466 | return defns 467 | 468 | 469 | def apply_shape(df, shape): 470 | # Shape is format 'county:Cty,votes:Vts(REAL)' 471 | # Applies changes in place, returns dtype= arg for to_sql 472 | if not shape: 473 | return None 474 | defns = parse_shape(shape) 475 | # Drop any columns we don't want 476 | cols_to_keep = [d["csv_name"] for d in defns] 477 | cols_to_drop = [c for c in df.columns if c not in cols_to_keep] 478 | if cols_to_drop: 479 | df.drop(cols_to_drop, axis=1, inplace=True) 480 | # Apply column renames 481 | renames = { 482 | d["csv_name"]: d["db_name"] for d in defns if d["csv_name"] != d["db_name"] 483 | } 484 | if renames: 485 | df.rename(columns=renames, inplace=True) 486 | # Return type overrides, if any 487 | return {d["db_name"]: d["type_override"] for d in defns if d["type_override"]} 488 | 489 | 490 | def add_index(conn, table_name, index): 491 | columns_to_index = [b.strip() for b in index.split(",")] 492 | # Figure out columns in table so we can sanity check this 493 | cursor = conn.execute("select * from [{}] limit 0".format(table_name)) 494 | columns = [r[0] for r in cursor.description] 495 | if all([(c in columns) for c in columns_to_index]): 496 | sql = 'CREATE INDEX ["{}_{}"] ON [{}]("{}");'.format( 497 | table_name, 498 | "_".join(columns_to_index), 499 | table_name, 500 | '", "'.join(columns_to_index), 501 | ) 502 | conn.execute(sql) 503 | 504 | 505 | def apply_dates_and_datetimes(df, date_cols, datetime_cols, datetime_formats): 506 | def parse_datetime(datestring, force_date=False): 507 | if pd.isnull(datestring): 508 | return datestring 509 | dt = dateparser.parse(datestring, date_formats=datetime_formats) 510 | if force_date: 511 | return dt.date().isoformat() 512 | else: 513 | return dt.isoformat() 514 | 515 | for date_col in date_cols: 516 | df[date_col] = df[date_col].apply(lambda s: parse_datetime(s, force_date=True)) 517 | for datetime_col in datetime_cols: 518 | df[datetime_col] = df[datetime_col].apply(parse_datetime) 519 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | 4 | [bdist_wheel] 5 | universal=1 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import io 3 | import os 4 | 5 | VERSION = "1.3.1" 6 | 7 | 8 | def get_long_description(): 9 | with io.open( 10 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "README.md"), 11 | encoding="utf8", 12 | ) as fp: 13 | return fp.read() 14 | 15 | 16 | setup( 17 | name="csvs_to_sqlite", 18 | description="Convert CSV files into a SQLite database", 19 | long_description=get_long_description(), 20 | long_description_content_type="text/markdown", 21 | author="Simon Willison", 22 | version=VERSION, 23 | license="Apache License, Version 2.0", 24 | packages=find_packages(), 25 | install_requires=[ 26 | "click>=7.0", 27 | "dateparser>=1.0", 28 | "pandas>=1.0", 29 | "py-lru-cache~=0.1.4", 30 | "six", 31 | ], 32 | extras_require={"test": ["pytest", "cogapp"]}, 33 | entry_points=""" 34 | [console_scripts] 35 | csvs-to-sqlite=csvs_to_sqlite.cli:cli 36 | """, 
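    # The console_scripts entry point above is what creates the
    # `csvs-to-sqlite` command, wiring it to cli() in csvs_to_sqlite/cli.py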
37 |     url="https://github.com/simonw/csvs-to-sqlite", 38 |     classifiers=[ 39 |         "Intended Audience :: Developers", 40 |         "Intended Audience :: Science/Research", 41 |         "Intended Audience :: End Users/Desktop", 42 |         "Topic :: Database", 43 |         "License :: OSI Approved :: Apache Software License", 44 |         "Programming Language :: Python :: 3.9", 45 |         "Programming Language :: Python :: 3.10", 46 |         "Programming Language :: Python :: 3.11", 47 |         "Programming Language :: Python :: 3.12", 48 |         "Programming Language :: Python :: 3.13", 49 |     ], 50 | ) 51 | -------------------------------------------------------------------------------- /tests/test_csvs_to_sqlite.py: -------------------------------------------------------------------------------- 1 | from click.testing import CliRunner 2 | from csvs_to_sqlite import cli 3 | from six import string_types, text_type 4 | from cogapp import Cog 5 | import sys 6 | from io import StringIO 7 | import pathlib 8 | import sqlite3 9 | 10 | CSV = """county,precinct,office,district,party,candidate,votes 11 | Yolo,100001,President,,LIB,Gary Johnson,41 12 | Yolo,100001,President,,PAF,Gloria Estela La Riva,8 13 | Yolo,100001,Proposition 51,,,No,398 14 | Yolo,100001,Proposition 51,,,Yes,460 15 | Yolo,100001,State Assembly,7,DEM,Kevin McCarty,572 16 | Yolo,100001,State Assembly,7,REP,Ryan K. Brown,291""" 17 | 18 | CSV_MULTI = """film,actor_1,actor_2 19 | The Rock,Sean Connery,Nicolas Cage 20 | National Treasure,Nicolas Cage,Diane Kruger 21 | Troy,Diane Kruger,Orlando Bloom""" 22 | 23 | CSV_DATES = """headline,date,datetime 24 | First headline,3rd May 2017,10pm on April 4 1938 25 | Second headline,04/30/2005,5:45 10 December 2009""" 26 | 27 | CSV_DATES_CUSTOM_FORMAT = """headline,date 28 | Custom format,03/02/01""" 29 | 30 | CSV_CUSTOM_PRIMARY_KEYS = """pk1,pk2,name 31 | one,one,11 32 | one,two,12 33 | two,one,21""" 34 | 35 | CSV_STRINGS_AND_DATES = """name,gross,release_date 36 | Adaptation,22.5,6 of December in the year 2002 37 | Face/Off,245.7,19 of June in the year 1997 38 | The Rock,134.1,9 of June in the year 1996""" 39 | 40 | 41 | def test_flat(): 42 |     runner = CliRunner() 43 |     with runner.isolated_filesystem(): 44 |         open("test.csv", "w").write(CSV) 45 |         result = runner.invoke(cli.cli, ["test.csv", "test.db"]) 46 |         assert result.exit_code == 0 47 |         assert result.output.strip().endswith("Created test.db from 1 CSV file") 48 |         conn = sqlite3.connect("test.db") 49 |         assert [ 50 |             (0, "county", "TEXT", 0, None, 0), 51 |             (1, "precinct", "INTEGER", 0, None, 0), 52 |             (2, "office", "TEXT", 0, None, 0), 53 |             (3, "district", "INTEGER", 0, None, 0), 54 |             (4, "party", "TEXT", 0, None, 0), 55 |             (5, "candidate", "TEXT", 0, None, 0), 56 |             (6, "votes", "INTEGER", 0, None, 0), 57 |         ] == list(conn.execute("PRAGMA table_info(test)")) 58 |         rows = conn.execute("select * from test").fetchall() 59 |         assert [ 60 |             ("Yolo", 100001, "President", None, "LIB", "Gary Johnson", 41), 61 |             ("Yolo", 100001, "President", None, "PAF", "Gloria Estela La Riva", 8), 62 |             ("Yolo", 100001, "Proposition 51", None, None, "No", 398), 63 |             ("Yolo", 100001, "Proposition 51", None, None, "Yes", 460), 64 |             ("Yolo", 100001, "State Assembly", 7, "DEM", "Kevin McCarty", 572), 65 |             ("Yolo", 100001, "State Assembly", 7, "REP", "Ryan K.
Brown", 291), 66 | ] == rows 67 | last_row = rows[-1] 68 | for i, t in enumerate( 69 | (string_types, int, string_types, int, string_types, string_types, int) 70 | ): 71 | assert isinstance(last_row[i], t) 72 | 73 | 74 | def test_extract_columns(): 75 | runner = CliRunner() 76 | with runner.isolated_filesystem(): 77 | open("test.csv", "w").write(CSV) 78 | result = runner.invoke( 79 | cli.cli, 80 | "test.csv extracted.db -c office -c district -c party -c candidate".split(), 81 | ) 82 | assert result.exit_code == 0 83 | assert result.output.strip().endswith("Created extracted.db from 1 CSV file") 84 | conn = sqlite3.connect("extracted.db") 85 | assert [ 86 | (0, "county", "TEXT", 0, None, 0), 87 | (1, "precinct", "INTEGER", 0, None, 0), 88 | (2, "office", "INTEGER", 0, None, 0), 89 | (3, "district", "INTEGER", 0, None, 0), 90 | (4, "party", "INTEGER", 0, None, 0), 91 | (5, "candidate", "INTEGER", 0, None, 0), 92 | (6, "votes", "INTEGER", 0, None, 0), 93 | ] == list(conn.execute("PRAGMA table_info(test)")) 94 | rows = conn.execute( 95 | """ 96 | select 97 | county, precinct, office.value, district.value, 98 | party.value, candidate.value, votes 99 | from test 100 | left join office on test.office = office.id 101 | left join district on test.district = district.id 102 | left join party on test.party = party.id 103 | left join candidate on test.candidate = candidate.id 104 | order by test.rowid 105 | """ 106 | ).fetchall() 107 | assert [ 108 | ("Yolo", 100001, "President", None, "LIB", "Gary Johnson", 41), 109 | ("Yolo", 100001, "President", None, "PAF", "Gloria Estela La Riva", 8), 110 | ("Yolo", 100001, "Proposition 51", None, None, "No", 398), 111 | ("Yolo", 100001, "Proposition 51", None, None, "Yes", 460), 112 | ("Yolo", 100001, "State Assembly", "7", "DEM", "Kevin McCarty", 572), 113 | ("Yolo", 100001, "State Assembly", "7", "REP", "Ryan K. Brown", 291), 114 | ] == rows 115 | last_row = rows[-1] 116 | for i, t in enumerate( 117 | ( 118 | string_types, 119 | int, 120 | string_types, 121 | string_types, 122 | string_types, 123 | string_types, 124 | int, 125 | ) 126 | ): 127 | assert isinstance(last_row[i], t) 128 | 129 | # Check that the various foreign key tables have the right things in them 130 | assert [ 131 | (1, "President"), 132 | (2, "Proposition 51"), 133 | (3, "State Assembly"), 134 | ] == conn.execute("select * from office").fetchall() 135 | assert [(1, "7")] == conn.execute("select * from district").fetchall() 136 | assert [(1, "LIB"), (2, "PAF"), (3, "DEM"), (4, "REP")] == conn.execute( 137 | "select * from party" 138 | ).fetchall() 139 | assert [ 140 | (1, "Gary Johnson"), 141 | (2, "Gloria Estela La Riva"), 142 | (3, "No"), 143 | (4, "Yes"), 144 | (5, "Kevin McCarty"), 145 | (6, "Ryan K. 
Brown"), 146 | ] == conn.execute("select * from candidate").fetchall() 147 | 148 | # Check that a FTS index was created for each extracted table 149 | fts_tables = [ 150 | r[0] 151 | for r in conn.execute( 152 | """ 153 | select name from sqlite_master 154 | where type='table' and name like '%_fts' 155 | and sql like '%USING FTS%' 156 | """ 157 | ).fetchall() 158 | ] 159 | assert set(fts_tables) == { 160 | "office_value_fts", 161 | "district_value_fts", 162 | "party_value_fts", 163 | "candidate_value_fts", 164 | } 165 | 166 | 167 | def test_fts(): 168 | runner = CliRunner() 169 | with runner.isolated_filesystem(): 170 | open("test.csv", "w").write(CSV) 171 | result = runner.invoke( 172 | cli.cli, "test.csv fts.db -f office -f party -f candidate".split() 173 | ) 174 | assert result.exit_code == 0 175 | conn = sqlite3.connect("fts.db") 176 | assert ( 177 | [("Yolo", 100001, "President", "PAF", "Gloria Estela La Riva")] 178 | == conn.execute( 179 | """ 180 | select county, precinct, office, party, candidate 181 | from test 182 | where rowid in ( 183 | select rowid from test_fts 184 | where test_fts match 'president gloria' 185 | ) 186 | """ 187 | ).fetchall() 188 | ) 189 | 190 | 191 | def test_fts_error_on_missing_columns(): 192 | runner = CliRunner() 193 | with runner.isolated_filesystem(): 194 | open("test.csv", "w").write(CSV) 195 | result = runner.invoke(cli.cli, "test.csv fts.db -f badcolumn".split()) 196 | assert result.exit_code != 0 197 | assert result.output.strip().endswith('FTS column "badcolumn" does not exist') 198 | 199 | 200 | def test_fts_and_extract_columns(): 201 | runner = CliRunner() 202 | with runner.isolated_filesystem(): 203 | open("test.csv", "w").write(CSV) 204 | result = runner.invoke( 205 | cli.cli, 206 | ( 207 | "test.csv fts-extracted.db -c office -c party -c candidate " 208 | "-f party -f candidate" 209 | ).split(), 210 | ) 211 | assert result.exit_code == 0 212 | conn = sqlite3.connect("fts-extracted.db") 213 | assert ( 214 | [("Yolo", 100001, "President", "PAF", "Gloria Estela La Riva")] 215 | == conn.execute( 216 | """ 217 | select 218 | county, precinct, office.value, party.value, candidate.value 219 | from test 220 | left join office on test.office = office.id 221 | left join party on test.party = party.id 222 | left join candidate on test.candidate = candidate.id 223 | where test.rowid in ( 224 | select rowid from test_fts 225 | where test_fts match 'paf gloria' 226 | ) 227 | """ 228 | ).fetchall() 229 | ) 230 | 231 | 232 | def test_fts_one_column_multiple_aliases(): 233 | runner = CliRunner() 234 | with runner.isolated_filesystem(): 235 | open("test.csv", "w").write(CSV_MULTI) 236 | result = runner.invoke( 237 | cli.cli, 238 | ( 239 | "test.csv fts-extracted.db -c film " 240 | "-c actor_1:actors:name -c actor_2:actors:name " 241 | "-f film -f actor_1 -f actor_2" 242 | ).split(), 243 | ) 244 | assert result.exit_code == 0 245 | conn = sqlite3.connect("fts-extracted.db") 246 | assert ( 247 | [ 248 | ("The Rock", "Sean Connery", "Nicolas Cage"), 249 | ("National Treasure", "Nicolas Cage", "Diane Kruger"), 250 | ("Troy", "Diane Kruger", "Orlando Bloom"), 251 | ] 252 | == conn.execute( 253 | """ 254 | select 255 | film.value, a1.name, a2.name 256 | from test 257 | join film on test.film = film.id 258 | join actors a1 on test.actor_1 = a1.id 259 | join actors a2 on test.actor_2 = a2.id 260 | """ 261 | ).fetchall() 262 | ) 263 | assert ( 264 | [ 265 | ("National Treasure", "Nicolas Cage", "Diane Kruger"), 266 | ("Troy", "Diane Kruger", "Orlando Bloom"), 267 | ] 
268 | == conn.execute( 269 | """ 270 | select 271 | film.value, a1.name, a2.name 272 | from test 273 | join film on test.film = film.id 274 | join actors a1 on test.actor_1 = a1.id 275 | join actors a2 on test.actor_2 = a2.id 276 | where test.rowid in ( 277 | select rowid from [test_fts] where [test_fts] match 'kruger' 278 | ) 279 | """ 280 | ).fetchall() 281 | ) 282 | 283 | 284 | def test_shape(): 285 | runner = CliRunner() 286 | with runner.isolated_filesystem(): 287 | open("test.csv", "w").write(CSV) 288 | result = runner.invoke( 289 | cli.cli, 290 | ["test.csv", "test-reshaped.db", "--shape", "county:Cty,votes:Vts(REAL)"], 291 | ) 292 | assert result.exit_code == 0 293 | conn = sqlite3.connect("test-reshaped.db") 294 | # Check we only have Cty and Vts columns: 295 | assert [ 296 | (0, "Cty", "TEXT", 0, None, 0), 297 | (1, "Vts", "REAL", 0, None, 0), 298 | ] == conn.execute("PRAGMA table_info(test);").fetchall() 299 | # Now check that values are as expected: 300 | results = conn.execute( 301 | """ 302 | select Cty, Vts from test 303 | """ 304 | ).fetchall() 305 | assert [ 306 | ("Yolo", 41.0), 307 | ("Yolo", 8.0), 308 | ("Yolo", 398.0), 309 | ("Yolo", 460.0), 310 | ("Yolo", 572.0), 311 | ("Yolo", 291.0), 312 | ] == results 313 | for city, votes in results: 314 | assert isinstance(city, text_type) 315 | assert isinstance(votes, float) 316 | 317 | 318 | def test_filename_column(): 319 | runner = CliRunner() 320 | with runner.isolated_filesystem(): 321 | open("test1.csv", "w").write(CSV) 322 | open("test2.csv", "w").write(CSV_MULTI) 323 | result = runner.invoke( 324 | cli.cli, [".", "test-filename.db", "--filename-column", "source"] 325 | ) 326 | assert result.exit_code == 0 327 | conn = sqlite3.connect("test-filename.db") 328 | assert [("./test1",), ("./test2",)] == conn.execute( 329 | "select name from sqlite_master order by name" 330 | ).fetchall() 331 | # Check the source column has been added and populated 332 | assert [("Yolo", "Gary Johnson", 41, "./test1")] == conn.execute( 333 | "select county, candidate, votes, source from [./test1] limit 1" 334 | ).fetchall() 335 | assert [ 336 | ("The Rock", "Sean Connery", "Nicolas Cage", "./test2") 337 | ] == conn.execute( 338 | "select film, actor_1, actor_2, source from [./test2] limit 1" 339 | ).fetchall() 340 | 341 | 342 | def test_filename_column_with_shape(): 343 | runner = CliRunner() 344 | with runner.isolated_filesystem(): 345 | open("test.csv", "w").write(CSV) 346 | result = runner.invoke( 347 | cli.cli, 348 | [ 349 | "test.csv", 350 | "test.db", 351 | "--filename-column", 352 | "source", 353 | "--shape", 354 | "county:Cty,votes:Vts", 355 | ], 356 | ) 357 | assert result.exit_code == 0 358 | conn = sqlite3.connect("test.db") 359 | assert [("Yolo", 41, "test")] == conn.execute( 360 | "select Cty, Vts, source from test limit 1" 361 | ).fetchall() 362 | 363 | 364 | def test_fixed_column(): 365 | """ 366 | Tests that all three fixed_column options are handled correctly. 
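    Covers --fixed-column (text), --fixed-column-int and --fixed-column-float.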
367 | """ 368 | runner = CliRunner() 369 | with runner.isolated_filesystem(): 370 | open("test.csv", "w").write(CSV) 371 | result = runner.invoke( 372 | cli.cli, 373 | [ 374 | "test.csv", 375 | "test.db", 376 | "--fixed-column", 377 | "col1", 378 | "foo", 379 | "--fixed-column", 380 | "col2", 381 | "bar", 382 | "--fixed-column-int", 383 | "col3", 384 | "1", 385 | "--fixed-column-float", 386 | "col4", 387 | "1.1", 388 | ], 389 | ) 390 | assert result.exit_code == 0 391 | assert result.output.strip().endswith("Created test.db from 1 CSV file") 392 | conn = sqlite3.connect("test.db") 393 | assert [ 394 | (0, "county", "TEXT", 0, None, 0), 395 | (1, "precinct", "INTEGER", 0, None, 0), 396 | (2, "office", "TEXT", 0, None, 0), 397 | (3, "district", "INTEGER", 0, None, 0), 398 | (4, "party", "TEXT", 0, None, 0), 399 | (5, "candidate", "TEXT", 0, None, 0), 400 | (6, "votes", "INTEGER", 0, None, 0), 401 | (7, "col1", "TEXT", 0, None, 0), 402 | (8, "col2", "TEXT", 0, None, 0), 403 | (9, "col3", "INTEGER", 0, None, 0), 404 | (10, "col4", "REAL", 0, None, 0), 405 | ] == list(conn.execute("PRAGMA table_info(test)")) 406 | rows = conn.execute("select * from test").fetchall() 407 | assert [ 408 | ( 409 | "Yolo", 410 | 100001, 411 | "President", 412 | None, 413 | "LIB", 414 | "Gary Johnson", 415 | 41, 416 | "foo", 417 | "bar", 418 | 1, 419 | 1.1, 420 | ), 421 | ( 422 | "Yolo", 423 | 100001, 424 | "President", 425 | None, 426 | "PAF", 427 | "Gloria Estela La Riva", 428 | 8, 429 | "foo", 430 | "bar", 431 | 1, 432 | 1.1, 433 | ), 434 | ( 435 | "Yolo", 436 | 100001, 437 | "Proposition 51", 438 | None, 439 | None, 440 | "No", 441 | 398, 442 | "foo", 443 | "bar", 444 | 1, 445 | 1.1, 446 | ), 447 | ( 448 | "Yolo", 449 | 100001, 450 | "Proposition 51", 451 | None, 452 | None, 453 | "Yes", 454 | 460, 455 | "foo", 456 | "bar", 457 | 1, 458 | 1.1, 459 | ), 460 | ( 461 | "Yolo", 462 | 100001, 463 | "State Assembly", 464 | 7, 465 | "DEM", 466 | "Kevin McCarty", 467 | 572, 468 | "foo", 469 | "bar", 470 | 1, 471 | 1.1, 472 | ), 473 | ( 474 | "Yolo", 475 | 100001, 476 | "State Assembly", 477 | 7, 478 | "REP", 479 | "Ryan K. Brown", 480 | 291, 481 | "foo", 482 | "bar", 483 | 1, 484 | 1.1, 485 | ), 486 | ] == rows 487 | 488 | 489 | def test_fixed_column_with_shape(): 490 | """ 491 | Test that fixed_column works with shape. 
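    The fixed columns should come through alongside the reshaped Cty and Vts columns.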
492 | """ 493 | runner = CliRunner() 494 | with runner.isolated_filesystem(): 495 | open("test.csv", "w").write(CSV) 496 | result = runner.invoke( 497 | cli.cli, 498 | [ 499 | "test.csv", 500 | "test.db", 501 | "--fixed-column", 502 | "col1", 503 | "foo", 504 | "--fixed-column", 505 | "col2", 506 | "bar", 507 | "--shape", 508 | "county:Cty,votes:Vts", 509 | ], 510 | ) 511 | assert result.exit_code == 0 512 | conn = sqlite3.connect("test.db") 513 | assert [("Yolo", 41, "foo", "bar")] == conn.execute( 514 | "select Cty, Vts, col1, col2 from test limit 1" 515 | ).fetchall() 516 | 517 | 518 | def test_shape_with_extract_columns(): 519 | runner = CliRunner() 520 | with runner.isolated_filesystem(): 521 | open("test.csv", "w").write(CSV) 522 | result = runner.invoke( 523 | cli.cli, 524 | [ 525 | "test.csv", 526 | "test.db", 527 | "--filename-column", 528 | "Source", 529 | "--shape", 530 | "county:Cty,votes:Vts", 531 | "-c", 532 | "Cty", 533 | "-c", 534 | "Vts", 535 | "-c", 536 | "Source", 537 | ], 538 | ) 539 | assert result.exit_code == 0 540 | conn = sqlite3.connect("test.db") 541 | assert ( 542 | [("Yolo", "41", "test")] 543 | == conn.execute( 544 | """ 545 | select 546 | Cty.value, Vts.value, Source.value 547 | from test 548 | left join Cty on test.Cty = Cty.id 549 | left join Vts on test.Vts = Vts.id 550 | left join Source on test.Source = Source.id 551 | limit 1 552 | """ 553 | ).fetchall() 554 | ) 555 | 556 | 557 | def test_custom_indexes(): 558 | runner = CliRunner() 559 | with runner.isolated_filesystem(): 560 | open("test.csv", "w").write(CSV) 561 | result = runner.invoke( 562 | cli.cli, 563 | ["test.csv", "test.db", "--index", "county", "-i", "party,candidate"], 564 | ) 565 | assert result.exit_code == 0 566 | conn = sqlite3.connect("test.db") 567 | assert [ 568 | ('"test_county"', "test"), 569 | ('"test_party_candidate"', "test"), 570 | ] == conn.execute( 571 | 'select name, tbl_name from sqlite_master where type = "index" order by name' 572 | ).fetchall() 573 | 574 | 575 | def test_dates_and_datetimes(): 576 | runner = CliRunner() 577 | with runner.isolated_filesystem(): 578 | open("test.csv", "w").write(CSV_DATES) 579 | result = runner.invoke( 580 | cli.cli, ["test.csv", "test.db", "-d", "date", "-dt", "datetime"] 581 | ) 582 | assert result.exit_code == 0 583 | conn = sqlite3.connect("test.db") 584 | expected = [ 585 | ("First headline", "2017-05-03", "1938-04-04T22:00:00"), 586 | ("Second headline", "2005-04-30", "2009-12-10T05:45:00"), 587 | ] 588 | actual = conn.execute("select * from test").fetchall() 589 | assert expected == actual 590 | 591 | 592 | def test_dates_custom_formats(): 593 | runner = CliRunner() 594 | with runner.isolated_filesystem(): 595 | open("test.csv", "w").write(CSV_DATES_CUSTOM_FORMAT) 596 | result = runner.invoke( 597 | cli.cli, ["test.csv", "test.db", "-d", "date", "-df", "%y/%d/%m"] 598 | ) 599 | assert result.exit_code == 0 600 | conn = sqlite3.connect("test.db") 601 | # Input was 03/02/01 602 | expected = [("Custom format", "2003-01-02")] 603 | actual = conn.execute("select * from test").fetchall() 604 | assert expected == actual 605 | 606 | 607 | def test_extract_cols_no_fts(): 608 | runner = CliRunner() 609 | with runner.isolated_filesystem(): 610 | open("test.csv", "w").write(CSV) 611 | result = runner.invoke( 612 | cli.cli, 613 | ( 614 | "test.csv fts-extracted.db -c office -c party -c candidate " 615 | "-f party -f candidate --no-fulltext-fks" 616 | ).split(), 617 | ) 618 | assert result.exit_code == 0 619 | conn = 
sqlite3.connect("fts-extracted.db") 620 | assert ( 621 | [("test_fts",)] 622 | == conn.execute( 623 | """ 624 | select name from sqlite_master 625 | where type='table' and name like '%_fts' 626 | and sql like '%USING FTS%' 627 | """ 628 | ).fetchall() 629 | ) 630 | 631 | 632 | def test_custom_primary_keys(): 633 | runner = CliRunner() 634 | with runner.isolated_filesystem(): 635 | open("pks.csv", "w").write(CSV_CUSTOM_PRIMARY_KEYS) 636 | result = runner.invoke( 637 | cli.cli, ("pks.csv pks.db -pk pk1 --primary-key pk2").split() 638 | ) 639 | assert result.exit_code == 0 640 | conn = sqlite3.connect("pks.db") 641 | pks = [ 642 | r[1] for r in conn.execute('PRAGMA table_info("pks")').fetchall() if r[-1] 643 | ] 644 | assert ["pk1", "pk2"] == pks 645 | 646 | 647 | def test_just_strings_default(): 648 | """ 649 | Just like test_flat(), except all columns are strings 650 | """ 651 | runner = CliRunner() 652 | with runner.isolated_filesystem(): 653 | open("test.csv", "w").write(CSV) 654 | result = runner.invoke( 655 | cli.cli, 656 | "test.csv just-strings.db --just-strings".split(), 657 | ) 658 | assert result.exit_code == 0 659 | 660 | conn = sqlite3.connect("just-strings.db") 661 | assert [ 662 | (0, "county", "TEXT", 0, None, 0), 663 | (1, "precinct", "TEXT", 0, None, 0), 664 | (2, "office", "TEXT", 0, None, 0), 665 | (3, "district", "TEXT", 0, None, 0), 666 | (4, "party", "TEXT", 0, None, 0), 667 | (5, "candidate", "TEXT", 0, None, 0), 668 | (6, "votes", "TEXT", 0, None, 0), 669 | ] == list(conn.execute("PRAGMA table_info(test)")) 670 | rows = conn.execute("select * from test").fetchall() 671 | assert [ 672 | ("Yolo", "100001", "President", None, "LIB", "Gary Johnson", "41"), 673 | ("Yolo", "100001", "President", None, "PAF", "Gloria Estela La Riva", "8"), 674 | ("Yolo", "100001", "Proposition 51", None, None, "No", "398"), 675 | ("Yolo", "100001", "Proposition 51", None, None, "Yes", "460"), 676 | ("Yolo", "100001", "State Assembly", "7", "DEM", "Kevin McCarty", "572"), 677 | ("Yolo", "100001", "State Assembly", "7", "REP", "Ryan K. 
Brown", "291"), 678 | ] == rows 679 | last_row = rows[-1] 680 | for i, t in enumerate( 681 | ( 682 | string_types, 683 | string_types, 684 | string_types, 685 | string_types, 686 | string_types, 687 | string_types, 688 | string_types, 689 | ) 690 | ): 691 | assert isinstance(last_row[i], t) 692 | 693 | 694 | def test_just_strings_with_shape(): 695 | """ 696 | Make sure shape and just_strings play well together 697 | """ 698 | runner = CliRunner() 699 | with runner.isolated_filesystem(): 700 | open("test.csv", "w").write(CSV) 701 | result = runner.invoke( 702 | cli.cli, 703 | [ 704 | "test.csv", 705 | "test-reshaped-strings.db", 706 | "--just-strings", 707 | "--shape", 708 | "county:Cty,district:district,votes:Vts(REAL)", 709 | ], 710 | ) 711 | assert result.exit_code == 0 712 | conn = sqlite3.connect("test-reshaped-strings.db") 713 | # Check that Cty, Vts exist as defined, and so does votetxt: 714 | assert [ 715 | (0, "Cty", "TEXT", 0, None, 0), 716 | (1, "district", "TEXT", 0, None, 0), 717 | (2, "Vts", "REAL", 0, None, 0), 718 | ] == conn.execute("PRAGMA table_info(test);").fetchall() 719 | # Now check that values are as expected: 720 | results = conn.execute( 721 | """ 722 | select Cty, Vts, district from test 723 | """ 724 | ).fetchall() 725 | assert [ 726 | ("Yolo", 41.0, None), 727 | ("Yolo", 8.0, None), 728 | ("Yolo", 398.0, None), 729 | ("Yolo", 460.0, None), 730 | ("Yolo", 572.0, "7"), 731 | ("Yolo", 291.0, "7"), 732 | ] == results 733 | for city, votes, district in results: 734 | assert isinstance(city, text_type) 735 | assert isinstance(votes, float) 736 | assert isinstance(district, text_type) or district is None 737 | 738 | 739 | def test_just_strings_with_date_specified(): 740 | runner = CliRunner() 741 | with runner.isolated_filesystem(): 742 | open("nic_cages_greatest.csv", "w").write(CSV_STRINGS_AND_DATES) 743 | result = runner.invoke( 744 | cli.cli, 745 | [ 746 | "nic_cages_greatest.csv", 747 | "movies.db", 748 | "--date", 749 | "release_date", 750 | "--datetime-format", 751 | "%d of %B in the year %Y", 752 | "--just-strings", 753 | ], 754 | ) 755 | assert result.exit_code == 0 756 | conn = sqlite3.connect("movies.db") 757 | expected = [ 758 | ("Adaptation", "22.5", "2002-12-06"), 759 | ("Face/Off", "245.7", "1997-06-19"), 760 | ("The Rock", "134.1", "1996-06-09"), 761 | ] 762 | actual = conn.execute("select * from nic_cages_greatest").fetchall() 763 | assert expected == actual 764 | 765 | for name, gross, dt in actual: 766 | assert isinstance(gross, text_type) 767 | 768 | 769 | def test_if_cog_needs_to_be_run(): 770 | _stdout = sys.stdout 771 | sys.stdout = StringIO() 772 | readme = pathlib.Path(__file__).parent.parent / "README.md" 773 | result = Cog().main(["cog", str(readme)]) 774 | output = sys.stdout.getvalue() 775 | sys.stdout = _stdout 776 | assert ( 777 | output == readme.read_text() 778 | ), "Run 'cog -r README.md' to update help in README" 779 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from csvs_to_sqlite import utils 2 | import pytest 3 | import sqlite3 4 | import pandas as pd 5 | 6 | TEST_TABLES = """ 7 | CREATE TABLE foo ( 8 | id integer primary key, 9 | value text 10 | ); 11 | """ 12 | 13 | 14 | @pytest.mark.parametrize("table,expected", [("foo", True), ("bar", False)]) 15 | def test_table_exists(table, expected): 16 | conn = sqlite3.connect(":memory:") 17 | conn.executescript(TEST_TABLES) 18 | assert expected == 
utils.table_exists(conn, table) 19 | 20 | 21 | def test_get_create_table_sql(): 22 | df = pd.DataFrame([{"number": 1, "letter": "a"}]) 23 | sql, columns = utils.get_create_table_sql("hello", df) 24 | assert ( 25 | 'CREATE TABLE "hello" (\n' 26 | '"index" INTEGER,\n' 27 | ' "number" INTEGER,\n' 28 | ' "letter" TEXT\n' 29 | ")" 30 | ) == sql 31 | assert {"index", "letter", "number"} == set(columns) 32 | 33 | 34 | def test_refactor_dataframes(): 35 | df = pd.DataFrame( 36 | [ 37 | {"name": "Terry", "score": 0.5}, 38 | {"name": "Terry", "score": 0.8}, 39 | {"name": "Owen", "score": 0.7}, 40 | ] 41 | ) 42 | conn = sqlite3.connect(":memory:") 43 | output = utils.refactor_dataframes( 44 | conn, [df], {"name": ("People", "first_name")}, False 45 | ) 46 | assert 1 == len(output) 47 | dataframe = output[0] 48 | # There should be a 'People' table in sqlite 49 | assert [(1, "Terry"), (2, "Owen")] == conn.execute( 50 | "select id, first_name from People" 51 | ).fetchall() 52 | assert ( 53 | " name score\n" "0 1 0.5\n" "1 1 0.8\n" "2 2 0.7" 54 | ) == str(dataframe) 55 | --------------------------------------------------------------------------------
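A minimal sketch of the --shape helpers defined in csvs_to_sqlite/utils.py, assuming the package and its dependencies are installed; the sample row is illustrative, not taken from the test fixtures:

    import pandas as pd
    from csvs_to_sqlite import utils

    df = pd.DataFrame([{"county": "Yolo", "votes": 41, "office": "President"}])
    # apply_shape() mutates df in place and returns the dtype= overrides for to_sql()
    dtype = utils.apply_shape(df, "county:Cty,votes:Vts(REAL)")
    print(list(df.columns))  # ['Cty', 'Vts'] - 'office' dropped, the rest renamed
    print(dtype)             # {'Vts': 'REAL'}

This mirrors what the CLI does when --shape is passed: the reshaped DataFrame is written out with to_sql(), using the returned dictionary as its dtype= argument.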