├── .bumpversion.cfg ├── .github ├── FUNDING.yml ├── dependabot.yml └── workflows │ └── build.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE.txt ├── Makefile ├── README.md ├── dataset ├── __init__.py ├── chunked.py ├── database.py ├── table.py ├── types.py └── util.py ├── docs ├── .gitignore ├── Makefile ├── _static │ ├── dataset-logo-dark.png │ ├── dataset-logo-light.png │ ├── dataset-logo.png │ └── knight_mozilla_on.jpg ├── api.rst ├── conf.py ├── index.rst ├── install.rst ├── queries.rst ├── quickstart.rst └── requirements.txt ├── setup.cfg ├── setup.py └── test ├── __init__.py ├── sample_data.py └── test_dataset.py /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 1.6.2 3 | tag_name = {new_version} 4 | commit = True 5 | tag = True 6 | 7 | [bumpversion:file:setup.py] 8 | search = version="{current_version}" 9 | replace = version="{new_version}" 10 | 11 | [bumpversion:file:dataset/__init__.py] 12 | 13 | [bumpversion:file:docs/conf.py] 14 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: pudo 4 | open_collective: pudo 5 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: pip 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | time: "04:00" 8 | open-pull-requests-limit: 100 9 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: [push] 4 | 5 | jobs: 6 | python: 7 | runs-on: ubuntu-latest 8 | 9 | services: 10 | postgres: 11 | image: postgres 12 | env: 13 | POSTGRES_USER: postgres 14 | POSTGRES_PASSWORD: postgres 15 | POSTGRES_DB: dataset 16 | ports: 17 | - 5432/tcp 18 | options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 19 | mysql: 20 | image: mysql 21 | env: 22 | MYSQL_USER: mysql 23 | MYSQL_PASSWORD: mysql 24 | MYSQL_DATABASE: dataset 25 | MYSQL_ROOT_PASSWORD: mysql 26 | ports: 27 | - 3306/tcp 28 | options: --health-cmd="mysqladmin ping" --health-interval=5s --health-timeout=2s --health-retries=3 29 | 30 | steps: 31 | - uses: actions/checkout@v1 32 | - name: Show ref 33 | run: | 34 | echo "$GITHUB_REF" 35 | - name: Set up Python 36 | uses: actions/setup-python@v1 37 | with: 38 | python-version: "3.x" 39 | - name: Install dependencies 40 | env: 41 | DEBIAN_FRONTEND: noninteractive 42 | run: | 43 | sudo apt-get -qq update 44 | pip install -e ".[dev]" 45 | - name: Run SQLite tests 46 | env: 47 | DATABASE_URL: "sqlite:///:memory:" 48 | run: | 49 | make test 50 | - name: Run PostgreSQL tests 51 | env: 52 | DATABASE_URL: "postgresql://postgres:postgres@127.0.0.1:${{ job.services.postgres.ports[5432] }}/dataset" 53 | run: | 54 | make test 55 | - name: Run mysql tests 56 | env: 57 | DATABASE_URL: "mysql+pymysql://mysql:mysql@127.0.0.1:${{ job.services.mysql.ports[3306] }}/dataset?charset=utf8" 58 | run: | 59 | make test 60 | - name: Run flake8 to lint 61 | run: | 62 | flake8 --ignore=E501,E123,E124,E126,E127,E128 dataset 63 | - name: Build a distribution 64 | run: | 65 | python setup.py sdist bdist_wheel 66 | - name: Publish a Python distribution to PyPI 67 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') 68 | uses: pypa/gh-action-pypi-publish@master 69 | with: 70 | user: __token__ 71 | password: ${{ secrets.pypi_password }} 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.egg-info 3 | *.egg 4 | .eggs/ 5 | dist/* 6 | .tox/* 7 | .vscode/* 8 | build/* 9 | .DS_Store 10 | .watchr 11 | .coverage 12 | htmlcov/ 13 | 14 | *.pyo 15 | env3/* 16 | env/* 17 | Test.yaml 18 | Freezefile.yaml 19 | :memory: 20 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # dataset ChangeLog 2 | 3 | *The changelog has only been started with version 0.3.12, previous 4 | changes must be reconstructed from revision history.* 5 | 6 | * 1.2.0: Add support for views, multiple comparison operators. 7 | Remove support for Python 2. 8 | * 1.1.0: Introduce `types` system to shortcut for SQLA types. 9 | * 1.0.0: Massive re-factor and code cleanup. 10 | * 0.6.0: Remove sqlite_datetime_fix for automatic int-casting of dates, 11 | make table['foo', 'bar'] an alias for table.distinct('foo', 'bar'), 12 | check validity of column and table names more thoroughly, rename 13 | reflectMetadata constructor argument to reflect_metadata, fix 14 | ResultIter to not leave queries open (so you can update in a loop). 15 | * 0.5.7: dataset Databases can now have customized row types. This allows, 16 | for example, information to be retrieved in attribute-accessible dict 17 | subclasses, such as stuf. 18 | * 0.5.4: Context manager for transactions, thanks to @victorkashirin. 19 | * 0.5.1: Fix a regression where empty queries would raise an exception. 20 | * 0.5: Improve overall code quality and testing, including Travis CI. 21 | An advanced __getitem__ syntax which allowed for the specification 22 | of primary keys when getting a table was dropped. 23 | DDL is no longer run against a transaction, but the base connection. 24 | * 0.4: Python 3 support and switch to alembic for migrations. 25 | * 0.3.15: Fixes to update and insertion of data, thanks to @cli248 26 | and @abhinav-upadhyay. 27 | * 0.3.14: dataset went viral somehow. Thanks to @gtsafas for 28 | refactorings, @alasdairnicol for fixing the Freezfile example in 29 | the documentation. @diegoguimaraes fixed the behaviour of insert to 30 | return the newly-created primary key ID. table.find_one() now 31 | returns a dict, not an SQLAlchemy ResultProxy. Slugs are now generated 32 | using the Python-Slugify package, removing slug code from dataset. 33 | * 0.3.13: Fixed logging, added support for transformations on result 34 | rows to support slug generation in output (#28). 35 | * 0.3.12: Makes table primary key's types and names configurable, fixing 36 | #19. Contributed by @dnatag. 37 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013, Open Knowledge Foundation, Friedrich Lindenberg, 2 | Gregor Aisch 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a 5 | copy of this software and associated documentation files (the 6 | "Software"), to deal in the Software without restriction, including 7 | without limitation the rights to use, copy, modify, merge, publish, 8 | distribute, sublicense, and/or sell copies of the Software, and to 9 | permit persons to whom the Software is furnished to do so, subject to 10 | the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included 13 | in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 16 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 18 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 20 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: clean test dists 3 | 4 | .PHONY: test 5 | test: 6 | pytest 7 | 8 | dists: 9 | python setup.py sdist 10 | python setup.py bdist_wheel 11 | 12 | release: dists 13 | pip install -q twine 14 | twine upload dist/* 15 | 16 | .PHONY: clean 17 | clean: 18 | rm -rf dist build .eggs 19 | find . -name '*.egg-info' -exec rm -fr {} + 20 | find . -name '*.egg' -exec rm -f {} + 21 | find . -name '*.pyc' -exec rm -f {} + 22 | find . -name '*.pyo' -exec rm -f {} + 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | dataset: databases for lazy people 2 | ================================== 3 | 4 | ![build](https://github.com/pudo/dataset/workflows/build/badge.svg) 5 | 6 | In short, **dataset** makes reading and writing data in databases as simple as reading and writing JSON files. 7 | 8 | [Read the docs](https://dataset.readthedocs.io/) 9 | 10 | To install dataset, fetch it with ``pip``: 11 | 12 | ```bash 13 | $ pip install dataset 14 | ``` 15 | 16 | **Note:** as of version 1.0, **dataset** is split into two packages, with the 17 | data export features now extracted into a stand-alone package, **datafreeze**. 18 | See the relevant repository [here](https://github.com/pudo/datafreeze). 19 | -------------------------------------------------------------------------------- /dataset/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import warnings 3 | from dataset.database import Database 4 | from dataset.table import Table 5 | from dataset.util import row_type 6 | 7 | # shut up useless SA warning: 8 | warnings.filterwarnings("ignore", "Unicode type received non-unicode bind param value.") 9 | warnings.filterwarnings( 10 | "ignore", "Skipping unsupported ALTER for creation of implicit constraint" 11 | ) 12 | 13 | __all__ = ["Database", "Table", "connect"] 14 | __version__ = "1.6.2" 15 | 16 | 17 | def connect( 18 | url=None, 19 | schema=None, 20 | engine_kwargs=None, 21 | ensure_schema=True, 22 | row_type=row_type, 23 | sqlite_wal_mode=True, 24 | on_connect_statements=None, 25 | ): 26 | """Opens a new connection to a database. 27 | 28 | *url* can be any valid `SQLAlchemy engine URL`_. If *url* is not defined 29 | it will try to use *DATABASE_URL* from environment variable. Returns an 30 | instance of :py:class:`Database `. Additionally, 31 | *engine_kwargs* will be directly passed to SQLAlchemy, e.g. set 32 | *engine_kwargs={'pool_recycle': 3600}* will avoid `DB connection timeout`_. 33 | Set *row_type* to an alternate dict-like class to change the type of 34 | container rows are stored in.:: 35 | 36 | db = dataset.connect('sqlite:///factbook.db') 37 | 38 | One of the main features of `dataset` is to automatically create tables and 39 | columns as data is inserted. This behaviour can optionally be disabled via 40 | the `ensure_schema` argument. It can also be overridden in a lot of the 41 | data manipulation methods using the `ensure` flag. 42 | 43 | If you want to run custom SQLite pragmas on database connect, you can add them 44 | to on_connect_statements as a set of strings. You can view a full 45 | `list of PRAGMAs here`_. 46 | 47 | .. _SQLAlchemy Engine URL: http://docs.sqlalchemy.org/en/latest/core/engines.html#sqlalchemy.create_engine 48 | .. _DB connection timeout: http://docs.sqlalchemy.org/en/latest/core/pooling.html#setting-pool-recycle 49 | .. _list of PRAGMAs here: https://www.sqlite.org/pragma.html 50 | """ 51 | if url is None: 52 | url = os.environ.get("DATABASE_URL", "sqlite://") 53 | 54 | return Database( 55 | url, 56 | schema=schema, 57 | engine_kwargs=engine_kwargs, 58 | ensure_schema=ensure_schema, 59 | row_type=row_type, 60 | sqlite_wal_mode=sqlite_wal_mode, 61 | on_connect_statements=on_connect_statements, 62 | ) 63 | -------------------------------------------------------------------------------- /dataset/chunked.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | 3 | 4 | class InvalidCallback(ValueError): 5 | pass 6 | 7 | 8 | class _Chunker(object): 9 | def __init__(self, table, chunksize, callback): 10 | self.queue = [] 11 | self.table = table 12 | self.chunksize = chunksize 13 | if callback and not callable(callback): 14 | raise InvalidCallback 15 | self.callback = callback 16 | 17 | def flush(self): 18 | self.queue.clear() 19 | 20 | def _queue_add(self, item): 21 | self.queue.append(item) 22 | if len(self.queue) >= self.chunksize: 23 | self.flush() 24 | 25 | def __enter__(self): 26 | return self 27 | 28 | def __exit__(self, exc_type, exc_val, exc_tb): 29 | self.flush() 30 | 31 | 32 | class ChunkedInsert(_Chunker): 33 | """Batch up insert operations 34 | with ChunkedInsert(my_table) as inserter: 35 | inserter(row) 36 | 37 | Rows will be inserted in groups of `chunksize` (defaulting to 1000). An 38 | optional callback can be provided that will be called before the insert. 39 | This callback takes one parameter which is the queue which is about to be 40 | inserted into the database 41 | """ 42 | 43 | def __init__(self, table, chunksize=1000, callback=None): 44 | self.fields = set() 45 | super().__init__(table, chunksize, callback) 46 | 47 | def insert(self, item): 48 | self.fields.update(item.keys()) 49 | super()._queue_add(item) 50 | 51 | def flush(self): 52 | for item in self.queue: 53 | for field in self.fields: 54 | item[field] = item.get(field) 55 | if self.callback is not None: 56 | self.callback(self.queue) 57 | self.table.insert_many(self.queue) 58 | super().flush() 59 | 60 | 61 | class ChunkedUpdate(_Chunker): 62 | """Batch up update operations 63 | with ChunkedUpdate(my_table) as updater: 64 | updater(row) 65 | 66 | Rows will be updated in groups of `chunksize` (defaulting to 1000). An 67 | optional callback can be provided that will be called before the update. 68 | This callback takes one parameter which is the queue which is about to be 69 | updated into the database 70 | """ 71 | 72 | def __init__(self, table, keys, chunksize=1000, callback=None): 73 | self.keys = keys 74 | super().__init__(table, chunksize, callback) 75 | 76 | def update(self, item): 77 | super()._queue_add(item) 78 | 79 | def flush(self): 80 | if self.callback is not None: 81 | self.callback(self.queue) 82 | self.queue.sort(key=dict.keys) 83 | for fields, items in itertools.groupby(self.queue, key=dict.keys): 84 | self.table.update_many(list(items), self.keys) 85 | super().flush() 86 | -------------------------------------------------------------------------------- /dataset/database.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import threading 3 | from urllib.parse import parse_qs, urlparse 4 | 5 | from sqlalchemy import create_engine, inspect 6 | from sqlalchemy.sql import text 7 | from sqlalchemy.schema import MetaData 8 | from sqlalchemy.util import safe_reraise 9 | from sqlalchemy import event 10 | 11 | from alembic.migration import MigrationContext 12 | from alembic.operations import Operations 13 | 14 | from dataset.table import Table 15 | from dataset.util import ResultIter, row_type, safe_url, QUERY_STEP 16 | from dataset.util import normalize_table_name 17 | from dataset.types import Types 18 | 19 | log = logging.getLogger(__name__) 20 | 21 | 22 | class Database(object): 23 | """A database object represents a SQL database with multiple tables.""" 24 | 25 | def __init__( 26 | self, 27 | url, 28 | schema=None, 29 | engine_kwargs=None, 30 | ensure_schema=True, 31 | row_type=row_type, 32 | sqlite_wal_mode=True, 33 | on_connect_statements=None, 34 | ): 35 | """Configure and connect to the database.""" 36 | if engine_kwargs is None: 37 | engine_kwargs = {} 38 | 39 | parsed_url = urlparse(url) 40 | # if parsed_url.scheme.lower() in 'sqlite': 41 | # # ref: https://github.com/pudo/dataset/issues/163 42 | # if 'poolclass' not in engine_kwargs: 43 | # engine_kwargs['poolclass'] = StaticPool 44 | 45 | self.lock = threading.RLock() 46 | self.local = threading.local() 47 | self.connections = {} 48 | 49 | if len(parsed_url.query): 50 | query = parse_qs(parsed_url.query) 51 | if schema is None: 52 | schema_qs = query.get("schema", query.get("searchpath", [])) 53 | if len(schema_qs): 54 | schema = schema_qs.pop() 55 | 56 | self.schema = schema 57 | self.engine = create_engine(url, **engine_kwargs) 58 | self.is_postgres = self.engine.dialect.name == "postgresql" 59 | self.is_sqlite = self.engine.dialect.name == "sqlite" 60 | if on_connect_statements is None: 61 | on_connect_statements = [] 62 | 63 | def _run_on_connect(dbapi_con, con_record): 64 | # reference: 65 | # https://stackoverflow.com/questions/9671490/how-to-set-sqlite-pragma-statements-with-sqlalchemy 66 | # https://stackoverflow.com/a/7831210/1890086 67 | for statement in on_connect_statements: 68 | dbapi_con.execute(statement) 69 | 70 | if self.is_sqlite and parsed_url.path != "" and sqlite_wal_mode: 71 | # we only enable WAL mode for sqlite databases that are not in-memory 72 | on_connect_statements.append("PRAGMA journal_mode=WAL") 73 | 74 | if len(on_connect_statements): 75 | event.listen(self.engine, "connect", _run_on_connect) 76 | 77 | self.types = Types(is_postgres=self.is_postgres) 78 | self.url = url 79 | self.row_type = row_type 80 | self.ensure_schema = ensure_schema 81 | self._tables = {} 82 | 83 | @property 84 | def executable(self): 85 | """Connection against which statements will be executed.""" 86 | with self.lock: 87 | tid = threading.get_ident() 88 | if tid not in self.connections: 89 | self.connections[tid] = self.engine.connect() 90 | return self.connections[tid] 91 | 92 | @property 93 | def op(self): 94 | """Get an alembic operations context.""" 95 | ctx = MigrationContext.configure(self.executable) 96 | return Operations(ctx) 97 | 98 | @property 99 | def inspect(self): 100 | """Get a SQLAlchemy inspector.""" 101 | return inspect(self.executable) 102 | 103 | def has_table(self, name): 104 | return self.inspect.has_table(name, schema=self.schema) 105 | 106 | @property 107 | def metadata(self): 108 | """Return a SQLAlchemy schema cache object.""" 109 | return MetaData(schema=self.schema, bind=self.executable) 110 | 111 | @property 112 | def in_transaction(self): 113 | """Check if this database is in a transactional context.""" 114 | if not hasattr(self.local, "tx"): 115 | return False 116 | return len(self.local.tx) > 0 117 | 118 | def _flush_tables(self): 119 | """Clear the table metadata after transaction rollbacks.""" 120 | for table in self._tables.values(): 121 | table._table = None 122 | 123 | def begin(self): 124 | """Enter a transaction explicitly. 125 | 126 | No data will be written until the transaction has been committed. 127 | """ 128 | if not hasattr(self.local, "tx"): 129 | self.local.tx = [] 130 | self.local.tx.append(self.executable.begin()) 131 | 132 | def commit(self): 133 | """Commit the current transaction. 134 | 135 | Make all statements executed since the transaction was begun permanent. 136 | """ 137 | if hasattr(self.local, "tx") and self.local.tx: 138 | tx = self.local.tx.pop() 139 | tx.commit() 140 | # Removed in 2020-12, I'm a bit worried this means that some DDL 141 | # operations in transactions won't cause metadata to refresh any 142 | # more: 143 | # self._flush_tables() 144 | 145 | def rollback(self): 146 | """Roll back the current transaction. 147 | 148 | Discard all statements executed since the transaction was begun. 149 | """ 150 | if hasattr(self.local, "tx") and self.local.tx: 151 | tx = self.local.tx.pop() 152 | tx.rollback() 153 | self._flush_tables() 154 | 155 | def __enter__(self): 156 | """Start a transaction.""" 157 | self.begin() 158 | return self 159 | 160 | def __exit__(self, error_type, error_value, traceback): 161 | """End a transaction by committing or rolling back.""" 162 | if error_type is None: 163 | try: 164 | self.commit() 165 | except Exception: 166 | with safe_reraise(): 167 | self.rollback() 168 | else: 169 | self.rollback() 170 | 171 | def close(self): 172 | """Close database connections. Makes this object unusable.""" 173 | with self.lock: 174 | for conn in self.connections.values(): 175 | conn.close() 176 | self.connections.clear() 177 | self.engine.dispose() 178 | self._tables = {} 179 | self.engine = None 180 | 181 | @property 182 | def tables(self): 183 | """Get a listing of all tables that exist in the database.""" 184 | return self.inspect.get_table_names(schema=self.schema) 185 | 186 | @property 187 | def views(self): 188 | """Get a listing of all views that exist in the database.""" 189 | return self.inspect.get_view_names(schema=self.schema) 190 | 191 | def __contains__(self, table_name): 192 | """Check if the given table name exists in the database.""" 193 | try: 194 | table_name = normalize_table_name(table_name) 195 | if table_name in self.tables: 196 | return True 197 | if table_name in self.views: 198 | return True 199 | return False 200 | except ValueError: 201 | return False 202 | 203 | def create_table( 204 | self, table_name, primary_id=None, primary_type=None, primary_increment=None 205 | ): 206 | """Create a new table. 207 | 208 | Either loads a table or creates it if it doesn't exist yet. You can 209 | define the name and type of the primary key field, if a new table is to 210 | be created. The default is to create an auto-incrementing integer, 211 | ``id``. You can also set the primary key to be a string or big integer. 212 | The caller will be responsible for the uniqueness of ``primary_id`` if 213 | it is defined as a text type. You can disable auto-increment behaviour 214 | for numeric primary keys by setting `primary_increment` to `False`. 215 | 216 | Returns a :py:class:`Table ` instance. 217 | :: 218 | 219 | table = db.create_table('population') 220 | 221 | # custom id and type 222 | table2 = db.create_table('population2', 'age') 223 | table3 = db.create_table('population3', 224 | primary_id='city', 225 | primary_type=db.types.text) 226 | # custom length of String 227 | table4 = db.create_table('population4', 228 | primary_id='city', 229 | primary_type=db.types.string(25)) 230 | # no primary key 231 | table5 = db.create_table('population5', 232 | primary_id=False) 233 | """ 234 | assert not isinstance( 235 | primary_type, str 236 | ), "Text-based primary_type support is dropped, use db.types." 237 | table_name = normalize_table_name(table_name) 238 | with self.lock: 239 | if table_name not in self._tables: 240 | self._tables[table_name] = Table( 241 | self, 242 | table_name, 243 | primary_id=primary_id, 244 | primary_type=primary_type, 245 | primary_increment=primary_increment, 246 | auto_create=True, 247 | ) 248 | return self._tables.get(table_name) 249 | 250 | def load_table(self, table_name): 251 | """Load a table. 252 | 253 | This will fail if the tables does not already exist in the database. If 254 | the table exists, its columns will be reflected and are available on 255 | the :py:class:`Table ` object. 256 | 257 | Returns a :py:class:`Table ` instance. 258 | :: 259 | 260 | table = db.load_table('population') 261 | """ 262 | table_name = normalize_table_name(table_name) 263 | with self.lock: 264 | if table_name not in self._tables: 265 | self._tables[table_name] = Table(self, table_name) 266 | return self._tables.get(table_name) 267 | 268 | def get_table( 269 | self, 270 | table_name, 271 | primary_id=None, 272 | primary_type=None, 273 | primary_increment=None, 274 | ): 275 | """Load or create a table. 276 | 277 | This is now the same as ``create_table``. 278 | :: 279 | 280 | table = db.get_table('population') 281 | # you can also use the short-hand syntax: 282 | table = db['population'] 283 | """ 284 | if not self.ensure_schema: 285 | return self.load_table(table_name) 286 | return self.create_table( 287 | table_name, primary_id, primary_type, primary_increment 288 | ) 289 | 290 | def __getitem__(self, table_name): 291 | """Get a given table.""" 292 | return self.get_table(table_name) 293 | 294 | def _ipython_key_completions_(self): 295 | """Completion for table names with IPython.""" 296 | return self.tables 297 | 298 | def query(self, query, *args, **kwargs): 299 | """Run a statement on the database directly. 300 | 301 | Allows for the execution of arbitrary read/write queries. A query can 302 | either be a plain text string, or a `SQLAlchemy expression 303 | `_. 304 | If a plain string is passed in, it will be converted to an expression 305 | automatically. 306 | 307 | Further positional and keyword arguments will be used for parameter 308 | binding. To include a positional argument in your query, use question 309 | marks in the query (i.e. ``SELECT * FROM tbl WHERE a = ?``). For 310 | keyword arguments, use a bind parameter (i.e. ``SELECT * FROM tbl 311 | WHERE a = :foo``). 312 | :: 313 | 314 | statement = 'SELECT user, COUNT(*) c FROM photos GROUP BY user' 315 | for row in db.query(statement): 316 | print(row['user'], row['c']) 317 | 318 | The returned iterator will yield each result sequentially. 319 | """ 320 | if isinstance(query, str): 321 | query = text(query) 322 | _step = kwargs.pop("_step", QUERY_STEP) 323 | if _step is False or _step == 0: 324 | _step = None 325 | rp = self.executable.execute(query, *args, **kwargs) 326 | return ResultIter(rp, row_type=self.row_type, step=_step) 327 | 328 | def __repr__(self): 329 | """Text representation contains the URL.""" 330 | return "" % safe_url(self.url) 331 | -------------------------------------------------------------------------------- /dataset/table.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import warnings 3 | import threading 4 | from banal import ensure_list 5 | 6 | from sqlalchemy import func, select, false 7 | from sqlalchemy.sql import and_, expression 8 | from sqlalchemy.sql.expression import bindparam, ClauseElement 9 | from sqlalchemy.schema import Column, Index 10 | from sqlalchemy.schema import Table as SQLATable 11 | from sqlalchemy.exc import NoSuchTableError 12 | 13 | from dataset.types import Types, MYSQL_LENGTH_TYPES 14 | from dataset.util import index_name 15 | from dataset.util import DatasetException, ResultIter, QUERY_STEP 16 | from dataset.util import normalize_table_name, pad_chunk_columns 17 | from dataset.util import normalize_column_name, normalize_column_key 18 | 19 | 20 | log = logging.getLogger(__name__) 21 | 22 | 23 | class Table(object): 24 | """Represents a table in a database and exposes common operations.""" 25 | 26 | PRIMARY_DEFAULT = "id" 27 | 28 | def __init__( 29 | self, 30 | database, 31 | table_name, 32 | primary_id=None, 33 | primary_type=None, 34 | primary_increment=None, 35 | auto_create=False, 36 | ): 37 | """Initialise the table from database schema.""" 38 | self.db = database 39 | self.name = normalize_table_name(table_name) 40 | self._table = None 41 | self._columns = None 42 | self._indexes = [] 43 | self._primary_id = ( 44 | primary_id if primary_id is not None else self.PRIMARY_DEFAULT 45 | ) 46 | self._primary_type = primary_type if primary_type is not None else Types.integer 47 | if primary_increment is None: 48 | primary_increment = self._primary_type in (Types.integer, Types.bigint) 49 | self._primary_increment = primary_increment 50 | self._auto_create = auto_create 51 | 52 | @property 53 | def exists(self): 54 | """Check to see if the table currently exists in the database.""" 55 | if self._table is not None: 56 | return True 57 | return self.name in self.db 58 | 59 | @property 60 | def table(self): 61 | """Get a reference to the table, which may be reflected or created.""" 62 | if self._table is None: 63 | self._sync_table(()) 64 | return self._table 65 | 66 | @property 67 | def _column_keys(self): 68 | """Get a dictionary of all columns and their case mapping.""" 69 | if not self.exists: 70 | return {} 71 | with self.db.lock: 72 | if self._columns is None: 73 | # Initialise the table if it doesn't exist 74 | table = self.table 75 | self._columns = {} 76 | for column in table.columns: 77 | name = normalize_column_name(column.name) 78 | key = normalize_column_key(name) 79 | if key in self._columns: 80 | log.warning("Duplicate column: %s", name) 81 | self._columns[key] = name 82 | return self._columns 83 | 84 | @property 85 | def columns(self): 86 | """Get a listing of all columns that exist in the table.""" 87 | return list(self._column_keys.values()) 88 | 89 | def has_column(self, column): 90 | """Check if a column with the given name exists on this table.""" 91 | key = normalize_column_key(normalize_column_name(column)) 92 | return key in self._column_keys 93 | 94 | def _get_column_name(self, name): 95 | """Find the best column name with case-insensitive matching.""" 96 | name = normalize_column_name(name) 97 | key = normalize_column_key(name) 98 | return self._column_keys.get(key, name) 99 | 100 | def insert(self, row, ensure=None, types=None): 101 | """Add a ``row`` dict by inserting it into the table. 102 | 103 | If ``ensure`` is set, any of the keys of the row are not 104 | table columns, they will be created automatically. 105 | 106 | During column creation, ``types`` will be checked for a key 107 | matching the name of a column to be created, and the given 108 | SQLAlchemy column type will be used. Otherwise, the type is 109 | guessed from the row value, defaulting to a simple unicode 110 | field. 111 | :: 112 | 113 | data = dict(title='I am a banana!') 114 | table.insert(data) 115 | 116 | Returns the inserted row's primary key. 117 | """ 118 | row = self._sync_columns(row, ensure, types=types) 119 | res = self.db.executable.execute(self.table.insert(row)) 120 | if len(res.inserted_primary_key) > 0: 121 | return res.inserted_primary_key[0] 122 | return True 123 | 124 | def insert_ignore(self, row, keys, ensure=None, types=None): 125 | """Add a ``row`` dict into the table if the row does not exist. 126 | 127 | If rows with matching ``keys`` exist no change is made. 128 | 129 | Setting ``ensure`` results in automatically creating missing columns, 130 | i.e., keys of the row are not table columns. 131 | 132 | During column creation, ``types`` will be checked for a key 133 | matching the name of a column to be created, and the given 134 | SQLAlchemy column type will be used. Otherwise, the type is 135 | guessed from the row value, defaulting to a simple unicode 136 | field. 137 | :: 138 | 139 | data = dict(id=10, title='I am a banana!') 140 | table.insert_ignore(data, ['id']) 141 | """ 142 | row = self._sync_columns(row, ensure, types=types) 143 | if self._check_ensure(ensure): 144 | self.create_index(keys) 145 | args, _ = self._keys_to_args(row, keys) 146 | if self.count(**args) == 0: 147 | return self.insert(row, ensure=False) 148 | return False 149 | 150 | def insert_many(self, rows, chunk_size=1000, ensure=None, types=None): 151 | """Add many rows at a time. 152 | 153 | This is significantly faster than adding them one by one. Per default 154 | the rows are processed in chunks of 1000 per commit, unless you specify 155 | a different ``chunk_size``. 156 | 157 | See :py:meth:`insert() ` for details on 158 | the other parameters. 159 | :: 160 | 161 | rows = [dict(name='Dolly')] * 10000 162 | table.insert_many(rows) 163 | """ 164 | # Sync table before inputting rows. 165 | sync_row = {} 166 | for row in rows: 167 | # Only get non-existing columns. 168 | sync_keys = list(sync_row.keys()) 169 | for key in [k for k in row.keys() if k not in sync_keys]: 170 | # Get a sample of the new column(s) from the row. 171 | sync_row[key] = row[key] 172 | self._sync_columns(sync_row, ensure, types=types) 173 | 174 | # Get columns name list to be used for padding later. 175 | columns = sync_row.keys() 176 | 177 | chunk = [] 178 | for index, row in enumerate(rows): 179 | chunk.append(row) 180 | 181 | # Insert when chunk_size is fulfilled or this is the last row 182 | if len(chunk) == chunk_size or index == len(rows) - 1: 183 | chunk = pad_chunk_columns(chunk, columns) 184 | self.table.insert().execute(chunk) 185 | chunk = [] 186 | 187 | def update(self, row, keys, ensure=None, types=None, return_count=False): 188 | """Update a row in the table. 189 | 190 | The update is managed via the set of column names stated in ``keys``: 191 | they will be used as filters for the data to be updated, using the 192 | values in ``row``. 193 | :: 194 | 195 | # update all entries with id matching 10, setting their title 196 | # columns 197 | data = dict(id=10, title='I am a banana!') 198 | table.update(data, ['id']) 199 | 200 | If keys in ``row`` update columns not present in the table, they will 201 | be created based on the settings of ``ensure`` and ``types``, matching 202 | the behavior of :py:meth:`insert() `. 203 | """ 204 | row = self._sync_columns(row, ensure, types=types) 205 | args, row = self._keys_to_args(row, keys) 206 | clause = self._args_to_clause(args) 207 | if not len(row): 208 | return self.count(clause) 209 | stmt = self.table.update(whereclause=clause, values=row) 210 | rp = self.db.executable.execute(stmt) 211 | if rp.supports_sane_rowcount(): 212 | return rp.rowcount 213 | if return_count: 214 | return self.count(clause) 215 | 216 | def update_many(self, rows, keys, chunk_size=1000, ensure=None, types=None): 217 | """Update many rows in the table at a time. 218 | 219 | This is significantly faster than updating them one by one. Per default 220 | the rows are processed in chunks of 1000 per commit, unless you specify 221 | a different ``chunk_size``. 222 | 223 | See :py:meth:`update() ` for details on 224 | the other parameters. 225 | """ 226 | keys = ensure_list(keys) 227 | 228 | chunk = [] 229 | columns = [] 230 | for index, row in enumerate(rows): 231 | columns.extend( 232 | col for col in row.keys() if (col not in columns) and (col not in keys) 233 | ) 234 | 235 | # bindparam requires names to not conflict (cannot be "id" for id) 236 | for key in keys: 237 | row["_%s" % key] = row[key] 238 | row.pop(key) 239 | chunk.append(row) 240 | 241 | # Update when chunk_size is fulfilled or this is the last row 242 | if len(chunk) == chunk_size or index == len(rows) - 1: 243 | cl = [self.table.c[k] == bindparam("_%s" % k) for k in keys] 244 | stmt = self.table.update( 245 | whereclause=and_(True, *cl), 246 | values={col: bindparam(col, required=False) for col in columns}, 247 | ) 248 | self.db.executable.execute(stmt, chunk) 249 | chunk = [] 250 | 251 | def upsert(self, row, keys, ensure=None, types=None): 252 | """An UPSERT is a smart combination of insert and update. 253 | 254 | If rows with matching ``keys`` exist they will be updated, otherwise a 255 | new row is inserted in the table. 256 | :: 257 | 258 | data = dict(id=10, title='I am a banana!') 259 | table.upsert(data, ['id']) 260 | """ 261 | row = self._sync_columns(row, ensure, types=types) 262 | if self._check_ensure(ensure): 263 | self.create_index(keys) 264 | row_count = self.update(row, keys, ensure=False, return_count=True) 265 | if row_count == 0: 266 | return self.insert(row, ensure=False) 267 | return True 268 | 269 | def upsert_many(self, rows, keys, chunk_size=1000, ensure=None, types=None): 270 | """ 271 | Sorts multiple input rows into upserts and inserts. Inserts are passed 272 | to insert and upserts are updated. 273 | 274 | See :py:meth:`upsert() ` and 275 | :py:meth:`insert_many() `. 276 | """ 277 | # Removing a bulk implementation in 5e09aba401. Doing this one by one 278 | # is incredibly slow, but doesn't run into issues with column creation. 279 | for row in rows: 280 | self.upsert(row, keys, ensure=ensure, types=types) 281 | 282 | def delete(self, *clauses, **filters): 283 | """Delete rows from the table. 284 | 285 | Keyword arguments can be used to add column-based filters. The filter 286 | criterion will always be equality: 287 | :: 288 | 289 | table.delete(place='Berlin') 290 | 291 | If no arguments are given, all records are deleted. 292 | """ 293 | if not self.exists: 294 | return False 295 | clause = self._args_to_clause(filters, clauses=clauses) 296 | stmt = self.table.delete(whereclause=clause) 297 | rp = self.db.executable.execute(stmt) 298 | return rp.rowcount > 0 299 | 300 | def _reflect_table(self): 301 | """Load the tables definition from the database.""" 302 | with self.db.lock: 303 | self._columns = None 304 | try: 305 | self._table = SQLATable( 306 | self.name, self.db.metadata, schema=self.db.schema, autoload=True 307 | ) 308 | except NoSuchTableError: 309 | self._table = None 310 | 311 | def _threading_warn(self): 312 | if self.db.in_transaction and threading.active_count() > 1: 313 | warnings.warn( 314 | "Changing the database schema inside a transaction " 315 | "in a multi-threaded environment is likely to lead " 316 | "to race conditions and synchronization issues.", 317 | RuntimeWarning, 318 | ) 319 | 320 | def _sync_table(self, columns): 321 | """Lazy load, create or adapt the table structure in the database.""" 322 | if self._table is None: 323 | # Load an existing table from the database. 324 | self._reflect_table() 325 | if self._table is None: 326 | # Create the table with an initial set of columns. 327 | if not self._auto_create: 328 | raise DatasetException("Table does not exist: %s" % self.name) 329 | # Keep the lock scope small because this is run very often. 330 | with self.db.lock: 331 | self._threading_warn() 332 | self._table = SQLATable( 333 | self.name, self.db.metadata, schema=self.db.schema 334 | ) 335 | if self._primary_id is not False: 336 | # This can go wrong on DBMS like MySQL and SQLite where 337 | # tables cannot have no columns. 338 | column = Column( 339 | self._primary_id, 340 | self._primary_type, 341 | primary_key=True, 342 | autoincrement=self._primary_increment, 343 | ) 344 | self._table.append_column(column) 345 | for column in columns: 346 | if not column.name == self._primary_id: 347 | self._table.append_column(column) 348 | self._table.create(self.db.executable, checkfirst=True) 349 | self._columns = None 350 | elif len(columns): 351 | with self.db.lock: 352 | self._reflect_table() 353 | self._threading_warn() 354 | for column in columns: 355 | if not self.has_column(column.name): 356 | self.db.op.add_column(self.name, column, schema=self.db.schema) 357 | self._reflect_table() 358 | 359 | def _sync_columns(self, row, ensure, types=None): 360 | """Create missing columns (or the table) prior to writes. 361 | 362 | If automatic schema generation is disabled (``ensure`` is ``False``), 363 | this will remove any keys from the ``row`` for which there is no 364 | matching column. 365 | """ 366 | ensure = self._check_ensure(ensure) 367 | types = types or {} 368 | types = {self._get_column_name(k): v for (k, v) in types.items()} 369 | out = {} 370 | sync_columns = {} 371 | for name, value in row.items(): 372 | name = self._get_column_name(name) 373 | if self.has_column(name): 374 | out[name] = value 375 | elif ensure: 376 | _type = types.get(name) 377 | if _type is None: 378 | _type = self.db.types.guess(value) 379 | sync_columns[name] = Column(name, _type) 380 | out[name] = value 381 | self._sync_table(sync_columns.values()) 382 | return out 383 | 384 | def _check_ensure(self, ensure): 385 | if ensure is None: 386 | return self.db.ensure_schema 387 | return ensure 388 | 389 | def _generate_clause(self, column, op, value): 390 | if op in ("like",): 391 | return self.table.c[column].like(value) 392 | if op in ("ilike",): 393 | return self.table.c[column].ilike(value) 394 | if op in ("notlike",): 395 | return self.table.c[column].notlike(value) 396 | if op in ("notilike",): 397 | return self.table.c[column].notilike(value) 398 | if op in (">", "gt"): 399 | return self.table.c[column] > value 400 | if op in ("<", "lt"): 401 | return self.table.c[column] < value 402 | if op in (">=", "gte"): 403 | return self.table.c[column] >= value 404 | if op in ("<=", "lte"): 405 | return self.table.c[column] <= value 406 | if op in ("=", "==", "is"): 407 | return self.table.c[column] == value 408 | if op in ("!=", "<>", "not"): 409 | return self.table.c[column] != value 410 | if op in ("in",): 411 | return self.table.c[column].in_(value) 412 | if op in ("notin",): 413 | return self.table.c[column].notin_(value) 414 | if op in ("between", ".."): 415 | start, end = value 416 | return self.table.c[column].between(start, end) 417 | if op in ("startswith",): 418 | return self.table.c[column].like(value + "%") 419 | if op in ("endswith",): 420 | return self.table.c[column].like("%" + value) 421 | return false() 422 | 423 | def _args_to_clause(self, args, clauses=()): 424 | clauses = list(clauses) 425 | for column, value in args.items(): 426 | column = self._get_column_name(column) 427 | if not self.has_column(column): 428 | clauses.append(false()) 429 | elif isinstance(value, (list, tuple, set)): 430 | clauses.append(self._generate_clause(column, "in", value)) 431 | elif isinstance(value, dict): 432 | for op, op_value in value.items(): 433 | clauses.append(self._generate_clause(column, op, op_value)) 434 | else: 435 | clauses.append(self._generate_clause(column, "=", value)) 436 | return and_(True, *clauses) 437 | 438 | def _args_to_order_by(self, order_by): 439 | orderings = [] 440 | for ordering in ensure_list(order_by): 441 | if ordering is None: 442 | continue 443 | column = ordering.lstrip("-") 444 | column = self._get_column_name(column) 445 | if not self.has_column(column): 446 | continue 447 | if ordering.startswith("-"): 448 | orderings.append(self.table.c[column].desc()) 449 | else: 450 | orderings.append(self.table.c[column].asc()) 451 | return orderings 452 | 453 | def _keys_to_args(self, row, keys): 454 | keys = [self._get_column_name(k) for k in ensure_list(keys)] 455 | row = row.copy() 456 | args = {k: row.pop(k, None) for k in keys} 457 | return args, row 458 | 459 | def create_column(self, name, type, **kwargs): 460 | """Create a new column ``name`` of a specified type. 461 | :: 462 | 463 | table.create_column('created_at', db.types.datetime) 464 | 465 | `type` corresponds to an SQLAlchemy type as described by 466 | `dataset.db.Types`. Additional keyword arguments are passed 467 | to the constructor of `Column`, so that default values, and 468 | options like `nullable` and `unique` can be set. 469 | :: 470 | 471 | table.create_column('key', unique=True, nullable=False) 472 | table.create_column('food', default='banana') 473 | """ 474 | name = self._get_column_name(name) 475 | if self.has_column(name): 476 | log.debug("Column exists: %s" % name) 477 | return 478 | self._sync_table((Column(name, type, **kwargs),)) 479 | 480 | def create_column_by_example(self, name, value): 481 | """ 482 | Explicitly create a new column ``name`` with a type that is appropriate 483 | to store the given example ``value``. The type is guessed in the same 484 | way as for the insert method with ``ensure=True``. 485 | :: 486 | 487 | table.create_column_by_example('length', 4.2) 488 | 489 | If a column of the same name already exists, no action is taken, even 490 | if it is not of the type we would have created. 491 | """ 492 | type_ = self.db.types.guess(value) 493 | self.create_column(name, type_) 494 | 495 | def drop_column(self, name): 496 | """ 497 | Drop the column ``name``. 498 | :: 499 | 500 | table.drop_column('created_at') 501 | 502 | """ 503 | if self.db.engine.dialect.name == "sqlite": 504 | raise RuntimeError("SQLite does not support dropping columns.") 505 | name = self._get_column_name(name) 506 | with self.db.lock: 507 | if not self.exists or not self.has_column(name): 508 | log.debug("Column does not exist: %s", name) 509 | return 510 | 511 | self._threading_warn() 512 | self.db.op.drop_column(self.table.name, name, schema=self.table.schema) 513 | self._reflect_table() 514 | 515 | def drop(self): 516 | """Drop the table from the database. 517 | 518 | Deletes both the schema and all the contents within it. 519 | """ 520 | with self.db.lock: 521 | if self.exists: 522 | self._threading_warn() 523 | self.table.drop(self.db.executable, checkfirst=True) 524 | self._table = None 525 | self._columns = None 526 | self.db._tables.pop(self.name, None) 527 | 528 | def has_index(self, columns): 529 | """Check if an index exists to cover the given ``columns``.""" 530 | if not self.exists: 531 | return False 532 | columns = set([self._get_column_name(c) for c in ensure_list(columns)]) 533 | if columns in self._indexes: 534 | return True 535 | for column in columns: 536 | if not self.has_column(column): 537 | return False 538 | indexes = self.db.inspect.get_indexes(self.name, schema=self.db.schema) 539 | for index in indexes: 540 | idx_columns = index.get("column_names", []) 541 | if len(columns.intersection(idx_columns)) == len(columns): 542 | self._indexes.append(columns) 543 | return True 544 | if self.table.primary_key is not None: 545 | pk_columns = [c.name for c in self.table.primary_key.columns] 546 | if len(columns.intersection(pk_columns)) == len(columns): 547 | self._indexes.append(columns) 548 | return True 549 | return False 550 | 551 | def create_index(self, columns, name=None, **kw): 552 | """Create an index to speed up queries on a table. 553 | 554 | If no ``name`` is given a random name is created. 555 | :: 556 | 557 | table.create_index(['name', 'country']) 558 | """ 559 | columns = [self._get_column_name(c) for c in ensure_list(columns)] 560 | with self.db.lock: 561 | if not self.exists: 562 | raise DatasetException("Table has not been created yet.") 563 | 564 | for column in columns: 565 | if not self.has_column(column): 566 | return 567 | 568 | if not self.has_index(columns): 569 | self._threading_warn() 570 | name = name or index_name(self.name, columns) 571 | columns = [self.table.c[c] for c in columns] 572 | 573 | # MySQL crashes out if you try to index very long text fields, 574 | # apparently. This defines (a somewhat random) prefix that 575 | # will be captured by the index, after which I assume the engine 576 | # conducts a more linear scan: 577 | mysql_length = {} 578 | for col in columns: 579 | if isinstance(col.type, MYSQL_LENGTH_TYPES): 580 | mysql_length[col.name] = 10 581 | kw["mysql_length"] = mysql_length 582 | 583 | idx = Index(name, *columns, **kw) 584 | idx.create(self.db.executable) 585 | 586 | def find(self, *_clauses, **kwargs): 587 | """Perform a simple search on the table. 588 | 589 | Simply pass keyword arguments as ``filter``. 590 | :: 591 | 592 | results = table.find(country='France') 593 | results = table.find(country='France', year=1980) 594 | 595 | Using ``_limit``:: 596 | 597 | # just return the first 10 rows 598 | results = table.find(country='France', _limit=10) 599 | 600 | You can sort the results by single or multiple columns. Append a minus 601 | sign to the column name for descending order:: 602 | 603 | # sort results by a column 'year' 604 | results = table.find(country='France', order_by='year') 605 | # return all rows sorted by multiple columns (descending by year) 606 | results = table.find(order_by=['country', '-year']) 607 | 608 | You can also submit filters based on criteria other than equality, 609 | see :ref:`advanced_filters` for details. 610 | 611 | To run more complex queries with JOINs, or to perform GROUP BY-style 612 | aggregation, you can also use :py:meth:`db.query() ` 613 | to run raw SQL queries instead. 614 | """ 615 | if not self.exists: 616 | return iter([]) 617 | 618 | _limit = kwargs.pop("_limit", None) 619 | _offset = kwargs.pop("_offset", 0) 620 | order_by = kwargs.pop("order_by", None) 621 | _streamed = kwargs.pop("_streamed", False) 622 | _step = kwargs.pop("_step", QUERY_STEP) 623 | if _step is False or _step == 0: 624 | _step = None 625 | 626 | order_by = self._args_to_order_by(order_by) 627 | args = self._args_to_clause(kwargs, clauses=_clauses) 628 | query = self.table.select(whereclause=args, limit=_limit, offset=_offset) 629 | if len(order_by): 630 | query = query.order_by(*order_by) 631 | 632 | conn = self.db.executable 633 | if _streamed: 634 | conn = self.db.engine.connect() 635 | conn = conn.execution_options(stream_results=True) 636 | 637 | return ResultIter(conn.execute(query), row_type=self.db.row_type, step=_step) 638 | 639 | def find_one(self, *args, **kwargs): 640 | """Get a single result from the table. 641 | 642 | Works just like :py:meth:`find() ` but returns one 643 | result, or ``None``. 644 | :: 645 | 646 | row = table.find_one(country='United States') 647 | """ 648 | if not self.exists: 649 | return None 650 | 651 | kwargs["_limit"] = 1 652 | kwargs["_step"] = None 653 | resiter = self.find(*args, **kwargs) 654 | try: 655 | for row in resiter: 656 | return row 657 | finally: 658 | resiter.close() 659 | 660 | def count(self, *_clauses, **kwargs): 661 | """Return the count of results for the given filter set.""" 662 | # NOTE: this does not have support for limit and offset since I can't 663 | # see how this is useful. Still, there might be compatibility issues 664 | # with people using these flags. Let's see how it goes. 665 | if not self.exists: 666 | return 0 667 | 668 | args = self._args_to_clause(kwargs, clauses=_clauses) 669 | query = select([func.count()], whereclause=args) 670 | query = query.select_from(self.table) 671 | rp = self.db.executable.execute(query) 672 | return rp.fetchone()[0] 673 | 674 | def __len__(self): 675 | """Return the number of rows in the table.""" 676 | return self.count() 677 | 678 | def distinct(self, *args, **kwargs): 679 | """Return all the unique (distinct) values for the given ``columns``. 680 | :: 681 | 682 | # returns only one row per year, ignoring the rest 683 | table.distinct('year') 684 | # works with multiple columns, too 685 | table.distinct('year', 'country') 686 | # you can also combine this with a filter 687 | table.distinct('year', country='China') 688 | """ 689 | if not self.exists: 690 | return iter([]) 691 | 692 | columns = [] 693 | clauses = [] 694 | for column in args: 695 | if isinstance(column, ClauseElement): 696 | clauses.append(column) 697 | else: 698 | if not self.has_column(column): 699 | raise DatasetException("No such column: %s" % column) 700 | columns.append(self.table.c[column]) 701 | 702 | _limit = kwargs.pop("_limit", None) 703 | _offset = kwargs.pop("_offset", 0) 704 | clause = self._args_to_clause(kwargs, clauses=clauses) 705 | if not len(columns): 706 | return iter([]) 707 | 708 | q = expression.select( 709 | columns, 710 | distinct=True, 711 | whereclause=clause, 712 | limit=_limit, 713 | offset=_offset, 714 | order_by=[c.asc() for c in columns], 715 | ) 716 | return self.db.query(q) 717 | 718 | # Legacy methods for running find queries. 719 | all = find 720 | 721 | def __iter__(self): 722 | """Return all rows of the table as simple dictionaries. 723 | 724 | Allows for iterating over all rows in the table without explicitly 725 | calling :py:meth:`find() `. 726 | :: 727 | 728 | for row in table: 729 | print(row) 730 | """ 731 | return self.find() 732 | 733 | def __repr__(self): 734 | """Get table representation.""" 735 | return "" % self.table.name 736 | -------------------------------------------------------------------------------- /dataset/types.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, date 2 | 3 | from sqlalchemy import Integer, UnicodeText, Float, BigInteger 4 | from sqlalchemy import String, Boolean, Date, DateTime, Unicode, JSON 5 | from sqlalchemy.dialects.postgresql import JSONB 6 | from sqlalchemy.types import TypeEngine, _Binary 7 | 8 | MYSQL_LENGTH_TYPES = (String, _Binary) 9 | 10 | 11 | class Types(object): 12 | """A holder class for easy access to SQLAlchemy type names.""" 13 | 14 | integer = Integer 15 | string = Unicode 16 | text = UnicodeText 17 | float = Float 18 | bigint = BigInteger 19 | boolean = Boolean 20 | date = Date 21 | datetime = DateTime 22 | 23 | def __init__(self, is_postgres=None): 24 | self.json = JSONB if is_postgres else JSON 25 | 26 | def guess(self, sample): 27 | """Given a single sample, guess the column type for the field. 28 | 29 | If the sample is an instance of an SQLAlchemy type, the type will be 30 | used instead. 31 | """ 32 | if isinstance(sample, TypeEngine): 33 | return sample 34 | if isinstance(sample, bool): 35 | return self.boolean 36 | elif isinstance(sample, int): 37 | return self.bigint 38 | elif isinstance(sample, float): 39 | return self.float 40 | elif isinstance(sample, datetime): 41 | return self.datetime 42 | elif isinstance(sample, date): 43 | return self.date 44 | elif isinstance(sample, dict): 45 | return self.json 46 | return self.text 47 | -------------------------------------------------------------------------------- /dataset/util.py: -------------------------------------------------------------------------------- 1 | from hashlib import sha1 2 | from urllib.parse import urlparse, urlencode 3 | from collections import OrderedDict 4 | from sqlalchemy.exc import ResourceClosedError 5 | 6 | QUERY_STEP = 1000 7 | row_type = OrderedDict 8 | 9 | try: 10 | # SQLAlchemy > 1.4.0, new row model. 11 | from sqlalchemy.engine import Row # noqa 12 | 13 | def convert_row(row_type, row): 14 | if row is None: 15 | return None 16 | return row_type(row._mapping.items()) 17 | 18 | 19 | except ImportError: 20 | # SQLAlchemy < 1.4.0, no _mapping. 21 | 22 | def convert_row(row_type, row): 23 | if row is None: 24 | return None 25 | return row_type(row.items()) 26 | 27 | 28 | class DatasetException(Exception): 29 | pass 30 | 31 | 32 | def iter_result_proxy(rp, step=None): 33 | """Iterate over the ResultProxy.""" 34 | while True: 35 | if step is None: 36 | chunk = rp.fetchall() 37 | else: 38 | chunk = rp.fetchmany(size=step) 39 | if not chunk: 40 | break 41 | for row in chunk: 42 | yield row 43 | 44 | 45 | def make_sqlite_url( 46 | path, 47 | cache=None, 48 | timeout=None, 49 | mode=None, 50 | check_same_thread=True, 51 | immutable=False, 52 | nolock=False, 53 | ): 54 | # NOTE: this PR 55 | # https://gerrit.sqlalchemy.org/c/sqlalchemy/sqlalchemy/+/1474/ 56 | # added support for URIs in SQLite 57 | # The full list of supported URIs is a combination of: 58 | # https://docs.python.org/3/library/sqlite3.html#sqlite3.connect 59 | # and 60 | # https://www.sqlite.org/uri.html 61 | params = {} 62 | if cache: 63 | assert cache in ("shared", "private") 64 | params["cache"] = cache 65 | if timeout: 66 | # Note: if timeout is None, it uses the default timeout 67 | params["timeout"] = timeout 68 | if mode: 69 | assert mode in ("ro", "rw", "rwc") 70 | params["mode"] = mode 71 | if nolock: 72 | params["nolock"] = 1 73 | if immutable: 74 | params["immutable"] = 1 75 | if not check_same_thread: 76 | params["check_same_thread"] = "false" 77 | if not params: 78 | return "sqlite:///" + path 79 | params["uri"] = "true" 80 | return "sqlite:///file:" + path + "?" + urlencode(params) 81 | 82 | 83 | class ResultIter(object): 84 | """SQLAlchemy ResultProxies are not iterable to get a 85 | list of dictionaries. This is to wrap them.""" 86 | 87 | def __init__(self, result_proxy, row_type=row_type, step=None): 88 | self.row_type = row_type 89 | self.result_proxy = result_proxy 90 | try: 91 | self.keys = list(result_proxy.keys()) 92 | self._iter = iter_result_proxy(result_proxy, step=step) 93 | except ResourceClosedError: 94 | self.keys = [] 95 | self._iter = iter([]) 96 | 97 | def __next__(self): 98 | try: 99 | return convert_row(self.row_type, next(self._iter)) 100 | except StopIteration: 101 | self.close() 102 | raise 103 | 104 | next = __next__ 105 | 106 | def __iter__(self): 107 | return self 108 | 109 | def close(self): 110 | self.result_proxy.close() 111 | 112 | 113 | def normalize_column_name(name): 114 | """Check if a string is a reasonable thing to use as a column name.""" 115 | if not isinstance(name, str): 116 | raise ValueError("%r is not a valid column name." % name) 117 | 118 | # limit to 63 characters 119 | name = name.strip()[:63] 120 | # column names can be 63 *bytes* max in postgresql 121 | if isinstance(name, str): 122 | while len(name.encode("utf-8")) >= 64: 123 | name = name[: len(name) - 1] 124 | 125 | if not len(name) or "." in name or "-" in name: 126 | raise ValueError("%r is not a valid column name." % name) 127 | return name 128 | 129 | 130 | def normalize_column_key(name): 131 | """Return a comparable column name.""" 132 | if name is None or not isinstance(name, str): 133 | return None 134 | return name.upper().strip().replace(" ", "") 135 | 136 | 137 | def normalize_table_name(name): 138 | """Check if the table name is obviously invalid.""" 139 | if not isinstance(name, str): 140 | raise ValueError("Invalid table name: %r" % name) 141 | name = name.strip()[:63] 142 | if not len(name): 143 | raise ValueError("Invalid table name: %r" % name) 144 | return name 145 | 146 | 147 | def safe_url(url): 148 | """Remove password from printed connection URLs.""" 149 | parsed = urlparse(url) 150 | if parsed.password is not None: 151 | pwd = ":%s@" % parsed.password 152 | url = url.replace(pwd, ":*****@") 153 | return url 154 | 155 | 156 | def index_name(table, columns): 157 | """Generate an artificial index name.""" 158 | sig = "||".join(columns) 159 | key = sha1(sig.encode("utf-8")).hexdigest()[:16] 160 | return "ix_%s_%s" % (table, key) 161 | 162 | 163 | def pad_chunk_columns(chunk, columns): 164 | """Given a set of items to be inserted, make sure they all have the 165 | same columns by padding columns with None if they are missing.""" 166 | for record in chunk: 167 | for column in columns: 168 | record.setdefault(column, None) 169 | return chunk 170 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/dataset.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/dataset.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/dataset" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/dataset" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/_static/dataset-logo-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pudo/dataset/b2ab09e58c6f17334e4286009b20887b6a8a8fac/docs/_static/dataset-logo-dark.png -------------------------------------------------------------------------------- /docs/_static/dataset-logo-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pudo/dataset/b2ab09e58c6f17334e4286009b20887b6a8a8fac/docs/_static/dataset-logo-light.png -------------------------------------------------------------------------------- /docs/_static/dataset-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pudo/dataset/b2ab09e58c6f17334e4286009b20887b6a8a8fac/docs/_static/dataset-logo.png -------------------------------------------------------------------------------- /docs/_static/knight_mozilla_on.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pudo/dataset/b2ab09e58c6f17334e4286009b20887b6a8a8fac/docs/_static/knight_mozilla_on.jpg -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | 2 | API documentation 3 | ================= 4 | 5 | Connecting 6 | ---------- 7 | 8 | .. autofunction:: dataset.connect 9 | 10 | Notes 11 | ----- 12 | 13 | * **dataset** uses SQLAlchemy connection pooling when connecting to the 14 | database. There is no way of explicitly clearing or shutting down the 15 | connections, other than having the dataset instance garbage collected. 16 | 17 | Database 18 | -------- 19 | 20 | .. autoclass:: dataset.Database 21 | :members: tables, get_table, create_table, load_table, query, begin, commit, rollback 22 | :special-members: 23 | 24 | 25 | Table 26 | ----- 27 | 28 | .. autoclass:: dataset.Table 29 | :members: columns, find, find_one, all, count, distinct, insert, insert_ignore, insert_many, update, update_many, upsert, upsert_many, delete, create_column, create_column_by_example, drop_column, create_index, drop, has_column, has_index 30 | :special-members: __len__, __iter__ 31 | 32 | 33 | Data Export 34 | ----------- 35 | 36 | **Note:** Data exporting has been extracted into a stand-alone package, datafreeze. See the relevant repository here_. 37 | 38 | .. _here: https://github.com/pudo/datafreeze 39 | 40 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # dataset documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Apr 1 18:41:21 2013. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | sys.path.insert(0, os.path.abspath("../")) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | # needs_sphinx = '1.0' 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 28 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.viewcode"] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ["_templates"] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = ".rst" 35 | 36 | # The master toctree document. 37 | master_doc = "index" 38 | 39 | # General information about the project. 40 | project = u"dataset" 41 | copyright = u"2013-2021, Friedrich Lindenberg, Gregor Aisch, Stefan Wehrmeyer" 42 | 43 | # The version info for the project you're documenting, acts as replacement for 44 | # |version| and |release|, also used in various other places throughout the 45 | # built documents. 46 | # 47 | # The short X.Y version. 48 | version = "1.6.2" 49 | # The full version, including alpha/beta/rc tags. 50 | release = "1.6.2" 51 | 52 | # There are two options for replacing |today|: either, you set today to some 53 | # non-false value, then it is used: 54 | # today = '' 55 | # Else, today_fmt is used as the format for a strftime call. 56 | # today_fmt = '%B %d, %Y' 57 | 58 | # List of patterns, relative to source directory, that match files and 59 | # directories to ignore when looking for source files. 60 | exclude_patterns = ["_build"] 61 | 62 | # The reST default role (used for this markup: `text`) to use for all documents. 63 | # default_role = None 64 | 65 | # If true, '()' will be appended to :func: etc. cross-reference text. 66 | # add_function_parentheses = True 67 | 68 | # If true, the current module name will be prepended to all description 69 | # unit titles (such as .. function::). 70 | # add_module_names = True 71 | 72 | # If true, sectionauthor and moduleauthor directives will be shown in the 73 | # output. They are ignored by default. 74 | # show_authors = False 75 | 76 | # The name of the Pygments (syntax highlighting) style to use. 77 | pygments_style = "sphinx" 78 | 79 | # A list of ignored prefixes for module index sorting. 80 | # modindex_common_prefix = [] 81 | 82 | # If true, keep warnings as "system message" paragraphs in the built documents. 83 | # keep_warnings = False 84 | 85 | 86 | # -- Options for HTML output --------------------------------------------------- 87 | 88 | # The theme to use for HTML and HTML Help pages. See the documentation for 89 | # a list of builtin themes. 90 | html_theme = "furo" 91 | 92 | html_static_path = ["_static"] 93 | html_theme_options = { 94 | "light_logo": "dataset-logo-light.png", 95 | "dark_logo": "dataset-logo-dark.png", 96 | } 97 | 98 | # If true, links to the reST sources are added to the pages. 99 | html_show_sourcelink = False 100 | 101 | 102 | # Output file base name for HTML help builder. 103 | htmlhelp_basename = "datasetdoc" 104 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. dataset documentation master file, created by 2 | sphinx-quickstart on Mon Apr 1 18:41:21 2013. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | dataset: databases for lazy people 7 | ================================== 8 | 9 | .. toctree:: 10 | :hidden: 11 | 12 | 13 | Although managing data in relational databases has plenty of benefits, they're 14 | rarely used in day-to-day work with small to medium scale datasets. But why is 15 | that? Why do we see an awful lot of data stored in static files in CSV or JSON 16 | format, even though they are hard to query and update incrementally? 17 | 18 | The answer is that **programmers are lazy**, and thus they tend to prefer the 19 | easiest solution they find. And in **Python**, a database isn't the simplest 20 | solution for storing a bunch of structured data. This is what **dataset** is 21 | going to change! 22 | 23 | **dataset** provides a simple abstraction layer that removes most direct SQL 24 | statements without the necessity for a full ORM model - essentially, databases 25 | can be used like a JSON file or NoSQL store. 26 | 27 | A simple data loading script using **dataset** might look like this: 28 | 29 | :: 30 | 31 | import dataset 32 | 33 | db = dataset.connect('sqlite:///:memory:') 34 | 35 | table = db['sometable'] 36 | table.insert(dict(name='John Doe', age=37)) 37 | table.insert(dict(name='Jane Doe', age=34, gender='female')) 38 | 39 | john = table.find_one(name='John Doe') 40 | 41 | 42 | Here is `similar code, without dataset `_. 43 | 44 | 45 | Features 46 | -------- 47 | 48 | * **Automatic schema**: If a table or column is written that does not 49 | exist in the database, it will be created automatically. 50 | * **Upserts**: Records are either created or updated, depending on 51 | whether an existing version can be found. 52 | * **Query helpers** for simple queries such as :py:meth:`all ` rows in a table or 53 | all :py:meth:`distinct ` values across a set of columns. 54 | * **Compatibility**: Being built on top of `SQLAlchemy `_, ``dataset`` works with all major databases, such as SQLite, PostgreSQL and MySQL. 55 | 56 | Contents 57 | -------- 58 | 59 | .. toctree:: 60 | :maxdepth: 2 61 | 62 | install 63 | quickstart 64 | api 65 | queries 66 | 67 | Contributors 68 | ------------ 69 | 70 | ``dataset`` is written and maintained by `Friedrich Lindenberg `_, 71 | `Gregor Aisch `_ and `Stefan Wehrmeyer `_. 72 | Its code is largely based on the preceding libraries `sqlaload `_ 73 | and datafreeze. And of course, we're standing on the `shoulders of giants `_. 74 | 75 | Our cute little `naked mole rat `_ was drawn by `Johannes Koch `_. 76 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | 2 | Installation Guide 3 | ================== 4 | 5 | The easiest way is to install ``dataset`` from the `Python Package Index 6 | `_ using ``pip`` or ``easy_install``: 7 | 8 | .. code-block:: bash 9 | 10 | $ pip install dataset 11 | 12 | To install it manually simply download the repository from Github: 13 | 14 | .. code-block:: bash 15 | 16 | $ git clone git://github.com/pudo/dataset.git 17 | $ cd dataset/ 18 | $ python setup.py install 19 | 20 | Depending on the type of database backend, you may also need to install a 21 | database specific driver package. For MySQL, this is ``MySQLdb``, for Postgres 22 | its ``psycopg2``. SQLite support is integrated into Python. 23 | -------------------------------------------------------------------------------- /docs/queries.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _advanced_filters: 3 | 4 | Advanced filters 5 | ================ 6 | 7 | ``dataset`` provides two methods for running queries: :py:meth:`table.find() ` 8 | and :py:meth:`db.query() `. The table find helper method provides 9 | limited, but simple filtering options:: 10 | 11 | results = table.find(column={operator: value}) 12 | # e.g.: 13 | results = table.find(name={'like': '%mole rat%'}) 14 | 15 | A special form is using keyword searches on specific columns:: 16 | 17 | results = table.find(value=5) 18 | # equal to: 19 | results = table.find(value={'=': 5}) 20 | 21 | # Lists, tuples and sets are turned into `IN` queries: 22 | results = table.find(category=('foo', 'bar')) 23 | # equal to: 24 | results = table.find(value={'in': ('foo', 'bar')}) 25 | 26 | The following comparison operators are supported: 27 | 28 | ============== ============================================================ 29 | Operator Description 30 | ============== ============================================================ 31 | gt, > Greater than 32 | lt, < Less than 33 | gte, >= Greater or equal 34 | lte, <= Less or equal 35 | !=, <>, not Not equal to a single value 36 | in Value is in the given sequence 37 | notin Value is not in the given sequence 38 | like, ilike Text search, ILIKE is case-insensitive. Use ``%`` as a wildcard 39 | notlike Like text search, except check if pattern does not exist 40 | between, .. Value is between two values in the given tuple 41 | startswith String starts with 42 | endswith String ends with 43 | ============== ============================================================ 44 | 45 | Querying for a specific value on a column that does not exist on the table 46 | will return no results. 47 | 48 | You can also pass additional SQLAlchemy clauses into the :py:meth:`table.find() ` method 49 | by falling back onto the SQLAlchemy core objects wrapped by `dataset`:: 50 | 51 | # Get the column `city` from the dataset table: 52 | column = table.table.columns.city 53 | # Define a SQLAlchemy clause: 54 | clause = column.ilike('amsterda%') 55 | # Query using the clause: 56 | results = table.find(clause) 57 | 58 | This can also be used to define combined OR clauses if needed (e.g. `city = 'Bla' OR country = 'Foo'`). 59 | 60 | Queries using raw SQL 61 | --------------------- 62 | 63 | To run more complex queries with JOINs, or to perform GROUP BY-style 64 | aggregation, you can also use :py:meth:`db.query() ` 65 | to run raw SQL queries instead. This also supports parameterisation to avoid 66 | SQL injections. 67 | 68 | Finally, you should consider falling back to SQLAlchemy_ core to construct 69 | queries if you are looking for a programmatic, composable method of generating 70 | SQL in Python. 71 | 72 | .. _SQLALchemy: https://docs.sqlalchemy.org/ -------------------------------------------------------------------------------- /docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | 2 | Quickstart 3 | ========== 4 | 5 | 6 | Hi, welcome to the twelve-minute quick-start tutorial. 7 | 8 | Connecting to a database 9 | ------------------------ 10 | 11 | At first you need to import the dataset package :) :: 12 | 13 | import dataset 14 | 15 | To connect to a database you need to identify it by its `URL `_, which basically is a string of the form ``"dialect://user:password@host/dbname"``. Here are a few examples for different database backends:: 16 | 17 | # connecting to a SQLite database 18 | db = dataset.connect('sqlite:///mydatabase.db') 19 | 20 | # connecting to a MySQL database with user and password 21 | db = dataset.connect('mysql://user:password@localhost/mydatabase') 22 | 23 | # connecting to a PostgreSQL database 24 | db = dataset.connect('postgresql://scott:tiger@localhost:5432/mydatabase') 25 | 26 | It is also possible to define the `URL` as an environment variable called `DATABASE_URL` 27 | so you can initialize database connection without explicitly passing an `URL`:: 28 | 29 | db = dataset.connect() 30 | 31 | Depending on which database you're using, you may also have to install 32 | the database bindings to support that database. SQLite is included in 33 | the Python core, but PostgreSQL requires ``psycopg2`` to be installed. 34 | MySQL can be enabled by installing the ``mysql-db`` drivers. 35 | 36 | 37 | Storing data 38 | ------------ 39 | 40 | To store some data you need to get a reference to a table. You don't need 41 | to worry about whether the table already exists or not, since dataset 42 | will create it automatically:: 43 | 44 | # get a reference to the table 'user' 45 | table = db['user'] 46 | 47 | Now storing data in a table is a matter of a single function call. Just 48 | pass a `dict`_ to *insert*. Note that you don't need to create the columns 49 | *name* and *age* – dataset will do this automatically:: 50 | 51 | # Insert a new record. 52 | table.insert(dict(name='John Doe', age=46, country='China')) 53 | 54 | # dataset will create "missing" columns any time you insert a dict with an unknown key 55 | table.insert(dict(name='Jane Doe', age=37, country='France', gender='female')) 56 | 57 | .. _dict: http://docs.python.org/2/library/stdtypes.html#dict 58 | 59 | Updating existing entries is easy, too:: 60 | 61 | table.update(dict(name='John Doe', age=47), ['name']) 62 | 63 | The list of filter columns given as the second argument filter using the 64 | values in the first column. If you don't want to update over a 65 | particular value, just use the auto-generated ``id`` column. 66 | 67 | Using Transactions 68 | ------------------ 69 | 70 | You can group a set of database updates in a transaction. In that case, all updates 71 | are committed at once or, in case of exception, all of them are reverted. Transactions 72 | are supported through a context manager, so they can be used through a ``with`` 73 | statement:: 74 | 75 | with dataset.connect() as tx: 76 | tx['user'].insert(dict(name='John Doe', age=46, country='China')) 77 | 78 | You can get same functionality by invoking the methods :py:meth:`begin() `, 79 | :py:meth:`commit() ` and :py:meth:`rollback() ` 80 | explicitly:: 81 | 82 | db = dataset.connect() 83 | db.begin() 84 | try: 85 | db['user'].insert(dict(name='John Doe', age=46, country='China')) 86 | db.commit() 87 | except: 88 | db.rollback() 89 | 90 | Nested transactions are supported too:: 91 | 92 | db = dataset.connect() 93 | with db as tx1: 94 | tx1['user'].insert(dict(name='John Doe', age=46, country='China')) 95 | with db as tx2: 96 | tx2['user'].insert(dict(name='Jane Doe', age=37, country='France', gender='female')) 97 | 98 | 99 | 100 | Inspecting databases and tables 101 | ------------------------------- 102 | 103 | When dealing with unknown databases we might want to check their structure 104 | first. To start exploring, let's find out what tables are stored in the 105 | database: 106 | 107 | >>> print(db.tables) 108 | [u'user'] 109 | 110 | Now, let's list all columns available in the table ``user``: 111 | 112 | >>> print(db['user'].columns) 113 | [u'id', u'country', u'age', u'name', u'gender'] 114 | 115 | Using ``len()`` we can get the total number of rows in a table: 116 | 117 | >>> print(len(db['user'])) 118 | 2 119 | 120 | Reading data from tables 121 | ------------------------ 122 | 123 | Now let's get some real data out of the table:: 124 | 125 | users = db['user'].all() 126 | 127 | If we simply want to iterate over all rows in a table, we can omit :py:meth:`all() `:: 128 | 129 | for user in db['user']: 130 | print(user['age']) 131 | 132 | We can search for specific entries using :py:meth:`find() ` and 133 | :py:meth:`find_one() `:: 134 | 135 | # All users from China 136 | chinese_users = table.find(country='China') 137 | 138 | # Get a specific user 139 | john = table.find_one(name='John Doe') 140 | 141 | # Find multiple at once 142 | winners = table.find(id=[1, 3, 7]) 143 | 144 | # Find by comparison operator 145 | elderly_users = table.find(age={'>=': 70}) 146 | possible_customers = table.find(age={'between': [21, 80]}) 147 | 148 | # Use the underlying SQLAlchemy directly 149 | elderly_users = table.find(table.table.columns.age >= 70) 150 | 151 | See :ref:`advanced_filters` for details on complex filters. 152 | 153 | Using :py:meth:`distinct() ` we can grab a set of rows 154 | with unique values in one or more columns:: 155 | 156 | # Get one user per country 157 | db['user'].distinct('country') 158 | 159 | Finally, you can use the ``row_type`` parameter to choose the data type in which 160 | results will be returned:: 161 | 162 | import dataset 163 | from stuf import stuf 164 | 165 | db = dataset.connect('sqlite:///mydatabase.db', row_type=stuf) 166 | 167 | Now contents will be returned in ``stuf`` objects (basically, ``dict`` 168 | objects whose elements can be accessed as attributes (``item.name``) as well as 169 | by index (``item['name']``). 170 | 171 | Running custom SQL queries 172 | -------------------------- 173 | 174 | Of course the main reason you're using a database is that you want to 175 | use the full power of SQL queries. Here's how you run them with ``dataset``:: 176 | 177 | result = db.query('SELECT country, COUNT(*) c FROM user GROUP BY country') 178 | for row in result: 179 | print(row['country'], row['c']) 180 | 181 | The :py:meth:`query() ` method can also be used to 182 | access the underlying `SQLAlchemy core API `_, which allows for the 183 | programmatic construction of more complex queries:: 184 | 185 | table = db['user'].table 186 | statement = table.select(table.c.name.like('%John%')) 187 | result = db.query(statement) 188 | 189 | Limitations of dataset 190 | ---------------------- 191 | 192 | The goal of ``dataset`` is to make basic database operations simpler, by expressing 193 | some relatively basic operations in a Pythonic way. The downside of this approach 194 | is that as your application grows more complex, you may begin to need access to 195 | more advanced operations and be forced to switch to using SQLAlchemy proper, 196 | without the dataset layer (instead, you may want to play with SQLAlchemy's ORM). 197 | 198 | When that moment comes, take the hit. SQLAlchemy is an amazing piece of Python 199 | code, and it will provide you with idiomatic access to all of SQL's functions. 200 | 201 | Some of the specific aspects of SQL that are not exposed in ``dataset``, and are 202 | considered out of scope for the project, include: 203 | 204 | * Foreign key relationships between tables, and expressing one-to-many and 205 | many-to-many relationships in idiomatic Python. 206 | * Python-wrapped ``JOIN`` queries. 207 | * Creating databases, or managing DBMS software. 208 | * Support for Python 2.x 209 | 210 | There's also some functionality that might be cool to support in the future, but 211 | that requires significant engineering: 212 | 213 | * Async operations 214 | * Database-native ``UPSERT`` semantics 215 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | furo -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | 4 | [flake8] 5 | ignore = E501,E123,E124,E126,E127,E128,E722,E741 6 | 7 | [bdist_wheel] 8 | universal = 1 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md") as f: 4 | long_description = f.read() 5 | 6 | setup( 7 | name="dataset", 8 | version="1.6.2", 9 | description="Toolkit for Python-based database access.", 10 | long_description=long_description, 11 | long_description_content_type="text/markdown", 12 | classifiers=[ 13 | "Development Status :: 3 - Alpha", 14 | "Intended Audience :: Developers", 15 | "License :: OSI Approved :: MIT License", 16 | "Operating System :: OS Independent", 17 | "Programming Language :: Python :: 3.5", 18 | "Programming Language :: Python :: 3.6", 19 | "Programming Language :: Python :: 3.7", 20 | "Programming Language :: Python :: 3.8", 21 | "Programming Language :: Python :: 3.9", 22 | ], 23 | keywords="sql sqlalchemy etl loading utility", 24 | author="Friedrich Lindenberg, Gregor Aisch, Stefan Wehrmeyer", 25 | author_email="friedrich.lindenberg@gmail.com", 26 | url="http://github.com/pudo/dataset", 27 | license="MIT", 28 | packages=find_packages(exclude=["ez_setup", "examples", "test"]), 29 | namespace_packages=[], 30 | include_package_data=False, 31 | zip_safe=False, 32 | install_requires=[ 33 | "sqlalchemy >= 1.3.2, < 2.0.0", 34 | "alembic >= 0.6.2", 35 | "banal >= 1.0.1", 36 | ], 37 | extras_require={ 38 | "dev": [ 39 | "pip", 40 | "pytest", 41 | "wheel", 42 | "flake8", 43 | "coverage", 44 | "psycopg2-binary", 45 | "PyMySQL", 46 | "cryptography", 47 | ] 48 | }, 49 | tests_require=["pytest"], 50 | test_suite="test", 51 | entry_points={}, 52 | ) 53 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pudo/dataset/b2ab09e58c6f17334e4286009b20887b6a8a8fac/test/__init__.py -------------------------------------------------------------------------------- /test/sample_data.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from datetime import datetime 5 | 6 | 7 | TEST_CITY_1 = "B€rkeley" 8 | TEST_CITY_2 = "G€lway" 9 | 10 | TEST_DATA = [ 11 | {"date": datetime(2011, 1, 1), "temperature": 1, "place": TEST_CITY_2}, 12 | {"date": datetime(2011, 1, 2), "temperature": -1, "place": TEST_CITY_2}, 13 | {"date": datetime(2011, 1, 3), "temperature": 0, "place": TEST_CITY_2}, 14 | {"date": datetime(2011, 1, 1), "temperature": 6, "place": TEST_CITY_1}, 15 | {"date": datetime(2011, 1, 2), "temperature": 8, "place": TEST_CITY_1}, 16 | {"date": datetime(2011, 1, 3), "temperature": 5, "place": TEST_CITY_1}, 17 | ] 18 | -------------------------------------------------------------------------------- /test/test_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from datetime import datetime 4 | from collections import OrderedDict 5 | from sqlalchemy import TEXT, BIGINT 6 | from sqlalchemy.exc import IntegrityError, SQLAlchemyError, ArgumentError 7 | 8 | from dataset import connect, chunked 9 | 10 | from .sample_data import TEST_DATA, TEST_CITY_1 11 | 12 | 13 | class DatabaseTestCase(unittest.TestCase): 14 | def setUp(self): 15 | self.db = connect() 16 | self.tbl = self.db["weather"] 17 | self.tbl.insert_many(TEST_DATA) 18 | 19 | def tearDown(self): 20 | for table in self.db.tables: 21 | self.db[table].drop() 22 | 23 | def test_valid_database_url(self): 24 | assert self.db.url, os.environ["DATABASE_URL"] 25 | 26 | def test_database_url_query_string(self): 27 | db = connect("sqlite:///:memory:/?cached_statements=1") 28 | assert "cached_statements" in db.url, db.url 29 | 30 | def test_tables(self): 31 | assert self.db.tables == ["weather"], self.db.tables 32 | 33 | def test_contains(self): 34 | assert "weather" in self.db, self.db.tables 35 | 36 | def test_create_table(self): 37 | table = self.db["foo"] 38 | assert self.db.has_table(table.table.name) 39 | assert len(table.table.columns) == 1, table.table.columns 40 | assert "id" in table.table.c, table.table.c 41 | 42 | def test_create_table_no_ids(self): 43 | if "mysql" in self.db.engine.dialect.dbapi.__name__: 44 | return 45 | if "sqlite" in self.db.engine.dialect.dbapi.__name__: 46 | return 47 | table = self.db.create_table("foo_no_id", primary_id=False) 48 | assert table.table.exists() 49 | assert len(table.table.columns) == 0, table.table.columns 50 | 51 | def test_create_table_custom_id1(self): 52 | pid = "string_id" 53 | table = self.db.create_table("foo2", pid, self.db.types.string(255)) 54 | assert self.db.has_table(table.table.name) 55 | assert len(table.table.columns) == 1, table.table.columns 56 | assert pid in table.table.c, table.table.c 57 | table.insert({pid: "foobar"}) 58 | assert table.find_one(string_id="foobar")[pid] == "foobar" 59 | 60 | def test_create_table_custom_id2(self): 61 | pid = "string_id" 62 | table = self.db.create_table("foo3", pid, self.db.types.string(50)) 63 | assert self.db.has_table(table.table.name) 64 | assert len(table.table.columns) == 1, table.table.columns 65 | assert pid in table.table.c, table.table.c 66 | 67 | table.insert({pid: "foobar"}) 68 | assert table.find_one(string_id="foobar")[pid] == "foobar" 69 | 70 | def test_create_table_custom_id3(self): 71 | pid = "int_id" 72 | table = self.db.create_table("foo4", primary_id=pid) 73 | assert self.db.has_table(table.table.name) 74 | assert len(table.table.columns) == 1, table.table.columns 75 | assert pid in table.table.c, table.table.c 76 | 77 | table.insert({pid: 123}) 78 | table.insert({pid: 124}) 79 | assert table.find_one(int_id=123)[pid] == 123 80 | assert table.find_one(int_id=124)[pid] == 124 81 | self.assertRaises(IntegrityError, lambda: table.insert({pid: 123})) 82 | 83 | def test_create_table_shorthand1(self): 84 | pid = "int_id" 85 | table = self.db.get_table("foo5", pid) 86 | assert table.table.exists 87 | assert len(table.table.columns) == 1, table.table.columns 88 | assert pid in table.table.c, table.table.c 89 | 90 | table.insert({"int_id": 123}) 91 | table.insert({"int_id": 124}) 92 | assert table.find_one(int_id=123)["int_id"] == 123 93 | assert table.find_one(int_id=124)["int_id"] == 124 94 | self.assertRaises(IntegrityError, lambda: table.insert({"int_id": 123})) 95 | 96 | def test_create_table_shorthand2(self): 97 | pid = "string_id" 98 | table = self.db.get_table( 99 | "foo6", primary_id=pid, primary_type=self.db.types.string(255) 100 | ) 101 | assert table.table.exists 102 | assert len(table.table.columns) == 1, table.table.columns 103 | assert pid in table.table.c, table.table.c 104 | 105 | table.insert({"string_id": "foobar"}) 106 | assert table.find_one(string_id="foobar")["string_id"] == "foobar" 107 | 108 | def test_with(self): 109 | init_length = len(self.db["weather"]) 110 | with self.assertRaises(ValueError): 111 | with self.db as tx: 112 | tx["weather"].insert( 113 | { 114 | "date": datetime(2011, 1, 1), 115 | "temperature": 1, 116 | "place": "tmp_place", 117 | } 118 | ) 119 | raise ValueError() 120 | assert len(self.db["weather"]) == init_length 121 | 122 | def test_invalid_values(self): 123 | if "mysql" in self.db.engine.dialect.dbapi.__name__: 124 | # WARNING: mysql seems to be doing some weird type casting 125 | # upon insert. The mysql-python driver is not affected but 126 | # it isn't compatible with Python 3 127 | # Conclusion: use postgresql. 128 | return 129 | with self.assertRaises(SQLAlchemyError): 130 | tbl = self.db["weather"] 131 | tbl.insert( 132 | {"date": True, "temperature": "wrong_value", "place": "tmp_place"} 133 | ) 134 | 135 | def test_load_table(self): 136 | tbl = self.db.load_table("weather") 137 | assert tbl.table.name == self.tbl.table.name 138 | 139 | def test_query(self): 140 | r = self.db.query("SELECT COUNT(*) AS num FROM weather").next() 141 | assert r["num"] == len(TEST_DATA), r 142 | 143 | def test_table_cache_updates(self): 144 | tbl1 = self.db.get_table("people") 145 | data = OrderedDict([("first_name", "John"), ("last_name", "Smith")]) 146 | tbl1.insert(data) 147 | data["id"] = 1 148 | tbl2 = self.db.get_table("people") 149 | assert dict(tbl2.all().next()) == dict(data), (tbl2.all().next(), data) 150 | 151 | 152 | class TableTestCase(unittest.TestCase): 153 | def setUp(self): 154 | self.db = connect() 155 | self.tbl = self.db["weather"] 156 | for row in TEST_DATA: 157 | self.tbl.insert(row) 158 | 159 | def tearDown(self): 160 | self.tbl.drop() 161 | 162 | def test_insert(self): 163 | assert len(self.tbl) == len(TEST_DATA), len(self.tbl) 164 | last_id = self.tbl.insert( 165 | {"date": datetime(2011, 1, 2), "temperature": -10, "place": "Berlin"} 166 | ) 167 | assert len(self.tbl) == len(TEST_DATA) + 1, len(self.tbl) 168 | assert self.tbl.find_one(id=last_id)["place"] == "Berlin" 169 | 170 | def test_insert_ignore(self): 171 | self.tbl.insert_ignore( 172 | {"date": datetime(2011, 1, 2), "temperature": -10, "place": "Berlin"}, 173 | ["place"], 174 | ) 175 | assert len(self.tbl) == len(TEST_DATA) + 1, len(self.tbl) 176 | self.tbl.insert_ignore( 177 | {"date": datetime(2011, 1, 2), "temperature": -10, "place": "Berlin"}, 178 | ["place"], 179 | ) 180 | assert len(self.tbl) == len(TEST_DATA) + 1, len(self.tbl) 181 | 182 | def test_insert_ignore_all_key(self): 183 | for i in range(0, 4): 184 | self.tbl.insert_ignore( 185 | {"date": datetime(2011, 1, 2), "temperature": -10, "place": "Berlin"}, 186 | ["date", "temperature", "place"], 187 | ) 188 | assert len(self.tbl) == len(TEST_DATA) + 1, len(self.tbl) 189 | 190 | def test_insert_json(self): 191 | last_id = self.tbl.insert( 192 | { 193 | "date": datetime(2011, 1, 2), 194 | "temperature": -10, 195 | "place": "Berlin", 196 | "info": { 197 | "currency": "EUR", 198 | "language": "German", 199 | "population": 3292365, 200 | }, 201 | } 202 | ) 203 | assert len(self.tbl) == len(TEST_DATA) + 1, len(self.tbl) 204 | assert self.tbl.find_one(id=last_id)["place"] == "Berlin" 205 | 206 | def test_upsert(self): 207 | self.tbl.upsert( 208 | {"date": datetime(2011, 1, 2), "temperature": -10, "place": "Berlin"}, 209 | ["place"], 210 | ) 211 | assert len(self.tbl) == len(TEST_DATA) + 1, len(self.tbl) 212 | self.tbl.upsert( 213 | {"date": datetime(2011, 1, 2), "temperature": -10, "place": "Berlin"}, 214 | ["place"], 215 | ) 216 | assert len(self.tbl) == len(TEST_DATA) + 1, len(self.tbl) 217 | 218 | def test_upsert_single_column(self): 219 | table = self.db["banana_single_col"] 220 | table.upsert({"color": "Yellow"}, ["color"]) 221 | assert len(table) == 1, len(table) 222 | table.upsert({"color": "Yellow"}, ["color"]) 223 | assert len(table) == 1, len(table) 224 | 225 | def test_upsert_all_key(self): 226 | assert len(self.tbl) == len(TEST_DATA), len(self.tbl) 227 | for i in range(0, 2): 228 | self.tbl.upsert( 229 | {"date": datetime(2011, 1, 2), "temperature": -10, "place": "Berlin"}, 230 | ["date", "temperature", "place"], 231 | ) 232 | assert len(self.tbl) == len(TEST_DATA) + 1, len(self.tbl) 233 | 234 | def test_upsert_id(self): 235 | table = self.db["banana_with_id"] 236 | data = dict(id=10, title="I am a banana!") 237 | table.upsert(data, ["id"]) 238 | assert len(table) == 1, len(table) 239 | 240 | def test_update_while_iter(self): 241 | for row in self.tbl: 242 | row["foo"] = "bar" 243 | self.tbl.update(row, ["place", "date"]) 244 | assert len(self.tbl) == len(TEST_DATA), len(self.tbl) 245 | 246 | def test_weird_column_names(self): 247 | with self.assertRaises(ValueError): 248 | self.tbl.insert( 249 | { 250 | "date": datetime(2011, 1, 2), 251 | "temperature": -10, 252 | "foo.bar": "Berlin", 253 | "qux.bar": "Huhu", 254 | } 255 | ) 256 | 257 | def test_cased_column_names(self): 258 | tbl = self.db["cased_column_names"] 259 | tbl.insert({"place": "Berlin"}) 260 | tbl.insert({"Place": "Berlin"}) 261 | tbl.insert({"PLACE ": "Berlin"}) 262 | assert len(tbl.columns) == 2, tbl.columns 263 | assert len(list(tbl.find(Place="Berlin"))) == 3 264 | assert len(list(tbl.find(place="Berlin"))) == 3 265 | assert len(list(tbl.find(PLACE="Berlin"))) == 3 266 | 267 | def test_invalid_column_names(self): 268 | tbl = self.db["weather"] 269 | with self.assertRaises(ValueError): 270 | tbl.insert({None: "banana"}) 271 | 272 | with self.assertRaises(ValueError): 273 | tbl.insert({"": "banana"}) 274 | 275 | with self.assertRaises(ValueError): 276 | tbl.insert({"-": "banana"}) 277 | 278 | def test_delete(self): 279 | self.tbl.insert( 280 | {"date": datetime(2011, 1, 2), "temperature": -10, "place": "Berlin"} 281 | ) 282 | original_count = len(self.tbl) 283 | assert len(self.tbl) == len(TEST_DATA) + 1, len(self.tbl) 284 | # Test bad use of API 285 | with self.assertRaises(ArgumentError): 286 | self.tbl.delete({"place": "Berlin"}) 287 | assert len(self.tbl) == original_count, len(self.tbl) 288 | 289 | assert self.tbl.delete(place="Berlin") is True, "should return 1" 290 | assert len(self.tbl) == len(TEST_DATA), len(self.tbl) 291 | assert self.tbl.delete() is True, "should return non zero" 292 | assert len(self.tbl) == 0, len(self.tbl) 293 | 294 | def test_repr(self): 295 | assert ( 296 | repr(self.tbl) == "" 297 | ), "the representation should be " 298 | 299 | def test_delete_nonexist_entry(self): 300 | assert ( 301 | self.tbl.delete(place="Berlin") is False 302 | ), "entry not exist, should fail to delete" 303 | 304 | def test_find_one(self): 305 | self.tbl.insert( 306 | {"date": datetime(2011, 1, 2), "temperature": -10, "place": "Berlin"} 307 | ) 308 | d = self.tbl.find_one(place="Berlin") 309 | assert d["temperature"] == -10, d 310 | d = self.tbl.find_one(place="Atlantis") 311 | assert d is None, d 312 | 313 | def test_count(self): 314 | assert len(self.tbl) == 6, len(self.tbl) 315 | length = self.tbl.count(place=TEST_CITY_1) 316 | assert length == 3, length 317 | 318 | def test_find(self): 319 | ds = list(self.tbl.find(place=TEST_CITY_1)) 320 | assert len(ds) == 3, ds 321 | ds = list(self.tbl.find(place=TEST_CITY_1, _limit=2)) 322 | assert len(ds) == 2, ds 323 | ds = list(self.tbl.find(place=TEST_CITY_1, _limit=2, _step=1)) 324 | assert len(ds) == 2, ds 325 | ds = list(self.tbl.find(place=TEST_CITY_1, _limit=1, _step=2)) 326 | assert len(ds) == 1, ds 327 | ds = list(self.tbl.find(_step=2)) 328 | assert len(ds) == len(TEST_DATA), ds 329 | ds = list(self.tbl.find(order_by=["temperature"])) 330 | assert ds[0]["temperature"] == -1, ds 331 | ds = list(self.tbl.find(order_by=["-temperature"])) 332 | assert ds[0]["temperature"] == 8, ds 333 | ds = list(self.tbl.find(self.tbl.table.columns.temperature > 4)) 334 | assert len(ds) == 3, ds 335 | 336 | def test_find_dsl(self): 337 | ds = list(self.tbl.find(place={"like": "%lw%"})) 338 | assert len(ds) == 3, ds 339 | ds = list(self.tbl.find(temperature={">": 5})) 340 | assert len(ds) == 2, ds 341 | ds = list(self.tbl.find(temperature={">=": 5})) 342 | assert len(ds) == 3, ds 343 | ds = list(self.tbl.find(temperature={"<": 0})) 344 | assert len(ds) == 1, ds 345 | ds = list(self.tbl.find(temperature={"<=": 0})) 346 | assert len(ds) == 2, ds 347 | ds = list(self.tbl.find(temperature={"!=": -1})) 348 | assert len(ds) == 5, ds 349 | ds = list(self.tbl.find(temperature={"between": [5, 8]})) 350 | assert len(ds) == 3, ds 351 | ds = list(self.tbl.find(place={"=": "G€lway"})) 352 | assert len(ds) == 3, ds 353 | ds = list(self.tbl.find(place={"ilike": "%LwAy"})) 354 | assert len(ds) == 3, ds 355 | 356 | def test_offset(self): 357 | ds = list(self.tbl.find(place=TEST_CITY_1, _offset=1)) 358 | assert len(ds) == 2, ds 359 | ds = list(self.tbl.find(place=TEST_CITY_1, _limit=2, _offset=2)) 360 | assert len(ds) == 1, ds 361 | 362 | def test_streamed(self): 363 | ds = list(self.tbl.find(place=TEST_CITY_1, _streamed=True, _step=1)) 364 | assert len(ds) == 3, len(ds) 365 | for row in self.tbl.find(place=TEST_CITY_1, _streamed=True, _step=1): 366 | row["temperature"] = -1 367 | self.tbl.update(row, ["id"]) 368 | 369 | def test_distinct(self): 370 | x = list(self.tbl.distinct("place")) 371 | assert len(x) == 2, x 372 | x = list(self.tbl.distinct("place", "date")) 373 | assert len(x) == 6, x 374 | x = list( 375 | self.tbl.distinct( 376 | "place", 377 | "date", 378 | self.tbl.table.columns.date >= datetime(2011, 1, 2, 0, 0), 379 | ) 380 | ) 381 | assert len(x) == 4, x 382 | 383 | x = list(self.tbl.distinct("temperature", place="B€rkeley")) 384 | assert len(x) == 3, x 385 | x = list(self.tbl.distinct("temperature", place=["B€rkeley", "G€lway"])) 386 | assert len(x) == 6, x 387 | x = list(self.tbl.distinct("temperature", _limit=3, place=["B€rkeley", "G€lway"])) 388 | assert len(x) == 3, x 389 | x = list(self.tbl.distinct("temperature", _limit=6, _offset=1, place=["B€rkeley", "G€lway"])) 390 | assert len(x) == 5, x 391 | 392 | def test_insert_many(self): 393 | data = TEST_DATA * 100 394 | self.tbl.insert_many(data, chunk_size=13) 395 | assert len(self.tbl) == len(data) + 6, (len(self.tbl), len(data)) 396 | 397 | def test_chunked_insert(self): 398 | data = TEST_DATA * 100 399 | with chunked.ChunkedInsert(self.tbl) as chunk_tbl: 400 | for item in data: 401 | chunk_tbl.insert(item) 402 | assert len(self.tbl) == len(data) + 6, (len(self.tbl), len(data)) 403 | 404 | def test_chunked_insert_callback(self): 405 | data = TEST_DATA * 100 406 | N = 0 407 | 408 | def callback(queue): 409 | nonlocal N 410 | N += len(queue) 411 | 412 | with chunked.ChunkedInsert(self.tbl, callback=callback) as chunk_tbl: 413 | for item in data: 414 | chunk_tbl.insert(item) 415 | assert len(data) == N 416 | assert len(self.tbl) == len(data) + 6 417 | 418 | def test_update_many(self): 419 | tbl = self.db["update_many_test"] 420 | tbl.insert_many([dict(temp=10), dict(temp=20), dict(temp=30)]) 421 | tbl.update_many([dict(id=1, temp=50), dict(id=3, temp=50)], "id") 422 | 423 | # Ensure data has been updated. 424 | assert tbl.find_one(id=1)["temp"] == tbl.find_one(id=3)["temp"] 425 | 426 | def test_chunked_update(self): 427 | tbl = self.db["update_many_test"] 428 | tbl.insert_many( 429 | [ 430 | dict(temp=10, location="asdf"), 431 | dict(temp=20, location="qwer"), 432 | dict(temp=30, location="asdf"), 433 | ] 434 | ) 435 | 436 | chunked_tbl = chunked.ChunkedUpdate(tbl, "id") 437 | chunked_tbl.update(dict(id=1, temp=50)) 438 | chunked_tbl.update(dict(id=2, location="asdf")) 439 | chunked_tbl.update(dict(id=3, temp=50)) 440 | chunked_tbl.flush() 441 | 442 | # Ensure data has been updated. 443 | assert tbl.find_one(id=1)["temp"] == tbl.find_one(id=3)["temp"] == 50 444 | assert ( 445 | tbl.find_one(id=2)["location"] == tbl.find_one(id=3)["location"] == "asdf" 446 | ) # noqa 447 | 448 | def test_upsert_many(self): 449 | # Also tests updating on records with different attributes 450 | tbl = self.db["upsert_many_test"] 451 | 452 | W = 100 453 | tbl.upsert_many([dict(age=10), dict(weight=W)], "id") 454 | assert tbl.find_one(id=1)["age"] == 10 455 | 456 | tbl.upsert_many([dict(id=1, age=70), dict(id=2, weight=W / 2)], "id") 457 | assert tbl.find_one(id=2)["weight"] == W / 2 458 | 459 | def test_drop_operations(self): 460 | assert self.tbl._table is not None, "table shouldn't be dropped yet" 461 | self.tbl.drop() 462 | assert self.tbl._table is None, "table should be dropped now" 463 | assert list(self.tbl.all()) == [], self.tbl.all() 464 | assert self.tbl.count() == 0, self.tbl.count() 465 | 466 | def test_table_drop(self): 467 | assert "weather" in self.db 468 | self.db["weather"].drop() 469 | assert "weather" not in self.db 470 | 471 | def test_table_drop_then_create(self): 472 | assert "weather" in self.db 473 | self.db["weather"].drop() 474 | assert "weather" not in self.db 475 | self.db["weather"].insert({"foo": "bar"}) 476 | 477 | def test_columns(self): 478 | cols = self.tbl.columns 479 | assert len(list(cols)) == 4, "column count mismatch" 480 | assert "date" in cols and "temperature" in cols and "place" in cols 481 | 482 | def test_drop_column(self): 483 | try: 484 | self.tbl.drop_column("date") 485 | assert "date" not in self.tbl.columns 486 | except RuntimeError: 487 | pass 488 | 489 | def test_iter(self): 490 | c = 0 491 | for row in self.tbl: 492 | c += 1 493 | assert c == len(self.tbl) 494 | 495 | def test_update(self): 496 | date = datetime(2011, 1, 2) 497 | res = self.tbl.update( 498 | {"date": date, "temperature": -10, "place": TEST_CITY_1}, ["place", "date"] 499 | ) 500 | assert res, "update should return True" 501 | m = self.tbl.find_one(place=TEST_CITY_1, date=date) 502 | assert m["temperature"] == -10, ( 503 | "new temp. should be -10 but is %d" % m["temperature"] 504 | ) 505 | 506 | def test_create_column(self): 507 | tbl = self.tbl 508 | flt = self.db.types.float 509 | tbl.create_column("foo", flt) 510 | assert "foo" in tbl.table.c, tbl.table.c 511 | assert isinstance(tbl.table.c["foo"].type, flt), tbl.table.c["foo"].type 512 | assert "foo" in tbl.columns, tbl.columns 513 | 514 | def test_ensure_column(self): 515 | tbl = self.tbl 516 | flt = self.db.types.float 517 | tbl.create_column_by_example("foo", 0.1) 518 | assert "foo" in tbl.table.c, tbl.table.c 519 | assert isinstance(tbl.table.c["foo"].type, flt), tbl.table.c["bar"].type 520 | tbl.create_column_by_example("bar", 1) 521 | assert "bar" in tbl.table.c, tbl.table.c 522 | assert isinstance(tbl.table.c["bar"].type, BIGINT), tbl.table.c["bar"].type 523 | tbl.create_column_by_example("pippo", "test") 524 | assert "pippo" in tbl.table.c, tbl.table.c 525 | assert isinstance(tbl.table.c["pippo"].type, TEXT), tbl.table.c["pippo"].type 526 | tbl.create_column_by_example("bigbar", 11111111111) 527 | assert "bigbar" in tbl.table.c, tbl.table.c 528 | assert isinstance(tbl.table.c["bigbar"].type, BIGINT), tbl.table.c[ 529 | "bigbar" 530 | ].type 531 | tbl.create_column_by_example("littlebar", -11111111111) 532 | assert "littlebar" in tbl.table.c, tbl.table.c 533 | assert isinstance(tbl.table.c["littlebar"].type, BIGINT), tbl.table.c[ 534 | "littlebar" 535 | ].type 536 | 537 | def test_key_order(self): 538 | res = self.db.query("SELECT temperature, place FROM weather LIMIT 1") 539 | keys = list(res.next().keys()) 540 | assert keys[0] == "temperature" 541 | assert keys[1] == "place" 542 | 543 | def test_empty_query(self): 544 | empty = list(self.tbl.find(place="not in data")) 545 | assert len(empty) == 0, empty 546 | 547 | 548 | class Constructor(dict): 549 | """Very simple low-functionality extension to ``dict`` to 550 | provide attribute access to dictionary contents""" 551 | 552 | def __getattr__(self, name): 553 | return self[name] 554 | 555 | 556 | class RowTypeTestCase(unittest.TestCase): 557 | def setUp(self): 558 | self.db = connect(row_type=Constructor) 559 | self.tbl = self.db["weather"] 560 | for row in TEST_DATA: 561 | self.tbl.insert(row) 562 | 563 | def tearDown(self): 564 | for table in self.db.tables: 565 | self.db[table].drop() 566 | 567 | def test_find_one(self): 568 | self.tbl.insert( 569 | {"date": datetime(2011, 1, 2), "temperature": -10, "place": "Berlin"} 570 | ) 571 | d = self.tbl.find_one(place="Berlin") 572 | assert d["temperature"] == -10, d 573 | assert d.temperature == -10, d 574 | d = self.tbl.find_one(place="Atlantis") 575 | assert d is None, d 576 | 577 | def test_find(self): 578 | ds = list(self.tbl.find(place=TEST_CITY_1)) 579 | assert len(ds) == 3, ds 580 | for item in ds: 581 | assert isinstance(item, Constructor), item 582 | ds = list(self.tbl.find(place=TEST_CITY_1, _limit=2)) 583 | assert len(ds) == 2, ds 584 | for item in ds: 585 | assert isinstance(item, Constructor), item 586 | 587 | def test_distinct(self): 588 | x = list(self.tbl.distinct("place")) 589 | assert len(x) == 2, x 590 | for item in x: 591 | assert isinstance(item, Constructor), item 592 | x = list(self.tbl.distinct("place", "date")) 593 | assert len(x) == 6, x 594 | for item in x: 595 | assert isinstance(item, Constructor), item 596 | 597 | def test_iter(self): 598 | c = 0 599 | for row in self.tbl: 600 | c += 1 601 | assert isinstance(row, Constructor), row 602 | assert c == len(self.tbl) 603 | 604 | 605 | if __name__ == "__main__": 606 | unittest.main() 607 | --------------------------------------------------------------------------------