├── LEAD.md ├── tests ├── __init__.py ├── test_mapper.py └── test_storage.py ├── examples └── __init__.py ├── tableschema_pandas ├── VERSION ├── __init__.py ├── storage.py └── mapper.py ├── setup.cfg ├── pytest.ini ├── data ├── comments.csv ├── sample.csv ├── vix.csv ├── articles.csv ├── comments.json └── articles.json ├── MANIFEST.in ├── pylama.ini ├── .github ├── pull_request_template.md ├── issue_template.md ├── workflows │   └── release.yml └── stale.yml ├── tox.ini ├── .gitignore ├── Makefile ├── .travis.yml ├── setup.py ├── README.md └── LICENSE.md /LEAD.md: -------------------------------------------------------------------------------- 1 | roll 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tableschema_pandas/VERSION: -------------------------------------------------------------------------------- 1 | 1.1.0 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests 3 | -------------------------------------------------------------------------------- /data/comments.csv: -------------------------------------------------------------------------------- 1 | entry_id,comment 2 | 1,good 3 | -------------------------------------------------------------------------------- /data/sample.csv: -------------------------------------------------------------------------------- 1 | Id;Col1;Col2;Col3 2 | 101;1.1;1.2;1.3 3 | 102;2.1;2.2;2.3 4 | -------------------------------------------------------------------------------- /data/vix.csv: -------------------------------------------------------------------------------- 1 | Date;VIXClose;VIXHigh;VIXLow;VIXOpen 2 | 2004-01-05T00:00:00Z;17.49;18.49;17.44;18.45 3 | 2004-01-06T00:00:00Z;16.73;17.67;16.19;17.66 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | global-include VERSION 2 | include LICENSE.md 3 | include Makefile 4 | include pylama.ini 5 | include pytest.ini 6 | include README.md 7 | include tox.ini 8 | -------------------------------------------------------------------------------- /pylama.ini: -------------------------------------------------------------------------------- 1 | [pylama] 2 | linters = pyflakes,mccabe,pep8 3 | 4 | [pylama:pep8] 5 | max_line_length = 100 6 | 7 | [pylama:mccabe] 8 | complexity = 32 9 | 10 | [pylama:*/__init__.py] 11 | ignore = W0611 12 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Please replace this line with full information about your pull request.
Make sure that tests pass before publishing it. 4 | 5 | --- 6 | 7 | Please preserve this line to notify @roll (lead of this repository) 8 | -------------------------------------------------------------------------------- /.github/issue_template.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Please replace this line with full information about your idea or problem. If it's a bug, share as much as possible to reproduce it. 4 | 5 | --- 6 | 7 | Please preserve this line to notify @roll (lead of this repository) 8 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*.*.*' 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v1 14 | - name: Release 15 | uses: softprops/action-gh-release@v1 16 | env: 17 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 18 | -------------------------------------------------------------------------------- /data/articles.csv: -------------------------------------------------------------------------------- 1 | id,parent,name,current,rating,created_year,created_date,created_time,created_datetime,stats,persons,location 2 | 1,,Taxes,True,9.5,2015,2015-01-01,03:00:00,2015-01-01T03:00:00Z,{"chars":560},["mike"],"{""type"": ""Point"",""coordinates"":[50.00,50.00]}" 3 | 2,1,中国人,False,7,2015,2015-12-31,15:45:33,2015-12-31T15:45:33Z,{"chars":970},["chen"],"{""type"": ""Point"",""coordinates"":[33.33,33.33]}" 4 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | package=tableschema_pandas 3 | skip_missing_interpreters=true 4 | envlist= 5 | py27 6 | py36 7 | py37 8 | py38 9 | 10 | [testenv] 11 | deps= 12 | mock 13 | pytest 14 | pytest-cov 15 | coverage 16 | passenv= 17 | CI 18 | TRAVIS 19 | TRAVIS_JOB_ID 20 | TRAVIS_BRANCH 21 | commands= 22 | py.test \ 23 | --cov {[tox]package} \ 24 | --cov-config tox.ini \ 25 | --cov-report term-missing \ 26 | {posargs} 27 | -------------------------------------------------------------------------------- /tableschema_pandas/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | 8 | # Module API 9 | 10 | from .storage import Storage 11 | 12 | 13 | # Version 14 | 15 | import io 16 | import os 17 | __version__ = io.open( 18 | os.path.join(os.path.dirname(__file__), 'VERSION'), 19 | encoding='utf-8').read().strip() 20 | -------------------------------------------------------------------------------- /data/comments.json: -------------------------------------------------------------------------------- 1 | { 2 | "primaryKey": "entry_id", 3 | "foreignKeys": [ 4 | { 5 | "fields": "entry_id", 6 | "reference": { 7 | "fields": "id", 8 | "resource": "", 9 | "table": "articles" 10 | } 11 | } 12 | ], 13 | "fields": [ 14 | { 15 | "name": "entry_id", 16 | "type": "integer", 17 | "constraints": { 18 | "required": true 19 | } 20 | }, 21 | { 22 | "name": "comment", 23 | "type": "string" 24 | } 25 | ] 26 | } 27 |
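The descriptor above can be fed straight into this package's `Storage`. A minimal sketch (run from the repository root; the single row mirrors `data/comments.csv`):

```python
import json
from tableschema_pandas import Storage

# Load the Table Schema descriptor shown above
with open('data/comments.json') as f:
    descriptor = json.load(f)

# Create an empty pandas-backed bucket and write one row into it;
# 'entry_id' is the primary key, so it becomes the dataframe index
storage = Storage()
storage.create('comments', descriptor)
storage.write('comments', [(1, 'good')])
print(storage['comments'])
```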
-------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 90 3 | 4 | # Number of days of inactivity before a stale issue is closed 5 | daysUntilClose: 30 6 | 7 | # Issues with these labels will never be considered stale 8 | exemptLabels: 9 | - feature 10 | - enhancement 11 | - bug 12 | 13 | # Label to use when marking an issue as stale 14 | staleLabel: wontfix 15 | 16 | # Comment to post when marking an issue as stale. Set to `false` to disable 17 | markComment: > 18 | This issue has been automatically marked as stale because it has not had 19 | recent activity. It will be closed if no further activity occurs. Thank you 20 | for your contributions. 21 | 22 | # Comment to post when closing a stale issue. Set to `false` to disable 23 | closeComment: false 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | # IPython Notebook 62 | .ipynb_checkpoints 63 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all install list readme release templates test version 2 | 3 | 4 | PACKAGE := $(shell grep '^PACKAGE =' setup.py | cut -d "'" -f2) 5 | VERSION := $(shell head -n 1 $(PACKAGE)/VERSION) 6 | LEAD := $(shell head -n 1 LEAD.md) 7 | 8 | 9 | all: list 10 | 11 | install: 12 | pip install --upgrade -e .[develop] 13 | 14 | list: 15 | @grep '^\.PHONY' Makefile | cut -d' ' -f2- | tr ' ' '\n' 16 | 17 | readme: 18 | pip install md-toc 19 | pip install referencer 20 | referencer $(PACKAGE) README.md --in-place 21 | md_toc -p README.md github --header-levels 3 22 | sed -i '/(#tableschema-pandas-py)/,+2d' README.md 23 | 24 | release: 25 | git checkout master && git pull origin && git fetch -p && git diff 26 | @echo "\nContinuing in 10 seconds. Press <CTRL+C> to abort\n" && sleep 10 27 | @git log --pretty=format:"%C(yellow)%h%Creset %s%Cgreen%d" --reverse -20 28 | @echo "\nReleasing v$(VERSION) in 10 seconds.
Press <CTRL+C> to abort\n" && sleep 10 29 | git commit -a -m 'v$(VERSION)' && git tag -a v$(VERSION) -m 'v$(VERSION)' 30 | git push --follow-tags 31 | 32 | templates: 33 | sed -i -E "s/@(\w*)/@$(LEAD)/" .github/issue_template.md 34 | sed -i -E "s/@(\w*)/@$(LEAD)/" .github/pull_request_template.md 35 | 36 | test: 37 | pylama $(PACKAGE) 38 | tox 39 | 40 | version: 41 | @echo $(VERSION) 42 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: 2 | xenial 3 | 4 | sudo: 5 | false 6 | 7 | language: 8 | python 9 | 10 | python: 11 | - 2.7 12 | - 3.6 13 | - 3.7 14 | - 3.8 15 | 16 | env: 17 | global: 18 | - TOXENV="py${TRAVIS_PYTHON_VERSION//./}" 19 | 20 | install: 21 | - make install 22 | - pip install coveralls 23 | 24 | script: 25 | - make test 26 | 27 | after_success: 28 | - coveralls 29 | 30 | jobs: 31 | include: 32 | - stage: release 33 | if: tag IS present 34 | python: 3.8 35 | deploy: 36 | provider: pypi 37 | user: roll 38 | distributions: sdist bdist_wheel 39 | skip_cleanup: true 40 | on: 41 | tags: true 42 | password: 43 | secure: jAtHR6cR8G2+92dVZcgIDvIbBXBoZEWMuw0m/xIo58M5U/PBqBzizcfaYOkK5dSYewBN0+dt4dURymzA/KKEPRdOPXoZA4kKxA9dh8BqK7wKO9koY6Gg6WvCIDTO+36PLqHgPd2CQjdp3pSKbYoKkUAeUrlfUAWYL4C+D8N/WAYMEjlHsBxqZDuSJqaiIWiwaPAZKcavw5Tlr9WExM2baWd1zdHHU23FwCWqT4k2QvVU96fMBc8/3j8rxqdQxTcZSG0GRlcqhx0/px0JNH4x8emCriX25Hc24TNohLrZflBOkrJvlHZ0U9/+IUZZehUTeN86JslfkQQguLFAvQ/2htMf1Bv7LwIIdJRjTlR3x+ODZM3H0juA3paKztjp1GePuu4hGJf9KGI2caolryicQl1ficU/6KfLrlg3aVaXYg9um+9GqhhMbUuRtNjzhZLYj2vZfI5BSkb1FOpvP1ApEvKBWW/oQi+Sh7YEBMf4jTf0bVYqRZnvoohLG3GID9rR51Yh3rehZPMgU2CXnpCwRa1yprimq7qZGetleryTkRcF54s2+3kFnpL4Y6hXcXuD8UrWZ2+ZOakC3C2FS2o7rQc6kyumcHyM9wNtTdSORAnArkylqEEjTE87vkfn5OgHSlXXZ9MLClMWmh80JKe8VRNq4q5L9EfoOM+Ej2ue+7Y= 44 | -------------------------------------------------------------------------------- /data/articles.json: -------------------------------------------------------------------------------- 1 | { 2 | "primaryKey": "id", 3 | "foreignKeys": [ 4 | { 5 | "fields": "parent", 6 | "reference": { 7 | "fields": "id", 8 | "resource": "self" 9 | } 10 | } 11 | ], 12 | "fields": [ 13 | { 14 | "name": "id", 15 | "type": "integer", 16 | "constraints": { 17 | "required": true 18 | } 19 | }, 20 | { 21 | "name": "parent", 22 | "type": "integer" 23 | }, 24 | { 25 | "name": "name", 26 | "type": "string" 27 | }, 28 | { 29 | "name": "current", 30 | "type": "boolean" 31 | }, 32 | { 33 | "name": "rating", 34 | "type": "number" 35 | }, 36 | { 37 | "name": "created_year", 38 | "type": "date", 39 | "format": "fmt:%Y" 40 | }, 41 | { 42 | "name": "created_date", 43 | "type": "date" 44 | }, 45 | { 46 | "name": "created_time", 47 | "type": "time" 48 | }, 49 | { 50 | "name": "created_datetime", 51 | "type": "datetime" 52 | }, 53 | { 54 | "name": "stats", 55 | "type": "object" 56 | }, 57 | { 58 | "name": "persons", 59 | "type": "array" 60 | }, 61 | { 62 | "name": "location", 63 | "type": "geojson" 64 | } 65 | 66 | ] 67 | } 68 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | 6 | import io 7 | import os.path 8 | from setuptools import setup, find_packages 9 | 10 | 11 | # Helpers 12 | def read(*segments): 13 | path = os.path.join(*segments)
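# io.open with an explicit encoding keeps this helper behaving identically on Python 2 and 3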
14 | with io.open(path, encoding='utf-8') as f: 15 | return f.read().strip() 16 | 17 | 18 | # Prepare 19 | PACKAGE = 'tableschema_pandas' 20 | NAME = PACKAGE.replace('_', '-') 21 | INSTALL_REQUIRES = [ 22 | 'six>=1.9', 23 | 'pandas>=0.18', 24 | 'tabulator>=1.0', 25 | 'tableschema>=1.1', 26 | 'isodate>=0.6', 27 | ] 28 | TESTS_REQUIRE = [ 29 | 'mock', 30 | 'pylama', 31 | 'pytest', 32 | 'pytest-cov', 33 | 'tox', 34 | ] 35 | README = read('README.md') 36 | VERSION = read(PACKAGE, 'VERSION') 37 | PACKAGES = find_packages(exclude=['tests']) 38 | 39 | 40 | # Run 41 | setup( 42 | name=NAME, 43 | version=VERSION, 44 | packages=PACKAGES, 45 | include_package_data=True, 46 | install_requires=INSTALL_REQUIRES, 47 | tests_require=TESTS_REQUIRE, 48 | extras_require={'develop': TESTS_REQUIRE}, 49 | zip_safe=False, 50 | long_description=README, 51 | long_description_content_type='text/markdown', 52 | description='Generate Pandas data frames, load and extract data, based on JSON Table Schema descriptors.', 53 | author='Open Knowledge Foundation', 54 | author_email='info@okfn.org', 55 | url='https://github.com/frictionlessdata/tableschema-pandas-py', 56 | license='LGPLv3+', 57 | keywords=['frictionless data', 'datapackage', 'pandas'], 58 | classifiers=[ 59 | 'Development Status :: 4 - Beta', 60 | 'Intended Audience :: Developers', 61 | 'Intended Audience :: Science/Research', 62 | 'License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)', 63 | 'Operating System :: OS Independent', 64 | 'Programming Language :: Python :: 2', 65 | 'Programming Language :: Python :: 2.7', 66 | 'Programming Language :: Python :: 3', 67 | 'Programming Language :: Python :: 3.4', 68 | 'Programming Language :: Python :: 3.5', 69 | 'Topic :: Scientific/Engineering :: Information Analysis', 70 | 'Topic :: Software Development :: Libraries :: Python Modules' 71 | ], 72 | ) 73 | -------------------------------------------------------------------------------- /tests/test_mapper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import pytest 9 | import datetime 10 | import tableschema 11 | import numpy as np 12 | import pandas as pd 13 | from tableschema_pandas.mapper import Mapper 14 | 15 | 16 | # Tests 17 | 18 | def test_mapper_convert_descriptor_and_rows(): 19 | mapper = Mapper() 20 | df = pd.read_csv('data/sample.csv', sep=';', index_col=['Id']) 21 | descriptor = mapper.restore_descriptor(df) 22 | rows = df.reset_index().values 23 | df_new = mapper.convert_descriptor_and_rows(descriptor, rows) 24 | assert isinstance(df_new.index, pd.Index) 25 | 26 | 27 | @pytest.mark.skip 28 | def test_mapper_convert_descriptor_and_rows_with_datetime_index(): 29 | mapper = Mapper() 30 | df = pd.read_csv('data/vix.csv', sep=';', parse_dates=['Date'], index_col=['Date']) 31 | descriptor = mapper.restore_descriptor(df) 32 | rows = df.reset_index().values 33 | df_new = mapper.convert_descriptor_and_rows(descriptor, rows) 34 | assert isinstance(df_new.index, pd.DatetimeIndex) 35 | 36 | 37 | def test_mapper_convert_type(): 38 | mapper = Mapper() 39 | assert mapper.convert_type('string') == np.dtype('O') 40 | assert mapper.convert_type('year') == np.dtype(int) 41 | assert mapper.convert_type('yearmonth') == np.dtype(list) 42 | assert mapper.convert_type('duration') == np.dtype('O')
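# Unknown Table Schema types must raise StorageError instead of silently mapping to object dtype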
43 | with pytest.raises(tableschema.exceptions.StorageError): 44 | mapper.convert_type('non-existent') 45 | 46 | 47 | def test_mapper_restore_descriptor(): 48 | mapper = Mapper() 49 | df = pd.read_csv('data/sample.csv', sep=';', index_col=['Id']) 50 | descriptor = mapper.restore_descriptor(df) 51 | assert descriptor == { 52 | 'fields': [ 53 | {'name': 'Id', 'type': 'integer', 'constraints': {'required': True}}, 54 | {'name': 'Col1', 'type': 'number'}, 55 | {'name': 'Col2', 'type': 'number'}, 56 | {'name': 'Col3', 'type': 'number'}, 57 | ], 58 | 'primaryKey': 'Id', 59 | } 60 | 61 | 62 | def test_mapper_restore_type(): 63 | mapper = Mapper() 64 | df = pd.DataFrame([{ 65 | 'string': 'foo', 66 | 'number': 3.14, 67 | 'integer': 42, 68 | 'boolean': True, 69 | 'datetime': datetime.datetime.now(), 70 | }]) 71 | assert mapper.restore_type(df.dtypes['string']) == 'string' 72 | assert mapper.restore_type(df.dtypes['number']) == 'number' 73 | assert mapper.restore_type(df.dtypes['integer']) == 'integer' 74 | assert mapper.restore_type(df.dtypes['boolean']) == 'boolean' 75 | assert mapper.restore_type(df.dtypes['datetime']) == 'datetime' 76 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tableschema-pandas-py 2 | 3 | [![Travis](https://img.shields.io/travis/frictionlessdata/tableschema-pandas-py/master.svg)](https://travis-ci.org/frictionlessdata/tableschema-pandas-py) 4 | [![Coveralls](http://img.shields.io/coveralls/frictionlessdata/tableschema-pandas-py.svg?branch=master)](https://coveralls.io/r/frictionlessdata/tableschema-pandas-py?branch=master) 5 | [![PyPi](https://img.shields.io/pypi/v/tableschema-pandas.svg)](https://pypi.python.org/pypi/tableschema-pandas) 6 | [![Github](https://img.shields.io/badge/github-master-brightgreen)](https://github.com/frictionlessdata/tableschema-pandas-py) 7 | [![Gitter](https://img.shields.io/gitter/room/frictionlessdata/chat.svg)](https://gitter.im/frictionlessdata/chat) 8 | 9 | Generate and load Pandas data frames based on [Table Schema](http://specs.frictionlessdata.io/table-schema/) descriptors. 10 | 11 | ## Features 12 | 13 | - implements `tableschema.Storage` interface 14 | 15 | ## Contents 16 | 17 | 18 | 19 | - [Getting Started](#getting-started) 20 | - [Installation](#installation) 21 | - [Documentation](#documentation) 22 | - [API Reference](#api-reference) 23 | - [`Storage`](#storage) 24 | - [Contributing](#contributing) 25 | - [Changelog](#changelog) 26 | 27 | 28 | 29 | ## Getting Started 30 | 31 | ### Installation 32 | 33 | The package uses semantic versioning, which means that major versions could include breaking changes. It's highly recommended to specify the package version range in your `setup/requirements` file, e.g. `tableschema-pandas>=1.0,<2.0`.
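For example, a downstream `setup.py` could pin the range like this (a sketch with a hypothetical project name):

```python
# setup.py of a hypothetical downstream project
from setuptools import setup

setup(
    name='my-project',
    install_requires=[
        # stay within the 1.x line to avoid breaking changes
        'tableschema-pandas>=1.0,<2.0',
    ],
)
```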
34 | 35 | ``` 36 | $ pip install tableschema-pandas 37 | ``` 38 | 39 | ## Documentation 40 | 41 | ```python 42 | # pip install datapackage tableschema-pandas 43 | from datapackage import Package 44 | 45 | # Save to Pandas 46 | 47 | package = Package('http://data.okfn.org/data/core/country-list/datapackage.json') 48 | storage = package.save(storage='pandas') 49 | 50 | print(type(storage['data'])) 51 | # <class 'pandas.core.frame.DataFrame'> 52 | 53 | print(storage['data'].head()) 54 | # Name Code 55 | # 0 Afghanistan AF 56 | # 1 Åland Islands AX 57 | # 2 Albania AL 58 | # 3 Algeria DZ 59 | # 4 American Samoa AS 60 | 61 | # Load from Pandas 62 | 63 | package = Package(storage=storage) 64 | print(package.descriptor) 65 | print(package.resources[0].read()) 66 | ``` 67 | 68 | Storage works as a container for Pandas data frames. You can define a new data frame inside the storage using the `storage.create` method: 69 | 70 | ```python 71 | >>> from tableschema_pandas import Storage 72 | 73 | >>> storage = Storage() 74 | ``` 75 | 76 | ```python 77 | >>> storage.create('data', { 78 | ... 'primaryKey': 'id', 79 | ... 'fields': [ 80 | ... {'name': 'id', 'type': 'integer'}, 81 | ... {'name': 'comment', 'type': 'string'}, 82 | ... ] 83 | ... }) 84 | 85 | >>> storage.buckets 86 | ['data'] 87 | 88 | >>> storage['data'].shape 89 | (0, 0) 90 | ``` 91 | 92 | Use `storage.write` to populate the data frame with data: 93 | 94 | ```python 95 | >>> storage.write('data', [(1, 'a'), (2, 'b')]) 96 | 97 | >>> storage['data'] 98 | id comment 99 | 1 a 100 | 2 b 101 | ``` 102 | 103 | You can also use [tabulator](https://github.com/frictionlessdata/tabulator-py) to populate the data frame from an external data file. As you can see, subsequent writes simply append new data on top of the existing rows: 104 | 105 | ```python 106 | >>> import tabulator 107 | 108 | >>> with tabulator.Stream('data/comments.csv', headers=1) as stream: 109 | ... storage.write('data', stream) 110 | 111 | >>> storage['data'] 112 | id comment 113 | 1 a 114 | 2 b 115 | 1 good 116 | ``` 117 | 118 | ## API Reference 119 | 120 | ### `Storage` 121 | ```python 122 | Storage(self, dataframes=None) 123 | ``` 124 | Pandas storage 125 | 126 | Package implements 127 | [Tabular Storage](https://github.com/frictionlessdata/tableschema-py#storage) 128 | interface (see full documentation on the link): 129 | 130 | ![Storage](https://i.imgur.com/RQgrxqp.png) 131 | 132 | > Only additional API is documented 133 | 134 | __Arguments__ 135 | - __dataframes (object[])__: list of storage dataframes 136 | 137 | 138 | ## Contributing 139 | 140 | > The project follows the [Open Knowledge International coding standards](https://github.com/okfn/coding-standards). 141 | 142 | The recommended way to get started is to create and activate a project virtual environment. 143 | To install the package and development dependencies into the active environment: 144 | 145 | ```bash 146 | $ make install 147 | ``` 148 | 149 | To run tests with linting and coverage: 150 | 151 | ```bash 152 | $ make test 153 | ``` 154 | 155 | ## Changelog 156 | 157 | Only breaking and the most important changes are described here. The full changelog and documentation for all released versions can be found in the nicely formatted [commit history](https://github.com/frictionlessdata/tableschema-pandas-py/commits/master).
158 | 159 | #### v1.1 160 | 161 | - Added support for composite primary keys (loading to pandas) 162 | 163 | #### v1.0 164 | 165 | - Initial driver implementation 166 | -------------------------------------------------------------------------------- /tableschema_pandas/storage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import collections 9 | import tableschema 10 | import pandas as pd 11 | from .mapper import Mapper 12 | 13 | 14 | # Module API 15 | 16 | class Storage(tableschema.Storage): 17 | """Pandas storage 18 | 19 | Package implements 20 | [Tabular Storage](https://github.com/frictionlessdata/tableschema-py#storage) 21 | interface (see full documentation on the link): 22 | 23 | ![Storage](https://i.imgur.com/RQgrxqp.png) 24 | 25 | > Only additional API is documented 26 | 27 | # Arguments 28 | dataframes (object[]): list of storage dataframes 29 | 30 | """ 31 | 32 | # Public 33 | 34 | def __init__(self, dataframes=None): 35 | 36 | # Set attributes 37 | self.__dataframes = dataframes or collections.OrderedDict() 38 | self.__descriptors = {} 39 | 40 | # Create mapper 41 | self.__mapper = Mapper() 42 | 43 | def __repr__(self): 44 | return 'Storage' 45 | 46 | def __getitem__(self, key): 47 | """Returns Pandas dataframe 48 | 49 | # Arguments 50 | key (str): bucket name 51 | 52 | """ 53 | return self.__dataframes[key] 54 | 55 | @property 56 | def buckets(self): 57 | return list(sorted(self.__dataframes.keys())) 58 | 59 | def create(self, bucket, descriptor, force=False): 60 | 61 | # Make lists 62 | buckets = bucket 63 | if isinstance(bucket, six.string_types): 64 | buckets = [bucket] 65 | descriptors = descriptor 66 | if isinstance(descriptor, dict): 67 | descriptors = [descriptor] 68 | 69 | # Check buckets for existence 70 | for bucket in buckets: 71 | if bucket in self.buckets: 72 | if not force: 73 | message = 'Bucket "%s" already exists' % bucket 74 | raise tableschema.exceptions.StorageError(message) 75 | self.delete(bucket) 76 | 77 | # Define dataframes 78 | for bucket, descriptor in zip(buckets, descriptors): 79 | tableschema.validate(descriptor) 80 | self.__descriptors[bucket] = descriptor 81 | self.__dataframes[bucket] = pd.DataFrame() 82 | 83 | def delete(self, bucket=None, ignore=False): 84 | 85 | # Make lists 86 | buckets = bucket 87 | if isinstance(bucket, six.string_types): 88 | buckets = [bucket] 89 | elif bucket is None: 90 | buckets = reversed(self.buckets) 91 | 92 | # Iterate over buckets 93 | for bucket in buckets: 94 | 95 | # Non-existent bucket 96 | if bucket not in self.buckets: 97 | if not ignore: 98 | message = 'Bucket "%s" doesn\'t exist' % bucket 99 | raise tableschema.exceptions.StorageError(message) 100 | return 101 | 102 | # Remove from descriptors 103 | if bucket in self.__descriptors: 104 | del self.__descriptors[bucket] 105 | 106 | # Remove from dataframes 107 | if bucket in self.__dataframes: 108 | del self.__dataframes[bucket] 109 | 110 | def describe(self, bucket, descriptor=None): 111 | 112 | # Set descriptor 113 | if descriptor is not None: 114 | self.__descriptors[bucket] = descriptor 115 | 116 | # Get descriptor 117 | else: 118 | descriptor = self.__descriptors.get(bucket) 119 | if descriptor is None: 120 | dataframe = self.__dataframes[bucket] 121 | descriptor = self.__mapper.restore_descriptor(dataframe)
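# The inferred descriptor is a best-effort reverse mapping from numpy dtypes, so some type information may be downgraded (see Mapper.restore_descriptor)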
122 | 123 | return descriptor 124 | 125 | def iter(self, bucket): 126 | 127 | # Check existence 128 | if bucket not in self.buckets: 129 | message = 'Bucket "%s" doesn\'t exist.' % bucket 130 | raise tableschema.exceptions.StorageError(message) 131 | 132 | # Prepare 133 | descriptor = self.describe(bucket) 134 | schema = tableschema.Schema(descriptor) 135 | 136 | # Yield rows 137 | for pk, row in self.__dataframes[bucket].iterrows(): 138 | row = self.__mapper.restore_row(row, schema=schema, pk=pk) 139 | yield row 140 | 141 | def read(self, bucket): 142 | rows = list(self.iter(bucket)) 143 | return rows 144 | 145 | def write(self, bucket, rows): 146 | 147 | # Prepare 148 | descriptor = self.describe(bucket) 149 | new_data_frame = self.__mapper.convert_descriptor_and_rows(descriptor, rows) 150 | 151 | # Just set new DataFrame if current is empty 152 | if self.__dataframes[bucket].size == 0: 153 | self.__dataframes[bucket] = new_data_frame 154 | 155 | # Otherwise append the new data frame to the existing one, 156 | # keeping the data from both the old and new data frames 157 | else: 158 | self.__dataframes[bucket] = pd.concat([ 159 | self.__dataframes[bucket], 160 | new_data_frame, 161 | ]) 162 | -------------------------------------------------------------------------------- /tableschema_pandas/mapper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import six 8 | import json 9 | import isodate 10 | import datetime 11 | import tableschema 12 | import numpy as np 13 | import pandas as pd 14 | 15 | # Starting from pandas@0.24 there is the new API 16 | # https://github.com/frictionlessdata/tableschema-pandas-py/issues/29 17 | try: 18 | import pandas.core.dtypes.api as pdc 19 | except ImportError: 20 | import pandas.core.common as pdc 21 | 22 | 23 | # Module API 24 | 25 | class Mapper(object): 26 | 27 | # Public 28 | 29 | def convert_descriptor_and_rows(self, descriptor, rows): 30 | """Convert descriptor and rows to Pandas 31 | """ 32 | schema = tableschema.Schema(descriptor) 33 | 34 | # Get data/index 35 | data_rows = [] 36 | index_rows = [] 37 | jtstypes_map = {} 38 | for row in rows: 39 | data_values = [] 40 | index_values = [] 41 | for field, value in zip(schema.fields, row): 42 | try: 43 | if isinstance(value, float) and np.isnan(value): 44 | value = None 45 | if value and field.type == 'integer': 46 | value = int(value) 47 | value = field.cast_value(value) 48 | except tableschema.exceptions.CastError: 49 | value = json.loads(value) 50 | # http://pandas.pydata.org/pandas-docs/stable/gotchas.html#support-for-integer-na 51 | if value is None and field.type in ('number', 'integer'): 52 | jtstypes_map[field.name] = 'number' 53 | value = np.NaN 54 | if field.name in schema.primary_key: 55 | index_values.append(value) 56 | else: 57 | data_values.append(value) 58 | if len(schema.primary_key) == 1: 59 | index_rows.append(index_values[0]) 60 | elif len(schema.primary_key) > 1: 61 | index_rows.append(tuple(index_values)) 62 | data_rows.append(tuple(data_values)) 63 | 64 | # Create index 65 | index = None 66 | if schema.primary_key: 67 | if len(schema.primary_key) == 1: 68 | index_class = pd.Index 69 | index_field = schema.get_field(schema.primary_key[0]) 70 | index_dtype = self.convert_type(index_field.type) 71 | if index_field.type in ['datetime', 'date']: 72 | index_class = pd.DatetimeIndex
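# a temporal primary key becomes a DatetimeIndex so the frame supports date-based selection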
73 | index = index_class(index_rows, name=index_field.name, dtype=index_dtype) 74 | elif len(schema.primary_key) > 1: 75 | index = pd.MultiIndex.from_tuples(index_rows, names=schema.primary_key) 76 | 77 | # Create dtypes/columns 78 | dtypes = [] 79 | columns = [] 80 | for field in schema.fields: 81 | if field.name not in schema.primary_key: 82 | field_name = field.name 83 | if six.PY2: 84 | field_name = field.name.encode('utf-8') 85 | dtype = self.convert_type(jtstypes_map.get(field.name, field.type)) 86 | dtypes.append((field_name, dtype)) 87 | columns.append(field.name) 88 | 89 | # Create dataframe 90 | array = np.array(data_rows, dtype=dtypes) 91 | dataframe = pd.DataFrame(array, index=index, columns=columns) 92 | 93 | return dataframe 94 | 95 | def convert_type(self, type): 96 | """Convert type to Pandas 97 | """ 98 | 99 | # Mapping 100 | mapping = { 101 | 'any': np.dtype('O'), 102 | 'array': np.dtype(list), 103 | 'boolean': np.dtype(bool), 104 | 'date': np.dtype('O'), 105 | 'datetime': np.dtype('datetime64[ns]'), 106 | 'duration': np.dtype('O'), 107 | 'geojson': np.dtype('O'), 108 | 'geopoint': np.dtype('O'), 109 | 'integer': np.dtype(int), 110 | 'number': np.dtype(float), 111 | 'object': np.dtype(dict), 112 | 'string': np.dtype('O'), 113 | 'time': np.dtype('O'), 114 | 'year': np.dtype(int), 115 | 'yearmonth': np.dtype('O'), 116 | } 117 | 118 | # Get type 119 | if type not in mapping: 120 | message = 'Type "%s" is not supported' % type 121 | raise tableschema.exceptions.StorageError(message) 122 | 123 | return mapping[type] 124 | 125 | def restore_descriptor(self, dataframe): 126 | """Restore descriptor from Pandas 127 | """ 128 | 129 | # Prepare 130 | fields = [] 131 | primary_key = None 132 | 133 | # Primary key 134 | if dataframe.index.name: 135 | field_type = self.restore_type(dataframe.index.dtype) 136 | field = { 137 | 'name': dataframe.index.name, 138 | 'type': field_type, 139 | 'constraints': {'required': True}, 140 | } 141 | fields.append(field) 142 | primary_key = dataframe.index.name 143 | 144 | # Fields 145 | for column, dtype in dataframe.dtypes.iteritems(): 146 | sample = dataframe[column].iloc[0] if len(dataframe) else None 147 | field_type = self.restore_type(dtype, sample=sample) 148 | field = {'name': column, 'type': field_type} 149 | # TODO: provide better required indication 150 | # if dataframe[column].isnull().sum() == 0: 151 | # field['constraints'] = {'required': True} 152 | fields.append(field) 153 | 154 | # Descriptor 155 | descriptor = {} 156 | descriptor['fields'] = fields 157 | if primary_key: 158 | descriptor['primaryKey'] = primary_key 159 | 160 | return descriptor 161 | 162 | def restore_row(self, row, schema, pk): 163 | """Restore row from Pandas 164 | """ 165 | result = [] 166 | for field in schema.fields: 167 | if schema.primary_key and schema.primary_key[0] == field.name: 168 | if field.type == 'number' and np.isnan(pk): 169 | pk = None 170 | if pk and field.type == 'integer': 171 | pk = int(pk) 172 | result.append(field.cast_value(pk)) 173 | else: 174 | value = row[field.name] 175 | if field.type == 'number' and np.isnan(value): 176 | value = None 177 | if value and field.type == 'integer': 178 | value = int(value) 179 | elif field.type == 'datetime': 180 | value = value.to_pydatetime() 181 | result.append(field.cast_value(value)) 182 | return result 183 | 184 | def restore_type(self, dtype, sample=None): 185 | """Restore type from Pandas 186 | """ 187 | 188 | # Pandas types 189 | if pdc.is_bool_dtype(dtype): 190 | return 'boolean'
191 | elif pdc.is_datetime64_any_dtype(dtype): 192 | return 'datetime' 193 | elif pdc.is_integer_dtype(dtype): 194 | return 'integer' 195 | elif pdc.is_numeric_dtype(dtype): 196 | return 'number' 197 | 198 | # Python types 199 | if sample is not None: 200 | if isinstance(sample, (list, tuple)): 201 | return 'array' 202 | elif isinstance(sample, datetime.date): 203 | return 'date' 204 | elif isinstance(sample, isodate.Duration): 205 | return 'duration' 206 | elif isinstance(sample, dict): 207 | return 'object' 208 | elif isinstance(sample, six.string_types): 209 | return 'string' 210 | elif isinstance(sample, datetime.time): 211 | return 'time' 212 | 213 | return 'string' 214 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (c) 2016 Mantas Zimnickas and Open Knowledge 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions.
48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version.
115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library.
166 | -------------------------------------------------------------------------------- /tests/test_storage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import six 9 | import json 10 | import pytest 11 | import datetime 12 | import tableschema 13 | import pandas as pd 14 | from copy import deepcopy 15 | from decimal import Decimal 16 | from tabulator import Stream 17 | from collections import OrderedDict 18 | from tableschema_pandas import Storage 19 | 20 | 21 | # Resources 22 | 23 | ARTICLES = { 24 | 'schema': { 25 | 'fields': [ 26 | {'name': 'id', 'type': 'integer', 'constraints': {'required': True}}, 27 | {'name': 'parent', 'type': 'integer'}, 28 | {'name': 'name', 'type': 'string'}, 29 | {'name': 'current', 'type': 'boolean'}, 30 | {'name': 'rating', 'type': 'number'}, 31 | ], 32 | 'primaryKey': 'id', 33 | # 'foreignKeys': [ 34 | # {'fields': 'parent', 'reference': {'resource': '', 'fields': 'id'}}, 35 | # ], 36 | }, 37 | 'data': [ 38 | ['1', '', 'Taxes', 'True', '9.5'], 39 | ['2', '1', '中国人', 'False', '7'], 40 | ], 41 | } 42 | COMMENTS = { 43 | 'schema': { 44 | 'fields': [ 45 | {'name': 'entry_id', 'type': 'integer', 'constraints': {'required': True}}, 46 | {'name': 'comment', 'type': 'string'}, 47 | {'name': 'note', 'type': 'any'}, 48 | ], 49 | 'primaryKey': 'entry_id', 50 | # 'foreignKeys': [ 51 | # {'fields': 'entry_id', 'reference': {'resource': 'articles', 'fields': 'id'}}, 52 | # ], 53 | }, 54 | 'data': [ 55 | ['1', 'good', 'note1'], 56 | ['2', 'bad', 'note2'], 57 | ], 58 | } 59 | TEMPORAL = { 60 | 'schema': { 61 | 'fields': [ 62 | {'name': 'date', 'type': 'date'}, 63 | {'name': 'date_year', 'type': 'date', 'format': '%Y'}, 64 | {'name': 'datetime', 'type': 'datetime'}, 65 | {'name': 'duration', 'type': 'duration'}, 66 | {'name': 'time', 'type': 'time'}, 67 | {'name': 'year', 'type': 'year'}, 68 | {'name': 'yearmonth', 'type': 'yearmonth'}, 69 | ], 70 | }, 71 | 'data': [ 72 | ['2015-01-01', '2015', '2015-01-01T03:00:00Z', 'P1Y1M', '03:00:00', '2015', '2015-01'], 73 | ['2015-12-31', '2015', '2015-12-31T15:45:33Z', 'P2Y2M', '15:45:33', '2015', '2015-01'], 74 | ], 75 | } 76 | LOCATION = { 77 | 'schema': { 78 | 'fields': [ 79 | {'name': 'location', 'type': 'geojson'}, 80 | {'name': 'geopoint', 'type': 'geopoint'}, 81 | ], 82 | }, 83 | 'data': [ 84 | ['{"type": "Point","coordinates":[33.33,33.33]}', '30,75'], 85 | ['{"type": "Point","coordinates":[50.00,50.00]}', '90,45'], 86 | ], 87 | } 88 | COMPOUND = { 89 | 'schema': { 90 | 'fields': [ 91 | {'name': 'stats', 'type': 'object'}, 92 | {'name': 'persons', 'type': 'array'}, 93 | ], 94 | }, 95 | 'data': [ 96 | ['{"chars":560}', '["Mike", "John"]'], 97 | ['{"chars":970}', '["Paul", "Alex"]'], 98 | ], 99 | } 100 | 101 | 102 | # Tests 103 | 104 | def test_storage(): 105 | 106 | # Create storage 107 | storage = Storage() 108 | 109 | # Delete buckets 110 | storage.delete() 111 | 112 | # Create buckets 113 | storage.create(['articles', 'comments'], [ARTICLES['schema'], COMMENTS['schema']]) 114 | storage.create('comments', COMMENTS['schema'], force=True) 115 | storage.create('temporal', TEMPORAL['schema']) 116 | storage.create('location', LOCATION['schema']) 117 | storage.create('compound', COMPOUND['schema']) 118 | 119 | # Write data 120 | storage.write('articles', ARTICLES['data']) 121 |
storage.write('comments', COMMENTS['data']) 122 | storage.write('temporal', TEMPORAL['data']) 123 | storage.write('location', LOCATION['data']) 124 | storage.write('compound', COMPOUND['data']) 125 | 126 | # Create new storage to use reflection only 127 | dataframes = OrderedDict() 128 | dataframes['articles'] = storage['articles'] 129 | dataframes['comments'] = storage['comments'] 130 | dataframes['temporal'] = storage['temporal'] 131 | dataframes['location'] = storage['location'] 132 | dataframes['compound'] = storage['compound'] 133 | storage = Storage(dataframes=dataframes) 134 | 135 | # Create existent bucket 136 | with pytest.raises(tableschema.exceptions.StorageError): 137 | storage.create('articles', ARTICLES['schema']) 138 | 139 | # Assert buckets 140 | assert storage.buckets == ['articles', 'comments', 'compound', 'location', 'temporal'] 141 | 142 | # Assert schemas 143 | assert storage.describe('articles') == { 144 | 'fields': [ 145 | {'name': 'id', 'type': 'integer', 'constraints': {'required': True}}, 146 | {'name': 'parent', 'type': 'number'}, # type downgrade 147 | {'name': 'name', 'type': 'string'}, 148 | {'name': 'current', 'type': 'boolean'}, 149 | {'name': 'rating', 'type': 'number'}, 150 | ], 151 | 'primaryKey': 'id', 152 | } 153 | assert storage.describe('comments') == { 154 | 'fields': [ 155 | {'name': 'entry_id', 'type': 'integer', 'constraints': {'required': True}}, 156 | {'name': 'comment', 'type': 'string'}, 157 | {'name': 'note', 'type': 'string'}, # type downgrade 158 | ], 159 | 'primaryKey': 'entry_id', 160 | } 161 | assert storage.describe('temporal') == { 162 | 'fields': [ 163 | {'name': 'date', 'type': 'date'}, 164 | {'name': 'date_year', 'type': 'date'}, # format removal 165 | {'name': 'datetime', 'type': 'datetime'}, 166 | {'name': 'duration', 'type': 'duration'}, 167 | {'name': 'time', 'type': 'time'}, 168 | {'name': 'year', 'type': 'integer'}, # type downgrade 169 | {'name': 'yearmonth', 'type': 'array'}, # type downgrade 170 | ], 171 | } 172 | assert storage.describe('location') == { 173 | 'fields': [ 174 | {'name': 'location', 'type': 'object'}, # type downgrade 175 | {'name': 'geopoint', 'type': 'array'}, # type downgrade 176 | ], 177 | } 178 | assert storage.describe('compound') == COMPOUND['schema'] 179 | 180 | assert storage.read('articles') == cast(ARTICLES)['data'] 181 | assert storage.read('comments') == cast(COMMENTS)['data'] 182 | assert storage.read('temporal') == cast(TEMPORAL, wrap={'yearmonth': list})['data'] 183 | assert storage.read('location') == cast(LOCATION, wrap_each={'geopoint': Decimal})['data'] 184 | assert storage.read('compound') == cast(COMPOUND)['data'] 185 | 186 | # Assert data with forced schema 187 | storage.describe('compound', COMPOUND['schema']) 188 | assert storage.read('compound') == cast(COMPOUND)['data'] 189 | 190 | # Delete non existent bucket 191 | with pytest.raises(tableschema.exceptions.StorageError): 192 | storage.delete('non_existent') 193 | 194 | # Delete buckets 195 | storage.delete() 196 | 197 | 198 | def test_storage_table_without_primary_key(): 199 | schema = { 200 | 'fields': [ 201 | {'name': 'a', 'type': 'integer'}, 202 | {'name': 'b', 'type': 'string'}, 203 | ] 204 | } 205 | data = [[1, 'x'], [2, 'y']] 206 | 207 | storage = Storage() 208 | storage.create('data', schema) 209 | storage.write('data', data) 210 | assert list(storage.read('data')) == data 211 | 212 | 213 | def test_storage_init_tables(): 214 | data = [ 215 | (1, 'a'), 216 | (2, 'b'), 217 | ] 218 | df = pd.DataFrame(data, columns=('key',
'value')) 219 | storage = Storage(dataframes={'data': df}) 220 | assert list(storage.read('data')) == [[1, 'a'], [2, 'b']] 221 | assert storage.describe('data') == { 222 | 'fields': [ 223 | {'name': 'key', 'type': 'integer'}, 224 | {'name': 'value', 'type': 'string'}, 225 | ] 226 | } 227 | 228 | 229 | def test_storage_restore_schema_with_primary_key(): 230 | data = [ 231 | ('a',), 232 | ('b',), 233 | ] 234 | index = pd.Index([1, 2], name='key') 235 | df = pd.DataFrame(data, columns=('value',), index=index) 236 | storage = Storage(dataframes={'data': df}) 237 | assert list(storage.read('data')) == [[1, 'a'], [2, 'b']] 238 | assert storage.describe('data') == { 239 | 'primaryKey': 'key', 240 | 'fields': [ 241 | {'name': 'key', 'type': 'integer', 'constraints': {'required': True}}, 242 | {'name': 'value', 'type': 'string'}, 243 | ] 244 | } 245 | 246 | 247 | def test_storage_read_missing_table(): 248 | storage = Storage() 249 | with pytest.raises(tableschema.exceptions.StorageError) as excinfo: 250 | list(storage.read('data')) 251 | assert str(excinfo.value) == 'Bucket "data" doesn\'t exist.' 252 | 253 | 254 | def test_storage_multiple_writes(): 255 | index = pd.Index([1, 2], name='key') 256 | df = pd.DataFrame([('a',), ('b',)], columns=('value',), index=index) 257 | storage = Storage(dataframes={'data': df}) 258 | storage.write('data', [(2, 'x'), (3, 'y')]) 259 | assert list(storage.read('data')) == [ 260 | [1, 'a'], 261 | [2, 'b'], 262 | [2, 'x'], 263 | [3, 'y'], 264 | ] 265 | 266 | 267 | def test_storage_composite_primary_key(): 268 | schema = { 269 | 'fields': [ 270 | {'name': 'field1', 'type': 'string'}, 271 | {'name': 'field2', 'type': 'string'}, 272 | {'name': 'field3', 'type': 'string'}, 273 | ], 274 | 'primaryKey': ['field1', 'field2'], 275 | } 276 | data = [['value1', 'value2', 'value3']] 277 | storage = Storage() 278 | storage.create('bucket', schema) 279 | storage.write('bucket', data) 280 | assert storage['bucket'].to_dict() == {'field3': {('value1', 'value2'): 'value3'}} 281 | 282 | 283 | # Helpers 284 | 285 | def cast(resource, skip=[], wrap={}, wrap_each={}): 286 | resource = deepcopy(resource) 287 | schema = tableschema.Schema(resource['schema']) 288 | for row in resource['data']: 289 | for index, field in enumerate(schema.fields): 290 | value = row[index] 291 | if field.type not in skip: 292 | value = field.cast_value(value) 293 | if field.type in wrap: 294 | value = wrap[field.type](value) 295 | if field.type in wrap_each: 296 | value = list(map(wrap_each[field.type], value)) 297 | row[index] = value 298 | return resource 299 | --------------------------------------------------------------------------------
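The `cast` helper above converts a resource's string fixtures through its Table Schema so they can be compared with `storage.read` output. A small illustration using the COMMENTS fixture (a sketch; values assume tableschema's default casting):

```python
# Sketch: what cast() yields for the COMMENTS fixture defined in tests/test_storage.py.
# entry_id is cast to int by its 'integer' field; the 'string' and 'any'
# columns pass through unchanged.
expected = cast(COMMENTS)['data']
assert expected == [[1, 'good', 'note1'], [2, 'bad', 'note2']]
```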