├── LEAD.md
├── tests
│   ├── __init__.py
│   ├── test_mapper.py
│   └── test_storage.py
├── examples
│   └── __init__.py
├── tableschema_pandas
│   ├── VERSION
│   ├── __init__.py
│   ├── storage.py
│   └── mapper.py
├── setup.cfg
├── pytest.ini
├── data
│   ├── comments.csv
│   ├── sample.csv
│   ├── vix.csv
│   ├── articles.csv
│   ├── comments.json
│   └── articles.json
├── MANIFEST.in
├── pylama.ini
├── .github
│   ├── pull_request_template.md
│   ├── issue_template.md
│   ├── workflows
│   │   └── release.yml
│   └── stale.yml
├── tox.ini
├── .gitignore
├── Makefile
├── .travis.yml
├── setup.py
├── README.md
└── LICENSE.md
/LEAD.md:
--------------------------------------------------------------------------------
1 | roll
2 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/examples/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tableschema_pandas/VERSION:
--------------------------------------------------------------------------------
1 | 1.1.0
2 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal=1
3 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | testpaths = tests
3 |
--------------------------------------------------------------------------------
/data/comments.csv:
--------------------------------------------------------------------------------
1 | entry_id,comment
2 | 1,good
3 |
--------------------------------------------------------------------------------
/data/sample.csv:
--------------------------------------------------------------------------------
1 | Id;Col1;Col2;Col3
2 | 101;1.1;1.2;1.3
3 | 102;2.1;2.2;2.3
4 |
--------------------------------------------------------------------------------
/data/vix.csv:
--------------------------------------------------------------------------------
1 | Date;VIXClose;VIXHigh;VIXLow;VIXOpen
2 | 2004-01-05T00:00:00Z;17.49;18.49;17.44;18.45
3 | 2004-01-06T00:00:00Z;16.73;17.67;16.19;17.66
4 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | global-include VERSION
2 | include LICENSE.md
3 | include Makefile
4 | include pylama.ini
5 | include pytest.ini
6 | include README.md
7 | include tox.ini
8 |
--------------------------------------------------------------------------------
/pylama.ini:
--------------------------------------------------------------------------------
1 | [pylama]
2 | linters = pyflakes,mccabe,pep8
3 |
4 | [pylama:pep8]
5 | max_line_length = 100
6 |
7 | [pylama:mccabe]
8 | complexity = 32
9 |
10 | [pylama:*/__init__.py]
11 | ignore = W0611
12 |
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | Please replace this line with full information about your pull request. Make sure that tests pass before publishing it.
4 |
5 | ---
6 |
7 | Please preserve this line to notify @roll (lead of this repository)
8 |
--------------------------------------------------------------------------------
/.github/issue_template.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | Please replace this line with full information about your idea or problem. If it's a bug, share as much information as possible to reproduce it.
4 |
5 | ---
6 |
7 | Please preserve this line to notify @roll (lead of this repository)
8 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: release
2 |
3 | on:
4 | push:
5 | tags:
6 | - 'v*.*.*'
7 |
8 | jobs:
9 | build:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - name: Checkout
13 | uses: actions/checkout@v1
14 | - name: Release
15 | uses: softprops/action-gh-release@v1
16 | env:
17 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
18 |
--------------------------------------------------------------------------------
/data/articles.csv:
--------------------------------------------------------------------------------
1 | id,parent,name,current,rating,created_year,created_date,created_time,created_datetime,stats,persons,location
2 | 1,,Taxes,True,9.5,2015,2015-01-01,03:00:00,2015-01-01T03:00:00Z,{"chars":560},["mike"],"{""type"": ""Point"",""coordinates"":[50.00,50.00]}"
3 | 2,1,中国人,False,7,2015,2015-12-31,15:45:33,2015-12-31T15:45:33Z,{"chars":970},["chen"],"{""type"": ""Point"",""coordinates"":[33.33,33.33]}"
4 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | package=tableschema_pandas
3 | skip_missing_interpreters=true
4 | envlist=
5 | py27
6 | py36
7 | py37
8 | py38
9 |
10 | [testenv]
11 | deps=
12 | mock
13 | pytest
14 | pytest-cov
15 | coverage
16 | passenv=
17 | CI
18 | TRAVIS
19 | TRAVIS_JOB_ID
20 | TRAVIS_BRANCH
21 | commands=
22 | py.test \
23 | --cov {[tox]package} \
24 | --cov-config tox.ini \
25 | --cov-report term-missing \
26 | {posargs}
27 |
--------------------------------------------------------------------------------
/tableschema_pandas/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import division
3 | from __future__ import print_function
4 | from __future__ import absolute_import
5 | from __future__ import unicode_literals
6 |
7 |
8 | # Module API
9 |
10 | from .storage import Storage
11 |
12 |
13 | # Version
14 |
15 | import io
16 | import os
17 | __version__ = io.open(
18 | os.path.join(os.path.dirname(__file__), 'VERSION'),
19 | encoding='utf-8').read().strip()
20 |
--------------------------------------------------------------------------------
/data/comments.json:
--------------------------------------------------------------------------------
1 | {
2 | "primaryKey": "entry_id",
3 | "foreignKeys": [
4 | {
5 | "fields": "entry_id",
6 | "reference": {
7 | "fields": "id",
8 | "resource": "
",
9 | "table": "articles"
10 | }
11 | }
12 | ],
13 | "fields": [
14 | {
15 | "name": "entry_id",
16 | "type": "integer",
17 | "constraints": {
18 | "required": true
19 | }
20 | },
21 | {
22 | "name": "comment",
23 | "type": "string"
24 | }
25 | ]
26 | }
27 |
--------------------------------------------------------------------------------
/.github/stale.yml:
--------------------------------------------------------------------------------
1 | # Number of days of inactivity before an issue becomes stale
2 | daysUntilStale: 90
3 |
4 | # Number of days of inactivity before a stale issue is closed
5 | daysUntilClose: 30
6 |
7 | # Issues with these labels will never be considered stale
8 | exemptLabels:
9 | - feature
10 | - enhancement
11 | - bug
12 |
13 | # Label to use when marking an issue as stale
14 | staleLabel: wontfix
15 |
16 | # Comment to post when marking an issue as stale. Set to `false` to disable
17 | markComment: >
18 | This issue has been automatically marked as stale because it has not had
19 | recent activity. It will be closed if no further activity occurs. Thank you
20 | for your contributions.
21 |
22 | # Comment to post when closing a stale issue. Set to `false` to disable
23 | closeComment: false
24 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 |
55 | # Sphinx documentation
56 | docs/_build/
57 |
58 | # PyBuilder
59 | target/
60 |
61 | # IPython Notebook
62 | .ipynb_checkpoints
63 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: all install list readme release templates test version
2 |
3 |
4 | PACKAGE := $(shell grep '^PACKAGE =' setup.py | cut -d "'" -f2)
5 | VERSION := $(shell head -n 1 $(PACKAGE)/VERSION)
6 | LEAD := $(shell head -n 1 LEAD.md)
7 |
8 |
9 | all: list
10 |
11 | install:
12 | pip install --upgrade -e .[develop]
13 |
14 | list:
15 | @grep '^\.PHONY' Makefile | cut -d' ' -f2- | tr ' ' '\n'
16 |
17 | readme:
18 | pip install md-toc
19 | pip install referencer
20 | referencer $(PACKAGE) README.md --in-place
21 | md_toc -p README.md github --header-levels 3
22 | sed -i '/(#tableschema-pandas-py)/,+2d' README.md
23 |
24 | release:
25 | git checkout master && git pull origin && git fetch -p && git diff
26 | @echo "\nContinuing in 10 seconds. Press to abort\n" && sleep 10
27 | @git log --pretty=format:"%C(yellow)%h%Creset %s%Cgreen%d" --reverse -20
28 | @echo "\nReleasing v$(VERSION) in 10 seconds. Press to abort\n" && sleep 10
29 | git commit -a -m 'v$(VERSION)' && git tag -a v$(VERSION) -m 'v$(VERSION)'
30 | git push --follow-tags
31 |
32 | templates:
33 | sed -i -E "s/@(\w*)/@$(LEAD)/" .github/issue_template.md
34 | sed -i -E "s/@(\w*)/@$(LEAD)/" .github/pull_request_template.md
35 |
36 | test:
37 | pylama $(PACKAGE)
38 | tox
39 |
40 | version:
41 | @echo $(VERSION)
42 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | dist:
2 | xenial
3 |
4 | sudo:
5 | false
6 |
7 | language:
8 | python
9 |
10 | python:
11 | - 2.7
12 | - 3.6
13 | - 3.7
14 | - 3.8
15 |
16 | env:
17 | global:
18 | - TOXENV="py${TRAVIS_PYTHON_VERSION//./}"
19 |
20 | install:
21 | - make install
22 | - pip install coveralls
23 |
24 | script:
25 | - make test
26 |
27 | after_success:
28 | - coveralls
29 |
30 | jobs:
31 | include:
32 | - stage: release
33 | if: tag IS present
34 | python: 3.8
35 | deploy:
36 | provider: pypi
37 | user: roll
38 | distributions: sdist bdist_wheel
39 | skip_cleanup: true
40 | on:
41 | tags: true
42 | password:
43 | secure: jAtHR6cR8G2+92dVZcgIDvIbBXBoZEWMuw0m/xIo58M5U/PBqBzizcfaYOkK5dSYewBN0+dt4dURymzA/KKEPRdOPXoZA4kKxA9dh8BqK7wKO9koY6Gg6WvCIDTO+36PLqHgPd2CQjdp3pSKbYoKkUAeUrlfUAWYL4C+D8N/WAYMEjlHsBxqZDuSJqaiIWiwaPAZKcavw5Tlr9WExM2baWd1zdHHU23FwCWqT4k2QvVU96fMBc8/3j8rxqdQxTcZSG0GRlcqhx0/px0JNH4x8emCriX25Hc24TNohLrZflBOkrJvlHZ0U9/+IUZZehUTeN86JslfkQQguLFAvQ/2htMf1Bv7LwIIdJRjTlR3x+ODZM3H0juA3paKztjp1GePuu4hGJf9KGI2caolryicQl1ficU/6KfLrlg3aVaXYg9um+9GqhhMbUuRtNjzhZLYj2vZfI5BSkb1FOpvP1ApEvKBWW/oQi+Sh7YEBMf4jTf0bVYqRZnvoohLG3GID9rR51Yh3rehZPMgU2CXnpCwRa1yprimq7qZGetleryTkRcF54s2+3kFnpL4Y6hXcXuD8UrWZ2+ZOakC3C2FS2o7rQc6kyumcHyM9wNtTdSORAnArkylqEEjTE87vkfn5OgHSlXXZ9MLClMWmh80JKe8VRNq4q5L9EfoOM+Ej2ue+7Y=
44 |
--------------------------------------------------------------------------------
/data/articles.json:
--------------------------------------------------------------------------------
1 | {
2 | "primaryKey": "id",
3 | "foreignKeys": [
4 | {
5 | "fields": "parent",
6 | "reference": {
7 | "fields": "id",
8 | "resource": "self"
9 | }
10 | }
11 | ],
12 | "fields": [
13 | {
14 | "name": "id",
15 | "type": "integer",
16 | "constraints": {
17 | "required": true
18 | }
19 | },
20 | {
21 | "name": "parent",
22 | "type": "integer"
23 | },
24 | {
25 | "name": "name",
26 | "type": "string"
27 | },
28 | {
29 | "name": "current",
30 | "type": "boolean"
31 | },
32 | {
33 | "name": "rating",
34 | "type": "number"
35 | },
36 | {
37 | "name": "created_year",
38 | "type": "date",
39 | "format": "fmt:%Y"
40 | },
41 | {
42 | "name": "created_date",
43 | "type": "date"
44 | },
45 | {
46 | "name": "created_time",
47 | "type": "time"
48 | },
49 | {
50 | "name": "created_datetime",
51 | "type": "datetime"
52 | },
53 | {
54 | "name": "stats",
55 | "type": "object"
56 | },
57 | {
58 | "name": "persons",
59 | "type": "array"
60 | },
61 | {
62 | "name": "location",
63 | "type": "geojson"
64 | }
65 |
66 | ]
67 | }
68 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import division
3 | from __future__ import print_function
4 | from __future__ import absolute_import
5 |
6 | import io
7 | import os.path
8 | from setuptools import setup, find_packages
9 |
10 |
11 | # Helpers
12 | def read(*segments):
13 | path = os.path.join(*segments)
14 | with io.open(path, encoding='utf-8') as f:
15 | return f.read().strip()
16 |
17 |
18 | # Prepare
19 | PACKAGE = 'tableschema_pandas'
20 | NAME = PACKAGE.replace('_', '-')
21 | INSTALL_REQUIRES = [
22 | 'six>=1.9',
23 | 'pandas>=0.18',
24 | 'tabulator>=1.0',
25 | 'tableschema>=1.1',
26 | 'isodate>=0.6',
27 | ]
28 | TESTS_REQUIRE = [
29 | 'mock',
30 | 'pylama',
31 | 'pytest',
32 | 'pytest-cov',
33 | 'tox',
34 | ]
35 | README = read('README.md')
36 | VERSION = read(PACKAGE, 'VERSION')
37 | PACKAGES = find_packages(exclude=['tests'])
38 |
39 |
40 | # Run
41 | setup(
42 | name=NAME,
43 | version=VERSION,
44 | packages=PACKAGES,
45 | include_package_data=True,
46 | install_requires=INSTALL_REQUIRES,
47 | tests_require=TESTS_REQUIRE,
48 | extras_require={'develop': TESTS_REQUIRE},
49 | zip_safe=False,
50 | long_description=README,
51 | long_description_content_type='text/markdown',
52 | description='Generate Pandas data frames, load and extract data, based on JSON Table Schema descriptors.',
53 | author='Open Knowledge Foundation',
54 | author_email='info@okfn.org',
55 | url='https://github.com/frictionlessdata/tableschema-pandas-py',
56 | license='LGPLv3+',
57 | keywords=['frictionless data', 'datapackage', 'pandas'],
58 | classifiers=[
59 | 'Development Status :: 4 - Beta',
60 | 'Intended Audience :: Developers',
61 | 'Intended Audience :: Science/Research',
62 | 'License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)',
63 | 'Operating System :: OS Independent',
64 | 'Programming Language :: Python :: 2',
65 | 'Programming Language :: Python :: 2.7',
66 | 'Programming Language :: Python :: 3',
67 | 'Programming Language :: Python :: 3.4',
68 | 'Programming Language :: Python :: 3.5',
69 | 'Topic :: Scientific/Engineering :: Information Analysis',
70 | 'Topic :: Software Development :: Libraries :: Python Modules'
71 | ],
72 | )
73 |
--------------------------------------------------------------------------------
/tests/test_mapper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import division
3 | from __future__ import print_function
4 | from __future__ import absolute_import
5 | from __future__ import unicode_literals
6 |
7 | import six
8 | import pytest
9 | import datetime
10 | import tableschema
11 | import numpy as np
12 | import pandas as pd
13 | from tableschema_pandas.mapper import Mapper
14 |
15 |
16 | # Tests
17 |
18 | def test_mapper_convert_descriptor_and_rows():
19 | mapper = Mapper()
20 | df = pd.read_csv('data/sample.csv', sep=';', index_col=['Id'])
21 | descriptor = mapper.restore_descriptor(df)
22 | rows = df.reset_index().values
23 | df_new = mapper.convert_descriptor_and_rows(descriptor, rows)
24 | assert isinstance(df_new.index, pd.Index)
25 |
26 |
27 | @pytest.mark.skip
28 | def test_mapper_convert_descriptor_and_rows_with_datetime_index():
29 | mapper = Mapper()
30 | df = pd.read_csv('data/vix.csv', sep=';', parse_dates=['Date'], index_col=['Date'])
31 | descriptor = mapper.restore_descriptor(df)
32 | rows = df.reset_index().values
33 | df_new = mapper.convert_descriptor_and_rows(descriptor, rows)
34 | assert isinstance(df_new.index, pd.DatetimeIndex)
35 |
36 |
37 | def test_mapper_convert_type():
38 | mapper = Mapper()
39 | assert mapper.convert_type('string') == np.dtype('O')
40 | assert mapper.convert_type('year') == np.dtype(int)
41 | assert mapper.convert_type('yearmonth') == np.dtype(list)
42 | assert mapper.convert_type('duration') == np.dtype('O')
43 | with pytest.raises(tableschema.exceptions.StorageError):
44 | mapper.convert_type('non-existent')
45 |
46 |
47 | def test_mapper_restore_descriptor():
48 | mapper = Mapper()
49 | df = pd.read_csv('data/sample.csv', sep=';', index_col=['Id'])
50 | descriptor = mapper.restore_descriptor(df)
51 | assert descriptor == {
52 | 'fields': [
53 | {'name': 'Id', 'type': 'integer', 'constraints': {'required': True}},
54 | {'name': 'Col1', 'type': 'number'},
55 | {'name': 'Col2', 'type': 'number'},
56 | {'name': 'Col3', 'type': 'number'},
57 | ],
58 | 'primaryKey': 'Id',
59 | }
60 |
61 |
62 | def test_mapper_restore_type():
63 | mapper = Mapper()
64 | df = pd.DataFrame([{
65 | 'string': 'foo',
66 | 'number': 3.14,
67 | 'integer': 42,
68 | 'boolean': True,
69 | 'datetime': datetime.datetime.now(),
70 | }])
71 | assert mapper.restore_type(df.dtypes['string']) == 'string'
72 | assert mapper.restore_type(df.dtypes['number']) == 'number'
73 | assert mapper.restore_type(df.dtypes['integer']) == 'integer'
74 | assert mapper.restore_type(df.dtypes['boolean']) == 'boolean'
75 | assert mapper.restore_type(df.dtypes['datetime']) == 'datetime'
76 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # tableschema-pandas-py
2 |
3 | [](https://travis-ci.org/frictionlessdata/tableschema-pandas-py)
4 | [](https://coveralls.io/r/frictionlessdata/tableschema-pandas-py?branch=master)
5 | [](https://pypi.python.org/pypi/tableschema-pandas)
6 | [](https://github.com/frictionlessdata/tableschema-pandas-py)
7 | [](https://gitter.im/frictionlessdata/chat)
8 |
9 | Generate and load Pandas data frames based on [Table Schema](http://specs.frictionlessdata.io/table-schema/) descriptors.
10 |
11 | ## Features
12 |
13 | - implements `tableschema.Storage` interface
14 |
15 | ## Contents
16 |
17 |
18 |
19 | - [Getting Started](#getting-started)
20 | - [Installation](#installation)
21 | - [Documentation](#documentation)
22 | - [API Reference](#api-reference)
23 | - [`Storage`](#storage)
24 | - [Contributing](#contributing)
25 | - [Changelog](#changelog)
26 |
27 |
28 |
29 | ## Getting Started
30 |
31 | ### Installation
32 |
33 | The package uses semantic versioning, which means that major versions could include breaking changes. It's highly recommended to specify a version range in your `setup/requirements` file, e.g. `package>=1.0,<2.0`.
34 |
35 | ```
36 | $ pip install tableschema-pandas
37 | ```
38 |
39 | ## Documentation
40 |
41 | ```python
42 | # pip install datapackage tableschema-pandas
43 | from datapackage import Package
44 |
45 | # Save to Pandas
46 |
47 | package = Package('http://data.okfn.org/data/core/country-list/datapackage.json')
48 | storage = package.save(storage='pandas')
49 |
50 | print(type(storage['data']))
51 | # <class 'pandas.core.frame.DataFrame'>
52 |
53 | print(storage['data'].head())
54 | # Name Code
55 | # 0 Afghanistan AF
56 | # 1 Åland Islands AX
57 | # 2 Albania AL
58 | # 3 Algeria DZ
59 | # 4 American Samoa AS
60 |
61 | # Load from Pandas
62 |
63 | package = Package(storage=storage)
64 | print(package.descriptor)
65 | print(package.resources[0].read())
66 | ```
67 |
68 | Storage works as a container for Pandas data frames. You can define a new data frame inside the storage using the `storage.create` method:
69 |
70 | ```python
71 | >>> from tableschema_pandas import Storage
72 |
73 | >>> storage = Storage()
74 | ```
75 |
76 | ```python
77 | >>> storage.create('data', {
78 | ... 'primaryKey': 'id',
79 | ... 'fields': [
80 | ... {'name': 'id', 'type': 'integer'},
81 | ... {'name': 'comment', 'type': 'string'},
82 | ... ]
83 | ... })
84 |
85 | >>> storage.buckets
86 | ['data']
87 |
88 | >>> storage['data'].shape
89 | (0, 0)
90 | ```
91 |
92 | Use `storage.write` to populate data frame with data:
93 |
94 | ```python
95 | >>> storage.write('data', [(1, 'a'), (2, 'b')])
96 |
97 | >>> storage['data']
98 | id comment
99 | 1 a
100 | 2 b
101 | ```
102 |
103 | You can also use [tabulator](https://github.com/frictionlessdata/tabulator-py) to populate a data frame from an external data file. As you can see, subsequent writes simply append new data to the existing data:
104 |
105 | ```python
106 | >>> import tabulator
107 |
108 | >>> with tabulator.Stream('data/comments.csv', headers=1) as stream:
109 | ... storage.write('data', stream)
110 |
111 | >>> storage['data']
112 | id comment
113 | 1 a
114 | 2 b
115 | 1 good
116 | ```
117 |
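Because `Storage` implements the `tableschema.Storage` interface, data can be read back out of a bucket with `storage.read` (or iterated with `storage.iter`); values are cast against the bucket's schema on the way out. A short sketch, continuing the session above:

```python
>>> storage.read('data')
[[1, 'a'], [2, 'b'], [1, 'good']]
```
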
118 | ## API Reference
119 |
120 | ### `Storage`
121 | ```python
122 | Storage(self, dataframes=None)
123 | ```
124 | Pandas storage
125 |
126 | The package implements the
127 | [Tabular Storage](https://github.com/frictionlessdata/tableschema-py#storage)
128 | interface (see the link for full documentation):
129 |
130 | 
131 |
132 | > Only the additional API is documented
133 |
134 | __Arguments__
135 | - __dataframes (object[])__: list of storage dataframes
136 |
137 |
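For example, a storage can be initialized directly from existing data frames, with the schema restored by reflection; a minimal sketch based on the test suite:

```python
import pandas as pd
from tableschema_pandas import Storage

# Wrap an existing data frame into a storage bucket named 'data'
df = pd.DataFrame([(1, 'a'), (2, 'b')], columns=('key', 'value'))
storage = Storage(dataframes={'data': df})

print(storage.read('data'))
# [[1, 'a'], [2, 'b']]
print(storage.describe('data'))
# {'fields': [{'name': 'key', 'type': 'integer'}, {'name': 'value', 'type': 'string'}]}
```
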
138 | ## Contributing
139 |
140 | > The project follows the [Open Knowledge International coding standards](https://github.com/okfn/coding-standards).
141 |
142 | The recommended way to get started is to create and activate a project virtual environment.
143 | To install the package and development dependencies into the active environment:
144 |
145 | ```bash
146 | $ make install
147 | ```
148 |
149 | To run tests with linting and coverage:
150 |
151 | ```bash
152 | $ make test
153 | ```
154 |
155 | ## Changelog
156 |
157 | Only breaking and the most important changes are described here. The full changelog and documentation for all released versions can be found in the nicely formatted [commit history](https://github.com/frictionlessdata/tableschema-pandas-py/commits/master).
158 |
159 | #### v1.1
160 |
161 | - Added support for composite primary keys (loading to pandas)
162 |
163 | #### v1.0
164 |
165 | - Initial driver implementation
166 |
--------------------------------------------------------------------------------
/tableschema_pandas/storage.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import division
3 | from __future__ import print_function
4 | from __future__ import absolute_import
5 | from __future__ import unicode_literals
6 |
7 | import six
8 | import collections
9 | import tableschema
10 | import pandas as pd
11 | from .mapper import Mapper
12 |
13 |
14 | # Module API
15 |
16 | class Storage(tableschema.Storage):
17 | """Pandas storage
18 |
19 | The package implements the
20 | [Tabular Storage](https://github.com/frictionlessdata/tableschema-py#storage)
21 | interface (see the link for full documentation):
22 |
23 | 
24 |
25 | > Only the additional API is documented
26 |
27 | # Arguments
28 | dataframes (object[]): list of storage dataframes
29 |
30 | """
31 |
32 | # Public
33 |
34 | def __init__(self, dataframes=None):
35 |
36 | # Set attributes
37 | self.__dataframes = dataframes or collections.OrderedDict()
38 | self.__descriptors = {}
39 |
40 | # Create mapper
41 | self.__mapper = Mapper()
42 |
43 | def __repr__(self):
44 | return 'Storage'
45 |
46 | def __getitem__(self, key):
47 | """Returns Pandas dataframe
48 |
49 | # Arguments
50 | name (str): name
51 |
52 | """
53 | return self.__dataframes[key]
54 |
55 | @property
56 | def buckets(self):
57 | return list(sorted(self.__dataframes.keys()))
58 |
59 | def create(self, bucket, descriptor, force=False):
60 |
61 | # Make lists
62 | buckets = bucket
63 | if isinstance(bucket, six.string_types):
64 | buckets = [bucket]
65 | descriptors = descriptor
66 | if isinstance(descriptor, dict):
67 | descriptors = [descriptor]
68 |
69 | # Check buckets for existence
70 | for bucket in buckets:
71 | if bucket in self.buckets:
72 | if not force:
73 | message = 'Bucket "%s" already exists' % bucket
74 | raise tableschema.exceptions.StorageError(message)
75 | self.delete(bucket)
76 |
77 | # Define dataframes
78 | for bucket, descriptor in zip(buckets, descriptors):
79 | tableschema.validate(descriptor)
80 | self.__descriptors[bucket] = descriptor
81 | self.__dataframes[bucket] = pd.DataFrame()
82 |
83 | def delete(self, bucket=None, ignore=False):
84 |
85 | # Make lists
86 | buckets = bucket
87 | if isinstance(bucket, six.string_types):
88 | buckets = [bucket]
89 | elif bucket is None:
90 | buckets = reversed(self.buckets)
91 |
92 | # Iterate over buckets
93 | for bucket in buckets:
94 |
95 | # Non existent bucket
96 | if bucket not in self.buckets:
97 | if not ignore:
98 | message = 'Bucket "%s" doesn\'t exist' % bucket
99 | raise tableschema.exceptions.StorageError(message)
100 | continue
101 |
102 | # Remove from descriptors
103 | if bucket in self.__descriptors:
104 | del self.__descriptors[bucket]
105 |
106 | # Remove from dataframes
107 | if bucket in self.__dataframes:
108 | del self.__dataframes[bucket]
109 |
110 | def describe(self, bucket, descriptor=None):
111 |
112 | # Set descriptor
113 | if descriptor is not None:
114 | self.__descriptors[bucket] = descriptor
115 |
116 | # Get descriptor
117 | else:
118 | descriptor = self.__descriptors.get(bucket)
119 | if descriptor is None:
120 | dataframe = self.__dataframes[bucket]
121 | descriptor = self.__mapper.restore_descriptor(dataframe)
122 |
123 | return descriptor
124 |
125 | def iter(self, bucket):
126 |
127 | # Check existence
128 | if bucket not in self.buckets:
129 | message = 'Bucket "%s" doesn\'t exist.' % bucket
130 | raise tableschema.exceptions.StorageError(message)
131 |
132 | # Prepare
133 | descriptor = self.describe(bucket)
134 | schema = tableschema.Schema(descriptor)
135 |
136 | # Yield rows
137 | for pk, row in self.__dataframes[bucket].iterrows():
138 | row = self.__mapper.restore_row(row, schema=schema, pk=pk)
139 | yield row
140 |
141 | def read(self, bucket):
142 | rows = list(self.iter(bucket))
143 | return rows
144 |
145 | def write(self, bucket, rows):
146 |
147 | # Prepare
148 | descriptor = self.describe(bucket)
149 | new_data_frame = self.__mapper.convert_descriptor_and_rows(descriptor, rows)
150 |
151 | # Just set new DataFrame if current is empty
152 | if self.__dataframes[bucket].size == 0:
153 | self.__dataframes[bucket] = new_data_frame
154 |
155 | # Append new data frame to the old one setting new data frame
156 | # containing data from both old and new data frames
157 | else:
158 | self.__dataframes[bucket] = pd.concat([
159 | self.__dataframes[bucket],
160 | new_data_frame,
161 | ])
162 |
--------------------------------------------------------------------------------
/tableschema_pandas/mapper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 |
7 | import six
8 | import json
9 | import isodate
10 | import datetime
11 | import tableschema
12 | import numpy as np
13 | import pandas as pd
14 |
15 | # Starting from pandas@0.24 there is the new API
16 | # https://github.com/frictionlessdata/tableschema-pandas-py/issues/29
17 | try:
18 | import pandas.core.dtypes.api as pdc
19 | except ImportError:
20 | import pandas.core.common as pdc
21 |
22 |
23 | # Module API
24 |
25 | class Mapper(object):
26 |
27 | # Public
28 |
29 | def convert_descriptor_and_rows(self, descriptor, rows):
30 | """Convert descriptor and rows to Pandas
31 | """
32 | schema = tableschema.Schema(descriptor)
33 |
34 | # Get data/index
35 | data_rows = []
36 | index_rows = []
37 | jtstypes_map = {}
38 | for row in rows:
39 | data_values = []
40 | index_values = []
41 | for field, value in zip(schema.fields, row):
42 | try:
43 | if isinstance(value, float) and np.isnan(value):
44 | value = None
45 | if value and field.type == 'integer':
46 | value = int(value)
47 | value = field.cast_value(value)
48 | except tableschema.exceptions.CastError:
49 | value = json.loads(value)
50 | # http://pandas.pydata.org/pandas-docs/stable/gotchas.html#support-for-integer-na
51 | if value is None and field.type in ('number', 'integer'):
52 | jtstypes_map[field.name] = 'number'
53 | value = np.NaN
54 | if field.name in schema.primary_key:
55 | index_values.append(value)
56 | else:
57 | data_values.append(value)
58 | if len(schema.primary_key) == 1:
59 | index_rows.append(index_values[0])
60 | elif len(schema.primary_key) > 1:
61 | index_rows.append(tuple(index_values))
62 | data_rows.append(tuple(data_values))
63 |
64 | # Create index
65 | index = None
66 | if schema.primary_key:
67 | if len(schema.primary_key) == 1:
68 | index_class = pd.Index
69 | index_field = schema.get_field(schema.primary_key[0])
70 | index_dtype = self.convert_type(index_field.type)
71 | if index_field.type in ['datetime', 'date']:
72 | index_class = pd.DatetimeIndex
73 | index = index_class(index_rows, name=index_field.name, dtype=index_dtype)
74 | elif len(schema.primary_key) > 1:
75 | index = pd.MultiIndex.from_tuples(index_rows, names=schema.primary_key)
76 |
77 | # Create dtypes/columns
78 | dtypes = []
79 | columns = []
80 | for field in schema.fields:
81 | if field.name not in schema.primary_key:
82 | field_name = field.name
83 | if six.PY2:
84 | field_name = field.name.encode('utf-8')
85 | dtype = self.convert_type(jtstypes_map.get(field.name, field.type))
86 | dtypes.append((field_name, dtype))
87 | columns.append(field.name)
88 |
89 | # Create dataframe
90 | array = np.array(data_rows, dtype=dtypes)
91 | dataframe = pd.DataFrame(array, index=index, columns=columns)
92 |
93 | return dataframe
94 |
95 | def convert_type(self, type):
96 | """Convert type to Pandas
97 | """
98 |
99 | # Mapping
100 | mapping = {
101 | 'any': np.dtype('O'),
102 | 'array': np.dtype(list),
103 | 'boolean': np.dtype(bool),
104 | 'date': np.dtype('O'),
105 | 'datetime': np.dtype('datetime64[ns]'),
106 | 'duration': np.dtype('O'),
107 | 'geojson': np.dtype('O'),
108 | 'geopoint': np.dtype('O'),
109 | 'integer': np.dtype(int),
110 | 'number': np.dtype(float),
111 | 'object': np.dtype(dict),
112 | 'string': np.dtype('O'),
113 | 'time': np.dtype('O'),
114 | 'year': np.dtype(int),
115 | 'yearmonth': np.dtype('O'),
116 | }
117 |
118 | # Get type
119 | if type not in mapping:
120 | message = 'Type "%s" is not supported' % type
121 | raise tableschema.exceptions.StorageError(message)
122 |
123 | return mapping[type]
124 |
125 | def restore_descriptor(self, dataframe):
126 | """Restore descriptor from Pandas
127 | """
128 |
129 | # Prepare
130 | fields = []
131 | primary_key = None
132 |
133 | # Primary key
134 | if dataframe.index.name:
135 | field_type = self.restore_type(dataframe.index.dtype)
136 | field = {
137 | 'name': dataframe.index.name,
138 | 'type': field_type,
139 | 'constraints': {'required': True},
140 | }
141 | fields.append(field)
142 | primary_key = dataframe.index.name
143 |
144 | # Fields
145 | for column, dtype in dataframe.dtypes.iteritems():
146 | sample = dataframe[column].iloc[0] if len(dataframe) else None
147 | field_type = self.restore_type(dtype, sample=sample)
148 | field = {'name': column, 'type': field_type}
149 | # TODO: provide better required indication
150 | # if dataframe[column].isnull().sum() == 0:
151 | # field['constraints'] = {'required': True}
152 | fields.append(field)
153 |
154 | # Descriptor
155 | descriptor = {}
156 | descriptor['fields'] = fields
157 | if primary_key:
158 | descriptor['primaryKey'] = primary_key
159 |
160 | return descriptor
161 |
162 | def restore_row(self, row, schema, pk):
163 | """Restore row from Pandas
164 | """
165 | result = []
166 | for field in schema.fields:
167 | if schema.primary_key and schema.primary_key[0] == field.name:
168 | if field.type == 'number' and np.isnan(pk):
169 | pk = None
170 | if pk and field.type == 'integer':
171 | pk = int(pk)
172 | result.append(field.cast_value(pk))
173 | else:
174 | value = row[field.name]
175 | if field.type == 'number' and np.isnan(value):
176 | value = None
177 | if value and field.type == 'integer':
178 | value = int(value)
179 | elif field.type == 'datetime':
180 | value = value.to_pydatetime()
181 | result.append(field.cast_value(value))
182 | return result
183 |
184 | def restore_type(self, dtype, sample=None):
185 | """Restore type from Pandas
186 | """
187 |
188 | # Pandas types
189 | if pdc.is_bool_dtype(dtype):
190 | return 'boolean'
191 | elif pdc.is_datetime64_any_dtype(dtype):
192 | return 'datetime'
193 | elif pdc.is_integer_dtype(dtype):
194 | return 'integer'
195 | elif pdc.is_numeric_dtype(dtype):
196 | return 'number'
197 |
198 | # Python types
199 | if sample is not None:
200 | if isinstance(sample, (list, tuple)):
201 | return 'array'
202 | elif isinstance(sample, datetime.date):
203 | return 'date'
204 | elif isinstance(sample, isodate.Duration):
205 | return 'duration'
206 | elif isinstance(sample, dict):
207 | return 'object'
208 | elif isinstance(sample, six.string_types):
209 | return 'string'
210 | elif isinstance(sample, datetime.time):
211 | return 'time'
212 |
213 | return 'string'
214 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (c) 2016 Mantas Zimnickas and Open Knowledge
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/tests/test_storage.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 |
7 | import io
8 | import six
9 | import json
10 | import pytest
11 | import datetime
12 | import tableschema
13 | import pandas as pd
14 | from copy import deepcopy
15 | from decimal import Decimal
16 | from tabulator import Stream
17 | from collections import OrderedDict
18 | from tableschema_pandas import Storage
19 |
20 |
21 | # Resources
22 |
23 | ARTICLES = {
24 | 'schema': {
25 | 'fields': [
26 | {'name': 'id', 'type': 'integer', 'constraints': {'required': True}},
27 | {'name': 'parent', 'type': 'integer'},
28 | {'name': 'name', 'type': 'string'},
29 | {'name': 'current', 'type': 'boolean'},
30 | {'name': 'rating', 'type': 'number'},
31 | ],
32 | 'primaryKey': 'id',
33 | # 'foreignKeys': [
34 | # {'fields': 'parent', 'reference': {'resource': '', 'fields': 'id'}},
35 | # ],
36 | },
37 | 'data': [
38 | ['1', '', 'Taxes', 'True', '9.5'],
39 | ['2', '1', '中国人', 'False', '7'],
40 | ],
41 | }
42 | COMMENTS = {
43 | 'schema': {
44 | 'fields': [
45 | {'name': 'entry_id', 'type': 'integer', 'constraints': {'required': True}},
46 | {'name': 'comment', 'type': 'string'},
47 | {'name': 'note', 'type': 'any'},
48 | ],
49 | 'primaryKey': 'entry_id',
50 | # 'foreignKeys': [
51 | # {'fields': 'entry_id', 'reference': {'resource': 'articles', 'fields': 'id'}},
52 | # ],
53 | },
54 | 'data': [
55 | ['1', 'good', 'note1'],
56 | ['2', 'bad', 'note2'],
57 | ],
58 | }
59 | TEMPORAL = {
60 | 'schema': {
61 | 'fields': [
62 | {'name': 'date', 'type': 'date'},
63 | {'name': 'date_year', 'type': 'date', 'format': '%Y'},
64 | {'name': 'datetime', 'type': 'datetime'},
65 | {'name': 'duration', 'type': 'duration'},
66 | {'name': 'time', 'type': 'time'},
67 | {'name': 'year', 'type': 'year'},
68 | {'name': 'yearmonth', 'type': 'yearmonth'},
69 | ],
70 | },
71 | 'data': [
72 | ['2015-01-01', '2015', '2015-01-01T03:00:00Z', 'P1Y1M', '03:00:00', '2015', '2015-01'],
73 | ['2015-12-31', '2015', '2015-12-31T15:45:33Z', 'P2Y2M', '15:45:33', '2015', '2015-01'],
74 | ],
75 | }
76 | LOCATION = {
77 | 'schema': {
78 | 'fields': [
79 | {'name': 'location', 'type': 'geojson'},
80 | {'name': 'geopoint', 'type': 'geopoint'},
81 | ],
82 | },
83 | 'data': [
84 | ['{"type": "Point","coordinates":[33.33,33.33]}', '30,75'],
85 | ['{"type": "Point","coordinates":[50.00,50.00]}', '90,45'],
86 | ],
87 | }
88 | COMPOUND = {
89 | 'schema': {
90 | 'fields': [
91 | {'name': 'stats', 'type': 'object'},
92 | {'name': 'persons', 'type': 'array'},
93 | ],
94 | },
95 | 'data': [
96 | ['{"chars":560}', '["Mike", "John"]'],
97 | ['{"chars":970}', '["Paul", "Alex"]'],
98 | ],
99 | }
100 |
101 |
102 | # Tests
103 |
104 | def test_storage():
105 |
106 | # Create storage
107 | storage = Storage()
108 |
109 | # Delete buckets
110 | storage.delete()
111 |
112 | # Create buckets
113 | storage.create(['articles', 'comments'], [ARTICLES['schema'], COMMENTS['schema']])
114 | storage.create('comments', COMMENTS['schema'], force=True)
115 | storage.create('temporal', TEMPORAL['schema'])
116 | storage.create('location', LOCATION['schema'])
117 | storage.create('compound', COMPOUND['schema'])
118 |
119 | # Write data
120 | storage.write('articles', ARTICLES['data'])
121 | storage.write('comments', COMMENTS['data'])
122 | storage.write('temporal', TEMPORAL['data'])
123 | storage.write('location', LOCATION['data'])
124 | storage.write('compound', COMPOUND['data'])
125 |
126 | # Create new storage to use reflection only
127 | dataframes = OrderedDict()
128 | dataframes['articles'] = storage['articles']
129 | dataframes['comments'] = storage['comments']
130 | dataframes['temporal'] = storage['temporal']
131 | dataframes['location'] = storage['location']
132 | dataframes['compound'] = storage['compound']
133 | storage = Storage(dataframes=dataframes)
134 |
135 | # Create existent bucket
136 | with pytest.raises(tableschema.exceptions.StorageError):
137 | storage.create('articles', ARTICLES['schema'])
138 |
139 | # Assert buckets
140 | assert storage.buckets == ['articles', 'comments', 'compound', 'location', 'temporal']
141 |
142 | # Assert schemas
143 | assert storage.describe('articles') == {
144 | 'fields': [
145 | {'name': 'id', 'type': 'integer', 'constraints': {'required': True}},
146 | {'name': 'parent', 'type': 'number'}, # type downgrade
147 | {'name': 'name', 'type': 'string'},
148 | {'name': 'current', 'type': 'boolean'},
149 | {'name': 'rating', 'type': 'number'},
150 | ],
151 | 'primaryKey': 'id',
152 | }
153 | assert storage.describe('comments') == {
154 | 'fields': [
155 | {'name': 'entry_id', 'type': 'integer', 'constraints': {'required': True}},
156 | {'name': 'comment', 'type': 'string'},
157 | {'name': 'note', 'type': 'string'}, # type downgrade
158 | ],
159 | 'primaryKey': 'entry_id',
160 | }
161 | assert storage.describe('temporal') == {
162 | 'fields': [
163 | {'name': 'date', 'type': 'date'},
164 | {'name': 'date_year', 'type': 'date'}, # format removal
165 | {'name': 'datetime', 'type': 'datetime'},
166 | {'name': 'duration', 'type': 'duration'},
167 | {'name': 'time', 'type': 'time'},
168 | {'name': 'year', 'type': 'integer'}, # type downgrade
169 | {'name': 'yearmonth', 'type': 'array'}, # type downgrade
170 | ],
171 | }
172 | assert storage.describe('location') == {
173 | 'fields': [
174 | {'name': 'location', 'type': 'object'}, # type downgrade
175 | {'name': 'geopoint', 'type': 'array'}, # type downgrade
176 | ],
177 | }
178 | assert storage.describe('compound') == COMPOUND['schema']
179 |
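# Assert data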
180 | assert storage.read('articles') == cast(ARTICLES)['data']
181 | assert storage.read('comments') == cast(COMMENTS)['data']
182 | assert storage.read('temporal') == cast(TEMPORAL, wrap={'yearmonth': list})['data']
183 | assert storage.read('location') == cast(LOCATION, wrap_each={'geopoint': Decimal})['data']
184 | assert storage.read('compound') == cast(COMPOUND)['data']
185 |
186 | # Assert data with forced schema
187 | storage.describe('compound', COMPOUND['schema'])
188 | assert storage.read('compound') == cast(COMPOUND)['data']
189 |
190 | # Delete non existent bucket
191 | with pytest.raises(tableschema.exceptions.StorageError):
192 | storage.delete('non_existent')
193 |
194 | # Delete buckets
195 | storage.delete()
196 |
197 |
198 | def test_storage_table_without_primary_key():
199 | schema = {
200 | 'fields': [
201 | {'name': 'a', 'type': 'integer'},
202 | {'name': 'b', 'type': 'string'},
203 | ]
204 | }
205 | data = [[1, 'x'], [2, 'y']]
206 |
207 | storage = Storage()
208 | storage.create('data', schema)
209 | storage.write('data', data)
210 | assert list(storage.read('data')) == data
211 |
212 |
213 | def test_storage_init_tables():
214 | data = [
215 | (1, 'a'),
216 | (2, 'b'),
217 | ]
218 | df = pd.DataFrame(data, columns=('key', 'value'))
219 | storage = Storage(dataframes={'data': df})
220 | assert list(storage.read('data')) == [[1, 'a'], [2, 'b']]
221 | assert storage.describe('data') == {
222 | 'fields': [
223 | {'name': 'key', 'type': 'integer'},
224 | {'name': 'value', 'type': 'string'},
225 | ]
226 | }
227 |
228 |
229 | def test_storage_restore_schema_with_primary_key():
230 | data = [
231 | ('a',),
232 | ('b',),
233 | ]
234 | index = pd.Index([1, 2], name='key')
235 | df = pd.DataFrame(data, columns=('value',), index=index)
236 | storage = Storage(dataframes={'data': df})
237 | assert list(storage.read('data')) == [[1, 'a'], [2, 'b']]
238 | assert storage.describe('data') == {
239 | 'primaryKey': 'key',
240 | 'fields': [
241 | {'name': 'key', 'type': 'integer', 'constraints': {'required': True}},
242 | {'name': 'value', 'type': 'string'},
243 | ]
244 | }
245 |
246 |
247 | def test_storage_read_missing_table():
248 | storage = Storage()
249 | with pytest.raises(tableschema.exceptions.StorageError) as excinfo:
250 | list(storage.read('data'))
251 | assert str(excinfo.value) == 'Bucket "data" doesn\'t exist.'
252 |
253 |
254 | def test_storage_multiple_writes():
255 | index = pd.Index([1, 2], name='key')
256 | df = pd.DataFrame([('a',), ('b',)], columns=('value',), index=index)
257 | storage = Storage(dataframes={'data': df})
258 | storage.write('data', [(2, 'x'), (3, 'y')])
259 | assert list(storage.read('data')) == [
260 | [1, 'a'],
261 | [2, 'b'],
262 | [2, 'x'],
263 | [3, 'y'],
264 | ]
265 |
266 |
267 | def test_storage_composite_primary_key():
268 | schema = {
269 | 'fields': [
270 | {'name': 'field1', 'type': 'string'},
271 | {'name': 'field2', 'type': 'string'},
272 | {'name': 'field3', 'type': 'string'},
273 | ],
274 | 'primaryKey': ['field1', 'field2'],
275 | }
276 | data = [['value1', 'value2', 'value3']]
277 | storage = Storage()
278 | storage.create('bucket', schema)
279 | storage.write('bucket', data)
280 | assert storage['bucket'].to_dict() == {'field3': {('value1', 'value2'): 'value3'}}
281 |
282 |
283 | # Helpers
284 |
285 | def cast(resource, skip=[], wrap={}, wrap_each={}):
286 | resource = deepcopy(resource)
287 | schema = tableschema.Schema(resource['schema'])
288 | for row in resource['data']:
289 | for index, field in enumerate(schema.fields):
290 | value = row[index]
291 | if field.type not in skip:
292 | value = field.cast_value(value)
293 | if field.type in wrap:
294 | value = wrap[field.type](value)
295 | if field.type in wrap_each:
296 | value = list(map(wrap_each[field.type], value))
297 | row[index] = value
298 | return resource
299 |
--------------------------------------------------------------------------------