├── LEAD.md
├── tests
│   ├── __init__.py
│   ├── test_mapper.py
│   └── test_storage.py
├── examples
│   └── __init__.py
├── tableschema_pandas
│   ├── VERSION
│   ├── __init__.py
│   ├── storage.py
│   └── mapper.py
├── setup.cfg
├── pytest.ini
├── data
│   ├── comments.csv
│   ├── sample.csv
│   ├── vix.csv
│   ├── articles.csv
│   ├── comments.json
│   └── articles.json
├── MANIFEST.in
├── pylama.ini
├── .github
│   ├── pull_request_template.md
│   ├── issue_template.md
│   ├── workflows
│   │   └── release.yml
│   └── stale.yml
├── tox.ini
├── .gitignore
├── Makefile
├── .travis.yml
├── setup.py
├── README.md
└── LICENSE.md
/LEAD.md:
--------------------------------------------------------------------------------
1 | roll
2 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/examples/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tableschema_pandas/VERSION:
--------------------------------------------------------------------------------
1 | 1.1.0
2 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal=1
3 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | testpaths = tests
3 |
--------------------------------------------------------------------------------
/data/comments.csv:
--------------------------------------------------------------------------------
1 | entry_id,comment
2 | 1,good
3 |
--------------------------------------------------------------------------------
/data/sample.csv:
--------------------------------------------------------------------------------
1 | Id;Col1;Col2;Col3
2 | 101;1.1;1.2;1.3
3 | 102;2.1;2.2;2.3
4 |
--------------------------------------------------------------------------------
/data/vix.csv:
--------------------------------------------------------------------------------
1 | Date;VIXClose;VIXHigh;VIXLow;VIXOpen
2 | 2004-01-05T00:00:00Z;17.49;18.49;17.44;18.45
3 | 2004-01-06T00:00:00Z;16.73;17.67;16.19;17.66
4 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | global-include VERSION
2 | include LICENSE.md
3 | include Makefile
4 | include pylama.ini
5 | include pytest.ini
6 | include README.md
7 | include tox.ini
8 |
--------------------------------------------------------------------------------
/pylama.ini:
--------------------------------------------------------------------------------
1 | [pylama]
2 | linters = pyflakes,mccabe,pep8
3 |
4 | [pylama:pep8]
5 | max_line_length = 100
6 |
7 | [pylama:mccabe]
8 | complexity = 32
9 |
10 | [pylama:*/__init__.py]
11 | ignore = W0611
12 |
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | Please replace this line with full information about your pull request. Make sure that tests pass before publishing it.
4 |
5 | ---
6 |
7 | Please preserve this line to notify @roll (lead of this repository)
8 |
--------------------------------------------------------------------------------
/.github/issue_template.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | Please replace this line with full information about your idea or problem. If it's a bug, share as much information as possible to reproduce it.
4 |
5 | ---
6 |
7 | Please preserve this line to notify @roll (lead of this repository)
8 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: release
2 |
3 | on:
4 | push:
5 | tags:
6 | - 'v*.*.*'
7 |
8 | jobs:
9 | build:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - name: Checkout
13 | uses: actions/checkout@v1
14 | - name: Release
15 | uses: softprops/action-gh-release@v1
16 | env:
17 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
18 |
--------------------------------------------------------------------------------
/data/articles.csv:
--------------------------------------------------------------------------------
1 | id,parent,name,current,rating,created_year,created_date,created_time,created_datetime,stats,persons,location
2 | 1,,Taxes,True,9.5,2015,2015-01-01,03:00:00,2015-01-01T03:00:00Z,{"chars":560},["mike"],"{""type"": ""Point"",""coordinates"":[50.00,50.00]}"
3 | 2,1,中国人,False,7,2015,2015-12-31,15:45:33,2015-12-31T15:45:33Z,{"chars":970},["chen"],"{""type"": ""Point"",""coordinates"":[33.33,33.33]}"
4 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | package=tableschema_pandas
3 | skip_missing_interpreters=true
4 | envlist=
5 | py27
6 | py36
7 | py37
8 | py38
9 |
10 | [testenv]
11 | deps=
12 | mock
13 | pytest
14 | pytest-cov
15 | coverage
16 | passenv=
17 | CI
18 | TRAVIS
19 | TRAVIS_JOB_ID
20 | TRAVIS_BRANCH
21 | commands=
22 | py.test \
23 | --cov {[tox]package} \
24 | --cov-config tox.ini \
25 | --cov-report term-missing \
26 | {posargs}
27 |
--------------------------------------------------------------------------------
/tableschema_pandas/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import division
3 | from __future__ import print_function
4 | from __future__ import absolute_import
5 | from __future__ import unicode_literals
6 |
7 |
8 | # Module API
9 |
10 | from .storage import Storage
11 |
12 |
13 | # Version
14 |
15 | import io
16 | import os
17 | __version__ = io.open(
18 | os.path.join(os.path.dirname(__file__), 'VERSION'),
19 | encoding='utf-8').read().strip()
20 |
--------------------------------------------------------------------------------
/data/comments.json:
--------------------------------------------------------------------------------
1 | {
2 | "primaryKey": "entry_id",
3 | "foreignKeys": [
4 | {
5 | "fields": "entry_id",
6 | "reference": {
7 | "fields": "id",
8 | "resource": "
",
9 | "table": "articles"
10 | }
11 | }
12 | ],
13 | "fields": [
14 | {
15 | "name": "entry_id",
16 | "type": "integer",
17 | "constraints": {
18 | "required": true
19 | }
20 | },
21 | {
22 | "name": "comment",
23 | "type": "string"
24 | }
25 | ]
26 | }
27 |
--------------------------------------------------------------------------------
/.github/stale.yml:
--------------------------------------------------------------------------------
1 | # Number of days of inactivity before an issue becomes stale
2 | daysUntilStale: 90
3 |
4 | # Number of days of inactivity before a stale issue is closed
5 | daysUntilClose: 30
6 |
7 | # Issues with these labels will never be considered stale
8 | exemptLabels:
9 | - feature
10 | - enhancement
11 | - bug
12 |
13 | # Label to use when marking an issue as stale
14 | staleLabel: wontfix
15 |
16 | # Comment to post when marking an issue as stale. Set to `false` to disable
17 | markComment: >
18 | This issue has been automatically marked as stale because it has not had
19 | recent activity. It will be closed if no further activity occurs. Thank you
20 | for your contributions.
21 |
22 | # Comment to post when closing a stale issue. Set to `false` to disable
23 | closeComment: false
24 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 |
55 | # Sphinx documentation
56 | docs/_build/
57 |
58 | # PyBuilder
59 | target/
60 |
61 | # IPython Notebook
62 | .ipynb_checkpoints
63 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: all install list readme release templates test version
2 |
3 |
4 | PACKAGE := $(shell grep '^PACKAGE =' setup.py | cut -d "'" -f2)
5 | VERSION := $(shell head -n 1 $(PACKAGE)/VERSION)
6 | LEAD := $(shell head -n 1 LEAD.md)
7 |
8 |
9 | all: list
10 |
11 | install:
12 | pip install --upgrade -e .[develop]
13 |
14 | list:
15 | @grep '^\.PHONY' Makefile | cut -d' ' -f2- | tr ' ' '\n'
16 |
17 | readme:
18 | pip install md-toc
19 | pip install referencer
20 | referencer $(PACKAGE) README.md --in-place
21 | md_toc -p README.md github --header-levels 3
22 | sed -i '/(#tableschema-pandas-py)/,+2d' README.md
23 |
24 | release:
25 | git checkout master && git pull origin && git fetch -p && git diff
26 | @echo "\nContinuing in 10 seconds. Press to abort\n" && sleep 10
27 | @git log --pretty=format:"%C(yellow)%h%Creset %s%Cgreen%d" --reverse -20
28 | @echo "\nReleasing v$(VERSION) in 10 seconds. Press to abort\n" && sleep 10
29 | git commit -a -m 'v$(VERSION)' && git tag -a v$(VERSION) -m 'v$(VERSION)'
30 | git push --follow-tags
31 |
32 | templates:
33 | sed -i -E "s/@(\w*)/@$(LEAD)/" .github/issue_template.md
34 | sed -i -E "s/@(\w*)/@$(LEAD)/" .github/pull_request_template.md
35 |
36 | test:
37 | pylama $(PACKAGE)
38 | tox
39 |
40 | version:
41 | @echo $(VERSION)
42 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | dist:
2 | xenial
3 |
4 | sudo:
5 | false
6 |
7 | language:
8 | python
9 |
10 | python:
11 | - 2.7
12 | - 3.6
13 | - 3.7
14 | - 3.8
15 |
16 | env:
17 | global:
18 | - TOXENV="py${TRAVIS_PYTHON_VERSION//./}"
19 |
20 | install:
21 | - make install
22 | - pip install coveralls
23 |
24 | script:
25 | - make test
26 |
27 | after_success:
28 | - coveralls
29 |
30 | jobs:
31 | include:
32 | - stage: release
33 | if: tag IS present
34 | python: 3.8
35 | deploy:
36 | provider: pypi
37 | user: roll
38 | distributions: sdist bdist_wheel
39 | skip_cleanup: true
40 | on:
41 | tags: true
42 | password:
43 | secure: jAtHR6cR8G2+92dVZcgIDvIbBXBoZEWMuw0m/xIo58M5U/PBqBzizcfaYOkK5dSYewBN0+dt4dURymzA/KKEPRdOPXoZA4kKxA9dh8BqK7wKO9koY6Gg6WvCIDTO+36PLqHgPd2CQjdp3pSKbYoKkUAeUrlfUAWYL4C+D8N/WAYMEjlHsBxqZDuSJqaiIWiwaPAZKcavw5Tlr9WExM2baWd1zdHHU23FwCWqT4k2QvVU96fMBc8/3j8rxqdQxTcZSG0GRlcqhx0/px0JNH4x8emCriX25Hc24TNohLrZflBOkrJvlHZ0U9/+IUZZehUTeN86JslfkQQguLFAvQ/2htMf1Bv7LwIIdJRjTlR3x+ODZM3H0juA3paKztjp1GePuu4hGJf9KGI2caolryicQl1ficU/6KfLrlg3aVaXYg9um+9GqhhMbUuRtNjzhZLYj2vZfI5BSkb1FOpvP1ApEvKBWW/oQi+Sh7YEBMf4jTf0bVYqRZnvoohLG3GID9rR51Yh3rehZPMgU2CXnpCwRa1yprimq7qZGetleryTkRcF54s2+3kFnpL4Y6hXcXuD8UrWZ2+ZOakC3C2FS2o7rQc6kyumcHyM9wNtTdSORAnArkylqEEjTE87vkfn5OgHSlXXZ9MLClMWmh80JKe8VRNq4q5L9EfoOM+Ej2ue+7Y=
44 |
--------------------------------------------------------------------------------
/data/articles.json:
--------------------------------------------------------------------------------
1 | {
2 | "primaryKey": "id",
3 | "foreignKeys": [
4 | {
5 | "fields": "parent",
6 | "reference": {
7 | "fields": "id",
8 | "resource": "self"
9 | }
10 | }
11 | ],
12 | "fields": [
13 | {
14 | "name": "id",
15 | "type": "integer",
16 | "constraints": {
17 | "required": true
18 | }
19 | },
20 | {
21 | "name": "parent",
22 | "type": "integer"
23 | },
24 | {
25 | "name": "name",
26 | "type": "string"
27 | },
28 | {
29 | "name": "current",
30 | "type": "boolean"
31 | },
32 | {
33 | "name": "rating",
34 | "type": "number"
35 | },
36 | {
37 | "name": "created_year",
38 | "type": "date",
39 | "format": "fmt:%Y"
40 | },
41 | {
42 | "name": "created_date",
43 | "type": "date"
44 | },
45 | {
46 | "name": "created_time",
47 | "type": "time"
48 | },
49 | {
50 | "name": "created_datetime",
51 | "type": "datetime"
52 | },
53 | {
54 | "name": "stats",
55 | "type": "object"
56 | },
57 | {
58 | "name": "persons",
59 | "type": "array"
60 | },
61 | {
62 | "name": "location",
63 | "type": "geojson"
64 | }
65 |
66 | ]
67 | }
68 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import division
3 | from __future__ import print_function
4 | from __future__ import absolute_import
5 |
6 | import io
7 | import os.path
8 | from setuptools import setup, find_packages
9 |
10 |
11 | # Helpers
12 | def read(*segments):
13 | path = os.path.join(*segments)
14 | with io.open(path, encoding='utf-8') as f:
15 | return f.read().strip()
16 |
17 |
18 | # Prepare
19 | PACKAGE = 'tableschema_pandas'
20 | NAME = PACKAGE.replace('_', '-')
21 | INSTALL_REQUIRES = [
22 | 'six>=1.9',
23 | 'pandas>=0.18',
24 | 'tabulator>=1.0',
25 | 'tableschema>=1.1',
26 | 'isodate>=0.6',
27 | ]
28 | TESTS_REQUIRE = [
29 | 'mock',
30 | 'pylama',
31 | 'pytest',
32 | 'pytest-cov',
33 | 'tox',
34 | ]
35 | README = read('README.md')
36 | VERSION = read(PACKAGE, 'VERSION')
37 | PACKAGES = find_packages(exclude=['tests'])
38 |
39 |
40 | # Run
41 | setup(
42 | name=NAME,
43 | version=VERSION,
44 | packages=PACKAGES,
45 | include_package_data=True,
46 | install_requires=INSTALL_REQUIRES,
47 | tests_require=TESTS_REQUIRE,
48 | extras_require={'develop': TESTS_REQUIRE},
49 | zip_safe=False,
50 | long_description=README,
51 | long_description_content_type='text/markdown',
52 | description='Generate Pandas data frames, load and extract data, based on JSON Table Schema descriptors.',
53 | author='Open Knowledge Foundation',
54 | author_email='info@okfn.org',
55 | url='https://github.com/frictionlessdata/tableschema-pandas-py',
56 | license='LGPLv3+',
57 | keywords=['frictionless data', 'datapackage', 'pandas'],
58 | classifiers=[
59 | 'Development Status :: 4 - Beta',
60 | 'Intended Audience :: Developers',
61 | 'Intended Audience :: Science/Research',
62 | 'License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)',
63 | 'Operating System :: OS Independent',
64 | 'Programming Language :: Python :: 2',
65 | 'Programming Language :: Python :: 2.7',
66 | 'Programming Language :: Python :: 3',
67 | 'Programming Language :: Python :: 3.4',
68 | 'Programming Language :: Python :: 3.5',
69 | 'Topic :: Scientific/Engineering :: Information Analysis',
70 | 'Topic :: Software Development :: Libraries :: Python Modules'
71 | ],
72 | )
73 |
--------------------------------------------------------------------------------
/tests/test_mapper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import division
3 | from __future__ import print_function
4 | from __future__ import absolute_import
5 | from __future__ import unicode_literals
6 |
7 | import six
8 | import pytest
9 | import datetime
10 | import tableschema
11 | import numpy as np
12 | import pandas as pd
13 | from tableschema_pandas.mapper import Mapper
14 |
15 |
16 | # Tests
17 |
18 | def test_mapper_convert_descriptor_and_rows():
19 | mapper = Mapper()
20 | df = pd.read_csv('data/sample.csv', sep=';', index_col=['Id'])
21 | descriptor = mapper.restore_descriptor(df)
22 | rows = df.reset_index().values
23 | df_new = mapper.convert_descriptor_and_rows(descriptor, rows)
24 | assert isinstance(df_new.index, pd.Index)
25 |
26 |
27 | @pytest.mark.skip
28 | def test_mapper_convert_descriptor_and_rows_with_datetime_index():
29 | mapper = Mapper()
30 | df = pd.read_csv('data/vix.csv', sep=';', parse_dates=['Date'], index_col=['Date'])
31 | descriptor = mapper.restore_descriptor(df)
32 | rows = df.reset_index().values
33 | df_new = mapper.convert_descriptor_and_rows(descriptor, rows)
34 | assert isinstance(df_new.index, pd.DatetimeIndex)
35 |
36 |
37 | def test_mapper_convert_type():
38 | mapper = Mapper()
39 | assert mapper.convert_type('string') == np.dtype('O')
40 | assert mapper.convert_type('year') == np.dtype(int)
41 | assert mapper.convert_type('yearmonth') == np.dtype(list)
42 | assert mapper.convert_type('duration') == np.dtype('O')
43 | with pytest.raises(tableschema.exceptions.StorageError):
44 | mapper.convert_type('non-existent')
45 |
46 |
47 | def test_mapper_restore_descriptor():
48 | mapper = Mapper()
49 | df = pd.read_csv('data/sample.csv', sep=';', index_col=['Id'])
50 | descriptor = mapper.restore_descriptor(df)
51 | assert descriptor == {
52 | 'fields': [
53 | {'name': 'Id', 'type': 'integer', 'constraints': {'required': True}},
54 | {'name': 'Col1', 'type': 'number'},
55 | {'name': 'Col2', 'type': 'number'},
56 | {'name': 'Col3', 'type': 'number'},
57 | ],
58 | 'primaryKey': 'Id',
59 | }
60 |
61 |
62 | def test_mapper_restore_type():
63 | mapper = Mapper()
64 | df = pd.DataFrame([{
65 | 'string': 'foo',
66 | 'number': 3.14,
67 | 'integer': 42,
68 | 'boolean': True,
69 | 'datetime': datetime.datetime.now(),
70 | }])
71 | assert mapper.restore_type(df.dtypes['string']) == 'string'
72 | assert mapper.restore_type(df.dtypes['number']) == 'number'
73 | assert mapper.restore_type(df.dtypes['integer']) == 'integer'
74 | assert mapper.restore_type(df.dtypes['boolean']) == 'boolean'
75 | assert mapper.restore_type(df.dtypes['datetime']) == 'datetime'
76 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # tableschema-pandas-py
2 |
3 | [](https://travis-ci.org/frictionlessdata/tableschema-pandas-py)
4 | [](https://coveralls.io/r/frictionlessdata/tableschema-pandas-py?branch=master)
5 | [](https://pypi.python.org/pypi/tableschema-pandas)
6 | [](https://github.com/frictionlessdata/tableschema-pandas-py)
7 | [](https://gitter.im/frictionlessdata/chat)
8 |
9 | Generate and load Pandas data frames based on [Table Schema](http://specs.frictionlessdata.io/table-schema/) descriptors.
10 |
11 | ## Features
12 |
13 | - implements `tableschema.Storage` interface
14 |
15 | ## Contents
16 |
17 |
18 |
19 | - [Getting Started](#getting-started)
20 | - [Installation](#installation)
21 | - [Documentation](#documentation)
22 | - [API Reference](#api-reference)
23 | - [`Storage`](#storage)
24 | - [Contributing](#contributing)
25 | - [Changelog](#changelog)
26 |
27 |
28 |
29 | ## Getting Started
30 |
31 | ### Installation
32 |
33 | The package uses semantic versioning, which means that major versions could include breaking changes. It's highly recommended to specify a version range in your `setup/requirements` file, e.g. `package>=1.0,<2.0`.
34 |
35 | ```
36 | $ pip install tableschema-pandas
37 | ```
38 |
39 | ## Documentation
40 |
41 | ```python
42 | # pip install datapackage tableschema-pandas
43 | from datapackage import Package
44 |
45 | # Save to Pandas
46 |
47 | package = Package('http://data.okfn.org/data/core/country-list/datapackage.json')
48 | storage = package.save(storage='pandas')
49 |
50 | print(type(storage['data']))
51 | # <class 'pandas.core.frame.DataFrame'>
52 |
53 | print(storage['data'].head())
54 | # Name Code
55 | # 0 Afghanistan AF
56 | # 1 Åland Islands AX
57 | # 2 Albania AL
58 | # 3 Algeria DZ
59 | # 4 American Samoa AS
60 |
61 | # Load from Pandas
62 |
63 | package = Package(storage=storage)
64 | print(package.descriptor)
65 | print(package.resources[0].read())
66 | ```
67 |
68 | Storage works as a container for Pandas data frames. You can define a new data frame inside the storage using the `storage.create` method:
69 |
70 | ```python
71 | >>> from tableschema_pandas import Storage
72 |
73 | >>> storage = Storage()
74 | ```
75 |
76 | ```python
77 | >>> storage.create('data', {
78 | ... 'primaryKey': 'id',
79 | ... 'fields': [
80 | ... {'name': 'id', 'type': 'integer'},
81 | ... {'name': 'comment', 'type': 'string'},
82 | ... ]
83 | ... })
84 |
85 | >>> storage.buckets
86 | ['data']
87 |
88 | >>> storage['data'].shape
89 | (0, 0)
90 | ```
91 |
92 | Use `storage.write` to populate data frame with data:
93 |
94 | ```python
95 | >>> storage.write('data', [(1, 'a'), (2, 'b')])
96 |
97 | >>> storage['data']
98 | id comment
99 | 1 a
100 | 2 b
101 | ```
102 |
103 | You can also use [tabulator](https://github.com/frictionlessdata/tabulator-py) to populate a data frame from an external data file. As you can see, subsequent writes simply append new data to the existing data:
104 |
105 | ```python
106 | >>> import tabulator
107 |
108 | >>> with tabulator.Stream('data/comments.csv', headers=1) as stream:
109 | ... storage.write('data', stream)
110 |
111 | >>> storage['data']
112 | id comment
113 | 1 a
114 | 2 b
115 | 1 good
116 | ```
117 |
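Because `Storage` implements the `tableschema.Storage` interface, data can be read back out of a bucket with `storage.read` (or iterated with `storage.iter`); values are cast against the bucket's schema on the way out. A short sketch, continuing the session above:

```python
>>> storage.read('data')
[[1, 'a'], [2, 'b'], [1, 'good']]
```
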
118 | ## API Reference
119 |
120 | ### `Storage`
121 | ```python
122 | Storage(self, dataframes=None)
123 | ```
124 | Pandas storage
125 |
126 | The package implements the
127 | [Tabular Storage](https://github.com/frictionlessdata/tableschema-py#storage)
128 | interface (see the link for full documentation):
129 |
130 | 
131 |
132 | > Only the additional API is documented
133 |
134 | __Arguments__
135 | - __dataframes (object[])__: list of storage dataframes
136 |
137 |
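For example, a storage can be initialized directly from existing data frames, with the schema restored by reflection; a minimal sketch based on the test suite:

```python
import pandas as pd
from tableschema_pandas import Storage

# Wrap an existing data frame into a storage bucket named 'data'
df = pd.DataFrame([(1, 'a'), (2, 'b')], columns=('key', 'value'))
storage = Storage(dataframes={'data': df})

print(storage.read('data'))
# [[1, 'a'], [2, 'b']]
print(storage.describe('data'))
# {'fields': [{'name': 'key', 'type': 'integer'}, {'name': 'value', 'type': 'string'}]}
```
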
138 | ## Contributing
139 |
140 | > The project follows the [Open Knowledge International coding standards](https://github.com/okfn/coding-standards).
141 |
142 | The recommended way to get started is to create and activate a project virtual environment.
143 | To install the package and development dependencies into the active environment:
144 |
145 | ```bash
146 | $ make install
147 | ```
148 |
149 | To run tests with linting and coverage:
150 |
151 | ```bash
152 | $ make test
153 | ```
154 |
155 | ## Changelog
156 |
157 | Only breaking and the most important changes are described here. The full changelog and documentation for all released versions can be found in the nicely formatted [commit history](https://github.com/frictionlessdata/tableschema-pandas-py/commits/master).
158 |
159 | #### v1.1
160 |
161 | - Added support for composite primary keys (loading to pandas)
162 |
163 | #### v1.0
164 |
165 | - Initial driver implementation
166 |
--------------------------------------------------------------------------------
/tableschema_pandas/storage.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import division
3 | from __future__ import print_function
4 | from __future__ import absolute_import
5 | from __future__ import unicode_literals
6 |
7 | import six
8 | import collections
9 | import tableschema
10 | import pandas as pd
11 | from .mapper import Mapper
12 |
13 |
14 | # Module API
15 |
16 | class Storage(tableschema.Storage):
17 | """Pandas storage
18 |
19 | The package implements the
20 | [Tabular Storage](https://github.com/frictionlessdata/tableschema-py#storage)
21 | interface (see the link for full documentation):
22 |
23 | 
24 |
25 | > Only the additional API is documented
26 |
27 | # Arguments
28 | dataframes (object[]): list of storage dataframes
29 |
30 | """
31 |
32 | # Public
33 |
34 | def __init__(self, dataframes=None):
35 |
36 | # Set attributes
37 | self.__dataframes = dataframes or collections.OrderedDict()
38 | self.__descriptors = {}
39 |
40 | # Create mapper
41 | self.__mapper = Mapper()
42 |
43 | def __repr__(self):
44 | return 'Storage'
45 |
46 | def __getitem__(self, key):
47 | """Returns Pandas dataframe
48 |
49 | # Arguments
50 | name (str): name
51 |
52 | """
53 | return self.__dataframes[key]
54 |
55 | @property
56 | def buckets(self):
57 | return list(sorted(self.__dataframes.keys()))
58 |
59 | def create(self, bucket, descriptor, force=False):
60 |
61 | # Make lists
62 | buckets = bucket
63 | if isinstance(bucket, six.string_types):
64 | buckets = [bucket]
65 | descriptors = descriptor
66 | if isinstance(descriptor, dict):
67 | descriptors = [descriptor]
68 |
69 | # Check buckets for existence
70 | for bucket in buckets:
71 | if bucket in self.buckets:
72 | if not force:
73 | message = 'Bucket "%s" already exists' % bucket
74 | raise tableschema.exceptions.StorageError(message)
75 | self.delete(bucket)
76 |
77 | # Define dataframes
78 | for bucket, descriptor in zip(buckets, descriptors):
79 | tableschema.validate(descriptor)
80 | self.__descriptors[bucket] = descriptor
81 | self.__dataframes[bucket] = pd.DataFrame()
82 |
83 | def delete(self, bucket=None, ignore=False):
84 |
85 | # Make lists
86 | buckets = bucket
87 | if isinstance(bucket, six.string_types):
88 | buckets = [bucket]
89 | elif bucket is None:
90 | buckets = reversed(self.buckets)
91 |
92 | # Iterate over buckets
93 | for bucket in buckets:
94 |
95 | # Non existent bucket
96 | if bucket not in self.buckets:
97 | if not ignore:
98 | message = 'Bucket "%s" doesn\'t exist' % bucket
99 | raise tableschema.exceptions.StorageError(message)
100 | continue
101 |
102 | # Remove from descriptors
103 | if bucket in self.__descriptors:
104 | del self.__descriptors[bucket]
105 |
106 | # Remove from dataframes
107 | if bucket in self.__dataframes:
108 | del self.__dataframes[bucket]
109 |
110 | def describe(self, bucket, descriptor=None):
111 |
112 | # Set descriptor
113 | if descriptor is not None:
114 | self.__descriptors[bucket] = descriptor
115 |
116 | # Get descriptor
117 | else:
118 | descriptor = self.__descriptors.get(bucket)
119 | if descriptor is None:
120 | dataframe = self.__dataframes[bucket]
121 | descriptor = self.__mapper.restore_descriptor(dataframe)
122 |
123 | return descriptor
124 |
125 | def iter(self, bucket):
126 |
127 | # Check existence
128 | if bucket not in self.buckets:
129 | message = 'Bucket "%s" doesn\'t exist.' % bucket
130 | raise tableschema.exceptions.StorageError(message)
131 |
132 | # Prepare
133 | descriptor = self.describe(bucket)
134 | schema = tableschema.Schema(descriptor)
135 |
136 | # Yield rows
137 | for pk, row in self.__dataframes[bucket].iterrows():
138 | row = self.__mapper.restore_row(row, schema=schema, pk=pk)
139 | yield row
140 |
141 | def read(self, bucket):
142 | rows = list(self.iter(bucket))
143 | return rows
144 |
145 | def write(self, bucket, rows):
146 |
147 | # Prepare
148 | descriptor = self.describe(bucket)
149 | new_data_frame = self.__mapper.convert_descriptor_and_rows(descriptor, rows)
150 |
151 | # Just set new DataFrame if current is empty
152 | if self.__dataframes[bucket].size == 0:
153 | self.__dataframes[bucket] = new_data_frame
154 |
155 | # Append new data frame to the old one setting new data frame
156 | # containing data from both old and new data frames
157 | else:
158 | self.__dataframes[bucket] = pd.concat([
159 | self.__dataframes[bucket],
160 | new_data_frame,
161 | ])
162 |
--------------------------------------------------------------------------------
/tableschema_pandas/mapper.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 |
7 | import six
8 | import json
9 | import isodate
10 | import datetime
11 | import tableschema
12 | import numpy as np
13 | import pandas as pd
14 |
15 | # Starting from pandas@0.24 there is the new API
16 | # https://github.com/frictionlessdata/tableschema-pandas-py/issues/29
17 | try:
18 | import pandas.core.dtypes.api as pdc
19 | except ImportError:
20 | import pandas.core.common as pdc
21 |
22 |
23 | # Module API
24 |
25 | class Mapper(object):
26 |
27 | # Public
28 |
29 | def convert_descriptor_and_rows(self, descriptor, rows):
30 | """Convert descriptor and rows to Pandas
31 | """
32 | schema = tableschema.Schema(descriptor)
33 |
34 | # Get data/index
35 | data_rows = []
36 | index_rows = []
37 | jtstypes_map = {}
38 | for row in rows:
39 | data_values = []
40 | index_values = []
41 | for field, value in zip(schema.fields, row):
42 | try:
43 | if isinstance(value, float) and np.isnan(value):
44 | value = None
45 | if value and field.type == 'integer':
46 | value = int(value)
47 | value = field.cast_value(value)
48 | except tableschema.exceptions.CastError:
49 | value = json.loads(value)
50 | # http://pandas.pydata.org/pandas-docs/stable/gotchas.html#support-for-integer-na
51 | if value is None and field.type in ('number', 'integer'):
52 | jtstypes_map[field.name] = 'number'
53 | value = np.NaN
54 | if field.name in schema.primary_key:
55 | index_values.append(value)
56 | else:
57 | data_values.append(value)
58 | if len(schema.primary_key) == 1:
59 | index_rows.append(index_values[0])
60 | elif len(schema.primary_key) > 1:
61 | index_rows.append(tuple(index_values))
62 | data_rows.append(tuple(data_values))
63 |
64 | # Create index
65 | index = None
66 | if schema.primary_key:
67 | if len(schema.primary_key) == 1:
68 | index_class = pd.Index
69 | index_field = schema.get_field(schema.primary_key[0])
70 | index_dtype = self.convert_type(index_field.type)
71 | if index_field.type in ['datetime', 'date']:
72 | index_class = pd.DatetimeIndex
73 | index = index_class(index_rows, name=index_field.name, dtype=index_dtype)
74 | elif len(schema.primary_key) > 1:
75 | index = pd.MultiIndex.from_tuples(index_rows, names=schema.primary_key)
76 |
77 | # Create dtypes/columns
78 | dtypes = []
79 | columns = []
80 | for field in schema.fields:
81 | if field.name not in schema.primary_key:
82 | field_name = field.name
83 | if six.PY2:
84 | field_name = field.name.encode('utf-8')
85 | dtype = self.convert_type(jtstypes_map.get(field.name, field.type))
86 | dtypes.append((field_name, dtype))
87 | columns.append(field.name)
88 |
89 | # Create dataframe
90 | array = np.array(data_rows, dtype=dtypes)
91 | dataframe = pd.DataFrame(array, index=index, columns=columns)
92 |
93 | return dataframe
94 |
95 | def convert_type(self, type):
96 | """Convert type to Pandas
97 | """
98 |
99 | # Mapping
100 | mapping = {
101 | 'any': np.dtype('O'),
102 | 'array': np.dtype(list),
103 | 'boolean': np.dtype(bool),
104 | 'date': np.dtype('O'),
105 | 'datetime': np.dtype('datetime64[ns]'),
106 | 'duration': np.dtype('O'),
107 | 'geojson': np.dtype('O'),
108 | 'geopoint': np.dtype('O'),
109 | 'integer': np.dtype(int),
110 | 'number': np.dtype(float),
111 | 'object': np.dtype(dict),
112 | 'string': np.dtype('O'),
113 | 'time': np.dtype('O'),
114 | 'year': np.dtype(int),
115 | 'yearmonth': np.dtype('O'),
116 | }
117 |
118 | # Get type
119 | if type not in mapping:
120 | message = 'Type "%s" is not supported' % type
121 | raise tableschema.exceptions.StorageError(message)
122 |
123 | return mapping[type]
124 |
125 | def restore_descriptor(self, dataframe):
126 | """Restore descriptor from Pandas
127 | """
128 |
129 | # Prepare
130 | fields = []
131 | primary_key = None
132 |
133 | # Primary key
134 | if dataframe.index.name:
135 | field_type = self.restore_type(dataframe.index.dtype)
136 | field = {
137 | 'name': dataframe.index.name,
138 | 'type': field_type,
139 | 'constraints': {'required': True},
140 | }
141 | fields.append(field)
142 | primary_key = dataframe.index.name
143 |
144 | # Fields
145 | for column, dtype in dataframe.dtypes.iteritems():
146 | sample = dataframe[column].iloc[0] if len(dataframe) else None
147 | field_type = self.restore_type(dtype, sample=sample)
148 | field = {'name': column, 'type': field_type}
149 | # TODO: provide better required indication
150 | # if dataframe[column].isnull().sum() == 0:
151 | # field['constraints'] = {'required': True}
152 | fields.append(field)
153 |
154 | # Descriptor
155 | descriptor = {}
156 | descriptor['fields'] = fields
157 | if primary_key:
158 | descriptor['primaryKey'] = primary_key
159 |
160 | return descriptor
161 |
162 | def restore_row(self, row, schema, pk):
163 | """Restore row from Pandas
164 | """
165 | result = []
166 | for field in schema.fields:
167 | if schema.primary_key and schema.primary_key[0] == field.name:
168 | if field.type == 'number' and np.isnan(pk):
169 | pk = None
170 | if pk and field.type == 'integer':
171 | pk = int(pk)
172 | result.append(field.cast_value(pk))
173 | else:
174 | value = row[field.name]
175 | if field.type == 'number' and np.isnan(value):
176 | value = None
177 | if value and field.type == 'integer':
178 | value = int(value)
179 | elif field.type == 'datetime':
180 | value = value.to_pydatetime()
181 | result.append(field.cast_value(value))
182 | return result
183 |
184 | def restore_type(self, dtype, sample=None):
185 | """Restore type from Pandas
186 | """
187 |
188 | # Pandas types
189 | if pdc.is_bool_dtype(dtype):
190 | return 'boolean'
191 | elif pdc.is_datetime64_any_dtype(dtype):
192 | return 'datetime'
193 | elif pdc.is_integer_dtype(dtype):
194 | return 'integer'
195 | elif pdc.is_numeric_dtype(dtype):
196 | return 'number'
197 |
198 | # Python types
199 | if sample is not None:
200 | if isinstance(sample, (list, tuple)):
201 | return 'array'
202 | elif isinstance(sample, datetime.date):
203 | return 'date'
204 | elif isinstance(sample, isodate.Duration):
205 | return 'duration'
206 | elif isinstance(sample, dict):
207 | return 'object'
208 | elif isinstance(sample, six.string_types):
209 | return 'string'
210 | elif isinstance(sample, datetime.time):
211 | return 'time'
212 |
213 | return 'string'
214 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (c) 2016 Mantas Zimnickas and Open Knowledge
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/tests/test_storage.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 |
7 | import io
8 | import six
9 | import json
10 | import pytest
11 | import datetime
12 | import tableschema
13 | import pandas as pd
14 | from copy import deepcopy
15 | from decimal import Decimal
16 | from tabulator import Stream
17 | from collections import OrderedDict
18 | from tableschema_pandas import Storage
19 |
20 |
21 | # Resources
22 |
23 | ARTICLES = {
24 | 'schema': {
25 | 'fields': [
26 | {'name': 'id', 'type': 'integer', 'constraints': {'required': True}},
27 | {'name': 'parent', 'type': 'integer'},
28 | {'name': 'name', 'type': 'string'},
29 | {'name': 'current', 'type': 'boolean'},
30 | {'name': 'rating', 'type': 'number'},
31 | ],
32 | 'primaryKey': 'id',
33 | # 'foreignKeys': [
34 | # {'fields': 'parent', 'reference': {'resource': '', 'fields': 'id'}},
35 | # ],
36 | },
37 | 'data': [
38 | ['1', '', 'Taxes', 'True', '9.5'],
39 | ['2', '1', '中国人', 'False', '7'],
40 | ],
41 | }
42 | COMMENTS = {
43 | 'schema': {
44 | 'fields': [
45 | {'name': 'entry_id', 'type': 'integer', 'constraints': {'required': True}},
46 | {'name': 'comment', 'type': 'string'},
47 | {'name': 'note', 'type': 'any'},
48 | ],
49 | 'primaryKey': 'entry_id',
50 | # 'foreignKeys': [
51 | # {'fields': 'entry_id', 'reference': {'resource': 'articles', 'fields': 'id'}},
52 | # ],
53 | },
54 | 'data': [
55 | ['1', 'good', 'note1'],
56 | ['2', 'bad', 'note2'],
57 | ],
58 | }
59 | TEMPORAL = {
60 | 'schema': {
61 | 'fields': [
62 | {'name': 'date', 'type': 'date'},
63 | {'name': 'date_year', 'type': 'date', 'format': '%Y'},
64 | {'name': 'datetime', 'type': 'datetime'},
65 | {'name': 'duration', 'type': 'duration'},
66 | {'name': 'time', 'type': 'time'},
67 | {'name': 'year', 'type': 'year'},
68 | {'name': 'yearmonth', 'type': 'yearmonth'},
69 | ],
70 | },
71 | 'data': [
72 | ['2015-01-01', '2015', '2015-01-01T03:00:00Z', 'P1Y1M', '03:00:00', '2015', '2015-01'],
73 | ['2015-12-31', '2015', '2015-12-31T15:45:33Z', 'P2Y2M', '15:45:33', '2015', '2015-01'],
74 | ],
75 | }
76 | LOCATION = {
77 | 'schema': {
78 | 'fields': [
79 | {'name': 'location', 'type': 'geojson'},
80 | {'name': 'geopoint', 'type': 'geopoint'},
81 | ],
82 | },
83 | 'data': [
84 | ['{"type": "Point","coordinates":[33.33,33.33]}', '30,75'],
85 | ['{"type": "Point","coordinates":[50.00,50.00]}', '90,45'],
86 | ],
87 | }
88 | COMPOUND = {
89 | 'schema': {
90 | 'fields': [
91 | {'name': 'stats', 'type': 'object'},
92 | {'name': 'persons', 'type': 'array'},
93 | ],
94 | },
95 | 'data': [
96 | ['{"chars":560}', '["Mike", "John"]'],
97 | ['{"chars":970}', '["Paul", "Alex"]'],
98 | ],
99 | }
100 |
101 |
102 | # Tests
103 |
104 | def test_storage():
105 |
106 | # Create storage
107 | storage = Storage()
108 |
109 | # Delete buckets
110 | storage.delete()
111 |
112 | # Create buckets
113 | storage.create(['articles', 'comments'], [ARTICLES['schema'], COMMENTS['schema']])
114 | storage.create('comments', COMMENTS['schema'], force=True)
115 | storage.create('temporal', TEMPORAL['schema'])
116 | storage.create('location', LOCATION['schema'])
117 | storage.create('compound', COMPOUND['schema'])
118 |
119 | # Write data
120 | storage.write('articles', ARTICLES['data'])
121 | storage.write('comments', COMMENTS['data'])
122 | storage.write('temporal', TEMPORAL['data'])
123 | storage.write('location', LOCATION['data'])
124 | storage.write('compound', COMPOUND['data'])
125 |
126 | # Create new storage to use reflection only
127 | dataframes = OrderedDict()
128 | dataframes['articles'] = storage['articles']
129 | dataframes['comments'] = storage['comments']
130 | dataframes['temporal'] = storage['temporal']
131 | dataframes['location'] = storage['location']
132 | dataframes['compound'] = storage['compound']
133 | storage = Storage(dataframes=dataframes)
134 |
135 | # Create existent bucket
136 | with pytest.raises(tableschema.exceptions.StorageError):
137 | storage.create('articles', ARTICLES['schema'])
138 |
139 | # Assert buckets
140 | assert storage.buckets == ['articles', 'comments', 'compound', 'location', 'temporal']
141 |
142 | # Assert schemas
143 | assert storage.describe('articles') == {
144 | 'fields': [
145 | {'name': 'id', 'type': 'integer', 'constraints': {'required': True}},
146 | {'name': 'parent', 'type': 'number'}, # type downgrade
147 | {'name': 'name', 'type': 'string'},
148 | {'name': 'current', 'type': 'boolean'},
149 | {'name': 'rating', 'type': 'number'},
150 | ],
151 | 'primaryKey': 'id',
152 | }
153 | assert storage.describe('comments') == {
154 | 'fields': [
155 | {'name': 'entry_id', 'type': 'integer', 'constraints': {'required': True}},
156 | {'name': 'comment', 'type': 'string'},
157 | {'name': 'note', 'type': 'string'}, # type downgrade
158 | ],
159 | 'primaryKey': 'entry_id',
160 | }
161 | assert storage.describe('temporal') == {
162 | 'fields': [
163 | {'name': 'date', 'type': 'date'},
164 | {'name': 'date_year', 'type': 'date'}, # format removal
165 | {'name': 'datetime', 'type': 'datetime'},
166 | {'name': 'duration', 'type': 'duration'},
167 | {'name': 'time', 'type': 'time'},
168 | {'name': 'year', 'type': 'integer'}, # type downgrade
169 | {'name': 'yearmonth', 'type': 'array'}, # type downgrade
170 | ],
171 | }
172 | assert storage.describe('location') == {
173 | 'fields': [
174 | {'name': 'location', 'type': 'object'}, # type downgrade
175 | {'name': 'geopoint', 'type': 'array'}, # type downgrade
176 | ],
177 | }
178 | assert storage.describe('compound') == COMPOUND['schema']
179 |
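# Assert data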
180 | assert storage.read('articles') == cast(ARTICLES)['data']
181 | assert storage.read('comments') == cast(COMMENTS)['data']
182 | assert storage.read('temporal') == cast(TEMPORAL, wrap={'yearmonth': list})['data']
183 | assert storage.read('location') == cast(LOCATION, wrap_each={'geopoint': Decimal})['data']
184 | assert storage.read('compound') == cast(COMPOUND)['data']
185 |
186 | # Assert data with forced schema
187 | storage.describe('compound', COMPOUND['schema'])
188 | assert storage.read('compound') == cast(COMPOUND)['data']
189 |
190 | # Delete non existent bucket
191 | with pytest.raises(tableschema.exceptions.StorageError):
192 | storage.delete('non_existent')
193 |
194 | # Delete buckets
195 | storage.delete()
196 |
197 |
198 | def test_storage_table_without_primary_key():
199 | schema = {
200 | 'fields': [
201 | {'name': 'a', 'type': 'integer'},
202 | {'name': 'b', 'type': 'string'},
203 | ]
204 | }
205 | data = [[1, 'x'], [2, 'y']]
206 |
207 | storage = Storage()
208 | storage.create('data', schema)
209 | storage.write('data', data)
210 | assert list(storage.read('data')) == data
211 |
212 |
213 | def test_storage_init_tables():
214 | data = [
215 | (1, 'a'),
216 | (2, 'b'),
217 | ]
218 | df = pd.DataFrame(data, columns=('key', 'value'))
219 | storage = Storage(dataframes={'data': df})
220 | assert list(storage.read('data')) == [[1, 'a'], [2, 'b']]
221 | assert storage.describe('data') == {
222 | 'fields': [
223 | {'name': 'key', 'type': 'integer'},
224 | {'name': 'value', 'type': 'string'},
225 | ]
226 | }
227 |
228 |
229 | def test_storage_restore_schema_with_primary_key():
230 | data = [
231 | ('a',),
232 | ('b',),
233 | ]
234 | index = pd.Index([1, 2], name='key')
235 | df = pd.DataFrame(data, columns=('value',), index=index)
236 | storage = Storage(dataframes={'data': df})
237 | assert list(storage.read('data')) == [[1, 'a'], [2, 'b']]
238 | assert storage.describe('data') == {
239 | 'primaryKey': 'key',
240 | 'fields': [
241 | {'name': 'key', 'type': 'integer', 'constraints': {'required': True}},
242 | {'name': 'value', 'type': 'string'},
243 | ]
244 | }
245 |
246 |
247 | def test_storage_read_missing_table():
248 | storage = Storage()
249 | with pytest.raises(tableschema.exceptions.StorageError) as excinfo:
250 | list(storage.read('data'))
251 | assert str(excinfo.value) == 'Bucket "data" doesn\'t exist.'
252 |
253 |
254 | def test_storage_multiple_writes():
255 | index = pd.Index([1, 2], name='key')
256 | df = pd.DataFrame([('a',), ('b',)], columns=('value',), index=index)
257 | storage = Storage(dataframes={'data': df})
258 | storage.write('data', [(2, 'x'), (3, 'y')])
259 | assert list(storage.read('data')) == [
260 | [1, 'a'],
261 | [2, 'b'],
262 | [2, 'x'],
263 | [3, 'y'],
264 | ]
265 |
266 |
267 | def test_storage_composite_primary_key():
268 | schema = {
269 | 'fields': [
270 | {'name': 'field1', 'type': 'string'},
271 | {'name': 'field2', 'type': 'string'},
272 | {'name': 'field3', 'type': 'string'},
273 | ],
274 | 'primaryKey': ['field1', 'field2'],
275 | }
276 | data = [['value1', 'value2', 'value3']]
277 | storage = Storage()
278 | storage.create('bucket', schema)
279 | storage.write('bucket', data)
280 | assert storage['bucket'].to_dict() == {'field3': {('value1', 'value2'): 'value3'}}
281 |
282 |
283 | # Helpers
284 |
285 | def cast(resource, skip=[], wrap={}, wrap_each={}):
286 | resource = deepcopy(resource)
287 | schema = tableschema.Schema(resource['schema'])
288 | for row in resource['data']:
289 | for index, field in enumerate(schema.fields):
290 | value = row[index]
291 | if field.type not in skip:
292 | value = field.cast_value(value)
293 | if field.type in wrap:
294 | value = wrap[field.type](value)
295 | if field.type in wrap_each:
296 | value = list(map(wrap_each[field.type], value))
297 | row[index] = value
298 | return resource
299 |
--------------------------------------------------------------------------------