├── .credentials.json.enc
├── .github
│   ├── issue_template.md
│   ├── pull_request_template.md
│   ├── stale.yml
│   └── workflows
│       └── release.yml
├── .gitignore
├── .travis.yml
├── LEAD.md
├── LICENSE.md
├── MANIFEST.in
├── Makefile
├── README.md
├── data
│   ├── articles.csv
│   └── articles.json
├── examples
│   ├── __init__.py
│   └── storage.py
├── pylama.ini
├── pytest.ini
├── setup.cfg
├── setup.py
├── tableschema_bigquery
│   ├── VERSION
│   ├── __init__.py
│   ├── mapper.py
│   └── storage.py
├── tests
│   ├── __init__.py
│   ├── test_mapper.py
│   └── test_storage.py
└── tox.ini

--------------------------------------------------------------------------------
/.credentials.json.enc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/frictionlessdata/tableschema-bigquery-py/7ed9d002620619a819f73d97e03257dcc715c7a4/.credentials.json.enc

--------------------------------------------------------------------------------
/.github/issue_template.md:
--------------------------------------------------------------------------------
# Overview

Please replace this line with full information about your idea or problem. If it's a bug, share as much information as possible to reproduce it.

---

Please preserve this line to notify @roll (lead of this repository)

--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
# Overview

Please replace this line with full information about your pull request. Make sure that tests pass before publishing it.

---

Please preserve this line to notify @roll (lead of this repository)

--------------------------------------------------------------------------------
/.github/stale.yml:
--------------------------------------------------------------------------------
# Number of days of inactivity before an issue becomes stale
daysUntilStale: 90

# Number of days of inactivity before a stale issue is closed
daysUntilClose: 30

# Issues with these labels will never be considered stale
exemptLabels:
  - feature
  - enhancement
  - bug

# Label to use when marking an issue as stale
staleLabel: wontfix

# Comment to post when marking an issue as stale. Set to `false` to disable
markComment: >
  This issue has been automatically marked as stale because it has not had
  recent activity. It will be closed if no further activity occurs. Thank you
  for your contributions.

# Comment to post when closing a stale issue. Set to `false` to disable
closeComment: false
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
name: release

on:
  push:
    tags:
      - 'v*.*.*'

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v1
      - name: Release
        uses: softprops/action-gh-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Project
.credentials.json
tmp/

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
dist:
  xenial

sudo:
  false

language:
  python

python:
  - 2.7
  - 3.6
  - 3.7
  - 3.8

env:
  global:
    - TOXENV="py${PYTHON_VERSION//./}"

before_install:
  - openssl aes-256-cbc -K $encrypted_3343fdf878cf_key -iv $encrypted_3343fdf878cf_iv -in .credentials.json.enc -out .credentials.json -d

install:
  - make install
  - pip install coveralls

script:
  - make test

after_success:
  - coveralls

jobs:
  include:
    - stage: release
      if: tag IS present
      python: 3.8
      deploy:
        provider: pypi
        user: roll
        distributions: sdist bdist_wheel
        skip_cleanup: true
        on:
          tags: true
        password:
          secure: hmjc1R3LVbPZ0HK73QlW3HfGAAxuAXTRSZtn/hR1aIda5CACyfa67s4P7dLE8sV4tKO52L/hqxgpm+Tp1ssp3GnlFedcoROvqkm22JCHjDlSyu9VpYt/lTwCY1OKBennGjY9TfvfAJWup+e+kIDmLOBFTtCOvvEhHD7agtob14SbV65ELmzpPsP/GlG7n6hNN6B97HsjXhjdlDC6wR2yfAodkjIHMR2l5g5BnySVZ0QNIzIlpMILUXHb0mm2PfZU2mr2fsTDkpj/k95yCANoC0gOFzYnewohriEX6NA4xiD4R/sxsCVLcfOOrZPoUGtFt42K+wyMiJG3/N6quR4rPoLkaKVt5yHcrhxGboYDXs6hPEnf3CHJ4ENxi6xDZI4RvYCrEbTb9OqGqv8ci2C44H27c/qBTjulm4sb9mUIJnurR+D+U7TO6GNj52xrIS8wvjl2EQ0srrZll2BKhXzsLtn7qa24pyyrHW+AU3NZmEq+1nJG+X1F8e1VEBZKb3P+ft3lJTMl7bnOlc95sTWFn/CMiZb3itGT0pEbFWGw3Nrl1JAUfJSCr6kEkjqMvRyiWqCMsjKltAfQ7KqV6mW6zptOKn1xgyPysmQJqL+qFWRR3hagzPqIy/EoMbaQDJ9ISx8hbDroCxDDEHtxRy2og0oVgbZTk+7j4aKEi3XrDUk=
--------------------------------------------------------------------------------
/LEAD.md:
--------------------------------------------------------------------------------
roll

--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 Open Knowledge

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
global-include VERSION
include LICENSE.md
include Makefile
include pylama.ini
include pytest.ini
include README.md
include tox.ini

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
.PHONY: all install list readme release templates test version


PACKAGE := $(shell grep '^PACKAGE =' setup.py | cut -d "'" -f2)
VERSION := $(shell head -n 1 $(PACKAGE)/VERSION)
LEAD := $(shell head -n 1 LEAD.md)


all: list

install:
	pip install --upgrade -e .[develop]

list:
	@grep '^\.PHONY' Makefile | cut -d' ' -f2- | tr ' ' '\n'

lint:

readme:
	pip install md-toc
	pip install referencer
	referencer $(PACKAGE) README.md --in-place
	md_toc -p README.md github --header-levels 3
	sed -i '/(#tableschema-bigquery-py)/,+2d' README.md

release:
	git checkout master && git pull origin && git fetch -p && git diff
	@echo "\nContinuing in 10 seconds. Press CTRL+C to abort\n" && sleep 10
	@git log --pretty=format:"%C(yellow)%h%Creset %s%Cgreen%d" --reverse -20
	@echo "\nReleasing v$(VERSION) in 10 seconds. Press CTRL+C to abort\n" && sleep 10
	git commit -a -m 'v$(VERSION)' && git tag -a v$(VERSION) -m 'v$(VERSION)'
	git push --follow-tags

templates:
	sed -i -E "s/@(\w*)/@$(LEAD)/" .github/issue_template.md
	sed -i -E "s/@(\w*)/@$(LEAD)/" .github/pull_request_template.md

test:
	pylama $(PACKAGE)
	tox

version:
	@echo $(VERSION)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# tableschema-bigquery-py

[![Travis](https://img.shields.io/travis/frictionlessdata/tableschema-bigquery-py/master.svg)](https://travis-ci.org/frictionlessdata/tableschema-bigquery-py)
[![Coveralls](http://img.shields.io/coveralls/frictionlessdata/tableschema-bigquery-py.svg?branch=master)](https://coveralls.io/r/frictionlessdata/tableschema-bigquery-py?branch=master)
[![PyPi](https://img.shields.io/pypi/v/tableschema-bigquery.svg)](https://pypi.python.org/pypi/tableschema-bigquery)
[![Github](https://img.shields.io/badge/github-master-brightgreen)](https://github.com/frictionlessdata/tableschema-bigquery-py)
[![Gitter](https://img.shields.io/gitter/room/frictionlessdata/chat.svg)](https://gitter.im/frictionlessdata/chat)

Generate and load BigQuery tables based on [Table Schema](http://specs.frictionlessdata.io/table-schema/) descriptors.

## Features

- implements the `tableschema.Storage` interface

## Contents

- [Getting Started](#getting-started)
  - [Installation](#installation)
  - [Prepare BigQuery](#prepare-bigquery)
- [Documentation](#documentation)
- [API Reference](#api-reference)
  - [`Storage`](#storage)
- [Contributing](#contributing)
- [Changelog](#changelog)

## Getting Started

### Installation

The package uses semantic versioning. It means that major versions could include breaking changes. It's highly recommended to specify a `package` version range in your `setup/requirements` file, e.g. `package>=1.0,<2.0`.

```bash
pip install tableschema-bigquery
```
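Following the version-range advice above, a pinned install could look like the line below (the exact bounds are illustrative, not a maintainer recommendation):

```bash
pip install 'tableschema-bigquery>=1.0,<2.0'
```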
### Prepare BigQuery

To start using the Google BigQuery service:
- Create a new project - [link](https://console.developers.google.com/home/dashboard)
- Create a service key - [link](https://console.developers.google.com/apis/credentials)
- Download the JSON credentials and set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable

## Documentation

```python
import io
import os
import json
from datapackage import Package
from apiclient.discovery import build
from oauth2client.client import GoogleCredentials

# Prepare BigQuery credentials
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '.credentials.json'
credentials = GoogleCredentials.get_application_default()
service = build('bigquery', 'v2', credentials=credentials)
project = json.load(io.open('.credentials.json', encoding='utf-8'))['project_id']

# Save package to BigQuery
package = Package('datapackage.json')
package.save(storage='bigquery', service=service, project=project, dataset='dataset')

# Load package from BigQuery
package = Package(storage='bigquery', service=service, project=project, dataset='dataset')
package.resources
```

## API Reference

### `Storage`
```python
Storage(self, service, project, dataset, prefix='')
```
BigQuery storage

Package implements
[Tabular Storage](https://github.com/frictionlessdata/tableschema-py#storage)
interface (see full documentation on the link):

![Storage](https://i.imgur.com/RQgrxqp.png)

> Only additional API is documented

__Arguments__
- __service (object)__: BigQuery `Service` object
- __project (str)__: BigQuery project name
- __dataset (str)__: BigQuery dataset name
- __prefix (str)__: prefix for all buckets

## Contributing

> The project follows the [Open Knowledge International coding standards](https://github.com/okfn/coding-standards).

The recommended way to get started is to create and activate a project virtual environment.
To install the package and development dependencies into the active environment:

```bash
$ make install
```

To run tests with linting and coverage:

```bash
$ make test
```

## Changelog

Only breaking and the most important changes are described here. The full changelog and documentation for all released versions can be found in the nicely formatted [commit history](https://github.com/frictionlessdata/tableschema-bigquery-py/commits/master).

#### v1.0

- Initial driver release
--------------------------------------------------------------------------------
/data/articles.csv:
--------------------------------------------------------------------------------
id,name,current,rating,created_year,created_date,created_datetime
1,Taxes,True,9.5,2015,2015-01-01,2015-01-01T03:00:00Z
2,中国人,False,7,2015,2015-12-31,2015-12-31T15:45:33Z

--------------------------------------------------------------------------------
/data/articles.json:
--------------------------------------------------------------------------------
{
  "fields": [
    {
      "name": "id",
      "type": "integer",
      "constraints": {
        "required": true
      }
    },
    {
      "name": "name",
      "type": "string"
    },
    {
      "name": "current",
      "type": "boolean"
    },
    {
      "name": "rating",
      "type": "number"
    },
    {
      "name": "created_year",
      "type": "date",
      "format": "fmt:%Y"
    },
    {
      "name": "created_date",
      "type": "date"
    },
    {
      "name": "created_datetime",
      "type": "datetime"
    }
  ]
}

--------------------------------------------------------------------------------
/examples/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/frictionlessdata/tableschema-bigquery-py/7ed9d002620619a819f73d97e03257dcc715c7a4/examples/__init__.py

--------------------------------------------------------------------------------
/examples/storage.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals

import os
import io
import json
import uuid
from tabulator import topen
from apiclient.discovery import build
from oauth2client.client import GoogleCredentials

from tableschema_bigquery import Storage


# Get resources
articles_schema = json.load(io.open('data/articles.json', encoding='utf-8'))
articles_data = topen('data/articles.csv', with_headers=True).read()

# Prepare BigQuery
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '.credentials.json'
credentials = GoogleCredentials.get_application_default()
service = build('bigquery', 'v2', credentials=credentials)
project = json.load(io.open('.credentials.json', encoding='utf-8'))['project_id']
dataset = 'resource'
prefix = '%s_' % uuid.uuid4().hex

# Storage (the Storage class exposes a `buckets` property, not `tables`)
storage = Storage(service, project, dataset, prefix=prefix)

# Delete buckets
for bucket in reversed(storage.buckets):
    storage.delete(bucket)

# Create buckets
storage.create('articles', articles_schema)

# Write data to buckets
storage.write('articles', articles_data)

# List buckets
print(storage.buckets)

# Describe buckets
print(storage.describe('articles'))

# Read data from buckets
print(list(storage.read('articles')))

# Delete buckets
for bucket in reversed(storage.buckets):
    storage.delete(bucket)

--------------------------------------------------------------------------------
/pylama.ini:
--------------------------------------------------------------------------------
[pylama]
linters = pyflakes,mccabe,pep8

[pylama:pep8]
max_line_length = 100

[pylama:*/__init__.py]
ignore = W0611
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
[pytest]
testpaths = tests

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[bdist_wheel]
universal=1

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import

import os
import io
from setuptools import setup, find_packages


# Helpers
def read(*paths):
    """Read a text file."""
    basedir = os.path.dirname(__file__)
    fullpath = os.path.join(basedir, *paths)
    contents = io.open(fullpath, encoding='utf-8').read().strip()
    return contents


# Prepare
PACKAGE = 'tableschema_bigquery'
NAME = PACKAGE.replace('_', '-')
INSTALL_REQUIRES = [
    'six>=1.9',
    'rsa<=4.0',  # for py2
    'python-slugify>=1.2',
    'google-api-python-client>=1.5',
    'unicodecsv>=0.14',
    'tableschema>=1.0',
    'tabulator>=1.0',
]
TESTS_REQUIRE = [
    'mock',
    'pylama',
    'pytest',
    'pytest-cov',
    'oauth2client',
    'tox',
]
README = read('README.md')
VERSION = read(PACKAGE, 'VERSION')
PACKAGES = find_packages(exclude=['examples', 'tests'])


# Run
setup(
    name=NAME,
    version=VERSION,
    packages=PACKAGES,
    include_package_data=True,
    install_requires=INSTALL_REQUIRES,
    tests_require=TESTS_REQUIRE,
    extras_require={'develop': TESTS_REQUIRE},
    zip_safe=False,
    long_description=README,
    long_description_content_type='text/markdown',
    description='Generate BigQuery tables, load and extract data, based on JSON Table Schema descriptors.',
    author='Open Knowledge Foundation',
    author_email='info@okfn.org',
    url='https://github.com/frictionlessdata/jsontableschema-bigquery-py',
    license='MIT',
    keywords=[
        'frictionless data',
    ],
    classifiers=[
        'Development Status :: 4 - Beta',
        'Environment :: Web Environment',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Topic :: Internet :: WWW/HTTP :: Dynamic Content',
        'Topic :: Software Development :: Libraries :: Python Modules'
    ],
)

--------------------------------------------------------------------------------
/tableschema_bigquery/VERSION:
--------------------------------------------------------------------------------
1.0.1

--------------------------------------------------------------------------------
/tableschema_bigquery/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# pylama:skip=1
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals


# Module API

from .storage import Storage


# Version

import io
import os
__version__ = io.open(
    os.path.join(os.path.dirname(__file__), 'VERSION'),
    encoding='utf-8').read().strip()
--------------------------------------------------------------------------------
/tableschema_bigquery/mapper.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals

import re
import json
import tableschema
from slugify import slugify
from dateutil.parser import parse


# Module API

class Mapper(object):

    # Public

    def __init__(self, prefix):
        """Mapper to convert/restore FD entities to/from BigQuery entities
        """
        self.__prefix = prefix

    def convert_bucket(self, bucket):
        """Convert bucket to BigQuery
        """
        return self.__prefix + bucket

    def convert_descriptor(self, descriptor):
        """Convert descriptor to BigQuery
        """

        # Fields
        fields = []
        fallbacks = []
        schema = tableschema.Schema(descriptor)
        for index, field in enumerate(schema.fields):
            converted_type = self.convert_type(field.type)
            if not converted_type:
                converted_type = 'STRING'
                fallbacks.append(index)
            mode = 'NULLABLE'
            if field.required:
                mode = 'REQUIRED'
            fields.append({
                'name': _slugify_field_name(field.name),
                'type': converted_type,
                'mode': mode,
            })

        # Descriptor
        converted_descriptor = {
            'fields': fields,
        }

        return (converted_descriptor, fallbacks)

    def convert_row(self, row, schema, fallbacks):
        """Convert row to BigQuery
        """
        for index, field in enumerate(schema.fields):
            value = row[index]
            if index in fallbacks:
                value = _uncast_value(value, field=field)
            else:
                value = field.cast_value(value)
            row[index] = value
        return row

    def convert_type(self, type):
        """Convert type to BigQuery
        """

        # Mapping
        mapping = {
            'any': 'STRING',
            'array': None,
            'boolean': 'BOOLEAN',
            'date': 'DATE',
            'datetime': 'DATETIME',
            'duration': None,
            'geojson': None,
            'geopoint': None,
            'integer': 'INTEGER',
            'number': 'FLOAT',
            'object': None,
            'string': 'STRING',
            'time': 'TIME',
            'year': 'INTEGER',
            'yearmonth': None,
        }

        # Not supported type
        if type not in mapping:
            message = 'Type %s is not supported' % type
            raise tableschema.exceptions.StorageError(message)

        return mapping[type]

    def restore_bucket(self, table_name):
        """Restore bucket from BigQuery
        """
        if table_name.startswith(self.__prefix):
            return table_name.replace(self.__prefix, '', 1)
        return None

    def restore_descriptor(self, converted_descriptor):
        """Restore descriptor from BigQuery
        """

        # Convert
        fields = []
        for field in converted_descriptor['fields']:
            field_type = self.restore_type(field['type'])
            resfield = {
                'name': field['name'],
                'type': field_type,
            }
            if field.get('mode', 'NULLABLE') != 'NULLABLE':
                resfield['constraints'] = {'required': True}
            fields.append(resfield)
        descriptor = {'fields': fields}

        return descriptor

    def restore_row(self, row, schema):
        """Restore row from BigQuery
        """
        for index, field in enumerate(schema.fields):
            if field.type == 'datetime':
                row[index] = parse(row[index])
            if field.type == 'date':
                row[index] = parse(row[index]).date()
            if field.type == 'time':
                row[index] = parse(row[index]).time()
        return schema.cast_row(row)

    def restore_type(self, type):
        """Restore type from BigQuery
        """

        # Mapping
        mapping = {
            'BOOLEAN': 'boolean',
            'DATE': 'date',
            'DATETIME': 'datetime',
            'INTEGER': 'integer',
            'FLOAT': 'number',
            'STRING': 'string',
            'TIME': 'time',
        }

        # Not supported type
        if type not in mapping:
            message = 'Type %s is not supported' % type
            raise tableschema.exceptions.StorageError(message)

        return mapping[type]


# Internal

def _slugify_field_name(name):

    # Reference:
    # https://cloud.google.com/bigquery/docs/reference/v2/tables
    MAX_LENGTH = 128
    VALID_NAME = r'^[a-zA-Z_]\w{0,%d}$' % (MAX_LENGTH-1)

    # Convert
    if not re.match(VALID_NAME, name):
        name = slugify(name, separator='_')
        if not re.match('^[a-zA-Z_]', name):
            name = '_' + name

    return name[:MAX_LENGTH]


def _uncast_value(value, field):
    # Eventually should be moved to:
    # https://github.com/frictionlessdata/tableschema-py/issues/161
    if isinstance(value, (list, dict)):
        value = json.dumps(value)
    else:
        value = str(value)
    return value
--------------------------------------------------------------------------------
/tableschema_bigquery/storage.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals

import io
import six
import time
import unicodecsv
import tableschema
from apiclient.http import MediaIoBaseUpload
from .mapper import Mapper


# Module API

class Storage(tableschema.Storage):
    """BigQuery storage

    Package implements
    [Tabular Storage](https://github.com/frictionlessdata/tableschema-py#storage)
    interface (see full documentation on the link):

    ![Storage](https://i.imgur.com/RQgrxqp.png)

    > Only additional API is documented

    # Arguments
        service (object): BigQuery `Service` object
        project (str): BigQuery project name
        dataset (str): BigQuery dataset name
        prefix (str): prefix for all buckets

    """

    # Public

    def __init__(self, service, project, dataset, prefix=''):

        # Set attributes
        self.__service = service
        self.__project = project
        self.__dataset = dataset
        self.__prefix = prefix
        self.__buckets = None
        self.__descriptors = {}
        self.__fallbacks = {}

        # Create mapper
        self.__mapper = Mapper(prefix=prefix)

    def __repr__(self):

        # Template and format
        template = 'Storage <{service}/{project}-{dataset}>'
        text = template.format(
            service=self.__service,
            project=self.__project,
            dataset=self.__dataset)

        return text

    @property
    def buckets(self):

        # No cached value
        if self.__buckets is None:

            # Get response
            response = self.__service.tables().list(
                projectId=self.__project,
                datasetId=self.__dataset).execute()

            # Extract buckets
            self.__buckets = []
            for table in response.get('tables', []):
                table_name = table['tableReference']['tableId']
                bucket = self.__mapper.restore_bucket(table_name)
                if bucket is not None:
                    self.__buckets.append(bucket)

        return self.__buckets

    def create(self, bucket, descriptor, force=False):

        # Make lists
        buckets = bucket
        if isinstance(bucket, six.string_types):
            buckets = [bucket]
        descriptors = descriptor
        if isinstance(descriptor, dict):
            descriptors = [descriptor]

        # Iterate over buckets/descriptors
        for bucket, descriptor in zip(buckets, descriptors):

            # Existent bucket
            if bucket in self.buckets:
                if not force:
                    message = 'Bucket "%s" already exists' % bucket
                    raise tableschema.exceptions.StorageError(message)
                self.delete(bucket)

            # Prepare job body
            tableschema.validate(descriptor)
            table_name = self.__mapper.convert_bucket(bucket)
            converted_descriptor, fallbacks = self.__mapper.convert_descriptor(descriptor)
            body = {
                'tableReference': {
                    'projectId': self.__project,
                    'datasetId': self.__dataset,
                    'tableId': table_name,
                },
                'schema': converted_descriptor,
            }

            # Make request
            self.__service.tables().insert(
                projectId=self.__project,
                datasetId=self.__dataset,
                body=body).execute()

            # Add to descriptors/fallbacks
            self.__descriptors[bucket] = descriptor
            self.__fallbacks[bucket] = fallbacks

        # Remove buckets cache
        self.__buckets = None

    def delete(self, bucket=None, ignore=False):

        # Make lists
        buckets = bucket
        if isinstance(bucket, six.string_types):
            buckets = [bucket]
        elif bucket is None:
            buckets = reversed(self.buckets)

        # Iterate over buckets
        for bucket in buckets:

            # Non-existent bucket
            if bucket not in self.buckets:
                if not ignore:
                    message = 'Bucket "%s" doesn\'t exist.' % bucket
                    raise tableschema.exceptions.StorageError(message)
                return

            # Remove from descriptors
            if bucket in self.__descriptors:
                del self.__descriptors[bucket]

            # Make delete request
            table_name = self.__mapper.convert_bucket(bucket)
            self.__service.tables().delete(
                projectId=self.__project,
                datasetId=self.__dataset,
                tableId=table_name).execute()

        # Remove buckets cache
        self.__buckets = None
    def describe(self, bucket, descriptor=None):

        # Set descriptor
        if descriptor is not None:
            self.__descriptors[bucket] = descriptor

        # Get descriptor
        else:
            descriptor = self.__descriptors.get(bucket)
            if descriptor is None:
                table_name = self.__mapper.convert_bucket(bucket)
                response = self.__service.tables().get(
                    projectId=self.__project,
                    datasetId=self.__dataset,
                    tableId=table_name).execute()
                converted_descriptor = response['schema']
                descriptor = self.__mapper.restore_descriptor(converted_descriptor)

        return descriptor

    def iter(self, bucket):

        # Get schema/data
        schema = tableschema.Schema(self.describe(bucket))
        table_name = self.__mapper.convert_bucket(bucket)
        response = self.__service.tabledata().list(
            projectId=self.__project,
            datasetId=self.__dataset,
            tableId=table_name).execute()

        # Collect rows
        rows = []
        for fields in response['rows']:
            row = [field['v'] for field in fields['f']]
            rows.append(row)

        # Sort rows
        # TODO: provide proper sorting solution
        rows = sorted(rows, key=lambda row: row[0] if row[0] is not None else 'null')

        # Emit rows
        for row in rows:
            row = self.__mapper.restore_row(row, schema=schema)
            yield row

    def read(self, bucket):
        rows = list(self.iter(bucket))
        return rows

    def write(self, bucket, rows):

        # Write buffer
        BUFFER_SIZE = 10000

        # Prepare schema, fallbacks
        schema = tableschema.Schema(self.describe(bucket))
        fallbacks = self.__fallbacks.get(bucket, [])

        # Write data
        rows_buffer = []
        for row in rows:
            row = self.__mapper.convert_row(row, schema=schema, fallbacks=fallbacks)
            rows_buffer.append(row)
            if len(rows_buffer) > BUFFER_SIZE:
                self.__write_rows_buffer(bucket, rows_buffer)
                rows_buffer = []
        if len(rows_buffer) > 0:
            self.__write_rows_buffer(bucket, rows_buffer)
    # Private

    def __write_rows_buffer(self, bucket, rows_buffer):

        # Process data to byte stream csv
        bytes = io.BufferedRandom(io.BytesIO())
        writer = unicodecsv.writer(bytes, encoding='utf-8')
        for row in rows_buffer:
            writer.writerow(row)
        bytes.seek(0)

        # Prepare job body
        table_name = self.__mapper.convert_bucket(bucket)
        body = {
            'configuration': {
                'load': {
                    'destinationTable': {
                        'projectId': self.__project,
                        'datasetId': self.__dataset,
                        'tableId': table_name
                    },
                    'sourceFormat': 'CSV',
                }
            }
        }

        # Prepare job media body
        mimetype = 'application/octet-stream'
        media_body = MediaIoBaseUpload(bytes, mimetype=mimetype)

        # Make request to BigQuery
        response = self.__service.jobs().insert(
            projectId=self.__project,
            body=body,
            media_body=media_body).execute()
        self.__wait_response(response)

    def __wait_response(self, response):

        # Get job instance
        job = self.__service.jobs().get(
            projectId=response['jobReference']['projectId'],
            jobId=response['jobReference']['jobId'])

        # Wait done
        while True:
            result = job.execute(num_retries=1)
            if result['status']['state'] == 'DONE':
                if result['status'].get('errors'):
                    errors = result['status']['errors']
                    message = '\n'.join(error['message'] for error in errors)
                    raise tableschema.exceptions.StorageError(message)
                break
            time.sleep(1)
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/frictionlessdata/tableschema-bigquery-py/7ed9d002620619a819f73d97e03257dcc715c7a4/tests/__init__.py

--------------------------------------------------------------------------------
/tests/test_mapper.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals

import pytest
from tableschema_bigquery.mapper import Mapper


# Tests

def test_mapper_convert_bucket():
    mapper = Mapper('prefix_')
    assert mapper.convert_bucket('bucket') == 'prefix_bucket'


def test_mapper_restore_bucket():
    mapper = Mapper('prefix_')
    assert mapper.restore_bucket('prefix_bucket') == 'bucket'
    assert mapper.restore_bucket('xxxxxx_bucket') is None
--------------------------------------------------------------------------------
/tests/test_storage.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals

import os
import io
import json
import uuid
import pytest
import datetime
import tableschema
from copy import deepcopy
from decimal import Decimal
from tabulator import Stream
from apiclient.discovery import build
from oauth2client.client import GoogleCredentials
from tableschema_bigquery import Storage


# Resources

ARTICLES = {
    'schema': {
        'fields': [
            {'name': 'id', 'type': 'integer', 'constraints': {'required': True}},
            {'name': 'parent', 'type': 'integer'},
            {'name': 'name', 'type': 'string'},
            {'name': 'current', 'type': 'boolean'},
            {'name': 'rating', 'type': 'number'},
        ],
        # 'primaryKey': 'id',
        # 'foreignKeys': [
        #     {'fields': 'parent', 'reference': {'resource': '', 'fields': 'id'}},
        # ],
    },
    'data': [
        ['1', '', 'Taxes', 'True', '9.5'],
        ['2', '1', '中国人', 'False', '7'],
    ],
}
COMMENTS = {
    'schema': {
        'fields': [
            {'name': 'entry_id', 'type': 'integer', 'constraints': {'required': True}},
            {'name': 'comment', 'type': 'string'},
            {'name': 'note', 'type': 'any'},
        ],
        # 'primaryKey': 'entry_id',
        # 'foreignKeys': [
        #     {'fields': 'entry_id', 'reference': {'resource': 'articles', 'fields': 'id'}},
        # ],
    },
    'data': [
        ['1', 'good', 'note1'],
        ['2', 'bad', 'note2'],
    ],
}
TEMPORAL = {
    'schema': {
        'fields': [
            {'name': 'date', 'type': 'date'},
            {'name': 'date_year', 'type': 'date', 'format': '%Y'},
            {'name': 'datetime', 'type': 'datetime'},
            {'name': 'duration', 'type': 'duration'},
            {'name': 'time', 'type': 'time'},
            {'name': 'year', 'type': 'year'},
            {'name': 'yearmonth', 'type': 'yearmonth'},
        ],
    },
    'data': [
        ['2015-01-01', '2015', '2015-01-01T03:00:00Z', 'P1Y1M', '03:00:00', '2015', '2015-01'],
        ['2015-12-31', '2015', '2015-12-31T15:45:33Z', 'P2Y2M', '15:45:33', '2015', '2015-01'],
    ],
}
LOCATION = {
    'schema': {
        'fields': [
            {'name': 'location', 'type': 'geojson'},
            {'name': 'geopoint', 'type': 'geopoint'},
        ],
    },
    'data': [
        ['{"type": "Point","coordinates":[33.33,33.33]}', '30,75'],
        ['{"type": "Point","coordinates":[50.00,50.00]}', '90,45'],
    ],
}
COMPOUND = {
    'schema': {
        'fields': [
            {'name': 'stats', 'type': 'object'},
            {'name': 'persons', 'type': 'array'},
        ],
    },
    'data': [
        ['{"chars":560}', '["Mike", "John"]'],
        ['{"chars":970}', '["Paul", "Alex"]'],
    ],
}


# Credentials

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '.credentials.json'
CREDENTIALS = GoogleCredentials.get_application_default()
SERVICE = build('bigquery', 'v2', credentials=CREDENTIALS)
PROJECT = json.load(io.open('.credentials.json', encoding='utf-8'))['project_id']
DATASET = 'resource'
PREFIX = '%s_' % uuid.uuid4().hex


# Tests

def test_storage():

    # Create storage
    storage = Storage(SERVICE, project=PROJECT, dataset=DATASET, prefix=PREFIX)

    # Delete buckets
    storage.delete()

    # Create buckets
    storage.create(['articles', 'comments'], [ARTICLES['schema'], COMMENTS['schema']])
    # TODO: investigate why it fails
    # storage.create('comments', COMMENTS['schema'], force=True)
    storage.create('temporal', TEMPORAL['schema'])
    storage.create('location', LOCATION['schema'])
    storage.create('compound', COMPOUND['schema'])

    # Write data
    storage.write('articles', ARTICLES['data'])
    storage.write('comments', COMMENTS['data'])
    storage.write('temporal', TEMPORAL['data'])
    storage.write('location', LOCATION['data'])
    storage.write('compound', COMPOUND['data'])

    # Create new storage to use reflection only
    storage = Storage(SERVICE, project=PROJECT, dataset=DATASET, prefix=PREFIX)

    # Create existent bucket
    # TODO: investigate why it fails
    # with pytest.raises(tableschema.exceptions.StorageError):
    #     storage.create('articles', ARTICLES['schema'])

    # Assert buckets
    assert storage.buckets == ['articles', 'comments', 'compound', 'location', 'temporal']

    # Assert schemas
    assert storage.describe('articles') == ARTICLES['schema']
    assert storage.describe('comments') == {
        'fields': [
            {'name': 'entry_id', 'type': 'integer', 'constraints': {'required': True}},
            {'name': 'comment', 'type': 'string'},
            {'name': 'note', 'type': 'string'},  # type downgrade
        ],
    }
    assert storage.describe('temporal') == {
        'fields': [
            {'name': 'date', 'type': 'date'},
            {'name': 'date_year', 'type': 'date'},  # format removal
            {'name': 'datetime', 'type': 'datetime'},
            {'name': 'duration', 'type': 'string'},  # type fallback
            {'name': 'time', 'type': 'time'},
            {'name': 'year', 'type': 'integer'},  # type downgrade
            {'name': 'yearmonth', 'type': 'string'},  # type fallback
        ],
    }
    assert storage.describe('location') == {
        'fields': [
            {'name': 'location', 'type': 'string'},  # type fallback
            {'name': 'geopoint', 'type': 'string'},  # type fallback
        ],
    }
    assert storage.describe('compound') == {
        'fields': [
            {'name': 'stats', 'type': 'string'},  # type fallback
            {'name': 'persons', 'type': 'string'},  # type fallback
        ],
    }

    # Assert data
    assert storage.read('articles') == cast(ARTICLES)['data']
    assert storage.read('comments') == cast(COMMENTS)['data']
    assert storage.read('temporal') == cast(TEMPORAL, skip=['duration', 'yearmonth'])['data']
    assert storage.read('location') == cast(LOCATION, skip=['geojson', 'geopoint'])['data']
    assert storage.read('compound') == cast(COMPOUND, skip=['array', 'object'])['data']

    # Assert data with forced schema
    storage.describe('compound', COMPOUND['schema'])
    assert storage.read('compound') == cast(COMPOUND)['data']

    # Delete non existent bucket
    with pytest.raises(tableschema.exceptions.StorageError):
        storage.delete('non_existent')

    # Delete buckets
    storage.delete()


def test_storage_bigdata():
    RESOURCE = {
        'schema': {
            'fields': [
                {'name': 'id', 'type': 'integer'}
            ]
        },
        'data': [[value,] for value in range(0, 15000)]
    }

    # Write data
    storage = Storage(SERVICE, project=PROJECT, dataset=DATASET, prefix=PREFIX)
    storage.create('bucket', RESOURCE['schema'], force=True)
    storage.write('bucket', RESOURCE['data'])

    # Pull rows
    # TODO: remove sorting after proper sorting solution implementation
    assert sorted(storage.read('bucket'), key=lambda row: row[0]) == RESOURCE['data']


# Helpers

def cast(resource, skip=[]):
    resource = deepcopy(resource)
    schema = tableschema.Schema(resource['schema'])
    for row in resource['data']:
        for index, field in enumerate(schema.fields):
            if field.type not in skip:
                row[index] = field.cast_value(row[index])
    return resource
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
[tox]
package=tableschema_bigquery
skip_missing_interpreters=true
envlist=
    py27
    py36
    py37
    py38

[testenv]
deps=
    mock
    pytest
    pytest-cov
    coverage
    oauth2client
passenv=
    CI
    TRAVIS
    TRAVIS_JOB_ID
    TRAVIS_BRANCH
commands=
    py.test \
        --cov {[tox]package} \
        --cov-config tox.ini \
        --cov-report term-missing \
        {posargs}

--------------------------------------------------------------------------------
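As a closing note, here is a minimal sketch of the `Mapper` conversion implemented in `tableschema_bigquery/mapper.py` above. The descriptor is an arbitrary example, and the commented output is inferred from the type mapping in `convert_type` rather than captured from a live run:

```python
# A self-contained sketch of the Mapper conversion (see mapper.py above)
from tableschema_bigquery.mapper import Mapper

mapper = Mapper(prefix='test_')

# Buckets map to prefixed BigQuery table names (and back)
assert mapper.convert_bucket('articles') == 'test_articles'
assert mapper.restore_bucket('test_articles') == 'articles'

# Descriptors map to BigQuery schemas; unsupported types such as
# geojson fall back to STRING and their indexes are reported back
descriptor = {
    'fields': [
        {'name': 'id', 'type': 'integer', 'constraints': {'required': True}},
        {'name': 'location', 'type': 'geojson'},
    ],
}
converted, fallbacks = mapper.convert_descriptor(descriptor)
print(converted)
# {'fields': [{'name': 'id', 'type': 'INTEGER', 'mode': 'REQUIRED'},
#             {'name': 'location', 'type': 'STRING', 'mode': 'NULLABLE'}]}
print(fallbacks)
# [1]
```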