├── luigi_bigquery ├── tests │ ├── __init__.py │ ├── test_targets_gcs.py │ ├── test_task.py │ ├── test_targets_bq.py │ ├── test_config.py │ └── test_helper.py ├── targets │ ├── __init__.py │ ├── gcs.py │ ├── bq.py │ └── result.py ├── job.py ├── __init__.py ├── client.py ├── config.py ├── gcs.py └── task.py ├── MANIFEST.in ├── requirements_dev.txt ├── requirements.txt ├── examples ├── templates │ ├── query_with_variables.sql │ ├── query_with_language.sql │ └── query_with_time_range.sql ├── config │ └── luigi.cfg ├── query.py └── tasks.py ├── .gitignore ├── client.cfg.template ├── .travis.yml ├── setup.py ├── README.md └── LICENSE /luigi_bigquery/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /luigi_bigquery/targets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | nose 2 | rednose 3 | mock 4 | coverage 5 | nose-exclude 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | BigQuery-Python>=1.4.0 2 | jinja2>=2.7.3 3 | luigi>=2.0,<3.0 4 | pyyaml==3.11 5 | pandas>=0.16.0 6 | six>=1.9.0 7 | -------------------------------------------------------------------------------- /examples/templates/query_with_variables.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | count(*) cnt 3 | FROM 4 | 
[publicdata:samples.github_nested] 5 | WHERE 6 | repository.language = '{{ language }}' 7 | -------------------------------------------------------------------------------- /examples/templates/query_with_language.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | count(*) cnt 3 | FROM 4 | [publicdata:samples.github_nested] 5 | WHERE 6 | repository.language = '{{ task.language }}' 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .coverage 3 | .idea 4 | .python-version 5 | client.cfg 6 | build 7 | dist 8 | luigi_bigquery.egg-info 9 | docs/_build 10 | docs/_build_html 11 | data 12 | env 13 | -------------------------------------------------------------------------------- /client.cfg.template: -------------------------------------------------------------------------------- 1 | # configuration for Luigi 2 | [core] 3 | error-email: you@example.com 4 | 5 | # configuration for Luigi-BigQuery 6 | [bigquery] 7 | project_id: your_project_id 8 | service_account: your_service_account 9 | private_key_file: /path/to/key.p12 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.4" 5 | install: 6 | - "pip install -r requirements.txt" 7 | - "pip install coveralls" 8 | script: 9 | nosetests --with-coverage --cover-package luigi_bigquery 10 | after_success: 11 | coveralls 12 | -------------------------------------------------------------------------------- /examples/config/luigi.cfg: -------------------------------------------------------------------------------- 1 | # configuration for Luigi 2 | [core] 3 | error-email: you@example.com 4 | 5 | # configuration for Luigi-BigQuery 6 | [bigquery] 7 | project_id: 
class MyQuery(luigi_bigquery.Query):
    """Example task: count the rows of the public GitHub sample dataset."""

    def query(self):
        # The SQL to run; Query.run_query submits it as a BigQuery job.
        return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]"

    def output(self):
        # The query result is written to this local CSV file.
        return luigi.LocalTarget('MyQuery.csv')

    def run(self):
        # Run the query first so no output file is created if it fails.
        query_result = self.run_query(self.query())
        with self.output().open('w') as handle:
            query_result.to_csv(handle)
#!/usr/bin/env python

from setuptools import setup, find_packages

setup(
    name="luigi-bigquery",
    version="0.2.0",
    # BUG FIX: this keyword was misspelled "descripition", so setuptools
    # silently ignored it and the package shipped with no description.
    description="Luigi integration for Google BigQuery",
    author="Kazuyuki Honda",
    author_email="hakobera@gmail.com",
    url="https://github.com/hakobera/luigi-bigquery",
    # Runtime dependencies are kept in one place (requirements.txt),
    # which MANIFEST.in includes in the sdist.
    install_requires=open("requirements.txt").read().splitlines(),
    packages=find_packages(),
    license="Apache License 2.0",
    platforms="Posix; MacOS X; Windows",
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Environment :: Console",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
        "Topic :: Software Development",
    ],
)
exists(self): 27 | client = self.config.get_gcs_client() 28 | return client.check_file(self.bucket_name, self.path) 29 | 30 | def uri(self): 31 | return "gs://{0}/{1}".format(self.bucket_name, self.path) 32 | -------------------------------------------------------------------------------- /luigi_bigquery/targets/bq.py: -------------------------------------------------------------------------------- 1 | from luigi_bigquery.config import get_config 2 | 3 | import luigi 4 | 5 | import logging 6 | logger = logging.getLogger('luigi-interface') 7 | 8 | class SchemaError(Exception): 9 | pass 10 | 11 | class DatasetTarget(luigi.Target): 12 | def __init__(self, dataset_id, config=None): 13 | self.dataset_id = dataset_id 14 | self.config = config or get_config() 15 | 16 | def exists(self): 17 | client = self.config.get_client() 18 | return client.check_dataset(self.dataset_id) 19 | 20 | class TableTarget(luigi.Target): 21 | def __init__(self, dataset_id, table_id, empty=False, config=None, append=False): 22 | self.dataset_id = dataset_id 23 | self.table_id = table_id 24 | self.empty = empty 25 | self.config = config or get_config() 26 | self.append = append 27 | 28 | def exists(self): 29 | client = self.config.get_client() 30 | table = client.get_table(self.dataset_id, self.table_id) 31 | 32 | if not bool(table) or self.append: 33 | return False 34 | 35 | count = table.get('numRows', 0) 36 | 37 | if self.empty: 38 | if count == 0: 39 | return True 40 | else: 41 | logger.info('Deleting table: %s.%s', self.dataset_id, self.table_id) 42 | client.delete_table(self.dataset_id, self.table_id) 43 | return False 44 | else: 45 | return True 46 | -------------------------------------------------------------------------------- /luigi_bigquery/targets/result.py: -------------------------------------------------------------------------------- 1 | import luigi 2 | from luigi_bigquery.config import get_config 3 | from luigi_bigquery.client import ResultProxy 4 | from luigi_bigquery.job 
class ResultProxy(object):
    """Iterable, DB-API-flavored view over a finished BigQuery job's rows."""

    def __init__(self, job):
        self.job = job

    @property
    def job_id(self):
        return self.job.job_id

    @property
    def size(self):
        # Row count as reported by client.check_job for this job.
        return self.job.result_size

    @property
    def description(self):
        # (index, column-schema-dict) pairs, mirroring DB-API "description".
        return enumerate(self.job.schema)

    def __iter__(self):
        return iter(self._rows())

    def _columns(self):
        # Ordered column names taken from the query schema.
        return [c['name'] for i, c in self.description]

    def _rows(self):
        # PERF: hoist the column list out of the loop; it was recomputed
        # (schema re-enumerated) once per result row.
        cols = self._columns()
        rows = []
        for row in self.job.result:
            # None is normalized to '' so CSV cells come out empty.
            rows.append([row[c] if row[c] is not None else '' for c in cols])
        return rows

    def to_csv(self, path_or_file):
        """Write the result as UTF-8 CSV to a path or an open file object.

        BUG FIX: the original used the Python 2-only name ``unicode``
        (NameError on Python 3, which CI tests) and the Python 3-only
        ``open(..., encoding=...)`` (TypeError on Python 2). Text is now
        built with ``six`` and written as unicode; ``io.open`` supplies
        the encoding parameter on both interpreters.
        """
        def _to_text(v):
            if isinstance(v, six.text_type):
                return v
            if isinstance(v, six.binary_type):
                # Byte strings are assumed to be UTF-8, as before.
                return v.decode('UTF-8')
            return six.text_type(v)

        def _write_row(f, values):
            f.write(u",".join(_to_text(v) for v in values) + u"\n")

        def _to_csv(f):
            _write_row(f, self._columns())
            for row in self._rows():
                _write_row(f, row)

        if isinstance(path_or_file, six.string_types):
            import io  # io.open: encoding kwarg works on Python 2 and 3
            with io.open(path_or_file, 'w', encoding='UTF-8') as f:
                return _to_csv(f)
        else:
            return _to_csv(path_or_file)

    def to_dataframe(self):
        """Materialize the result into a pandas DataFrame."""
        import pandas as pd
        df = pd.DataFrame(columns=self._columns())
        i = 0
        for row in self._rows():
            df.loc[i] = row
            i += 1
        return df
class Config(object):
    """Holds BigQuery/GCS credentials and builds API clients from them."""

    def __init__(self, project_id, service_account, private_key_file):
        self.project_id = project_id
        self.service_account = service_account
        self.private_key_file = private_key_file

    def get_client(self):
        # Writable (readonly=False) BigQuery client for this project.
        return bqclient(self.project_id,
                        service_account=self.service_account,
                        private_key_file=self.private_key_file,
                        readonly=False)

    def get_gcs_client(self):
        # Cloud Storage client sharing the same service-account credentials.
        return get_gcs_client(self.project_id,
                              service_account=self.service_account,
                              private_key_file=self.private_key_file)

class ConfigLoader(object):
    """Process-wide singleton that lazily loads the default configuration."""

    _instance = None

    @classmethod
    def instance(cls, *args, **kwargs):
        # Create and populate the singleton on first use only.
        if cls._instance is None:
            cls._instance = cls(*args, **kwargs)
            cls._instance.load_default()
        return cls._instance

    def __init__(self):
        self.config = None

    def get_config(self):
        return self.config

    def load_default(self):
        """Build a Config from the [bigquery] section of the Luigi config,
        falling back to BQ_* environment variables for missing keys."""
        conf = luigi.configuration.get_config()

        def _setting(key, env_name):
            # Environment variable acts as the default for each setting.
            return conf.get('bigquery', key, os.environ.get(env_name))

        self.config = Config(_setting('project_id', 'BQ_PROJECT_ID'),
                             _setting('service_account', 'BQ_SERVICE_ACCOUNT'),
                             _setting('private_key_file', 'BQ_PRIVATE_KEY_FILE'))

def get_config():
    """Return the process-wide default Config instance."""
    return ConfigLoader.instance().get_config()
class GCSClient(object):
    """Thin wrapper around the Cloud Storage v1 service for existence checks."""

    def __init__(self, gcs_service):
        self.gcs = gcs_service

    def get_bucket(self, bucket_name):
        """Return the bucket resource dict, or {} when the lookup fails."""
        try:
            return self.gcs.buckets().get(bucket=bucket_name).execute()
        except HttpError:
            # API errors (404 etc.) are treated as "bucket not found".
            return {}

    def check_bucket(self, bucket_name):
        # Truthy resource dict means the bucket exists.
        return bool(self.get_bucket(bucket_name))

    def get_file(self, bucket_name, path):
        """Return the object resource dict, or {} when the lookup fails."""
        try:
            return self.gcs.objects().get(bucket=bucket_name, object=path).execute()
        except HttpError:
            return {}

    def check_file(self, bucket_name, path):
        return bool(self.get_file(bucket_name, path))
## Prerequisites
$ pip install git+https://github.com/hakobera/luigi-bigquery#egg=luigi-bigquery
running MyQuery() 83 | INFO: MyQuery(): bigquery.job.id: job_1234 84 | INFO: MyQuery(): bigquery.job.result: job_id=job_1234 row_count=1 85 | INFO: [pid 1234] Worker Worker(salt=1234, workers=1, host=...) done MyQuery() 86 | DEBUG: 1 running tasks, waiting for next task to finish 87 | DEBUG: Asking scheduler for work... 88 | INFO: Done 89 | INFO: There are no more tasks to run at this time 90 | INFO: Worker Worker(salt=1234, workers=1, host=...) was stopped. Shutting down Keep-Alive thread 91 | ``` 92 | 93 | You can see the query result in file `data/MyQuery.csv` 94 | 95 | ``` 96 | $ cat data/MyQuery.csv 97 | cnt 98 | 2541639 99 | ``` 100 | 101 | ## For more examples 102 | 103 | See [examples/tasks.py](./examples/tasks.py) 104 | 105 | ## License 106 | 107 | Apache License Version 2.0 108 | -------------------------------------------------------------------------------- /luigi_bigquery/tests/test_helper.py: -------------------------------------------------------------------------------- 1 | from luigi_bigquery import ResultProxy 2 | 3 | import os 4 | import shutil 5 | import tempfile 6 | 7 | class MockClient(object): 8 | def __init__(self, datasets, tables, jobs): 9 | self._datasets = datasets 10 | self._tables = tables 11 | self._jobs = jobs 12 | 13 | def create_dataset(self, dataset_id, friendly_name=None, description=None, access=None): 14 | dataset_data = _dataset_resource(dataset_id, friendly_name, description, access) 15 | self._datasets.append(dataset_data) 16 | return dataset_data 17 | 18 | def get_datasets(self): 19 | return self._datasets 20 | 21 | def check_dataset(self, dataset_id): 22 | return dataset_id in [ds['datasetReference']['datasetId'] for ds in self.get_datasets()] 23 | 24 | def get_table(self, dataset_id, table_id): 25 | for table in self._tables: 26 | ref = table['tableReference'] 27 | if ref['datasetId'] == dataset_id and ref['tableId'] == table_id: 28 | return table 29 | return {} 30 | 31 | def delete_table(self, dataset_id, table_id): 32 | 
class MockGCSClient(object):
    """In-memory stand-in for GCSClient used by the test suite."""

    def __init__(self, objects):
        self._objects = objects

    def get_file(self, bucket_name, path):
        # Linear scan; return the first matching object resource, else {}.
        matches = [o for o in self._objects
                   if o['bucket'] == bucket_name and o['name'] == path]
        return matches[0] if matches else {}

    def check_file(self, bucket_name, path):
        return bool(self.get_file(bucket_name, path))
class TableTask(luigi.Task):
    """Luigi task that ensures a BigQuery table exists, creating it if needed."""

    config = get_config()
    dataset_id = luigi.Parameter()
    table_id = luigi.Parameter()
    schema = luigi.Parameter(default=[], significant=False)
    # FIX: BoolParameter is the luigi>=2.0 spelling (see requirements.txt
    # pin "luigi>=2.0,<3.0"); BooleanParameter is the deprecated 1.x alias.
    empty = luigi.BoolParameter(default=False, significant=False)

    def requires(self):
        # The containing dataset must exist before the table can be created.
        return DatasetTask(self.dataset_id)

    def output(self):
        return TableTarget(self.dataset_id, self.table_id, empty=self.empty)

    def run(self):
        client = self.config.get_client()
        # BUG FIX: was "self.datasset_id" (typo), which raised
        # AttributeError as soon as the task actually ran.
        logger.info('%s: creating table: %s.%s', self, self.dataset_id, self.table_id)
        client.create_table(self.dataset_id, self.table_id, self.schema)
target.save_result_state(result) 124 | 125 | if self.debug: 126 | import pandas as pd 127 | TERMINAL_WIDTH = 120 128 | pd.options.display.width = TERMINAL_WIDTH 129 | six.print_('-' * TERMINAL_WIDTH) 130 | six.print_('Query result:') 131 | six.print_(result.to_dataframe()) 132 | six.print_('-' * TERMINAL_WIDTH) 133 | 134 | class QueryTable(Query): 135 | create_disposition = bigquery.JOB_CREATE_IF_NEEDED 136 | write_disposition = bigquery.JOB_WRITE_EMPTY 137 | 138 | def requires(self): 139 | return DatasetTask(self.dataset()) 140 | 141 | def output(self): 142 | return TableTarget(self.dataset(), self.table(), append=self._append()) 143 | 144 | def dataset(self): 145 | return NotImplemented() 146 | 147 | def table(self): 148 | return NotImplemented() 149 | 150 | def _append(self): 151 | return self.write_disposition == bigquery.JOB_WRITE_APPEND 152 | 153 | def save_as_table(self, query): 154 | result = self.output() 155 | client = self.config.get_client() 156 | 157 | logger.info("%s: query: %s", self, query) 158 | job = client.write_to_table( 159 | query, 160 | dataset=self.dataset(), 161 | table=self.table(), 162 | create_disposition=self.create_disposition, 163 | write_disposition=self.write_disposition, 164 | allow_large_results=True) 165 | job_id = job['jobReference'].get('jobId') 166 | logger.info("%s: bigquery.job.id: %s", self, job_id) 167 | 168 | complete, result_size = client.check_job(job_id) 169 | try: 170 | if self.timeout: 171 | timeout = time.time() + self.timeout 172 | else: 173 | timeout = None 174 | 175 | while not complete: 176 | if timeout and time.time() > timeout: 177 | raise QueryTimeout('{0} timed out'.format(self)) 178 | time.sleep(5) 179 | complete, result_size = client.check_job(job_id) 180 | except: 181 | raise 182 | 183 | logger.info("%s: bigquery.job.result: job_id=%s result_size=%d", self, job_id, result_size) 184 | 185 | return ResultProxy(Job(client, job_id)) 186 | 187 | def run(self): 188 | query = self.load_query(self.source) if 
self.source else self.query() 189 | self.save_as_table(query) 190 | 191 | class QueryToGCS(QueryTable): 192 | compression = luigi.Parameter(default='NONE') # or GZIP 193 | format = luigi.Parameter(default='CSV') # or NEWLINE_DELIMITED_JSON 194 | print_header = luigi.Parameter(default=True) 195 | use_temporary_table = luigi.Parameter(default=True) 196 | 197 | def __init__(self, *args, **kwargs): 198 | super(QueryToGCS, self).__init__(*args, **kwargs) 199 | self._random_id = 'tmp_{}'.format(_id_generator()) 200 | 201 | def dataset(self): 202 | if self.use_temporary_table: 203 | return self._random_id 204 | else: 205 | return NotImplemented() 206 | 207 | def table(self): 208 | if self.use_temporary_table: 209 | return self._random_id 210 | else: 211 | return NotImplemented() 212 | 213 | def output(self): 214 | return FileTarget(self.bucket(), self.path()) 215 | 216 | def bucket(self): 217 | return NotImplemented() 218 | 219 | def path(self): 220 | return NotImplemented() 221 | 222 | def export_to_gcs(self): 223 | result = self.output() 224 | client = self.config.get_client() 225 | 226 | logger.info("%s: export %s.%s to %s", self, self.dataset(), self.table(), result.uri()) 227 | job = client.export_data_to_uris( 228 | destination_uris=[result.uri()], 229 | dataset=self.dataset(), 230 | table=self.table(), 231 | compression=self.compression, 232 | destination_format=self.format, 233 | print_header=self.print_header) 234 | job_id = job['jobReference'].get('jobId') 235 | logger.info("%s: bigquery.job.id: %s", self, job_id) 236 | 237 | try: 238 | job_resource = client.wait_for_job(job, timeout=3600) 239 | except: 240 | raise 241 | 242 | def _cleanup(self): 243 | if self.use_temporary_table: 244 | client = self.config.get_client() 245 | client.delete_dataset(self.dataset(), delete_contents=True) 246 | 247 | def run(self): 248 | query = self.load_query(self.source) if self.source else self.query() 249 | try: 250 | self.save_as_table(query) 251 | self.export_to_gcs() 252 | 
finally: 253 | self._cleanup() 254 | -------------------------------------------------------------------------------- /examples/tasks.py: -------------------------------------------------------------------------------- 1 | import luigi 2 | import luigi_bigquery 3 | 4 | ## Running Queries 5 | 6 | class MyQuery(luigi_bigquery.Query): 7 | 8 | def query(self): 9 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 10 | 11 | ## Getting Results 12 | 13 | class MyQueryRun(luigi_bigquery.Query): 14 | 15 | def query(self): 16 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 17 | 18 | def run(self): 19 | result = self.run_query(self.query()) 20 | print "Job ID :", result.job_id 21 | print "Result size:", result.size 22 | print "Result :" 23 | print "\t".join([c['name'] for i, c in result.description]) 24 | print "----" 25 | for row in result: 26 | print "\t".join([str(c) for c in row]) 27 | print '====================' 28 | 29 | class MyQuerySave(luigi_bigquery.Query): 30 | 31 | def query(self): 32 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 33 | 34 | def output(self): 35 | return luigi.LocalTarget('MyQuerySave.csv') 36 | 37 | def run(self): 38 | result = self.run_query(self.query()) 39 | with self.output().open('w') as f: 40 | result.to_csv(f) 41 | 42 | ## Building Pipelines 43 | 44 | class MyQueryStep1(luigi_bigquery.Query): 45 | 46 | def query(self): 47 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 48 | 49 | def output(self): 50 | return luigi_bigquery.ResultTarget('MyQueryStep1.job') 51 | 52 | class MyQueryStep2(luigi.Task): 53 | def requires(self): 54 | return MyQueryStep1() 55 | 56 | def output(self): 57 | return luigi.LocalTarget('MyQueryStep2.csv') 58 | 59 | def run(self): 60 | # retrieve the result and save it as a CSV file 61 | with self.output().open('w') as f: 62 | self.input().result.to_csv(f) 63 | 64 | class MyQueryStep3(luigi.Task): 65 | def requires(self): 66 | return 
MyQueryStep2() 67 | 68 | def output(self): 69 | return luigi.LocalTarget('MyQueryStep3.txt') 70 | 71 | def run(self): 72 | with self.input().open() as f: 73 | # process the result here 74 | print f.read() 75 | with self.output().open('w') as f: 76 | # crate the final output 77 | f.write('done') 78 | 79 | ## Templating Queries 80 | 81 | class MyQueryFromTemplate(luigi_bigquery.Query): 82 | source = 'templates/query_with_language.sql' 83 | 84 | # variables used in the template 85 | language = 'Python' 86 | 87 | class MuQueryWithVariables(luigi_bigquery.Query): 88 | source = 'templates/query_with_variables.sql' 89 | 90 | # define variables 91 | variables = { 92 | 'language': 'Python', 93 | } 94 | 95 | # or use property for dynamic variables 96 | # @property 97 | # def variables(self): 98 | # return { 99 | # 'language': 'Python', 100 | # } 101 | 102 | ## Passing Parameters 103 | 104 | class MyQueryWithParameters(luigi_bigquery.Query): 105 | source = 'templates/query_with_time_range.sql' 106 | 107 | # parameters 108 | year = luigi.IntParameter() 109 | 110 | def output(self): 111 | # create a unique name for this output using parameters 112 | return luigi_bigquery.ResultTarget('MyQueryWithParameters-{0}.job'.format(self.year)) 113 | 114 | class MyQueryAggregator(luigi.Task): 115 | 116 | def requires(self): 117 | # create a list of tasks with different parameters 118 | return [ 119 | MyQueryWithParameters(2009), 120 | MyQueryWithParameters(2010), 121 | MyQueryWithParameters(2011), 122 | MyQueryWithParameters(2012) 123 | ] 124 | 125 | def output(self): 126 | return luigi.LocalTarget('MyQueryAggregator.txt') 127 | 128 | def run(self): 129 | with self.output().open('w') as f: 130 | # repeat for each ResultTarget 131 | for target in self.input(): 132 | # output results into a single file 133 | for row in target.result: 134 | f.write(str(row) + "\n") 135 | 136 | ## Building Pipelines using QueryTable 137 | 138 | class MyQueryTableStep1(luigi_bigquery.QueryTable): 139 | 140 | 
def dataset(self): 141 | return 'tmp' 142 | 143 | def table(self): 144 | return 'github_nested_count' 145 | 146 | def query(self): 147 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 148 | 149 | class MyQueryTableStep2(luigi_bigquery.Query): 150 | def requires(self): 151 | return MyQueryTableStep1() 152 | 153 | def query(self): 154 | input = self.input() 155 | print(input.dataset_id) 156 | print(input.table_id) 157 | return "SELECT cnt FROM [{0}.{1}]".format(input.dataset_id, input.table_id) 158 | 159 | def output(self): 160 | return luigi.LocalTarget('MyQueryTableStep2.csv') 161 | 162 | def run(self): 163 | # retrieve the result and save it as a CSV file 164 | result = self.run_query(self.query()) 165 | with self.output().open('w') as f: 166 | result.to_csv(f) 167 | 168 | if __name__ == '__main__': 169 | luigi.run() 170 | import luigi 171 | import luigi_bigquery 172 | 173 | ## Running Queries 174 | 175 | class MyQuery(luigi_bigquery.Query): 176 | 177 | def query(self): 178 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 179 | 180 | ## Getting Results 181 | 182 | class MyQueryRun(luigi_bigquery.Query): 183 | 184 | def query(self): 185 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 186 | 187 | def run(self): 188 | result = self.run_query(self.query()) 189 | print "Job ID :", result.job_id 190 | print "Result size:", result.size 191 | print "Result :" 192 | print "\t".join([c['name'] for i, c in result.description]) 193 | print "----" 194 | for row in result: 195 | print "\t".join([str(c) for c in row]) 196 | print '====================' 197 | 198 | class MyQuerySave(luigi_bigquery.Query): 199 | 200 | def query(self): 201 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 202 | 203 | def output(self): 204 | return luigi.LocalTarget('MyQuerySave.csv') 205 | 206 | def run(self): 207 | result = self.run_query(self.query()) 208 | with self.output().open('w') as f: 209 | 
result.to_csv(f) 210 | 211 | ## Building Pipelines 212 | 213 | class MyQueryStep1(luigi_bigquery.Query): 214 | 215 | def query(self): 216 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 217 | 218 | def output(self): 219 | return luigi_bigquery.ResultTarget('MyQueryStep1.job') 220 | 221 | class MyQueryStep2(luigi.Task): 222 | def requires(self): 223 | return MyQueryStep1() 224 | 225 | def output(self): 226 | return luigi.LocalTarget('MyQueryStep2.csv') 227 | 228 | def run(self): 229 | # retrieve the result and save it as a CSV file 230 | with self.output().open('w') as f: 231 | self.input().result.to_csv(f) 232 | 233 | class MyQueryStep3(luigi.Task): 234 | def requires(self): 235 | return MyQueryStep2() 236 | 237 | def output(self): 238 | return luigi.LocalTarget('MyQueryStep3.txt') 239 | 240 | def run(self): 241 | with self.input().open() as f: 242 | # process the result here 243 | print f.read() 244 | with self.output().open('w') as f: 245 | # crate the final output 246 | f.write('done') 247 | 248 | ## Templating Queries 249 | 250 | class MyQueryFromTemplate(luigi_bigquery.Query): 251 | source = 'templates/query_with_language.sql' 252 | 253 | # variables used in the template 254 | language = 'Python' 255 | 256 | class MuQueryWithVariables(luigi_bigquery.Query): 257 | source = 'templates/query_with_variables.sql' 258 | 259 | # define variables 260 | variables = { 261 | 'language': 'Python', 262 | } 263 | 264 | # or use property for dynamic variables 265 | # @property 266 | # def variables(self): 267 | # return { 268 | # 'language': 'Python', 269 | # } 270 | 271 | ## Passing Parameters 272 | 273 | class MyQueryWithParameters(luigi_bigquery.Query): 274 | source = 'templates/query_with_time_range.sql' 275 | 276 | # parameters 277 | year = luigi.IntParameter() 278 | 279 | def output(self): 280 | # create a unique name for this output using parameters 281 | return luigi_bigquery.ResultTarget('MyQueryWithParameters-{0}.job'.format(self.year)) 282 | 
283 | class MyQueryAggregator(luigi.Task): 284 | 285 | def requires(self): 286 | # create a list of tasks with different parameters 287 | return [ 288 | MyQueryWithParameters(2009), 289 | MyQueryWithParameters(2010), 290 | MyQueryWithParameters(2011), 291 | MyQueryWithParameters(2012) 292 | ] 293 | 294 | def output(self): 295 | return luigi.LocalTarget('MyQueryAggregator.txt') 296 | 297 | def run(self): 298 | with self.output().open('w') as f: 299 | # repeat for each ResultTarget 300 | for target in self.input(): 301 | # output results into a single file 302 | for row in target.result: 303 | f.write(str(row) + "\n") 304 | 305 | ## Building Pipelines using QueryTable 306 | 307 | class MyQueryTableStep1(luigi_bigquery.QueryTable): 308 | 309 | def dataset(self): 310 | return 'tmp' 311 | 312 | def table(self): 313 | return 'github_nested_count' 314 | 315 | def query(self): 316 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 317 | 318 | class MyQueryTableStep2(luigi_bigquery.Query): 319 | def requires(self): 320 | return MyQueryTableStep1() 321 | 322 | def query(self): 323 | input = self.input() 324 | return "SELECT cnt FROM [{0}.{1}]".format(input.dataset_id, input.table_id) 325 | 326 | def output(self): 327 | return luigi.LocalTarget('MyQueryTableStep2.csv') 328 | 329 | def run(self): 330 | # retrieve the result and save it as a CSV file 331 | result = self.run_query(self.query()) 332 | with self.output().open('w') as f: 333 | result.to_csv(f) 334 | 335 | # QueryToGCS 336 | 337 | class MyQueryToGCS(luigi_bigquery.QueryToGCS): 338 | use_temporary_table = True 339 | 340 | def bucket(self): 341 | return 'my-bucket' 342 | 343 | def path(self): 344 | return '/path/to/file.csv' 345 | 346 | def query(self): 347 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 348 | 349 | if __name__ == '__main__': 350 | luigi.run() 351 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2015 Kazuyuki Honda 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. --------------------------------------------------------------------------------