├── luigi_bigquery ├── tests │ ├── __init__.py │ ├── test_targets_gcs.py │ ├── test_task.py │ ├── test_targets_bq.py │ ├── test_config.py │ └── test_helper.py ├── targets │ ├── __init__.py │ ├── gcs.py │ ├── bq.py │ └── result.py ├── job.py ├── __init__.py ├── client.py ├── config.py ├── gcs.py └── task.py ├── MANIFEST.in ├── requirements_dev.txt ├── requirements.txt ├── examples ├── templates │ ├── query_with_variables.sql │ ├── query_with_language.sql │ └── query_with_time_range.sql ├── config │ └── luigi.cfg ├── query.py └── tasks.py ├── .gitignore ├── client.cfg.template ├── .travis.yml ├── setup.py ├── README.md └── LICENSE /luigi_bigquery/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /luigi_bigquery/targets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | nose 2 | rednose 3 | mock 4 | coverage 5 | nose-exclude 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | BigQuery-Python>=1.4.0 2 | jinja2>=2.7.3 3 | luigi>=2.0,<3.0 4 | pyyaml==3.11 5 | pandas>=0.16.0 6 | six>=1.9.0 7 | -------------------------------------------------------------------------------- /examples/templates/query_with_variables.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | count(*) cnt 3 | FROM 4 | 
[publicdata:samples.github_nested] 5 | WHERE 6 | repository.language = '{{ language }}' 7 | -------------------------------------------------------------------------------- /examples/templates/query_with_language.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | count(*) cnt 3 | FROM 4 | [publicdata:samples.github_nested] 5 | WHERE 6 | repository.language = '{{ task.language }}' 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .coverage 3 | .idea 4 | .python-version 5 | client.cfg 6 | build 7 | dist 8 | luigi_bigquery.egg-info 9 | docs/_build 10 | docs/_build_html 11 | data 12 | env 13 | -------------------------------------------------------------------------------- /client.cfg.template: -------------------------------------------------------------------------------- 1 | # configuration for Luigi 2 | [core] 3 | error-email: you@example.com 4 | 5 | # configuration for Luigi-BigQuery 6 | [bigquery] 7 | project_id: your_project_id 8 | service_account: your_service_account 9 | private_key_file: /path/to/key.p12 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.4" 5 | install: 6 | - "pip install -r requirements.txt" 7 | - "pip install coveralls" 8 | script: 9 | nosetests --with-coverage --cover-package luigi_bigquery 10 | after_success: 11 | coveralls 12 | -------------------------------------------------------------------------------- /examples/config/luigi.cfg: -------------------------------------------------------------------------------- 1 | # configuration for Luigi 2 | [core] 3 | error-email: you@example.com 4 | 5 | # configuration for Luigi-BigQuery 6 | [bigquery] 7 | project_id: 
class MyQuery(luigi_bigquery.Query):
    """Example task: count the rows of the public GitHub sample dataset."""

    def query(self):
        # The SQL to run; Query.run_query submits it as a BigQuery job.
        return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]"

    def output(self):
        # The query result is written to this local CSV file.
        return luigi.LocalTarget('MyQuery.csv')

    def run(self):
        # Run the query first so no output file is created if it fails.
        query_result = self.run_query(self.query())
        with self.output().open('w') as handle:
            query_result.to_csv(handle)
#!/usr/bin/env python

from setuptools import setup, find_packages

setup(
    name="luigi-bigquery",
    version="0.2.0",
    # BUG FIX: this keyword was misspelled "descripition", so setuptools
    # silently ignored it and the package shipped with no description.
    description="Luigi integration for Google BigQuery",
    author="Kazuyuki Honda",
    author_email="hakobera@gmail.com",
    url="https://github.com/hakobera/luigi-bigquery",
    # Runtime dependencies are kept in one place (requirements.txt),
    # which MANIFEST.in includes in the sdist.
    install_requires=open("requirements.txt").read().splitlines(),
    packages=find_packages(),
    license="Apache License 2.0",
    platforms="Posix; MacOS X; Windows",
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Environment :: Console",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
        "Topic :: Software Development",
    ],
)
exists(self): 27 | client = self.config.get_gcs_client() 28 | return client.check_file(self.bucket_name, self.path) 29 | 30 | def uri(self): 31 | return "gs://{0}/{1}".format(self.bucket_name, self.path) 32 | -------------------------------------------------------------------------------- /luigi_bigquery/targets/bq.py: -------------------------------------------------------------------------------- 1 | from luigi_bigquery.config import get_config 2 | 3 | import luigi 4 | 5 | import logging 6 | logger = logging.getLogger('luigi-interface') 7 | 8 | class SchemaError(Exception): 9 | pass 10 | 11 | class DatasetTarget(luigi.Target): 12 | def __init__(self, dataset_id, config=None): 13 | self.dataset_id = dataset_id 14 | self.config = config or get_config() 15 | 16 | def exists(self): 17 | client = self.config.get_client() 18 | return client.check_dataset(self.dataset_id) 19 | 20 | class TableTarget(luigi.Target): 21 | def __init__(self, dataset_id, table_id, empty=False, config=None, append=False): 22 | self.dataset_id = dataset_id 23 | self.table_id = table_id 24 | self.empty = empty 25 | self.config = config or get_config() 26 | self.append = append 27 | 28 | def exists(self): 29 | client = self.config.get_client() 30 | table = client.get_table(self.dataset_id, self.table_id) 31 | 32 | if not bool(table) or self.append: 33 | return False 34 | 35 | count = table.get('numRows', 0) 36 | 37 | if self.empty: 38 | if count == 0: 39 | return True 40 | else: 41 | logger.info('Deleting table: %s.%s', self.dataset_id, self.table_id) 42 | client.delete_table(self.dataset_id, self.table_id) 43 | return False 44 | else: 45 | return True 46 | -------------------------------------------------------------------------------- /luigi_bigquery/targets/result.py: -------------------------------------------------------------------------------- 1 | import luigi 2 | from luigi_bigquery.config import get_config 3 | from luigi_bigquery.client import ResultProxy 4 | from luigi_bigquery.job 
class ResultProxy(object):
    """Iterable, DB-API-flavored view over a finished BigQuery job's rows."""

    def __init__(self, job):
        self.job = job

    @property
    def job_id(self):
        return self.job.job_id

    @property
    def size(self):
        # Row count as reported by client.check_job for this job.
        return self.job.result_size

    @property
    def description(self):
        # (index, column-schema-dict) pairs, mirroring DB-API "description".
        return enumerate(self.job.schema)

    def __iter__(self):
        return iter(self._rows())

    def _columns(self):
        # Ordered column names taken from the query schema.
        return [c['name'] for i, c in self.description]

    def _rows(self):
        # PERF: hoist the column list out of the loop; it was recomputed
        # (schema re-enumerated) once per result row.
        cols = self._columns()
        rows = []
        for row in self.job.result:
            # None is normalized to '' so CSV cells come out empty.
            rows.append([row[c] if row[c] is not None else '' for c in cols])
        return rows

    def to_csv(self, path_or_file):
        """Write the result as UTF-8 CSV to a path or an open file object.

        BUG FIX: the original used the Python 2-only name ``unicode``
        (NameError on Python 3, which CI tests) and the Python 3-only
        ``open(..., encoding=...)`` (TypeError on Python 2). Text is now
        built with ``six`` and written as unicode; ``io.open`` supplies
        the encoding parameter on both interpreters.
        """
        def _to_text(v):
            if isinstance(v, six.text_type):
                return v
            if isinstance(v, six.binary_type):
                # Byte strings are assumed to be UTF-8, as before.
                return v.decode('UTF-8')
            return six.text_type(v)

        def _write_row(f, values):
            f.write(u",".join(_to_text(v) for v in values) + u"\n")

        def _to_csv(f):
            _write_row(f, self._columns())
            for row in self._rows():
                _write_row(f, row)

        if isinstance(path_or_file, six.string_types):
            import io  # io.open: encoding kwarg works on Python 2 and 3
            with io.open(path_or_file, 'w', encoding='UTF-8') as f:
                return _to_csv(f)
        else:
            return _to_csv(path_or_file)

    def to_dataframe(self):
        """Materialize the result into a pandas DataFrame."""
        import pandas as pd
        df = pd.DataFrame(columns=self._columns())
        i = 0
        for row in self._rows():
            df.loc[i] = row
            i += 1
        return df
class Config(object):
    """Holds BigQuery/GCS credentials and builds API clients from them."""

    def __init__(self, project_id, service_account, private_key_file):
        self.project_id = project_id
        self.service_account = service_account
        self.private_key_file = private_key_file

    def get_client(self):
        # Writable (readonly=False) BigQuery client for this project.
        return bqclient(self.project_id,
                        service_account=self.service_account,
                        private_key_file=self.private_key_file,
                        readonly=False)

    def get_gcs_client(self):
        # Cloud Storage client sharing the same service-account credentials.
        return get_gcs_client(self.project_id,
                              service_account=self.service_account,
                              private_key_file=self.private_key_file)

class ConfigLoader(object):
    """Process-wide singleton that lazily loads the default configuration."""

    _instance = None

    @classmethod
    def instance(cls, *args, **kwargs):
        # Create and populate the singleton on first use only.
        if cls._instance is None:
            cls._instance = cls(*args, **kwargs)
            cls._instance.load_default()
        return cls._instance

    def __init__(self):
        self.config = None

    def get_config(self):
        return self.config

    def load_default(self):
        """Build a Config from the [bigquery] section of the Luigi config,
        falling back to BQ_* environment variables for missing keys."""
        conf = luigi.configuration.get_config()

        def _setting(key, env_name):
            # Environment variable acts as the default for each setting.
            return conf.get('bigquery', key, os.environ.get(env_name))

        self.config = Config(_setting('project_id', 'BQ_PROJECT_ID'),
                             _setting('service_account', 'BQ_SERVICE_ACCOUNT'),
                             _setting('private_key_file', 'BQ_PRIVATE_KEY_FILE'))

def get_config():
    """Return the process-wide default Config instance."""
    return ConfigLoader.instance().get_config()
class GCSClient(object):
    """Thin wrapper around the Cloud Storage v1 service for existence checks."""

    def __init__(self, gcs_service):
        self.gcs = gcs_service

    def get_bucket(self, bucket_name):
        """Return the bucket resource dict, or {} when the lookup fails."""
        try:
            return self.gcs.buckets().get(bucket=bucket_name).execute()
        except HttpError:
            # API errors (404 etc.) are treated as "bucket not found".
            return {}

    def check_bucket(self, bucket_name):
        # Truthy resource dict means the bucket exists.
        return bool(self.get_bucket(bucket_name))

    def get_file(self, bucket_name, path):
        """Return the object resource dict, or {} when the lookup fails."""
        try:
            return self.gcs.objects().get(bucket=bucket_name, object=path).execute()
        except HttpError:
            return {}

    def check_file(self, bucket_name, path):
        return bool(self.get_file(bucket_name, path))
## Prerequisites
$ pip install git+https://github.com/hakobera/luigi-bigquery#egg=luigi-bigquery
running MyQuery() 83 | INFO: MyQuery(): bigquery.job.id: job_1234 84 | INFO: MyQuery(): bigquery.job.result: job_id=job_1234 row_count=1 85 | INFO: [pid 1234] Worker Worker(salt=1234, workers=1, host=...) done MyQuery() 86 | DEBUG: 1 running tasks, waiting for next task to finish 87 | DEBUG: Asking scheduler for work... 88 | INFO: Done 89 | INFO: There are no more tasks to run at this time 90 | INFO: Worker Worker(salt=1234, workers=1, host=...) was stopped. Shutting down Keep-Alive thread 91 | ``` 92 | 93 | You can see the query result in file `data/MyQuery.csv` 94 | 95 | ``` 96 | $ cat data/MyQuery.csv 97 | cnt 98 | 2541639 99 | ``` 100 | 101 | ## For more examples 102 | 103 | See [examples/tasks.py](./examples/tasks.py) 104 | 105 | ## License 106 | 107 | Apache License Version 2.0 108 | -------------------------------------------------------------------------------- /luigi_bigquery/tests/test_helper.py: -------------------------------------------------------------------------------- 1 | from luigi_bigquery import ResultProxy 2 | 3 | import os 4 | import shutil 5 | import tempfile 6 | 7 | class MockClient(object): 8 | def __init__(self, datasets, tables, jobs): 9 | self._datasets = datasets 10 | self._tables = tables 11 | self._jobs = jobs 12 | 13 | def create_dataset(self, dataset_id, friendly_name=None, description=None, access=None): 14 | dataset_data = _dataset_resource(dataset_id, friendly_name, description, access) 15 | self._datasets.append(dataset_data) 16 | return dataset_data 17 | 18 | def get_datasets(self): 19 | return self._datasets 20 | 21 | def check_dataset(self, dataset_id): 22 | return dataset_id in [ds['datasetReference']['datasetId'] for ds in self.get_datasets()] 23 | 24 | def get_table(self, dataset_id, table_id): 25 | for table in self._tables: 26 | ref = table['tableReference'] 27 | if ref['datasetId'] == dataset_id and ref['tableId'] == table_id: 28 | return table 29 | return {} 30 | 31 | def delete_table(self, dataset_id, table_id): 32 | 
class MockGCSClient(object):
    """In-memory stand-in for GCSClient used by the test suite."""

    def __init__(self, objects):
        self._objects = objects

    def get_file(self, bucket_name, path):
        # Linear scan; return the first matching object resource, else {}.
        matches = [o for o in self._objects
                   if o['bucket'] == bucket_name and o['name'] == path]
        return matches[0] if matches else {}

    def check_file(self, bucket_name, path):
        return bool(self.get_file(bucket_name, path))
class TableTask(luigi.Task):
    """Luigi task that ensures a BigQuery table exists, creating it if needed."""

    config = get_config()
    dataset_id = luigi.Parameter()
    table_id = luigi.Parameter()
    schema = luigi.Parameter(default=[], significant=False)
    # FIX: BoolParameter is the luigi>=2.0 spelling (see requirements.txt
    # pin "luigi>=2.0,<3.0"); BooleanParameter is the deprecated 1.x alias.
    empty = luigi.BoolParameter(default=False, significant=False)

    def requires(self):
        # The containing dataset must exist before the table can be created.
        return DatasetTask(self.dataset_id)

    def output(self):
        return TableTarget(self.dataset_id, self.table_id, empty=self.empty)

    def run(self):
        client = self.config.get_client()
        # BUG FIX: was "self.datasset_id" (typo), which raised
        # AttributeError as soon as the task actually ran.
        logger.info('%s: creating table: %s.%s', self, self.dataset_id, self.table_id)
        client.create_table(self.dataset_id, self.table_id, self.schema)
target.save_result_state(result) 124 | 125 | if self.debug: 126 | import pandas as pd 127 | TERMINAL_WIDTH = 120 128 | pd.options.display.width = TERMINAL_WIDTH 129 | six.print_('-' * TERMINAL_WIDTH) 130 | six.print_('Query result:') 131 | six.print_(result.to_dataframe()) 132 | six.print_('-' * TERMINAL_WIDTH) 133 | 134 | class QueryTable(Query): 135 | create_disposition = bigquery.JOB_CREATE_IF_NEEDED 136 | write_disposition = bigquery.JOB_WRITE_EMPTY 137 | 138 | def requires(self): 139 | return DatasetTask(self.dataset()) 140 | 141 | def output(self): 142 | return TableTarget(self.dataset(), self.table(), append=self._append()) 143 | 144 | def dataset(self): 145 | return NotImplemented() 146 | 147 | def table(self): 148 | return NotImplemented() 149 | 150 | def _append(self): 151 | return self.write_disposition == bigquery.JOB_WRITE_APPEND 152 | 153 | def save_as_table(self, query): 154 | result = self.output() 155 | client = self.config.get_client() 156 | 157 | logger.info("%s: query: %s", self, query) 158 | job = client.write_to_table( 159 | query, 160 | dataset=self.dataset(), 161 | table=self.table(), 162 | create_disposition=self.create_disposition, 163 | write_disposition=self.write_disposition, 164 | allow_large_results=True) 165 | job_id = job['jobReference'].get('jobId') 166 | logger.info("%s: bigquery.job.id: %s", self, job_id) 167 | 168 | complete, result_size = client.check_job(job_id) 169 | try: 170 | if self.timeout: 171 | timeout = time.time() + self.timeout 172 | else: 173 | timeout = None 174 | 175 | while not complete: 176 | if timeout and time.time() > timeout: 177 | raise QueryTimeout('{0} timed out'.format(self)) 178 | time.sleep(5) 179 | complete, result_size = client.check_job(job_id) 180 | except: 181 | raise 182 | 183 | logger.info("%s: bigquery.job.result: job_id=%s result_size=%d", self, job_id, result_size) 184 | 185 | return ResultProxy(Job(client, job_id)) 186 | 187 | def run(self): 188 | query = self.load_query(self.source) if 
self.source else self.query() 189 | self.save_as_table(query) 190 | 191 | class QueryToGCS(QueryTable): 192 | compression = luigi.Parameter(default='NONE') # or GZIP 193 | format = luigi.Parameter(default='CSV') # or NEWLINE_DELIMITED_JSON 194 | print_header = luigi.Parameter(default=True) 195 | use_temporary_table = luigi.Parameter(default=True) 196 | 197 | def __init__(self, *args, **kwargs): 198 | super(QueryToGCS, self).__init__(*args, **kwargs) 199 | self._random_id = 'tmp_{}'.format(_id_generator()) 200 | 201 | def dataset(self): 202 | if self.use_temporary_table: 203 | return self._random_id 204 | else: 205 | return NotImplemented() 206 | 207 | def table(self): 208 | if self.use_temporary_table: 209 | return self._random_id 210 | else: 211 | return NotImplemented() 212 | 213 | def output(self): 214 | return FileTarget(self.bucket(), self.path()) 215 | 216 | def bucket(self): 217 | return NotImplemented() 218 | 219 | def path(self): 220 | return NotImplemented() 221 | 222 | def export_to_gcs(self): 223 | result = self.output() 224 | client = self.config.get_client() 225 | 226 | logger.info("%s: export %s.%s to %s", self, self.dataset(), self.table(), result.uri()) 227 | job = client.export_data_to_uris( 228 | destination_uris=[result.uri()], 229 | dataset=self.dataset(), 230 | table=self.table(), 231 | compression=self.compression, 232 | destination_format=self.format, 233 | print_header=self.print_header) 234 | job_id = job['jobReference'].get('jobId') 235 | logger.info("%s: bigquery.job.id: %s", self, job_id) 236 | 237 | try: 238 | job_resource = client.wait_for_job(job, timeout=3600) 239 | except: 240 | raise 241 | 242 | def _cleanup(self): 243 | if self.use_temporary_table: 244 | client = self.config.get_client() 245 | client.delete_dataset(self.dataset(), delete_contents=True) 246 | 247 | def run(self): 248 | query = self.load_query(self.source) if self.source else self.query() 249 | try: 250 | self.save_as_table(query) 251 | self.export_to_gcs() 252 | 
finally: 253 | self._cleanup() 254 | -------------------------------------------------------------------------------- /examples/tasks.py: -------------------------------------------------------------------------------- 1 | import luigi 2 | import luigi_bigquery 3 | 4 | ## Running Queries 5 | 6 | class MyQuery(luigi_bigquery.Query): 7 | 8 | def query(self): 9 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 10 | 11 | ## Getting Results 12 | 13 | class MyQueryRun(luigi_bigquery.Query): 14 | 15 | def query(self): 16 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 17 | 18 | def run(self): 19 | result = self.run_query(self.query()) 20 | print "Job ID :", result.job_id 21 | print "Result size:", result.size 22 | print "Result :" 23 | print "\t".join([c['name'] for i, c in result.description]) 24 | print "----" 25 | for row in result: 26 | print "\t".join([str(c) for c in row]) 27 | print '====================' 28 | 29 | class MyQuerySave(luigi_bigquery.Query): 30 | 31 | def query(self): 32 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 33 | 34 | def output(self): 35 | return luigi.LocalTarget('MyQuerySave.csv') 36 | 37 | def run(self): 38 | result = self.run_query(self.query()) 39 | with self.output().open('w') as f: 40 | result.to_csv(f) 41 | 42 | ## Building Pipelines 43 | 44 | class MyQueryStep1(luigi_bigquery.Query): 45 | 46 | def query(self): 47 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 48 | 49 | def output(self): 50 | return luigi_bigquery.ResultTarget('MyQueryStep1.job') 51 | 52 | class MyQueryStep2(luigi.Task): 53 | def requires(self): 54 | return MyQueryStep1() 55 | 56 | def output(self): 57 | return luigi.LocalTarget('MyQueryStep2.csv') 58 | 59 | def run(self): 60 | # retrieve the result and save it as a CSV file 61 | with self.output().open('w') as f: 62 | self.input().result.to_csv(f) 63 | 64 | class MyQueryStep3(luigi.Task): 65 | def requires(self): 66 | return 
MyQueryStep2() 67 | 68 | def output(self): 69 | return luigi.LocalTarget('MyQueryStep3.txt') 70 | 71 | def run(self): 72 | with self.input().open() as f: 73 | # process the result here 74 | print f.read() 75 | with self.output().open('w') as f: 76 | # crate the final output 77 | f.write('done') 78 | 79 | ## Templating Queries 80 | 81 | class MyQueryFromTemplate(luigi_bigquery.Query): 82 | source = 'templates/query_with_language.sql' 83 | 84 | # variables used in the template 85 | language = 'Python' 86 | 87 | class MuQueryWithVariables(luigi_bigquery.Query): 88 | source = 'templates/query_with_variables.sql' 89 | 90 | # define variables 91 | variables = { 92 | 'language': 'Python', 93 | } 94 | 95 | # or use property for dynamic variables 96 | # @property 97 | # def variables(self): 98 | # return { 99 | # 'language': 'Python', 100 | # } 101 | 102 | ## Passing Parameters 103 | 104 | class MyQueryWithParameters(luigi_bigquery.Query): 105 | source = 'templates/query_with_time_range.sql' 106 | 107 | # parameters 108 | year = luigi.IntParameter() 109 | 110 | def output(self): 111 | # create a unique name for this output using parameters 112 | return luigi_bigquery.ResultTarget('MyQueryWithParameters-{0}.job'.format(self.year)) 113 | 114 | class MyQueryAggregator(luigi.Task): 115 | 116 | def requires(self): 117 | # create a list of tasks with different parameters 118 | return [ 119 | MyQueryWithParameters(2009), 120 | MyQueryWithParameters(2010), 121 | MyQueryWithParameters(2011), 122 | MyQueryWithParameters(2012) 123 | ] 124 | 125 | def output(self): 126 | return luigi.LocalTarget('MyQueryAggregator.txt') 127 | 128 | def run(self): 129 | with self.output().open('w') as f: 130 | # repeat for each ResultTarget 131 | for target in self.input(): 132 | # output results into a single file 133 | for row in target.result: 134 | f.write(str(row) + "\n") 135 | 136 | ## Building Pipelines using QueryTable 137 | 138 | class MyQueryTableStep1(luigi_bigquery.QueryTable): 139 | 140 | 
def dataset(self): 141 | return 'tmp' 142 | 143 | def table(self): 144 | return 'github_nested_count' 145 | 146 | def query(self): 147 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 148 | 149 | class MyQueryTableStep2(luigi_bigquery.Query): 150 | def requires(self): 151 | return MyQueryTableStep1() 152 | 153 | def query(self): 154 | input = self.input() 155 | print(input.dataset_id) 156 | print(input.table_id) 157 | return "SELECT cnt FROM [{0}.{1}]".format(input.dataset_id, input.table_id) 158 | 159 | def output(self): 160 | return luigi.LocalTarget('MyQueryTableStep2.csv') 161 | 162 | def run(self): 163 | # retrieve the result and save it as a CSV file 164 | result = self.run_query(self.query()) 165 | with self.output().open('w') as f: 166 | result.to_csv(f) 167 | 168 | if __name__ == '__main__': 169 | luigi.run() 170 | import luigi 171 | import luigi_bigquery 172 | 173 | ## Running Queries 174 | 175 | class MyQuery(luigi_bigquery.Query): 176 | 177 | def query(self): 178 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 179 | 180 | ## Getting Results 181 | 182 | class MyQueryRun(luigi_bigquery.Query): 183 | 184 | def query(self): 185 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 186 | 187 | def run(self): 188 | result = self.run_query(self.query()) 189 | print "Job ID :", result.job_id 190 | print "Result size:", result.size 191 | print "Result :" 192 | print "\t".join([c['name'] for i, c in result.description]) 193 | print "----" 194 | for row in result: 195 | print "\t".join([str(c) for c in row]) 196 | print '====================' 197 | 198 | class MyQuerySave(luigi_bigquery.Query): 199 | 200 | def query(self): 201 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 202 | 203 | def output(self): 204 | return luigi.LocalTarget('MyQuerySave.csv') 205 | 206 | def run(self): 207 | result = self.run_query(self.query()) 208 | with self.output().open('w') as f: 209 | 
result.to_csv(f) 210 | 211 | ## Building Pipelines 212 | 213 | class MyQueryStep1(luigi_bigquery.Query): 214 | 215 | def query(self): 216 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 217 | 218 | def output(self): 219 | return luigi_bigquery.ResultTarget('MyQueryStep1.job') 220 | 221 | class MyQueryStep2(luigi.Task): 222 | def requires(self): 223 | return MyQueryStep1() 224 | 225 | def output(self): 226 | return luigi.LocalTarget('MyQueryStep2.csv') 227 | 228 | def run(self): 229 | # retrieve the result and save it as a CSV file 230 | with self.output().open('w') as f: 231 | self.input().result.to_csv(f) 232 | 233 | class MyQueryStep3(luigi.Task): 234 | def requires(self): 235 | return MyQueryStep2() 236 | 237 | def output(self): 238 | return luigi.LocalTarget('MyQueryStep3.txt') 239 | 240 | def run(self): 241 | with self.input().open() as f: 242 | # process the result here 243 | print f.read() 244 | with self.output().open('w') as f: 245 | # crate the final output 246 | f.write('done') 247 | 248 | ## Templating Queries 249 | 250 | class MyQueryFromTemplate(luigi_bigquery.Query): 251 | source = 'templates/query_with_language.sql' 252 | 253 | # variables used in the template 254 | language = 'Python' 255 | 256 | class MuQueryWithVariables(luigi_bigquery.Query): 257 | source = 'templates/query_with_variables.sql' 258 | 259 | # define variables 260 | variables = { 261 | 'language': 'Python', 262 | } 263 | 264 | # or use property for dynamic variables 265 | # @property 266 | # def variables(self): 267 | # return { 268 | # 'language': 'Python', 269 | # } 270 | 271 | ## Passing Parameters 272 | 273 | class MyQueryWithParameters(luigi_bigquery.Query): 274 | source = 'templates/query_with_time_range.sql' 275 | 276 | # parameters 277 | year = luigi.IntParameter() 278 | 279 | def output(self): 280 | # create a unique name for this output using parameters 281 | return luigi_bigquery.ResultTarget('MyQueryWithParameters-{0}.job'.format(self.year)) 282 | 
283 | class MyQueryAggregator(luigi.Task): 284 | 285 | def requires(self): 286 | # create a list of tasks with different parameters 287 | return [ 288 | MyQueryWithParameters(2009), 289 | MyQueryWithParameters(2010), 290 | MyQueryWithParameters(2011), 291 | MyQueryWithParameters(2012) 292 | ] 293 | 294 | def output(self): 295 | return luigi.LocalTarget('MyQueryAggregator.txt') 296 | 297 | def run(self): 298 | with self.output().open('w') as f: 299 | # repeat for each ResultTarget 300 | for target in self.input(): 301 | # output results into a single file 302 | for row in target.result: 303 | f.write(str(row) + "\n") 304 | 305 | ## Building Pipelines using QueryTable 306 | 307 | class MyQueryTableStep1(luigi_bigquery.QueryTable): 308 | 309 | def dataset(self): 310 | return 'tmp' 311 | 312 | def table(self): 313 | return 'github_nested_count' 314 | 315 | def query(self): 316 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 317 | 318 | class MyQueryTableStep2(luigi_bigquery.Query): 319 | def requires(self): 320 | return MyQueryTableStep1() 321 | 322 | def query(self): 323 | input = self.input() 324 | return "SELECT cnt FROM [{0}.{1}]".format(input.dataset_id, input.table_id) 325 | 326 | def output(self): 327 | return luigi.LocalTarget('MyQueryTableStep2.csv') 328 | 329 | def run(self): 330 | # retrieve the result and save it as a CSV file 331 | result = self.run_query(self.query()) 332 | with self.output().open('w') as f: 333 | result.to_csv(f) 334 | 335 | # QueryToGCS 336 | 337 | class MyQueryToGCS(luigi_bigquery.QueryToGCS): 338 | use_temporary_table = True 339 | 340 | def bucket(self): 341 | return 'my-bucket' 342 | 343 | def path(self): 344 | return '/path/to/file.csv' 345 | 346 | def query(self): 347 | return "SELECT count(*) cnt FROM [publicdata:samples.github_nested]" 348 | 349 | if __name__ == '__main__': 350 | luigi.run() 351 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2015 Kazuyuki Honda 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. --------------------------------------------------------------------------------