├── .gitignore ├── LICENSE ├── README.md ├── dags ├── __init__.py ├── bigquery.py ├── cloud_storage.py ├── dataproc.py ├── gcp_smoke │ └── gsob_extract_day.sql └── support │ ├── __init__.py │ └── schemas.py ├── external ├── afgc-dataflow-java │ ├── .gitignore │ ├── build.gradle │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ ├── gradlew │ ├── gradlew.bat │ ├── pipeline │ │ ├── build.gradle │ │ └── src │ │ │ └── main │ │ │ └── java │ │ │ └── airflow │ │ │ └── gcloud │ │ │ └── pipeline │ │ │ ├── Bq2BqPipeline.java │ │ │ └── CopyPipeline.java │ ├── settings.gradle │ └── transform │ │ ├── build.gradle │ │ └── src │ │ └── main │ │ └── java │ │ └── airflow │ │ └── gcloud │ │ ├── BqSchemaFor.java │ │ ├── data │ │ └── convert │ │ │ └── ConvertObjectToStringFn.java │ │ ├── model │ │ └── EmailEvent.java │ │ └── transform │ │ └── NoopFn.java └── afgc-spark-scala │ └── src │ └── main │ └── scala │ └── luigi │ └── gcloud │ └── spark │ └── Copy.scala └── img ├── airflow_connection.png ├── airflow_variables.png ├── console_service_account.png └── create_service_account.png /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | logs/ 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .idea/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *,cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # IPython Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # dotenv 81 | .env 82 | 83 | # virtualenv 84 | venv/ 85 | ENV/ 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | 90 | # Rope project settings 91 | .ropeproject 92 | 93 | airflow.cfg 94 | airflow-gcp-smoke.json 95 | airflow-webserver.pid 96 | unittests.cfg 97 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # airflow-gcp-examples 2 | 3 | Repository with examples and smoke tests for the GCP Airflow operators and hooks. 4 | 5 | ## Setup 6 | 7 | These Google Cloud examples assume you have a standard Airflow setup up and 8 | running. The tutorial works locally as well as in a production setup, because 9 | the only requirement is a service key, which we'll explain next. But first 10 | a quick rundown of what you need: 11 | 12 | * Running Airflow (as of this writing you need the Airflow master branch!) 13 | * Create a service account (Cloud Console) 14 | * Set up a Google Cloud connection in Airflow 15 | * Set up the variables the DAGs will need 16 | * Copy the DAGs to your dags folder 17 | 18 | ### Airflow setup 19 | 20 | * Check out the master branch of Airflow 21 | * pip install google-api-python-client 22 | * python setup.py install 23 | 24 | Make sure you're running the LocalExecutor and have a decent database setup. 25 | 26 | ### Google Cloud Service Key 27 | 28 | Next, create a service account in the project where you want your smoke tests and examples to run. Go 29 | to the console: 30 | 31 | ![console](img/console_service_account.png?raw=true) 32 | 33 | Then create a service key: provision a JSON private key and give it Editor rights. 34 | 35 | ![console](img/create_service_account.png?raw=true) 36 | 37 | ### Airflow Connection 38 | 39 | In Airflow you need to define a connection named *gcp_smoke* for your project: 40 | 41 | ![console](img/airflow_connection.png?raw=true) 42 | 43 | Supply the path to the downloaded private key, supply the *project_id*, and define the 44 | minimum scope of *https://www.googleapis.com/auth/cloud-platform*. 45 | 46 | ### Airflow Variables 47 | 48 | You need to set up the variables that are used in the examples. You can tweak them to suit 49 | your environment.
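The table below lists the variables with example values. As an alternative to the web UI, you can set them from the command line; a minimal sketch, assuming the `airflow variables -s` option of the Airflow CLI, using the same illustrative values as the table:

```bash
airflow variables -s gc_project my-project
airflow variables -s gcq_dataset airflow
airflow variables -s gcq_tempset airflow_temp
airflow variables -s gcs_bucket airflow-gcp-smoke
airflow variables -s gcs_root data
airflow variables -s gc_zone europe-west1-d
```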
50 | 51 | ![console](img/airflow_variables.png?raw=true) 52 | 53 | variable | example value | note 54 | --- | --- | --- 55 | gc_project | my-project | Project the examples will run in 56 | gcq_dataset | airflow | BigQuery dataset for the examples 57 | gcq_tempset | airflow_temp | BigQuery dataset with a 1-day retention 58 | gcs_bucket | airflow-gcp-smoke | Storage bucket 59 | gcs_root | data | Storage root path (required; must not start or end with a slash) 60 | gc_zone | europe-west1-d | Compute Engine zone for the Dataproc example 61 | -------------------------------------------------------------------------------- /dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexvanboxel/airflow-gcp-examples/a321f30c25070d2063a206cbe1c72710b1e21e49/dags/__init__.py -------------------------------------------------------------------------------- /dags/bigquery.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta, datetime 2 | 3 | from airflow import DAG 4 | from airflow.contrib.operators.bigquery_operator import BigQueryOperator 5 | from airflow.contrib.operators.bigquery_to_gcs import BigQueryToCloudStorageOperator 6 | from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator 7 | 8 | from dags.support import schemas 9 | 10 | seven_days_ago = datetime.combine(datetime.today() - timedelta(7), 11 | datetime.min.time()) 12 | 13 | default_args = { 14 | 'owner': 'airflow', 15 | 'depends_on_past': False, 16 | 'start_date': seven_days_ago, 17 | 'email': ['alex@vanboxel.be'], 18 | 'email_on_failure': False, 19 | 'email_on_retry': False, 20 | 'retries': 1, 21 | 'retry_delay': timedelta(minutes=30), 22 | } 23 | 24 | with DAG('v1_8_bigquery', schedule_interval=timedelta(days=1), 25 | default_args=default_args) as dag: 26 | bq_extract_one_day = BigQueryOperator( 27 | task_id='bq_extract_one_day', 28 | bql='gcp_smoke/gsob_extract_day.sql', 29 | destination_dataset_table= 30 | '{{var.value.gcq_dataset}}.gsod_partition{{ ds_nodash }}', 31 | write_disposition='WRITE_TRUNCATE', 32 | bigquery_conn_id='gcp_smoke', 33 | use_legacy_sql=False 34 | ) 35 | 36 | bq2gcp_avro = BigQueryToCloudStorageOperator( 37 | task_id='bq2gcp_avro', 38 | source_project_dataset_table='{{var.value.gcq_dataset}}.gsod_partition{{ ds_nodash }}', 39 | destination_cloud_storage_uris=[ 40 | 'gs://{{var.value.gcs_bucket}}/{{var.value.gcs_root}}/gcp_smoke_bq/bq_to_gcp_avro/{{ ds_nodash }}/part-*.avro' 41 | ], 42 | export_format='AVRO', 43 | bigquery_conn_id='gcp_smoke', 44 | ) 45 | 46 | bq2gcp_override = BigQueryToCloudStorageOperator( 47 | task_id='bq2gcp_override', 48 | source_project_dataset_table='{{var.value.gcq_dataset}}.gsod_partition{{ ds_nodash }}', 49 | destination_cloud_storage_uris=[ 50 | 'gs://{{var.value.gcs_bucket}}/{{var.value.gcs_root}}/gcp_smoke_bq/bq_to_gcp_avro/99999999/part-*.avro' 51 | ], 52 | export_format='AVRO', 53 | bigquery_conn_id='gcp_smoke', 54 | ) 55 | 56 | gcs2bq_avro_auto_schema = GoogleCloudStorageToBigQueryOperator( 57 | task_id='gcs2bq_avro_auto_schema', 58 | bucket='{{var.value.gcs_bucket}}', 59 | source_objects=[ 60 | '{{var.value.gcs_root}}/gcp_smoke_bq/bq_to_gcp_avro/{{ ds_nodash }}/part-*' 61 | ], 62 | destination_project_dataset_table='{{var.value.gcq_tempset}}.avro_auto_schema{{ ds_nodash }}', 63 | source_format='AVRO', 64 | create_disposition='CREATE_IF_NEEDED', 65 | write_disposition='WRITE_TRUNCATE', 66 | google_cloud_storage_conn_id='gcp_smoke', 67 | bigquery_conn_id='gcp_smoke' 68 | ) 69 | 70 |
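    # Same Avro load as gcs2bq_avro_auto_schema above, but supplies an explicit
    # schema from dags.support.schemas instead of relying on Avro auto-detection.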
gcs2bq_avro_with_schema = GoogleCloudStorageToBigQueryOperator( 71 | task_id='gcs2bq_avro_with_schema', 72 | bucket='{{var.value.gcs_bucket}}', 73 | source_objects=[ 74 | '{{var.value.gcs_root}}/gcp_smoke_bq/bq_to_gcp_avro/{{ ds_nodash }}/part-*' 75 | ], 76 | destination_project_dataset_table='{{var.value.gcq_tempset}}.avro_with_schema{{ ds_nodash }}', 77 | source_format='AVRO', 78 | schema_fields=schemas.gsob(), 79 | create_disposition='CREATE_IF_NEEDED', 80 | write_disposition='WRITE_TRUNCATE', 81 | google_cloud_storage_conn_id='gcp_smoke', 82 | bigquery_conn_id='gcp_smoke' 83 | ) 84 | 85 | bq_extract_one_day >> bq2gcp_avro >> bq2gcp_override 86 | bq2gcp_avro >> gcs2bq_avro_auto_schema 87 | bq2gcp_avro >> gcs2bq_avro_with_schema 88 | -------------------------------------------------------------------------------- /dags/cloud_storage.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta, datetime 2 | 3 | from airflow import DAG 4 | from airflow.contrib.sensors.gcs_sensor import GoogleCloudStorageObjectSensor, \ 5 | GoogleCloudStorageObjectUpdatedSensor 6 | 7 | seven_days_ago = datetime.combine(datetime.today() - timedelta(7), 8 | datetime.min.time()) 9 | 10 | default_args = { 11 | 'owner': 'airflow', 12 | 'depends_on_past': False, 13 | 'start_date': seven_days_ago, 14 | 'email': ['alex@vanboxel.be'], 15 | 'email_on_failure': False, 16 | 'email_on_retry': False, 17 | 'retries': 1, 18 | 'retry_delay': timedelta(minutes=30), 19 | } 20 | 21 | with DAG('v1_8_cloud_storage', schedule_interval=timedelta(days=1), 22 | default_args=default_args) as dag: 23 | sens_object_create = GoogleCloudStorageObjectSensor( 24 | task_id='sens_object_create', 25 | bucket='{{var.value.gcs_bucket}}', 26 | object='{{var.value.gcs_root}}/gcp_smoke_bq/bq_to_gcp_avro/{{ ds_nodash }}/part-000000000000.avro', 27 | google_cloud_conn_id='gcp_smoke' 28 | ) 29 | 30 | sens_object_update = GoogleCloudStorageObjectUpdatedSensor( 31 | task_id='sens_object_update', 32 | bucket='{{var.value.gcs_bucket}}', 33 | object='{{var.value.gcs_root}}/gcp_smoke_bq/bq_to_gcp_avro/99999999/part-000000000000.avro', 34 | google_cloud_conn_id='gcp_smoke' 35 | ) 36 | -------------------------------------------------------------------------------- /dags/dataproc.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta, datetime 2 | 3 | from airflow import DAG 4 | from airflow.contrib.operators.dataproc_operator import DataprocClusterCreateOperator, \ 5 | DataprocClusterDeleteOperator 6 | from airflow.models import Variable 7 | from airflow.operators.dummy_operator import DummyOperator 8 | from airflow.operators.python_operator import BranchPythonOperator 9 | 10 | yesterday = datetime.combine(datetime.today() - timedelta(1), 11 | datetime.min.time()) 12 | 13 | default_args = { 14 | 'owner': 'airflow', 15 | 'depends_on_past': False, 16 | 'start_date': yesterday, 17 | 'email': ['alex@vanboxel.be'], 18 | 'email_on_failure': False, 19 | 'email_on_retry': False, 20 | 'retries': 1, 21 | 'retry_delay': timedelta(minutes=30), 22 | } 23 | 24 | with DAG('v1_8_dataproc', schedule_interval=timedelta(days=1), 25 | default_args=default_args) as dag: 26 | def should_run(ds, **kwargs): 27 | 28 | if datetime.now() < kwargs['execution_date'] + timedelta(days=2): 29 | return "start_cluster" 30 | else: 31 | return "no_run" 32 | 33 | 34 | start = BranchPythonOperator( 35 | task_id='start', 36 | provide_context=True, 37 | 
python_callable=should_run, 38 | ) 39 | 40 | start_cluster = DataprocClusterCreateOperator( 41 | task_id='start_cluster', 42 | cluster_name='smoke-cluster-{{ ds_nodash }}', 43 | project_id=Variable.get('gc_project'), 44 | num_workers=2, 45 | num_preemptible_workers=1, 46 | properties={ 47 | 'spark:spark.executorEnv.PYTHONHASHSEED': '0', 48 | 'spark:spark.yarn.am.memory': '1024m', 49 | 'spark:spark.sql.avro.compression.codec': 'deflate' 50 | }, 51 | worker_disk_size=50, 52 | master_disk_size=50, 53 | labels={ 54 | 'example': 'label' 55 | }, 56 | zone=Variable.get('gc_zone'), 57 | google_cloud_conn_id='gcp_smoke' 58 | ) 59 | 60 | stop_cluster = DataprocClusterDeleteOperator( 61 | task_id='stop_cluster', 62 | cluster_name='smoke-cluster-{{ ds_nodash }}', 63 | project_id=Variable.get('gc_project'), 64 | google_cloud_conn_id='gcp_smoke' 65 | ) 66 | 67 | no_run = DummyOperator(task_id='no_run') 68 | 69 | end = DummyOperator( 70 | trigger_rule='one_success', 71 | task_id='end') 72 | 73 | start >> start_cluster >> stop_cluster >> end 74 | start >> no_run >> end 75 | -------------------------------------------------------------------------------- /dags/gcp_smoke/gsob_extract_day.sql: -------------------------------------------------------------------------------- 1 | #standardSQL 2 | SELECT 3 | CAST(CONCAT(CAST((CAST(year AS INT64) + 8) AS STRING),'-',CAST(mo AS STRING),'-',CAST(da AS STRING)) AS DATE) AS partition_date, 4 | IF(temp > 9999, 5 | NULL, 6 | (temp - 32) * 5/9) AS temperature_mean, 7 | IF( min > 9999, 8 | NULL, 9 | ( min - 32) * 5/9) AS temperature_min, 10 | IF( max > 9999, 11 | NULL, 12 | ( max - 32) * 5/9) AS temperature_max, 13 | IF(dewp > 9999, 14 | NULL, 15 | (dewp - 32) * 5/9) AS dew_point_mean, 16 | IF(slp > 9999, 17 | NULL, 18 | slp )AS pressure_sea_level_mean, 19 | IF(stp > 9999, 20 | NULL, 21 | stp ) AS pressure_station_level_mean, 22 | IF(visib > 9999, 23 | NULL, 24 | visib ) AS visibility_mean, 25 | IF(CAST(wdsp AS FLOAT64) > 9999, 26 | NULL, 27 | CAST(wdsp AS FLOAT64) ) AS wind_speed_mean, 28 | IF( CAST(mxpsd AS FLOAT64) > 999, 29 | NULL, 30 | CAST(mxpsd AS FLOAT64) ) AS wind_speed_sustained_max, 31 | IF( prcp > 99, 32 | NULL, 33 | prcp ) AS precipitation, 34 | IF( sndp > 999, 35 | NULL, 36 | sndp) AS snow_depth, 37 | IF ( CAST( fog AS INT64) = 1, 38 | TRUE, 39 | FALSE) AS fog, 40 | IF ( CAST( rain_drizzle AS INT64) = 1, 41 | TRUE, 42 | FALSE) AS rain_drizzle, 43 | IF ( CAST( snow_ice_pellets AS INT64) = 1, 44 | TRUE, 45 | FALSE) AS snow_ice_pellets, 46 | IF ( CAST( hail AS INT64) = 1, 47 | TRUE, 48 | FALSE) AS hail, 49 | IF ( CAST( thunder AS INT64) = 1, 50 | TRUE, 51 | FALSE) AS thunder, 52 | IF ( CAST( tornado_funnel_cloud AS INT64) = 1, 53 | TRUE, 54 | FALSE) AS tornado_funnel_cloud, 55 | station.usaf AS usaf, 56 | station.wban AS wban, 57 | station.name AS station_name, 58 | station.country AS station_country, 59 | station.state AS station_state, 60 | station.lat AS station_latitude, 61 | station.lon AS station_longitude 62 | FROM 63 | `bigquery-public-data.noaa_gsod.gsod{{execution_date.year-8}}` AS gsob 64 | JOIN 65 | `bigquery-public-data.noaa_gsod.stations` AS station 66 | ON 67 | CAST(gsob.stn AS INT64)=station.usaf 68 | AND CAST(gsob.wban AS INT64)=station.wban 69 | WHERE 70 | CAST(CONCAT(CAST((CAST(year AS INT64) + 8) AS STRING),'-',CAST(mo AS STRING),'-',CAST(da AS STRING)) AS DATE) 71 | = '{{ ds }}' 72 | -------------------------------------------------------------------------------- /dags/support/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexvanboxel/airflow-gcp-examples/a321f30c25070d2063a206cbe1c72710b1e21e49/dags/support/__init__.py -------------------------------------------------------------------------------- /dags/support/schemas.py: -------------------------------------------------------------------------------- 1 | def gsob(): 2 | return [ 3 | {'name': 'partition_date', 'type': 'STRING', 'mode': 'NULLABLE'}, 4 | {'name': 'temperature_mean', 'type': 'FLOAT', 'mode': 'NULLABLE'}, 5 | {'name': 'temperature_min', 'type': 'FLOAT', 'mode': 'NULLABLE'}, 6 | {'name': 'temperature_max', 'type': 'FLOAT', 'mode': 'NULLABLE'}, 7 | {'name': 'dew_point_mean', 'type': 'FLOAT', 'mode': 'NULLABLE'}, 8 | {'name': 'pressure_sea_level_mean', 'type': 'FLOAT', 'mode': 'NULLABLE'}, 9 | {'name': 'pressure_station_level_mean', 'type': 'FLOAT', 'mode': 'NULLABLE'}, 10 | {'name': 'visibility_mean', 'type': 'FLOAT', 'mode': 'NULLABLE'}, 11 | {'name': 'wind_speed_mean', 'type': 'FLOAT', 'mode': 'NULLABLE'}, 12 | {'name': 'wind_speed_sustained_max', 'type': 'FLOAT', 'mode': 'NULLABLE'}, 13 | {'name': 'precipitation', 'type': 'FLOAT', 'mode': 'NULLABLE'}, 14 | {'name': 'snow_depth', 'type': 'FLOAT', 'mode': 'NULLABLE'}, 15 | {'name': 'fog', 'type': 'BOOLEAN', 'mode': 'NULLABLE'}, 16 | {'name': 'rain_drizzle', 'type': 'BOOLEAN', 'mode': 'NULLABLE'}, 17 | {'name': 'snow_ice_pellets', 'type': 'BOOLEAN', 'mode': 'NULLABLE'}, 18 | {'name': 'hail', 'type': 'BOOLEAN', 'mode': 'NULLABLE'}, 19 | {'name': 'thunder', 'type': 'BOOLEAN', 'mode': 'NULLABLE'}, 20 | {'name': 'tornado_funnel_cloud', 'type': 'BOOLEAN', 'mode': 'NULLABLE'}, 21 | {'name': 'usaf', 'type': 'INTEGER', 'mode': 'NULLABLE'}, 22 | {'name': 'wban', 'type': 'INTEGER', 'mode': 'NULLABLE'}, 23 | {'name': 'station_name', 'type': 'STRING', 'mode': 'NULLABLE'}, 24 | {'name': 'station_country', 'type': 'STRING', 'mode': 'NULLABLE'}, 25 | {'name': 'station_state', 'type': 'STRING', 'mode': 'NULLABLE'}, 26 | {'name': 'station_latitude', 'type': 'FLOAT', 'mode': 'NULLABLE'}, 27 | {'name': 'station_longitude', 'type': 'FLOAT', 'mode': 'NULLABLE'} 28 | 29 | ] 30 | -------------------------------------------------------------------------------- /external/afgc-dataflow-java/.gitignore: -------------------------------------------------------------------------------- 1 | .gradle 2 | /local.properties 3 | .DS_Store 4 | 5 | *.iws 6 | *.ipr 7 | *.iml 8 | bin 9 | build 10 | bin/* 11 | build/* 12 | .class 13 | .DS_Store? 14 | ._* 15 | .Spotlight-V100 16 | .Trashes 17 | Icon? 
18 | ehthumbs.db 19 | Thumbs.db 20 | .tpignore 21 | .settings/* 22 | .idea/* 23 | gen/* 24 | out/* 25 | work/* -------------------------------------------------------------------------------- /external/afgc-dataflow-java/build.gradle: -------------------------------------------------------------------------------- 1 | subprojects { 2 | apply plugin: 'java' 3 | 4 | repositories { 5 | maven { 6 | url "http://jcenter.bintray.com" 7 | } 8 | mavenLocal() 9 | } 10 | 11 | dependencies { 12 | compile 'com.google.cloud.dataflow:google-cloud-dataflow-java-sdk-all:1.9.0' 13 | testCompile 'org.hamcrest:hamcrest-all:1.3' 14 | testCompile 'org.assertj:assertj-core:3.0.0' 15 | testCompile 'junit:junit:4.12' 16 | } 17 | 18 | sourceCompatibility = 1.8 19 | version = '1.0' 20 | compileJava.options.encoding = 'UTF-8' 21 | 22 | jar { 23 | manifest.attributes provider: 'gradle' 24 | } 25 | } 26 | 27 | task wrapper(type: Wrapper) { 28 | gradleVersion = '3.2.1' 29 | } 30 | -------------------------------------------------------------------------------- /external/afgc-dataflow-java/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexvanboxel/airflow-gcp-examples/a321f30c25070d2063a206cbe1c72710b1e21e49/external/afgc-dataflow-java/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /external/afgc-dataflow-java/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Sat Dec 24 14:36:24 CET 2016 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-3.2.1-all.zip 7 | -------------------------------------------------------------------------------- /external/afgc-dataflow-java/gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | ############################################################################## 8 | 9 | # Attempt to set APP_HOME 10 | # Resolve links: $0 may be a link 11 | PRG="$0" 12 | # Need this for relative symlinks. 13 | while [ -h "$PRG" ] ; do 14 | ls=`ls -ld "$PRG"` 15 | link=`expr "$ls" : '.*-> \(.*\)$'` 16 | if expr "$link" : '/.*' > /dev/null; then 17 | PRG="$link" 18 | else 19 | PRG=`dirname "$PRG"`"/$link" 20 | fi 21 | done 22 | SAVED="`pwd`" 23 | cd "`dirname \"$PRG\"`/" >/dev/null 24 | APP_HOME="`pwd -P`" 25 | cd "$SAVED" >/dev/null 26 | 27 | APP_NAME="Gradle" 28 | APP_BASE_NAME=`basename "$0"` 29 | 30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 31 | DEFAULT_JVM_OPTS="" 32 | 33 | # Use the maximum available, or set MAX_FD != -1 to use that value. 34 | MAX_FD="maximum" 35 | 36 | warn ( ) { 37 | echo "$*" 38 | } 39 | 40 | die ( ) { 41 | echo 42 | echo "$*" 43 | echo 44 | exit 1 45 | } 46 | 47 | # OS specific support (must be 'true' or 'false'). 
48 | cygwin=false 49 | msys=false 50 | darwin=false 51 | nonstop=false 52 | case "`uname`" in 53 | CYGWIN* ) 54 | cygwin=true 55 | ;; 56 | Darwin* ) 57 | darwin=true 58 | ;; 59 | MINGW* ) 60 | msys=true 61 | ;; 62 | NONSTOP* ) 63 | nonstop=true 64 | ;; 65 | esac 66 | 67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 68 | 69 | # Determine the Java command to use to start the JVM. 70 | if [ -n "$JAVA_HOME" ] ; then 71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 72 | # IBM's JDK on AIX uses strange locations for the executables 73 | JAVACMD="$JAVA_HOME/jre/sh/java" 74 | else 75 | JAVACMD="$JAVA_HOME/bin/java" 76 | fi 77 | if [ ! -x "$JAVACMD" ] ; then 78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 79 | 80 | Please set the JAVA_HOME variable in your environment to match the 81 | location of your Java installation." 82 | fi 83 | else 84 | JAVACMD="java" 85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 86 | 87 | Please set the JAVA_HOME variable in your environment to match the 88 | location of your Java installation." 89 | fi 90 | 91 | # Increase the maximum file descriptors if we can. 92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 93 | MAX_FD_LIMIT=`ulimit -H -n` 94 | if [ $? -eq 0 ] ; then 95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 96 | MAX_FD="$MAX_FD_LIMIT" 97 | fi 98 | ulimit -n $MAX_FD 99 | if [ $? -ne 0 ] ; then 100 | warn "Could not set maximum file descriptor limit: $MAX_FD" 101 | fi 102 | else 103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 104 | fi 105 | fi 106 | 107 | # For Darwin, add options to specify how the application appears in the dock 108 | if $darwin; then 109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 110 | fi 111 | 112 | # For Cygwin, switch paths to Windows format before running java 113 | if $cygwin ; then 114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 116 | JAVACMD=`cygpath --unix "$JAVACMD"` 117 | 118 | # We build the pattern for arguments to be converted via cygpath 119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 120 | SEP="" 121 | for dir in $ROOTDIRSRAW ; do 122 | ROOTDIRS="$ROOTDIRS$SEP$dir" 123 | SEP="|" 124 | done 125 | OURCYGPATTERN="(^($ROOTDIRS))" 126 | # Add a user-defined pattern to the cygpath arguments 127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 129 | fi 130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 131 | i=0 132 | for arg in "$@" ; do 133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 135 | 136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 138 | else 139 | eval `echo args$i`="\"$arg\"" 140 | fi 141 | i=$((i+1)) 142 | done 143 | case $i in 144 | (0) set -- ;; 145 | (1) set -- "$args0" ;; 146 | (2) set -- "$args0" "$args1" ;; 147 | (3) set -- "$args0" "$args1" "$args2" ;; 148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 152 | (8) set -- "$args0" "$args1" "$args2" 
"$args3" "$args4" "$args5" "$args6" "$args7" ;; 153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 154 | esac 155 | fi 156 | 157 | # Escape application args 158 | save ( ) { 159 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 160 | echo " " 161 | } 162 | APP_ARGS=$(save "$@") 163 | 164 | # Collect all arguments for the java command, following the shell quoting and substitution rules 165 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 166 | 167 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong 168 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then 169 | cd "$(dirname "$0")" 170 | fi 171 | 172 | exec "$JAVACMD" "$@" 173 | -------------------------------------------------------------------------------- /external/afgc-dataflow-java/gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | set DIRNAME=%~dp0 12 | if "%DIRNAME%" == "" set DIRNAME=. 13 | set APP_BASE_NAME=%~n0 14 | set APP_HOME=%DIRNAME% 15 | 16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 17 | set DEFAULT_JVM_OPTS= 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | 53 | :win9xME_args 54 | @rem Slurp the command line arguments. 55 | set CMD_LINE_ARGS= 56 | set _SKIP=2 57 | 58 | :win9xME_args_slurp 59 | if "x%~1" == "x" goto execute 60 | 61 | set CMD_LINE_ARGS=%* 62 | 63 | :execute 64 | @rem Setup the command line 65 | 66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 67 | 68 | @rem Execute Gradle 69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 70 | 71 | :end 72 | @rem End local scope for the variables with windows NT shell 73 | if "%ERRORLEVEL%"=="0" goto mainEnd 74 | 75 | :fail 76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 77 | rem the _cmd.exe /c_ return code! 
78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 79 | exit /b 1 80 | 81 | :mainEnd 82 | if "%OS%"=="Windows_NT" endlocal 83 | 84 | :omega 85 | -------------------------------------------------------------------------------- /external/afgc-dataflow-java/pipeline/build.gradle: -------------------------------------------------------------------------------- 1 | 2 | apply plugin: 'java' 3 | 4 | compileJava.options.encoding = 'UTF-8' 5 | sourceCompatibility = 1.8 6 | 7 | task pipelineCopy(type: Jar) { 8 | manifest { 9 | attributes 'Implementation-Title': 'DataFlow Copy Pipeline', 10 | 'Implementation-Version': version, 11 | 'Main-Class': 'airflow.gcloud.pipeline.CopyPipeline' 12 | } 13 | baseName = project.name + '-copy' 14 | from { configurations.compile.collect { it.isDirectory() ? it : zipTree(it) } } 15 | with jar 16 | } 17 | 18 | task pipelineBq2Bq(type: Jar) { 19 | manifest { 20 | attributes 'Implementation-Title': 'DataFlow Bq2Bq Pipeline', 21 | 'Implementation-Version': version, 22 | 'Main-Class': 'airflow.gcloud.pipeline.Bq2BqPipeline' 23 | } 24 | baseName = project.name + '-bq2bq' 25 | from { configurations.compile.collect { it.isDirectory() ? it : zipTree(it) } } 26 | with jar 27 | } 28 | 29 | jar.dependsOn pipelineCopy, pipelineBq2Bq 30 | 31 | dependencies { 32 | compile project(':transform') 33 | testCompile group: 'junit', name: 'junit', version: '4.11' 34 | } 35 | -------------------------------------------------------------------------------- /external/afgc-dataflow-java/pipeline/src/main/java/airflow/gcloud/pipeline/Bq2BqPipeline.java: -------------------------------------------------------------------------------- 1 | package airflow.gcloud.pipeline; 2 | 3 | import airflow.gcloud.BqSchemaFor; 4 | import com.google.cloud.dataflow.sdk.Pipeline; 5 | import com.google.cloud.dataflow.sdk.io.BigQueryIO; 6 | import com.google.cloud.dataflow.sdk.options.DataflowPipelineOptions; 7 | import com.google.cloud.dataflow.sdk.options.Description; 8 | import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory; 9 | import com.google.cloud.dataflow.sdk.options.ValueProvider; 10 | 11 | import java.io.IOException; 12 | 13 | 14 | public class Bq2BqPipeline { 15 | 16 | private interface Options extends DataflowPipelineOptions { 17 | @Description("Table to read from") 18 | ValueProvider<String> getIn(); 19 | 20 | void setIn(ValueProvider<String> value); 21 | 22 | @Description("Table to write to") 23 | ValueProvider<String> getOut(); 24 | 25 | void setOut(ValueProvider<String> value); 26 | } 27 | 28 | public static void main(String[] args) throws IOException { 29 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); 30 | Pipeline pipeline = Pipeline.create(options); 31 | 32 | pipeline.apply(BigQueryIO.Read.from(options.getIn())) 33 | .apply(BigQueryIO.Write.to(options.getOut()).withSchema(BqSchemaFor.gsob())); 34 | 35 | pipeline.run(); 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /external/afgc-dataflow-java/pipeline/src/main/java/airflow/gcloud/pipeline/CopyPipeline.java: -------------------------------------------------------------------------------- 1 | package airflow.gcloud.pipeline; 2 | 3 | import com.google.cloud.dataflow.sdk.Pipeline; 4 | import com.google.cloud.dataflow.sdk.io.TextIO; 5 | import com.google.cloud.dataflow.sdk.options.Description; 6 | import com.google.cloud.dataflow.sdk.options.PipelineOptions; 7 | import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory; 8 | 9 | import
java.io.IOException; 10 | 11 | 12 | public class CopyPipeline { 13 | 14 | private interface Options extends PipelineOptions { 15 | @Description("Path of the file to read from") 16 | String getIn(); 17 | 18 | void setIn(String value); 19 | 20 | @Description("Path of the file to write to") 21 | String getOut(); 22 | 23 | void setOut(String value); 24 | } 25 | 26 | public static void main(String[] args) throws IOException { 27 | Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); 28 | Pipeline pipeline = Pipeline.create(options); 29 | 30 | pipeline.apply(TextIO.Read.from(options.getIn())) 31 | .apply(TextIO.Write.to(options.getOut())); 32 | 33 | pipeline.run(); 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /external/afgc-dataflow-java/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'afgc-dataflow-java' 2 | 3 | include 'transform' 4 | include 'pipeline' 5 | 6 | -------------------------------------------------------------------------------- /external/afgc-dataflow-java/transform/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'java' 2 | 3 | sourceCompatibility = 1.8 4 | compileJava.options.encoding = 'UTF-8' 5 | version = '1.0' 6 | 7 | repositories { 8 | mavenCentral() 9 | } 10 | 11 | dependencies { 12 | testCompile group: 'junit', name: 'junit', version: '4.11' 13 | } -------------------------------------------------------------------------------- /external/afgc-dataflow-java/transform/src/main/java/airflow/gcloud/BqSchemaFor.java: -------------------------------------------------------------------------------- 1 | package airflow.gcloud; 2 | 3 | import com.google.api.services.bigquery.model.TableFieldSchema; 4 | import com.google.api.services.bigquery.model.TableSchema; 5 | 6 | import java.io.IOException; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | /** 11 | * Created by alexvanboxel on 24/12/16. 
12 | */ 13 | public class BqSchemaFor { 14 | 15 | public static TableSchema gsob() throws IOException { 16 | List<TableFieldSchema> fields = new ArrayList<>(); 17 | fields.add(new TableFieldSchema().setName("partition_date").setType("DATE").setMode("NULLABLE")); 18 | fields.add(new TableFieldSchema().setName("temperature_mean").setType("FLOAT").setMode("NULLABLE")); 19 | fields.add(new TableFieldSchema().setName("temperature_min").setType("FLOAT").setMode("NULLABLE")); 20 | fields.add(new TableFieldSchema().setName("temperature_max").setType("FLOAT").setMode("NULLABLE")); 21 | fields.add(new TableFieldSchema().setName("dew_point_mean").setType("FLOAT").setMode("NULLABLE")); 22 | fields.add(new TableFieldSchema().setName("pressure_sea_level_mean").setType("FLOAT").setMode("NULLABLE")); 23 | fields.add(new TableFieldSchema().setName("pressure_station_level_mean").setType("FLOAT").setMode("NULLABLE")); 24 | fields.add(new TableFieldSchema().setName("visibility_mean").setType("FLOAT").setMode("NULLABLE")); 25 | fields.add(new TableFieldSchema().setName("wind_speed_mean").setType("FLOAT").setMode("NULLABLE")); 26 | fields.add(new TableFieldSchema().setName("wind_speed_sustained_max").setType("FLOAT").setMode("NULLABLE")); 27 | fields.add(new TableFieldSchema().setName("precipitation").setType("FLOAT").setMode("NULLABLE")); 28 | fields.add(new TableFieldSchema().setName("snow_depth").setType("FLOAT").setMode("NULLABLE")); 29 | fields.add(new TableFieldSchema().setName("fog").setType("BOOLEAN").setMode("NULLABLE")); 30 | fields.add(new TableFieldSchema().setName("rain_drizzle").setType("BOOLEAN").setMode("NULLABLE")); 31 | fields.add(new TableFieldSchema().setName("snow_ice_pellets").setType("BOOLEAN").setMode("NULLABLE")); 32 | fields.add(new TableFieldSchema().setName("hail").setType("BOOLEAN").setMode("NULLABLE")); 33 | fields.add(new TableFieldSchema().setName("thunder").setType("BOOLEAN").setMode("NULLABLE")); 34 | fields.add(new TableFieldSchema().setName("tornado_funnel_cloud").setType("BOOLEAN").setMode("NULLABLE")); 35 | fields.add(new TableFieldSchema().setName("usaf").setType("INTEGER").setMode("NULLABLE")); 36 | fields.add(new TableFieldSchema().setName("wban").setType("INTEGER").setMode("NULLABLE")); 37 | fields.add(new TableFieldSchema().setName("station_name").setType("STRING").setMode("NULLABLE")); 38 | fields.add(new TableFieldSchema().setName("station_country").setType("STRING").setMode("NULLABLE")); 39 | fields.add(new TableFieldSchema().setName("station_state").setType("STRING").setMode("NULLABLE")); 40 | fields.add(new TableFieldSchema().setName("station_latitude").setType("FLOAT").setMode("NULLABLE")); 41 | fields.add(new TableFieldSchema().setName("station_longitude").setType("FLOAT").setMode("NULLABLE")); 42 | return new TableSchema().setFields(fields); 43 | } 44 | } 45 | 46 | 47 | -------------------------------------------------------------------------------- /external/afgc-dataflow-java/transform/src/main/java/airflow/gcloud/data/convert/ConvertObjectToStringFn.java: -------------------------------------------------------------------------------- 1 | package airflow.gcloud.data.convert; 2 | 3 | import com.google.cloud.dataflow.sdk.transforms.DoFn; 4 | 5 | public class ConvertObjectToStringFn extends DoFn<Object, String> { 6 | @Override 7 | public void processElement(ProcessContext c) throws Exception { 8 | c.output(c.element().toString()); 9 | } 10 | } 11 | --------------------------------------------------------------------------------
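Usage note (editor's sketch, not a file in the repo): ConvertObjectToStringFn can be wired into a Dataflow 1.x pipeline with ParDo, for example to dump BigQuery rows as text. The table name and output path below are illustrative.

// Assumes the Dataflow 1.9 SDK used elsewhere in this project: BigQueryIO.Read emits
// TableRow objects, which the DoFn renders to one line of text each via toString().
pipeline.apply(BigQueryIO.Read.from("project:dataset.table"))
        .apply(ParDo.of(new ConvertObjectToStringFn()))
        .apply(TextIO.Write.to("gs://bucket/path/out"));

--------------------------------------------------------------------------------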
/external/afgc-dataflow-java/transform/src/main/java/airflow/gcloud/model/EmailEvent.java: -------------------------------------------------------------------------------- 1 | package airflow.gcloud.model; 2 | 3 | import java.io.Serializable; 4 | 5 | 6 | public class EmailEvent implements Serializable { 7 | 8 | String timestamp; 9 | String event; 10 | String email; 11 | String campaign; 12 | String id; 13 | 14 | } 15 | -------------------------------------------------------------------------------- /external/afgc-dataflow-java/transform/src/main/java/airflow/gcloud/transform/NoopFn.java: -------------------------------------------------------------------------------- 1 | package airflow.gcloud.transform; 2 | 3 | import com.google.cloud.dataflow.sdk.transforms.DoFn; 4 | import airflow.gcloud.model.EmailEvent; 5 | 6 | public class NoopFn extends DoFn<EmailEvent, EmailEvent> { 7 | private static final long serialVersionUID = 0; 8 | 9 | @Override 10 | public void processElement(ProcessContext c) throws Exception { 11 | EmailEvent element = c.element(); 12 | c.output(element); 13 | } 14 | } 15 | 16 | -------------------------------------------------------------------------------- /external/afgc-spark-scala/src/main/scala/luigi/gcloud/spark/Copy.scala: -------------------------------------------------------------------------------- 1 | package luigi.gcloud.spark 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | object Copy { 6 | 7 | def main(args: Array[String]) { 8 | val sparkConf = new SparkConf().setAppName("DataProcSpark") 9 | 10 | if (args.length != 2) { 11 | throw new scala.RuntimeException("args.length = " + args.length) 12 | } 13 | 14 | val in = args(0) 15 | val out = args(1) 16 | 17 | val sc = new SparkContext(sparkConf) 18 | 19 | val data = sc.textFile(in) 20 | data.saveAsTextFile(out) 21 | 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /img/airflow_connection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexvanboxel/airflow-gcp-examples/a321f30c25070d2063a206cbe1c72710b1e21e49/img/airflow_connection.png -------------------------------------------------------------------------------- /img/airflow_variables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexvanboxel/airflow-gcp-examples/a321f30c25070d2063a206cbe1c72710b1e21e49/img/airflow_variables.png -------------------------------------------------------------------------------- /img/console_service_account.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexvanboxel/airflow-gcp-examples/a321f30c25070d2063a206cbe1c72710b1e21e49/img/console_service_account.png -------------------------------------------------------------------------------- /img/create_service_account.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexvanboxel/airflow-gcp-examples/a321f30c25070d2063a206cbe1c72710b1e21e49/img/create_service_account.png --------------------------------------------------------------------------------