├── .gitignore ├── renovate.json ├── .github ├── dependabot.yml └── workflows │ ├── maven.yml │ └── dependencies.yml ├── src ├── test │ ├── resources │ │ ├── artist.json │ │ └── artist_credit_name.json │ └── java │ │ └── com │ │ └── google │ │ └── cloud │ │ └── bqetl │ │ ├── JSONReaderTest.java │ │ └── mbdata │ │ └── MusicBrainzTransformsTest.java └── main │ └── java │ └── com │ └── google │ └── cloud │ └── bqetl │ ├── options │ └── BQETLOptions.java │ ├── json │ └── JSONReader.java │ ├── mbdata │ ├── MusicBrainzDataObject.java │ └── MusicBrainzTransforms.java │ ├── BQETLNested.java │ ├── BQETLSimple.java │ └── mbschema │ └── FieldSchemaListBuilder.java ├── README.md ├── CONTRIBUTING.md ├── run.sh ├── pom.xml └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.imi 3 | *.iml 4 | target/ 5 | run-simple 6 | run-nested 7 | 8 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": [ 4 | "config:recommended" 5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "maven" # See documentation for possible values 4 | directory: "/" # Location of package manifests 5 | schedule: 6 | interval: "weekly" 7 | -------------------------------------------------------------------------------- /src/test/resources/artist.json: -------------------------------------------------------------------------------- 1 | {"id":634509,"gid":"e0140a67-e4d1-4f13-8a01-364355bee46e","name":"Justin Bieber","sort_name":"Bieber, 
Justin","begin_date_year":1994,"begin_date_month":3,"begin_date_day":1,"end_date_year":null,"end_date_month":null,"end_date_day":null,"type":1,"area":38,"gender":1,"comment":"","edits_pending":0,"last_updated":"2016-03-27T21:04:19.254165+00:00","ended":false,"begin_area":29450,"end_area":null} 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Performing ETL into BigQuery Tutorial Sample Code 2 | 3 | This is the sample code for the [Performing ETL from a Relational Database into BigQuery using Dataflow](https://cloud.google.com/architecture/performing-etl-from-relational-database-into-bigquery) tutorial. 4 | 5 | The tutorial explains how to ingest highly normalized (OLTP database style) data into BigQuery using Dataflow. To understand this sample code it is recommended that you review the [Apache Beam programming model](https://beam.apache.org/documentation/programming-guide/). 6 | 7 | ## More Information 8 | 9 | * [Google Cloud Dataflow](https://cloud.google.com/dataflow/) 10 | * [Apache Beam programming model](https://beam.apache.org/documentation/programming-guide/) 11 | * [Java API Reference](https://cloud.google.com/dataflow/java-sdk/JavaDoc/index) 12 | -------------------------------------------------------------------------------- /.github/workflows/maven.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a Java project with Maven, and cache/restore any dependencies to improve the workflow execution time 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-java-with-maven 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Java CI with Maven 10 | 11 | on: 12 | push: 13 | branches: [ "master" ] 14 | pull_request: 15 | branches: [ "master" ] 16 | 17 | jobs: 18 | build: 19 | 20 | runs-on: ubuntu-latest 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | - name: Set up JDK 17 25 | uses: actions/setup-java@v4 26 | with: 27 | java-version: '17' 28 | distribution: 'temurin' 29 | cache: maven 30 | - name: Build with Maven 31 | run: mvn -B package --file pom.xml 32 | -------------------------------------------------------------------------------- /.github/workflows/dependencies.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a Java project with Maven, and cache/restore any dependencies to improve the workflow execution time 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-java-with-maven 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Dependency Tree upload 10 | 11 | on: 12 | push: 13 | branches: [ "master" ] 14 | 15 | jobs: 16 | build: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v4 22 | - name: Set up JDK 17 23 | uses: actions/setup-java@v4 24 | with: 25 | java-version: '17' 26 | distribution: 'temurin' 27 | cache: maven 28 | - name: Submit Dependency Snapshot 29 | uses: advanced-security/maven-dependency-submission-action@v4 30 | 31 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 
5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to <https://cla.developers.google.com/> to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | 25 | ## Community Guidelines 26 | 27 | This project follows [Google's Open Source Community 28 | Guidelines](https://opensource.google.com/conduct/). 29 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/bqetl/options/BQETLOptions.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.google.cloud.bqetl.options; 17 | 18 | import org.apache.beam.sdk.options.Default; 19 | import org.apache.beam.sdk.options.Description; 20 | import org.apache.beam.sdk.options.PipelineOptions; 21 | 22 | /** The specific pipeline options for this project. */ 23 | public interface BQETLOptions extends PipelineOptions { 24 | @Description("Location of artist credit name json.") 25 | @Default.String("gs://mb-data") 26 | String getLoadingBucketURL(); 27 | 28 | void setLoadingBucketURL(String loadingBucketURL); 29 | 30 | @Description("Big Query table name") 31 | @Default.String("musicbrainz_recordings_by_artist") 32 | String getBigQueryTablename(); 33 | 34 | void setBigQueryTablename(String bigQueryTablename); 35 | 36 | @Description("Overwrite BigQuery table") 37 | @Default.Boolean(false) 38 | Boolean getOverwriteBigQueryTable(); 39 | 40 | void setOverwriteBigQueryTable(Boolean overwriteBigQueryTable); 41 | 42 | @Description("Perform lookups for gender and area") 43 | @Default.Boolean(false) 44 | Boolean getPerformLookups(); 45 | 46 | void setPerformLookups(Boolean performLookups); 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/bqetl/JSONReaderTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
package com.google.cloud.bqetl;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;

import com.google.cloud.bqetl.json.JSONReader;
import com.google.cloud.bqetl.mbdata.MusicBrainzDataObject;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.junit.Before;
import org.junit.Test;

/** Checks that {@link JSONReader} turns one line of exported artist JSON into namespaced columns. */
public class JSONReaderTest {

  /** First line of the {@code artist.json} test resource, loaded before each test. */
  private String testArtistJSON;

  /**
   * Loads the first line of {@code artist.json} from the test classpath.
   *
   * <p>The resource is decoded explicitly as UTF-8 (the platform default charset differs between
   * machines) and the reader is closed once the line has been read.
   */
  @Before
  public void setUp() throws Exception {
    InputStream stream = getClass().getClassLoader().getResourceAsStream("artist.json");
    // Fail with a readable message instead of a NullPointerException inside InputStreamReader.
    assertNotNull("artist.json is missing from the test classpath", stream);
    try (BufferedReader reader =
        new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))) {
      testArtistJSON = reader.readLine();
    }
  }

  /**
   * Parses the artist fixture and verifies the resulting columns.
   *
   * <p>The fixture has 19 fields of which 4 are JSON {@code null}; the reader skips nulls, so 15
   * columns are expected. JUnit assertions are used instead of the {@code assert} keyword so the
   * check does not depend on the JVM running with {@code -ea} and reports expected/actual values.
   */
  @Test
  public void readMap() {
    MusicBrainzDataObject musicBrainzDataObject = JSONReader.readObject("artist", testArtistJSON);
    List<Map.Entry<String, Object>> entries = new ArrayList<>();
    musicBrainzDataObject.getColumns().forEachRemaining(entries::add);
    System.out.printf(
        "Columns set for artist %s : %d ",
        musicBrainzDataObject.getColumnValue("artist_name"), entries.size());
    assertEquals("Justin Bieber", musicBrainzDataObject.getColumnValue("artist_name"));
    assertEquals(15, entries.size());
  }
}
# See the License for the specific language governing permissions and
# limitations under the License.

# Launches one of the BQETL Dataflow pipelines through Maven.
#
# Usage: run.sh (simple|simple-with-lookups|nested)
#
# Required environment variables: PROJECT_ID, DATASET, DESTINATION_TABLE,
# REGION, SERVICE_ACCOUNT, DATAFLOW_TEMP_BUCKET.
#
# Exits with status 1 (instead of the previous implicit 0) when the pipeline
# type or a required variable is missing, so callers and CI can detect failure.

case "$1" in
  "nested")
    CLASS_NAME=BQETLNested
    USE_LOOKUPS=""
    ;;
  "simple")
    CLASS_NAME=BQETLSimple
    USE_LOOKUPS=""
    ;;
  "simple-with-lookups")
    CLASS_NAME=BQETLSimple
    USE_LOOKUPS="--performLookups"
    ;;
  *)
    echo "Pipeline type not specified (simple|simple-with-lookups|nested)" >&2
    exit 1
    ;;
esac

# check_required_value VALUE NAME
# Aborts the script when VALUE is empty, reporting NAME as the missing setting.
# VALUE is quoted so that values containing spaces do not break the test.
check_required_value() {
  if [ -z "$1" ]; then
    echo "$2 was not provided" >&2
    exit 1
  fi
}

check_required_value "$PROJECT_ID" PROJECT_ID
check_required_value "$DATASET" DATASET
check_required_value "$DESTINATION_TABLE" DESTINATION_TABLE
check_required_value "$REGION" REGION
check_required_value "$SERVICE_ACCOUNT" SERVICE_ACCOUNT
check_required_value "$DATAFLOW_TEMP_BUCKET" DATAFLOW_TEMP_BUCKET

echo "Executing: "
# Echo the full mvn command line so the run can be reproduced by hand.
set -x

mvn compile exec:java -e \
  -Dexec.mainClass=com.google.cloud.bqetl.${CLASS_NAME} \
  -Dexec.args="\
    --project=${PROJECT_ID} \
    --loadingBucketURL=gs://solutions-public-assets/bqetl \
    --runner=DataflowRunner \
    --numWorkers=5 \
    --maxNumWorkers=10 \
    --bigQueryTablename=${PROJECT_ID}:${DATASET}.${DESTINATION_TABLE} \
    --region=${REGION} \
    --serviceAccount=${SERVICE_ACCOUNT} \
    --gcpTempLocation=${DATAFLOW_TEMP_BUCKET}/dftemp/ \
    --tempLocation=${DATAFLOW_TEMP_BUCKET}/temp/ \
    ${USE_LOOKUPS} \
  "
package com.google.cloud.bqetl.json;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.cloud.bqetl.mbdata.MusicBrainzDataObject;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** Uses the Jackson parser to turn a line of JSON into a {@link MusicBrainzDataObject}. */
public class JSONReader {

  private static final Logger logger = LoggerFactory.getLogger(JSONReader.class);
  private static final JsonFactory JSON_FACTORY = new JsonFactory();

  // One shared codec instead of a new ObjectMapper per parsed line: construction is expensive and
  // a configured ObjectMapper is documented as safe to share between threads.
  private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

  /**
   * Converts a scalar JSON node into a plain Java value.
   *
   * <p>The data is flat, so only text, number, boolean and null nodes are expected; anything else
   * is logged and treated as absent.
   *
   * @param node the field value to convert
   * @return a {@code Double} for floating-point numbers, a {@code Long} for every other number, a
   *     {@code String}, a {@code Boolean}, or {@code null} for JSON null and unsupported node types
   */
  private static Object nodeValueToObject(JsonNode node) {
    switch (node.getNodeType()) {
      case NUMBER:
        // isFloatingPointNumber() also covers BigDecimal-backed nodes, which the former
        // isFloat() || isDouble() test sent down the integer branch and truncated.
        if (node.isFloatingPointNumber()) {
          return node.doubleValue();
        }
        // For simplicity let all integers be Long.
        return node.asLong();
      case STRING:
        return node.asText();
      case BOOLEAN:
        return node.asBoolean();
      case NULL:
        return null;
      default:
        logger.warn("Unknown node type:{}", node.getNodeType());
        return null;
    }
  }

  /**
   * Copies every non-null scalar field of {@code jsonTree} into {@code datum}.
   *
   * <p>Each value is converted once and reused; fields whose converted value is {@code null} are
   * left out of the object.
   */
  private static void addColumns(MusicBrainzDataObject datum, JsonNode jsonTree) {
    Iterator<Map.Entry<String, JsonNode>> fields = jsonTree.fields();
    while (fields.hasNext()) {
      Map.Entry<String, JsonNode> entry = fields.next();
      if (entry.getValue() == null) {
        logger.warn("null value for entry : {}", entry.getKey());
        continue;
      }
      Object value = nodeValueToObject(entry.getValue());
      if (value != null) {
        datum.addColumnValue(entry.getKey(), value);
      }
    }
  }

  /**
   * Reads a MusicBrainzDataObject from a JSON string.
   *
   * <p>Parsing is best effort: on malformed input the error is logged and the object is returned
   * with whatever columns were read before the failure (possibly none). The parser is always
   * closed, including on failure.
   *
   * @param objectName the namespace for the object
   * @param json the JSON string, expected to hold one flat JSON object
   * @return the parsed object, never {@code null}
   */
  public static MusicBrainzDataObject readObject(String objectName, String json) {
    MusicBrainzDataObject datum = new MusicBrainzDataObject(objectName);
    try (JsonParser parser = JSON_FACTORY.createParser(json)) {
      parser.setCodec(OBJECT_MAPPER);
      while (!parser.isClosed()) {
        // nextToken() returns null at end of input; the identity comparison handles that.
        if (parser.nextToken() == JsonToken.START_OBJECT) {
          JsonNode jsonTree = parser.readValueAsTree();
          addColumns(datum, jsonTree);
        }
      }
    } catch (IOException | RuntimeException e) {
      // Same coverage as the previous catch (Exception): bad input must not fail the caller.
      logger.error("parse exception", e);
    }
    return datum;
  }
}
maven-enforcer-plugin 45 | 3.5.0 46 | 47 | 48 | enforce-maven 49 | 50 | enforce 51 | 52 | 53 | 54 | 55 | 3.2.5 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | org.codehaus.mojo 64 | versions-maven-plugin 65 | 2.16.2 66 | 67 | 68 | 69 | 70 | 71 | org.codehaus.mojo 72 | exec-maven-plugin 73 | ${exec-maven-plugin.version} 74 | 75 | false 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | org.apache.beam 84 | beam-sdks-java-core 85 | ${beam-version} 86 | 87 | 88 | org.apache.beam 89 | beam-runners-google-cloud-dataflow-java 90 | ${beam-version} 91 | runtime 92 | 93 | 94 | org.apache.beam 95 | beam-runners-direct-java 96 | ${beam-version} 97 | runtime 98 | 99 | 100 | org.apache.beam 101 | beam-sdks-java-io-google-cloud-platform 102 | ${beam-version} 103 | 104 | 105 | 106 | org.slf4j 107 | slf4j-api 108 | ${slf4j.version} 109 | 110 | 111 | org.slf4j 112 | slf4j-jdk14 113 | ${slf4j.version} 114 | 115 | 116 | junit 117 | junit 118 | ${junit.version} 119 | test 120 | 121 | 122 | org.hamcrest 123 | hamcrest 124 | ${hamcrest.version} 125 | test 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/bqetl/mbdata/MusicBrainzDataObject.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
package com.google.cloud.bqetl.mbdata;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Objects;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Represents one row of MusicBrainz data from any table as a wrapped {@code Map<String, Object>}.
 *
 * <p>Each column added through {@link #addColumnValue(String, Object)} is stored under the key
 * {@code tablename_columnname}, where {@code tablename} is this object's namespace, so that
 * columns from different tables stay distinguishable after rows are merged.
 */
public class MusicBrainzDataObject implements Serializable {

  private static final Logger LOG = LoggerFactory.getLogger(MusicBrainzDataObject.class);

  // the namespace field for this object, typically the name of the table from musicbrainz
  private final String namespace;

  // the namespaced column names and their values.
  private Map<String, Object> columns = new HashMap<>();

  /**
   * Constructs a new MusicBrainzDataObject with the namespace (tablename in the RDBMS) set to the
   * supplied argument.
   *
   * @param namespace namespace for this MusicBrainzDataObject
   */
  public MusicBrainzDataObject(String namespace) {
    this.namespace = namespace;
  }

  /**
   * Gets the namespace for this MusicBrainzDataObject.
   *
   * @return name of the table for this MusicBrainzDataObject
   */
  public String getNamespace() {
    return namespace;
  }

  /**
   * Gets an iterator over all the columns of this MusicBrainzDataObject.
   *
   * @return an iterator over the live entries (namespaced name to value) of this object
   */
  public Iterator<Map.Entry<String, Object>> getColumns() {
    return columns.entrySet().iterator();
  }

  /**
   * Adds a column value to this object, prepending the namespace to the name. The resulting key
   * is {@code tablename_columnname}.
   *
   * @param name the name of the column, without namespace
   * @param value the value for the column
   */
  public void addColumnValue(String name, Object value) {
    columns.put(getNamespace() + "_" + name, value);
  }

  /**
   * Removes a column value from this object.
   *
   * @param name name of the column to delete (including namespace)
   * @return the removed value, or {@code null} if there was no such column
   */
  public Object removeColumnValue(String name) {
    return columns.remove(name);
  }

  /**
   * Gets a column value by its namespaced name.
   *
   * @param name a string of the format {@code tablename_columnname}
   * @return the stored value, or {@code null} if there is no such column
   */
  public Object getColumnValue(String name) {
    return columns.get(name);
  }

  /**
   * Merges another MusicBrainzDataObject's entries into this object. The other object's keys are
   * copied unchanged: an entry {@code othertablename_othercolumnname} keeps that key here, and
   * this object's namespace is not changed. When a key exists on both sides a warning is logged
   * and the other object's value wins.
   *
   * @param other the row to merge into this one; {@code null} is ignored
   * @return this object, to allow chaining
   */
  public MusicBrainzDataObject merge(MusicBrainzDataObject other) {
    if (other == null) {
      return this;
    }
    other.columns.forEach(
        (key, value) -> {
          if (columns.containsKey(key)) {
            // The former concatenated message ran the key into the word "found".
            LOG.warn(
                "Duplicate key: {} found merging MusicBrainzDataObject {} with {}",
                key,
                namespace,
                other.getNamespace());
          }
          columns.put(key, value);
        });
    return this;
  }

  /**
   * Replaces the value of an existing column; does nothing if the column is not present.
   *
   * @param key the namespaced column name
   * @param value the new value
   */
  public void replace(String key, Object value) {
    columns.replace(key, value);
  }

  /**
   * Makes a shallow copy of this object: the copy has its own column map, but the values in it
   * are shared with this object.
   */
  public MusicBrainzDataObject duplicate() {
    MusicBrainzDataObject copy = new MusicBrainzDataObject(namespace);
    // The copy constructor replaces the unchecked cast + clone() and works for any Map type.
    copy.columns = new HashMap<>(columns);
    return copy;
  }

  /** Two objects are equal when both their namespaces and their column maps are equal. */
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (!(o instanceof MusicBrainzDataObject)) {
      return false;
    }
    MusicBrainzDataObject that = (MusicBrainzDataObject) o;
    // Objects.equals tolerates a null namespace, matching hashCode() below.
    return Objects.equals(namespace, that.namespace) && columns.equals(that.columns);
  }

  @Override
  public int hashCode() {
    return Objects.hash(namespace, columns);
  }

  @Override
  public String toString() {
    return "MusicBrainzDataObject{"
        + "namespace='"
        + namespace
        + '\''
        + ", columns="
        + columns
        + '}';
  }
}
15 | */ 16 | 17 | package com.google.cloud.bqetl; 18 | 19 | import com.google.api.services.bigquery.model.TableRow; 20 | import com.google.api.services.bigquery.model.TableSchema; 21 | import com.google.cloud.bqetl.mbdata.MusicBrainzDataObject; 22 | import com.google.cloud.bqetl.mbdata.MusicBrainzTransforms; 23 | import com.google.cloud.bqetl.mbschema.FieldSchemaListBuilder; 24 | import com.google.cloud.bqetl.options.BQETLOptions; 25 | import org.apache.beam.sdk.Pipeline; 26 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; 27 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 28 | import org.apache.beam.sdk.values.KV; 29 | import org.apache.beam.sdk.values.PCollection; 30 | import org.slf4j.Logger; 31 | import org.slf4j.LoggerFactory; 32 | 33 | public class BQETLNested { 34 | private static final Logger logger = LoggerFactory.getLogger(BQETLNested.class); 35 | 36 | public static void main(String[] args) { 37 | PipelineOptionsFactory.register(BQETLOptions.class); 38 | 39 | /* 40 | * get the custom options 41 | */ 42 | BQETLOptions options = 43 | PipelineOptionsFactory.fromArgs(args).withValidation().as(BQETLOptions.class); 44 | Pipeline p = Pipeline.create(options); 45 | 46 | /* 47 | * load the line delimited JSON into keyed PCollections 48 | */ 49 | 50 | PCollection> artists = 51 | MusicBrainzTransforms.loadTable( 52 | p, 53 | "artist", 54 | "id", 55 | MusicBrainzTransforms.lookup("area", "id", "name", "area", "begin_area"), 56 | MusicBrainzTransforms.lookup("gender", "id", "name", "gender")); 57 | PCollection> artistCreditName = 58 | MusicBrainzTransforms.loadTable(p, "artist_credit_name", "artist_credit"); 59 | PCollection> recordingsByArtistCredit = 60 | MusicBrainzTransforms.loadTable(p, "recording", "artist_credit"); 61 | 62 | PCollection recordingCredits = 63 | MusicBrainzTransforms.innerJoin( 64 | "nested recordings", artistCreditName, recordingsByArtistCredit); 65 | 66 | PCollection artistsWithRecordings = 67 | MusicBrainzTransforms.nest( 
68 | artists, 69 | MusicBrainzTransforms.by("artist_credit_name_artist", recordingCredits), 70 | "recordings"); 71 | 72 | /* 73 | * create the table schema for Big Query 74 | */ 75 | TableSchema bqTableSchema = bqSchema(); 76 | /* 77 | * transform the joined MusicBrainzDataObject results into BQ Table rows 78 | */ 79 | PCollection tableRows = 80 | MusicBrainzTransforms.transformToTableRows(artistsWithRecordings, bqTableSchema); 81 | /* 82 | * write the tablerows to Big Query 83 | */ 84 | try { 85 | tableRows.apply( 86 | "Write to BigQuery", 87 | BigQueryIO.writeTableRows() 88 | .to(options.getBigQueryTablename()) 89 | .withSchema(bqTableSchema) 90 | .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE) 91 | .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)); 92 | } catch (Exception e) { 93 | logger.error("error writing to BQ: ", e); 94 | } 95 | p.run().waitUntilFinish(); 96 | } 97 | 98 | private static TableSchema bqSchema() { 99 | return FieldSchemaListBuilder.create() 100 | .intField("artist_id") 101 | .stringField("artist_gid") 102 | .stringField("artist_name") 103 | .stringField("artist_sort_name") 104 | .intField("artist_begin_date_year") 105 | .intField("artist_begin_date_month") 106 | .intField("artist_begin_date_day") 107 | .intField("artist_end_date_year") 108 | .intField("artist_end_date_month") 109 | .intField("artist_end_date_day") 110 | .intField("artist_type") 111 | .stringField("artist_area") 112 | .stringField("artist_gender") 113 | .intField("artist_edits_pending") 114 | .timestampField("artist_last_updated") 115 | .stringField("artist_comment") 116 | .boolField("artist_ended") 117 | .stringField("artist_begin_area") 118 | .field( 119 | FieldSchemaListBuilder.create() 120 | .intField("artist_credit_name_artist_credit") 121 | .intField("artist_credit_name_position") 122 | .intField("artist_credit_name_artist") 123 | .stringField("artist_credit_name_name") 124 | 
.stringField("artist_credit_name_join_phrase") 125 | .intField("recording_id") 126 | .stringField("recording_gid") 127 | .stringField("recording_name") 128 | .intField("recording_length") 129 | .stringField("recording_comment") 130 | .intField("recording_edits_pending") 131 | .timestampField("recording_last_updated") 132 | .boolField("recording_video") 133 | .repeatedRecord("artist_recordings")) 134 | .schema(); 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/bqetl/BQETLSimple.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.google.cloud.bqetl; 18 | 19 | import com.google.api.services.bigquery.model.TableRow; 20 | import com.google.api.services.bigquery.model.TableSchema; 21 | import com.google.cloud.bqetl.mbdata.MusicBrainzDataObject; 22 | import com.google.cloud.bqetl.mbdata.MusicBrainzTransforms; 23 | import com.google.cloud.bqetl.mbschema.FieldSchemaListBuilder; 24 | import com.google.cloud.bqetl.options.BQETLOptions; 25 | import org.apache.beam.sdk.Pipeline; 26 | import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; 27 | import org.apache.beam.sdk.options.PipelineOptionsFactory; 28 | import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider; 29 | import org.apache.beam.sdk.values.KV; 30 | import org.apache.beam.sdk.values.PCollection; 31 | 32 | /** 33 | * This is a pipeline that denormalizes exported data from the musicbrainz dataset to create a 34 | * flattened, denormalized Big Query table of artists' recordings that repeats artist information 35 | * for each of their credited recordings. 36 | * 37 | *

In addition to standard Pipeline parameters, this main program takes the following additional 38 | * parameters: --bigQueryTablename= :. 39 | * --loadingBucketURL=gs:// 40 | * 41 | *

An example of how to run this pipeline: mvn compile exec:java \ -Dexec.mainClass=BQETLSimple \ 42 | * -Dexec.args="--project=jlb-onboarding \ --loadingBucketURL=gs://mb-data \ 43 | * --stagingLocation=gs://mb-data \ --runner=BlockingDataflowPipelineRunner \ --numWorkers=185 \ 44 | * --maxNumWorkers=500 \ --bigQueryTablename=example_project:example_dataset.example_table \ 45 | * --diskSizeGb=1000 \ --workerMachineType=n1-standard-1" 46 | */ 47 | public class BQETLSimple { 48 | 49 | public static void main(String[] args) { 50 | PipelineOptionsFactory.register(BQETLOptions.class); 51 | 52 | /* 53 | * get the custom options 54 | */ 55 | BQETLOptions options = 56 | PipelineOptionsFactory.fromArgs(args).withValidation().as(BQETLOptions.class); 57 | Pipeline p = Pipeline.create(options); 58 | 59 | /* 60 | * load the line delimited JSON files into keyed PCollections 61 | */ 62 | PCollection> artists; 63 | if (options.getPerformLookups()) { 64 | // [START loadArtistsWithLookups] 65 | artists = 66 | MusicBrainzTransforms.loadTable( 67 | p, 68 | "artist", 69 | "id", 70 | MusicBrainzTransforms.lookup("area", "id", "name", "area", "begin_area"), 71 | MusicBrainzTransforms.lookup("gender", "id", "name", "gender")); 72 | // [END loadArtistsWithLookups] 73 | } else { 74 | artists = MusicBrainzTransforms.loadTable(p, "artist", "id"); 75 | } 76 | PCollection> artistCreditName = 77 | MusicBrainzTransforms.loadTable(p, "artist_credit_name", "artist"); 78 | PCollection> recordingsByArtistCredit = 79 | MusicBrainzTransforms.loadTable(p, "recording", "artist_credit"); 80 | 81 | /* 82 | * perform inner joins 83 | */ 84 | // [START artist_artist_credit_join] 85 | PCollection artistCredits = 86 | MusicBrainzTransforms.innerJoin("artists with artist credits", artists, artistCreditName); 87 | // [END artist_artist_credit_join] 88 | // [START byCall] 89 | PCollection> artistCreditNamesByArtistCredit = 90 | MusicBrainzTransforms.by("artist_credit_name_artist_credit", artistCredits); 91 | // 
[END byCall] 92 | // [START joinCall] 93 | PCollection artistRecordings = 94 | MusicBrainzTransforms.innerJoin( 95 | "joined recordings", artistCreditNamesByArtistCredit, recordingsByArtistCredit); 96 | // [END joinCall] 97 | 98 | /* 99 | * create the table schema for Big Query 100 | */ 101 | TableSchema bqTableSchema = bqSchema(options.getPerformLookups()); 102 | /* 103 | * transform the joined MusicBrainzDataObject results into BQ Table rows 104 | */ 105 | // [START transformToTableRowCall] 106 | PCollection tableRows = 107 | MusicBrainzTransforms.transformToTableRows(artistRecordings, bqTableSchema); 108 | // [END transformToTableRowCall] 109 | /* 110 | * write the tablerows to Big Query 111 | */ 112 | // [START bigQueryWrite] 113 | tableRows.apply( 114 | "Write to BigQuery", 115 | BigQueryIO.writeTableRows() 116 | .to(options.getBigQueryTablename()) 117 | .withSchema(bqTableSchema) 118 | .withCustomGcsTempLocation(StaticValueProvider.of(options.getTempLocation())) 119 | .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE) 120 | .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)); 121 | // [END bigQueryWrite] 122 | 123 | p.run().waitUntilFinish(); 124 | } 125 | 126 | private static TableSchema bqSchema(boolean usingAreaGenderLookups) { 127 | FieldSchemaListBuilder fieldSchemaListBuilder = new FieldSchemaListBuilder(); 128 | 129 | fieldSchemaListBuilder 130 | .intField("artist_id") 131 | .stringField("artist_gid") 132 | .stringField("artist_name") 133 | .stringField("artist_sort_name") 134 | .intField("artist_begin_date_year") 135 | .intField("artist_begin_date_month") 136 | .intField("artist_begin_date_day") 137 | .intField("artist_end_date_year") 138 | .intField("artist_end_date_month") 139 | .intField("artist_end_date_day") 140 | .intField("artist_type") 141 | .intField("artist_edits_pending") 142 | .timestampField("artist_last_updated") 143 | .stringField("artist_comment") 144 | .boolField("artist_ended") 145 | 
.intField("artist_credit_name_artist_credit") 146 | .intField("artist_credit_name_position") 147 | .intField("artist_credit_name_artist") 148 | .stringField("artist_credit_name_name") 149 | .stringField("artist_credit_name_join_phrase") 150 | .intField("recording_id") 151 | .stringField("recording_gid") 152 | .stringField("recording_name") 153 | .intField("recording_artist_credit") 154 | .intField("recording_length") 155 | .stringField("recording_comment") 156 | .intField("recording_edits_pending") 157 | .timestampField("recording_last_updated") 158 | .boolField("recording_video"); 159 | 160 | if (usingAreaGenderLookups) { 161 | fieldSchemaListBuilder 162 | .stringField("artist_area") 163 | .stringField("artist_gender") 164 | .stringField("artist_begin_area"); 165 | } else { 166 | fieldSchemaListBuilder 167 | .intField("artist_area") 168 | .intField("artist_gender") 169 | .intField("artist_begin_area"); 170 | } 171 | 172 | return fieldSchemaListBuilder.schema(); 173 | } 174 | } 175 | -------------------------------------------------------------------------------- /src/test/resources/artist_credit_name.json: -------------------------------------------------------------------------------- 1 | {"artist_credit":634509,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":""} 2 | {"artist_credit":820684,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":""} 3 | {"artist_credit":835671,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":""} 4 | {"artist_credit":846330,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 5 | {"artist_credit":846331,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 6 | {"artist_credit":846332,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 7 | {"artist_credit":857848,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. 
"} 8 | {"artist_credit":857849,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" & "} 9 | {"artist_credit":871457,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":""} 10 | {"artist_credit":999502,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 11 | {"artist_credit":999503,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 12 | {"artist_credit":904694,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" with "} 13 | {"artist_credit":904702,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 14 | {"artist_credit":904703,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 15 | {"artist_credit":904704,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 16 | {"artist_credit":904705,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 17 | {"artist_credit":890575,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 18 | {"artist_credit":890576,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 19 | {"artist_credit":890577,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 20 | {"artist_credit":897870,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":" & "} 21 | {"artist_credit":999754,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":" & "} 22 | {"artist_credit":917362,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 23 | {"artist_credit":999099,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. 
"} 24 | {"artist_credit":1007132,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":""} 25 | {"artist_credit":919196,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":""} 26 | {"artist_credit":964779,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":""} 27 | {"artist_credit":1027242,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 28 | {"artist_credit":1052228,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":""} 29 | {"artist_credit":1067702,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" & "} 30 | {"artist_credit":1093597,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" & "} 31 | {"artist_credit":1170806,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":", "} 32 | {"artist_credit":1122987,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":""} 33 | {"artist_credit":1141329,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":""} 34 | {"artist_credit":1182229,"position":2,"artist":634509,"name":"Justin Bieber","join_phrase":""} 35 | {"artist_credit":1183195,"position":2,"artist":634509,"name":"Justin Bieber","join_phrase":" & "} 36 | {"artist_credit":1482172,"position":2,"artist":634509,"name":"Justin Bieber","join_phrase":""} 37 | {"artist_credit":1208426,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" & "} 38 | {"artist_credit":1218129,"position":2,"artist":634509,"name":"Justin Bieber","join_phrase":""} 39 | {"artist_credit":1234197,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":""} 40 | {"artist_credit":1239695,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 41 | {"artist_credit":1247041,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 42 | {"artist_credit":1249078,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. 
"} 43 | {"artist_credit":1249079,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 44 | {"artist_credit":1286965,"position":2,"artist":634509,"name":"Justin Bieber","join_phrase":""} 45 | {"artist_credit":1293773,"position":5,"artist":634509,"name":"Justin Bieber","join_phrase":", "} 46 | {"artist_credit":1294704,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":" & "} 47 | {"artist_credit":1331108,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":""} 48 | {"artist_credit":1345536,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 49 | {"artist_credit":1355031,"position":2,"artist":634509,"name":"Justin Bieber","join_phrase":""} 50 | {"artist_credit":1356581,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":""} 51 | {"artist_credit":1420071,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":""} 52 | {"artist_credit":1436473,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" & "} 53 | {"artist_credit":1493844,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":""} 54 | {"artist_credit":1529552,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":" & "} 55 | {"artist_credit":1534055,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":""} 56 | {"artist_credit":1561294,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":""} 57 | {"artist_credit":1564254,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 58 | {"artist_credit":1564256,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 59 | {"artist_credit":1564257,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 60 | {"artist_credit":1564258,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. 
"} 61 | {"artist_credit":1564259,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" & "} 62 | {"artist_credit":1564540,"position":2,"artist":634509,"name":"Justin Bieber","join_phrase":""} 63 | {"artist_credit":1568753,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":" & "} 64 | {"artist_credit":1586223,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. "} 65 | {"artist_credit":1588820,"position":0,"artist":634509,"name":"Justin Bieber feat. Usher","join_phrase":""} 66 | {"artist_credit":1588821,"position":0,"artist":634509,"name":"Justin Bieber feat. Ludacris","join_phrase":""} 67 | {"artist_credit":1588822,"position":0,"artist":634509,"name":"Justin Bieber feat. Jessica Jarrell","join_phrase":""} 68 | {"artist_credit":1588823,"position":0,"artist":634509,"name":"Justin Bieber with Sean Kingston","join_phrase":""} 69 | {"artist_credit":1596359,"position":2,"artist":634509,"name":"Justin Bieber","join_phrase":""} 70 | {"artist_credit":1602071,"position":2,"artist":634509,"name":"Justin Bieber","join_phrase":""} 71 | {"artist_credit":1602072,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" feat. 
"} 72 | {"artist_credit":1620423,"position":0,"artist":634509,"name":"Justin Bieber","join_phrase":" duet with "} 73 | {"artist_credit":1625803,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":""} 74 | {"artist_credit":1630569,"position":3,"artist":634509,"name":"Justin Bieber","join_phrase":""} 75 | {"artist_credit":1656741,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":" & "} 76 | {"artist_credit":1674737,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":""} 77 | {"artist_credit":1689843,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":""} 78 | {"artist_credit":1690683,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":""} 79 | {"artist_credit":1690777,"position":1,"artist":634509,"name":"Justin Bieber","join_phrase":" & "} 80 | -------------------------------------------------------------------------------- /src/test/java/com/google/cloud/bqetl/mbdata/MusicBrainzTransformsTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.google.cloud.bqetl.mbdata; 18 | 19 | import static java.util.stream.Collectors.toList; 20 | 21 | import java.io.BufferedReader; 22 | import java.io.InputStream; 23 | import java.io.InputStreamReader; 24 | import java.util.AbstractMap.SimpleEntry; 25 | import java.util.ArrayList; 26 | import java.util.Collections; 27 | import java.util.List; 28 | import java.util.Map; 29 | import org.apache.beam.sdk.coders.StringUtf8Coder; 30 | import org.apache.beam.sdk.testing.PAssert; 31 | import org.apache.beam.sdk.testing.TestPipeline; 32 | import org.apache.beam.sdk.transforms.Count; 33 | import org.apache.beam.sdk.transforms.Create; 34 | import org.apache.beam.sdk.transforms.Keys; 35 | import org.apache.beam.sdk.transforms.MapElements; 36 | import org.apache.beam.sdk.values.KV; 37 | import org.apache.beam.sdk.values.PCollection; 38 | import org.apache.beam.sdk.values.PCollectionView; 39 | import org.apache.beam.sdk.values.TypeDescriptor; 40 | import org.junit.runner.RunWith; 41 | import org.junit.runners.JUnit4; 42 | import org.slf4j.Logger; 43 | import org.slf4j.LoggerFactory; 44 | 45 | @RunWith(JUnit4.class) 46 | public class MusicBrainzTransformsTest { 47 | 48 | private List artistCreditLinesOfJson; 49 | private List recordingLinesOfJson; 50 | private List artistLinesOfJson; 51 | private List areaLinesOfJson; 52 | 53 | static final Logger LOG = LoggerFactory.getLogger(MusicBrainzTransformsTest.class); 54 | 55 | @org.junit.Test 56 | public void loadArtistCreditsByKey() { 57 | 58 | TestPipeline p = TestPipeline.create().enableAbandonedNodeEnforcement(false); 59 | 60 | Long[] artistCreditIds = {634509L, 846332L}; 61 | PCollection text = 62 | p.apply(Create.of(artistCreditLinesOfJson)).setCoder(StringUtf8Coder.of()); 63 | PCollection> artistCredits = 64 | MusicBrainzTransforms.loadTableFromText(text, "artist_credit_name", "artist_credit"); 65 | 66 | PCollection artistCreditIdPCollection = artistCredits.apply(Keys.create()); 67 | 
PAssert.that(artistCreditIdPCollection).containsInAnyOrder(634509L, 846332L); 68 | } 69 | 70 | @org.junit.Test 71 | public void joinArtistCreditsWithRecordings() { 72 | 73 | TestPipeline p = TestPipeline.create().enableAbandonedNodeEnforcement(false); 74 | 75 | PCollection artistCreditText = 76 | p.apply("artistCredits", Create.of(artistCreditLinesOfJson)).setCoder(StringUtf8Coder.of()); 77 | 78 | PCollection> artistCredits = 79 | MusicBrainzTransforms.loadTableFromText( 80 | artistCreditText, "artist_credit_name", "artist_credit"); 81 | 82 | PCollection recordingText = 83 | p.apply("recordings", Create.of(recordingLinesOfJson)).setCoder(StringUtf8Coder.of()); 84 | 85 | PCollection> recordings = 86 | MusicBrainzTransforms.loadTableFromText(recordingText, "recording", "artist_credit"); 87 | 88 | PCollection joinedRecordings = 89 | MusicBrainzTransforms.innerJoin( 90 | "artist credits with recordings", artistCredits, recordings); 91 | 92 | PCollection recordingIds = 93 | joinedRecordings.apply( 94 | MapElements.into(new TypeDescriptor() {}) 95 | .via((MusicBrainzDataObject mbo) -> (Long) mbo.getColumnValue("recording_id"))); 96 | 97 | Long bieberRecording = 17069165L; 98 | Long bieberRecording2 = 15508507L; 99 | 100 | PAssert.that(recordingIds) 101 | .satisfies( 102 | (longs) -> { 103 | List theList = new ArrayList<>(); 104 | longs.forEach(theList::add); 105 | assert (theList.contains(bieberRecording)); 106 | assert (theList.contains(bieberRecording2)); 107 | return null; 108 | }); 109 | 110 | PCollection numberJoined = 111 | joinedRecordings.apply("count joined recordings", Count.globally()); 112 | PCollection numberOfArtistCredits = 113 | artistCredits.apply("count artist credits", Count.globally()); 114 | 115 | PAssert.thatSingleton(numberJoined).isEqualTo(448L); 116 | 117 | p.run(); 118 | } 119 | 120 | @org.junit.Test 121 | public void loadArtistsWithMapping() { 122 | 123 | TestPipeline p = TestPipeline.create().enableAbandonedNodeEnforcement(false); 124 | 125 | 
PCollection artistText = 126 | p.apply("artist", Create.of(artistLinesOfJson)).setCoder(StringUtf8Coder.of()); 127 | 128 | List, PCollectionView>>> maps = new ArrayList<>(); 129 | 130 | PCollection areaMapText = 131 | p.apply("area", Create.of(areaLinesOfJson)).setCoder(StringUtf8Coder.of()); 132 | PCollectionView> areamap = 133 | MusicBrainzTransforms.loadMapFromText(areaMapText, "id", "area", "area"); 134 | 135 | maps.add(new SimpleEntry<>(Collections.singletonList("area"), areamap)); 136 | 137 | PCollection> loadedArtists = 138 | MusicBrainzTransforms.loadTableFromText(artistText, "artist", "id", maps); 139 | 140 | PCollection areas = 141 | loadedArtists.apply( 142 | "areaLabels", 143 | MapElements.into(new TypeDescriptor() {}) 144 | .via( 145 | (KV row) -> 146 | (String) row.getValue().getColumnValue("area"))); 147 | 148 | PAssert.that(areas) 149 | .satisfies( 150 | (areaLabels) -> { 151 | List theList = new ArrayList<>(); 152 | areaLabels.forEach(theList::add); 153 | assert (theList.contains("Canada")); 154 | return null; 155 | }); 156 | } 157 | 158 | @org.junit.Test 159 | public void testNest() { 160 | TestPipeline p = TestPipeline.create().enableAbandonedNodeEnforcement(false); 161 | PCollection artistText = 162 | p.apply("artist", Create.of(artistLinesOfJson)).setCoder(StringUtf8Coder.of()); 163 | PCollection artistCreditNameText = 164 | p.apply("artist_credit_name", Create.of(artistCreditLinesOfJson)); 165 | PCollection recordingText = 166 | p.apply("recording", Create.of(recordingLinesOfJson)).setCoder(StringUtf8Coder.of()); 167 | 168 | PCollection> artistsById = 169 | MusicBrainzTransforms.loadTableFromText(artistText, "artist", "id"); 170 | 171 | PCollection> recordingsByArtistCredit = 172 | MusicBrainzTransforms.loadTableFromText(recordingText, "recording", "artist_credit"); 173 | PCollection> artistCreditByArtistCredit = 174 | MusicBrainzTransforms.loadTableFromText( 175 | artistCreditNameText, "artist_credit_name", "artist_credit"); 176 | 177 | 
PCollection recordingsWithCredits = 178 | MusicBrainzTransforms.innerJoin( 179 | "credited recordings", artistCreditByArtistCredit, recordingsByArtistCredit); 180 | PCollection> recordingsJoinedWithCredits = 181 | MusicBrainzTransforms.by("artist_credit_name_artist", recordingsWithCredits); 182 | PCollection artistsWithNestedRecordings = 183 | MusicBrainzTransforms.nest(artistsById, recordingsJoinedWithCredits, "recordings"); 184 | 185 | PAssert.that(artistsWithNestedRecordings) 186 | .satisfies( 187 | (artistCollection) -> { 188 | List theList = new ArrayList<>(); 189 | artistCollection.forEach(theList::add); 190 | 191 | assert (theList.size() == 1); 192 | @SuppressWarnings("unchecked") 193 | List artist_recordings = 194 | (List) theList.get(0).getColumnValue("artist_recordings"); 195 | assert (artist_recordings.size() == 448); 196 | 197 | return null; 198 | }); 199 | 200 | p.run(); 201 | } 202 | 203 | @org.junit.Before 204 | public void setUp() { 205 | ClassLoader classLoader = getClass().getClassLoader(); 206 | InputStream stream = classLoader.getResourceAsStream("artist_credit_name.json"); 207 | BufferedReader reader = new BufferedReader(new InputStreamReader(stream)); 208 | artistCreditLinesOfJson = reader.lines().collect(toList()); 209 | 210 | stream = classLoader.getResourceAsStream("recording.json"); 211 | reader = new BufferedReader(new InputStreamReader(stream)); 212 | recordingLinesOfJson = reader.lines().collect(toList()); 213 | 214 | stream = classLoader.getResourceAsStream("artist.json"); 215 | reader = new BufferedReader(new InputStreamReader(stream)); 216 | artistLinesOfJson = reader.lines().collect(toList()); 217 | 218 | stream = classLoader.getResourceAsStream("area.json"); 219 | reader = new BufferedReader(new InputStreamReader(stream)); 220 | areaLinesOfJson = reader.lines().collect(toList()); 221 | } 222 | } 223 | -------------------------------------------------------------------------------- 
/*
 * Copyright 2022 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.cloud.bqetl.mbschema;

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableSchema;
import java.util.ArrayList;
import java.util.List;

/**
 * Convenience builder for assembling a list of {@link TableFieldSchema}s.
 *
 * <p>The typed {@code xxxField} methods append a field to this builder's list and return the
 * builder so calls can be chained; {@link #schema()} wraps the list in a {@link TableSchema}.
 */
public class FieldSchemaListBuilder {

  public static final String INTEGER = "INTEGER";
  public static final String STRING = "STRING";
  public static final String TIMESTAMP = "TIMESTAMP";
  public static final String BOOLEAN = "BOOLEAN";
  public static final String RECORD = "RECORD";
  public static final String REQUIRED = "REQUIRED";
  public static final String NULLABLE = "NULLABLE";
  public static final String REPEATED = "REPEATED";

  final List<TableFieldSchema> schemaFields = new ArrayList<>();

  /**
   * Factory method returning a new, empty builder.
   *
   * @return a new FieldSchemaListBuilder
   */
  public static FieldSchemaListBuilder create() {
    return new FieldSchemaListBuilder();
  }

  /**
   * Creates a RECORD field whose nested fields are those of the given builder, so that it may be
   * added to another FieldSchemaListBuilder. The returned field has no name or mode set.
   *
   * @param list the builder whose field list is passed to the record field
   * @return the new record field
   */
  public TableFieldSchema fieldSchema(FieldSchemaListBuilder list) {
    TableFieldSchema tfs = new TableFieldSchema();
    tfs.setType(RECORD);
    // The builder's own list instance is handed over, not a copy.
    tfs.setFields(list.schemaFields);
    return tfs;
  }

  /**
   * Creates a field with the given type, name and mode and an empty description. The field is
   * not added to this builder's list.
   *
   * @param type the datatype, see https://cloud.google.com/bigquery/data-types
   * @param name the name of the field
   * @param mode the mode of the field
   * @return the new field
   * @see TableFieldSchema
   */
  public TableFieldSchema fieldSchema(String type, String name, String mode) {
    return fieldSchema(type, name, mode, "");
  }

  /**
   * Creates a field with all parameters given explicitly. The field is not added to this
   * builder's list.
   *
   * @param type the datatype, see https://cloud.google.com/bigquery/data-types
   * @param name the name of the field
   * @param mode the mode of the field
   * @param description a description of the field
   * @return the new field
   * @see TableFieldSchema
   */
  public TableFieldSchema fieldSchema(String type, String name, String mode, String description) {
    TableFieldSchema tfs = new TableFieldSchema();
    tfs.setType(type);
    tfs.setName(name);
    tfs.setMode(mode);
    tfs.setDescription(description);
    return tfs;
  }

  /**
   * Appends an INTEGER field with the given name, mode and description.
   *
   * @param name the name of the field
   * @param mode the mode of the field
   * @param description a description of the field
   * @return this
   * @see TableFieldSchema
   */
  public FieldSchemaListBuilder intField(String name, String mode, String description) {
    schemaFields.add(fieldSchema(INTEGER, name, mode, description));
    return this;
  }

  /**
   * Appends an INTEGER field with the given name and mode and an empty description.
   *
   * @param name the name of the field
   * @param mode the mode of the field
   * @return this
   * @see TableFieldSchema
   */
  public FieldSchemaListBuilder intField(String name, String mode) {
    return intField(name, mode, "");
  }

  /**
   * Appends a NULLABLE INTEGER field with the given name and an empty description.
   *
   * @param name the name of the field
   * @return this
   * @see TableFieldSchema
   */
  public FieldSchemaListBuilder intField(String name) {
    return intField(name, NULLABLE);
  }

  /**
   * Appends a STRING field with the given name, mode and description.
   *
   * @param name the name of the field
   * @param mode the mode of the field
   * @param description a description of the field
   * @return this
   * @see TableFieldSchema
   */
  public FieldSchemaListBuilder stringField(String name, String mode, String description) {
    schemaFields.add(fieldSchema(STRING, name, mode, description));
    return this;
  }

  /**
   * Appends a STRING field with the given name and mode and an empty description.
   *
   * @param name the name of the field
   * @param mode the mode of the field
   * @return this
   * @see TableFieldSchema
   */
  public FieldSchemaListBuilder stringField(String name, String mode) {
    return stringField(name, mode, "");
  }

  /**
   * Appends a NULLABLE STRING field with the given name and an empty description.
   *
   * @param name the name of the field
   * @return this
   * @see TableFieldSchema
   */
  public FieldSchemaListBuilder stringField(String name) {
    return stringField(name, NULLABLE);
  }

  /**
   * Appends a BOOLEAN field with the given name, mode and description.
   *
   * @param name the name of the field
   * @param mode the mode of the field
   * @param description a description of the field
   * @return this
   * @see TableFieldSchema
   */
  public FieldSchemaListBuilder boolField(String name, String mode, String description) {
    schemaFields.add(fieldSchema(BOOLEAN, name, mode, description));
    return this;
  }

  /**
   * Appends a BOOLEAN field with the given name and mode and an empty description.
   *
   * @param name the name of the field
   * @param mode the mode of the field
   * @return this
   * @see TableFieldSchema
   */
  public FieldSchemaListBuilder boolField(String name, String mode) {
    return boolField(name, mode, "");
  }

  /**
   * Appends a NULLABLE BOOLEAN field with the given name and an empty description.
   *
   * @param name the name of the field
   * @return this
   * @see TableFieldSchema
   */
  public FieldSchemaListBuilder boolField(String name) {
    return boolField(name, NULLABLE);
  }

  /**
   * Appends a TIMESTAMP field with the given name, mode and description.
   *
   * @param name the name of the field
   * @param mode the mode of the field
   * @param description a description of the field
   * @return this
   * @see TableFieldSchema
   */
  public FieldSchemaListBuilder timestampField(String name, String mode, String description) {
    schemaFields.add(fieldSchema(TIMESTAMP, name, mode, description));
    return this;
  }

  /**
   * Appends a TIMESTAMP field with the given name and mode and an empty description.
   *
   * @param name the name of the field
   * @param mode the mode of the field
   * @return this
   * @see TableFieldSchema
   */
  public FieldSchemaListBuilder timestampField(String name, String mode) {
    return timestampField(name, mode, "");
  }

  /**
   * Appends a NULLABLE TIMESTAMP field with the given name and an empty description.
   *
   * @param name the name of the field
   * @return this
   */
  public FieldSchemaListBuilder timestampField(String name) {
    return timestampField(name, NULLABLE);
  }

  /**
   * Appends an already-built field to this builder's list of fields.
   *
   * @param field the field to add
   * @return this
   */
  public FieldSchemaListBuilder field(TableFieldSchema field) {
    schemaFields.add(field);
    return this;
  }

  /**
   * Returns a new REPEATED RECORD field whose nested fields are this builder's fields. The field
   * is not added to this builder's list.
   *
   * @param name the name of the field
   * @return the new repeated record field
   */
  public TableFieldSchema repeatedRecord(String name) {
    TableFieldSchema tfs = fieldSchema(this);
    tfs.setName(name);
    tfs.setMode(REPEATED);
    return tfs;
  }

  /**
   * Returns a TableSchema for this list of fields.
   *
   * @return the BigQuery TableSchema object for this list of fields
   */
  public TableSchema schema() {
    TableSchema result = new TableSchema();
    result.setFields(schemaFields);
    return result;
  }
}
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 
62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /src/main/java/com/google/cloud/bqetl/mbdata/MusicBrainzTransforms.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * https://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.google.cloud.bqetl.mbdata; 18 | 19 | import com.google.api.services.bigquery.model.TableFieldSchema; 20 | import com.google.api.services.bigquery.model.TableRow; 21 | import com.google.api.services.bigquery.model.TableSchema; 22 | import com.google.cloud.bqetl.json.JSONReader; 23 | import com.google.cloud.bqetl.mbschema.FieldSchemaListBuilder; 24 | import com.google.cloud.bqetl.options.BQETLOptions; 25 | import java.util.AbstractMap.SimpleEntry; 26 | import java.util.ArrayList; 27 | import java.util.Arrays; 28 | import java.util.HashMap; 29 | import java.util.List; 30 | import java.util.Map; 31 | import java.util.Set; 32 | import java.util.stream.Collectors; 33 | import org.apache.beam.sdk.Pipeline; 34 | import org.apache.beam.sdk.io.TextIO; 35 | import org.apache.beam.sdk.transforms.DoFn; 36 | import org.apache.beam.sdk.transforms.Flatten; 37 | import org.apache.beam.sdk.transforms.MapElements; 38 | import org.apache.beam.sdk.transforms.ParDo; 39 | import org.apache.beam.sdk.transforms.View; 40 | import org.apache.beam.sdk.transforms.join.CoGbkResult; 41 | import org.apache.beam.sdk.transforms.join.CoGroupByKey; 42 | import org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple; 43 | import org.apache.beam.sdk.values.KV; 44 | import org.apache.beam.sdk.values.PCollection; 45 | import org.apache.beam.sdk.values.PCollectionView; 46 | import org.apache.beam.sdk.values.TupleTag; 47 | import org.apache.beam.sdk.values.TypeDescriptor; 48 | import org.slf4j.Logger; 49 | import org.slf4j.LoggerFactory; 50 | 51 | public class MusicBrainzTransforms { 52 | 53 | /** This is a library of reusable transforms for the musicbrainz dataset. */ 54 | private static final int BIGQUERY_NESTING_LIMIT = 1000; 55 | 56 | private static final Logger logger = LoggerFactory.getLogger(MusicBrainzTransforms.class); 57 | 58 | /** 59 | * Given a PCollection of MusicBrainzDataObject's turn into to a keyed collection keyed by the 60 | * given key. 
61 | * 62 | * @param name - name of the column value to use as the key 63 | * @param input - the PCollection of MusicBrainzDataObject's note that the column value is assumed 64 | * to be a Long 65 | */ 66 | public static PCollection> by( 67 | String name, PCollection input) { 68 | return input.apply( 69 | "by " + name, 70 | MapElements.into(new TypeDescriptor>() {}) 71 | .via( 72 | (MusicBrainzDataObject inputObject) -> { 73 | try { 74 | return KV.of((Long) inputObject.getColumnValue(name), inputObject); 75 | } catch (Exception e) { 76 | logger.error(" exception in by " + name, e); 77 | return null; 78 | } 79 | })); 80 | } 81 | 82 | private static PCollection> group( 83 | String name, 84 | PCollection> first, 85 | PCollection> second, 86 | TupleTag firstTag, 87 | TupleTag secondTag) { 88 | PCollection> joinedResult; 89 | try { 90 | joinedResult = 91 | KeyedPCollectionTuple.of(firstTag, first) 92 | .and(secondTag, second) 93 | .apply("joinResult_" + name, CoGroupByKey.create()); 94 | } catch (Exception e) { 95 | logger.error("exception grouping.", e); 96 | return null; 97 | } 98 | return joinedResult; 99 | } 100 | 101 | /** 102 | * Perform an inner join of two keyed PCollections of MusicBrainzDataObjects and merge the results 103 | * into a list of MusicBrainzDataObjects. 
104 | * 105 | * @param table1 - Keyed PCollection of MusicBrainzDataObject's 106 | * @param table2 - Keyed PCollection of MusicBrainzDataObject's 107 | */ 108 | // [START innerJoin] 109 | public static PCollection innerJoin( 110 | String name, 111 | PCollection> table1, 112 | PCollection> table2) { 113 | final TupleTag t1 = new TupleTag() {}; 114 | final TupleTag t2 = new TupleTag() {}; 115 | PCollection> joinedResult = group(name, table1, table2, t1, t2); 116 | // [END innerJoin] 117 | // [START mergeJoinResults] 118 | PCollection> mergedResult = 119 | joinedResult.apply( 120 | "merge join results", 121 | MapElements.into(new TypeDescriptor>() {}) 122 | .via( 123 | (KV group) -> { 124 | List result = new ArrayList<>(); 125 | Iterable leftObjects = group.getValue().getAll(t1); 126 | Iterable rightObjects = group.getValue().getAll(t2); 127 | leftObjects.forEach( 128 | (MusicBrainzDataObject l) -> 129 | rightObjects.forEach( 130 | (MusicBrainzDataObject r) -> result.add(l.duplicate().merge(r)))); 131 | return result; 132 | })); 133 | // [END mergeJoinResults] 134 | // [START flattenMergedResults] 135 | return mergedResult.apply("Flatten List to Objects", Flatten.iterables()); 136 | // [END flattenMergedResults] 137 | } 138 | 139 | /** 140 | * Given a parent PCollection with a known value for a key, nest a given child collection based on 141 | * it's key value within elements of the first collection. 
142 | * 143 | * @param parent - Keyed PCollection of Parent MusicBrainzDataObject's 144 | * @param child - Keyed PCollection of Child MusicBrainzDataObject's 145 | */ 146 | // [START nestTransform] 147 | public static PCollection nest( 148 | PCollection> parent, 149 | PCollection> child, 150 | String nestingKey) { 151 | final TupleTag parentTag = new TupleTag() {}; 152 | final TupleTag childTag = new TupleTag() {}; 153 | 154 | PCollection> joinedResult = 155 | group("nest " + nestingKey, parent, child, parentTag, childTag); 156 | return joinedResult.apply( 157 | "merge join results " + nestingKey, 158 | MapElements.into(new TypeDescriptor() {}) 159 | .via( 160 | (KV group) -> { 161 | MusicBrainzDataObject parentObject = group.getValue().getOnly(parentTag); 162 | Iterable children = group.getValue().getAll(childTag); 163 | List childList = new ArrayList<>(); 164 | children.forEach(childList::add); 165 | parentObject = parentObject.duplicate(); 166 | parentObject.addColumnValue("recordings", childList); 167 | return parentObject; 168 | })); 169 | } 170 | // [END nestTransform] 171 | 172 | /* 173 | * Create a simple serializable version of the TableSchema usable across worker nodes. 
174 | */ 175 | private static Map serializeableTableSchema(TableSchema schema) { 176 | return serializeableTableSchema(schema.getFields()); 177 | } 178 | 179 | /* 180 | * Recursable method for serializable schema 181 | */ 182 | private static Map serializeableTableSchema(List fields) { 183 | HashMap current = new HashMap<>(); 184 | for (TableFieldSchema field : fields) { 185 | if (field.getType().equals(FieldSchemaListBuilder.RECORD)) { 186 | current.put(field.getName(), serializeableTableSchema(field.getFields())); 187 | } else { 188 | current.put(field.getName(), field.getType()); 189 | } 190 | } 191 | return current; 192 | } 193 | 194 | /** 195 | * Given a set of MusicBrainzDataObject's representing table rows and a TableSchema that has 196 | * keynames that match with the PCollection of MusicBrainzDataObjects, execute a transform to 197 | * transform those objects in to BigQuery table rows. Only use the fields of the data objects 198 | * found in the Table schema. 199 | * 200 | * @param objects - the PCollection of data objects to transform into TableRows 201 | * @param schema - the table schema to use 202 | */ 203 | public static PCollection transformToTableRows( 204 | PCollection objects, TableSchema schema) { 205 | Map serializableSchema = serializeableTableSchema(schema); 206 | return objects 207 | .apply( 208 | "Big Query TableRow Transform", 209 | MapElements.into(new TypeDescriptor>() {}) 210 | .via( 211 | (MusicBrainzDataObject inputObject) -> 212 | toTableRows(inputObject, serializableSchema))) 213 | .apply("Flatten TableRow List to TableRows", Flatten.iterables()); 214 | } 215 | 216 | /** 217 | * This converts a single MusicBrainzDataObject into a list of one or more TableRows, nesting as 218 | * necessary. It uses the BIGQUERY_NESTING_LIMIT to duplicate rows and continue adding nested 219 | * records to their duplicate. 
For example if a MusicBrainzDataObject has a child list of 220 | * BIGQUERY_NESTING_LIMIT + 1 nested objects the result will be a list containing two table rows 1 221 | * with BIGQUERY_NESTINGLIMIT children and a duplicate with one child. 222 | */ 223 | // [START toTableRows] 224 | private static List toTableRows( 225 | MusicBrainzDataObject mbdo, Map serializableSchema) { 226 | TableRow row = new TableRow(); 227 | List result = new ArrayList<>(); 228 | Map> nestedLists = new HashMap<>(); 229 | Set keySet = serializableSchema.keySet(); 230 | /* 231 | * construct a row object without the nested objects 232 | */ 233 | int maxListSize = 0; 234 | for (String key : keySet) { 235 | Object value = serializableSchema.get(key); 236 | Object fieldValue = mbdo.getColumnValue(key); 237 | if (fieldValue != null) { 238 | if (value instanceof Map) { 239 | @SuppressWarnings("unchecked") 240 | List list = (List) fieldValue; 241 | if (list.size() > maxListSize) { 242 | maxListSize = list.size(); 243 | } 244 | nestedLists.put(key, list); 245 | } else { 246 | row.set(key, fieldValue); 247 | } 248 | } 249 | } 250 | /* 251 | * add the nested objects but break up the nested objects across duplicate rows if nesting 252 | * limit exceeded 253 | */ 254 | TableRow parent = row.clone(); 255 | Set listFields = nestedLists.keySet(); 256 | for (int i = 0; i < maxListSize; i++) { 257 | parent = (parent == null ? 
row.clone() : parent); 258 | final TableRow parentRow = parent; 259 | nestedLists.forEach( 260 | (String key, List nestedList) -> { 261 | if (nestedList.size() > 0) { 262 | if (parentRow.get(key) == null) { 263 | parentRow.set(key, new ArrayList()); 264 | } 265 | @SuppressWarnings("unchecked") 266 | List childRows = (List) parentRow.get(key); 267 | @SuppressWarnings("unchecked") 268 | Map map = (Map) serializableSchema.get(key); 269 | childRows.add(toChildRow(nestedList.remove(0), map)); 270 | } 271 | }); 272 | if ((i > 0) && (i % BIGQUERY_NESTING_LIMIT == 0)) { 273 | result.add(parent); 274 | parent = null; 275 | } 276 | } 277 | if (parent != null) { 278 | result.add(parent); 279 | } 280 | return result; 281 | } 282 | // [END toTableRows] 283 | 284 | /** 285 | * A child row cannot have any nested repeated records. This turns a MusicBrainzDataObject into a 286 | * child row that is presumably nested inside a parent. 287 | */ 288 | private static TableRow toChildRow( 289 | MusicBrainzDataObject object, Map childSchema) { 290 | TableRow row = new TableRow(); 291 | childSchema.forEach((String key, Object value) -> row.set(key, object.getColumnValue(key))); 292 | return row; 293 | } 294 | 295 | /** 296 | * Given the cloud storage object containing a line delimited json file, a pipeline and a keyname 297 | * load MusicBrainzDataObject's into a keyed PCollection with the key being the column value for 298 | * keyName and applies the supplied mappings. 299 | * 300 | * @param p Pipeline object to use to load the data objects 301 | * @param name the name of the google cloud storage object 302 | * @param keyName the name of the column to use as the key for this PCollection. Note that this 303 | * key assumed to be a Long. 304 | * @param mappers variable sized list of lookup descriptions. mappers map a Long integer to a 305 | * String 306 | */ 307 | public static PCollection> loadTable( 308 | Pipeline p, String name, String keyName, LookupDescription... 
mappers) { 309 | PCollection text = loadText(p, name); 310 | return loadTableFromText(text, name, keyName, mappers); 311 | } 312 | 313 | /** 314 | * Given the cloud storage object containing a line delimited json file, a pipeline and a keyname 315 | * load MusicBrainzDataObjects into a keyed PCollection with the key being the column value for 316 | * keyName 317 | * 318 | * @param p Pipeline object to use to load the data objects 319 | * @param name the name of the google cloud storage object 320 | * @param keyName the name of the column to use as the key for this PCollection Note that this key 321 | * assumed to be a Long. 322 | */ 323 | public static PCollection> loadTable( 324 | Pipeline p, String name, String keyName) { 325 | PCollection text = loadText(p, name); 326 | return loadTableFromText(text, name, keyName); 327 | } 328 | 329 | /** 330 | * Given the cloud storage object containing a line delimited json file, and a pipeline that has 331 | * the BQETLOptions set, load the MusicBrainzDataObjects into a PCollection. 332 | * 333 | * @param p Pipeline object to use for the load 334 | * @param name name of the google cloud storage object 335 | */ 336 | public static PCollection loadTable(Pipeline p, String name) { 337 | return loadTableFromText(loadText(p, name), name); 338 | } 339 | 340 | /** 341 | * Given a PCollection of String's each representing an MusicBrainzDataObject transform those 342 | * strings into KV where the name is the namespace of the data object 343 | * key is the value of the object's keyName property. 344 | * 345 | * @param text - the PCollection of strings 346 | * @param name - the namespace for the data objects (or row name) 347 | * @param keyName - the key to use as the key in the KV object. 
348 | */ 349 | // [START loadTableByValue] 350 | public static PCollection> loadTableFromText( 351 | PCollection text, String name, String keyName) { 352 | final String namespacedKeyname = name + "_" + keyName; 353 | return text.apply( 354 | "load " + name, 355 | MapElements.into(new TypeDescriptor>() {}) 356 | .via( 357 | (String input) -> { 358 | MusicBrainzDataObject datum = JSONReader.readObject(name, input); 359 | Long key = (Long) datum.getColumnValue(namespacedKeyname); 360 | return KV.of(key, datum); 361 | })); 362 | } 363 | // [END loadTableByValue] 364 | 365 | /** 366 | * Given a PCollection of String's each representing an MusicBrainzDataObject transform those 367 | * strings into MusicBrainzDataObject's where the namespace for the MusicBrainzDataObject is 368 | * 'name' 369 | * 370 | * @param text the json string representing the MusicBrainzDataObject 371 | * @param name the namespace for hte MusicBrainzDataObject 372 | * @return PCollection of MusicBrainzDataObjects 373 | */ 374 | public static PCollection loadTableFromText( 375 | PCollection text, String name) { 376 | return text.apply( 377 | "load : " + name, 378 | MapElements.into(new TypeDescriptor() {}) 379 | .via((String input) -> JSONReader.readObject(name, input))); 380 | } 381 | 382 | /** 383 | * Given a PCollection of Strings each with json containing a mapping from a Long to a String 384 | * create a Singleton PCollection with the mapping. Example mapping: 385 | * 386 | *

387 |    * {
388 |    * ...
389 |    * "id" : 38,
 390 |    * .... "name": "Canada"
391 |    * ....
392 |    * }
393 |    * 
394 | * 395 | * In this case keyKey is id and valueKey is name. 396 | * 397 | * @param text - json string containing the mapping 398 | * @param keyKey - the json key for the value that will serve as the key for the mapping 399 | * @param valueKey - the json key for the value that will serve as the value for the mapping 400 | */ 401 | // [START lookupTableWithSideInputs1] 402 | public static PCollectionView> loadMapFromText( 403 | PCollection text, String name, String keyKey, String valueKey) { 404 | // column/Key names are namespaced in MusicBrainzDataObject 405 | String keyKeyName = name + "_" + keyKey; 406 | String valueKeyName = name + "_" + valueKey; 407 | 408 | PCollection> entries = 409 | text.apply( 410 | "sideInput_" + name, 411 | MapElements.into(new TypeDescriptor>() {}) 412 | .via( 413 | (String input) -> { 414 | MusicBrainzDataObject object = JSONReader.readObject(name, input); 415 | Long key = (Long) object.getColumnValue(keyKeyName); 416 | 417 | String value = (String) object.getColumnValue(valueKeyName); 418 | return KV.of(key, value); 419 | })); 420 | 421 | return entries.apply(View.asMap()); 422 | } 423 | // [END lookupTableWithSideInputs1] 424 | 425 | /** 426 | * Given a PCollection of String's each representing an MusicBrainzDataObject transform those 427 | * strings into MusicBrainzDataObject's where the name space for the MusicBrainzDataObject is 428 | * 'name' 429 | * 430 | * @param text the json string representing the MusicBrainzDataObject 431 | * @param name the namespace for hte MusicBrainzDataObject 432 | * @param mappers variable number of lookup descriptions - lookup descriptions can be created 433 | * using the factory method lookup(); 434 | * @return PCollection of MusicBrainzDataObjects 435 | */ 436 | public static PCollection> loadTableFromText( 437 | PCollection text, String name, String keyName, LookupDescription... 
mappers) { 438 | // [START lookupTableWithSideInputs2] 439 | List, PCollectionView>>> mapSideInputs = 440 | new ArrayList<>(); 441 | 442 | for (LookupDescription mapper : mappers) { 443 | PCollectionView> mapView = 444 | loadMap(text.getPipeline(), mapper.objectName, mapper.keyKey, mapper.valueKey); 445 | List destKeyList = 446 | mapper.destinationKeys.stream() 447 | .map(destinationKey -> name + "_" + destinationKey) 448 | .collect(Collectors.toList()); 449 | 450 | mapSideInputs.add(new SimpleEntry<>(destKeyList, mapView)); 451 | } 452 | // [END lookupTableWithSideInputs2] 453 | return loadTableFromText(text, name, keyName, mapSideInputs); 454 | } 455 | 456 | static PCollection> loadTableFromText( 457 | PCollection text, 458 | String name, 459 | String keyName, 460 | List, PCollectionView>>> sideMappings) { 461 | // List>> sideMappings) { 462 | 463 | final String namespacedKeyname = name + "_" + keyName; 464 | 465 | return text.apply( 466 | "load with SideMappings", 467 | ParDo.of( 468 | new DoFn>() { 469 | 470 | @ProcessElement 471 | // public void processElement(ProcessContext processContext) throws Exception { 472 | public void processElement( 473 | @Element String input, 474 | OutputReceiver> out, 475 | ProcessContext c) { 476 | 477 | MusicBrainzDataObject result = JSONReader.readObject(name, input); 478 | 479 | sideMappings.forEach( 480 | (SimpleEntry, PCollectionView>> mapping) -> { 481 | // [START lookupTableWithSideInputs3] 482 | Map sideInputMap = c.sideInput(mapping.getValue()); 483 | 484 | List keyList = mapping.getKey(); 485 | 486 | keyList.forEach( 487 | (String key) -> { 488 | Long id = (Long) result.getColumnValue(key); 489 | if (id != null) { 490 | String label = sideInputMap.get(id); 491 | if (label == null) { 492 | label = "" + id; 493 | } 494 | result.replace(key, label); 495 | // [END lookupTableWithSideInputs3] 496 | } 497 | }); 498 | }); 499 | 500 | Long key = (Long) result.getColumnValue(namespacedKeyname); 501 | 502 | out.output(KV.of(key, 
result)); 503 | } 504 | }) 505 | .withSideInputs( 506 | sideMappings.stream().map(SimpleEntry::getValue).collect(Collectors.toList()))); 507 | } 508 | 509 | /** 510 | * Given the cloud storage object containing a line delimited json file from which a map should be 511 | * defined by: value of json key (keyKey) -> value of other json key (valueKey) (maps should be 512 | * relatively small as they are generally used as side inputs for adding a mapping from a 513 | * non-user-friendly ID to a user-friendly name or description. 514 | * 515 | * @param name - name 516 | * @param keyKey - the name of the json key to use as the key in the resulting map. 517 | * @param valueKey - the name of hte json key to use as the value in the resulting map. 518 | */ 519 | private static PCollectionView> loadMap( 520 | Pipeline p, String name, String keyKey, String valueKey) { 521 | PCollection text = loadText(p, name); 522 | return loadMapFromText(text, name, keyKey, valueKey); 523 | } 524 | 525 | /** 526 | * Load a json line delimited file into a PCollection of strings each representing a line of JSON. 527 | * 528 | * @param name name of objects to load. 529 | */ 530 | // [START loadStrings] 531 | public static PCollection loadText(Pipeline p, String name) { 532 | BQETLOptions options = (BQETLOptions) p.getOptions(); 533 | String loadingBucket = options.getLoadingBucketURL(); 534 | String objectToLoad = storedObjectName(loadingBucket, name); 535 | return p.apply(name, TextIO.read().from(objectToLoad)); 536 | } 537 | // [END loadStrings] 538 | 539 | /** 540 | * Derive a storedObject name from the loading bucket and the name of the object. 
541 | * 542 | * @param loadingBucket name of bucket 543 | * @param name - name of the object 544 | */ 545 | private static String storedObjectName(String loadingBucket, String name) { 546 | return loadingBucket + "/" + name + ".json"; 547 | } 548 | 549 | /** 550 | * Factory method for creating a lookup table description a lookup usually maps a primary key to a 551 | * label. Note that in this demonstration maps are assumed to be of the form Long -> String. 552 | * 553 | * @param objectName - the storage object name 554 | * @param keyKey - the keyname in the mapping table that will match the value in the target object 555 | * to be replaced (e.g. id) 556 | * @param valueKey - the keyname in the mapping table that contains the value 557 | * @param destinationKeys - the key to replace in the target object 558 | */ 559 | // [START lookupMethod] 560 | public static LookupDescription lookup( 561 | String objectName, String keyKey, String valueKey, String... destinationKeys) { 562 | return new LookupDescription(objectName, keyKey, valueKey, destinationKeys); 563 | } 564 | // [END lookupMethod] 565 | 566 | /** 567 | * Simple class for encapsulating the description of a lookup. Created through factory method 568 | * lookup(). 569 | */ 570 | public static class LookupDescription { 571 | 572 | final String objectName; 573 | final List destinationKeys; 574 | final String keyKey; 575 | final String valueKey; 576 | 577 | LookupDescription( 578 | String objectName, String keyKey, String valueKey, String... destinationKeys) { 579 | this.objectName = objectName; 580 | this.destinationKeys = Arrays.asList(destinationKeys); 581 | this.keyKey = keyKey; 582 | this.valueKey = valueKey; 583 | } 584 | } 585 | } 586 | --------------------------------------------------------------------------------