├── .gitignore.xml ├── flow.png ├── src ├── test │ ├── java │ │ ├── utils │ │ │ ├── TestCase.java │ │ │ ├── TestResourceLoader.java │ │ │ ├── FakeBigQueryServiceFactory.java │ │ │ ├── GoogleTypesToJsonConverter.java │ │ │ └── FakeBigquery.java │ │ ├── examples │ │ │ ├── BigQuerySqlParserBQSchemaTest.java │ │ │ ├── BigQuerySqlParserLocalSchemaTest.java │ │ │ └── ASTExplorerTest.java │ │ └── extractor │ │ │ └── BigQueryTableCreatorTest.java │ └── resources │ │ ├── sql │ │ ├── benchmark │ │ │ ├── external │ │ │ │ └── test.sql │ │ │ ├── select_prune.yaml │ │ │ ├── recursive_cte.yaml │ │ │ ├── stg_payments.yaml │ │ │ ├── json_functions_struct.yaml │ │ │ ├── stg_customers.yaml │ │ │ ├── json_functions.yaml │ │ │ ├── messy_struct.yaml │ │ │ ├── pivot.yaml │ │ │ ├── json_functions_with_literals.yaml │ │ │ ├── timestamps.yaml │ │ │ ├── messy_unnesting.yaml │ │ │ ├── unnest.yaml │ │ │ ├── udf.yaml │ │ │ ├── parameter.yaml │ │ │ ├── unnest_create.yaml │ │ │ ├── unnest_create_view.yaml │ │ │ ├── count.yaml │ │ │ ├── pivot_where_with_literals.yaml │ │ │ ├── looker_subquery.yaml │ │ │ ├── subquery_unnest.yaml │ │ │ ├── customers_groupby.yaml │ │ │ ├── customers.yaml │ │ │ ├── customers_groupby_sets.yaml │ │ │ ├── array_agg_multiplenests.yaml │ │ │ ├── looker_subquery_crazy.yaml │ │ │ ├── analytical_functions.yaml │ │ │ └── multiple_refs.yaml │ │ ├── kitchen_sink_concat.yaml │ │ ├── kitchen_sink_multiple_output_columns_with_alias.yaml │ │ ├── kitchen_sink_multiple_output_columns_without_alias.yaml │ │ └── bigquery_daily_report_error_stats_join_group_by_aggr_functions.yaml │ │ └── schemas │ │ ├── tableA_schema.json │ │ ├── tableB_schema.json │ │ ├── bigquery_simple_type_table_schema.json │ │ ├── SensitiveData_schema.json │ │ ├── MyDataSet_ExtractionPartnerInformation_schema.json │ │ ├── MyDataSet_PartnerInformation_schema.json │ │ ├── OutputTable_schema.json │ │ ├── bigquery_demo_events_schema.json │ │ ├── public_dataset_mbb_team_colors_schema.json │ │ ├── bigquery_simple_all_types_table_schema.json │ │ ├── error_stats_table_schema.json │ │ ├── CorePii_schema.json │ │ ├── bigquery_demo_transactions_schema.json │ │ ├── simple_daily_report_table_schema.json │ │ ├── public_dataset_mbb_teams_schema.json │ │ ├── daily_report_table_schema.json │ │ └── lineage_bigquery_table_schema.json └── main │ └── java │ └── com │ └── borjav │ └── data │ ├── options │ └── Options.java │ ├── exception │ └── BigQueryOperationException.java │ ├── model │ ├── ResolvedJoinExtended.java │ ├── ResolvedNodeExtended.java │ ├── BigQueryTableEntity.java │ └── ResolvedColumnExtended.java │ ├── service │ ├── BigQueryServiceFactory.java │ ├── BigQueryZetaSqlSchemaLoader.java │ ├── BigQueryZetaSqlSchemaLoaderFactory.java │ └── BigQueryTableLoadService.java │ ├── utils │ └── UtilsParser.java │ ├── output │ └── OutputModel.java │ ├── extractor │ └── BigQueryTableCreator.java │ ├── converter │ └── BigQuerySchemaConverter.java │ └── parser │ └── ZetaSQLResolver.java ├── LICENSE └── README.md /.gitignore.xml: -------------------------------------------------------------------------------- 1 | .DS_Store -------------------------------------------------------------------------------- /flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/borjavb/bq-lineage-tool/HEAD/flow.png -------------------------------------------------------------------------------- /src/test/java/utils/TestCase.java: -------------------------------------------------------------------------------- 1 | 
package utils; 2 | 3 | import com.borjav.data.output.OutputModel; 4 | import com.fasterxml.jackson.annotation.JsonProperty; 5 | 6 | public class TestCase { 7 | 8 | @JsonProperty("query") 9 | public String query; 10 | @JsonProperty("expected_output") 11 | public OutputModel.Model expected_output; 12 | } -------------------------------------------------------------------------------- /src/main/java/com/borjav/data/options/Options.java: -------------------------------------------------------------------------------- 1 | package com.borjav.data.options; 2 | 3 | import java.util.HashMap; 4 | 5 | public class Options { 6 | 7 | //if no project is defined in a query, this is the default project that will be used 8 | public static String default_project = "data-default-project"; 9 | public static HashMap missing_project = new HashMap<>(); 10 | } 11 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/external/test.sql: -------------------------------------------------------------------------------- 1 | with tbl as (Select 1 as x,[struct(2 as y, [123,456,789] as z), struct(3,[301,302])] as lst), 2 | tbl2 as (Select x, A.y, z from tbl, unnest(lst) as A, unnest(A.z) as z) 3 | 4 | #select * from tbl2 # run this query first 5 | # Then this: 6 | Select x, array_Agg(struct(y,Z)) 7 | from 8 | ( 9 | select x,y,array_agg(z) as Z 10 | from tbl2 11 | group by 1,2 12 | ) 13 | group by 1 -------------------------------------------------------------------------------- /src/main/java/com/borjav/data/exception/BigQueryOperationException.java: -------------------------------------------------------------------------------- 1 | 2 | 3 | package com.borjav.data.exception; 4 | 5 | import com.borjav.data.model.BigQueryTableEntity; 6 | 7 | /** 8 | * Wrapped runtime exception thrown from BigQuery operations. 
9 | */ 10 | public class BigQueryOperationException extends RuntimeException { 11 | 12 | 13 | public BigQueryOperationException(BigQueryTableEntity table, Throwable cause) { 14 | super(String.format("BigQuery Operation exception for%n%s", table), cause); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/select_prune.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | with source as ( 3 | 4 | 5 | select * from `catalog.jaffle_shop.raw_payments` 6 | 7 | ) 8 | 9 | select id as payment_id from source 10 | 11 | expected_output: 12 | name: "select_prune" 13 | output_columns: 14 | - name: "payment_id" 15 | references: 16 | - project_name: "catalog" 17 | dataset_name: "jaffle_shop" 18 | table_name: "raw_payments" 19 | name: "id" 20 | type: "select" 21 | selected_tables: 22 | - "catalog.jaffle_shop.raw_payments" -------------------------------------------------------------------------------- /src/main/java/com/borjav/data/model/ResolvedJoinExtended.java: -------------------------------------------------------------------------------- 1 | package com.borjav.data.model; 2 | 3 | import java.util.List; 4 | 5 | 6 | public class ResolvedJoinExtended { 7 | 8 | public enum JOIN_TYPE {CROSS, LEFT, RIGHT, FULL, INNER} 9 | 10 | 11 | public JOIN_TYPE join_type; 12 | 13 | public List left; 14 | public List right; 15 | 16 | public ResolvedJoinExtended(String join_type, List left, 17 | List right) { 18 | this.left = left; 19 | this.right = right; 20 | this.join_type = JOIN_TYPE.valueOf(join_type.toUpperCase()); 21 | 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/test/resources/schemas/tableA_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "bigquery#table", 3 | "id": "project1:datasetA.TableA", 4 | "selfLink": "https://content-bigquery.googleapis.com/bigquery/v2/projects/myproject/datasets/reporting/tables/daily_report", 5 | "tableReference": { 6 | "projectId": "project1", 7 | "datasetId": "datasetA", 8 | "tableId": "TableA" 9 | }, 10 | "schema": { 11 | "fields": [ 12 | { 13 | "name": "colA", 14 | "type": "STRING", 15 | "mode": "NULLABLE" 16 | }, 17 | { 18 | "name": "colC", 19 | "type": "STRING", 20 | "mode": "NULLABLE" 21 | } 22 | ] 23 | }, 24 | "type": "TABLE", 25 | "location": "US" 26 | } 27 | -------------------------------------------------------------------------------- /src/test/resources/schemas/tableB_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "bigquery#table", 3 | "id": "project2:datasetB.TableB", 4 | "selfLink": "https://content-bigquery.googleapis.com/bigquery/v2/projects/myproject/datasets/reporting/tables/daily_report", 5 | "tableReference": { 6 | "projectId": "project2", 7 | "datasetId": "datasetB", 8 | "tableId": "TableB" 9 | }, 10 | "schema": { 11 | "fields": [ 12 | { 13 | "name": "colB", 14 | "type": "STRING", 15 | "mode": "NULLABLE" 16 | }, 17 | { 18 | "name": "colC", 19 | "type": "STRING", 20 | "mode": "NULLABLE" 21 | } 22 | ] 23 | }, 24 | "type": "TABLE", 25 | "location": "US" 26 | } 27 | -------------------------------------------------------------------------------- /src/test/resources/schemas/bigquery_simple_type_table_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "myproject:dataset.table", 3 | 
"tableReference": { 4 | "projectId": "myproject", 5 | "datasetId": "dataset", 6 | "tableId": "table" 7 | }, 8 | "schema": { 9 | "fields": [ 10 | { 11 | "mode": "NULLABLE", 12 | "name": "afloat", 13 | "type": "FLOAT" 14 | }, 15 | { 16 | "mode": "NULLABLE", 17 | "name": "aString", 18 | "type": "STRING" 19 | }, 20 | { 21 | "mode": "NULLABLE", 22 | "name": "aInteger", 23 | "type": "INTEGER" 24 | }, 25 | { 26 | "mode": "NULLABLE", 27 | "name": "aBool", 28 | "type": "BOOLEAN" 29 | } 30 | ] 31 | }, 32 | "type": "TABLE" 33 | } -------------------------------------------------------------------------------- /src/test/resources/schemas/SensitiveData_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "bigquery#table", 3 | "id": "demoProject:demo.SensitiveData", 4 | "selfLink": "https://content-bigquery.googleapis.com/bigquery/v2/projects/demoProject/datasets/demo/tables/SensitiveData", 5 | "tableReference": { 6 | "projectId": "demoProject", 7 | "datasetId": "demo", 8 | "tableId": "SensitiveData" 9 | }, 10 | "schema": { 11 | "fields": [ 12 | { 13 | "name": "sensitive_data", 14 | "type": "STRING", 15 | "mode": "NULLABLE", 16 | "policyTags": { 17 | "names": [ 18 | "projects/GovernanceProject/locations/us/taxonomies/8150274556907504807/policyTags/7890" 19 | ] 20 | } 21 | } 22 | ] 23 | }, 24 | "type": "TABLE" 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/com/borjav/data/model/ResolvedNodeExtended.java: -------------------------------------------------------------------------------- 1 | package com.borjav.data.model; 2 | 3 | import com.fasterxml.jackson.annotation.JsonIgnore; 4 | import com.google.common.collect.ImmutableList; 5 | import com.google.zetasql.resolvedast.ResolvedColumn; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | public class ResolvedNodeExtended { 10 | 11 | public List columns = new ArrayList<>(); 12 | public List extra_columns = new ArrayList<>(); 13 | @JsonIgnore 14 | public ImmutableList originalColumns; 15 | public String name; 16 | public String type; 17 | public String table_name; 18 | public List selected_tables = new ArrayList(); 19 | public List joins = new ArrayList<>(); 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/test/resources/schemas/MyDataSet_ExtractionPartnerInformation_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "bigquery#table", 3 | "id": "bq-lineage-demo:MyDataSet.ExtractedPartnerInformation", 4 | "selfLink": "https://content-bigquery.googleapis.com/bigquery/v2/projects/bq-lineage-demo/datasets/MyDataSet/tables/ExtractedPartnerInformation", 5 | "tableReference": { 6 | "projectId": "bq-lineage-demo", 7 | "datasetId": "MyDataSet", 8 | "tableId": "ExtractedPartnerInformation" 9 | }, 10 | "schema": { 11 | "fields": [ 12 | { 13 | "name": "partner_id", 14 | "type": "INTEGER", 15 | "mode": "NULLABLE" 16 | }, 17 | { 18 | "name": "partner_name", 19 | "type": "STRING", 20 | "mode": "NULLABLE" 21 | }, 22 | { 23 | "name": "partner_phone_number", 24 | "type": "STRING", 25 | "mode": "NULLABLE" 26 | } 27 | ] 28 | }, 29 | "type": "TABLE", 30 | "location": "US" 31 | } 32 | -------------------------------------------------------------------------------- /src/test/resources/schemas/MyDataSet_PartnerInformation_schema.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "kind": "bigquery#table", 3 | "id": "bq-lineage-demo:MyDataSet.PartnerInformation", 4 | "selfLink": "https://content-bigquery.googleapis.com/bigquery/v2/projects/bq-lineage-demo/datasets/MyDataSet/tables/PartnerInformation", 5 | "tableReference": { 6 | "projectId": "bq-lineage-demo", 7 | "datasetId": "MyDataSet", 8 | "tableId": "PartnerInformation" 9 | }, 10 | "schema": { 11 | "fields": [ 12 | { 13 | "name": "partner_id", 14 | "type": "INTEGER", 15 | "policyTags": { 16 | "names": [ 17 | "projects/bq-lineage-demo/locations/us/taxonomies/544279842572406327/policyTags/2123206183673327057" 18 | ] 19 | } 20 | }, 21 | { 22 | "name": "partner_phone_number", 23 | "type": "STRING" 24 | }, 25 | { 26 | "name": "partner_name", 27 | "type": "STRING" 28 | } 29 | ] 30 | }, 31 | "type": "TABLE", 32 | "location": "US" 33 | } 34 | 35 | -------------------------------------------------------------------------------- /src/main/java/com/borjav/data/service/BigQueryServiceFactory.java: -------------------------------------------------------------------------------- 1 | 2 | 3 | package com.borjav.data.service; 4 | 5 | import com.google.api.client.http.javanet.NetHttpTransport; 6 | import com.google.api.client.json.jackson2.JacksonFactory; 7 | import com.google.api.services.bigquery.Bigquery; 8 | import com.google.api.services.bigquery.BigqueryScopes; 9 | import com.google.auth.http.HttpCredentialsAdapter; 10 | import com.google.auth.oauth2.GoogleCredentials; 11 | 12 | import java.io.IOException; 13 | import java.io.Serializable; 14 | 15 | public interface BigQueryServiceFactory extends Serializable { 16 | 17 | Bigquery buildService() throws IOException; 18 | 19 | static BigQueryServiceFactory defaultFactory() { 20 | return () -> 21 | new Bigquery.Builder( 22 | new NetHttpTransport(), 23 | new JacksonFactory(), 24 | new HttpCredentialsAdapter(GoogleCredentials 25 | .getApplicationDefault() 26 | .createScoped(BigqueryScopes.all()))) 27 | .setApplicationName("column-lineage-extraction") 28 | .build(); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Borja Vazquez-Barreiros 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /src/test/resources/schemas/OutputTable_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "bigquery#table", 3 | "id": "demoProject:demo.OutputTable", 4 | "selfLink": "https://content-bigquery.googleapis.com/bigquery/v2/projects/demoProject/datasets/demo/tables/OutputTable", 5 | "tableReference": { 6 | "projectId": "demoProject", 7 | "datasetId": "demo", 8 | "tableId": "OutputTable" 9 | }, 10 | "schema": { 11 | "fields": [ 12 | { 13 | "type": "STRING", 14 | "mode": "NULLABLE", 15 | "name": "combined_telephone" 16 | }, 17 | { 18 | "type": "STRING", 19 | "mode": "NULLABLE", 20 | "name": "telephone_number" 21 | }, 22 | { 23 | "type": "DOUBLE", 24 | "mode": "NULLABLE", 25 | "name": "lat" 26 | }, 27 | { 28 | "type": "DOUBLE", 29 | "mode": "NULLABLE", 30 | "name": "lon" 31 | }, 32 | { 33 | "type": "INT64", 34 | "mode": "NULLABLE", 35 | "name": "id" 36 | }, 37 | { 38 | "type": "INT64", 39 | "mode": "NULLABLE", 40 | "name": "bucket" 41 | } 42 | ] 43 | }, 44 | "type": "TABLE" 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/com/borjav/data/service/BigQueryZetaSqlSchemaLoader.java: -------------------------------------------------------------------------------- 1 | 2 | 3 | package com.borjav.data.service; 4 | 5 | import com.borjav.data.converter.BigQuerySchemaConverter; 6 | import com.google.common.collect.ImmutableSet; 7 | import com.google.zetasql.SimpleTable; 8 | 9 | import static com.google.common.collect.ImmutableSet.toImmutableSet; 10 | 11 | /** 12 | * Loads requested Table Schema using provided {@link BigQueryTableLoadService} followed by schema 13 | * translation using {@link BigQuerySchemaConverter}. 14 | */ 15 | public final class BigQueryZetaSqlSchemaLoader { 16 | 17 | private final BigQueryTableLoadService bqTableLoader; 18 | 19 | public BigQueryZetaSqlSchemaLoader(BigQueryTableLoadService bqTableLoader) { 20 | this.bqTableLoader = bqTableLoader; 21 | } 22 | 23 | 24 | public ImmutableSet loadSchemas(String... 
tableNames) { 25 | return bqTableLoader.loadTables(tableNames).stream() 26 | .map(BigQuerySchemaConverter::convert) 27 | .collect(toImmutableSet()); 28 | } 29 | 30 | 31 | public ImmutableSet loadSchemas(ImmutableSet tableNames) { 32 | return loadSchemas(tableNames.asList().toArray(new String[0])); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/test/resources/schemas/bigquery_demo_events_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "bigquery#table", 3 | "etag": "FnYOaTcOYbWI4hAWC+kwjA==", 4 | "id": "myproject:demo.events", 5 | "selfLink": "https://content-bigquery.googleapis.com/bigquery/v2/projects/myproject/datasets/demo/tables/events", 6 | "tableReference": { 7 | "projectId": "myproject", 8 | "datasetId": "demo", 9 | "tableId": "events" 10 | }, 11 | "schema": { 12 | "fields": [ 13 | { 14 | "name": "transaction_time", 15 | "type": "TIMESTAMP" 16 | }, 17 | { 18 | "name": "keyid", 19 | "type": "INTEGER" 20 | }, 21 | { 22 | "name": "lat", 23 | "type": "FLOAT" 24 | }, 25 | { 26 | "name": "lon", 27 | "type": "FLOAT" 28 | }, 29 | { 30 | "name": "id", 31 | "type": "INTEGER" 32 | }, 33 | { 34 | "name": "bucket", 35 | "type": "INTEGER" 36 | } 37 | ] 38 | }, 39 | "numBytes": "31206297408", 40 | "numLongTermBytes": "31206297408", 41 | "numRows": "650131196", 42 | "creationTime": "1572934222137", 43 | "lastModifiedTime": "1572934222137", 44 | "type": "TABLE", 45 | "location": "US" 46 | } 47 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/recursive_cte.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | WITH RECURSIVE 3 | T0 AS (SELECT 1 AS n), 4 | T1 AS ((SELECT * FROM T0) UNION ALL (SELECT n + 1 FROM T1 WHERE n < 4)), 5 | T2 AS ((SELECT 1 AS n) UNION ALL (SELECT n + 1 FROM T2 WHERE n < 4)), 6 | T3 AS (SELECT * FROM T1 INNER JOIN T2 USING (n)) 7 | SELECT * FROM T3 ORDER BY n 8 | 9 | expected_output: 10 | name: "recursive cte" 11 | output_columns: 12 | - name: "n" 13 | references: 14 | - table_name: "$union_all" 15 | name: "n" 16 | other_used_columns: 17 | - name: "_n_" 18 | references: 19 | - table_name: "$union_all" 20 | name: "n" 21 | used_for: 22 | - "ORDER_BY" 23 | joins: 24 | - join_type: "INNER" 25 | left_columns: 26 | - name: "n" 27 | references: 28 | - table_name: "$union_all" 29 | name: "n" 30 | used_for: 31 | - "JOIN_LEFT_TABLE" 32 | right_columns: 33 | - name: "n" 34 | references: 35 | - table_name: "$union_all" 36 | name: "n" 37 | used_for: 38 | - "JOIN_RIGHT_TABLE" 39 | type: "select" 40 | selected_tables: 41 | - "$union_all" -------------------------------------------------------------------------------- /src/test/resources/schemas/public_dataset_mbb_team_colors_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "bigquery#table", 3 | "etag": "6ts/A4Iuhza8C7naNAHI0Q==", 4 | "id": "bigquery-public-data:ncaa_basketball.team_colors", 5 | "selfLink": "https://content-bigquery.googleapis.com/bigquery/v2/projects/bigquery-public-data/datasets/ncaa_basketball/tables/team_colors", 6 | "tableReference": { 7 | "projectId": "bigquery-public-data", 8 | "datasetId": "ncaa_basketball", 9 | "tableId": "team_colors" 10 | }, 11 | "description": "Hex color codes for the 351 current men's D1 basketball teams.", 12 | "schema": { 13 | "fields": [ 14 | { 15 | "name": "market", 16 | "type": "STRING", 17 | 
"mode": "NULLABLE" 18 | }, 19 | { 20 | "name": "id", 21 | "type": "STRING", 22 | "mode": "NULLABLE" 23 | }, 24 | { 25 | "name": "code_ncaa", 26 | "type": "INTEGER", 27 | "mode": "NULLABLE" 28 | }, 29 | { 30 | "name": "color", 31 | "type": "STRING", 32 | "mode": "NULLABLE" 33 | } 34 | ] 35 | }, 36 | "numBytes": "23909", 37 | "numLongTermBytes": "23909", 38 | "numRows": "351", 39 | "creationTime": "1520301383660", 40 | "lastModifiedTime": "1527778180427", 41 | "type": "TABLE", 42 | "location": "US" 43 | } 44 | -------------------------------------------------------------------------------- /src/test/java/utils/TestResourceLoader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package utils; 18 | 19 | import java.io.IOException; 20 | import java.nio.charset.StandardCharsets; 21 | import java.nio.file.Files; 22 | import java.nio.file.Paths; 23 | 24 | public final class TestResourceLoader { 25 | 26 | private static final String TEST_RESOURCE_FOLDER = "test"; 27 | 28 | public static String load(String resourceFileName) { 29 | try { 30 | byte[] bytes = Files 31 | .readAllBytes(Paths.get("src", TEST_RESOURCE_FOLDER, "resources", resourceFileName)); 32 | return new String(bytes, StandardCharsets.UTF_8); 33 | } catch (IOException ioException) { 34 | return ""; 35 | } 36 | } 37 | 38 | private TestResourceLoader() { 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/stg_payments.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | with source as ( 3 | 4 | 5 | select * from `catalog.jaffle_shop.raw_payments` 6 | 7 | ), 8 | 9 | renamed as ( 10 | 11 | select 12 | id as payment_id, 13 | order_id, 14 | payment_method, 15 | 16 | -- `amount` is currently stored in cents, so we convert it to dollars 17 | amount / 100 as amount 18 | 19 | from source 20 | 21 | ) 22 | 23 | select * from renamed 24 | 25 | expected_output: 26 | name: "stg_payments" 27 | output_columns: 28 | - name: "payment_id" 29 | references: 30 | - project_name: "catalog" 31 | dataset_name: "jaffle_shop" 32 | table_name: "raw_payments" 33 | name: "id" 34 | - name: "order_id" 35 | references: 36 | - project_name: "catalog" 37 | dataset_name: "jaffle_shop" 38 | table_name: "raw_payments" 39 | name: "order_id" 40 | - name: "payment_method" 41 | references: 42 | - project_name: "catalog" 43 | dataset_name: "jaffle_shop" 44 | table_name: "raw_payments" 45 | name: "payment_method" 46 | - name: "amount" 47 | references: 48 | - project_name: "catalog" 49 | dataset_name: "jaffle_shop" 50 | table_name: "raw_payments" 51 | name: "amount" 52 | type: "select" 53 | selected_tables: 54 | - "catalog.jaffle_shop.raw_payments" -------------------------------------------------------------------------------- 
/src/main/java/com/borjav/data/service/BigQueryZetaSqlSchemaLoaderFactory.java: -------------------------------------------------------------------------------- 1 | 2 | package com.borjav.data.service; 3 | 4 | import com.google.common.flogger.GoogleLogger; 5 | 6 | import java.util.concurrent.TimeUnit; 7 | 8 | /** 9 | * Factory to build BigQuerySchemaLoaders by instantiating BigQuery service using the provided 10 | * Credentials. 11 | */ 12 | public final class BigQueryZetaSqlSchemaLoaderFactory { 13 | 14 | private static final GoogleLogger logger = GoogleLogger.forEnclosingClass(); 15 | 16 | private final BigQueryServiceFactory bigQueryServiceFactory; 17 | 18 | public BigQueryZetaSqlSchemaLoaderFactory(BigQueryServiceFactory bigQueryServiceFactory) { 19 | this.bigQueryServiceFactory = bigQueryServiceFactory; 20 | } 21 | 22 | public static BigQueryZetaSqlSchemaLoaderFactory usingServiceFactory( 23 | BigQueryServiceFactory bigQueryServiceFactory) { 24 | return new BigQueryZetaSqlSchemaLoaderFactory(bigQueryServiceFactory); 25 | } 26 | 27 | public BigQueryZetaSqlSchemaLoader newLoader() { 28 | try { 29 | return new BigQueryZetaSqlSchemaLoader( 30 | BigQueryTableLoadService 31 | .usingServiceFactory(bigQueryServiceFactory)); 32 | } catch (RuntimeException exception) { 33 | logger.atWarning() 34 | .withCause(exception) 35 | .atMostEvery(10, TimeUnit.MINUTES) 36 | .log("unable to create Bigquery service."); 37 | 38 | return null; 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/test/java/utils/FakeBigQueryServiceFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package utils; 18 | 19 | import com.borjav.data.service.BigQueryServiceFactory; 20 | import com.google.api.services.bigquery.Bigquery; 21 | 22 | public final class FakeBigQueryServiceFactory implements BigQueryServiceFactory { 23 | 24 | private final String[] tableSchemas; 25 | 26 | public FakeBigQueryServiceFactory(String[] tableSchemas) { 27 | this.tableSchemas = tableSchemas; 28 | } 29 | 30 | public static FakeBigQueryServiceFactory forTableSchemas(String... 
tableSchemas) { 31 | return new FakeBigQueryServiceFactory(tableSchemas); 32 | } 33 | 34 | public static BigQueryServiceFactory forStub(FakeBigquery fakeService) { 35 | return () -> fakeService; 36 | } 37 | 38 | @Override 39 | public Bigquery buildService() { 40 | return FakeBigquery.forTableSchemas(tableSchemas); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/json_functions_struct.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | with base as( 3 | SELECT 4 | JSON_EXTRACT(json_field, '$.class') AS class, 5 | JSON_EXTRACT_SCALAR(json_field, '$.class.other') AS class_other, 6 | JSON_EXTRACT_ARRAY(json_field, '$.class.nested_array') AS class_nested, 7 | id, 8 | FROM `catalog.jaffle_shop.json_table` 9 | ) 10 | 11 | SELECT 12 | STRUCT(id as id, class as class) key, 13 | class_other, 14 | nested_classes, 15 | from base,unnest(class_nested) as nested_classes 16 | 17 | 18 | expected_output: 19 | name: "json_functions" 20 | output_columns: 21 | - name: "key.id" 22 | references: 23 | - project_name: "catalog" 24 | dataset_name: "jaffle_shop" 25 | table_name: "json_table" 26 | name: "id" 27 | - name: "key.class" 28 | references: 29 | - project_name: "catalog" 30 | dataset_name: "jaffle_shop" 31 | table_name: "json_table" 32 | name: "json_field" 33 | - name: "class_other" 34 | references: 35 | - project_name: "catalog" 36 | dataset_name: "jaffle_shop" 37 | table_name: "json_table" 38 | name: "json_field" 39 | - name: "nested_classes" 40 | references: 41 | - project_name: "catalog" 42 | dataset_name: "jaffle_shop" 43 | table_name: "json_table" 44 | name: "json_field" 45 | type: "select" 46 | selected_tables: 47 | - "catalog.jaffle_shop.json_table" -------------------------------------------------------------------------------- /src/test/resources/sql/kitchen_sink_concat.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | #standardSQL 3 | WITH base AS ( 4 | SELECT 5 | CONCAT(tA.colA, tB.colB) as joined_column 6 | FROM 7 | `project1.datasetA.TableA` as tA 8 | LEFT OUTER JOIN 9 | `project2.datasetB.TableB` as tB 10 | USING (colC) 11 | ) 12 | SELECT * FROM base 13 | 14 | expected_output: 15 | name: "json_functions" 16 | output_columns: 17 | - name: "joined_column" 18 | references: 19 | - project_name: "project1" 20 | dataset_name: "datasetA" 21 | table_name: "TableA" 22 | name: "colA" 23 | - project_name: "project2" 24 | dataset_name: "datasetB" 25 | table_name: "TableB" 26 | name: "colB" 27 | joins: 28 | - join_type: "LEFT" 29 | left_columns: 30 | - name: "colC" 31 | references: 32 | - project_name: "project1" 33 | dataset_name: "datasetA" 34 | table_name: "TableA" 35 | name: "colC" 36 | used_for: 37 | - "JOIN_LEFT_TABLE" 38 | right_columns: 39 | - name: "colC" 40 | references: 41 | - project_name: "project2" 42 | dataset_name: "datasetB" 43 | table_name: "TableB" 44 | name: "colC" 45 | used_for: 46 | - "JOIN_RIGHT_TABLE" 47 | type: "select" 48 | selected_tables: 49 | - "project1.datasetA.TableA" 50 | - "project2.datasetB.TableB" 51 | 52 | 53 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/stg_customers.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | with source as ( 3 | 4 | select * from `catalog.jaffle_shop.raw_customers` 5 | 6 | ), 7 | 8 | renamed as ( 9 | 10 | select 11 | id as 
customer_id, 12 | first_name, 13 | last_name, 14 | COALESCE(last_name, first_name) as full_name, 15 | 16 | from source 17 | 18 | ) 19 | 20 | select * from renamed 21 | 22 | expected_output: 23 | name: "stg_customers" 24 | output_columns: 25 | - name: "customer_id" 26 | references: 27 | - project_name: "catalog" 28 | dataset_name: "jaffle_shop" 29 | table_name: "raw_customers" 30 | name: "id" 31 | - name: "first_name" 32 | references: 33 | - project_name: "catalog" 34 | dataset_name: "jaffle_shop" 35 | table_name: "raw_customers" 36 | name: "first_name" 37 | - name: "last_name" 38 | references: 39 | - project_name: "catalog" 40 | dataset_name: "jaffle_shop" 41 | table_name: "raw_customers" 42 | name: "last_name" 43 | - name: "full_name" 44 | references: 45 | - project_name: "catalog" 46 | dataset_name: "jaffle_shop" 47 | table_name: "raw_customers" 48 | name: "last_name" 49 | - project_name: "catalog" 50 | dataset_name: "jaffle_shop" 51 | table_name: "raw_customers" 52 | name: "first_name" 53 | type: "select" 54 | selected_tables: 55 | - "catalog.jaffle_shop.raw_customers" -------------------------------------------------------------------------------- /src/test/resources/schemas/bigquery_simple_all_types_table_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "myproject:dataset.table", 3 | "tableReference": { 4 | "projectId": "myproject", 5 | "datasetId": "dataset", 6 | "tableId": "table" 7 | }, 8 | "schema": { 9 | "fields": [ 10 | { 11 | "mode": "NULLABLE", 12 | "name": "afloat", 13 | "type": "FLOAT" 14 | }, 15 | { 16 | "mode": "NULLABLE", 17 | "name": "aString", 18 | "type": "STRING" 19 | }, 20 | { 21 | "mode": "NULLABLE", 22 | "name": "aInteger", 23 | "type": "INTEGER" 24 | }, 25 | { 26 | "mode": "NULLABLE", 27 | "name": "aBool", 28 | "type": "BOOLEAN" 29 | }, 30 | { 31 | "mode": "NULLABLE", 32 | "name": "aBytes", 33 | "type": "BYTES" 34 | }, 35 | { 36 | "mode": "NULLABLE", 37 | "name": "aNumeric", 38 | "type": "NUMERIC" 39 | }, 40 | { 41 | "mode": "NULLABLE", 42 | "name": "aTimestamp", 43 | "type": "TIMESTAMP" 44 | }, 45 | { 46 | "mode": "NULLABLE", 47 | "name": "aDate", 48 | "type": "DATE" 49 | }, 50 | { 51 | "mode": "NULLABLE", 52 | "name": "aTime", 53 | "type": "TIME" 54 | }, 55 | { 56 | "mode": "NULLABLE", 57 | "name": "aDateTime", 58 | "type": "DATETIME" 59 | }, 60 | { 61 | "mode": "NULLABLE", 62 | "name": "aGeoPoint", 63 | "type": "GEOGRAPHY" 64 | } 65 | ] 66 | }, 67 | "type": "TABLE" 68 | } 69 | -------------------------------------------------------------------------------- /src/test/java/examples/BigQuerySqlParserBQSchemaTest.java: -------------------------------------------------------------------------------- 1 | package examples; 2 | 3 | import com.borjav.data.model.ResolvedNodeExtended; 4 | import com.borjav.data.output.OutputLineage; 5 | import com.borjav.data.parser.ZetaSQLResolver; 6 | import com.borjav.data.service.BigQueryServiceFactory; 7 | import com.borjav.data.service.BigQueryTableLoadService; 8 | import com.borjav.data.service.BigQueryZetaSqlSchemaLoader; 9 | import org.junit.Test; 10 | import utils.TestResourceLoader; 11 | 12 | public class BigQuerySqlParserBQSchemaTest { 13 | 14 | 15 | @Test 16 | public void testEndToEnd() { 17 | // The default factory will try to connect to the real BigQuery API, which requires 18 | // the user to be authenticated: 19 | // $ gcloud auth application-default login 20 | BigQueryServiceFactory service = BigQueryServiceFactory.defaultFactory(); 21 | 22 | 
BigQueryZetaSqlSchemaLoader schemaLoader = 23 | new BigQueryZetaSqlSchemaLoader( 24 | BigQueryTableLoadService.usingServiceFactory(service)); 25 | ZetaSQLResolver parser = new ZetaSQLResolver(schemaLoader); 26 | 27 | String sql = TestResourceLoader.load("sql/benchmark/external/test.sql"); 28 | // String sql = """ 29 | // SELECT 30 | // word, 31 | // SUM(word_count) AS count 32 | // FROM 33 | // `bigquery-public-data.samples.shakespeare` 34 | // WHERE 35 | // word LIKE "%raisin%" 36 | // GROUP BY 37 | // word; 38 | // """; 39 | ResolvedNodeExtended table = parser.extractLineage(sql); 40 | OutputLineage printer = new OutputLineage(); 41 | System.out.println(printer.toYaml(table, "", true)); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/test/resources/sql/kitchen_sink_multiple_output_columns_with_alias.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | 3 | SELECT 4 | CONCAT(tA.colA, tB.colB) as joined_column, 5 | tA.colA as columnA 6 | FROM 7 | `project1.datasetA.TableA` as tA 8 | LEFT OUTER JOIN 9 | `project2.datasetB.TableB` as tB 10 | USING (colC) 11 | 12 | expected_output: 13 | name: "json_functions" 14 | output_columns: 15 | - name: "joined_column" 16 | references: 17 | - project_name: "project1" 18 | dataset_name: "datasetA" 19 | table_name: "TableA" 20 | name: "colA" 21 | - project_name: "project2" 22 | dataset_name: "datasetB" 23 | table_name: "TableB" 24 | name: "colB" 25 | - name: "columnA" 26 | references: 27 | - project_name: "project1" 28 | dataset_name: "datasetA" 29 | table_name: "TableA" 30 | name: "colA" 31 | joins: 32 | - join_type: "LEFT" 33 | left_columns: 34 | - name: "colC" 35 | references: 36 | - project_name: "project1" 37 | dataset_name: "datasetA" 38 | table_name: "TableA" 39 | name: "colC" 40 | used_for: 41 | - "JOIN_LEFT_TABLE" 42 | right_columns: 43 | - name: "colC" 44 | references: 45 | - project_name: "project2" 46 | dataset_name: "datasetB" 47 | table_name: "TableB" 48 | name: "colC" 49 | used_for: 50 | - "JOIN_RIGHT_TABLE" 51 | type: "select" 52 | selected_tables: 53 | - "project1.datasetA.TableA" 54 | - "project2.datasetB.TableB" 55 | 56 | 57 | -------------------------------------------------------------------------------- /src/test/java/extractor/BigQueryTableCreatorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package extractor; 18 | 19 | import static com.borjav.data.extractor.BigQueryTableCreator.fromLegacyTableName; 20 | import static com.borjav.data.extractor.BigQueryTableCreator.usingBestEffort; 21 | import static org.junit.Assert.assertEquals; 22 | 23 | import com.borjav.data.model.BigQueryTableEntity; 24 | import org.junit.Test; 25 | 26 | 27 | public final class BigQueryTableCreatorTest { 28 | 29 | @Test 30 | public void fromLegacyTableName_projectIdWithHyphens_valid() { 31 | assertEquals(fromLegacyTableName("column-lineage:temp.lineage"), 32 | BigQueryTableEntity.create( 33 | /*projectId=*/ "column-lineage", 34 | /*dataset=*/ "temp", 35 | /*table=*/ "lineage")); 36 | } 37 | 38 | @Test 39 | public void usingBestEffort_standardSqlName_valid() { 40 | assertEquals(usingBestEffort("column-lineage.temp.lineage"), 41 | BigQueryTableEntity.create( 42 | /*projectId=*/ "column-lineage", 43 | /*dataset=*/ "temp", 44 | /*table=*/ "lineage")); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/json_functions.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | with base as( 3 | SELECT 4 | JSON_EXTRACT(json_field, '$.class') AS class, 5 | JSON_EXTRACT_SCALAR(json_field, '$.class.other') AS class_other, 6 | JSON_EXTRACT_ARRAY(json_field, '$.class.nested_array') AS class_nested, 7 | id, 8 | FROM `catalog.jaffle_shop.json_table` 9 | ), 10 | redundant_cte AS ( 11 | select * from base 12 | ) 13 | 14 | SELECT 15 | id, 16 | class, 17 | class_other, 18 | class_nested[OFFSET(0)] as nested_classes_offset, 19 | nested_classes, 20 | from redundant_cte,unnest(class_nested) as nested_classes 21 | 22 | 23 | expected_output: 24 | name: "json_functions" 25 | output_columns: 26 | - name: "id" 27 | references: 28 | - project_name: "catalog" 29 | dataset_name: "jaffle_shop" 30 | table_name: "json_table" 31 | name: "id" 32 | - name: "class" 33 | references: 34 | - project_name: "catalog" 35 | dataset_name: "jaffle_shop" 36 | table_name: "json_table" 37 | name: "json_field" 38 | - name: "class_other" 39 | references: 40 | - project_name: "catalog" 41 | dataset_name: "jaffle_shop" 42 | table_name: "json_table" 43 | name: "json_field" 44 | - name: "nested_classes_offset" 45 | references: 46 | - project_name: "catalog" 47 | dataset_name: "jaffle_shop" 48 | table_name: "json_table" 49 | name: "json_field" 50 | - name: "nested_classes" 51 | references: 52 | - project_name: "catalog" 53 | dataset_name: "jaffle_shop" 54 | table_name: "json_table" 55 | name: "json_field" 56 | type: "select" 57 | selected_tables: 58 | - "catalog.jaffle_shop.json_table" 59 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/messy_struct.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | WITH two AS( 3 | 4 | select 5 | STRUCT( STRUCT(content.subnested_id.sub_b AS b, content.id AS content, "asdfs23ad" as literal2) AS 6 | nested_nes) test, 7 | from `catalog.jaffle_shop.struct_table` 8 | ), three as( 9 | select STRUCT(test) as testing2 from two 10 | 11 | ), four as ( 12 | SELECT STRUCT(testing2.test.nested_nes.b as ac, "asdfsad" as literal) AS messy_struct, 13 | LEAD(testing2.test.nested_nes.b) 14 | OVER (PARTITION BY testing2.test.nested_nes.content ORDER BY testing2.test.nested_nes.b ASC) AS 15 | next_name, 16 | 17 | FROM three 18 | WHERE testing2.test.nested_nes.b = "a" 19 | ) 20 | 
select * from four 21 | 22 | expected_output: 23 | name: "messy_struct" 24 | output_columns: 25 | - name: "messy_struct.ac" 26 | references: 27 | - project_name: "catalog" 28 | dataset_name: "jaffle_shop" 29 | table_name: "struct_table" 30 | name: "content.subnested_id.sub_b" 31 | - name: "messy_struct.literal" 32 | - name: "next_name" 33 | references: 34 | - project_name: "catalog" 35 | dataset_name: "jaffle_shop" 36 | table_name: "struct_table" 37 | name: "content.subnested_id.sub_b" 38 | - project_name: "catalog" 39 | dataset_name: "jaffle_shop" 40 | table_name: "struct_table" 41 | name: "content.id" 42 | filters: 43 | - name: "_testing2.test.nested_nes.b_" 44 | references: 45 | - project_name: "catalog" 46 | dataset_name: "jaffle_shop" 47 | table_name: "struct_table" 48 | name: "content.subnested_id.sub_b" 49 | used_for: 50 | - "FILTER" 51 | type: "select" 52 | selected_tables: 53 | - "catalog.jaffle_shop.struct_table" 54 | 55 | -------------------------------------------------------------------------------- /src/main/java/com/borjav/data/model/BigQueryTableEntity.java: -------------------------------------------------------------------------------- 1 | 2 | package com.borjav.data.model; 3 | 4 | import com.google.auto.value.AutoValue; 5 | 6 | import java.io.Serializable; 7 | 8 | /** 9 | * Value class to represent a BigQuery Table entity. 10 | */ 11 | @AutoValue 12 | public abstract class BigQueryTableEntity implements Serializable { 13 | 14 | public abstract String getProjectId(); 15 | 16 | public abstract String getDataset(); 17 | 18 | public abstract String getTable(); 19 | 20 | /** 21 | * Returns {@code true} if the table is a temporary table. 22 | *

It uses rule dataset name starts with '_' or the table name starts with '_' or 'anon'. 23 | */ 24 | public final boolean isTempTable() { 25 | return getDataset().startsWith("_") 26 | || getTable().startsWith("_") 27 | || getTable().startsWith("anon"); 28 | } 29 | 30 | public static Builder builder() { 31 | return new AutoValue_BigQueryTableEntity.Builder(); 32 | } 33 | 34 | public static BigQueryTableEntity create(String projectId, String dataset, String table) { 35 | return builder() 36 | .setProjectId(projectId) 37 | .setDataset(dataset) 38 | .setTable(table) 39 | .build(); 40 | } 41 | 42 | 43 | public String getLegacySqlName() { 44 | return String.format("%s:%s.%s", getProjectId(), getDataset(), getTable()); 45 | } 46 | 47 | public String getStandSqlName() { 48 | return String.format("%s.%s.%s", getProjectId(), getDataset(), getTable()); 49 | } 50 | 51 | @AutoValue.Builder 52 | public abstract static class Builder { 53 | 54 | public abstract Builder setProjectId(String projectId); 55 | 56 | public abstract Builder setDataset(String dataset); 57 | 58 | public abstract Builder setTable(String table); 59 | 60 | public abstract BigQueryTableEntity build(); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/pivot.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | with source as ( 3 | 4 | 5 | select * from `catalog.jaffle_shop.stg_payments` 6 | 7 | ) 8 | SELECT * FROM 9 | (SELECT payment_method,amount FROM source) 10 | PIVOT(SUM(amount) FOR payment_method IN ('card', 'cash', 'bank_transfer', 'other')) 11 | 12 | expected_output: 13 | name: "pivot" 14 | output_columns: 15 | - name: "card" 16 | references: 17 | - project_name: "catalog" 18 | dataset_name: "jaffle_shop" 19 | table_name: "stg_payments" 20 | name: "amount" 21 | - project_name: "catalog" 22 | dataset_name: "jaffle_shop" 23 | table_name: "stg_payments" 24 | name: "payment_method" 25 | - name: "cash" 26 | references: 27 | - project_name: "catalog" 28 | dataset_name: "jaffle_shop" 29 | table_name: "stg_payments" 30 | name: "amount" 31 | - project_name: "catalog" 32 | dataset_name: "jaffle_shop" 33 | table_name: "stg_payments" 34 | name: "payment_method" 35 | - name: "bank_transfer" 36 | references: 37 | - project_name: "catalog" 38 | dataset_name: "jaffle_shop" 39 | table_name: "stg_payments" 40 | name: "amount" 41 | - project_name: "catalog" 42 | dataset_name: "jaffle_shop" 43 | table_name: "stg_payments" 44 | name: "payment_method" 45 | - name: "other" 46 | references: 47 | - project_name: "catalog" 48 | dataset_name: "jaffle_shop" 49 | table_name: "stg_payments" 50 | name: "amount" 51 | - project_name: "catalog" 52 | dataset_name: "jaffle_shop" 53 | table_name: "stg_payments" 54 | name: "payment_method" 55 | type: "select" 56 | selected_tables: 57 | - "catalog.jaffle_shop.stg_payments" -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/json_functions_with_literals.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | with base as( 3 | SELECT 4 | JSON_EXTRACT(json_field, '$.class') AS class, 5 | JSON_EXTRACT_SCALAR(json_field, '$.class.other') AS class_other, 6 | JSON_EXTRACT_ARRAY(json_field, '$.class.nested_array') AS class_nested, 7 | id, 8 | FROM `catalog.jaffle_shop.json_table` 9 | ) 10 | 11 | SELECT 12 | id, 13 | class, 14 | class_other, 15 | nested_classes, 16 | from 
base,unnest(class_nested) as nested_classes 17 | 18 | 19 | expected_output: 20 | name: "json_functions" 21 | output_columns: 22 | - name: "id" 23 | references: 24 | - project_name: "catalog" 25 | dataset_name: "jaffle_shop" 26 | table_name: "json_table" 27 | name: "id" 28 | - name: "class" 29 | references: 30 | - table_name: "_literal_" 31 | name: "_literal_" 32 | literal_value: 33 | - "$.class" 34 | - project_name: "catalog" 35 | dataset_name: "jaffle_shop" 36 | table_name: "json_table" 37 | name: "json_field" 38 | - name: "class_other" 39 | references: 40 | - table_name: "_literal_" 41 | name: "_literal_" 42 | literal_value: 43 | - "$.class.other" 44 | - project_name: "catalog" 45 | dataset_name: "jaffle_shop" 46 | table_name: "json_table" 47 | name: "json_field" 48 | - name: "nested_classes" 49 | references: 50 | - table_name: "_literal_" 51 | name: "_literal_" 52 | literal_value: 53 | - "$.class.nested_array" 54 | - project_name: "catalog" 55 | dataset_name: "jaffle_shop" 56 | table_name: "json_table" 57 | name: "json_field" 58 | type: "select" 59 | selected_tables: 60 | - "catalog.jaffle_shop.json_table" -------------------------------------------------------------------------------- /src/test/resources/schemas/error_stats_table_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "bigquery#table", 3 | "etag": "67ikpVxNRNgZ/qN3lzqfag==", 4 | "id": "myproject:reporting.error_stats", 5 | "selfLink": "https://content-bigquery.googleapis.com/bigquery/v2/projects/myproject/datasets/reporting/tables/error_stats", 6 | "tableReference": { 7 | "projectId": "myproject", 8 | "datasetId": "reporting", 9 | "tableId": "error_stats" 10 | }, 11 | "schema": { 12 | "fields": [ 13 | { 14 | "name": "partner_id", 15 | "type": "INTEGER", 16 | "mode": "NULLABLE" 17 | }, 18 | { 19 | "name": "partner_name", 20 | "type": "STRING", 21 | "mode": "NULLABLE" 22 | }, 23 | { 24 | "name": "hit_timestamp", 25 | "type": "TIMESTAMP", 26 | "mode": "NULLABLE" 27 | }, 28 | { 29 | "name": "conversion_type", 30 | "type": "STRING", 31 | "mode": "NULLABLE" 32 | }, 33 | { 34 | "name": "num_hits", 35 | "type": "INTEGER", 36 | "mode": "NULLABLE" 37 | }, 38 | { 39 | "name": "status_200", 40 | "type": "INTEGER", 41 | "mode": "NULLABLE" 42 | }, 43 | { 44 | "name": "status_300", 45 | "type": "INTEGER", 46 | "mode": "NULLABLE" 47 | }, 48 | { 49 | "name": "status_400", 50 | "type": "INTEGER", 51 | "mode": "NULLABLE" 52 | }, 53 | { 54 | "name": "status_500", 55 | "type": "INTEGER", 56 | "mode": "NULLABLE" 57 | } 58 | ] 59 | }, 60 | "timePartitioning": { 61 | "type": "DAY" 62 | }, 63 | "numBytes": "338991985246", 64 | "numLongTermBytes": "217579338469", 65 | "numRows": "4647568034", 66 | "creationTime": "1546166683743", 67 | "lastModifiedTime": "1589587976402", 68 | "type": "TABLE", 69 | "location": "US" 70 | } 71 | -------------------------------------------------------------------------------- /src/test/resources/sql/kitchen_sink_multiple_output_columns_without_alias.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | #standardSQL 3 | SELECT 4 | CONCAT(tA.colA, tB.colB) as joined_column, 5 | tA.colA as columnA, 6 | tB.colB 7 | FROM 8 | `project1.datasetA.TableA` as tA 9 | LEFT OUTER JOIN 10 | `project2.datasetB.TableB` as tB 11 | USING (colC) 12 | 13 | expected_output: 14 | name: "json_functions" 15 | output_columns: 16 | - name: "joined_column" 17 | references: 18 | - project_name: "project1" 19 | dataset_name: "datasetA" 20 
| table_name: "TableA" 21 | name: "colA" 22 | - project_name: "project2" 23 | dataset_name: "datasetB" 24 | table_name: "TableB" 25 | name: "colB" 26 | - name: "columnA" 27 | references: 28 | - project_name: "project1" 29 | dataset_name: "datasetA" 30 | table_name: "TableA" 31 | name: "colA" 32 | - name: "colB" 33 | references: 34 | - project_name: "project2" 35 | dataset_name: "datasetB" 36 | table_name: "TableB" 37 | name: "colB" 38 | joins: 39 | - join_type: "LEFT" 40 | left_columns: 41 | - name: "colC" 42 | references: 43 | - project_name: "project1" 44 | dataset_name: "datasetA" 45 | table_name: "TableA" 46 | name: "colC" 47 | used_for: 48 | - "JOIN_LEFT_TABLE" 49 | right_columns: 50 | - name: "colC" 51 | references: 52 | - project_name: "project2" 53 | dataset_name: "datasetB" 54 | table_name: "TableB" 55 | name: "colC" 56 | used_for: 57 | - "JOIN_RIGHT_TABLE" 58 | type: "select" 59 | selected_tables: 60 | - "project1.datasetA.TableA" 61 | - "project2.datasetB.TableB" 62 | 63 | -------------------------------------------------------------------------------- /src/test/resources/schemas/CorePii_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "bigquery#table", 3 | "id": "demoProject:demo.CorePii", 4 | "selfLink": "https://content-bigquery.googleapis.com/bigquery/v2/projects/demoProject/datasets/demo/tables/CorePii", 5 | "tableReference": { 6 | "projectId": "demoProject", 7 | "datasetId": "demo", 8 | "tableId": "CorePii" 9 | }, 10 | "schema": { 11 | "fields": [ 12 | { 13 | "name": "partner_phone_number", 14 | "type": "STRING", 15 | "mode": "NULLABLE", 16 | "policyTags": { 17 | "names": [ 18 | "projects/GovernanceProject/locations/us/taxonomies/8150274556907504807/policyTags/1234" 19 | ] 20 | } 21 | }, 22 | { 23 | "name": "partner_id", 24 | "type": "INTEGER", 25 | "mode": "NULLABLE" 26 | }, 27 | { 28 | "name": "partner_name", 29 | "type": "STRING", 30 | "mode": "NULLABLE", 31 | "policyTags": { 32 | "names": [ 33 | "projects/GovernanceProject/locations/us/taxonomies/8150274556907504807/policyTags/7890" 34 | ] 35 | } 36 | }, 37 | { 38 | "name": "hit_timestamp", 39 | "type": "TIMESTAMP", 40 | "mode": "NULLABLE" 41 | }, 42 | { 43 | "name": "conversion_type", 44 | "type": "STRING", 45 | "mode": "NULLABLE" 46 | }, 47 | { 48 | "name": "num_products", 49 | "type": "INTEGER", 50 | "mode": "NULLABLE" 51 | }, 52 | { 53 | "name": "num_hits", 54 | "type": "INTEGER", 55 | "mode": "NULLABLE" 56 | }, 57 | { 58 | "name": "matched_tag_count", 59 | "type": "INTEGER", 60 | "mode": "NULLABLE" 61 | }, 62 | { 63 | "name": "is_matched_product", 64 | "type": "INTEGER", 65 | "mode": "NULLABLE" 66 | } 67 | ] 68 | }, 69 | "type": "TABLE" 70 | } 71 | -------------------------------------------------------------------------------- /src/test/resources/sql/bigquery_daily_report_error_stats_join_group_by_aggr_functions.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | #standardSQL 3 | SELECT 4 | report.partner_id, 5 | SUM(num_lines) AS total_lines, 6 | SUM(status_200) AS total_200 7 | FROM 8 | `myproject.reporting.daily_report` as report 9 | LEFT OUTER JOIN 10 | `myproject.reporting.error_stats` as stats 11 | USING 12 | (partner_id) 13 | GROUP BY 14 | partner_id 15 | 16 | expected_output: 17 | name: "json_functions" 18 | output_columns: 19 | - name: "partner_id" 20 | references: 21 | - project_name: "myproject" 22 | dataset_name: "reporting" 23 | table_name: "daily_report" 24 | name: "partner_id" 25 | - 
name: "total_lines" 26 | references: 27 | - project_name: "myproject" 28 | dataset_name: "reporting" 29 | table_name: "daily_report" 30 | name: "num_lines" 31 | - name: "total_200" 32 | references: 33 | - project_name: "myproject" 34 | dataset_name: "reporting" 35 | table_name: "error_stats" 36 | name: "status_200" 37 | joins: 38 | - join_type: "LEFT" 39 | left_columns: 40 | - name: "partner_id" 41 | references: 42 | - project_name: "myproject" 43 | dataset_name: "reporting" 44 | table_name: "daily_report" 45 | name: "partner_id" 46 | used_for: 47 | - "JOIN_LEFT_TABLE" 48 | right_columns: 49 | - name: "partner_id" 50 | references: 51 | - project_name: "myproject" 52 | dataset_name: "reporting" 53 | table_name: "error_stats" 54 | name: "partner_id" 55 | used_for: 56 | - "JOIN_RIGHT_TABLE" 57 | aggregations: 58 | - name: "_partner_id_" 59 | references: 60 | - project_name: "myproject" 61 | dataset_name: "reporting" 62 | table_name: "daily_report" 63 | name: "partner_id" 64 | used_for: 65 | - "GROUP_BY" 66 | type: "select" 67 | selected_tables: 68 | - "myproject.reporting.error_stats" 69 | - "myproject.reporting.daily_report" 70 | 71 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/timestamps.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | SELECT TIMESTAMP_TRUNC(TIMESTAMP(PARSE_DATETIME('%Y%m%d',_TABLE_SUFFIX)), MINUTE) time, 3 | CONCAT(id, '-', other_id) as metric, 4 | value, 5 | PARSE_DATETIME('%Y%m%d',_TABLE_SUFFIX) + INTERVAL 1 DAY AS next_day, 6 | FROM `catalog.jaffle_shop.timestamps` 7 | WHERE source = "blabla" 8 | AND _TABLE_SUFFIX BETWEEN FORMAT_DATE("%Y%m%d", DATE_SUB(CURRENT_DATE(), INTERVAL 14 DAY)) AND FORMAT_DATE("%Y%m%d", CURRENT_DATE()) 9 | ORDER BY 1 10 | 11 | expected_output: 12 | name: "timestamps" 13 | output_columns: 14 | - name: "time" 15 | references: 16 | - project_name: "catalog" 17 | dataset_name: "jaffle_shop" 18 | table_name: "timestamps" 19 | name: "_TABLE_SUFFIX" 20 | - name: "metric" 21 | references: 22 | - project_name: "catalog" 23 | dataset_name: "jaffle_shop" 24 | table_name: "timestamps" 25 | name: "id" 26 | - project_name: "catalog" 27 | dataset_name: "jaffle_shop" 28 | table_name: "timestamps" 29 | name: "other_id" 30 | - name: "value" 31 | references: 32 | - project_name: "catalog" 33 | dataset_name: "jaffle_shop" 34 | table_name: "timestamps" 35 | name: "value" 36 | - name: "next_day" 37 | references: 38 | - project_name: "catalog" 39 | dataset_name: "jaffle_shop" 40 | table_name: "timestamps" 41 | name: "_TABLE_SUFFIX" 42 | filters: 43 | - name: "_source_" 44 | references: 45 | - project_name: "catalog" 46 | dataset_name: "jaffle_shop" 47 | table_name: "timestamps" 48 | name: "source" 49 | used_for: 50 | - "FILTER" 51 | - name: "__TABLE_SUFFIX_" 52 | references: 53 | - project_name: "catalog" 54 | dataset_name: "jaffle_shop" 55 | table_name: "timestamps" 56 | name: "_TABLE_SUFFIX" 57 | used_for: 58 | - "FILTER" 59 | other_used_columns: 60 | - name: "_time_" 61 | references: 62 | - project_name: "catalog" 63 | dataset_name: "jaffle_shop" 64 | table_name: "timestamps" 65 | name: "_TABLE_SUFFIX" 66 | used_for: 67 | - "ORDER_BY" 68 | type: "select" 69 | selected_tables: 70 | - "catalog.jaffle_shop.timestamps" 71 | -------------------------------------------------------------------------------- /src/test/resources/schemas/bigquery_demo_transactions_schema.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "kind": "bigquery#table", 3 | "etag": "I+jiUp/Ro6l5L+O7Jvm1Hw==", 4 | "id": "myproject:demo.transactions", 5 | "selfLink": "https://content-bigquery.googleapis.com/bigquery/v2/projects/myproject/datasets/demo/tables/transactions", 6 | "tableReference": { 7 | "projectId": "myproject", 8 | "datasetId": "demo", 9 | "tableId": "transactions" 10 | }, 11 | "schema": { 12 | "fields": [ 13 | { 14 | "name": "timestamp", 15 | "type": "TIMESTAMP", 16 | "mode": "NULLABLE" 17 | }, 18 | { 19 | "name": "partner_id", 20 | "type": "STRING", 21 | "mode": "NULLABLE" 22 | }, 23 | { 24 | "name": "advertiser_id", 25 | "type": "STRING", 26 | "mode": "NULLABLE" 27 | }, 28 | { 29 | "name": "event_name", 30 | "type": "STRING", 31 | "mode": "NULLABLE" 32 | }, 33 | { 34 | "name": "user_id", 35 | "type": "STRING", 36 | "mode": "NULLABLE" 37 | }, 38 | { 39 | "name": "user_id_type", 40 | "type": "STRING", 41 | "mode": "NULLABLE" 42 | }, 43 | { 44 | "name": "order_id", 45 | "type": "STRING", 46 | "mode": "NULLABLE" 47 | }, 48 | { 49 | "name": "products", 50 | "type": "RECORD", 51 | "mode": "REPEATED", 52 | "fields": [ 53 | { 54 | "name": "id", 55 | "type": "STRING", 56 | "mode": "NULLABLE" 57 | }, 58 | { 59 | "name": "name", 60 | "type": "STRING", 61 | "mode": "NULLABLE" 62 | }, 63 | { 64 | "name": "seller", 65 | "type": "STRING", 66 | "mode": "NULLABLE" 67 | }, 68 | { 69 | "name": "quantity", 70 | "type": "INTEGER", 71 | "mode": "NULLABLE" 72 | }, 73 | { 74 | "name": "value", 75 | "type": "FLOAT", 76 | "mode": "NULLABLE" 77 | }, 78 | { 79 | "name": "currency", 80 | "type": "STRING", 81 | "mode": "NULLABLE" 82 | } 83 | ] 84 | } 85 | ] 86 | }, 87 | "numBytes": "2996364119", 88 | "numLongTermBytes": "0", 89 | "numRows": "13771407", 90 | "creationTime": "1587043107502", 91 | "lastModifiedTime": "1587043928108", 92 | "type": "TABLE", 93 | "location": "US" 94 | } 95 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/messy_unnesting.yaml: -------------------------------------------------------------------------------- 1 | 2 | query: | 3 | WITH base AS( 4 | SELECT 5 | SUM(( 6 | SELECT 7 | SUM(c.amount) 8 | FROM 9 | UNNEST(nested_ids) c)) AS net_cost 10 | FROM `catalog.jaffle_shop.nested_table` 11 | 12 | ), 13 | 14 | weird AS( 15 | SELECT 16 | * 17 | FROM `catalog.jaffle_shop.nested_table` LEFT JOIN UNNEST(nested_ids) c 18 | ), join_random AS( 19 | 20 | select * from base 21 | FULL outer join weird on base.net_cost = weird.amount 22 | ) 23 | select 24 | STRUCT(net_cost AS net_cost, STRUCT (first_name as first_name, last_name as last_name, COALESCE 25 | (last_name,first_name) AS full_name) AS random_struct) AS key 26 | from join_random 27 | 28 | expected_output: 29 | name: "messy_unnesting" 30 | output_columns: 31 | - name: "key.net_cost" 32 | references: 33 | - project_name: "catalog" 34 | dataset_name: "jaffle_shop" 35 | table_name: "nested_table" 36 | name: "nested_ids.amount" 37 | - name: "key.random_struct.first_name" 38 | references: 39 | - project_name: "catalog" 40 | dataset_name: "jaffle_shop" 41 | table_name: "nested_table" 42 | name: "first_name" 43 | - name: "key.random_struct.last_name" 44 | references: 45 | - project_name: "catalog" 46 | dataset_name: "jaffle_shop" 47 | table_name: "nested_table" 48 | name: "last_name" 49 | - name: "key.random_struct.full_name" 50 | references: 51 | - project_name: "catalog" 52 | dataset_name: "jaffle_shop" 53 | table_name: 
"nested_table" 54 | name: "first_name" 55 | - project_name: "catalog" 56 | dataset_name: "jaffle_shop" 57 | table_name: "nested_table" 58 | name: "last_name" 59 | joins: 60 | - join_type: "FULL" 61 | left_columns: 62 | - name: "net_cost" 63 | references: 64 | - project_name: "catalog" 65 | dataset_name: "jaffle_shop" 66 | table_name: "nested_table" 67 | name: "nested_ids.amount" 68 | used_for: 69 | - "JOIN_LEFT_TABLE" 70 | right_columns: 71 | - name: "amount" 72 | references: 73 | - project_name: "catalog" 74 | dataset_name: "jaffle_shop" 75 | table_name: "nested_table" 76 | name: "nested_ids.amount" 77 | used_for: 78 | - "JOIN_RIGHT_TABLE" 79 | type: "select" 80 | selected_tables: 81 | - "catalog.jaffle_shop.nested_table" -------------------------------------------------------------------------------- /src/test/java/utils/GoogleTypesToJsonConverter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package utils; 18 | 19 | import static com.google.common.collect.ImmutableList.toImmutableList; 20 | 21 | import com.google.api.client.json.JsonGenerator; 22 | import com.google.api.client.json.JsonParser; 23 | import com.google.api.client.json.gson.GsonFactory; 24 | import com.google.common.collect.ImmutableList; 25 | import java.io.IOException; 26 | import java.io.StringWriter; 27 | import java.util.Arrays; 28 | import java.util.Collection; 29 | import java.util.Objects; 30 | 31 | public final class GoogleTypesToJsonConverter { 32 | 33 | public static T convertFromJson(Class clazz, String typeJson) { 34 | try (JsonParser parser = new GsonFactory().createJsonParser(typeJson)) { 35 | return parser.parse(clazz); 36 | } catch (IOException jsonProcessingException) { 37 | return null; 38 | } 39 | } 40 | 41 | public static ImmutableList convertFromJson(Class clazz, String... typeJsons) { 42 | return convertFromJson(clazz, Arrays.asList(typeJsons)); 43 | } 44 | 45 | public static ImmutableList convertFromJson(Class clazz, Collection typeJsons) { 46 | return typeJsons.stream() 47 | .map(GoogleTypesToJsonConverter::convertToJson) 48 | .map(item -> convertFromJson(clazz, item)) 49 | .filter(Objects::nonNull) 50 | .collect(toImmutableList()); 51 | } 52 | 53 | public static String convertToJson(T item) { 54 | if (item instanceof String) { 55 | return item.toString(); 56 | } 57 | 58 | StringWriter stringWriter = new StringWriter(); 59 | try (JsonGenerator jsonGenerator = new GsonFactory().createJsonGenerator(stringWriter)) { 60 | // Output is primarily used for testing and hence enable pretty printing. 
61 | jsonGenerator.enablePrettyPrint(); 62 | jsonGenerator.serialize(item); 63 | jsonGenerator.flush(); 64 | return stringWriter.toString(); 65 | } catch (IOException ioException) { 66 | return ""; 67 | } 68 | } 69 | 70 | private GoogleTypesToJsonConverter() { 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/test/resources/schemas/simple_daily_report_table_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "bigquery#table", 3 | "etag": "btl8vEprHYEpyiYwf/WFQw==", 4 | "id": "myproject:reporting.daily_report", 5 | "selfLink": "https://content-bigquery.googleapis.com/bigquery/v2/projects/myproject/datasets/reporting/tables/daily_report", 6 | "tableReference": { 7 | "projectId": "myproject", 8 | "datasetId": "reporting", 9 | "tableId": "daily_report" 10 | }, 11 | "schema": { 12 | "fields": [ 13 | { 14 | "name": "hit_timestamp", 15 | "type": "TIMESTAMP", 16 | "mode": "NULLABLE" 17 | }, 18 | { 19 | "name": "partner_id", 20 | "type": "INTEGER", 21 | "mode": "NULLABLE" 22 | }, 23 | { 24 | "name": "partner_name", 25 | "type": "STRING", 26 | "mode": "NULLABLE" 27 | }, 28 | { 29 | "name": "products", 30 | "type": "RECORD", 31 | "mode": "REPEATED", 32 | "fields": [ 33 | { 34 | "name": "jsonError", 35 | "type": "STRING", 36 | "mode": "NULLABLE" 37 | }, 38 | { 39 | "name": "product", 40 | "type": "RECORD", 41 | "mode": "NULLABLE", 42 | "fields": [ 43 | { 44 | "name": "id", 45 | "type": "STRING", 46 | "mode": "NULLABLE" 47 | }, 48 | { 49 | "name": "name", 50 | "type": "STRING", 51 | "mode": "NULLABLE" 52 | }, 53 | { 54 | "name": "seller", 55 | "type": "STRING", 56 | "mode": "NULLABLE" 57 | }, 58 | { 59 | "name": "quantity", 60 | "type": "STRING", 61 | "mode": "NULLABLE" 62 | }, 63 | { 64 | "name": "value", 65 | "type": "STRING", 66 | "mode": "NULLABLE" 67 | }, 68 | { 69 | "name": "currency", 70 | "type": "STRING", 71 | "mode": "NULLABLE" 72 | } 73 | ] 74 | } 75 | ] 76 | }, 77 | { 78 | "name": "latency", 79 | "type": "FLOAT", 80 | "mode": "NULLABLE" 81 | }, 82 | { 83 | "name": "is_ok", 84 | "type": "BOOLEAN", 85 | "mode": "NULLABLE" 86 | } 87 | ] 88 | }, 89 | "timePartitioning": { 90 | "type": "DAY" 91 | }, 92 | "numBytes": "4243039242539", 93 | "numLongTermBytes": "2756175095381", 94 | "numRows": "4688903566", 95 | "creationTime": "1544756481320", 96 | "lastModifiedTime": "1589501539422", 97 | "type": "TABLE", 98 | "location": "US" 99 | } 100 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/unnest.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | WITH 3 | mapping AS ( 4 | SELECT 5 | id record_id, 6 | unnested_id.id as unnested_id 7 | FROM 8 | `catalog.jaffle_shop.nested_table`, 9 | UNNEST(nested_ids) unnested_id ), 10 | mapping_2 AS ( 11 | SELECT 12 | *, 13 | RIGHT(content.id, 1) unnested_id 14 | FROM 15 | `catalog.jaffle_shop.struct_table`), final AS( 16 | SELECT 17 | mapping.record_id AS record_id, 18 | ARRAY_AGG(DISTINCT content.id ) AS agg_content, 19 | count( distinct content.id) AS count_content 20 | FROM 21 | mapping 22 | LEFT JOIN 23 | mapping_2 24 | USING 25 | (unnested_id) 26 | GROUP BY 1 27 | ORDER BY 3 desc 28 | ) 29 | 30 | SELECT 31 | * 32 | FROM final 33 | 34 | 35 | expected_output: 36 | name: "unnest" 37 | output_columns: 38 | - name: "record_id" 39 | references: 40 | - project_name: "catalog" 41 | dataset_name: "jaffle_shop" 42 | table_name: 
"nested_table" 43 | name: "nested_ids.id" 44 | - name: "agg_content" 45 | references: 46 | - project_name: "catalog" 47 | dataset_name: "jaffle_shop" 48 | table_name: "struct_table" 49 | name: "content.id" 50 | - name: "count_content" 51 | references: 52 | - project_name: "catalog" 53 | dataset_name: "jaffle_shop" 54 | table_name: "struct_table" 55 | name: "content.id" 56 | joins: 57 | - join_type: "LEFT" 58 | left_columns: 59 | - name: "unnested_id" 60 | references: 61 | - project_name: "catalog" 62 | dataset_name: "jaffle_shop" 63 | table_name: "nested_table" 64 | name: "nested_ids.id" 65 | used_for: 66 | - "JOIN_LEFT_TABLE" 67 | right_columns: 68 | - name: "unnested_id" 69 | references: 70 | - project_name: "catalog" 71 | dataset_name: "jaffle_shop" 72 | table_name: "struct_table" 73 | name: "content.id" 74 | used_for: 75 | - "JOIN_RIGHT_TABLE" 76 | aggregations: 77 | - name: "_record_id_" 78 | references: 79 | - project_name: "catalog" 80 | dataset_name: "jaffle_shop" 81 | table_name: "nested_table" 82 | name: "nested_ids.id" 83 | used_for: 84 | - "GROUP_BY" 85 | other_used_columns: 86 | - name: "_count_content_" 87 | references: 88 | - project_name: "catalog" 89 | dataset_name: "jaffle_shop" 90 | table_name: "struct_table" 91 | name: "content.id" 92 | used_for: 93 | - "ORDER_BY" 94 | type: "select" 95 | selected_tables: 96 | - "catalog.jaffle_shop.struct_table" 97 | - "catalog.jaffle_shop.nested_table" 98 | -------------------------------------------------------------------------------- /src/main/java/com/borjav/data/model/ResolvedColumnExtended.java: -------------------------------------------------------------------------------- 1 | package com.borjav.data.model; 2 | 3 | import com.google.zetasql.Type; 4 | import com.google.zetasql.resolvedast.ResolvedColumn; 5 | 6 | import com.google.zetasql.resolvedast.ResolvedJoinScanEnums; 7 | import java.util.ArrayList; 8 | import java.util.HashSet; 9 | import java.util.LinkedList; 10 | import java.util.List; 11 | 12 | 13 | public class ResolvedColumnExtended { 14 | 15 | public enum EXTRACOLUMNS { FILTER, GROUP_BY, ORDER_BY, JOIN_LEFT_TABLE, JOIN_RIGHT_TABLE, 16 | PARTITION_BY_ANALYTIC_FUNCTION,ORDER_BY_ANALYTIC_FUNCTION} 17 | 18 | 19 | public String name; 20 | public String tableName; 21 | public Long indexOriginalTable; 22 | public Long resolvedIndex; 23 | public List columnsReferenced = new ArrayList<>(); 24 | public HashSet usedFor; 25 | public ResolvedJoinScanEnums.JoinType joinType; 26 | public Type type; 27 | public String literalValue; 28 | 29 | // used to keep track of the elements in the list when accessing getStructResolved. 
30 | // it works as a queue 31 | public LinkedList indexStructList = new LinkedList<>(); 32 | // This keeps track of what was the original source of the column when it comes from a struct 33 | // or an array 34 | public LinkedList originalIndexStructList = new LinkedList<>(); 35 | 36 | // this keeps track of whats the new position when a field has been converted into a struct 37 | public LinkedList makeStructIndex = new LinkedList<>(); 38 | 39 | public ResolvedColumnExtended(ResolvedColumn resolvedColumn, Long indexOriginalTable) { 40 | this.name = resolvedColumn.getName(); 41 | this.tableName = resolvedColumn.getTableName(); 42 | this.resolvedIndex = resolvedColumn.getId(); 43 | this.indexOriginalTable = indexOriginalTable; 44 | this.type = resolvedColumn.getType(); 45 | this.usedFor = new HashSet<>(); 46 | this.joinType = null; 47 | } 48 | 49 | 50 | public ResolvedColumnExtended(String name, String tableName, Long indexOriginalTable, 51 | Long resolvedIndex, String literalValue) { 52 | this.name = name; 53 | this.tableName = tableName; 54 | this.indexOriginalTable = indexOriginalTable; 55 | this.resolvedIndex = resolvedIndex; 56 | this.type= null; 57 | this.literalValue = literalValue; 58 | this.usedFor = new HashSet<>(); 59 | this.joinType = null; 60 | } 61 | 62 | public ResolvedColumnExtended(String name, String tableName, Long indexOriginalTable, 63 | Long resolvedIndex, Type type, Long resolvedSubIndex, 64 | HashSet usedFor, ResolvedJoinScanEnums.JoinType joinType) { 65 | this.name = name; 66 | this.tableName = tableName; 67 | this.indexOriginalTable = indexOriginalTable; 68 | this.resolvedIndex = resolvedIndex; 69 | this.type= type; 70 | this.usedFor = usedFor; 71 | this.joinType = joinType; 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/udf.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | CREATE TEMP FUNCTION 3 | decode(base STRING) 4 | RETURNS STRING 5 | LANGUAGE js AS """ 6 | placeholder = '0000000000000000000000'; 7 | """ OPTIONS ( library=[ "..." 
] ); 8 | WITH 9 | mapping AS ( 10 | SELECT 11 | id record_id, 12 | unnested_id.id as unnested_id 13 | FROM 14 | `catalog.jaffle_shop.nested_table`, 15 | UNNEST(nested_ids) unnested_id ), 16 | mapping_2 AS ( 17 | SELECT 18 | *, 19 | decode(RIGHT(content.id, 1)) unnested_id 20 | FROM 21 | `catalog.jaffle_shop.struct_table`) 22 | SELECT 23 | mapping.record_id AS record_id, 24 | ARRAY_AGG(DISTINCT content.id ) AS agg_content, 25 | count( distinct content.id) AS count_content 26 | FROM 27 | mapping 28 | LEFT JOIN 29 | mapping_2 30 | USING 31 | (unnested_id) 32 | GROUP BY 1 33 | ORDER BY 3 desc 34 | 35 | expected_output: 36 | name: "unnest" 37 | output_columns: 38 | - name: "record_id" 39 | references: 40 | - project_name: "catalog" 41 | dataset_name: "jaffle_shop" 42 | table_name: "nested_table" 43 | name: "nested_ids.id" 44 | - name: "agg_content" 45 | references: 46 | - project_name: "catalog" 47 | dataset_name: "jaffle_shop" 48 | table_name: "struct_table" 49 | name: "content.id" 50 | - name: "count_content" 51 | references: 52 | - project_name: "catalog" 53 | dataset_name: "jaffle_shop" 54 | table_name: "struct_table" 55 | name: "content.id" 56 | joins: 57 | - join_type: "LEFT" 58 | left_columns: 59 | - name: "unnested_id" 60 | references: 61 | - project_name: "catalog" 62 | dataset_name: "jaffle_shop" 63 | table_name: "nested_table" 64 | name: "nested_ids.id" 65 | used_for: 66 | - "JOIN_LEFT_TABLE" 67 | right_columns: 68 | - name: "unnested_id" 69 | references: 70 | - project_name: "catalog" 71 | dataset_name: "jaffle_shop" 72 | table_name: "struct_table" 73 | name: "content.id" 74 | used_for: 75 | - "JOIN_RIGHT_TABLE" 76 | aggregations: 77 | - name: "_record_id_" 78 | references: 79 | - project_name: "catalog" 80 | dataset_name: "jaffle_shop" 81 | table_name: "nested_table" 82 | name: "nested_ids.id" 83 | used_for: 84 | - "GROUP_BY" 85 | other_used_columns: 86 | - name: "_count_content_" 87 | references: 88 | - project_name: "catalog" 89 | dataset_name: "jaffle_shop" 90 | table_name: "struct_table" 91 | name: "content.id" 92 | used_for: 93 | - "ORDER_BY" 94 | type: "select" 95 | selected_tables: 96 | - "catalog.jaffle_shop.struct_table" 97 | - "catalog.jaffle_shop.nested_table" 98 | 99 | -------------------------------------------------------------------------------- /src/main/java/com/borjav/data/utils/UtilsParser.java: -------------------------------------------------------------------------------- 1 | package com.borjav.data.utils; 2 | 3 | import com.google.api.client.util.Data; 4 | import com.google.api.services.bigquery.model.TableCell; 5 | import com.google.zetasql.resolvedast.ResolvedNodes; 6 | import java.io.File; 7 | import java.io.IOException; 8 | import java.nio.charset.Charset; 9 | import java.nio.file.Files; 10 | import java.nio.file.Paths; 11 | 12 | public class UtilsParser { 13 | 14 | public static String readFile(File path, Charset encoding) 15 | throws IOException { 16 | byte[] encoded = Files.readAllBytes(Paths.get(path.toURI())); 17 | return new String(encoded, encoding); 18 | } 19 | 20 | public static String readFile(String path, Charset encoding) 21 | throws IOException { 22 | byte[] encoded = Files.readAllBytes(Paths.get(path)); 23 | return new String(encoded, encoding); 24 | } 25 | 26 | public static String removeComments(String sql) { 27 | String pattern = "-{2,}.*"; 28 | return sql.replaceAll(pattern, ""); 29 | } 30 | 31 | 32 | public static String getStringValue(TableCell cell) { 33 | return Data.isNull(cell.getV()) ? 
null : cell.getV().toString(); 34 | } 35 | 36 | public static File createTempDirectory() 37 | throws IOException { 38 | final File temp; 39 | 40 | temp = File.createTempFile("temp", Long.toString(System.nanoTime())); 41 | 42 | if (!(temp.delete())) { 43 | throw new IOException("Could not delete temp file: " + temp.getAbsolutePath()); 44 | } 45 | 46 | if (!(temp.mkdir())) { 47 | throw new IOException("Could not create temp directory: " + temp.getAbsolutePath()); 48 | } 49 | 50 | return (temp); 51 | } 52 | 53 | public static String getLiteral(ResolvedNodes.ResolvedLiteral node) { 54 | try { 55 | switch (node.getType().getKind()) { 56 | case TYPE_STRING: 57 | return node.getValue().getStringValue(); 58 | case TYPE_BYTES: 59 | return node.getValue().getBytesValue().toStringUtf8(); 60 | case TYPE_INT64: 61 | return String.valueOf(node.getValue().getInt64Value()); 62 | case TYPE_FLOAT: 63 | return String.valueOf(node.getValue().getFloatValue()); 64 | case TYPE_DOUBLE: 65 | return String.valueOf(node.getValue().getDoubleValue()); 66 | case TYPE_NUMERIC: 67 | return node.getValue().getNumericValue().toString(); 68 | case TYPE_BOOL: 69 | return node.getValue().getBoolValue() ? "true" : "false"; 70 | case TYPE_TIMESTAMP: 71 | return String.valueOf(node.getValue().getTimestampUnixMicros()); 72 | case TYPE_DATE: 73 | return String.valueOf(node.getValue().getDateValue()); 74 | case TYPE_TIME: 75 | return String.valueOf(node.getValue().getTimeValue()); 76 | case TYPE_DATETIME: 77 | return String.valueOf(node.getValue().getDatetimeValue()); 78 | case TYPE_BIGNUMERIC: 79 | return String.valueOf(node.getValue().getBigNumericValue().floatValue()); 80 | case TYPE_JSON: 81 | return node.getValue().getJsonValue(); 82 | default: 83 | return node.getValue().getProto().toString(); 84 | } 85 | } catch (Exception e) { 86 | return node.getValue().getProto().toString(); 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/parameter.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | WITH 3 | mapping AS ( 4 | SELECT 5 | id record_id, 6 | unnested_id.id as unnested_id 7 | FROM 8 | `catalog.jaffle_shop.nested_table`, 9 | UNNEST(nested_ids) unnested_id ), 10 | mapping_2 AS ( 11 | SELECT 12 | *, 13 | RIGHT(content.id, 1) unnested_id 14 | FROM 15 | `catalog.jaffle_shop.struct_table`) 16 | SELECT 17 | mapping.record_id AS record_id, 18 | ARRAY_AGG(DISTINCT content.id ) AS agg_content, 19 | count( distinct content.id) AS count_content 20 | FROM 21 | mapping 22 | LEFT JOIN 23 | mapping_2 24 | USING 25 | (unnested_id) 26 | GROUP BY 1 27 | HAVING count( distinct content.id) > @PARAMETER 28 | ORDER BY 3 desc 29 | 30 | 31 | 32 | expected_output: 33 | name: "unnest" 34 | output_columns: 35 | - name: "record_id" 36 | references: 37 | - project_name: "catalog" 38 | dataset_name: "jaffle_shop" 39 | table_name: "nested_table" 40 | name: "nested_ids.id" 41 | - name: "agg_content" 42 | references: 43 | - project_name: "catalog" 44 | dataset_name: "jaffle_shop" 45 | table_name: "struct_table" 46 | name: "content.id" 47 | - name: "count_content" 48 | references: 49 | - project_name: "catalog" 50 | dataset_name: "jaffle_shop" 51 | table_name: "struct_table" 52 | name: "content.id" 53 | joins: 54 | - join_type: "LEFT" 55 | left_columns: 56 | - name: "unnested_id" 57 | references: 58 | - project_name: "catalog" 59 | dataset_name: "jaffle_shop" 60 | table_name: "nested_table" 61 | name: "nested_ids.id" 62 
| used_for: 63 | - "JOIN_LEFT_TABLE" 64 | right_columns: 65 | - name: "unnested_id" 66 | references: 67 | - project_name: "catalog" 68 | dataset_name: "jaffle_shop" 69 | table_name: "struct_table" 70 | name: "content.id" 71 | used_for: 72 | - "JOIN_RIGHT_TABLE" 73 | filters: 74 | - name: "_$agg3_" 75 | references: 76 | - project_name: "catalog" 77 | dataset_name: "jaffle_shop" 78 | table_name: "struct_table" 79 | name: "content.id" 80 | used_for: 81 | - "FILTER" 82 | aggregations: 83 | - name: "_record_id_" 84 | references: 85 | - project_name: "catalog" 86 | dataset_name: "jaffle_shop" 87 | table_name: "nested_table" 88 | name: "nested_ids.id" 89 | used_for: 90 | - "GROUP_BY" 91 | other_used_columns: 92 | - name: "_count_content_" 93 | references: 94 | - project_name: "catalog" 95 | dataset_name: "jaffle_shop" 96 | table_name: "struct_table" 97 | name: "content.id" 98 | used_for: 99 | - "ORDER_BY" 100 | type: "select" 101 | selected_tables: 102 | - "catalog.jaffle_shop.struct_table" 103 | - "catalog.jaffle_shop.nested_table" -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/unnest_create.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | CREATE TEMP FUNCTION 3 | decode(base STRING) 4 | RETURNS STRING 5 | LANGUAGE js AS """ 6 | placeholder = '0000000000000000000000'; 7 | """ OPTIONS ( library=[ "..." ] ); 8 | 9 | CREATE TABLE temp.table AS( 10 | WITH 11 | mapping AS ( 12 | SELECT 13 | id record_id, 14 | unnested_id.id as unnested_id 15 | FROM 16 | `catalog.jaffle_shop.nested_table`, 17 | UNNEST(nested_ids) unnested_id ), 18 | mapping_2 AS ( 19 | SELECT 20 | *, 21 | decode(RIGHT(content.id, 1)) unnested_id 22 | FROM 23 | `catalog.jaffle_shop.struct_table`) 24 | SELECT 25 | mapping.record_id AS record_id, 26 | ARRAY_AGG(DISTINCT content.id ) AS agg_content, 27 | count( distinct content.id) AS count_content 28 | FROM 29 | mapping 30 | LEFT JOIN 31 | mapping_2 32 | USING 33 | (unnested_id) 34 | GROUP BY 1 35 | ORDER BY 3 desc 36 | ) 37 | 38 | expected_output: 39 | name: "unnest_create" 40 | output_columns: 41 | - name: "record_id" 42 | references: 43 | - project_name: "catalog" 44 | dataset_name: "jaffle_shop" 45 | table_name: "nested_table" 46 | name: "nested_ids.id" 47 | - name: "agg_content" 48 | references: 49 | - project_name: "catalog" 50 | dataset_name: "jaffle_shop" 51 | table_name: "struct_table" 52 | name: "content.id" 53 | - name: "count_content" 54 | references: 55 | - project_name: "catalog" 56 | dataset_name: "jaffle_shop" 57 | table_name: "struct_table" 58 | name: "content.id" 59 | joins: 60 | - join_type: "LEFT" 61 | left_columns: 62 | - name: "unnested_id" 63 | references: 64 | - project_name: "catalog" 65 | dataset_name: "jaffle_shop" 66 | table_name: "nested_table" 67 | name: "nested_ids.id" 68 | used_for: 69 | - "JOIN_LEFT_TABLE" 70 | right_columns: 71 | - name: "unnested_id" 72 | references: 73 | - project_name: "catalog" 74 | dataset_name: "jaffle_shop" 75 | table_name: "struct_table" 76 | name: "content.id" 77 | used_for: 78 | - "JOIN_RIGHT_TABLE" 79 | aggregations: 80 | - name: "_record_id_" 81 | references: 82 | - project_name: "catalog" 83 | dataset_name: "jaffle_shop" 84 | table_name: "nested_table" 85 | name: "nested_ids.id" 86 | used_for: 87 | - "GROUP_BY" 88 | other_used_columns: 89 | - name: "_count_content_" 90 | references: 91 | - project_name: "catalog" 92 | dataset_name: "jaffle_shop" 93 | table_name: "struct_table" 94 | name: "content.id" 95 
| used_for: 96 | - "ORDER_BY" 97 | type: "create" 98 | selected_tables: 99 | - "catalog.jaffle_shop.struct_table" 100 | - "catalog.jaffle_shop.nested_table" 101 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/unnest_create_view.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | CREATE TEMP FUNCTION 3 | decode(base STRING) 4 | RETURNS STRING 5 | LANGUAGE js AS """ 6 | placeholder = '0000000000000000000000'; 7 | """ OPTIONS ( library=[ "..." ] ); 8 | 9 | CREATE TABLE temp.table AS( 10 | WITH 11 | mapping AS ( 12 | SELECT 13 | id record_id, 14 | unnested_id.id as unnested_id 15 | FROM 16 | `catalog.jaffle_shop.nested_table`, 17 | UNNEST(nested_ids) unnested_id ), 18 | mapping_2 AS ( 19 | SELECT 20 | *, 21 | decode(RIGHT(content.id, 1)) unnested_id 22 | FROM 23 | `catalog.jaffle_shop.struct_table`) 24 | SELECT 25 | mapping.record_id AS record_id, 26 | ARRAY_AGG(DISTINCT content.id ) AS agg_content, 27 | count( distinct content.id) AS count_content 28 | FROM 29 | mapping 30 | LEFT JOIN 31 | mapping_2 32 | USING 33 | (unnested_id) 34 | GROUP BY 1 35 | ORDER BY 3 desc 36 | ) 37 | 38 | expected_output: 39 | name: "unnest_create_view" 40 | output_columns: 41 | - name: "record_id" 42 | references: 43 | - project_name: "catalog" 44 | dataset_name: "jaffle_shop" 45 | table_name: "nested_table" 46 | name: "nested_ids.id" 47 | - name: "agg_content" 48 | references: 49 | - project_name: "catalog" 50 | dataset_name: "jaffle_shop" 51 | table_name: "struct_table" 52 | name: "content.id" 53 | - name: "count_content" 54 | references: 55 | - project_name: "catalog" 56 | dataset_name: "jaffle_shop" 57 | table_name: "struct_table" 58 | name: "content.id" 59 | joins: 60 | - join_type: "LEFT" 61 | left_columns: 62 | - name: "unnested_id" 63 | references: 64 | - project_name: "catalog" 65 | dataset_name: "jaffle_shop" 66 | table_name: "nested_table" 67 | name: "nested_ids.id" 68 | used_for: 69 | - "JOIN_LEFT_TABLE" 70 | right_columns: 71 | - name: "unnested_id" 72 | references: 73 | - project_name: "catalog" 74 | dataset_name: "jaffle_shop" 75 | table_name: "struct_table" 76 | name: "content.id" 77 | used_for: 78 | - "JOIN_RIGHT_TABLE" 79 | aggregations: 80 | - name: "_record_id_" 81 | references: 82 | - project_name: "catalog" 83 | dataset_name: "jaffle_shop" 84 | table_name: "nested_table" 85 | name: "nested_ids.id" 86 | used_for: 87 | - "GROUP_BY" 88 | other_used_columns: 89 | - name: "_count_content_" 90 | references: 91 | - project_name: "catalog" 92 | dataset_name: "jaffle_shop" 93 | table_name: "struct_table" 94 | name: "content.id" 95 | used_for: 96 | - "ORDER_BY" 97 | type: "create" 98 | selected_tables: 99 | - "catalog.jaffle_shop.struct_table" 100 | - "catalog.jaffle_shop.nested_table" 101 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/count.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | with customers as ( 3 | 4 | select * from `catalog.jaffle_shop.stg_customers` 5 | 6 | ), 7 | 8 | orders as ( 9 | 10 | select * from `catalog.jaffle_shop.stg_orders` 11 | 12 | ), 13 | 14 | payments as ( 15 | 16 | select * from `catalog.jaffle_shop.stg_payments` 17 | 18 | ), 19 | 20 | customer_orders as ( 21 | 22 | select 23 | customer_id, 24 | 25 | min(order_date) as first_order, 26 | max(order_date) as most_recent_order, 27 | count(order_id) as number_of_orders 28 | from 
orders 29 | 30 | group by customer_id 31 | 32 | ), 33 | 34 | customer_payments as ( 35 | 36 | select 37 | orders.customer_id, 38 | sum(amount) as total_amount 39 | 40 | from payments 41 | 42 | left join orders on 43 | payments.order_id = orders.order_id 44 | 45 | group by orders.customer_id 46 | 47 | ), 48 | 49 | final as ( 50 | 51 | select 52 | customers.customer_id, 53 | customers.first_name, 54 | customers.last_name, 55 | customer_orders.first_order, 56 | customer_orders.most_recent_order, 57 | customer_orders.number_of_orders, 58 | customer_payments.total_amount as customer_lifetime_value 59 | 60 | from customers 61 | 62 | left join customer_orders 63 | on customers.customer_id = customer_orders.customer_id 64 | 65 | left join customer_payments 66 | on customers.customer_id = customer_payments.customer_id 67 | 68 | ) 69 | 70 | select count(*) from final 71 | 72 | expected_output: 73 | name: "customers" 74 | output_columns: 75 | - name: "$col1" 76 | joins: 77 | - join_type: "LEFT" 78 | left_columns: 79 | - name: "order_id" 80 | references: 81 | - project_name: "catalog" 82 | dataset_name: "jaffle_shop" 83 | table_name: "stg_payments" 84 | name: "order_id" 85 | used_for: 86 | - "JOIN_LEFT_TABLE" 87 | right_columns: 88 | - name: "order_id" 89 | references: 90 | - project_name: "catalog" 91 | dataset_name: "jaffle_shop" 92 | table_name: "stg_orders" 93 | name: "order_id" 94 | used_for: 95 | - "JOIN_RIGHT_TABLE" 96 | - join_type: "LEFT" 97 | left_columns: 98 | - name: "customer_id" 99 | references: 100 | - project_name: "catalog" 101 | dataset_name: "jaffle_shop" 102 | table_name: "stg_customers" 103 | name: "customer_id" 104 | used_for: 105 | - "JOIN_LEFT_TABLE" 106 | right_columns: 107 | - name: "customer_id" 108 | references: 109 | - project_name: "catalog" 110 | dataset_name: "jaffle_shop" 111 | table_name: "stg_orders" 112 | name: "customer_id" 113 | used_for: 114 | - "JOIN_RIGHT_TABLE" 115 | - join_type: "LEFT" 116 | left_columns: 117 | - name: "customer_id" 118 | references: 119 | - project_name: "catalog" 120 | dataset_name: "jaffle_shop" 121 | table_name: "stg_customers" 122 | name: "customer_id" 123 | used_for: 124 | - "JOIN_LEFT_TABLE" 125 | right_columns: 126 | - name: "customer_id" 127 | references: 128 | - project_name: "catalog" 129 | dataset_name: "jaffle_shop" 130 | table_name: "stg_orders" 131 | name: "customer_id" 132 | used_for: 133 | - "JOIN_RIGHT_TABLE" 134 | aggregations: 135 | - name: "_customer_id_" 136 | references: 137 | - project_name: "catalog" 138 | dataset_name: "jaffle_shop" 139 | table_name: "stg_orders" 140 | name: "customer_id" 141 | used_for: 142 | - "GROUP_BY" 143 | type: "select" 144 | selected_tables: 145 | - "catalog.jaffle_shop.stg_orders" 146 | - "catalog.jaffle_shop.stg_customers" 147 | - "catalog.jaffle_shop.stg_payments" 148 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/pivot_where_with_literals.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | with source as ( 3 | 4 | 5 | select * from `catalog.jaffle_shop.stg_payments` 6 | WHERE payment_method in ('card', 'cash', 'bank_transfer', 'other') 7 | AND amount > 100000 8 | 9 | ) 10 | SELECT * FROM 11 | (SELECT payment_method,amount FROM source) 12 | PIVOT(SUM(amount) FOR payment_method IN ('card', 'cash', 'bank_transfer', 'other')) 13 | 14 | expected_output: 15 | name: "pivot" 16 | output_columns: 17 | - name: "card" 18 | references: 19 | - project_name: "catalog" 20 | 
dataset_name: "jaffle_shop" 21 | table_name: "stg_payments" 22 | name: "amount" 23 | - table_name: "_literal_" 24 | name: "_literal_" 25 | literal_value: 26 | - "card" 27 | - project_name: "catalog" 28 | dataset_name: "jaffle_shop" 29 | table_name: "stg_payments" 30 | name: "payment_method" 31 | - name: "cash" 32 | references: 33 | - project_name: "catalog" 34 | dataset_name: "jaffle_shop" 35 | table_name: "stg_payments" 36 | name: "amount" 37 | - table_name: "_literal_" 38 | name: "_literal_" 39 | literal_value: 40 | - "cash" 41 | - project_name: "catalog" 42 | dataset_name: "jaffle_shop" 43 | table_name: "stg_payments" 44 | name: "payment_method" 45 | - name: "bank_transfer" 46 | references: 47 | - project_name: "catalog" 48 | dataset_name: "jaffle_shop" 49 | table_name: "stg_payments" 50 | name: "amount" 51 | - table_name: "_literal_" 52 | name: "_literal_" 53 | literal_value: 54 | - "bank_transfer" 55 | - project_name: "catalog" 56 | dataset_name: "jaffle_shop" 57 | table_name: "stg_payments" 58 | name: "payment_method" 59 | - name: "other" 60 | references: 61 | - project_name: "catalog" 62 | dataset_name: "jaffle_shop" 63 | table_name: "stg_payments" 64 | name: "amount" 65 | - table_name: "_literal_" 66 | name: "_literal_" 67 | literal_value: 68 | - "other" 69 | - project_name: "catalog" 70 | dataset_name: "jaffle_shop" 71 | table_name: "stg_payments" 72 | name: "payment_method" 73 | filters: 74 | - name: "_payment_method_" 75 | references: 76 | - project_name: "catalog" 77 | dataset_name: "jaffle_shop" 78 | table_name: "stg_payments" 79 | name: "payment_method" 80 | used_for: 81 | - "FILTER" 82 | - name: "__literal__" 83 | references: 84 | - table_name: "_literal_" 85 | name: "_literal_" 86 | literal_value: 87 | - "card" 88 | used_for: 89 | - "FILTER" 90 | - name: "__literal__" 91 | references: 92 | - table_name: "_literal_" 93 | name: "_literal_" 94 | literal_value: 95 | - "cash" 96 | used_for: 97 | - "FILTER" 98 | - name: "__literal__" 99 | references: 100 | - table_name: "_literal_" 101 | name: "_literal_" 102 | literal_value: 103 | - "bank_transfer" 104 | used_for: 105 | - "FILTER" 106 | - name: "__literal__" 107 | references: 108 | - table_name: "_literal_" 109 | name: "_literal_" 110 | literal_value: 111 | - "other" 112 | used_for: 113 | - "FILTER" 114 | - name: "_amount_" 115 | references: 116 | - project_name: "catalog" 117 | dataset_name: "jaffle_shop" 118 | table_name: "stg_payments" 119 | name: "amount" 120 | used_for: 121 | - "FILTER" 122 | - name: "__literal__" 123 | references: 124 | - table_name: "_literal_" 125 | name: "_literal_" 126 | literal_value: 127 | - "100000.0" 128 | used_for: 129 | - "FILTER" 130 | type: "select" 131 | selected_tables: 132 | - "catalog.jaffle_shop.stg_payments" 133 | 134 | -------------------------------------------------------------------------------- /src/test/resources/schemas/public_dataset_mbb_teams_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "bigquery#table", 3 | "etag": "TTdR7J49Pxl6qLJjsXyGKA==", 4 | "id": "bigquery-public-data:ncaa_basketball.mbb_teams", 5 | "selfLink": "https://content-bigquery.googleapis.com/bigquery/v2/projects/bigquery-public-data/datasets/ncaa_basketball/tables/mbb_teams", 6 | "tableReference": { 7 | "projectId": "bigquery-public-data", 8 | "datasetId": "ncaa_basketball", 9 | "tableId": "mbb_teams" 10 | }, 11 | "description": "General information about the 351 current men's D1 basketball teams.", 12 | "schema": { 13 | "fields": [ 14 | { 15 | 
"name": "market", 16 | "type": "STRING", 17 | "mode": "NULLABLE" 18 | }, 19 | { 20 | "name": "alias", 21 | "type": "STRING", 22 | "mode": "NULLABLE" 23 | }, 24 | { 25 | "name": "name", 26 | "type": "STRING", 27 | "mode": "NULLABLE" 28 | }, 29 | { 30 | "name": "id", 31 | "type": "STRING", 32 | "mode": "NULLABLE" 33 | }, 34 | { 35 | "name": "code_ncaa", 36 | "type": "INTEGER", 37 | "mode": "NULLABLE" 38 | }, 39 | { 40 | "name": "kaggle_team_id", 41 | "type": "INTEGER", 42 | "mode": "NULLABLE" 43 | }, 44 | { 45 | "name": "school_ncaa", 46 | "type": "STRING", 47 | "mode": "NULLABLE" 48 | }, 49 | { 50 | "name": "turner_name", 51 | "type": "STRING", 52 | "mode": "NULLABLE" 53 | }, 54 | { 55 | "name": "league_name", 56 | "type": "STRING", 57 | "mode": "NULLABLE" 58 | }, 59 | { 60 | "name": "league_alias", 61 | "type": "STRING", 62 | "mode": "NULLABLE" 63 | }, 64 | { 65 | "name": "league_id", 66 | "type": "STRING", 67 | "mode": "NULLABLE" 68 | }, 69 | { 70 | "name": "conf_name", 71 | "type": "STRING", 72 | "mode": "NULLABLE" 73 | }, 74 | { 75 | "name": "conf_alias", 76 | "type": "STRING", 77 | "mode": "NULLABLE" 78 | }, 79 | { 80 | "name": "conf_id", 81 | "type": "STRING", 82 | "mode": "NULLABLE" 83 | }, 84 | { 85 | "name": "division_name", 86 | "type": "STRING", 87 | "mode": "NULLABLE" 88 | }, 89 | { 90 | "name": "division_alias", 91 | "type": "STRING", 92 | "mode": "NULLABLE" 93 | }, 94 | { 95 | "name": "division_id", 96 | "type": "STRING", 97 | "mode": "NULLABLE" 98 | }, 99 | { 100 | "name": "venue_id", 101 | "type": "STRING", 102 | "mode": "NULLABLE" 103 | }, 104 | { 105 | "name": "venue_city", 106 | "type": "STRING", 107 | "mode": "NULLABLE" 108 | }, 109 | { 110 | "name": "venue_state", 111 | "type": "STRING", 112 | "mode": "NULLABLE" 113 | }, 114 | { 115 | "name": "venue_address", 116 | "type": "STRING", 117 | "mode": "NULLABLE" 118 | }, 119 | { 120 | "name": "venue_zip", 121 | "type": "STRING", 122 | "mode": "NULLABLE" 123 | }, 124 | { 125 | "name": "venue_country", 126 | "type": "STRING", 127 | "mode": "NULLABLE" 128 | }, 129 | { 130 | "name": "venue_name", 131 | "type": "STRING", 132 | "mode": "NULLABLE" 133 | }, 134 | { 135 | "name": "venue_capacity", 136 | "type": "INTEGER", 137 | "mode": "NULLABLE" 138 | }, 139 | { 140 | "name": "logo_large", 141 | "type": "STRING", 142 | "mode": "NULLABLE" 143 | }, 144 | { 145 | "name": "logo_medium", 146 | "type": "STRING", 147 | "mode": "NULLABLE" 148 | }, 149 | { 150 | "name": "logo_small", 151 | "type": "STRING", 152 | "mode": "NULLABLE" 153 | } 154 | ] 155 | }, 156 | "numBytes": "230400", 157 | "numLongTermBytes": "230400", 158 | "numRows": "351", 159 | "creationTime": "1520610568869", 160 | "lastModifiedTime": "1527778126304", 161 | "type": "TABLE", 162 | "location": "US" 163 | } 164 | -------------------------------------------------------------------------------- /src/test/resources/schemas/daily_report_table_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "bigquery#table", 3 | "etag": "btl8vEprHYEpyiYwf/WFQw==", 4 | "id": "myproject:reporting.daily_report", 5 | "selfLink": "https://content-bigquery.googleapis.com/bigquery/v2/projects/myproject/datasets/reporting/tables/daily_report", 6 | "tableReference": { 7 | "projectId": "myproject", 8 | "datasetId": "reporting", 9 | "tableId": "daily_report" 10 | }, 11 | "schema": { 12 | "fields": [ 13 | { 14 | "name": "hit_timestamp", 15 | "type": "TIMESTAMP", 16 | "mode": "NULLABLE" 17 | }, 18 | { 19 | "name": "partner_id", 20 | "type": 
"INTEGER", 21 | "mode": "NULLABLE" 22 | }, 23 | { 24 | "name": "partner_name", 25 | "type": "STRING", 26 | "mode": "NULLABLE" 27 | }, 28 | { 29 | "name": "conversion_event", 30 | "type": "STRING", 31 | "mode": "NULLABLE" 32 | }, 33 | { 34 | "name": "conversion_type", 35 | "type": "STRING", 36 | "mode": "NULLABLE" 37 | }, 38 | { 39 | "name": "status_code", 40 | "type": "INTEGER", 41 | "mode": "NULLABLE" 42 | }, 43 | { 44 | "name": "method", 45 | "type": "STRING", 46 | "mode": "NULLABLE" 47 | }, 48 | { 49 | "name": "num_lines", 50 | "type": "INTEGER", 51 | "mode": "NULLABLE" 52 | }, 53 | { 54 | "name": "matched_tag_count", 55 | "type": "INTEGER", 56 | "mode": "NULLABLE" 57 | }, 58 | { 59 | "name": "products", 60 | "type": "RECORD", 61 | "mode": "REPEATED", 62 | "fields": [ 63 | { 64 | "name": "jsonError", 65 | "type": "STRING", 66 | "mode": "NULLABLE" 67 | }, 68 | { 69 | "name": "product", 70 | "type": "RECORD", 71 | "mode": "NULLABLE", 72 | "fields": [ 73 | { 74 | "name": "id", 75 | "type": "STRING", 76 | "mode": "NULLABLE" 77 | }, 78 | { 79 | "name": "name", 80 | "type": "STRING", 81 | "mode": "NULLABLE" 82 | }, 83 | { 84 | "name": "seller", 85 | "type": "STRING", 86 | "mode": "NULLABLE" 87 | }, 88 | { 89 | "name": "quantity", 90 | "type": "STRING", 91 | "mode": "NULLABLE" 92 | }, 93 | { 94 | "name": "value", 95 | "type": "STRING", 96 | "mode": "NULLABLE" 97 | }, 98 | { 99 | "name": "currency", 100 | "type": "STRING", 101 | "mode": "NULLABLE" 102 | } 103 | ] 104 | } 105 | ] 106 | }, 107 | { 108 | "name": "logLines", 109 | "type": "STRING", 110 | "mode": "REPEATED" 111 | }, 112 | { 113 | "name": "url", 114 | "type": "STRING", 115 | "mode": "NULLABLE" 116 | }, 117 | { 118 | "name": "request_id", 119 | "type": "STRING", 120 | "mode": "NULLABLE" 121 | }, 122 | { 123 | "name": "latency", 124 | "type": "FLOAT", 125 | "mode": "NULLABLE" 126 | }, 127 | { 128 | "name": "service_version", 129 | "type": "STRING", 130 | "mode": "NULLABLE" 131 | }, 132 | { 133 | "name": "user_agent", 134 | "type": "STRING", 135 | "mode": "NULLABLE" 136 | }, 137 | { 138 | "name": "hour", 139 | "type": "STRING", 140 | "mode": "NULLABLE" 141 | }, 142 | { 143 | "name": "day_of_week", 144 | "type": "STRING", 145 | "mode": "NULLABLE" 146 | } 147 | ] 148 | }, 149 | "timePartitioning": { 150 | "type": "DAY" 151 | }, 152 | "numBytes": "4243039242539", 153 | "numLongTermBytes": "2756175095381", 154 | "numRows": "4688903566", 155 | "creationTime": "1544756481320", 156 | "lastModifiedTime": "1589501539422", 157 | "type": "TABLE", 158 | "location": "US" 159 | } 160 | -------------------------------------------------------------------------------- /src/test/resources/schemas/lineage_bigquery_table_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "fields": [ 3 | { 4 | "name": "lineageReconcileTime", 5 | "type": "TIMESTAMP", 6 | "mode": "REQUIRED", 7 | "description": "The time lineage was processed in UTC" 8 | }, 9 | { 10 | "name": "item", 11 | "type": "RECORD", 12 | "mode": "REQUIRED", 13 | "description": "The entity changed or added", 14 | "fields": [ 15 | { 16 | "name": "type", 17 | "type": "STRING", 18 | "mode": "NULLABLE", 19 | "description": "The type of entity e.g. BIGQUERY_TABLE" 20 | }, 21 | { 22 | "name": "entity", 23 | "type": "STRING", 24 | "mode": "REQUIRED", 25 | "description": "The BigQuery entity changed or added." 
26 | } 27 | ] 28 | }, 29 | { 30 | "name": "jobInformation", 31 | "type": "RECORD", 32 | "mode": "NULLABLE", 33 | "description": "BigQuery Job information", 34 | "fields": [ 35 | { 36 | "name": "lastUpdated", 37 | "type": "TIMESTAMP", 38 | "mode": "REQUIRED", 39 | "description": "Job completion time in UTC" 40 | }, 41 | { 42 | "name": "actuator", 43 | "type": "STRING", 44 | "mode": "NULLABLE", 45 | "description": "The email address of the authorized executor of Job, It can be a person or a service account." 46 | }, 47 | { 48 | "name": "causalJobId", 49 | "type": "STRING", 50 | "mode": "NULLABLE", 51 | "description": "The BigQuery/Operation Job Id which made the change to the table." 52 | } 53 | ] 54 | }, 55 | { 56 | "name": "tableLineage", 57 | "type": "RECORD", 58 | "mode": "REQUIRED", 59 | "description": "Table level information", 60 | "fields": [ 61 | { 62 | "name": "parents", 63 | "type": "RECORD", 64 | "mode": "REPEATED", 65 | "description": "The tables which were read for generating/updating the source table.", 66 | "fields": [ 67 | { 68 | "name": "type", 69 | "type": "STRING", 70 | "mode": "NULLABLE", 71 | "description": "type of DataEntity e.g CLOUD_STORAGE, BIGQUERY_TABLE" 72 | }, 73 | { 74 | "name": "entity", 75 | "type": "STRING", 76 | "mode": "NULLABLE", 77 | "description": "The BigQuery Job type e.g. QUERY, COPY, IMPORT" 78 | } 79 | ] 80 | }, 81 | { 82 | "name": "operation", 83 | "type": "STRING", 84 | "mode": "NULLABLE", 85 | "description": "The BigQuery Job type e.g. QUERY, COPY, IMPORT" 86 | } 87 | ] 88 | }, 89 | { 90 | "name": "columnLineage", 91 | "type": "RECORD", 92 | "mode": "REPEATED", 93 | "description": "Column Level lineage if its a QUERY type operation", 94 | "fields": [ 95 | { 96 | "name": "name", 97 | "type": "STRING", 98 | "mode": "NULLABLE" 99 | }, 100 | { 101 | "name": "operation", 102 | "type": "STRING", 103 | "mode": "REPEATED", 104 | "description": "List of Functions/Operations done on the source columns" 105 | }, 106 | { 107 | "name": "parents", 108 | "type": "RECORD", 109 | "mode": "REPEATED", 110 | "description": "The source columns used to compute this column", 111 | "fields": [ 112 | { 113 | "name": "table", 114 | "type": "RECORD", 115 | "mode": "NULLABLE", 116 | "fields": [ 117 | { 118 | "name": "type", 119 | "type": "STRING", 120 | "mode": "NULLABLE", 121 | "description": "type of DataEntity e.g CLOUD_STORAGE, BIGQUERY_TABLE" 122 | }, 123 | { 124 | "name": "entity", 125 | "type": "STRING", 126 | "mode": "NULLABLE", 127 | "description": "The BigQuery Job type e.g. 
QUERY, COPY, IMPORT" 128 | } 129 | ] 130 | }, 131 | { 132 | "name": "column", 133 | "type": "STRING", 134 | "mode": "NULLABLE" 135 | } 136 | ] 137 | } 138 | ] 139 | } 140 | ] 141 | } -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/looker_subquery.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | 3 | SELECT * 4 | FROM (SELECT clmn1_, SUM(clmn2_) AS clmn100000_ 5 | FROM (SELECT * 6 | FROM (SELECT t0.cost AS clmn2_, t0.date AS clmn0_, t0.anothernesting AS clmn1_ 7 | FROM (WITH base 8 | AS (SELECT DATE_TRUNC(DATE(anotherdate), DAY) AS date, rescordenesed.anothernesting, ARRAY_TO_STRING(ARRAY( 9 | SELECT 10 | value 11 | FROM 12 | UNNEST(labels) 13 | WHERE 14 | key = "goog-resource-type" 15 | ), " ") AS resource_type, first_name, SUM (cost) AS cost 16 | FROM 17 | `catalog.jaffle_shop.nested_table` 18 | WHERE 19 | DATE (_PARTITIONTIME) 20 | > DATE_SUB(`CURRENT_DATE`() 21 | , INTERVAL 62 day) 22 | AND acolumn.id IN ("blabla" 23 | , "bloblob") 24 | GROUP BY 1, 2, 3, 4) SELECT 25 | date, 26 | anothernesting, 27 | SUM(cost) AS cost 28 | FROM 29 | base 30 | GROUP BY 1, 2) t0) 31 | WHERE ((clmn0_ >= DATE "2023-12-13") AND (clmn0_ <= DATE "2024-01-11"))) 32 | GROUP BY clmn1_ ) LIMIT 20000000 33 | 34 | expected_output: 35 | name: "messy_struct" 36 | output_columns: 37 | - name: "clmn1_" 38 | references: 39 | - project_name: "catalog" 40 | dataset_name: "jaffle_shop" 41 | table_name: "nested_table" 42 | name: "rescordenesed.anothernesting" 43 | - name: "clmn100000_" 44 | references: 45 | - project_name: "catalog" 46 | dataset_name: "jaffle_shop" 47 | table_name: "nested_table" 48 | name: "cost" 49 | filters: 50 | - name: "_date_" 51 | references: 52 | - project_name: "catalog" 53 | dataset_name: "jaffle_shop" 54 | table_name: "nested_table" 55 | name: "anotherdate" 56 | used_for: 57 | - "FILTER" 58 | - "GROUP_BY" 59 | - name: "__PARTITIONTIME_" 60 | references: 61 | - project_name: "catalog" 62 | dataset_name: "jaffle_shop" 63 | table_name: "nested_table" 64 | name: "_PARTITIONTIME" 65 | used_for: 66 | - "FILTER" 67 | - name: "_acolumn.id_" 68 | references: 69 | - project_name: "catalog" 70 | dataset_name: "jaffle_shop" 71 | table_name: "nested_table" 72 | name: "acolumn.id" 73 | used_for: 74 | - "FILTER" 75 | aggregations: 76 | - name: "_clmn1__" 77 | references: 78 | - project_name: "catalog" 79 | dataset_name: "jaffle_shop" 80 | table_name: "nested_table" 81 | name: "rescordenesed.anothernesting" 82 | used_for: 83 | - "GROUP_BY" 84 | - name: "_date_" 85 | references: 86 | - project_name: "catalog" 87 | dataset_name: "jaffle_shop" 88 | table_name: "nested_table" 89 | name: "anotherdate" 90 | used_for: 91 | - "FILTER" 92 | - "GROUP_BY" 93 | - name: "_date_" 94 | references: 95 | - project_name: "catalog" 96 | dataset_name: "jaffle_shop" 97 | table_name: "nested_table" 98 | name: "anotherdate" 99 | used_for: 100 | - "GROUP_BY" 101 | - name: "_anothernesting_" 102 | references: 103 | - project_name: "catalog" 104 | dataset_name: "jaffle_shop" 105 | table_name: "nested_table" 106 | name: "rescordenesed.anothernesting" 107 | used_for: 108 | - "GROUP_BY" 109 | - name: "_resource_type_" 110 | references: 111 | - project_name: "catalog" 112 | dataset_name: "jaffle_shop" 113 | table_name: "nested_table" 114 | name: "labels.value" 115 | used_for: 116 | - "GROUP_BY" 117 | - name: "_first_name_" 118 | references: 119 | - project_name: "catalog" 120 | dataset_name: "jaffle_shop" 121 | table_name: 
"nested_table" 122 | name: "first_name" 123 | used_for: 124 | - "GROUP_BY" 125 | type: "select" 126 | selected_tables: 127 | - "catalog.jaffle_shop.nested_table" 128 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/subquery_unnest.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | SELECT 3 | acolumn.id as project_id, 4 | rescordenesed.anothernesting AS job_id, 5 | rescordenesed.anothernesting2 as full_asset_name, 6 | ARRAY_TO_STRING(ARRAY(SELECT CONCAT(key,':',value) FROM UNNEST(labels)), ",") as labels, 7 | SUM(cost) + COALESCE(SUM(( SELECT SUM(c.amount) FROM UNNEST(nested_ids) AS c)), 0.1) AS cost 8 | FROM 9 | `catalog.jaffle_shop.nested_table` 10 | WHERE 11 | TIMESTAMP_TRUNC(_PARTITIONTIME, DAY) >= TIMESTAMP("2024-01-01") 12 | AND rescordenesed.anothernesting23 = 'this is a filter string' 13 | 14 | GROUP BY 15 | 1, 16 | 2, 17 | 3, 18 | 4 19 | ORDER BY 5 DESC 20 | 21 | expected_output: 22 | name: "subquery_unnest" 23 | output_columns: 24 | - name: "project_id" 25 | references: 26 | - project_name: "catalog" 27 | dataset_name: "jaffle_shop" 28 | table_name: "nested_table" 29 | name: "acolumn.id" 30 | - name: "job_id" 31 | references: 32 | - project_name: "catalog" 33 | dataset_name: "jaffle_shop" 34 | table_name: "nested_table" 35 | name: "rescordenesed.anothernesting" 36 | - name: "full_asset_name" 37 | references: 38 | - project_name: "catalog" 39 | dataset_name: "jaffle_shop" 40 | table_name: "nested_table" 41 | name: "rescordenesed.anothernesting2" 42 | - name: "labels" 43 | references: 44 | - project_name: "catalog" 45 | dataset_name: "jaffle_shop" 46 | table_name: "nested_table" 47 | name: "labels.key" 48 | - project_name: "catalog" 49 | dataset_name: "jaffle_shop" 50 | table_name: "nested_table" 51 | name: "labels.value" 52 | - name: "cost" 53 | references: 54 | - project_name: "catalog" 55 | dataset_name: "jaffle_shop" 56 | table_name: "nested_table" 57 | name: "cost" 58 | - project_name: "catalog" 59 | dataset_name: "jaffle_shop" 60 | table_name: "nested_table" 61 | name: "nested_ids.amount" 62 | filters: 63 | - name: "__PARTITIONTIME_" 64 | references: 65 | - project_name: "catalog" 66 | dataset_name: "jaffle_shop" 67 | table_name: "nested_table" 68 | name: "_PARTITIONTIME" 69 | used_for: 70 | - "FILTER" 71 | - name: "_rescordenesed.anothernesting23_" 72 | references: 73 | - project_name: "catalog" 74 | dataset_name: "jaffle_shop" 75 | table_name: "nested_table" 76 | name: "rescordenesed.anothernesting23" 77 | used_for: 78 | - "FILTER" 79 | aggregations: 80 | - name: "_project_id_" 81 | references: 82 | - project_name: "catalog" 83 | dataset_name: "jaffle_shop" 84 | table_name: "nested_table" 85 | name: "acolumn.id" 86 | used_for: 87 | - "GROUP_BY" 88 | - name: "_job_id_" 89 | references: 90 | - project_name: "catalog" 91 | dataset_name: "jaffle_shop" 92 | table_name: "nested_table" 93 | name: "rescordenesed.anothernesting" 94 | used_for: 95 | - "GROUP_BY" 96 | - name: "_full_asset_name_" 97 | references: 98 | - project_name: "catalog" 99 | dataset_name: "jaffle_shop" 100 | table_name: "nested_table" 101 | name: "rescordenesed.anothernesting2" 102 | used_for: 103 | - "GROUP_BY" 104 | - name: "_labels_" 105 | references: 106 | - project_name: "catalog" 107 | dataset_name: "jaffle_shop" 108 | table_name: "nested_table" 109 | name: "labels.key" 110 | used_for: 111 | - "GROUP_BY" 112 | - project_name: "catalog" 113 | dataset_name: "jaffle_shop" 114 | table_name: 
"nested_table" 115 | name: "labels.value" 116 | used_for: 117 | - "GROUP_BY" 118 | other_used_columns: 119 | - name: "_cost_" 120 | references: 121 | - project_name: "catalog" 122 | dataset_name: "jaffle_shop" 123 | table_name: "nested_table" 124 | name: "cost" 125 | used_for: 126 | - "ORDER_BY" 127 | - project_name: "catalog" 128 | dataset_name: "jaffle_shop" 129 | table_name: "nested_table" 130 | name: "nested_ids.amount" 131 | used_for: 132 | - "ORDER_BY" 133 | type: "select" 134 | selected_tables: 135 | - "catalog.jaffle_shop.nested_table" 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /src/main/java/com/borjav/data/service/BigQueryTableLoadService.java: -------------------------------------------------------------------------------- 1 | 2 | package com.borjav.data.service; 3 | 4 | import com.borjav.data.exception.BigQueryOperationException; 5 | import com.borjav.data.extractor.BigQueryTableCreator; 6 | import com.borjav.data.model.BigQueryTableEntity; 7 | import com.google.api.services.bigquery.model.Table; 8 | import com.google.common.annotations.VisibleForTesting; 9 | import com.google.common.cache.Cache; 10 | import com.google.common.cache.CacheBuilder; 11 | import com.google.common.collect.ImmutableMap; 12 | import com.google.common.collect.ImmutableSet; 13 | import com.google.common.collect.Sets; 14 | import com.google.common.flogger.GoogleLogger; 15 | 16 | import java.io.IOException; 17 | import java.io.Serializable; 18 | import java.time.Duration; 19 | import java.util.Arrays; 20 | import java.util.concurrent.ExecutionException; 21 | import java.util.concurrent.TimeUnit; 22 | 23 | import static com.google.common.collect.ImmutableSet.toImmutableSet; 24 | 25 | /** 26 | * Provides loading Table information using BigQuery API. It {@link Cache}s the reads from the API 27 | * for 5 minutes. 28 | */ 29 | public final class BigQueryTableLoadService implements Serializable { 30 | 31 | private static final GoogleLogger logger = GoogleLogger.forEnclosingClass(); 32 | 33 | private final BigQueryServiceFactory bqServiceFactory; 34 | private static final Cache LOCAL_CACHE = buildCache(); 35 | 36 | public BigQueryTableLoadService( 37 | BigQueryServiceFactory bqServiceFactory) { 38 | this.bqServiceFactory = bqServiceFactory; 39 | } 40 | 41 | public static BigQueryTableLoadService usingServiceFactory( 42 | BigQueryServiceFactory bqServiceFactory) { 43 | return new BigQueryTableLoadService(bqServiceFactory); 44 | } 45 | 46 | private static Cache buildCache() { 47 | return CacheBuilder.newBuilder() 48 | .expireAfterWrite(Duration.ofMinutes(20)) 49 | .maximumSize(10000L) 50 | .build(); 51 | } 52 | 53 | /** 54 | * Loads a single table information. 55 | * 56 | * @param tableName the fully qualified table name. 57 | */ 58 | public Table loadTable(String tableName) { 59 | return loadTable(BigQueryTableCreator.usingBestEffort(tableName)); 60 | } 61 | 62 | /** 63 | * Loads a single table information. 64 | * 65 | * @param table the fully qualified table name. 
66 | */ 67 | public Table loadTable(BigQueryTableEntity table) { 68 | try { 69 | // check the cache first, loading from the server on a miss 70 | return LOCAL_CACHE.get(table, () -> loadSingleTableFromServer(table)); 71 | } catch (ExecutionException executionException) { 72 | logger.atWarning() 73 | .withCause(executionException) 74 | .atMostEvery(1, TimeUnit.MINUTES) 75 | .log(String.format("Unable to load table %s", table)); 76 | 77 | throw new BigQueryOperationException(table, executionException); 78 | } 79 | } 80 | 81 | /** 82 | * Loads multiple tables' information. 83 | * 84 | * @param tableNames multiple fully qualified table names. 85 | */ 86 | public ImmutableSet<Table> loadTables(String... tableNames) { 87 | return loadTables( 88 | Arrays.stream(tableNames) 89 | .distinct() 90 | .map(BigQueryTableCreator::usingBestEffort) 91 | .collect(toImmutableSet())); 92 | } 93 | 94 | /** 95 | * Loads multiple tables from BigQuery, with cache-through. 96 | * It does NOT use the Batch API. 97 | * 98 | * @param tables the list of table names to load 99 | * @return all the loaded tables. May throw a runtime exception if a table is not found. 100 | */ 101 | public ImmutableSet<Table>
loadTables(ImmutableSet tables) { 102 | ImmutableMap allCachedData = 103 | LOCAL_CACHE.getAllPresent(tables); 104 | ImmutableSet tablesToFetch 105 | = Sets.difference(tables, allCachedData.keySet()).immutableCopy(); 106 | 107 | return Sets.union(ImmutableSet.copyOf(allCachedData.values()), 108 | bulkFetchFromServer(tablesToFetch)).immutableCopy(); 109 | } 110 | 111 | private Table loadSingleTableFromServer(BigQueryTableEntity tableSpec) throws IOException { 112 | return bqServiceFactory.buildService() 113 | .tables() 114 | .get(tableSpec.getProjectId(), tableSpec.getDataset(), tableSpec.getTable()) 115 | .execute(); 116 | } 117 | 118 | private ImmutableSet
bulkFetchFromServer(ImmutableSet tablesToFetch) { 119 | return tablesToFetch.stream() 120 | .map(this::loadTable) 121 | .collect(toImmutableSet()); 122 | } 123 | 124 | @VisibleForTesting 125 | static void clearLocalCache() { 126 | LOCAL_CACHE.invalidateAll(); 127 | LOCAL_CACHE.cleanUp(); 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/customers_groupby.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | with customers as ( 3 | 4 | select * from `catalog.jaffle_shop.stg_customers` 5 | 6 | ), 7 | 8 | orders as ( 9 | 10 | select * from `catalog.jaffle_shop.stg_orders` 11 | 12 | ), 13 | 14 | payments as ( 15 | 16 | select * from `catalog.jaffle_shop.stg_payments` 17 | 18 | ), 19 | 20 | customer_orders as ( 21 | 22 | select 23 | customer_id, 24 | 25 | min(order_date) as first_order, 26 | max(order_date) as most_recent_order, 27 | count(order_id) as number_of_orders 28 | from orders 29 | 30 | group by customer_id 31 | 32 | ), 33 | 34 | customer_payments as ( 35 | 36 | select 37 | orders.customer_id, 38 | sum(amount) as total_amount 39 | 40 | from payments 41 | 42 | left join orders on 43 | payments.order_id = orders.order_id 44 | 45 | group by orders.customer_id 46 | 47 | ), 48 | 49 | final as ( 50 | 51 | select 52 | customers.customer_id, 53 | customers.first_name, 54 | customers.last_name, 55 | customer_orders.first_order, 56 | customer_orders.most_recent_order, 57 | customer_orders.number_of_orders, 58 | customer_payments.total_amount as customer_lifetime_value 59 | 60 | from customers 61 | 62 | left join customer_orders 63 | on customers.customer_id = customer_orders.customer_id 64 | 65 | left join customer_payments 66 | on customers.customer_id = customer_payments.customer_id 67 | 68 | ), groupby AS( 69 | select customer_id, 70 | SUM(customer_lifetime_value) AS total_customer_lifetime_value 71 | FROM final 72 | GROUP BY 1 73 | ) 74 | 75 | select customer_id,total_customer_lifetime_value 76 | from groupby 77 | ORDER BY 1 78 | 79 | expected_output: 80 | name: "customers" 81 | output_columns: 82 | - name: "customer_id" 83 | references: 84 | - project_name: "catalog" 85 | dataset_name: "jaffle_shop" 86 | table_name: "stg_customers" 87 | name: "customer_id" 88 | - name: "total_customer_lifetime_value" 89 | references: 90 | - project_name: "catalog" 91 | dataset_name: "jaffle_shop" 92 | table_name: "stg_payments" 93 | name: "amount" 94 | joins: 95 | - join_type: "LEFT" 96 | left_columns: 97 | - name: "order_id" 98 | references: 99 | - project_name: "catalog" 100 | dataset_name: "jaffle_shop" 101 | table_name: "stg_payments" 102 | name: "order_id" 103 | used_for: 104 | - "JOIN_LEFT_TABLE" 105 | right_columns: 106 | - name: "order_id" 107 | references: 108 | - project_name: "catalog" 109 | dataset_name: "jaffle_shop" 110 | table_name: "stg_orders" 111 | name: "order_id" 112 | used_for: 113 | - "JOIN_RIGHT_TABLE" 114 | - join_type: "LEFT" 115 | left_columns: 116 | - name: "customer_id" 117 | references: 118 | - project_name: "catalog" 119 | dataset_name: "jaffle_shop" 120 | table_name: "stg_customers" 121 | name: "customer_id" 122 | used_for: 123 | - "JOIN_LEFT_TABLE" 124 | right_columns: 125 | - name: "customer_id" 126 | references: 127 | - project_name: "catalog" 128 | dataset_name: "jaffle_shop" 129 | table_name: "stg_orders" 130 | name: "customer_id" 131 | used_for: 132 | - "JOIN_RIGHT_TABLE" 133 | - join_type: "LEFT" 134 | left_columns: 135 | - name: 
"customer_id" 136 | references: 137 | - project_name: "catalog" 138 | dataset_name: "jaffle_shop" 139 | table_name: "stg_customers" 140 | name: "customer_id" 141 | used_for: 142 | - "JOIN_LEFT_TABLE" 143 | right_columns: 144 | - name: "customer_id" 145 | references: 146 | - project_name: "catalog" 147 | dataset_name: "jaffle_shop" 148 | table_name: "stg_orders" 149 | name: "customer_id" 150 | used_for: 151 | - "JOIN_RIGHT_TABLE" 152 | aggregations: 153 | - name: "_customer_id_" 154 | references: 155 | - project_name: "catalog" 156 | dataset_name: "jaffle_shop" 157 | table_name: "stg_orders" 158 | name: "customer_id" 159 | used_for: 160 | - "GROUP_BY" 161 | - name: "_customer_id_" 162 | references: 163 | - project_name: "catalog" 164 | dataset_name: "jaffle_shop" 165 | table_name: "stg_customers" 166 | name: "customer_id" 167 | used_for: 168 | - "GROUP_BY" 169 | other_used_columns: 170 | - name: "_customer_id_" 171 | references: 172 | - project_name: "catalog" 173 | dataset_name: "jaffle_shop" 174 | table_name: "stg_customers" 175 | name: "customer_id" 176 | used_for: 177 | - "ORDER_BY" 178 | type: "select" 179 | selected_tables: 180 | - "catalog.jaffle_shop.stg_orders" 181 | - "catalog.jaffle_shop.stg_customers" 182 | - "catalog.jaffle_shop.stg_payments" 183 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/customers.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | with customers as ( 3 | 4 | select * from `catalog.jaffle_shop.stg_customers` 5 | 6 | ), 7 | 8 | orders as ( 9 | 10 | select * from `catalog.jaffle_shop.stg_orders` 11 | 12 | ), 13 | 14 | payments as ( 15 | 16 | select * from `catalog.jaffle_shop.stg_payments` 17 | 18 | ), 19 | 20 | customer_orders as ( 21 | 22 | select 23 | customer_id, 24 | 25 | min(order_date) as first_order, 26 | max(order_date) as most_recent_order, 27 | count(order_id) as number_of_orders 28 | from orders 29 | 30 | group by customer_id 31 | 32 | ), 33 | 34 | customer_payments as ( 35 | 36 | select 37 | orders.customer_id, 38 | sum(amount) as total_amount 39 | 40 | from payments 41 | 42 | left join orders on 43 | payments.order_id = orders.order_id 44 | 45 | group by orders.customer_id 46 | 47 | ), 48 | 49 | final as ( 50 | 51 | select 52 | customers.customer_id, 53 | customers.first_name, 54 | customers.last_name, 55 | customer_orders.first_order, 56 | customer_orders.most_recent_order, 57 | customer_orders.number_of_orders, 58 | customer_payments.total_amount as customer_lifetime_value 59 | 60 | from customers 61 | 62 | left join customer_orders 63 | on customers.customer_id = customer_orders.customer_id 64 | 65 | left join customer_payments 66 | on customers.customer_id = customer_payments.customer_id 67 | 68 | ) 69 | 70 | select * from final 71 | 72 | expected_output: 73 | name: "customers" 74 | output_columns: 75 | - name: "customer_id" 76 | references: 77 | - project_name: "catalog" 78 | dataset_name: "jaffle_shop" 79 | table_name: "stg_customers" 80 | name: "customer_id" 81 | - name: "first_name" 82 | references: 83 | - project_name: "catalog" 84 | dataset_name: "jaffle_shop" 85 | table_name: "stg_customers" 86 | name: "first_name" 87 | - name: "last_name" 88 | references: 89 | - project_name: "catalog" 90 | dataset_name: "jaffle_shop" 91 | table_name: "stg_customers" 92 | name: "last_name" 93 | - name: "first_order" 94 | references: 95 | - project_name: "catalog" 96 | dataset_name: "jaffle_shop" 97 | table_name: "stg_orders" 98 | 
name: "order_date" 99 | - name: "most_recent_order" 100 | references: 101 | - project_name: "catalog" 102 | dataset_name: "jaffle_shop" 103 | table_name: "stg_orders" 104 | name: "order_date" 105 | - name: "number_of_orders" 106 | references: 107 | - project_name: "catalog" 108 | dataset_name: "jaffle_shop" 109 | table_name: "stg_orders" 110 | name: "order_id" 111 | - name: "customer_lifetime_value" 112 | references: 113 | - project_name: "catalog" 114 | dataset_name: "jaffle_shop" 115 | table_name: "stg_payments" 116 | name: "amount" 117 | joins: 118 | - join_type: "LEFT" 119 | left_columns: 120 | - name: "order_id" 121 | references: 122 | - project_name: "catalog" 123 | dataset_name: "jaffle_shop" 124 | table_name: "stg_payments" 125 | name: "order_id" 126 | used_for: 127 | - "JOIN_LEFT_TABLE" 128 | right_columns: 129 | - name: "order_id" 130 | references: 131 | - project_name: "catalog" 132 | dataset_name: "jaffle_shop" 133 | table_name: "stg_orders" 134 | name: "order_id" 135 | used_for: 136 | - "JOIN_RIGHT_TABLE" 137 | - join_type: "LEFT" 138 | left_columns: 139 | - name: "customer_id" 140 | references: 141 | - project_name: "catalog" 142 | dataset_name: "jaffle_shop" 143 | table_name: "stg_customers" 144 | name: "customer_id" 145 | used_for: 146 | - "JOIN_LEFT_TABLE" 147 | right_columns: 148 | - name: "customer_id" 149 | references: 150 | - project_name: "catalog" 151 | dataset_name: "jaffle_shop" 152 | table_name: "stg_orders" 153 | name: "customer_id" 154 | used_for: 155 | - "JOIN_RIGHT_TABLE" 156 | - join_type: "LEFT" 157 | left_columns: 158 | - name: "customer_id" 159 | references: 160 | - project_name: "catalog" 161 | dataset_name: "jaffle_shop" 162 | table_name: "stg_customers" 163 | name: "customer_id" 164 | used_for: 165 | - "JOIN_LEFT_TABLE" 166 | right_columns: 167 | - name: "customer_id" 168 | references: 169 | - project_name: "catalog" 170 | dataset_name: "jaffle_shop" 171 | table_name: "stg_orders" 172 | name: "customer_id" 173 | used_for: 174 | - "JOIN_RIGHT_TABLE" 175 | aggregations: 176 | - name: "_customer_id_" 177 | references: 178 | - project_name: "catalog" 179 | dataset_name: "jaffle_shop" 180 | table_name: "stg_orders" 181 | name: "customer_id" 182 | used_for: 183 | - "GROUP_BY" 184 | type: "select" 185 | selected_tables: 186 | - "catalog.jaffle_shop.stg_orders" 187 | - "catalog.jaffle_shop.stg_customers" 188 | - "catalog.jaffle_shop.stg_payments" 189 | -------------------------------------------------------------------------------- /src/main/java/com/borjav/data/output/OutputModel.java: -------------------------------------------------------------------------------- 1 | package com.borjav.data.output; 2 | 3 | import com.borjav.data.model.ResolvedColumnExtended; 4 | import com.fasterxml.jackson.annotation.JsonInclude; 5 | import com.fasterxml.jackson.annotation.JsonProperty; 6 | import com.google.zetasql.resolvedast.ResolvedJoinScanEnums; 7 | import java.util.ArrayList; 8 | import java.util.HashSet; 9 | import java.util.LinkedHashSet; 10 | import java.util.List; 11 | import java.util.Objects; 12 | 13 | public class OutputModel { 14 | 15 | public static class Model { 16 | 17 | @JsonInclude(JsonInclude.Include.NON_NULL) 18 | @JsonProperty("name") 19 | public String name; 20 | @JsonInclude(JsonInclude.Include.NON_NULL) 21 | @JsonProperty("output_columns") 22 | public List output_columns = new ArrayList<>(); 23 | @JsonInclude(JsonInclude.Include.NON_NULL) 24 | @JsonProperty("joins") 25 | public LinkedHashSet joins; 26 | @JsonInclude(JsonInclude.Include.NON_NULL) 27 
| @JsonProperty("filters") 28 | public LinkedHashSet filters; 29 | @JsonInclude(JsonInclude.Include.NON_NULL) 30 | @JsonProperty("aggregations") 31 | public LinkedHashSet aggregations; 32 | @JsonInclude(JsonInclude.Include.NON_NULL) 33 | @JsonProperty("other_used_columns") 34 | public LinkedHashSet other_used_columns; 35 | @JsonInclude(JsonInclude.Include.NON_NULL) 36 | @JsonProperty("tables") 37 | public List
tables; 38 | @JsonInclude(JsonInclude.Include.NON_NULL) 39 | @JsonProperty("type") 40 | public String type; 41 | @JsonInclude(JsonInclude.Include.NON_NULL) 42 | @JsonProperty("selected_tables") 43 | public HashSet selected_tables = new HashSet<>(); 44 | @JsonInclude(JsonInclude.Include.NON_NULL) 45 | @JsonProperty("error") 46 | public String error; 47 | 48 | 49 | } 50 | 51 | public static class Join { 52 | @JsonProperty("join_type") 53 | public String join_type; 54 | @JsonProperty("left_columns") 55 | public List left_columns = new ArrayList<>(); 56 | @JsonProperty("right_columns") 57 | public List right_columns = new ArrayList<>(); 58 | 59 | } 60 | 61 | 62 | public static class Table { 63 | 64 | @JsonProperty("name") 65 | public String name; 66 | @JsonProperty("columns") 67 | public HashSet columns = new HashSet<>(); 68 | @JsonInclude(JsonInclude.Include.NON_NULL) 69 | @JsonProperty("other_used_columns") 70 | public HashSet other_used_columns; 71 | 72 | } 73 | 74 | public static class OutputColumn { 75 | 76 | @JsonInclude(JsonInclude.Include.NON_NULL) 77 | @JsonProperty("name") 78 | public String name; 79 | @JsonInclude(JsonInclude.Include.NON_NULL) 80 | @JsonProperty("references") 81 | public HashSet references; 82 | 83 | @Override 84 | public boolean equals(Object o) { 85 | if (this == o) { 86 | return true; 87 | } 88 | if (o == null || getClass() != o.getClass()) { 89 | return false; 90 | } 91 | OutputColumn that = (OutputColumn) o; 92 | return Objects.equals(name, that.name) && Objects.equals(references, that.references); 93 | } 94 | 95 | @Override 96 | public int hashCode() { 97 | return Objects.hash(name, references); 98 | } 99 | } 100 | 101 | public static class Column { 102 | 103 | @JsonInclude(JsonInclude.Include.NON_NULL) 104 | @JsonProperty("project_name") 105 | public String project_name; 106 | @JsonInclude(JsonInclude.Include.NON_NULL) 107 | @JsonProperty("dataset_name") 108 | public String dataset_name; 109 | @JsonInclude(JsonInclude.Include.NON_NULL) 110 | @JsonProperty("table_name") 111 | public String table_name; 112 | @JsonInclude(JsonInclude.Include.NON_NULL) 113 | @JsonProperty("name") 114 | public String name; 115 | @JsonInclude(JsonInclude.Include.NON_NULL) 116 | @JsonProperty("references") 117 | public HashSet references; 118 | @JsonInclude(JsonInclude.Include.NON_NULL) 119 | @JsonProperty("literal_value") 120 | public List literal_value; 121 | @JsonInclude(JsonInclude.Include.NON_NULL) 122 | @JsonProperty("used_for") 123 | public HashSet used_for; 124 | @JsonInclude(JsonInclude.Include.NON_NULL) 125 | @JsonProperty("join_type") 126 | public ResolvedJoinScanEnums.JoinType join_type; 127 | 128 | 129 | public void setNameSplit(String table_name, String name) { 130 | String[] split = table_name.split("\\."); 131 | 132 | if (split.length == 3) { 133 | this.project_name = split[split.length - 3]; 134 | this.dataset_name = split[split.length - 2]; 135 | } 136 | this.table_name = split[split.length - 1]; 137 | this.name = name; 138 | } 139 | 140 | @Override 141 | public boolean equals(Object o) { 142 | if (this == o) { 143 | return true; 144 | } 145 | if (o == null || getClass() != o.getClass()) { 146 | return false; 147 | } 148 | Column column = (Column) o; 149 | return Objects.equals(table_name, column.table_name) && Objects.equals(name, column.name) 150 | && Objects.equals(dataset_name, column.dataset_name) 151 | && Objects.equals(references, column.references) && Objects.equals(literal_value, 152 | column.literal_value) && Objects.equals(used_for, 153 | 
column.used_for) && Objects.equals(join_type, 154 | column.join_type); 155 | } 156 | 157 | @Override 158 | public int hashCode() { 159 | return Objects.hash(table_name, name, references); 160 | } 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /src/main/java/com/borjav/data/extractor/BigQueryTableCreator.java: -------------------------------------------------------------------------------- 1 | 2 | 3 | package com.borjav.data.extractor; 4 | 5 | import com.borjav.data.model.BigQueryTableEntity; 6 | import com.google.common.collect.ImmutableList; 7 | import com.google.common.flogger.GoogleLogger; 8 | import com.borjav.data.options.Options; 9 | import java.util.concurrent.TimeUnit; 10 | import java.util.regex.Matcher; 11 | import java.util.regex.Pattern; 12 | 13 | /** 14 | * A factory that creates a {@link BigQueryTableEntity} by parsing different naming formats. 15 | */ 16 | public abstract class BigQueryTableCreator { 17 | 18 | private static final GoogleLogger logger = GoogleLogger.forEnclosingClass(); 19 | 20 | /** 21 | * Matches the given String as a legacy or standard SQL table name, trying each known format in turn. 22 | * 23 | * @param bigQueryTableName the table name in legacy or standard SQL format. 24 | * @return table information parsed from the first matching format. 25 | * @throws IllegalArgumentException if the name does not match any known format. 26 | */ 27 | public static BigQueryTableEntity usingBestEffort(String bigQueryTableName) { 28 | if (bigQueryTableName != null && bigQueryTableName.startsWith("$")) { 29 | return BigQueryTableEntity.create(null, null, bigQueryTableName); 30 | } 31 | 32 | for (String pattern : ImmutableList 33 | .of(BQ_LEGACY_STANDARD_TABLE_NAME_FORMAT, BQ_RESOURCE_FORMAT, BQ_LINKED_RESOURCE_FORMAT, 34 | BQ_LEGACY_STANDARD_TABLE_NAME_FORMAT_NOT_FULLY_QUALIFIED)) { 35 | try { 36 | return extractInformation(pattern, bigQueryTableName); 37 | } catch (IllegalArgumentException aex) { 38 | logger.atInfo().atMostEvery(1, TimeUnit.MINUTES).withCause(aex) 39 | .log("error parsing %s", bigQueryTableName); 40 | } 41 | } 42 | 43 | throw new IllegalArgumentException( 44 | "Couldn't convert into any known types: (" + bigQueryTableName + ")"); 45 | } 46 | 47 | /** 48 | * Returns a parsed TableEntity from the legacy SQL form ({@code project:dataset.table}) of 49 | * a BigQuery table.
50 | */ 51 | public static BigQueryTableEntity fromLegacyTableName(String legacyName) { 52 | return extractInformation(LEGACY_TABLE_FORMAT, legacyName); 53 | } 54 | 55 | public static BigQueryTableEntity fromSqlResource(String sqlResource) { 56 | return extractInformation(SQL_RESOURCE_FORMAT, sqlResource); 57 | } 58 | 59 | public static BigQueryTableEntity fromBigQueryResource(String resource) { 60 | return extractInformation(BQ_RESOURCE_FORMAT, resource); 61 | } 62 | 63 | public static BigQueryTableEntity fromLinkedResource(String linkedResource) { 64 | return extractInformation(BQ_LINKED_RESOURCE_FORMAT, linkedResource); 65 | } 66 | 67 | private static final String PROJECT_ID_TAG = "projectId"; 68 | 69 | private static final String DATASET_ID_TAG = "dataset"; 70 | 71 | private static final String TABLE_ID_TAG = "table"; 72 | 73 | private static final String PROJECT_PATTERN = "[a-zA-Z0-9\\.\\-\\:]+"; 74 | 75 | private static final String DATASET_PATTERN = "[a-zA-Z_][a-zA-Z0-9\\_]+"; 76 | 77 | private static final String TABLE_PATTERN = "[a-zA-Z0-9][a-zA-Z0-9\\_\\*]+"; 78 | 79 | private static final String LEGACY_TABLE_FORMAT = 80 | String.format( 81 | "^(?<%s>%s)\\:(?<%s>%s)\\.(?<%s>%s)$", 82 | PROJECT_ID_TAG, PROJECT_PATTERN, DATASET_ID_TAG, DATASET_PATTERN, TABLE_ID_TAG, 83 | TABLE_PATTERN); 84 | 85 | private static final String SQL_RESOURCE_FORMAT = 86 | String.format( 87 | "^bigquery\\.table\\.(?<%s>%s)\\.(?<%s>%s)\\.(?<%s>%s)$", 88 | PROJECT_ID_TAG, PROJECT_PATTERN, DATASET_ID_TAG, DATASET_PATTERN, TABLE_ID_TAG, 89 | TABLE_PATTERN); 90 | 91 | private static final String BQ_RESOURCE_FORMAT = 92 | String.format( 93 | "^projects/(?<%s>%s)/datasets/(?<%s>%s)/tables/(?<%s>%s)$", 94 | PROJECT_ID_TAG, PROJECT_PATTERN, DATASET_ID_TAG, DATASET_PATTERN, TABLE_ID_TAG, 95 | TABLE_PATTERN); 96 | 97 | private static final String BQ_LINKED_RESOURCE_FORMAT = 98 | String.format( 99 | "^//bigquery.googleapis.com/projects/(?<%s>%s)/datasets/(?<%s>%s)/tables/(?<%s>%s)$", 100 | PROJECT_ID_TAG, PROJECT_PATTERN, DATASET_ID_TAG, DATASET_PATTERN, TABLE_ID_TAG, 101 | TABLE_PATTERN); 102 | 103 | private static final String BQ_LEGACY_STANDARD_TABLE_NAME_FORMAT = 104 | String.format( 105 | "^(?<%s>%s)[:\\.](?<%s>%s)\\.(?<%s>%s)$", 106 | PROJECT_ID_TAG, PROJECT_PATTERN, DATASET_ID_TAG, DATASET_PATTERN, TABLE_ID_TAG, 107 | TABLE_PATTERN); 108 | 109 | private static final String BQ_LEGACY_STANDARD_TABLE_NAME_FORMAT_NOT_FULLY_QUALIFIED = 110 | String.format( 111 | "^(?<%s>%s)\\.(?<%s>%s)$", 112 | DATASET_ID_TAG, DATASET_PATTERN, TABLE_ID_TAG, TABLE_PATTERN); 113 | 114 | //TODO: How to access INFORMATION_SCHEMA 115 | 116 | private static BigQueryTableEntity extractInformation(String pattern, String resource) { 117 | Matcher matcher = Pattern.compile(pattern).matcher(resource); 118 | if (!matcher.find()) { 119 | throw new IllegalArgumentException( 120 | "input (" + resource + ") not in correct format (" + pattern + ")"); 121 | } 122 | 123 | String projectID = Options.default_project; 124 | if (matcher.groupCount() > 2) { 125 | projectID = matcher.group(PROJECT_ID_TAG); 126 | } else { 127 | Options.missing_project.put( 128 | projectID + "." + matcher.group(DATASET_ID_TAG) + "." + matcher.group(TABLE_ID_TAG), 129 | matcher.group(DATASET_ID_TAG) + "." 
+ matcher.group(TABLE_ID_TAG)); 130 | } 131 | 132 | return BigQueryTableEntity.builder() 133 | .setProjectId(projectID) 134 | .setDataset(matcher.group(DATASET_ID_TAG)) 135 | .setTable(matcher.group(TABLE_ID_TAG)) 136 | .build(); 137 | } 138 | 139 | private BigQueryTableCreator() { 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/main/java/com/borjav/data/converter/BigQuerySchemaConverter.java: -------------------------------------------------------------------------------- 1 | 2 | 3 | package com.borjav.data.converter; 4 | 5 | import static com.google.common.collect.ImmutableList.toImmutableList; 6 | 7 | import com.google.api.services.bigquery.model.Table; 8 | import com.google.api.services.bigquery.model.TableFieldSchema; 9 | import com.google.api.services.bigquery.model.TableReference; 10 | import com.google.api.services.bigquery.model.TableSchema; 11 | import com.google.common.collect.ImmutableList; 12 | import com.google.common.collect.ImmutableSet; 13 | import com.google.zetasql.SimpleColumn; 14 | import com.google.zetasql.SimpleTable; 15 | import com.google.zetasql.StructType; 16 | import com.google.zetasql.Type; 17 | import com.google.zetasql.TypeFactory; 18 | import com.google.zetasql.ZetaSQLType; 19 | import com.google.zetasql.ZetaSQLType.TypeKind; 20 | import com.borjav.data.model.BigQueryTableEntity; 21 | import java.util.ArrayList; 22 | import java.util.List; 23 | 24 | /** 25 | * An adaptor to convert BigQuery Table schema to ZetaSQL Table schema. 26 | */ 27 | public final class BigQuerySchemaConverter { 28 | 29 | /** 30 | * Converts a BigQuery {@link TableSchema} to ZetaSql Table schema. 31 | * 32 | * @param bigQueryTable the BigQuery Table as returned from BigQuery API. 33 | * @return a ZetaSql table reflecting the same name and schema as the input. 
34 | */ 35 | public static SimpleTable convert(Table bigQueryTable) { 36 | 37 | if (!bigQueryTable.getType().equals("TABLE") && !bigQueryTable.getType().equals("EXTERNAL") 38 | && !bigQueryTable.getType().equals("VIEW")) { 39 | throw new IllegalArgumentException( 40 | "Table type should be \"TABLE\", \"VIEW\" or \"EXTERNAL\", found \"" + bigQueryTable.getType() 41 | + "\""); 42 | } 43 | BigQueryTableEntity tableSpec = extractTableSpec(bigQueryTable.getTableReference()); 44 | return 45 | new SimpleTable( 46 | tableSpec.getStandSqlName(), 47 | extractSchema(tableSpec.getStandSqlName(), bigQueryTable.getSchema())); 48 | } 49 | 50 | private static BigQueryTableEntity extractTableSpec(TableReference bqTableRef) { 51 | return 52 | BigQueryTableEntity.builder() 53 | .setProjectId(bqTableRef.getProjectId()) 54 | .setDataset(bqTableRef.getDatasetId()) 55 | .setTable(bqTableRef.getTableId()) 56 | .build(); 57 | } 58 | 59 | private static ImmutableList<SimpleColumn> extractSchema( 60 | String tableName, 61 | TableSchema bqTableSchema) { 62 | ImmutableList<SimpleColumn> simpleSchema = bqTableSchema.getFields().stream() 63 | .map(field -> new SimpleColumn(tableName, field.getName(), extractColumnType(field))) 64 | .collect(toImmutableList()); 65 | return ImmutableList.<SimpleColumn>builder() 66 | .addAll(simpleSchema) 67 | .addAll(buildPseudoColumns(tableName)) 68 | .build(); 69 | 70 | 71 | } 72 | 73 | 74 | private static ImmutableList<SimpleColumn> buildPseudoColumns(String tableName) { 75 | // add the BigQuery pseudo columns (_PARTITIONTIME, _PARTITIONDATE, _TABLE_SUFFIX) to every table schema 76 | List<SimpleColumn> pseudoColumns = new ArrayList<>() { 77 | { 78 | add(new SimpleColumn( 79 | tableName, 80 | "_PARTITIONTIME", 81 | TypeFactory.createSimpleType(ZetaSQLType.TypeKind.TYPE_TIMESTAMP), 82 | true, 83 | false)); 84 | add(new SimpleColumn( 85 | tableName, 86 | "_PARTITIONDATE", 87 | TypeFactory.createSimpleType(TypeKind.TYPE_DATE), 88 | true, 89 | false)); 90 | add(new SimpleColumn( 91 | tableName, 92 | "_TABLE_SUFFIX", 93 | TypeFactory.createSimpleType(TypeKind.TYPE_STRING), 94 | true, 95 | false)); 96 | } 97 | }; 98 | 99 | return pseudoColumns.stream().collect(toImmutableList()); 100 | } 101 | 102 | 103 | private static Type extractColumnType(TableFieldSchema fieldSchema) { 104 | Type fieldType; 105 | 106 | if ("RECORD".equals(fieldSchema.getType())) { 107 | 108 | ImmutableSet.Builder<StructType.StructField> fieldBuilder = ImmutableSet.builder(); 109 | 110 | for (TableFieldSchema recordField : fieldSchema.getFields()) { 111 | Type recordFieldType = extractColumnType(recordField); 112 | 113 | fieldBuilder.add(new StructType.StructField(recordField.getName(), recordFieldType)); 114 | } 115 | 116 | fieldType = TypeFactory.createStructType(fieldBuilder.build()); 117 | } else { 118 | fieldType = TypeFactory.createSimpleType(convertSimpleType(fieldSchema.getType())); 119 | } 120 | 121 | if ("REPEATED".equals(fieldSchema.getMode())) { 122 | return TypeFactory.createArrayType(fieldType); 123 | } 124 | 125 | return fieldType; 126 | } 127 | 128 | private static TypeKind convertSimpleType(String bqType) { 129 | switch (bqType) { 130 | case "STRING": 131 | return TypeKind.TYPE_STRING; 132 | case "BYTES": 133 | return TypeKind.TYPE_BYTES; 134 | case "INTEGER": 135 | return TypeKind.TYPE_INT64; 136 | case "FLOAT": 137 | return TypeKind.TYPE_DOUBLE; 138 | case "NUMERIC": 139 | return TypeKind.TYPE_NUMERIC; 140 | case "BOOLEAN": 141 | return TypeKind.TYPE_BOOL; 142 | case "TIMESTAMP": 143 | return TypeKind.TYPE_TIMESTAMP; 144 | case "DATE": 145 | return TypeKind.TYPE_DATE; 146 | case "TIME": 147 | return TypeKind.TYPE_TIME; 148 | case "DATETIME": 149 | return
TypeKind.TYPE_DATETIME; 150 | case "GEOGRAPHY": 151 | return TypeKind.TYPE_GEOGRAPHY; 152 | case "BIGNUMERIC": 153 | return TypeKind.TYPE_BIGNUMERIC; 154 | case "INTERVAL": 155 | return TypeKind.TYPE_INTERVAL; 156 | case "JSON": 157 | return TypeKind.TYPE_JSON; 158 | default: 159 | return TypeKind.TYPE_UNKNOWN; 160 | } 161 | } 162 | 163 | private BigQuerySchemaConverter() { 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /src/test/java/examples/BigQuerySqlParserLocalSchemaTest.java: -------------------------------------------------------------------------------- 1 | package examples; 2 | 3 | import com.borjav.data.model.ResolvedNodeExtended; 4 | import com.borjav.data.output.OutputLineage; 5 | import com.borjav.data.parser.ZetaSQLResolver; 6 | import com.borjav.data.service.BigQueryTableLoadService; 7 | import com.borjav.data.service.BigQueryZetaSqlSchemaLoader; 8 | import com.fasterxml.jackson.core.JsonProcessingException; 9 | import com.fasterxml.jackson.databind.ObjectMapper; 10 | import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; 11 | import org.junit.Assert; 12 | import org.junit.Test; 13 | import utils.FakeBigQueryServiceFactory; 14 | import utils.TestCase; 15 | import utils.TestResourceLoader; 16 | 17 | public final class BigQuerySqlParserLocalSchemaTest { 18 | 19 | @Test 20 | public void extractColumnLineage_concatColumns_correctColumnNames() 21 | throws JsonProcessingException { 22 | FakeBigQueryServiceFactory fakeBigqueryFactory = 23 | FakeBigQueryServiceFactory 24 | .forTableSchemas( 25 | TestResourceLoader.load("schemas/tableA_schema.json"), 26 | TestResourceLoader.load("schemas/tableB_schema.json")); 27 | BigQueryZetaSqlSchemaLoader fakeSchemaLoader = 28 | new BigQueryZetaSqlSchemaLoader( 29 | BigQueryTableLoadService.usingServiceFactory(fakeBigqueryFactory)); 30 | 31 | ZetaSQLResolver parser = new ZetaSQLResolver(fakeSchemaLoader); 32 | ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); 33 | OutputLineage printer = new OutputLineage(); 34 | 35 | String inputTest = TestResourceLoader.load("sql/kitchen_sink_concat.yaml"); 36 | TestCase testString = mapper.readValue(inputTest, TestCase.class); 37 | String sql = parser.replaceQuotesFullyQualifiedName(testString.query); 38 | ResolvedNodeExtended table = parser.extractLineage(sql); 39 | 40 | Assert.assertEquals( 41 | printer.toYaml(printer.toModel(table, testString.expected_output.name, null,false)), 42 | printer.toYaml(testString.expected_output)); 43 | 44 | } 45 | 46 | @Test 47 | public void extractColumnLineage_multipleOutputColumnsWithAlias_correctColumnLineage() 48 | throws JsonProcessingException { 49 | FakeBigQueryServiceFactory fakeBigqueryFactory = 50 | FakeBigQueryServiceFactory.forTableSchemas( 51 | TestResourceLoader.load("schemas/tableA_schema.json"), 52 | TestResourceLoader.load("schemas/tableB_schema.json")); 53 | BigQueryZetaSqlSchemaLoader fakeSchemaLoader = 54 | new BigQueryZetaSqlSchemaLoader( 55 | BigQueryTableLoadService.usingServiceFactory(fakeBigqueryFactory)); 56 | 57 | ZetaSQLResolver parser = new ZetaSQLResolver(fakeSchemaLoader); 58 | ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); 59 | OutputLineage printer = new OutputLineage(); 60 | 61 | String inputTest = TestResourceLoader.load("sql/kitchen_sink_multiple_output_columns_with_alias.yaml"); 62 | TestCase testString = mapper.readValue(inputTest, TestCase.class); 63 | String sql = parser.replaceQuotesFullyQualifiedName(testString.query); 64 | ResolvedNodeExtended table = 
parser.extractLineage(sql); 65 | 66 | Assert.assertEquals( 67 | printer.toYaml(printer.toModel(table, testString.expected_output.name, null,false)), 68 | printer.toYaml(testString.expected_output)); 69 | } 70 | 71 | @Test 72 | public void extractColumnLineage_multipleOutputColumnsWithoutAlias_correctColumnLineage() 73 | throws JsonProcessingException { 74 | FakeBigQueryServiceFactory fakeBigqueryFactory = 75 | FakeBigQueryServiceFactory.forTableSchemas( 76 | TestResourceLoader.load("schemas/tableA_schema.json"), 77 | TestResourceLoader.load("schemas/tableB_schema.json")); 78 | BigQueryZetaSqlSchemaLoader fakeSchemaLoader = 79 | new BigQueryZetaSqlSchemaLoader( 80 | BigQueryTableLoadService.usingServiceFactory(fakeBigqueryFactory)); 81 | 82 | ZetaSQLResolver parser = new ZetaSQLResolver(fakeSchemaLoader); 83 | ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); 84 | OutputLineage printer = new OutputLineage(); 85 | 86 | String inputTest = TestResourceLoader.load("sql/kitchen_sink_multiple_output_columns_without_alias.yaml"); 87 | TestCase testString = mapper.readValue(inputTest, TestCase.class); 88 | String sql = parser.replaceQuotesFullyQualifiedName(testString.query); 89 | ResolvedNodeExtended table = parser.extractLineage(sql); 90 | 91 | Assert.assertEquals( 92 | printer.toYaml(printer.toModel(table, testString.expected_output.name, null,false)), 93 | printer.toYaml(testString.expected_output)); 94 | } 95 | 96 | @Test 97 | public void 98 | extractColumnLineage_bigQuerySchemaMultipleOutputColumnsWithoutAlias_correctColumnLineage() 99 | throws JsonProcessingException { 100 | FakeBigQueryServiceFactory fakeBigqueryFactory = 101 | FakeBigQueryServiceFactory.forTableSchemas( 102 | TestResourceLoader.load("schemas/daily_report_table_schema.json"), 103 | TestResourceLoader.load("schemas/error_stats_table_schema.json")); 104 | BigQueryZetaSqlSchemaLoader fakeSchemaLoader = 105 | new BigQueryZetaSqlSchemaLoader( 106 | BigQueryTableLoadService.usingServiceFactory(fakeBigqueryFactory)); 107 | 108 | ZetaSQLResolver parser = new ZetaSQLResolver(fakeSchemaLoader); 109 | ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); 110 | OutputLineage printer = new OutputLineage(); 111 | 112 | String inputTest = TestResourceLoader.load( 113 | "sql/bigquery_daily_report_error_stats_join_group_by_aggr_functions.yaml"); 114 | TestCase testString = mapper.readValue(inputTest, TestCase.class); 115 | String sql = parser.replaceQuotesFullyQualifiedName(testString.query); 116 | ResolvedNodeExtended table = parser.extractLineage(sql); 117 | 118 | Assert.assertEquals( 119 | printer.toYaml(printer.toModel(table, testString.expected_output.name, null,false)), 120 | printer.toYaml(testString.expected_output)); 121 | 122 | 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/test/java/utils/FakeBigquery.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 Google LLC 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package utils; 18 | 19 | import static com.google.common.collect.ImmutableMap.toImmutableMap; 20 | import static java.util.function.Function.identity; 21 | 22 | import com.borjav.data.model.BigQueryTableEntity; 23 | import com.google.api.client.json.gson.GsonFactory; 24 | import com.google.api.client.testing.http.MockHttpTransport; 25 | import com.google.api.services.bigquery.Bigquery; 26 | import com.google.api.services.bigquery.model.Table; 27 | import com.google.api.services.bigquery.model.TableFieldSchema; 28 | import com.google.common.annotations.VisibleForTesting; 29 | import com.google.common.collect.ImmutableMap; 30 | import java.io.IOException; 31 | import java.util.HashMap; 32 | import java.util.Map; 33 | import java.util.Objects; 34 | import java.util.concurrent.atomic.AtomicInteger; 35 | 36 | 37 | /** 38 | * Fake implementation of Bigquery client for testing. Does not implement batch() methods. 39 | */ 40 | public class FakeBigquery extends Bigquery { 41 | 42 | public final Map predefinedTables; 43 | @VisibleForTesting 44 | public final AtomicInteger numTimesCounterTables; 45 | @VisibleForTesting 46 | public final AtomicInteger numTimesCounterTablesGetExecute; 47 | 48 | public static FakeBigquery forTableSchemas(String... predefinedTableSchemas) { 49 | ImmutableMap predefinedTables = 50 | GoogleTypesToJsonConverter 51 | .convertFromJson(Table.class, predefinedTableSchemas) 52 | .stream() 53 | .collect(toImmutableMap(Table::getId, identity())); 54 | 55 | return new FakeBigquery(predefinedTables); 56 | } 57 | 58 | public FakeBigquery(Map predefinedTables) { 59 | super(new MockHttpTransport(), new GsonFactory(), null); 60 | this.predefinedTables = new HashMap<>(predefinedTables); 61 | this.numTimesCounterTables = new AtomicInteger(0); 62 | this.numTimesCounterTablesGetExecute = new AtomicInteger(0); 63 | } 64 | 65 | 66 | @Override 67 | public Tables tables() { 68 | 69 | class FakeTables extends Tables { 70 | 71 | @Override 72 | public Get get(String projectId, String datasetId, String tableId) { 73 | return new Get(projectId, datasetId, tableId) { 74 | @Override 75 | public Table execute() throws IOException { 76 | numTimesCounterTablesGetExecute.incrementAndGet(); 77 | 78 | return safeTableGet(projectId, datasetId, tableId); 79 | } 80 | }; 81 | } 82 | 83 | @Override 84 | public Patch patch(String projectId, String datasetId, String tableId, Table content) { 85 | 86 | return new Patch(projectId, datasetId, tableId, content) { 87 | @Override 88 | public Table execute() throws IOException { 89 | Table table = safeTableGet(projectId, datasetId, tableId); 90 | validateFields(table.getSchema().getFields(), content.getSchema().getFields()); 91 | table.getSchema().setFields(content.getSchema().getFields()); 92 | return table; 93 | } 94 | 95 | private void validateFields(java.util.List existingFields, 96 | java.util.List updatedFields) throws IOException { 97 | Map existingFieldMap = convertFieldListToMap(existingFields); 98 | Map updatedFieldMap = convertFieldListToMap(updatedFields); 99 | 100 | for (Entry entry : existingFieldMap.entrySet()) { 101 | TableFieldSchema existingField = entry.getValue(); 102 | TableFieldSchema updatedField = updatedFieldMap.get(entry.getKey()); 103 | checkFieldMatches(existingField, updatedField); 104 | 105 | if (existingField.getType().equals("RECORD")) { 106 | validateFields(existingField.getFields(), updatedField.getFields()); 
107 | } 108 | } 109 | } 110 | 111 | private ImmutableMap convertFieldListToMap( 112 | java.util.List fields) { 113 | return fields.stream().collect(toImmutableMap(TableFieldSchema::getName, identity())); 114 | } 115 | 116 | private void checkFieldMatches( 117 | TableFieldSchema existingField, 118 | TableFieldSchema updatedField) throws IOException { 119 | boolean matches = 120 | (existingField != null) 121 | && (updatedField != null) 122 | && Objects.equals(existingField.getName(), updatedField.getName()) 123 | && Objects.equals(existingField.getType(), updatedField.getType()) 124 | && Objects.equals(existingField.getMode(), updatedField.getMode()); 125 | 126 | if (!matches) { 127 | String fieldName = (existingField == null) ? "null" : existingField.getName(); 128 | throw new IOException("Field does not match: " + fieldName); 129 | } 130 | } 131 | }; 132 | } 133 | } 134 | 135 | numTimesCounterTables.incrementAndGet(); 136 | return new FakeTables(); 137 | } 138 | 139 | private Table safeTableGet(String projectId, String datasetId, String tableId) 140 | throws IOException { 141 | BigQueryTableEntity tableEntity = BigQueryTableEntity.create(projectId, datasetId, tableId); 142 | Table table = predefinedTables.get(tableEntity.getLegacySqlName()); 143 | 144 | if (table == null) { 145 | throw new IOException( 146 | String.format("Table Not Found %s:%s.%s", projectId, datasetId, tableId)); 147 | } 148 | return table; 149 | } 150 | 151 | //TODO: Implement batch request tracking using MockTransport. 152 | } 153 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/customers_groupby_sets.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | with customers as ( 3 | 4 | select * from `catalog.jaffle_shop.stg_customers` 5 | 6 | ), 7 | 8 | orders as ( 9 | 10 | select * from `catalog.jaffle_shop.stg_orders` 11 | 12 | ), 13 | 14 | payments as ( 15 | 16 | select * from `catalog.jaffle_shop.stg_payments` 17 | 18 | ), 19 | 20 | customer_orders as ( 21 | 22 | select 23 | customer_id, 24 | 25 | min(order_date) as first_order, 26 | max(order_date) as most_recent_order, 27 | count(order_id) as number_of_orders 28 | from orders 29 | 30 | group by customer_id 31 | 32 | ), 33 | 34 | customer_payments as ( 35 | 36 | select 37 | orders.customer_id, 38 | sum(amount) as total_amount 39 | 40 | from payments 41 | 42 | left join orders on 43 | payments.order_id = orders.order_id 44 | 45 | group by orders.customer_id 46 | 47 | ), 48 | 49 | final as ( 50 | 51 | select 52 | customers.customer_id, 53 | customers.first_name, 54 | customers.last_name, 55 | customer_orders.first_order, 56 | customer_orders.most_recent_order, 57 | customer_orders.number_of_orders, 58 | customer_payments.total_amount as customer_lifetime_value 59 | 60 | from customers 61 | 62 | left join customer_orders 63 | on customers.customer_id = customer_orders.customer_id 64 | 65 | left join customer_payments 66 | on customers.customer_id = customer_payments.customer_id 67 | 68 | ), groupby AS( 69 | select customer_id,first_name, 70 | SUM(customer_lifetime_value) AS total_customer_lifetime_value 71 | FROM final 72 | where first_order is not null 73 | GROUP BY GROUPING SETS (customer_id, first_name) 74 | ), final_no_limit AS( 75 | 76 | select customer_id,first_name,total_customer_lifetime_value 77 | from groupby 78 | ORDER BY 1 79 | ) 80 | select * from final_no_limit 81 | ORDER BY 2 82 | LIMIT 12 83 | 84 | expected_output: 85 | name: "customers" 
86 | output_columns: 87 | - name: "customer_id" 88 | references: 89 | - project_name: "catalog" 90 | dataset_name: "jaffle_shop" 91 | table_name: "stg_customers" 92 | name: "customer_id" 93 | - name: "first_name" 94 | references: 95 | - project_name: "catalog" 96 | dataset_name: "jaffle_shop" 97 | table_name: "stg_customers" 98 | name: "first_name" 99 | - name: "total_customer_lifetime_value" 100 | references: 101 | - project_name: "catalog" 102 | dataset_name: "jaffle_shop" 103 | table_name: "stg_payments" 104 | name: "amount" 105 | joins: 106 | - join_type: "LEFT" 107 | left_columns: 108 | - name: "order_id" 109 | references: 110 | - project_name: "catalog" 111 | dataset_name: "jaffle_shop" 112 | table_name: "stg_payments" 113 | name: "order_id" 114 | used_for: 115 | - "JOIN_LEFT_TABLE" 116 | right_columns: 117 | - name: "order_id" 118 | references: 119 | - project_name: "catalog" 120 | dataset_name: "jaffle_shop" 121 | table_name: "stg_orders" 122 | name: "order_id" 123 | used_for: 124 | - "JOIN_RIGHT_TABLE" 125 | - join_type: "LEFT" 126 | left_columns: 127 | - name: "customer_id" 128 | references: 129 | - project_name: "catalog" 130 | dataset_name: "jaffle_shop" 131 | table_name: "stg_customers" 132 | name: "customer_id" 133 | used_for: 134 | - "JOIN_LEFT_TABLE" 135 | right_columns: 136 | - name: "customer_id" 137 | references: 138 | - project_name: "catalog" 139 | dataset_name: "jaffle_shop" 140 | table_name: "stg_orders" 141 | name: "customer_id" 142 | used_for: 143 | - "JOIN_RIGHT_TABLE" 144 | - join_type: "LEFT" 145 | left_columns: 146 | - name: "customer_id" 147 | references: 148 | - project_name: "catalog" 149 | dataset_name: "jaffle_shop" 150 | table_name: "stg_customers" 151 | name: "customer_id" 152 | used_for: 153 | - "JOIN_LEFT_TABLE" 154 | right_columns: 155 | - name: "customer_id" 156 | references: 157 | - project_name: "catalog" 158 | dataset_name: "jaffle_shop" 159 | table_name: "stg_orders" 160 | name: "customer_id" 161 | used_for: 162 | - "JOIN_RIGHT_TABLE" 163 | filters: 164 | - name: "_first_order_" 165 | references: 166 | - project_name: "catalog" 167 | dataset_name: "jaffle_shop" 168 | table_name: "stg_orders" 169 | name: "order_date" 170 | used_for: 171 | - "FILTER" 172 | aggregations: 173 | - name: "_customer_id_" 174 | references: 175 | - project_name: "catalog" 176 | dataset_name: "jaffle_shop" 177 | table_name: "stg_orders" 178 | name: "customer_id" 179 | used_for: 180 | - "GROUP_BY" 181 | - name: "_customer_id_" 182 | references: 183 | - project_name: "catalog" 184 | dataset_name: "jaffle_shop" 185 | table_name: "stg_customers" 186 | name: "customer_id" 187 | used_for: 188 | - "GROUP_BY" 189 | - name: "_first_name_" 190 | references: 191 | - project_name: "catalog" 192 | dataset_name: "jaffle_shop" 193 | table_name: "stg_customers" 194 | name: "first_name" 195 | used_for: 196 | - "GROUP_BY" 197 | other_used_columns: 198 | - name: "_first_name_" 199 | references: 200 | - project_name: "catalog" 201 | dataset_name: "jaffle_shop" 202 | table_name: "stg_customers" 203 | name: "first_name" 204 | used_for: 205 | - "ORDER_BY" 206 | - name: "_customer_id_" 207 | references: 208 | - project_name: "catalog" 209 | dataset_name: "jaffle_shop" 210 | table_name: "stg_customers" 211 | name: "customer_id" 212 | used_for: 213 | - "ORDER_BY" 214 | type: "select" 215 | selected_tables: 216 | - "catalog.jaffle_shop.stg_orders" 217 | - "catalog.jaffle_shop.stg_customers" 218 | - "catalog.jaffle_shop.stg_payments" 
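The benchmark YAML files above pair a "query" block with an "expected_output" tree shaped like OutputModel.Model. As a minimal sketch of how the same pieces compose outside of JUnit, the snippet below reuses the fake-schema wiring from BigQuerySqlParserLocalSchemaTest to resolve an ad-hoc query against locally loaded schemas and print its lineage as YAML. The class name, the query text and the table name in it are illustrative placeholders, not part of the repository.

import com.borjav.data.model.ResolvedNodeExtended;
import com.borjav.data.output.OutputLineage;
import com.borjav.data.parser.ZetaSQLResolver;
import com.borjav.data.service.BigQueryTableLoadService;
import com.borjav.data.service.BigQueryZetaSqlSchemaLoader;
import utils.FakeBigQueryServiceFactory;
import utils.TestResourceLoader;

public class LineageSketch {

  public static void main(String[] args) throws Exception {
    // Serve table schemas from local JSON fixtures instead of the live BigQuery API,
    // the same wiring the local-schema tests use.
    FakeBigQueryServiceFactory fakeFactory =
        FakeBigQueryServiceFactory.forTableSchemas(
            TestResourceLoader.load("schemas/tableA_schema.json"),
            TestResourceLoader.load("schemas/tableB_schema.json"));
    BigQueryZetaSqlSchemaLoader schemaLoader =
        new BigQueryZetaSqlSchemaLoader(
            BigQueryTableLoadService.usingServiceFactory(fakeFactory));

    ZetaSQLResolver parser = new ZetaSQLResolver(schemaLoader);
    OutputLineage printer = new OutputLineage();

    // Illustrative query only: the referenced table must exist in the loaded schemas.
    String sql = parser.replaceQuotesFullyQualifiedName(
        "select * from `myproject`.`mydataset`.`tableA`");
    ResolvedNodeExtended table = parser.extractLineage(sql);

    // Same call chain the tests use when comparing against expected_output.
    System.out.println(printer.toYaml(printer.toModel(table, "ad_hoc_query", null, false)));
  }
}

In a real run, a BigQueryServiceFactory backed by actual credentials would presumably take the place of the fake factory so that schemas are pulled from the live API instead of the JSON fixtures.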
-------------------------------------------------------------------------------- /src/test/java/examples/ASTExplorerTest.java: -------------------------------------------------------------------------------- 1 | package examples; 2 | 3 | import com.borjav.data.model.ResolvedNodeExtended; 4 | import com.borjav.data.output.OutputLineage; 5 | import com.borjav.data.parser.ZetaSQLResolver; 6 | import com.fasterxml.jackson.databind.ObjectMapper; 7 | import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; 8 | import com.google.zetasql.SimpleCatalog; 9 | import java.io.File; 10 | import org.junit.Assert; 11 | import org.junit.Test; 12 | import utils.FakeCatalogBuilder; 13 | import utils.TestCase; 14 | 15 | public class ASTExplorerTest { 16 | 17 | 18 | @Test 19 | public void testUnnestCreateView() throws Exception { 20 | File file = new File("src/test/resources/sql/benchmark/unnest_create_view.yaml"); 21 | genericTest(file, null, false); 22 | } 23 | 24 | @Test 25 | public void testUnnestCreate() throws Exception { 26 | File file = new File("src/test/resources/sql/benchmark/unnest_create.yaml"); 27 | genericTest(file, null, false); 28 | } 29 | 30 | @Test 31 | public void testUnnest() throws Exception { 32 | File file = new File("src/test/resources/sql/benchmark/unnest.yaml"); 33 | genericTest(file, null, false); 34 | } 35 | 36 | @Test 37 | public void testUdf() throws Exception { 38 | File file = new File("src/test/resources/sql/benchmark/udf.yaml"); 39 | genericTest(file, null, false); 40 | } 41 | 42 | @Test 43 | public void testTimestamps() throws Exception { 44 | File file = new File("src/test/resources/sql/benchmark/timestamps.yaml"); 45 | genericTest(file, null, false); 46 | } 47 | 48 | @Test 49 | public void testStgPayments() throws Exception { 50 | File file = new File("src/test/resources/sql/benchmark/stg_payments.yaml"); 51 | genericTest(file, null, false); 52 | } 53 | 54 | @Test 55 | public void testStgCustomers() throws Exception { 56 | File file = new File("src/test/resources/sql/benchmark/stg_customers.yaml"); 57 | genericTest(file, null, false); 58 | } 59 | 60 | @Test 61 | public void testSelectPrune() throws Exception { 62 | File file = new File("src/test/resources/sql/benchmark/select_prune.yaml"); 63 | genericTest(file, null, false); 64 | } 65 | 66 | 67 | @Test 68 | public void testSubqueryUnnest() throws Exception { 69 | File file = new File("src/test/resources/sql/benchmark/subquery_unnest.yaml"); 70 | genericTest(file, null, false); 71 | } 72 | 73 | @Test 74 | public void testPivot() throws Exception { 75 | File file = new File("src/test/resources/sql/benchmark/pivot.yaml"); 76 | genericTest(file, null, false); 77 | } 78 | 79 | @Test 80 | public void testMultipleRefs() throws Exception { 81 | File file = new File("src/test/resources/sql/benchmark/multiple_refs.yaml"); 82 | genericTest(file, null, false); 83 | } 84 | 85 | @Test 86 | public void testMessyStruct() throws Exception { 87 | File file = new File("src/test/resources/sql/benchmark/messy_struct.yaml"); 88 | genericTest(file, null, false); 89 | } 90 | 91 | @Test 92 | public void testCustomers() throws Exception { 93 | File file = new File("src/test/resources/sql/benchmark/customers.yaml"); 94 | genericTest(file, null, false); 95 | } 96 | 97 | @Test 98 | public void testJsonFunctions() throws Exception { 99 | File file = new File("src/test/resources/sql/benchmark/json_functions.yaml"); 100 | genericTest(file, null, false); 101 | } 102 | 103 | @Test 104 | public void testGroupBy() throws Exception { 105 | File file = new 
File("src/test/resources/sql/benchmark/customers_groupby.yaml"); 106 | genericTest(file, null, false); 107 | } 108 | 109 | @Test 110 | public void testGroupBySets() throws Exception { 111 | File file = new File("src/test/resources/sql/benchmark/customers_groupby_sets.yaml"); 112 | genericTest(file, null, false); 113 | } 114 | 115 | @Test 116 | public void testJsonFunctionsStruct() throws Exception { 117 | File file = new File("src/test/resources/sql/benchmark/json_functions_struct.yaml"); 118 | genericTest(file, null, false); 119 | } 120 | 121 | @Test 122 | public void testParameters() throws Exception { 123 | File file = new File("src/test/resources/sql/benchmark/parameter.yaml"); 124 | genericTest(file, null, false); 125 | } 126 | 127 | 128 | @Test 129 | public void testJsonWithLiterals() throws Exception { 130 | File file = new File("src/test/resources/sql/benchmark/json_functions_with_literals.yaml"); 131 | genericTest(file, null, true); 132 | } 133 | 134 | @Test 135 | public void testMessyUnnesting() throws Exception { 136 | File file = new File("src/test/resources/sql/benchmark/messy_unnesting.yaml"); 137 | genericTest(file, null, false); 138 | } 139 | 140 | @Test 141 | public void testAnalyticalFunctions() throws Exception { 142 | File file = new File("src/test/resources/sql/benchmark/analytical_functions.yaml"); 143 | genericTest(file, null, false); 144 | } 145 | 146 | @Test 147 | public void testPivotWithLiterals() throws Exception { 148 | File file = new File("src/test/resources/sql/benchmark/pivot_where_with_literals.yaml"); 149 | genericTest(file, null, true); 150 | } 151 | 152 | 153 | @Test 154 | public void testRecursiveCTE() throws Exception { 155 | File file = new File("src/test/resources/sql/benchmark/recursive_cte.yaml"); 156 | genericTest(file, null, false); 157 | } 158 | 159 | @Test 160 | public void testLookerSubqueries() throws Exception { 161 | File file = new File("src/test/resources/sql/benchmark/looker_subquery.yaml"); 162 | genericTest(file, null, false); 163 | } 164 | 165 | @Test 166 | public void testLookerSubqueryCrazy() throws Exception { 167 | File file = new File("src/test/resources/sql/benchmark/looker_subquery_crazy.yaml"); 168 | genericTest(file, null, false); 169 | } 170 | 171 | @Test 172 | public void testCount() throws Exception { 173 | File file = new File("src/test/resources/sql/benchmark/count.yaml"); 174 | genericTest(file, null, false); 175 | } 176 | 177 | @Test 178 | public void testArrayAggMultiNests() throws Exception { 179 | File file = new File("src/test/resources/sql/benchmark/array_agg_multiplenests.yaml"); 180 | genericTest(file, null, false); 181 | } 182 | 183 | public static void genericTest(File file, SimpleCatalog catalog, boolean printLeafs) 184 | throws Exception { 185 | if (catalog == null) { 186 | catalog = FakeCatalogBuilder.buildCatalog(); 187 | } 188 | ZetaSQLResolver parser = new ZetaSQLResolver(catalog); 189 | ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); 190 | OutputLineage printer = new OutputLineage(); 191 | 192 | TestCase test = mapper.readValue(file, TestCase.class); 193 | String sql = parser.replaceQuotesFullyQualifiedName(test.query); 194 | ResolvedNodeExtended table = parser.extractLineage(sql); 195 | 196 | Assert.assertEquals( 197 | printer.toYaml(printer.toModel(table, test.expected_output.name, null, printLeafs)), 198 | printer.toYaml(test.expected_output)); 199 | } 200 | } 201 | -------------------------------------------------------------------------------- 
/src/main/java/com/borjav/data/parser/ZetaSQLResolver.java: -------------------------------------------------------------------------------- 1 | package com.borjav.data.parser; 2 | 3 | import static com.google.common.collect.ImmutableSet.toImmutableSet; 4 | import static com.google.zetasql.Analyzer.extractTableNamesFromNextStatement; 5 | 6 | import com.borjav.data.model.ResolvedNodeExtended; 7 | import com.borjav.data.options.Options; 8 | import com.borjav.data.service.BigQueryZetaSqlSchemaLoader; 9 | import com.google.common.collect.ImmutableSet; 10 | import com.google.zetasql.Analyzer; 11 | import com.google.zetasql.AnalyzerOptions; 12 | import com.google.zetasql.LanguageOptions; 13 | import com.google.zetasql.ParseResumeLocation; 14 | import com.google.zetasql.SimpleCatalog; 15 | import com.google.zetasql.ZetaSQLBuiltinFunctionOptions; 16 | import com.google.zetasql.ZetaSQLOptions; 17 | import com.google.zetasql.resolvedast.ResolvedNodes; 18 | import java.util.Iterator; 19 | import java.util.Map; 20 | import javax.annotation.Nullable; 21 | import org.apache.commons.lang3.StringUtils; 22 | 23 | public class ZetaSQLResolver { 24 | 25 | private final BigQueryZetaSqlSchemaLoader tableSchemaLoader; 26 | private SimpleCatalog catalog; 27 | 28 | public ZetaSQLResolver(@Nullable BigQueryZetaSqlSchemaLoader tableSchemaLoader) { 29 | this.catalog = new SimpleCatalog("data-catalog"); 30 | catalog.addZetaSQLFunctionsAndTypes( 31 | new ZetaSQLBuiltinFunctionOptions(enableAllLanguageFeatures())); 32 | this.tableSchemaLoader = tableSchemaLoader; 33 | } 34 | 35 | public ZetaSQLResolver(SimpleCatalog catalog) { 36 | this.catalog = catalog; 37 | catalog.addZetaSQLFunctionsAndTypes( 38 | new ZetaSQLBuiltinFunctionOptions(enableAllLanguageFeatures())); 39 | this.tableSchemaLoader = null; 40 | } 41 | 42 | private ImmutableSet extractReferencedTables(ParseResumeLocation aParseResumeLocation, 43 | AnalyzerOptions analyzerOptions) { 44 | 45 | return extractTableNamesFromNextStatement(aParseResumeLocation, analyzerOptions).stream() 46 | .map(k -> StringUtils.join(k, ".")) 47 | .collect(toImmutableSet()); 48 | } 49 | 50 | 51 | public String replaceNotFullyQualifiedTables(String sql) { 52 | if (Options.missing_project.size() >= 1) { 53 | Iterator> it = Options.missing_project.entrySet().iterator(); 54 | while (it.hasNext()) { 55 | Map.Entry pair = (Map.Entry) it.next(); 56 | sql = sql.replaceAll("`?" 
+ pair.getValue().toString().replace(".", "\\.") + "`?", 57 | "`" + pair.getKey().toString() + "`"); 58 | it.remove(); // avoids a ConcurrentModificationException 59 | } 60 | } 61 | 62 | return sql; 63 | } 64 | 65 | 66 | public String replaceQuotesFullyQualifiedName(String sql) { 67 | return sql.replaceAll("`(.*)`\\.`(.*)`\\.`(.*)`", "`$1.$2.$3`"); 68 | } 69 | 70 | 71 | public ResolvedNodeExtended extractLineage(String sql) { 72 | 73 | buildCatalogWithQueryTables(sql); 74 | 75 | ParseResumeLocation aParseResumeLocation = new ParseResumeLocation(sql); 76 | aParseResumeLocation.getInput(); 77 | ResolvedNodeExtended finalTable = null; 78 | while (sql.getBytes().length > aParseResumeLocation.getBytePosition()) { 79 | try { 80 | ASTExplorer resolver = new ASTExplorer(this.catalog); 81 | finalTable = resolver.resolve( 82 | Analyzer.analyzeNextStatement(aParseResumeLocation, enableAllFeatures(), this.catalog)); 83 | } catch (Exception e) { 84 | e.printStackTrace(); 85 | throw e; 86 | } 87 | } 88 | 89 | return finalTable; 90 | } 91 | 92 | 93 | public ResolvedNodeExtended extractLineage(String sql, SimpleCatalog catalog) throws Exception { 94 | this.catalog = catalog; 95 | 96 | ParseResumeLocation aParseResumeLocation = new ParseResumeLocation(sql); 97 | aParseResumeLocation.getInput(); 98 | ResolvedNodeExtended finalTable = null; 99 | 100 | while (sql.getBytes().length > aParseResumeLocation.getBytePosition()) { 101 | try { 102 | ASTExplorer resolver = new ASTExplorer(this.catalog); 103 | finalTable = resolver.resolve( 104 | Analyzer.analyzeNextStatement(aParseResumeLocation, enableAllFeatures(), this.catalog)); 105 | final ResolvedNodes.ResolvedStatement stmt = 106 | Analyzer.analyzeNextStatement(aParseResumeLocation, enableAllFeatures(), 107 | this.catalog); 108 | Analyzer.buildStatement(stmt, this.catalog); 109 | } catch (Exception e) { 110 | e.printStackTrace(); 111 | } 112 | } 113 | return finalTable; 114 | } 115 | 116 | public SimpleCatalog getCatalog() { 117 | 118 | return catalog; 119 | } 120 | 121 | 122 | private LanguageOptions enableAllLanguageFeatures() { 123 | LanguageOptions languageOptions = new LanguageOptions(); 124 | languageOptions.setSupportsAllStatementKinds(); 125 | languageOptions = languageOptions.enableMaximumLanguageFeatures(); 126 | //usually some new syntax are not supported by the parser, so we need to enable them manually 127 | languageOptions.enableLanguageFeature( 128 | ZetaSQLOptions.LanguageFeature.FEATURE_V_1_3_CONCAT_MIXED_TYPES); 129 | languageOptions.enableLanguageFeature(ZetaSQLOptions.LanguageFeature.FEATURE_V_1_3_QUALIFY); 130 | languageOptions.enableLanguageFeature( 131 | ZetaSQLOptions.LanguageFeature.FEATURE_ANALYTIC_FUNCTIONS); 132 | languageOptions.enableLanguageFeature(ZetaSQLOptions.LanguageFeature.FEATURE_EXTENDED_TYPES); 133 | languageOptions.enableLanguageFeature( 134 | ZetaSQLOptions.LanguageFeature.FEATURE_V_1_3_DECIMAL_ALIAS); 135 | languageOptions.enableLanguageFeature( 136 | ZetaSQLOptions.LanguageFeature.FEATURE_BETWEEN_UINT64_INT64); 137 | languageOptions.enableLanguageFeature( 138 | ZetaSQLOptions.LanguageFeature.FEATURE_V_1_3_FORMAT_IN_CAST); 139 | languageOptions.enableLanguageFeature(ZetaSQLOptions.LanguageFeature.FEATURE_RANGE_TYPE); 140 | languageOptions.enableLanguageFeature(ZetaSQLOptions.LanguageFeature.FEATURE_INTERVAL_TYPE); 141 | languageOptions.enableLanguageFeature( 142 | ZetaSQLOptions.LanguageFeature.FEATURE_V_1_1_ORDER_BY_IN_AGGREGATE); 143 | languageOptions.enableLanguageFeature( 144 | 
ZetaSQLOptions.LanguageFeature.FEATURE_V_1_4_GROUPING_SETS); 145 | 146 | // needed to enable qualify without the where clause 147 | languageOptions.enableReservableKeyword("QUALIFY"); 148 | 149 | return languageOptions; 150 | } 151 | 152 | private AnalyzerOptions enableAllFeatures() { 153 | AnalyzerOptions analyzerOptions = new AnalyzerOptions(); 154 | // if false, the parser will extract all the columns from the referenced tables 155 | 156 | analyzerOptions.setLanguageOptions(enableAllLanguageFeatures()); 157 | analyzerOptions.setPruneUnusedColumns(true); 158 | analyzerOptions.setAllowUndeclaredParameters(true); 159 | return analyzerOptions; 160 | } 161 | 162 | 163 | private void buildCatalogWithQueryTables(String sql) { 164 | 165 | if (tableSchemaLoader != null) { 166 | ParseResumeLocation aParseResumeLocation = new ParseResumeLocation(sql); 167 | while (sql.getBytes().length > aParseResumeLocation.getBytePosition()) { 168 | tableSchemaLoader.loadSchemas( 169 | extractReferencedTables(aParseResumeLocation, enableAllFeatures())) 170 | .forEach(k -> { 171 | if (catalog.getTable(k.getName(), null) == null) { 172 | catalog.addSimpleTable(k); 173 | } 174 | }); 175 | } 176 | } 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/array_agg_multiplenests.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | WITH pc AS ( 3 | SELECT 4 | DISTINCT 5 | triplenest.secondnesting.d as user_id, 6 | triplenest.secondnesting.firstnesting.c as model, 7 | triplenest.secondnesting.firstnesting.b as manufacturer 8 | FROM 9 | `catalog.jaffle_shop.multiplenests` 10 | WHERE _TABLE_SUFFIX BETWEEN '20231001' AND '20231231' 11 | ), pd AS ( 12 | SELECT 13 | DISTINCT rescordenesed.anothernesting23 AS user_id 14 | FROM 15 | `catalog.jaffle_shop.nested_table` 16 | WHERE _TABLE_SUFFIX BETWEEN '20231001' AND '20231231' 17 | ), 18 | 19 | sub AS ( 20 | SELECT 21 | pc.user_id AS closed_uid, 22 | pd.user_id AS discovered_uid, 23 | triplenest.secondnesting.d AS reporting_country, 24 | model, 25 | manufacturer 26 | FROM 27 | pc 28 | FULL JOIN 29 | pd ON pd.user_id = pc.user_id 30 | LEFT JOIN 31 | `catalog.jaffle_shop.multiplenests` s ON COALESCE(pc.user_id, pd.user_id) = s.triplenest.secondnesting.d 32 | ) 33 | 34 | SELECT 35 | reporting_country, 36 | ARRAY_AGG( 37 | STRUCT( 38 | model, 39 | manufacturer, 40 | closed, 41 | discovered, 42 | IEEE_DIVIDE(closed, discovered) AS ratio) 43 | ORDER BY 44 | closed DESC LIMIT 5) AS devices 45 | FROM 46 | ( 47 | SELECT 48 | reporting_country, 49 | model, 50 | manufacturer, 51 | COUNT(DISTINCT closed_uid) AS closed, 52 | COUNT(DISTINCT discovered_uid) AS discovered, 53 | FROM sub 54 | GROUP BY 1,2,3 55 | ) 56 | GROUP BY 57 | 1 58 | 59 | expected_output: 60 | name: "array_agg_multiplenests" 61 | output_columns: 62 | - name: "reporting_country" 63 | references: 64 | - project_name: "catalog" 65 | dataset_name: "jaffle_shop" 66 | table_name: "multiplenests" 67 | name: "triplenest.secondnesting.d" 68 | - name: "devices.model" 69 | references: 70 | - project_name: "catalog" 71 | dataset_name: "jaffle_shop" 72 | table_name: "multiplenests" 73 | name: "triplenest.secondnesting.firstnesting.c" 74 | - name: "devices.manufacturer" 75 | references: 76 | - project_name: "catalog" 77 | dataset_name: "jaffle_shop" 78 | table_name: "multiplenests" 79 | name: "triplenest.secondnesting.firstnesting.b" 80 | - name: "devices.closed" 81 | references: 82 | - project_name: 
"catalog" 83 | dataset_name: "jaffle_shop" 84 | table_name: "multiplenests" 85 | name: "triplenest.secondnesting.d" 86 | - name: "devices.discovered" 87 | references: 88 | - project_name: "catalog" 89 | dataset_name: "jaffle_shop" 90 | table_name: "nested_table" 91 | name: "rescordenesed.anothernesting23" 92 | - name: "devices.ratio" 93 | references: 94 | - project_name: "catalog" 95 | dataset_name: "jaffle_shop" 96 | table_name: "multiplenests" 97 | name: "triplenest.secondnesting.d" 98 | - project_name: "catalog" 99 | dataset_name: "jaffle_shop" 100 | table_name: "nested_table" 101 | name: "rescordenesed.anothernesting23" 102 | joins: 103 | - join_type: "FULL" 104 | left_columns: 105 | - name: "user_id" 106 | references: 107 | - project_name: "catalog" 108 | dataset_name: "jaffle_shop" 109 | table_name: "multiplenests" 110 | name: "triplenest.secondnesting.d" 111 | used_for: 112 | - "JOIN_LEFT_TABLE" 113 | right_columns: 114 | - name: "user_id" 115 | references: 116 | - project_name: "catalog" 117 | dataset_name: "jaffle_shop" 118 | table_name: "nested_table" 119 | name: "rescordenesed.anothernesting23" 120 | used_for: 121 | - "JOIN_RIGHT_TABLE" 122 | - join_type: "LEFT" 123 | left_columns: 124 | - name: "user_id" 125 | references: 126 | - project_name: "catalog" 127 | dataset_name: "jaffle_shop" 128 | table_name: "multiplenests" 129 | name: "triplenest.secondnesting.d" 130 | used_for: 131 | - "JOIN_LEFT_TABLE" 132 | - name: "user_id" 133 | references: 134 | - project_name: "catalog" 135 | dataset_name: "jaffle_shop" 136 | table_name: "nested_table" 137 | name: "rescordenesed.anothernesting23" 138 | used_for: 139 | - "JOIN_LEFT_TABLE" 140 | right_columns: 141 | - name: "triplenest.secondnesting.d" 142 | references: 143 | - project_name: "catalog" 144 | dataset_name: "jaffle_shop" 145 | table_name: "multiplenests" 146 | name: "triplenest.secondnesting.d" 147 | used_for: 148 | - "JOIN_RIGHT_TABLE" 149 | filters: 150 | - name: "__TABLE_SUFFIX_" 151 | references: 152 | - project_name: "catalog" 153 | dataset_name: "jaffle_shop" 154 | table_name: "multiplenests" 155 | name: "_TABLE_SUFFIX" 156 | used_for: 157 | - "FILTER" 158 | - name: "__TABLE_SUFFIX_" 159 | references: 160 | - project_name: "catalog" 161 | dataset_name: "jaffle_shop" 162 | table_name: "nested_table" 163 | name: "_TABLE_SUFFIX" 164 | used_for: 165 | - "FILTER" 166 | aggregations: 167 | - name: "_reporting_country_" 168 | references: 169 | - project_name: "catalog" 170 | dataset_name: "jaffle_shop" 171 | table_name: "multiplenests" 172 | name: "triplenest.secondnesting.d" 173 | used_for: 174 | - "GROUP_BY" 175 | - name: "_model_" 176 | references: 177 | - project_name: "catalog" 178 | dataset_name: "jaffle_shop" 179 | table_name: "multiplenests" 180 | name: "triplenest.secondnesting.firstnesting.c" 181 | used_for: 182 | - "GROUP_BY" 183 | - name: "_manufacturer_" 184 | references: 185 | - project_name: "catalog" 186 | dataset_name: "jaffle_shop" 187 | table_name: "multiplenests" 188 | name: "triplenest.secondnesting.firstnesting.b" 189 | used_for: 190 | - "GROUP_BY" 191 | - name: "_user_id_" 192 | references: 193 | - project_name: "catalog" 194 | dataset_name: "jaffle_shop" 195 | table_name: "multiplenests" 196 | name: "triplenest.secondnesting.d" 197 | used_for: 198 | - "GROUP_BY" 199 | - name: "_user_id_" 200 | references: 201 | - project_name: "catalog" 202 | dataset_name: "jaffle_shop" 203 | table_name: "nested_table" 204 | name: "rescordenesed.anothernesting23" 205 | used_for: 206 | - "GROUP_BY" 207 | type: "select" 208 
| selected_tables: 209 | - "catalog.jaffle_shop.multiplenests" 210 | - "catalog.jaffle_shop.nested_table" 211 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/looker_subquery_crazy.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | 3 | WITH base AS ( SELECT * 4 | FROM (SELECT clmn1_, SUM(clmn2_) AS clmn100000_ 5 | FROM (SELECT * 6 | FROM (SELECT t0.cost AS clmn2_, t0.date AS clmn0_, t0.anothernesting AS clmn1_ 7 | FROM (WITH base 8 | AS (SELECT DATE_TRUNC(DATE(anotherdate), DAY) AS date, rescordenesed.anothernesting, ARRAY_TO_STRING(ARRAY( 9 | SELECT 10 | value 11 | FROM 12 | UNNEST(labels) 13 | WHERE 14 | key = "goog-resource-type" 15 | ), " ") AS resource_type, first_name, SUM (cost) AS cost 16 | FROM 17 | `catalog.jaffle_shop.nested_table` 18 | WHERE 19 | DATE (_PARTITIONTIME) 20 | > DATE_SUB(`CURRENT_DATE`() 21 | , INTERVAL 62 day) 22 | AND acolumn.id IN ("blabla" 23 | , "bloblob") 24 | GROUP BY 1, 2, 3, 4) SELECT 25 | date, 26 | anothernesting, 27 | SUM(cost) AS cost 28 | FROM 29 | base 30 | GROUP BY 1, 2) t0) 31 | WHERE ((clmn0_ >= DATE "2023-12-13") AND (clmn0_ <= DATE "2024-01-11"))) 32 | GROUP BY clmn1_ ) LIMIT 20000000), final2 AS( 33 | 34 | select STRUCT( clmn100000_ as description, clmn1_ as cost) as clmn0_ from base 35 | WHERE (clmn100000_ > 0) 36 | ), otherfinal AS( 37 | 38 | SELECT clmn0_.description, 39 | ARRAY_AGG(STRUCT(clmn0_.cost, 40 | clmn0_.description)) AS clmn0_ 41 | from final2 42 | GROUP BY 1 43 | ), another AS ( 44 | 45 | select otherfinal.description,cost,c.description AS description2 from otherfinal,unnest(clmn0_) as c 46 | WHERE (c.cost > "0") 47 | ), another3 AS( 48 | select STRUCT(description,cost,description2 AS other) as test from another 49 | ), final2323 AS( 50 | 51 | select test.description, ARRAY_AGG(STRUCT(test.cost,test.other)) as test3 from another3 52 | group by 1 53 | ) 54 | select * from final2323,UNNEST(test3) as unnested_test 55 | 56 | expected_output: 57 | name: "looker subquery but with a lot of nesting and random stuff" 58 | output_columns: 59 | - name: "description" 60 | references: 61 | - project_name: "catalog" 62 | dataset_name: "jaffle_shop" 63 | table_name: "nested_table" 64 | name: "cost" 65 | - name: "test3.cost" 66 | references: 67 | - project_name: "catalog" 68 | dataset_name: "jaffle_shop" 69 | table_name: "nested_table" 70 | name: "cost" 71 | - project_name: "catalog" 72 | dataset_name: "jaffle_shop" 73 | table_name: "nested_table" 74 | name: "rescordenesed.anothernesting" 75 | - name: "test3.other" 76 | references: 77 | - project_name: "catalog" 78 | dataset_name: "jaffle_shop" 79 | table_name: "nested_table" 80 | name: "cost" 81 | - project_name: "catalog" 82 | dataset_name: "jaffle_shop" 83 | table_name: "nested_table" 84 | name: "rescordenesed.anothernesting" 85 | - name: "cost" 86 | references: 87 | - project_name: "catalog" 88 | dataset_name: "jaffle_shop" 89 | table_name: "nested_table" 90 | name: "cost" 91 | - project_name: "catalog" 92 | dataset_name: "jaffle_shop" 93 | table_name: "nested_table" 94 | name: "rescordenesed.anothernesting" 95 | - name: "other" 96 | references: 97 | - project_name: "catalog" 98 | dataset_name: "jaffle_shop" 99 | table_name: "nested_table" 100 | name: "cost" 101 | - project_name: "catalog" 102 | dataset_name: "jaffle_shop" 103 | table_name: "nested_table" 104 | name: "rescordenesed.anothernesting" 105 | filters: 106 | - name: "_date_" 107 | references: 108 | - 
project_name: "catalog" 109 | dataset_name: "jaffle_shop" 110 | table_name: "nested_table" 111 | name: "anotherdate" 112 | used_for: 113 | - "FILTER" 114 | - "GROUP_BY" 115 | - name: "__PARTITIONTIME_" 116 | references: 117 | - project_name: "catalog" 118 | dataset_name: "jaffle_shop" 119 | table_name: "nested_table" 120 | name: "_PARTITIONTIME" 121 | used_for: 122 | - "FILTER" 123 | - name: "_acolumn.id_" 124 | references: 125 | - project_name: "catalog" 126 | dataset_name: "jaffle_shop" 127 | table_name: "nested_table" 128 | name: "acolumn.id" 129 | used_for: 130 | - "FILTER" 131 | - name: "_clmn100000__" 132 | references: 133 | - project_name: "catalog" 134 | dataset_name: "jaffle_shop" 135 | table_name: "nested_table" 136 | name: "cost" 137 | used_for: 138 | - "FILTER" 139 | - name: "_c.cost_" 140 | references: 141 | - project_name: "catalog" 142 | dataset_name: "jaffle_shop" 143 | table_name: "nested_table" 144 | name: "cost" 145 | used_for: 146 | - "FILTER" 147 | - project_name: "catalog" 148 | dataset_name: "jaffle_shop" 149 | table_name: "nested_table" 150 | name: "rescordenesed.anothernesting" 151 | used_for: 152 | - "FILTER" 153 | aggregations: 154 | - name: "_clmn1__" 155 | references: 156 | - project_name: "catalog" 157 | dataset_name: "jaffle_shop" 158 | table_name: "nested_table" 159 | name: "rescordenesed.anothernesting" 160 | used_for: 161 | - "GROUP_BY" 162 | - name: "_date_" 163 | references: 164 | - project_name: "catalog" 165 | dataset_name: "jaffle_shop" 166 | table_name: "nested_table" 167 | name: "anotherdate" 168 | used_for: 169 | - "FILTER" 170 | - "GROUP_BY" 171 | - name: "_date_" 172 | references: 173 | - project_name: "catalog" 174 | dataset_name: "jaffle_shop" 175 | table_name: "nested_table" 176 | name: "anotherdate" 177 | used_for: 178 | - "GROUP_BY" 179 | - name: "_anothernesting_" 180 | references: 181 | - project_name: "catalog" 182 | dataset_name: "jaffle_shop" 183 | table_name: "nested_table" 184 | name: "rescordenesed.anothernesting" 185 | used_for: 186 | - "GROUP_BY" 187 | - name: "_resource_type_" 188 | references: 189 | - project_name: "catalog" 190 | dataset_name: "jaffle_shop" 191 | table_name: "nested_table" 192 | name: "labels.value" 193 | used_for: 194 | - "GROUP_BY" 195 | - name: "_first_name_" 196 | references: 197 | - project_name: "catalog" 198 | dataset_name: "jaffle_shop" 199 | table_name: "nested_table" 200 | name: "first_name" 201 | used_for: 202 | - "GROUP_BY" 203 | - name: "_description_" 204 | references: 205 | - project_name: "catalog" 206 | dataset_name: "jaffle_shop" 207 | table_name: "nested_table" 208 | name: "cost" 209 | used_for: 210 | - "GROUP_BY" 211 | type: "select" 212 | selected_tables: 213 | - "catalog.jaffle_shop.nested_table" 214 | 215 | 216 | 217 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/analytical_functions.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | with customers as ( 3 | 4 | select * from `catalog.jaffle_shop.stg_customers` 5 | 6 | ), 7 | 8 | orders as ( 9 | 10 | select * from `catalog.jaffle_shop.stg_orders` 11 | 12 | ), 13 | 14 | payments as ( 15 | 16 | select * from `catalog.jaffle_shop.stg_payments` 17 | 18 | ), 19 | 20 | customer_orders as ( 21 | 22 | select 23 | customer_id, 24 | 25 | min(order_date) as first_order, 26 | max(order_date) as most_recent_order, 27 | count(order_id) as number_of_orders 28 | from orders 29 | where order_date is not null 30 | group by customer_id 31 
| 32 | 33 | 34 | ), 35 | customer_orders_deduped AS( 36 | select 37 | * from customer_orders 38 | 39 | QUALIFY row_number() OVER (PARTITION BY customer_id ORDER BY most_recent_order) = 1 40 | ), 41 | 42 | customer_payments as ( 43 | 44 | select 45 | orders.customer_id, 46 | sum(amount) as total_amount 47 | 48 | from payments 49 | 50 | left join orders on 51 | payments.order_id = orders.order_id 52 | 53 | group by orders.customer_id 54 | 55 | ), 56 | 57 | final as ( 58 | 59 | select 60 | customers.customer_id, 61 | customers.first_name, 62 | customers.last_name, 63 | customer_orders_deduped.first_order, 64 | customer_orders_deduped.most_recent_order, 65 | customer_orders_deduped.number_of_orders, 66 | customer_payments.total_amount as customer_lifetime_value, 67 | LAG(customers.first_name) 68 | OVER (PARTITION BY customers.customer_id ORDER BY customer_orders_deduped.number_of_orders 69 | ASC) AS 70 | preceding_name, 71 | LEAD(customers.first_name) 72 | OVER (PARTITION BY customers.customer_id ORDER BY customer_orders_deduped.number_of_orders 73 | ASC) AS 74 | next_name, 75 | 76 | from customers 77 | 78 | left join customer_orders_deduped 79 | on customers.customer_id = customer_orders_deduped.customer_id 80 | 81 | left join customer_payments 82 | on customers.customer_id = customer_payments.customer_id 83 | 84 | ) 85 | 86 | select * from final 87 | 88 | expected_output: 89 | name: "analytical_functions" 90 | output_columns: 91 | - name: "customer_id" 92 | references: 93 | - project_name: "catalog" 94 | dataset_name: "jaffle_shop" 95 | table_name: "stg_customers" 96 | name: "customer_id" 97 | - name: "first_name" 98 | references: 99 | - project_name: "catalog" 100 | dataset_name: "jaffle_shop" 101 | table_name: "stg_customers" 102 | name: "first_name" 103 | - name: "last_name" 104 | references: 105 | - project_name: "catalog" 106 | dataset_name: "jaffle_shop" 107 | table_name: "stg_customers" 108 | name: "last_name" 109 | - name: "first_order" 110 | references: 111 | - project_name: "catalog" 112 | dataset_name: "jaffle_shop" 113 | table_name: "stg_orders" 114 | name: "order_date" 115 | - name: "most_recent_order" 116 | references: 117 | - project_name: "catalog" 118 | dataset_name: "jaffle_shop" 119 | table_name: "stg_orders" 120 | name: "order_date" 121 | - name: "number_of_orders" 122 | references: 123 | - project_name: "catalog" 124 | dataset_name: "jaffle_shop" 125 | table_name: "stg_orders" 126 | name: "order_id" 127 | - name: "customer_lifetime_value" 128 | references: 129 | - project_name: "catalog" 130 | dataset_name: "jaffle_shop" 131 | table_name: "stg_payments" 132 | name: "amount" 133 | - name: "preceding_name" 134 | references: 135 | - project_name: "catalog" 136 | dataset_name: "jaffle_shop" 137 | table_name: "stg_orders" 138 | name: "order_id" 139 | - project_name: "catalog" 140 | dataset_name: "jaffle_shop" 141 | table_name: "stg_customers" 142 | name: "customer_id" 143 | - project_name: "catalog" 144 | dataset_name: "jaffle_shop" 145 | table_name: "stg_customers" 146 | name: "first_name" 147 | - name: "next_name" 148 | references: 149 | - project_name: "catalog" 150 | dataset_name: "jaffle_shop" 151 | table_name: "stg_orders" 152 | name: "order_id" 153 | - project_name: "catalog" 154 | dataset_name: "jaffle_shop" 155 | table_name: "stg_customers" 156 | name: "customer_id" 157 | - project_name: "catalog" 158 | dataset_name: "jaffle_shop" 159 | table_name: "stg_customers" 160 | name: "first_name" 161 | joins: 162 | - join_type: "LEFT" 163 | left_columns: 164 | - name: 
"order_id" 165 | references: 166 | - project_name: "catalog" 167 | dataset_name: "jaffle_shop" 168 | table_name: "stg_payments" 169 | name: "order_id" 170 | used_for: 171 | - "JOIN_LEFT_TABLE" 172 | right_columns: 173 | - name: "order_id" 174 | references: 175 | - project_name: "catalog" 176 | dataset_name: "jaffle_shop" 177 | table_name: "stg_orders" 178 | name: "order_id" 179 | used_for: 180 | - "JOIN_RIGHT_TABLE" 181 | - join_type: "LEFT" 182 | left_columns: 183 | - name: "customer_id" 184 | references: 185 | - project_name: "catalog" 186 | dataset_name: "jaffle_shop" 187 | table_name: "stg_customers" 188 | name: "customer_id" 189 | used_for: 190 | - "JOIN_LEFT_TABLE" 191 | right_columns: 192 | - name: "customer_id" 193 | references: 194 | - project_name: "catalog" 195 | dataset_name: "jaffle_shop" 196 | table_name: "stg_orders" 197 | name: "customer_id" 198 | used_for: 199 | - "JOIN_RIGHT_TABLE" 200 | - join_type: "LEFT" 201 | left_columns: 202 | - name: "customer_id" 203 | references: 204 | - project_name: "catalog" 205 | dataset_name: "jaffle_shop" 206 | table_name: "stg_customers" 207 | name: "customer_id" 208 | used_for: 209 | - "JOIN_LEFT_TABLE" 210 | right_columns: 211 | - name: "customer_id" 212 | references: 213 | - project_name: "catalog" 214 | dataset_name: "jaffle_shop" 215 | table_name: "stg_orders" 216 | name: "customer_id" 217 | used_for: 218 | - "JOIN_RIGHT_TABLE" 219 | filters: 220 | - name: "_order_date_" 221 | references: 222 | - project_name: "catalog" 223 | dataset_name: "jaffle_shop" 224 | table_name: "stg_orders" 225 | name: "order_date" 226 | used_for: 227 | - "FILTER" 228 | - name: "_$analytic1_" 229 | references: 230 | - project_name: "catalog" 231 | dataset_name: "jaffle_shop" 232 | table_name: "stg_orders" 233 | name: "customer_id" 234 | used_for: 235 | - "FILTER" 236 | - project_name: "catalog" 237 | dataset_name: "jaffle_shop" 238 | table_name: "stg_orders" 239 | name: "order_date" 240 | used_for: 241 | - "FILTER" 242 | aggregations: 243 | - name: "_customer_id_" 244 | references: 245 | - project_name: "catalog" 246 | dataset_name: "jaffle_shop" 247 | table_name: "stg_orders" 248 | name: "customer_id" 249 | used_for: 250 | - "GROUP_BY" 251 | type: "select" 252 | selected_tables: 253 | - "catalog.jaffle_shop.stg_orders" 254 | - "catalog.jaffle_shop.stg_customers" 255 | - "catalog.jaffle_shop.stg_payments" 256 | 257 | -------------------------------------------------------------------------------- /src/test/resources/sql/benchmark/multiple_refs.yaml: -------------------------------------------------------------------------------- 1 | query: | 2 | with customers as ( 3 | 4 | select * from `catalog.jaffle_shop.stg_customers` 5 | 6 | ), 7 | 8 | orders as ( 9 | 10 | select * from `catalog.jaffle_shop.stg_orders` 11 | 12 | ), 13 | 14 | payments as ( 15 | 16 | select * from `catalog.jaffle_shop.stg_payments` 17 | 18 | ), 19 | 20 | customer_orders as ( 21 | 22 | select 23 | customer_id, 24 | 25 | min(order_date) as first_order, 26 | max(order_date) as most_recent_order, 27 | max(order_date) - min(order_date) as max_days_between_orders, 28 | count(order_id) as number_of_orders 29 | from orders 30 | where order_date is not null 31 | group by customer_id 32 | 33 | ), 34 | customer_orders_deduped AS( 35 | select 36 | * from customer_orders 37 | 38 | QUALIFY row_number() OVER (PARTITION BY customer_id ORDER BY most_recent_order) = 1 39 | ), 40 | 41 | customer_payments as ( 42 | 43 | select 44 | orders.customer_id, 45 | sum(amount) as total_amount 46 | 47 | from payments 
48 | 49 | left join orders on 50 | payments.order_id = orders.order_id 51 | 52 | group by orders.customer_id 53 | 54 | ), 55 | 56 | final as ( 57 | 58 | select 59 | customers.customer_id, 60 | customers.first_name, 61 | customers.last_name, 62 | customer_orders_deduped.first_order, 63 | customer_orders_deduped.most_recent_order, 64 | customer_orders_deduped.number_of_orders, 65 | customer_payments.total_amount as customer_lifetime_value, 66 | LAG(customers.first_name) 67 | OVER (PARTITION BY customers.customer_id ORDER BY customer_orders_deduped.number_of_orders 68 | ASC) AS 69 | preceding_name, 70 | LEAD(customers.first_name) 71 | OVER (PARTITION BY customers.customer_id ORDER BY customer_orders_deduped.number_of_orders 72 | ASC) AS 73 | next_name, 74 | 75 | from customers 76 | 77 | left join customer_orders_deduped 78 | on customers.customer_id = customer_orders_deduped.customer_id 79 | 80 | left join customer_payments 81 | on customers.customer_id = customer_payments.customer_id 82 | 83 | ) 84 | 85 | select * from final 86 | 87 | expected_output: 88 | name: "multiple_refs" 89 | output_columns: 90 | - name: "customer_id" 91 | references: 92 | - project_name: "catalog" 93 | dataset_name: "jaffle_shop" 94 | table_name: "stg_customers" 95 | name: "customer_id" 96 | - name: "first_name" 97 | references: 98 | - project_name: "catalog" 99 | dataset_name: "jaffle_shop" 100 | table_name: "stg_customers" 101 | name: "first_name" 102 | - name: "last_name" 103 | references: 104 | - project_name: "catalog" 105 | dataset_name: "jaffle_shop" 106 | table_name: "stg_customers" 107 | name: "last_name" 108 | - name: "first_order" 109 | references: 110 | - project_name: "catalog" 111 | dataset_name: "jaffle_shop" 112 | table_name: "stg_orders" 113 | name: "order_date" 114 | - name: "most_recent_order" 115 | references: 116 | - project_name: "catalog" 117 | dataset_name: "jaffle_shop" 118 | table_name: "stg_orders" 119 | name: "order_date" 120 | - name: "number_of_orders" 121 | references: 122 | - project_name: "catalog" 123 | dataset_name: "jaffle_shop" 124 | table_name: "stg_orders" 125 | name: "order_id" 126 | - name: "customer_lifetime_value" 127 | references: 128 | - project_name: "catalog" 129 | dataset_name: "jaffle_shop" 130 | table_name: "stg_payments" 131 | name: "amount" 132 | - name: "preceding_name" 133 | references: 134 | - project_name: "catalog" 135 | dataset_name: "jaffle_shop" 136 | table_name: "stg_orders" 137 | name: "order_id" 138 | - project_name: "catalog" 139 | dataset_name: "jaffle_shop" 140 | table_name: "stg_customers" 141 | name: "customer_id" 142 | - project_name: "catalog" 143 | dataset_name: "jaffle_shop" 144 | table_name: "stg_customers" 145 | name: "first_name" 146 | - name: "next_name" 147 | references: 148 | - project_name: "catalog" 149 | dataset_name: "jaffle_shop" 150 | table_name: "stg_orders" 151 | name: "order_id" 152 | - project_name: "catalog" 153 | dataset_name: "jaffle_shop" 154 | table_name: "stg_customers" 155 | name: "customer_id" 156 | - project_name: "catalog" 157 | dataset_name: "jaffle_shop" 158 | table_name: "stg_customers" 159 | name: "first_name" 160 | joins: 161 | - join_type: "LEFT" 162 | left_columns: 163 | - name: "order_id" 164 | references: 165 | - project_name: "catalog" 166 | dataset_name: "jaffle_shop" 167 | table_name: "stg_payments" 168 | name: "order_id" 169 | used_for: 170 | - "JOIN_LEFT_TABLE" 171 | right_columns: 172 | - name: "order_id" 173 | references: 174 | - project_name: "catalog" 175 | dataset_name: "jaffle_shop" 176 | table_name: 
"stg_orders" 177 | name: "order_id" 178 | used_for: 179 | - "JOIN_RIGHT_TABLE" 180 | - join_type: "LEFT" 181 | left_columns: 182 | - name: "customer_id" 183 | references: 184 | - project_name: "catalog" 185 | dataset_name: "jaffle_shop" 186 | table_name: "stg_customers" 187 | name: "customer_id" 188 | used_for: 189 | - "JOIN_LEFT_TABLE" 190 | right_columns: 191 | - name: "customer_id" 192 | references: 193 | - project_name: "catalog" 194 | dataset_name: "jaffle_shop" 195 | table_name: "stg_orders" 196 | name: "customer_id" 197 | used_for: 198 | - "JOIN_RIGHT_TABLE" 199 | - join_type: "LEFT" 200 | left_columns: 201 | - name: "customer_id" 202 | references: 203 | - project_name: "catalog" 204 | dataset_name: "jaffle_shop" 205 | table_name: "stg_customers" 206 | name: "customer_id" 207 | used_for: 208 | - "JOIN_LEFT_TABLE" 209 | right_columns: 210 | - name: "customer_id" 211 | references: 212 | - project_name: "catalog" 213 | dataset_name: "jaffle_shop" 214 | table_name: "stg_orders" 215 | name: "customer_id" 216 | used_for: 217 | - "JOIN_RIGHT_TABLE" 218 | filters: 219 | - name: "_order_date_" 220 | references: 221 | - project_name: "catalog" 222 | dataset_name: "jaffle_shop" 223 | table_name: "stg_orders" 224 | name: "order_date" 225 | used_for: 226 | - "FILTER" 227 | - name: "_$analytic1_" 228 | references: 229 | - project_name: "catalog" 230 | dataset_name: "jaffle_shop" 231 | table_name: "stg_orders" 232 | name: "customer_id" 233 | used_for: 234 | - "FILTER" 235 | - project_name: "catalog" 236 | dataset_name: "jaffle_shop" 237 | table_name: "stg_orders" 238 | name: "order_date" 239 | used_for: 240 | - "FILTER" 241 | aggregations: 242 | - name: "_customer_id_" 243 | references: 244 | - project_name: "catalog" 245 | dataset_name: "jaffle_shop" 246 | table_name: "stg_orders" 247 | name: "customer_id" 248 | used_for: 249 | - "GROUP_BY" 250 | type: "select" 251 | selected_tables: 252 | - "catalog.jaffle_shop.stg_orders" 253 | - "catalog.jaffle_shop.stg_customers" 254 | - "catalog.jaffle_shop.stg_payments" 255 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bq-Lineage-tool 2 | 3 | 4 | Bq-lineage tool is a column level lineage parser for BigQuery using ZetaSQL. This 5 | parser started as a fork of this [project by google](https://github.com/GoogleCloudPlatform/bigquery-data-lineage), but it 6 | has been heavily modified to cover the whole bigquery syntax offered by ZetaSQL. The 7 | output of this parser is a DAG of the columns used in a query from sources to outputs, including 8 | auxiliary fields that could be used as part of filters or other operations that don't result in the 9 | materialisation of a field. 10 | 11 | 12 | 13 | From any arbitrary BigQuery query, you will get the following outputs: 14 | - `output_columns`: The columns that are part of the output of the query, with all the input 15 | columns references that were needed to produce them. 16 | - `joins`: List of joins used in the query, considering the columns used for the join 17 | - `aggregations`: List of columns used for aggregations 18 | - `filters`: List of columns used for filtering 19 | - `other_used_columns`: Any other columns used across the query, like order by 20 | - `selected_tables`: A list of all the tables that were selected in the query. 
21 | - `Type`: The type of SQL statement `{SELECT, CREATE_VIEW, MERGE...}` 22 | 23 | ![image](./flow.png) 24 | 25 | 26 | ## What can this parser do? 27 | 28 | * It's schema-aware. This means that a query like `SELECT * FROM table` will generate a DAG 29 | with all the output columns of `table`, and not just a single node with a `*` symbol. 30 | * It prunes unused columns. This means that for a query like `WITH base AS (SELECT * FROM 31 | table) SELECT aColumn FROM base` the output DAG will only contain the column `aColumn` and not the 32 | whole input table. 33 | * It covers pretty much all the BigQuery syntax, including: 34 |   * `WITH` (CTE) clauses 35 |   * Subqueries 36 |   * `UNNEST`-based `JOINS` 37 |   * `STRUCTS` and `ARRAYS` 38 |   * `JOINS` 39 |   * Analytical functions (`QUALIFY`, `LAG`/`LEAD`, `WINDOWS`, etc.) 40 |   * Mapping aliases to their original columns 41 |   * `JSON` functions 42 |   * Access to the `PATH` used in `JSON` functions (`JSON_EXTRACT(field,"$.path.to.field")`) 43 |   * Access to the literals used in the query, for example, in a `WHERE` clause 44 |   * Access to fields that are not part of the output columns of the table (fields only used in a 45 | `WHERE` clause) 46 |   * `PIVOT` and `UNPIVOT` transformations 47 |   * `GROUP BY GROUPING SETS`, `ROLLUP` and `CUBE` 48 |   * `UDF` and temporary functions 49 |   * Usage of parameters (`@param`) 50 |   * Recursive CTEs 51 | * It parses `SELECTS`, `CREATE {VIEWS}` and `MERGE` statements 52 | * It automatically infers internal BQ fields like `_TABLE_SUFFIX` 53 | 54 | ## What can't it do? 55 | 56 | * This parser won't work with procedural SQL. For example, it will fail trying to parse 57 | DECLARE or SET operations. 58 | * This parser won't read the logic within UDF functions. It only checks inputs and outputs. 59 | * ZetaSQL might not be up-to-date with the latest BigQuery features, so if there's something 60 | super new, it will involve either waiting for ZetaSQL to be updated, or going deep into 61 | ZetaSQL to build the feature. 62 | * It doesn't work when parsing queries that access `INFORMATION_SCHEMA`-type tables. I 63 | guess we could bypass this by using a different type of access, but I never went through it deeply. 64 | * This parser won't build the DAG of multiple queries. It only parses a single query at a 65 | time. To build a full DAG of your dbt project, for example, you can use libraries like 66 | `networkx` to connect the edges from the output of this parser. 67 | * It can't parse SQL syntax that is not supported by ZetaSQL (for example, the `+` operator in Snowflake joins). 68 | * When doing a `SELECT count(*) FROM table`, the output of the parser would act as if no columns 69 | were selected. This could be subject to interpretation: should all the columns of the input 70 | `table` be marked as used? Or should the output be an empty list because this query doesn't 71 | care about any specific column or number of columns? 72 | * Unexpected bugs - even though this parser has been tested against more than 7,000 SQL queries, 73 | there might still be some edge cases that are not covered. SQL is hard. 74 | * It doesn't work with `TVF`s (Table-Valued Functions) - although ZetaSQL parses them, the output 75 | won't show the columns of the TVF. 76 | * It can't automatically infer UDFs - they have to be defined as part of the script that is going to be parsed. 77 | 78 | ## How to use 79 | The folder `/src/test/examples` has multiple examples of how to use this parser. The main caveat 80 | lies in how to build the catalog that ZetaSQL needs.
Depending on how much you want the parser 81 | to automate the whole process for you, there are three different methods to build a catalog, 82 | from "let the parser do it for me" to "I'll build the catalog myself". 83 | 84 | - `/src/test/examples/BigQuerySqlParserBQSchemaTest.java` shows how to rely on the metadata of 85 | BigQuery to build the catalog. To use this method, the user has to be authenticated with gcloud. 86 | Note that there's zero data access/movement in this operation. The only access that is being 87 | done is directly to the metadata of the tables, and only to the tables that are being used in 88 | the parsed query, i.e., this parser won't scan the whole database. The access is done using 89 | the BigQuery API. You can use `gcloud auth application-default login` to authenticate. 90 | - `/src/test/examples/BigQuerySqlParserLocalSchemaTest.java` shows how to use local JSON files 91 | to build the schema. `/src/test/resources/schemas/` has examples of these files. They are 92 | exact copies of the metadata information you can get 93 | [through the API](https://cloud.google.com/bigquery/docs/reference/rest/v2/tables). Note that 94 | this is basically what we automate with the previous method. 95 | - `/src/test/examples/ASTExplorerTest.java` shows an example of how you can manually create your own 96 | catalog using ZetaSQL methods/constructors and feed it into the parser (a minimal sketch of this approach is also included after the notes below). 97 | 98 | ### Example 99 | The following example uses the first method to build the catalog - it will use the BigQuery API 100 | to request the metadata of the tables used in the query. 101 | ```java 102 | BigQueryZetaSqlSchemaLoader schemaLoader = 103 | new BigQueryZetaSqlSchemaLoader( 104 | BigQueryTableLoadService.usingServiceFactory( 105 | BigQueryServiceFactory.defaultFactory() 106 | ) 107 | ); 108 | 109 | ZetaSQLResolver parser = new ZetaSQLResolver(schemaLoader); 110 | 111 | String sql = """ 112 | SELECT 113 | word, 114 | SUM(word_count) AS count 115 | FROM 116 | `bigquery-public-data.samples.shakespeare` 117 | WHERE 118 | word LIKE "%raisin%" 119 | GROUP BY 120 | word; 121 | """; 122 | 123 | ResolvedNodeExtended table = parser.extractLineage(sql); 124 | OutputLineage printer = new OutputLineage(); 125 | printer.toYaml(table, "test", true); 126 | ``` 127 | Output: 128 | ``` 129 | name: "test" 130 | output_columns: 131 | - name: "word" 132 | references: 133 | - project_name: "bigquery-public-data.samples.shakespeare" 134 | column_name: "word" 135 | - name: "count" 136 | references: 137 | - project_name: "bigquery-public-data.samples.shakespeare" 138 | column_name: "word_count" 139 | other_used_columns: 140 | - name: "_word_" 141 | references: 142 | - project_name: "bigquery-public-data.samples.shakespeare" 143 | column_name: "word" 144 | literal_value: 145 | - "%raisin%" 146 | type: "select" 147 | selected_tables: 148 | - "bigquery-public-data.samples.shakespeare" 149 | ``` 150 | 151 | ### Notes 152 | - This parser **never** accesses the data of the tables or any BigQuery instance. The only 153 | connection needed is to the metadata of the tables. 154 | - The parser will use a default project+dataset if these are missing in the referenced tables of 155 | a query. Please refer to `src/main/java/com/borjav/data/options/Options.java` in case you 156 | need to set a specific project. 157 | - When using UDFs, they also have to be defined within the code. The parser won't be able to 158 | resolve them if they are not defined in the code. Please refer to 159 | `src/test/resources/sql/benchmark/udf.yaml`.
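### Manually building the catalog (sketch)

The third method above (`ASTExplorerTest.java`) builds the catalog by hand. The snippet below is a minimal, illustrative sketch of that approach, not a copy of the test: it registers a single hypothetical table (`my-project.my_dataset.customers`, with made-up columns) in a ZetaSQL `SimpleCatalog` and hands it to the resolver through the `ZetaSQLResolver(SimpleCatalog)` constructor and the `extractLineage(sql, catalog)` overload shown in `ZetaSQLResolver.java`. Refer to `ASTExplorerTest.java` for the canonical example.

```java
import com.borjav.data.model.ResolvedNodeExtended;
import com.borjav.data.parser.ZetaSQLResolver;
import com.google.common.collect.ImmutableList;
import com.google.zetasql.SimpleCatalog;
import com.google.zetasql.SimpleColumn;
import com.google.zetasql.SimpleTable;
import com.google.zetasql.TypeFactory;
import com.google.zetasql.ZetaSQLType.TypeKind;

public class ManualCatalogExample {

  public static void main(String[] args) throws Exception {
    // Hand-built table definition. Table and column names are illustrative only.
    SimpleTable customers = new SimpleTable(
        "my-project.my_dataset.customers",
        ImmutableList.of(
            new SimpleColumn("customers", "customer_id",
                TypeFactory.createSimpleType(TypeKind.TYPE_INT64)),
            new SimpleColumn("customers", "first_name",
                TypeFactory.createSimpleType(TypeKind.TYPE_STRING))));

    // The resolver's SimpleCatalog constructor registers the built-in functions
    // and types, so the catalog only needs the tables used by the query.
    SimpleCatalog catalog = new SimpleCatalog("data-catalog");
    catalog.addSimpleTable(customers);
    ZetaSQLResolver parser = new ZetaSQLResolver(catalog);

    String sql =
        "SELECT customer_id FROM `my-project.my_dataset.customers` "
            + "WHERE first_name = 'Ada'";

    // Lineage extraction against the hand-built catalog; the result can be
    // printed with OutputLineage exactly as in the example above.
    ResolvedNodeExtended table = parser.extractLineage(sql, catalog);
    System.out.println(table);
  }
}
```

As with the other methods, the table names registered in the catalog have to match the fully qualified names used in the query; unqualified references fall back to the defaults configured in `Options.java`.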
160 | --------------------------------------------------------------------------------