├── .gitignore ├── LICENSE ├── README.md ├── pom.xml └── src ├── main ├── resources │ └── schema.yaml.template └── scala │ ├── com │ └── uber │ │ └── engsec │ │ └── dp │ │ ├── analysis │ │ ├── columns_used │ │ │ └── ColumnsUsedAnalysis.scala │ │ ├── differential_privacy │ │ │ ├── ElasticSensitivityAnalysis.scala │ │ │ ├── RestrictedSensitivityAnalysis.scala │ │ │ ├── SensitivityDomain.scala │ │ │ └── StabilityDomain.scala │ │ ├── histogram │ │ │ ├── HistogramAnalysis.scala │ │ │ └── QueryType.scala │ │ ├── join │ │ │ └── JoinKeysUsed.scala │ │ ├── name_resolution │ │ │ ├── NameResolutionAnalysis.scala │ │ │ ├── NameResolutionDomain.scala │ │ │ └── ReferenceInfo.scala │ │ └── taint │ │ │ └── TaintAnalysis.scala │ │ ├── dataflow │ │ ├── AbstractDataflowAnalysis.scala │ │ ├── AggFunctions.scala │ │ ├── column │ │ │ ├── AbstractColumnAnalysis.scala │ │ │ ├── DFGColumnAnalysis.scala │ │ │ └── RelNodeColumnAnalysis.scala │ │ ├── domain │ │ │ ├── AbstractDomain.scala │ │ │ ├── Basic.scala │ │ │ ├── Collection.scala │ │ │ ├── DomainElement.scala │ │ │ └── Lattice.scala │ │ └── node │ │ │ ├── ASTDataflowAnalysis.scala │ │ │ └── DFGVisitorAnalysis.scala │ │ ├── exception │ │ ├── AnalysisException.scala │ │ ├── DPException.scala │ │ └── TransformationException.scala │ │ ├── rewriting │ │ ├── DPUtil.scala │ │ ├── Rewriter.scala │ │ ├── coverage │ │ │ └── CoverageRewriter.scala │ │ ├── differential_privacy │ │ │ ├── ElasticSensitivityRewriter.scala │ │ │ ├── RestrictedSensitivityRewriter.scala │ │ │ ├── SampleAndAggregateRewriter.scala │ │ │ ├── SensitivityRewriter.scala │ │ │ └── WPINQRewriter.scala │ │ └── rules │ │ │ ├── ColumnDefinition.scala │ │ │ ├── Expr.scala │ │ │ └── Operations.scala │ │ ├── schema │ │ ├── CachingSchema.scala │ │ ├── DatabaseModel.scala │ │ └── Schema.scala │ │ ├── sql │ │ ├── AbstractAnalysis.scala │ │ ├── QueryParser.scala │ │ ├── TreeFunctions.scala │ │ ├── TreePrinter.scala │ │ ├── ast │ │ │ ├── ASTFunctions.scala │ │ │ └── Transformer.scala │ │ ├── dataflow_graph │ │ │ ├── DataflowGraphFunctions.scala │ │ │ ├── DataflowGraphUtils.scala │ │ │ ├── Node.scala │ │ │ ├── reference │ │ │ │ ├── ColumnReference.scala │ │ │ │ ├── Function.scala │ │ │ │ ├── Reference.scala │ │ │ │ └── UnstructuredReference.scala │ │ │ └── relation │ │ │ │ ├── DataTable.scala │ │ │ │ ├── Except.scala │ │ │ │ ├── Join.scala │ │ │ │ ├── Relation.scala │ │ │ │ ├── Select.scala │ │ │ │ └── Union.scala │ │ └── relational_algebra │ │ │ ├── RelOrExpr.scala │ │ │ ├── RelTreeFunctions.scala │ │ │ ├── RelUtils.scala │ │ │ └── Transformer.scala │ │ └── util │ │ ├── ElasticSensitivity.scala │ │ └── IdentityHashMap.scala │ └── examples │ ├── ElasticSensitivityExample.scala │ └── QueryRewritingExample.scala └── test ├── resources └── schema.yaml └── scala └── com └── uber └── engsec └── dp ├── analysis ├── columns_used │ └── ColumnsUsedAnalysisTest.scala ├── differential_privacy │ ├── ElasticSensitivityAnalysisTest.scala │ └── RestrictedSensitivityAnalysisTest.scala ├── histogram │ └── HistogramAnalysisTest.scala └── taint │ └── TaintAnalysisTest.scala ├── core └── SchemaTest.scala ├── rewriting ├── CoverageRewriterTest.scala ├── ElasticSensitivityRewriterTest.scala ├── RestrictedSensitivityRewriterTest.scala ├── SampleAndAggregateRewriterTest.scala └── WPINQRewriterTest.scala └── sql └── ast └── TreeTransformationTest.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | 3 | # Eclipse 4 | .classpath 5 | .project 6 | .settings/ 7 | 8 | # Intellij 9 | 
.idea/ 10 | *.iml 11 | *.iws 12 | 13 | # Mac 14 | .DS_Store 15 | 16 | # Maven 17 | dependency-reduced-pom.xml 18 | target/ 19 | 20 | log/ 21 | tmp/ 22 | 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Uber Technologies, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | (This project is deprecated and not maintained.) 4 | 5 | This repository contains a query analysis and rewriting framework to enforce differential privacy for general-purpose 6 | SQL queries. The rewriting engine can automatically transform an input query into an *intrinsically private query* which 7 | embeds a differential privacy mechanism in the query directly; the transformed query enforces differential privacy on 8 | its results and can be executed on any standard SQL database. This approach supports many state-of-the-art 9 | differential privacy mechanisms; the code currently includes rewriters based on [Elastic Sensitivity](https://arxiv.org/abs/1706.09479) and 10 | [Sample and Aggregate](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.296.2379&rep=rep1&type=pdf), and more will be added soon. 11 | 12 | The rewriting framework is built on a robust dataflow analysis engine for SQL queries. This framework 13 | provides an abstract representation of queries, plus several kinds of built-in dataflow analyses tailored to this 14 | representation. This framework can be used to implement other types of dataflow analyses, as described below. 15 | 16 | ## Building & Running 17 | 18 | This framework is written in Scala and built using Maven. The code has been tested on Mac OS X and Linux. To build the code: 19 | 20 | ``` 21 | $ mvn package 22 | ``` 23 | 24 | ## Example: Query Rewriting 25 | 26 | The file `examples/QueryRewritingExample.scala` contains sample code for query rewriting and demonstrates the supported 27 | mechanisms using a few simple queries. To run this example: 28 | ``` 29 | mvn exec:java -Dexec.mainClass="examples.QueryRewritingExample" 30 | ``` 31 | 32 | This example code can be easily modified, e.g., to test different queries or change parameter values.
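Conceptually, an intrinsically private counting query embeds a noise-drawing expression in the SQL itself: the true aggregate plus Laplace noise scaled to the (smoothed) sensitivity divided by epsilon. The standalone Scala sketch below illustrates that mechanism outside the database; it does not use this project's API, and the parameter values are arbitrary.

```scala
import scala.util.Random

object LaplaceMechanismSketch extends App {
  // Draw a sample from the Laplace distribution centered at 0 with the given scale (inverse-CDF sampling).
  def laplace(scale: Double): Double = {
    val u = Random.nextDouble() - 0.5
    -scale * math.signum(u) * math.log(1 - 2 * math.abs(u))
  }

  val trueCount   = 42.0  // true result of the counting query (arbitrary example value)
  val sensitivity = 1.0   // smoothed sensitivity of the output column
  val epsilon     = 0.1   // privacy budget

  // Differentially private result: true answer plus noise scaled to sensitivity / epsilon.
  println(trueCount + laplace(sensitivity / epsilon))
}
```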
33 | 34 | ## Background: Elastic Sensitivity 35 | 36 | Elastic sensitivity is an approach for efficiently approximating the local sensitivity of a query, which can be used to 37 | enforce differential privacy for the query. The approach requires only a static analysis of the query and therefore 38 | imposes minimal performance overhead. Importantly, it does not require any changes to the database. 39 | Details of the approach are available in [this paper](https://arxiv.org/abs/1706.09479). 40 | 41 | Elastic sensitivity can be used to determine the scale of random noise necessary to make the results of a query 42 | differentially private. For a given output column of a query with elastic sensitivity *s*, to achieve 43 | differential privacy for that column it suffices to *smooth* *s* according to the smooth sensitivity approach to obtain 44 | *S*, then add random noise drawn from the Laplace distribution, scaled to *(S/epsilon)* and centered at 0, to the true 45 | result of the query. The smoothing can be accomplished using the smooth sensitivity approach introduced by [Nissim et al](http://www.cse.psu.edu/~ads22/pubs/NRS07/NRS07-full-draft-v1.pdf). 46 | 47 | The file `examples.ElasticSensitivityExample` contains code demonstrating this approach directly (i.e., applying noise manually rather than generating an intrinsically private query). 48 | 49 | To run this example: 50 | ``` 51 | mvn exec:java -Dexec.mainClass="examples.ElasticSensitivityExample" 52 | ``` 53 | 54 | 55 | ## Analysis Framework 56 | 57 | This framework can perform additional analyses on SQL queries, and can be extended with new analyses. 58 | Each analysis in this framework extends the base class `com.uber.engsec.dp.sql.AbstractAnalysis`. 59 | 60 | To run an analysis on a query, call the method `com.uber.engsec.dp.sql.AbstractAnalysis.analyzeQuery`. 61 | The parameter of this method is a string containing a SQL query, and its return value is an abstract domain representing 62 | the results of the analysis. 63 | 64 | The source code includes several example analyses to demonstrate features of the framework. The simplest example is `com.uber.engsec.dp.analysis.taint.TaintAnalysis`, which returns an abstract domain containing information about which output columns of the query might contain data flowing from "tainted" columns in the database. The database schema determines which columns are tainted. You can invoke this analysis as follows: 65 | 66 | ```scala 67 | scala> (new com.uber.engsec.dp.analysis.taint.TaintAnalysis).analyzeQuery("SELECT my_col1 FROM my_table") 68 | BooleanDomain = my_col1 -> False 69 | ``` 70 | 71 | This code includes several built-in analyses, including: 72 | 73 | - The elastic sensitivity analysis, available in `com.uber.engsec.dp.analysis.differential_privacy.ElasticSensitivityAnalysis`, returns an abstract domain (`com.uber.engsec.dp.analysis.differential_privacy.SensitivityDomain`) that maps each output column of the query to its elastic sensitivity. 74 | - `com.uber.engsec.dp.analysis.columns_used.ColumnsUsedAnalysis` lists the original database columns 75 | from which the results of each output column are computed. 76 | - `com.uber.engsec.dp.analysis.histogram.HistogramAnalysis` lists the aggregation-ness of each 77 | output column of the query (i.e. whether or not the output is an aggregation, and if so, which type). 78 | - `com.uber.engsec.dp.analysis.join.JoinKeysUsed` lists the original database columns used as equijoin 79 | keys for each output column of the query. 
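Each of the analyses listed above is invoked in the same way as the taint analysis example earlier, via `analyzeQuery`. For instance, the columns-used analysis can be run as follows (the printed form of the result shown here is illustrative):

```scala
scala> (new com.uber.engsec.dp.analysis.columns_used.ColumnsUsedAnalysis).analyzeQuery("SELECT my_col1 FROM my_table")
SetDomain = my_col1 -> {my_table.my_col1}
```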
80 | 81 | ## Writing New Analyses 82 | 83 | New analyses can be implemented by extending one of the abstract analysis classes and implementing *transfer functions* 84 | which describe how to update the analysis state for relevant query constructs. Analyses are written to update a 85 | specific type of *abstract domain* which represents the current state of the analysis. Each abstract domain type 86 | implements the trait `com.uber.engsec.dp.dataflow.AbstractDomain`. 87 | 88 | The simplest way to implement a new analysis is to use `com.uber.engsec.dp.dataflow.column.AbstractColumnAnalysis`, 89 | which automatically tracks analysis state for each column of the query independently. Most of the example analyses are 90 | of this type. 91 | 92 | New analyses can be invoked in the same way as the built-in example analyses. 93 | 94 | ## Reporting Security Bugs 95 | 96 | Please report security bugs through [HackerOne](https://hackerone.com/uber). 97 | 98 | ## License 99 | 100 | This project is released under the MIT License. 101 | 102 | ## Contact Information 103 | 104 | This project is developed and maintained by [Noah Johnson](mailto:noahj@berkeley.edu) and [Joe Near](mailto:jnear@berkeley.edu). 105 | -------------------------------------------------------------------------------- /src/main/resources/schema.yaml.template: -------------------------------------------------------------------------------- 1 | --- 2 | databases: 3 | - database: "my_database" 4 | dialect: "postgres" 5 | namespace: "public" 6 | tables: 7 | - table: "my_table" 8 | columns: 9 | - name: "col1" 10 | - name: "col2" 11 | 12 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/analysis/columns_used/ColumnsUsedAnalysis.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.analysis.columns_used 24 | import com.uber.engsec.dp.dataflow.column.DataflowGraphColumnAnalysis 25 | import com.uber.engsec.dp.dataflow.domain.SetDomain 26 | import com.uber.engsec.dp.sql.dataflow_graph.relation.DataTable 27 | 28 | /** Returns a set of all data table columns influencing each output column.
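 * For example (illustrative): for the query SELECT a.col1 + b.col2 FROM a JOIN b ON a.id = b.id, the single
 * output column maps to {a.col1, b.col2, a.id, b.id}, since columns referenced in the join condition also
 * influence the result.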
29 | */ 30 | class ColumnsUsedAnalysis extends DataflowGraphColumnAnalysis(new SetDomain[String]) { 31 | override def transferDataTable(d: DataTable, idx: Int, fact: Set[String]): Set[String] = { 32 | val qualifiedColName = s"${d.name}.${d.getColumnName(idx)}" 33 | fact ++ Set(qualifiedColName) 34 | } 35 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/analysis/differential_privacy/RestrictedSensitivityAnalysis.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.analysis.differential_privacy 24 | 25 | import com.uber.engsec.dp.dataflow.column.NodeColumnFacts 26 | import com.uber.engsec.dp.exception.{AnalysisException, UnsupportedQueryException} 27 | import com.uber.engsec.dp.sql.relational_algebra._ 28 | import org.apache.calcite.rel.core.Join 29 | 30 | /** Restricted sensitivity analysis. Calculates the global sensitivity of a query over a restricted class of datasets 31 | * defined by properties of the data model (in particular the max frequency of join keys), which is presumed known by 32 | * the querier. 33 | * 34 | * @see [[https://arxiv.org/abs/1208.4586 Differentially Private Data Analysis of Social Networks via Restricted Sensitivity]] 35 | */ 36 | class RestrictedSensitivityAnalysis extends ElasticSensitivityAnalysis { 37 | 38 | override def transferJoin(node: Join, state: NodeColumnFacts[RelStability,ColSensitivity]): NodeColumnFacts[RelStability,ColSensitivity] = { 39 | /** Update the stability at every join, per restricted sensitivity definition. 
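 * The join must be a single-clause equijoin with no self joins, and at least one join key must be unique
 * (maxFreq <= 1); the new stability is the other key's max frequency multiplied by the stability of the
 * relation whose key is unique. Many-to-many joins are rejected.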
40 | */ 41 | val equijoinColumns = RelUtils.extractEquiJoinColumns(node, node.getCondition) 42 | if (equijoinColumns.isEmpty) 43 | throw new UnsupportedQueryException(s"This analysis only works on single-clause equijoins.") 44 | 45 | val (leftColumnIndex, rightColumnIndex) = equijoinColumns.get 46 | 47 | val leftState = resultMap(Relation(node.getLeft)) 48 | val rightState = resultMap(Relation(node.getRight)) 49 | 50 | val leftStability = leftState.nodeFact.stability 51 | val rightStability = rightState.nodeFact.stability 52 | 53 | // Determine if this is a self-join: get the intersection of ancestors for the left and right relations 54 | // If the intersection is not empty, then this is a self-join (and restricted sensitivity doesn't support it) 55 | val isSelfJoin = (leftState.nodeFact.ancestors intersect rightState.nodeFact.ancestors).nonEmpty 56 | if (isSelfJoin) 57 | throw new UnsupportedQueryException("This analysis does not support self joins") 58 | 59 | // Determine the stability of the join 60 | val leftColFact = leftState.colFacts(leftColumnIndex) 61 | val rightColFact = rightState.colFacts(rightColumnIndex) 62 | 63 | val maxFreqLeftJoinColumn = leftColFact.maxFreq 64 | val maxFreqRightJoinColumn = rightColFact.maxFreq 65 | 66 | val newStability = 67 | (maxFreqLeftJoinColumn, maxFreqRightJoinColumn) match { 68 | case (l, r) if l <= 1.0 => r * leftStability 69 | case (l, r) if r <= 1.0 => l * rightStability 70 | case _ => throw new UnsupportedQueryException("This analysis does not support many-to-many joins") 71 | } 72 | 73 | val newNodeState = state.nodeFact.copy( 74 | stability = newStability 75 | ) 76 | 77 | /** Update the max frequency for every column by a factor of the max frequency of the join key in the opposing 78 | * relation. This models the worst-case situation where each record containing the most-frequent-key is duplicated 79 | * this many times by the join. 80 | */ 81 | val newColState = 82 | leftState.colFacts.map { x => x.copy(maxFreq = x.maxFreq * maxFreqRightJoinColumn) } ++ 83 | rightState.colFacts.map { x => x.copy(maxFreq = x.maxFreq * maxFreqLeftJoinColumn) } 84 | 85 | NodeColumnFacts(newNodeState, newColState) 86 | } 87 | 88 | override def setK(k: Int): Unit = throw new AnalysisException("This analysis does not use K") 89 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/analysis/differential_privacy/SensitivityDomain.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.analysis.differential_privacy 24 | 25 | import com.uber.engsec.dp.dataflow.domain.AbstractDomain 26 | 27 | /** Abstract domain for columns in elastic sensitivity analysis. 28 | * 29 | * @param sensitivity Elastic sensitivity for this column. Always an upper bound of local sensitivity. 30 | * This is a floating point lattice with bottom (undefined sensitivity) represented by Option.None, 31 | * top (unbounded sensitivity) represented by Some(Infinity), and partial order defined by max. 32 | * @param maxFreq Max frequency of the column. The lattice is defined by the natural ordering. 33 | * @param aggregationApplied Has an aggregation already been applied to this column? 34 | * @param postAggregationArithmeticApplied Was a function/operation applied to post-aggregated result? We track this 35 | * only to print a helpful error message since this results in infinite 36 | * sensitivity. 37 | * @param canRelease Can the values of this column be released without adding noise? This is true for columns 38 | * of public tables and columns in private tables explicitly marked with canRelease=true 39 | * (as well as values derived therefrom). This is used to determine whether histogram bin 40 | * columns are safe for release. Boolean lattice with bottom = true and top = false 41 | */ 42 | case class ColSensitivity(sensitivity: Option[Double], 43 | maxFreq: Double, 44 | aggregationApplied: Boolean, 45 | postAggregationArithmeticApplied: Boolean, 46 | canRelease: Boolean) { 47 | override def toString: String = s"sensitivity: $sensitivity, maxFreq: $maxFreq, aggregationApplied: $aggregationApplied, postAggregationArithmeticApplied: $postAggregationArithmeticApplied, canRelease: $canRelease" 48 | } 49 | 50 | /** The abstract domain is a product lattice with pointwise ordering of the element types defined above. 51 | */ 52 | object SensitivityDomain extends AbstractDomain[ColSensitivity] { 53 | override val bottom: ColSensitivity = 54 | ColSensitivity( 55 | sensitivity = None, // sensitivity is undefined until aggregations are applied 56 | maxFreq = 0.0, 57 | aggregationApplied = false, 58 | postAggregationArithmeticApplied = false, 59 | canRelease = true) 60 | 61 | override def leastUpperBound(first: ColSensitivity, second: ColSensitivity): ColSensitivity = 62 | ColSensitivity( 63 | sensitivity = (first.sensitivity ++ second.sensitivity).reduceLeftOption(math.max), 64 | maxFreq = math.max(first.maxFreq, second.maxFreq), 65 | aggregationApplied = first.aggregationApplied || second.aggregationApplied, 66 | postAggregationArithmeticApplied = first.postAggregationArithmeticApplied || second.postAggregationArithmeticApplied, 67 | canRelease = first.canRelease && second.canRelease) 68 | } 69 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/analysis/differential_privacy/StabilityDomain.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.analysis.differential_privacy 24 | 25 | import com.uber.engsec.dp.dataflow.domain.AbstractDomain 26 | 27 | /** Abstract domain for relations in elastic sensitivity analysis. 28 | * 29 | * @param stability Stability of the relation as defined by elastic sensitivity. 30 | * @param isPublic Does this relation contain only publicly-derived data (as determined by the isPublic table flag)? 31 | * When public tables are joined with a protected table the entire relation becomes non-public. 32 | * @param ancestors Set of this node's ancestor tables, used to detect self-joins. 33 | */ 34 | case class RelStability(stability: Double, 35 | isPublic: Boolean, 36 | ancestors: Set[String]) { 37 | override def toString: String = s"stability: $stability, isPublic: $isPublic, ancestors: $ancestors" 38 | } 39 | 40 | /** The abstract domain is a product lattice with pointwise ordering of the element types defined above. 41 | */ 42 | object StabilityDomain extends AbstractDomain[RelStability] { 43 | override val bottom: RelStability = 44 | RelStability( 45 | stability = 1.0, 46 | isPublic = false, 47 | ancestors = Set.empty) 48 | 49 | override def leastUpperBound(first: RelStability, second: RelStability): RelStability = 50 | RelStability( 51 | stability = math.max(first.stability, second.stability), 52 | isPublic = first.isPublic && second.isPublic, 53 | ancestors = first.ancestors ++ second.ancestors) 54 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/analysis/histogram/HistogramAnalysis.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 
13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.analysis.histogram 24 | 25 | import com.uber.engsec.dp.dataflow.AggFunctions._ 26 | import com.uber.engsec.dp.dataflow.column.{NodeColumnFacts, RelNodeColumnAnalysis} 27 | import com.uber.engsec.dp.dataflow.domain._ 28 | import com.uber.engsec.dp.dataflow.domain.lattice.FlatLatticeDomain 29 | import com.uber.engsec.dp.sql.relational_algebra.RelUtils 30 | import org.apache.calcite.rel.core.{Aggregate, TableScan} 31 | import org.apache.calcite.rex.{RexNode, RexSlot} 32 | 33 | /** Returns the aggregation status of each output column of a query. The results of this analysis are used to classify 34 | * queries as statistical or raw data, determine which columns contain aggregations, and track the provenance of 35 | * aggregated columns and histogram bins. 36 | */ 37 | class HistogramAnalysis extends RelNodeColumnAnalysis(UnitDomain, AggregationDomain) { 38 | 39 | override def transferAggregate(node: Aggregate, aggFunctions: IndexedSeq[Option[AggFunction]], state: NodeColumnFacts[Unit, AggregationInfo]) = { 40 | val newColFacts = state.colFacts.zipWithIndex.map { case (state, idx) => 41 | val aggFunction = aggFunctions(idx) 42 | 43 | if (aggFunction.isEmpty) // grouped column 44 | state.copy(isGroupBy = true) 45 | else { 46 | val newReferences: Set[QualifiedColumnName] = aggFunction.get match { 47 | case COUNT => state.references.map{ _.table }.toList.distinct.map{ QualifiedColumnName(_, "*") }.toSet 48 | case _ => state.references 49 | } 50 | 51 | AggregationInfo( 52 | isAggregation = true, 53 | outermostAggregation = aggFunction, 54 | references = newReferences, 55 | valueModified = true, 56 | isGroupBy = false 57 | ) 58 | } 59 | } 60 | 61 | NodeColumnFacts(UnitDomain.bottom, newColFacts) 62 | } 63 | 64 | override def transferExpression(node: RexNode, state: AggregationInfo): AggregationInfo = { 65 | node match { 66 | case _: RexSlot => state 67 | case _ => state.copy(valueModified = true) 68 | } 69 | } 70 | 71 | override def transferTableScan(node: TableScan, state: NodeColumnFacts[Unit, AggregationInfo]) = { 72 | import scala.collection.JavaConverters._ 73 | 74 | val tableName = RelUtils.getQualifiedTableName(node) 75 | val colNames = node.getRowType.getFieldNames.asScala 76 | 77 | val newColFacts = state.colFacts.zip(colNames).map { case (state, colName) => 78 | val qualifiedColName = QualifiedColumnName(tableName, colName) 79 | state.copy(references = Set(qualifiedColName)) 80 | } 81 | 82 | NodeColumnFacts(UnitDomain.bottom, newColFacts) 83 | } 84 | } 85 | 86 | /** Information about the aggregation status of a column 87 | * 88 | * @param isAggregation Is this column any type of aggregation? 89 | * @param outermostAggregation Outermost aggregation function applied to references 90 | * @param references Data provenance of the column (i.e., each database column influencing this column's value) 91 | * @param valueModified Was any function/operation/expression applied to this column? 
If (and only if) false, values of 92 | * this column are guaranteed to correspond exactly to values in database table [references]. 93 | * @param isGroupBy Is this column grouped? 94 | */ 95 | case class AggregationInfo(isAggregation: Boolean, 96 | outermostAggregation: DomainElem[AggFunction], 97 | references: Set[QualifiedColumnName], 98 | valueModified: Boolean, 99 | isGroupBy: Boolean) 100 | 101 | object AggregationDomain extends AbstractDomain[AggregationInfo] { 102 | override val bottom: AggregationInfo = AggregationInfo(false, FlatLatticeDomain.bottom, Set.empty, false, false) 103 | 104 | override def leastUpperBound(first: AggregationInfo, second: AggregationInfo): AggregationInfo = { 105 | AggregationInfo( 106 | isAggregation=first.isAggregation || second.isAggregation, 107 | outermostAggregation=FlatLatticeDomain.leastUpperBound(first.outermostAggregation, second.outermostAggregation), 108 | references=first.references ++ second.references, 109 | valueModified=first.valueModified || second.valueModified, 110 | isGroupBy=first.isGroupBy || second.isGroupBy 111 | ) 112 | } 113 | } 114 | 115 | case class QualifiedColumnName(table: String, column: String) 116 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/analysis/histogram/QueryType.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.analysis.histogram 24 | 25 | import com.uber.engsec.dp.schema.Database 26 | import org.apache.calcite.rel.RelNode 27 | 28 | /** Classification of queries: histogram, non-histogram statistical, and raw data. */ 29 | object QueryType extends Enumeration { 30 | type QueryType = Value 31 | val HISTOGRAM, NON_HISTOGRAM_STATISTICAL, RAW_DATA = Value 32 | 33 | /** Inspects results of HistogramAnalysis to categorize the query as statistical (histogram or non-histogram) 34 | * or raw data. 
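 * For example (illustrative): SELECT gender, COUNT(*) FROM users GROUP BY gender is a histogram (every
 * non-aggregated column is grouped); SELECT COUNT(*) FROM users is non-histogram statistical (every column
 * is aggregated); SELECT name FROM users returns raw data.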
35 | * 36 | * @param results A set of column facts representing the results of a histogram analysis 37 | * @return The type of the query: histogram, non-histogram statistical, or raw data 38 | */ 39 | def getQueryType(results: HistogramAnalysis#ResultType): QueryType = { 40 | var groupedColumns = 0 41 | var nonGroupedAggregations = 0 42 | var rawColumns = 0 43 | 44 | results.colFacts.foreach { info => 45 | if (info.isGroupBy) 46 | groupedColumns += 1 47 | 48 | if (info.isAggregation && !info.isGroupBy) 49 | nonGroupedAggregations += 1 50 | 51 | if (!info.isAggregation && !info.isGroupBy) 52 | rawColumns += 1 53 | } 54 | 55 | // A histogram is a query with one or more (non-grouped) aggregations, and all remaining columns grouped. 56 | if ((groupedColumns > 0) && (nonGroupedAggregations > 0) && (rawColumns == 0)) 57 | QueryType.HISTOGRAM 58 | 59 | // A statistical query has every column aggregated. 60 | else if (nonGroupedAggregations == results.colFacts.size) 61 | QueryType.NON_HISTOGRAM_STATISTICAL 62 | 63 | // Everything else is "raw data" 64 | else 65 | QueryType.RAW_DATA 66 | } 67 | 68 | /** Categorize the query using an already parsed tree. */ 69 | def getQueryType(root: RelNode, database: Database): QueryType = { 70 | val results = new HistogramAnalysis().run(root, database) 71 | getQueryType(results) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/analysis/join/JoinKeysUsed.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 
21 | */ 22 | 23 | package com.uber.engsec.dp.analysis.join 24 | 25 | import com.uber.engsec.dp.dataflow.node.DFGVisitorAnalysis 26 | import com.uber.engsec.dp.sql.dataflow_graph.Node 27 | import com.uber.engsec.dp.sql.dataflow_graph.reference.{ColumnReference, Function} 28 | import com.uber.engsec.dp.sql.dataflow_graph.relation.{DataTable, Join} 29 | 30 | import scala.collection.mutable 31 | 32 | /** Analysis that returns the set of all columns used as equi-join keys in a given query */ 33 | class JoinKeysUsed extends DFGVisitorAnalysis[JoinKeyDomain] { 34 | 35 | override def run(node: Node): JoinKeyDomain = { 36 | 37 | val state = new JoinKeyDomain() 38 | 39 | node.foreach { 40 | case d: DataTable => 41 | state.tables.add(d) 42 | 43 | case c: ColumnReference => 44 | state.tables.foreach { table => state.refs.add(table.name + "." + table.getColumnName(c.colIndex)) } 45 | state.tables.clear() 46 | 47 | case f: Function => 48 | if (f.functionName == "EQUAL") { 49 | state.refs.foreach { state.eqKeys.add } 50 | state.refs.clear() 51 | } 52 | 53 | case j: Join => 54 | state.eqKeys.foreach { state.joinKeys.add } 55 | state.eqKeys.clear() 56 | 57 | case _ => () 58 | } 59 | 60 | state 61 | } 62 | } 63 | 64 | /** Abstract domain for the join keys used analysis */ 65 | class JoinKeyDomain { 66 | val joinKeys = new mutable.HashSet[String]() 67 | val refs = new mutable.HashSet[String] 68 | val eqKeys = new mutable.HashSet[String] 69 | val tables = new mutable.HashSet[DataTable] 70 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/analysis/name_resolution/NameResolutionAnalysis.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.analysis.name_resolution 24 | 25 | import com.facebook.presto.sql.tree._ 26 | import com.uber.engsec.dp.dataflow.node.ASTDataflowAnalysis 27 | import com.uber.engsec.dp.sql.ast.ASTFunctions 28 | 29 | /** Dataflow analysis on ASTs to resolve identifiers in the query. Used internally in the tree transformation 30 | * process to transform ASTs into dataflow graphs. 
31 | */ 32 | class NameResolutionAnalysis extends ASTDataflowAnalysis(NameResolutionDomain) { 33 | 34 | override def transferNode(node: Node, state: NameResolution): NameResolution = node match { 35 | case table: Table => 36 | // Assume every table node refers to an alias until proven otherwise (i.e., unless we don't find an alias 37 | // of this name within the scope of the current subquery namespace). 38 | val newState = state.copy() 39 | newState.addReference(table) 40 | newState.setTargetRelation(table, true) 41 | newState 42 | 43 | case alias: AliasedRelation => 44 | val newState = state.copy() 45 | newState.setTargetRelation(alias.getRelation, true) 46 | newState.addRelationToScope(alias) 47 | newState 48 | 49 | case table: TableSubquery => 50 | val newState = state.copy() 51 | newState.setTargetRelation(table, true) 52 | newState 53 | 54 | case join: Join => 55 | val newState = state.copy() 56 | newState.setTargetRelation(join, false) 57 | // Select items can reference the left and right relations by name, e.g., to disambiguate columns of the same name. 58 | newState.addRelationToScope(join.getLeft) 59 | newState.addRelationToScope(join.getRight) 60 | newState 61 | 62 | case withQuery: WithQuery => 63 | // a WITH node adds a new relation in the global scope. We will match Table alias references to this relation 64 | // in the transferQuery method. 65 | val newState = state.copy() 66 | newState.addRelationToScope(withQuery) 67 | newState 68 | 69 | case deref: DereferenceExpression => 70 | val newState = state.copy() 71 | newState.addReference(deref) 72 | // getBase will be a QualifiedNameReference, which we will look at and handle when matching the dereference 73 | // expression, so no point keeping the orphan child. We only do this for QualifiedNameReference because we may 74 | // see other types of base expressions in the future. 75 | if (deref.getBase.isInstanceOf[QualifiedNameReference]) 76 | newState.removeOrphanReference(deref.getBase) 77 | newState 78 | 79 | case func: FunctionCall => 80 | val newState = state.copy() 81 | if ((func.getArguments.size == 0) || (func.getName.toString == "count")) 82 | newState.addReference(func) 83 | newState 84 | 85 | case all: AllColumns => 86 | val newState = state.copy() 87 | newState.addReference(all) 88 | newState 89 | 90 | case qual: QualifiedNameReference => 91 | val newState = state.copy() 92 | newState.addReference(qual) 93 | newState 94 | 95 | case query: Query => 96 | val newState = state.copy() 97 | newState.matchOrphanReferences() 98 | newState.clearScope() 99 | if (query eq treeRoot.get) { 100 | // Any remaining table nodes must be database tables, so remove them from the orphans list or else we'll get 101 | // errors about unresolved columns/tables. 
102 | newState.removeTableOrphanReferences() 103 | } 104 | newState 105 | 106 | case withNode: With => 107 | val newState = state.copy() 108 | newState.matchOrphanReferences() 109 | newState 110 | 111 | case spec: QuerySpecification => 112 | val newState = state.copy() 113 | newState.matchOrphanReferences() 114 | newState.clearScope() // no target relation persists above a query specification node 115 | newState 116 | 117 | case _ => state 118 | } 119 | 120 | override def joinNode(node: Node, children: Iterable[Node]): NameResolution = node match { 121 | case join: Join => 122 | val leftState = resultMap(join.getLeft).copy() 123 | val rightState = resultMap(join.getRight).copy() 124 | val result = new NameResolution() 125 | 126 | val children = List(join.getLeft, join.getRight) ++ ASTFunctions.stripOption(join.getCriteria).collect { case c: JoinOn => c.getExpression } 127 | 128 | children.foreach { child => 129 | val childState = resultMap(child) 130 | result.addReferencesFromState(childState) 131 | childState.namedRelationsInScope.foreach { 132 | result.namedRelationsInScope += _ 133 | } 134 | } 135 | 136 | // Each relation within a join may be exposed in scope of SELECT as a possible inner relation if the relation has a name. 137 | result.addRelationToScope(leftState.targetRelation.get) 138 | result.addRelationToScope(rightState.targetRelation.get) 139 | 140 | // Add the named relations of the JOIN left/right so the matchOrphanReferences method can disambiguate 141 | // deference expressions in the join condition between the left or right based on which relation names are in 142 | // scope of each. 143 | leftState.addRelationToScope(leftState.targetRelation.get) 144 | rightState.addRelationToScope(rightState.targetRelation.get) 145 | 146 | // This is our only chance to match column references in the join condition to the respective joined table. 147 | // We need to do this before set the target relation. We temporarily set the target relation to None so that 148 | // references inside the join condition don't resolve to the overall join. The target relation will be set 149 | // to this node in the transferJoin() method. 150 | result.targetRelation = None 151 | result.matchOrphanReferences(Some(join.getLeft), Some(join.getRight), leftState.namedRelationsInScope.keySet, rightState.namedRelationsInScope.keySet) 152 | 153 | result 154 | 155 | case _ => super.joinNode(node, children) 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/analysis/name_resolution/ReferenceInfo.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 
13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.analysis.name_resolution 24 | 25 | import com.facebook.presto.sql.tree.{Node, Query} 26 | import com.uber.engsec.dp.exception.TransformationException 27 | 28 | /** Stores information about references into relations as needed by TreeTransformAnalysis. 29 | */ 30 | class RefOption(val first: Node, val second: Option[Node] = None) { 31 | def isUnique: Boolean = !hasTwoRelations 32 | def hasTwoRelations: Boolean = second.isDefined 33 | 34 | def getOnly: Node = { 35 | if (hasTwoRelations) throw new TransformationException("getOnly called on reference with multiple possible relations.") 36 | first 37 | } 38 | } 39 | 40 | case class ReferenceInfo(relation1: Node, 41 | relation2: Option[Node] = None, 42 | // The node representing the "inner relation" being referenced, or None if the reference does not 43 | // specify an inner relation. See comments in NameResolutionDomain for details about inner relation references. 44 | var innerRelation: Option[Node] = None) { 45 | 46 | /** The presto node representing the relation being referenced into. This is usually a single relation, but it may 47 | * include two relations if the reference might point to either of them and must be resolved using schema 48 | * information. For example, in query 49 | * 50 | * SELECT blah from a JOIN b ON col1 = col2 51 | * 52 | * both col1 and col2 may refer to either a or b, and we can only determine which one it is by consulting the schema 53 | * (which is only available during tree transformation). 54 | */ 55 | val ref = new RefOption(relation1, relation2) 56 | 57 | override def toString: String = { 58 | def node2Str(node: Node): String = node match { 59 | case _ : Query => "Query" 60 | case _ => node.toString 61 | } 62 | 63 | var refStr = "" 64 | if (ref.second.isDefined) 65 | refStr = "Refs[1:" + node2Str(ref.first) + ", 2:" + node2Str(ref.second.get) + "]" 66 | else refStr = "Ref[" + node2Str(ref.first) + "]" 67 | 68 | refStr += (if (innerRelation.isEmpty) "" else " InnerRelation[" + node2Str(innerRelation.get) + "]") 69 | refStr 70 | } 71 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/analysis/taint/TaintAnalysis.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 
13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.analysis.taint 24 | 25 | import com.uber.engsec.dp.dataflow.column.{NodeColumnFacts, RelNodeColumnAnalysis} 26 | import com.uber.engsec.dp.dataflow.domain.{BooleanDomain, UnitDomain} 27 | import com.uber.engsec.dp.sql.relational_algebra.RelUtils 28 | import org.apache.calcite.rel.core.TableScan 29 | 30 | /** Returns true for each output column that is derived from a column marked as tainted (isTaint=true in the schema config). 31 | */ 32 | class TaintAnalysis extends RelNodeColumnAnalysis(UnitDomain, BooleanDomain) { 33 | 34 | override def transferTableScan(node: TableScan, state: NodeColumnFacts[Unit, Boolean]) = NodeColumnFacts( 35 | UnitDomain.bottom, 36 | state.colFacts.zipWithIndex.map { case (colState, idx) => 37 | val isTainted = RelUtils.getColumnProperty[Boolean]("isTainted", node, idx, this.getDatabase).getOrElse(false) 38 | isTainted 39 | }) 40 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/dataflow/AbstractDataflowAnalysis.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.dataflow 24 | 25 | import com.uber.engsec.dp.sql.AbstractAnalysis 26 | 27 | /** Common trait of dataflow analyses across different representations (AST, dataflow graph, and relational algebra) 28 | */ 29 | trait AbstractDataflowAnalysis[N <: AnyRef, T1] extends AbstractAnalysis[N, T1] { 30 | /** Invokes the transfer function of the analysis implementation and returns a new abstract state. Implemented by 31 | * analysis type subclasses. 32 | */ 33 | def transferNode(node: N, state: T1): T1 34 | 35 | /** Returns the children of the given node. Implemented by analysis type subclasses. 
36 | */ 37 | def getNodeChildren(node: N): Iterable[N] 38 | 39 | /** Invokes the join function of the analysis implementation and returns a new abstract state. Implemented by 40 | * analysis type subclasses. 41 | */ 42 | def joinNode(node: N, children: Iterable[N]): T1 43 | 44 | /** Recursive procedure to visit nodes in the tree and invoke analysis transfer/join methods. 45 | */ 46 | def process(node: N): Unit = { 47 | if (resultMap.contains(node)) 48 | return 49 | 50 | val children = getNodeChildren(node) 51 | children.foreach { process } 52 | 53 | currentNode = Some(node) 54 | 55 | val joinResult = joinNode(node, children) 56 | val transferResult = transferNode(node, joinResult) 57 | resultMap += (node -> transferResult) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/dataflow/AggFunctions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.dataflow 24 | 25 | /** Enumeration of SQL aggregation functions. 26 | */ 27 | object AggFunctions { 28 | sealed abstract class AggFunction 29 | case object COUNT extends AggFunction 30 | case object SUM extends AggFunction 31 | case object MIN extends AggFunction 32 | case object MAX extends AggFunction 33 | case object AVG extends AggFunction 34 | case object VAR extends AggFunction 35 | case object STDDEV extends AggFunction 36 | case object SINGLE_VALUE extends AggFunction 37 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/dataflow/column/AbstractColumnAnalysis.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.dataflow.column 24 | 25 | import com.uber.engsec.dp.dataflow.AbstractDataflowAnalysis 26 | import com.uber.engsec.dp.dataflow.column.AbstractColumnAnalysis.ColumnFacts 27 | import com.uber.engsec.dp.dataflow.domain.AbstractDomain 28 | 29 | /** Tracks dataflow facts (abstract domains) individually for each column, automatically propagating 30 | * facts up the tree by figuring out which columns in a relation/reference correspond to which columns of its 31 | * subrelations. In other words, this analysis tracks data provenance automatically so subclasses need only define 32 | * methods for updating these facts at appropriate nodes. 33 | * 34 | * @tparam N The tree node type 35 | * @tparam E The result fact type 36 | * @tparam D The abstract domain for the analysis (i.e., lattice with element type E) 37 | */ 38 | abstract class AbstractColumnAnalysis[N <: AnyRef, E, D <: AbstractDomain[E]] 39 | extends AbstractDataflowAnalysis[N, ColumnFacts[E]] { 40 | 41 | def flattenJoinChildren(domain: AbstractDomain[E], node: N, children: Iterable[N]): ColumnFacts[E] = { 42 | val childrenFacts = children.flatMap{ resultMap(_) } 43 | val resultFacts = AbstractColumnAnalysis.joinFacts(domain, childrenFacts) 44 | IndexedSeq(resultFacts) 45 | } 46 | 47 | /** Implemented by analysis subclasses. 
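 * joinNode combines the column facts of a node's children into the node's input state; transferNode then
 * computes the node's own column facts from that state (see AbstractDataflowAnalysis.process for the order
 * in which they are invoked).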
48 | */ 49 | override def transferNode(node: N, state: ColumnFacts[E]): ColumnFacts[E] 50 | override def joinNode(node: N, children: Iterable[N]): ColumnFacts[E] 51 | } 52 | 53 | object AbstractColumnAnalysis { 54 | import scala.language.implicitConversions 55 | 56 | type ColumnFacts[+J] = IndexedSeq[J] 57 | implicit def elemListToColumnFacts[J](elems: List[J]): ColumnFacts[J] = elems.toIndexedSeq 58 | implicit def elemsToColumnFacts[J](elems: J*): ColumnFacts[J] = elems.toIndexedSeq 59 | implicit def elemToColumnFacts[J](elem: J): ColumnFacts[J] = IndexedSeq(elem) 60 | 61 | def joinFacts[E](domain: AbstractDomain[E], facts: Iterable[E]): E = { 62 | val resultFact: E = 63 | if (facts.isEmpty) 64 | domain.bottom 65 | else if (facts.size == 1) 66 | facts.head 67 | else 68 | facts.reduce( (first, second) => domain.leastUpperBound(first, second) ) 69 | 70 | resultFact 71 | } 72 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/dataflow/column/DFGColumnAnalysis.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.dataflow.column 24 | 25 | import com.uber.engsec.dp.dataflow.column.AbstractColumnAnalysis.ColumnFacts 26 | import com.uber.engsec.dp.dataflow.domain.AbstractDomain 27 | import com.uber.engsec.dp.exception.AnalysisException 28 | import com.uber.engsec.dp.sql.dataflow_graph.reference.{ColumnReference, Function, UnstructuredReference} 29 | import com.uber.engsec.dp.sql.dataflow_graph.relation._ 30 | import com.uber.engsec.dp.sql.dataflow_graph.{DataflowGraphFunctions, Node} 31 | 32 | /** Column fact analysis on dataflow graphs. For more details see [[AbstractColumnAnalysis]]. 
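joinFacts is the single place where column facts flowing in from multiple children are collapsed, using the domain's least upper bound (and bottom for the empty case). With the BooleanDomain defined later in this package, that amounts to a logical OR, as this small REPL-style sketch illustrates:

```scala
import com.uber.engsec.dp.dataflow.column.AbstractColumnAnalysis
import com.uber.engsec.dp.dataflow.domain.BooleanDomain

// BooleanDomain: bottom = false, leastUpperBound = logical OR.
AbstractColumnAnalysis.joinFacts(BooleanDomain, Seq(false, true, false))  // => true
AbstractColumnAnalysis.joinFacts(BooleanDomain, Seq(false, false))        // => false
AbstractColumnAnalysis.joinFacts(BooleanDomain, Nil)                      // => false (domain.bottom)
```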
33 | */ 34 | abstract class DataflowGraphColumnAnalysis[E, D <: AbstractDomain[E]](domain: AbstractDomain[E]) 35 | extends AbstractColumnAnalysis[Node, E, D] 36 | with DataflowGraphFunctions 37 | with DataflowGraphColumnAnalysisFunctions[E] { 38 | 39 | override final def transferNode(node: Node, state: ColumnFacts[E]): ColumnFacts[E] = { 40 | 41 | val newFacts: Seq[E] = node match { 42 | case s: Select => state.zipWithIndex.map { case (fact,idx) => transferSelect(s, idx, fact) } 43 | 44 | case c: ColumnReference => List(transferColumnReference(c, 0, state.head)) 45 | 46 | case f: Function => 47 | assert (state.length == 1) 48 | List(transferFunction(f, 0, state.head)) 49 | 50 | case u: UnstructuredReference => 51 | assert (state.length == 1) 52 | List(transferUnstructuredReference(u, 0, state.head)) 53 | 54 | case t: DataTable => 55 | (0 until t.numCols).map { idx => 56 | transferDataTable(t, idx, domain.bottom) 57 | } 58 | 59 | case j: Join => 60 | if (state.size != j.numCols) throw new AnalysisException("Schema size mismatch (probably caused by unknown table) in JOIN[" + node.toString + "]. Some columns in this relation have unknown provenance, so analysis results may be incorrect.") 61 | state.zipWithIndex.map { case (fact,idx) => transferJoin(j, idx, fact) } 62 | 63 | case u: Union => 64 | state.zipWithIndex.map { case (fact,idx) => transferUnion(u, idx, fact) } 65 | 66 | case e: Except => 67 | state.zipWithIndex.map { case (fact,idx) => transferExcept(e, idx, fact) } 68 | } 69 | 70 | newFacts.toIndexedSeq 71 | } 72 | 73 | override def joinNode(node: Node, children: Iterable[Node]): ColumnFacts[E] = { 74 | node match { 75 | /** For Select, join fact from where condition (if present) with fact from each SelectItem. 76 | */ 77 | case s: Select => 78 | val colResults = s.items.map{ item => 79 | val childResult = resultMap(item.ref) 80 | assert (childResult.size == 1) 81 | childResult.head 82 | } 83 | 84 | colResults.toIndexedSeq 85 | 86 | case c: ColumnReference => 87 | val result = resultMap(c.of)(c.colIndex) 88 | IndexedSeq(result) 89 | 90 | /** For Function and UnstructedReference, reduce all columns from all children into a single column fact. 91 | */ 92 | case f: Function => flattenJoinChildren(domain, node, children) 93 | case u: UnstructuredReference => flattenJoinChildren(domain, node, children) 94 | 95 | /** For Join, pass through state from left and right relations, joined with join condition fact. 96 | */ 97 | case j: Join => 98 | val colResults = resultMap(j.left) ++ resultMap(j.right) 99 | val result = 100 | if (j.condition.isDefined) 101 | colResults.map { x => domain.leastUpperBound(x, resultMap(j.condition.get).head) } 102 | else 103 | colResults 104 | result 105 | 106 | /** For Union and Except, join column facts of corresponding columns from all children (schemas of children are guaranteed to match). 107 | */ 108 | case u: Union => children.map{ resultMap(_) }.transpose.map{ _.reduce( (x,y) => domain.leastUpperBound(x, y) )}.toIndexedSeq 109 | case e: Except => children.map{ resultMap(_) }.transpose.map{ _.reduce( (x,y) => domain.leastUpperBound(x, y) )}.toIndexedSeq 110 | case d: DataTable => IndexedSeq.empty // we'll initialize the facts to bottom in the transfer function. 111 | case _ => throw new RuntimeException(s"Unsupported join node type ${node.getClass.getSimpleName}") 112 | } 113 | } 114 | } 115 | 116 | /** Subclasses may override any of these methods as appropriate. 
*/ 117 | trait DataflowGraphColumnAnalysisFunctions[E] { 118 | def transferSelect(s: Select, idx: Int, fact: E): E = fact 119 | def transferColumnReference(c: ColumnReference, idx: Int, fact: E): E = fact 120 | def transferFunction(f: Function, idx: Int, fact: E): E = fact 121 | def transferUnstructuredReference(u: UnstructuredReference, idx: Int, fact: E): E = fact 122 | def transferDataTable(d: DataTable, idx: Int, fact: E): E = fact 123 | def transferJoin(j: Join, idx: Int, fact: E): E = fact 124 | def transferUnion(u: Union, idx: Int, fact: E): E = fact 125 | def transferExcept(e: Except, idx: Int, fact: E): E = fact 126 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/dataflow/domain/AbstractDomain.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.dataflow.domain 24 | 25 | /** Models a domain lattice whose elements are represented by type E and partial order is defined (implicitly) by the 26 | * leastUpperBound method. 27 | * 28 | * This is the common interface for abstract domains, which store a particular type of dataflow fact for an analysis. 29 | * A dataflow analysis updates this abstract state by modeling the semantics of nodes in the tree with respect to the 30 | * domain of choice. 31 | * 32 | * Each abstract domain must implement a leastUpperBound operation that computes (or at minimum, over-approximates) the 33 | * lowest domain element that is greater than both input elements per to the domain's partial order. This method 34 | * is the means by which the analysis framework conservatively combines multiple states at branches in the tree. 35 | */ 36 | trait AbstractDomain[E] { 37 | /** The bottom element for this domain. 38 | */ 39 | val bottom: E 40 | 41 | /** The least upper bound of elements a and b as defined by the partial order of this abstract domain. 42 | */ 43 | def leastUpperBound(a: E, b: E): E 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/dataflow/domain/Basic.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
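A concrete column analysis typically extends DataflowGraphColumnAnalysis with a suitable domain and overrides only the transfer functions it cares about; provenance propagation through selects, joins, and set operations comes for free. The sketch below marks columns that originate from a hypothetical "users" table. The `name` accessor on DataTable is an assumption about its API, and the example presumes, as the built-in analyses appear to, that the inherited graph functions supply the rest of the machinery.

```scala
import com.uber.engsec.dp.dataflow.column.DataflowGraphColumnAnalysis
import com.uber.engsec.dp.dataflow.domain.BooleanDomain
import com.uber.engsec.dp.sql.dataflow_graph.relation.DataTable

// Boolean fact per column: true iff the column (transitively) reads from "users".
// Only the DataTable transfer function is overridden; everything else is inherited.
class ReadsUsersTableAnalysis
    extends DataflowGraphColumnAnalysis[Boolean, BooleanDomain.type](BooleanDomain) {

  override def transferDataTable(d: DataTable, idx: Int, fact: Boolean): Boolean =
    d.name == "users"   // assumed accessor for the table name
}
```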
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.dataflow.domain 24 | 25 | /** An abstract domain with just top and bottom values: 26 | * 27 | * ⊤ (true) 28 | * | 29 | * ⊥ (false) 30 | */ 31 | object BooleanDomain extends AbstractDomain[Boolean] { 32 | override val bottom: Boolean = false 33 | override def leastUpperBound(first: Boolean, second: Boolean): Boolean = first || second 34 | } 35 | 36 | /** An abstract domain representing an optional *fixed* value, where bottom is None. This lattice has no top element; 37 | * only one element value may be stored. */ 38 | class OptionDomain[T] extends AbstractDomain[Option[T]] { 39 | override val bottom: Option[T] = Option.empty 40 | override def leastUpperBound(first: Option[T], second: Option[T]): Option[T] = { 41 | if (first.equals(second)) 42 | first 43 | else 44 | throw new java.util.NoSuchElementException("OptionDomain.leastUpperBound with different element values") 45 | } 46 | } 47 | 48 | /** The void domain, storing nothing */ 49 | object UnitDomain extends AbstractDomain[Unit] { 50 | override val bottom: Unit = () 51 | override def leastUpperBound(a: Unit, b: Unit): Unit = bottom 52 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/dataflow/domain/Collection.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
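Defining a new domain only requires choosing an element type, a bottom value, and a least-upper-bound operator that is commutative, associative, and idempotent. A minimal illustrative domain (not part of the library) that tracks an upper bound on a non-negative count:

```scala
import com.uber.engsec.dp.dataflow.domain.AbstractDomain

// Join is max, bottom is 0; merging two branches keeps the larger (more conservative) bound.
object MaxCountDomain extends AbstractDomain[Int] {
  override val bottom: Int = 0
  override def leastUpperBound(a: Int, b: Int): Int = math.max(a, b)
}
```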
IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.dataflow.domain 24 | 25 | /** An abstract domain that implements a Map of facts, with join defined as map union. If A and B have intersecting 26 | * keys, behavior is undefined (only one value for each key is retained). 27 | */ 28 | class MapDomain[K,V] extends AbstractDomain[Map[K,V]] { 29 | override val bottom: Map[K,V] = Map.empty 30 | override def leastUpperBound(first: Map[K, V], second: Map[K, V]): Map[K, V] = first ++ second 31 | } 32 | 33 | /** An abstract domain that implements a Set of facts, with leastUpperBound defined as set union. 34 | */ 35 | class SetDomain[T] extends AbstractDomain[Set[T]] { 36 | override val bottom: Set[T] = Set.empty 37 | override def leastUpperBound(first: Set[T], second: Set[T]): Set[T] = first ++ second 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/dataflow/domain/DomainElement.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.dataflow.domain 24 | 25 | /** A monad for lattice values represented by type E augmented with (type-less) top and bottom elements. 26 | */ 27 | abstract class DomainElem[+E] { 28 | /** Returns the element value, or throws java.util.NoSuchElementException if the lattice element is Top or Bottom 29 | */ 30 | def get: E 31 | def isTop: Boolean 32 | def isBottom: Boolean 33 | 34 | /** Returns true if the lattice value contains the given element. Always returns false if the lattice value is Top or Bottom. */ 35 | def contains[F >: E](elem: F): Boolean 36 | 37 | /** Retrieves the lattice value as an option, with Bottom returning None and element type E returning Some(e). 38 | * Should only be used on semi-bounded lattices which are guaranteed never to have value Top (e.g., SetLattice) since 39 | * this will raise an exception. 
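As a quick REPL-style illustration of SetDomain, joining the facts from two branches unions the sets, so nothing collected on either branch is lost at a merge point:

```scala
import com.uber.engsec.dp.dataflow.domain.SetDomain

val domain = new SetDomain[String]
domain.leastUpperBound(Set("orders.price"), Set("orders.price", "users.id"))
// => Set("orders.price", "users.id")
```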
40 | */ 41 | def asOption: Option[E] 42 | } 43 | 44 | case object Top extends DomainElem[Nothing] { 45 | override def isTop: Boolean = true 46 | override def isBottom: Boolean = false 47 | override def contains[F >: Nothing](elem: F): Boolean = false 48 | override def asOption: Option[Nothing] = throw new java.util.NoSuchElementException("Top.asOption") 49 | override def get = throw new java.util.NoSuchElementException("Top.get") 50 | } 51 | 52 | case object Bottom extends DomainElem[Nothing] { 53 | override def isTop: Boolean = false 54 | override def isBottom: Boolean = true 55 | override def contains[F >: Nothing](elem: F): Boolean = false 56 | override def asOption: Option[Nothing] = None 57 | override def get = throw new java.util.NoSuchElementException("Bottom.get") 58 | } 59 | 60 | /** External code shouldn't need to interact directly with this class; the implicit definitions below automatically 61 | * convert to and from this wrapper and the underlying element type. 62 | */ 63 | case class Mid[E](value: E) extends DomainElem[E] { 64 | override def isTop: Boolean = false 65 | override def isBottom: Boolean = false 66 | override def get: E = value 67 | override def contains[F >: E](elem: F): Boolean = elem.equals(value) 68 | override def asOption: Option[E] = Some(value) 69 | override def toString: String = value.toString 70 | } 71 | 72 | object DomainElem { 73 | import scala.language.implicitConversions 74 | implicit def val2DomainElem[E](value: E): DomainElem[E] = Mid(value) 75 | implicit def elem2Val[E](value: Mid[E]): E = value.get 76 | 77 | /** Convert from Option[E] to lattice element, with Option.None mapped to Bottom. */ 78 | implicit def option2DomainElem[E](value: Option[E]): DomainElem[E] = value.fold[DomainElem[E]](Bottom)(Mid(_)) 79 | } 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/dataflow/domain/Lattice.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.dataflow.domain.lattice 24 | 25 | import com.uber.engsec.dp.dataflow.domain._ 26 | 27 | /** Models a flat lattice of a finite set of elements of type [E]: 28 | * 29 | * ⊤ 30 | * / | \ 31 | * / | \ 32 | * e1 e2 ... 
33 | * \ | / 34 | * \ | / 35 | * ⊥ 36 | */ 37 | class FlatLatticeDomain[E] extends AbstractDomain[DomainElem[E]] { 38 | override val bottom: DomainElem[E] = Bottom 39 | override def leastUpperBound(first: DomainElem[E], second: DomainElem[E]): DomainElem[E] = FlatLatticeDomain.leastUpperBound(first, second) 40 | } 41 | 42 | object FlatLatticeDomain { 43 | def bottom[E]: DomainElem[E] = Bottom 44 | 45 | def leastUpperBound[E](first: DomainElem[E], second: DomainElem[E]): DomainElem[E] = { 46 | (first, second) match { 47 | case (Top, _) | (_, Top) => Top 48 | case (Bottom, _) => second 49 | case (_, Bottom) => first 50 | case (Mid(a), Mid(b)) => if (a == b) first else Top 51 | case _ => Top 52 | } 53 | } 54 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/dataflow/node/ASTDataflowAnalysis.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.dataflow.node 24 | 25 | import com.facebook.presto.sql.tree.Node 26 | import com.uber.engsec.dp.dataflow.AbstractDataflowAnalysis 27 | import com.uber.engsec.dp.dataflow.domain.AbstractDomain 28 | import com.uber.engsec.dp.sql.ast.ASTFunctions 29 | 30 | /** Dataflow analysis on AST nodes. For more information see [[AbstractDataflowAnalysis]]. 31 | */ 32 | abstract class ASTDataflowAnalysis[E, T <: AbstractDomain[E]](domain: AbstractDomain[E]) 33 | extends AbstractDataflowAnalysis[Node, E] 34 | with ASTFunctions { 35 | 36 | override def joinNode(node: Node, children: Iterable[Node]): E = { 37 | if (children.isEmpty) 38 | domain.bottom 39 | else if (children.size == 1) 40 | resultMap(children.head) 41 | else 42 | children.map{ resultMap(_) }.reduce { (first, second) => domain.leastUpperBound(first, second) } 43 | } 44 | 45 | } 46 | 47 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/dataflow/node/DFGVisitorAnalysis.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
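A short REPL-style sketch of the flat lattice in action: a single known value survives joins until it meets a different value, at which point the result is conservatively widened to Top:

```scala
import com.uber.engsec.dp.dataflow.domain.{Bottom, Mid}
import com.uber.engsec.dp.dataflow.domain.lattice.FlatLatticeDomain

val d = new FlatLatticeDomain[String]
d.leastUpperBound(Bottom, Mid("COUNT"))        // => Mid("COUNT"): Bottom is the identity
d.leastUpperBound(Mid("COUNT"), Mid("COUNT"))  // => Mid("COUNT"): agreeing values are kept
d.leastUpperBound(Mid("COUNT"), Mid("SUM"))    // => Top: conflicting values widen to "unknown"
```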
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.dataflow.node 24 | 25 | import com.uber.engsec.dp.sql.AbstractAnalysis 26 | import com.uber.engsec.dp.sql.dataflow_graph.{DataflowGraphFunctions, Node} 27 | 28 | /** Interface for simple DFG analyses that don't require dataflow tracking (e.g., visitor analyses). 29 | * 30 | * @tparam T The result type 31 | */ 32 | trait DFGVisitorAnalysis[T <: Any] extends AbstractAnalysis[Node,T] with DataflowGraphFunctions { 33 | // Handle book keeping for abstract analysis 34 | final override def process(root: Node): Unit = resultMap += (root -> run(root)) 35 | 36 | /** The only method that needs to be implemented by subclasses. Runs the analysis on the given tree and returns 37 | * the result. 38 | */ 39 | def run(root: Node): T 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/exception/AnalysisException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.exception 24 | 25 | /** Exception encountered during analysis (as opposed to parsing, tree transformation, etc.) 
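A visitor-style analysis only has to implement run. The sketch below is a deliberately trivial example whose result is whether the root of the dataflow graph is a Select; it assumes, as the built-in DFG analyses appear to, that the mixed-in traits provide the remaining AbstractAnalysis plumbing.

```scala
import com.uber.engsec.dp.dataflow.node.DFGVisitorAnalysis
import com.uber.engsec.dp.sql.dataflow_graph.Node
import com.uber.engsec.dp.sql.dataflow_graph.relation.Select

// Result type Boolean: no per-node dataflow tracking, just one answer for the whole graph.
class RootIsSelectAnalysis extends DFGVisitorAnalysis[Boolean] {
  override def run(root: Node): Boolean = root.isInstanceOf[Select]
}
```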
26 | */ 27 | class AnalysisException(val msg: String) extends RuntimeException(msg) 28 | 29 | /** Indicates that an analysis does not support a given query. 30 | */ 31 | class UnsupportedQueryException(message: String) extends AnalysisException(message) 32 | 33 | /** Indicates that an analysis does not support a specific construct used in the query. 34 | */ 35 | class UnsupportedConstructException(message: String) extends UnsupportedQueryException(message) -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/exception/DPException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.exception 24 | 25 | /** The exception type is raised for any exceptional condition encountered during end-to-end analysis of a query (i.e., 26 | * by calls to AbstractAnalysis.analyzeQuery()). This includes parsing exceptions, tree transformation exceptions, and 27 | * analysis runtime errors. 28 | * 29 | * This is a checked exception, requiring callers to explicitly handle errors. Internal code may throw any of 30 | * the unchecked error types defined in [[com.uber.engsec.dp.exception]]. All public interfaces to this tool should 31 | * catch internal errors and wrap with this exception type. Callers can use the getCause() method to retrieve details 32 | * about the underlying exception. 33 | */ 34 | class DPException(val message: String, val cause: Throwable) extends Exception(message, cause) -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/exception/TransformationException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
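The wrapping contract described for DPException looks roughly like this at a public entry point; analyzeInternal is a hypothetical placeholder for the internal parsing/analysis call, and the result type is arbitrary:

```scala
import com.uber.engsec.dp.exception.{AnalysisException, DPException}

// Stand-in for internal machinery, which signals failures with unchecked exceptions.
def analyzeInternal(query: String): Double =
  throw new AnalysisException("unsupported construct")

// Public-facing wrapper: callers handle one checked exception type; the original
// cause is preserved and available via getCause().
def analyzeQuerySafely(query: String): Double =
  try analyzeInternal(query)
  catch {
    case e: RuntimeException => throw new DPException("Query analysis failed", e)
  }
```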
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.exception 24 | 25 | /** Thrown when fatal errors are encountered during Presto to dataflow graph transformation. 26 | */ 27 | class TransformationException(val message: String) extends RuntimeException(message) {} 28 | 29 | /** Thrown when Presto parsing fails, e.g., because the query has a syntax problem or because it uses a dialect of SQL 30 | * not supported by Presto's SQL grammar. 31 | */ 32 | class ParsingException(message: String) extends TransformationException(message) {} 33 | 34 | /** Thrown when processing a query that uses SELECT * on a relation for which the schema is unknown or incomplete. 35 | */ 36 | class AmbiguousWildcardException(message: String) extends TransformationException(message) {} 37 | 38 | /** Thrown when processing a query that references a column in two or more relations such that the reference is 39 | * ambiguous. For example, if tables A and B both have column "city_id", this query is ambiguous and would 40 | * produce a runtime error on the database: 41 | * 42 | * SELECT blah FROM A JOIN B on column_from_a = city_id 43 | * 44 | * Note this query would be legal if either: ambigous "city_id" is qualified with a dereference expression 45 | * (e.g., "B.city_id") OR all non-deference columns are unambiguous by schema (e.g., "column_from_a" appears only in 46 | * relation A and "city_id" appears only in relation B). 47 | */ 48 | class AmbiguousColumnReference(message: String) extends TransformationException(message) {} 49 | 50 | /** Thrown when tree transformation detects an infinite loop (e.g., because the tree has a cycle), which would otherwise 51 | * result in a StackOverflowException. 52 | */ 53 | class InfiniteLoopException(message: String) extends TransformationException(message) {} 54 | 55 | /** Thrown in exceptional cases when processing joins in the query. 56 | */ 57 | class JoinException(message: String) extends TransformationException(message) {} 58 | 59 | /** Thrown during graph transformation when the schema mode is STRICT and the query references a table whose schema is 60 | * not defined. 61 | */ 62 | class UndefinedSchemaException(message: String) extends TransformationException(message) {} 63 | 64 | /** Thrown when transforming a query that references a relation in such a way as we cannot determine which columns are 65 | * accessed. 
Possible causes: the query is invalid, schema is invalid or incomplete, or the name resolution analysis 66 | * has a bug. 67 | */ 68 | class UnknownColumnException(message: String) extends TransformationException(message) {} 69 | 70 | /** Thrown when trying to parse a query that is known to be invalid, for example because the list of returned columns is 71 | * either empty or "error", or when trying to rewrite a query that is not a supported type (e.g., a raw data query in a 72 | * differential privacy rewriter). 73 | */ 74 | class InvalidQueryException(message: String) extends TransformationException(message) {} 75 | 76 | /** Thrown when the tree contains a node type that is not recognized or unsupported. 77 | */ 78 | class UnrecognizedNodeTypeException(message: String) extends TransformationException(message) {} -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/rewriting/DPUtil.scala: -------------------------------------------------------------------------------- 1 | package com.uber.engsec.dp.rewriting 2 | 3 | import com.uber.engsec.dp.analysis.histogram.AggregationInfo 4 | import com.uber.engsec.dp.dataflow.column.AbstractColumnAnalysis.ColumnFacts 5 | import com.uber.engsec.dp.rewriting.rules.ColumnDefinition._ 6 | import com.uber.engsec.dp.rewriting.rules.Expr.{Abs, Case, Ln, Rand, _} 7 | import com.uber.engsec.dp.rewriting.rules.Operations._ 8 | import com.uber.engsec.dp.rewriting.rules.{Helpers, ValueExpr} 9 | import com.uber.engsec.dp.schema.Schema 10 | import com.uber.engsec.dp.sql.relational_algebra.Relation 11 | import org.apache.calcite.rel.core._ 12 | 13 | /** Utilities for differential privacy-based rewriters. */ 14 | object DPUtil { 15 | // Expression to sample a random value from the Laplace distribution. 16 | val LaplaceSample: ValueExpr = Case((Rand-0.5) < 0, -1.0, 1.0) * Ln(1 - (2 * Abs(Rand-0.5))) 17 | 18 | /** Rewrites the relation to add all values in the domain of the binned column that are not present in the result set 19 | * if necessary (as determined by the [fillMissingBins] flag). If the provided aggregation contains no grouped 20 | * columns, returns the original relation. 21 | * 22 | * To support this feature the schema must define flag 'domainSet' for any database column usable as a histogram bin. 23 | * The value of this flag is a fully qualified column in the same database whose records enumerate all values in that 24 | * column's domain. This flag may point to itself (e.g., if the column's values already span the domain) or it may 25 | * refer to an auxiliary table. If the flag is not defined, this method throws an error since the rewritten query's 26 | * result cannot be safely returned; in such cases the mechanism must either perform additional processing on the 27 | * results or interpose between the results and analyst. 
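LaplaceSample is an inverse-CDF construction: for u uniform on (-1/2, 1/2), 1 - 2|u| is uniform on (0, 1), so -ln(1 - 2|u|) is exponentially distributed, and attaching a random sign yields a unit-scale Laplace sample. The rewriters multiply that unit sample by the per-column noise scale b to obtain Laplace(0, b) noise. A plain-Scala sketch of the same construction (illustrative, not part of the library):

```scala
import scala.util.Random

// Mirrors the SQL value expression: sign(u) * ln(1 - 2|u|), scaled by b.
def laplaceSample(scale: Double, rng: Random = new Random): Double = {
  val u = rng.nextDouble() - 0.5                 // uniform on [-0.5, 0.5)
  val sign = if (u < 0) -1.0 else 1.0
  scale * sign * math.log(1 - 2 * math.abs(u))   // symmetric around 0, Laplace-distributed
}
```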
28 | */ 29 | def addBinsFromDomain(node: Aggregate, 30 | histogramResults: ColumnFacts[AggregationInfo], 31 | config: DPRewriterConfig): Relation = { 32 | import scala.collection.JavaConverters._ 33 | 34 | if (!config.fillMissingBins) 35 | return Relation(node) 36 | 37 | val cols = node.getRowType.getFieldList.asScala 38 | val (groupedCols, aggCols) = cols.splitAt(node.getGroupCount) 39 | 40 | if (groupedCols.length > 1) throw new RewritingException("Multi-column grouping in histograms is not yet supported.") 41 | val groupedColIdx = groupedCols.head.getIndex 42 | val groupedColInfo = histogramResults(groupedColIdx) 43 | val groupedColName = cols(groupedColIdx).getName 44 | 45 | if (aggCols.length > 1) throw new RewritingException("Multi-column aggregations in histograms are not yet supported.") 46 | val aggColIdx = aggCols.head.getIndex 47 | val origAggColAlias = cols(aggColIdx).getName 48 | 49 | // Ensure aggregation column has explicit alias in relation, otherwise Calcite will reference it using a derived 50 | // alias (e.g. EXPR$0), which will fail on the actual database. 51 | val (withAggColAlias, explicitAggAlias) = 52 | if (Helpers.isDerivedAlias(origAggColAlias)) { 53 | val explicitAlias = "_agg" 54 | val rel = Relation(node).mapCols { col => 55 | if (col.idx == aggColIdx) 56 | EnsureAlias(col.expr) AS explicitAlias 57 | else col 58 | } 59 | (rel, explicitAlias) 60 | } 61 | else (Relation(node), origAggColAlias) 62 | 63 | val defaultVal = 0 64 | 65 | // If the value of the histogram bin has been modified prior to grouping, this approach will not work. 66 | if (groupedColInfo.valueModified) throw new RewritingException(s"Histogram column $groupedColName has modified valued.") 67 | 68 | // If the histogram is not derived from exactly one database column, this approach will not work. 69 | if (groupedColInfo.references.size != 1) throw new RewritingException(s"Histogram column must derive its values from exactly one database column.") 70 | 71 | // Figure out which database column contains the domain values for to the histogram column. 72 | val targetCol = groupedColInfo.references.head 73 | val colProperties = Schema.getSchemaMapForTable(config.database, targetCol.table)(targetCol.column).properties 74 | val domainSetFlag = colProperties.getOrElse("domainSet", throw new RewritingException( 75 | s"Column '${targetCol.column}' in table '${targetCol.table}' is used as a histogram bin. " + 76 | "Please define 'domainSet' parameter specifying a table/column that enumerates all values from this column's domain. 
" + 77 | "To disable this check set fillMissingBins = false in rewriter config (if disabled, query results are NOT safe for release)")) 78 | 79 | val (domainSetTable, domainSetCol) = { 80 | val elems = domainSetFlag.split('.') 81 | val (tbl, col) = elems.splitAt(elems.length-1) 82 | (tbl.mkString("."), col.head) 83 | } 84 | 85 | val domainSetRel = table(domainSetTable, config.database).project(col(domainSetCol) AS "_domain") 86 | 87 | withAggColAlias 88 | .asAlias("_orig") 89 | .project(col(groupedColIdx), EnsureAlias(col(explicitAggAlias))) 90 | .join(domainSetRel, left(0) == right(0), JoinRelType.RIGHT) 91 | .project(right(0) AS groupedColName, 92 | Case(IsNull(left(0)), defaultVal, left(1)) AS origAggColAlias) 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/rewriting/Rewriter.scala: -------------------------------------------------------------------------------- 1 | package com.uber.engsec.dp.rewriting 2 | 3 | import com.uber.engsec.dp.schema.Database 4 | import com.uber.engsec.dp.sql.relational_algebra.{RelUtils, Relation} 5 | import com.uber.engsec.dp.sql.{AbstractAnalysis, QueryParser, TreePrinter} 6 | import org.apache.calcite.plan.{Convention, RelOptAbstractTable} 7 | import org.apache.calcite.rel.core._ 8 | import org.apache.calcite.rel.rules.ProjectMergeRule 9 | 10 | /** Root class for rewriters. 11 | * 12 | * @tparam C Config for rewriter. 13 | */ 14 | abstract class Rewriter[C <: RewriterConfig](config: C) { 15 | /** Rewrites the given relational algebra tree with the given config. Implemented by subclasses. 16 | */ 17 | def rewrite(root: Relation): Relation 18 | 19 | /** Entry point for rewriting by callers. Rewrites the given query with this rewriter using the given config. 20 | */ 21 | def run(query: String): RewriterResult = { 22 | val root = QueryParser.parseToRelTree(query, config.database) 23 | run(root) 24 | } 25 | 26 | /** Rewrites the given relational algebra tree with this rewriter using the given config. 27 | */ 28 | def run(root: Relation): RewriterResult = { 29 | if (AbstractAnalysis.DEBUG) { 30 | println("--- Original query ---") 31 | TreePrinter.printRelTree(root) 32 | } 33 | 34 | val rewrittenTree = rewrite(root) 35 | 36 | if (AbstractAnalysis.DEBUG) { 37 | println("--- Rewritten query (${this.getClass.getSimpleName}) ---") 38 | printTreeAndSql(rewrittenTree) 39 | } 40 | 41 | new RewriterResult(rewrittenTree, config) 42 | } 43 | 44 | /** For debugging. */ 45 | def printTreeAndSql(root: Relation): Unit = { 46 | val withQueries = root.collect{ case Relation(w: WithTable) => w }.toSet 47 | withQueries.foreach{ q => 48 | println(s"WITH ${q.alias} AS") 49 | TreePrinter.printRelTree(q.definition) 50 | println("\n") 51 | } 52 | TreePrinter.printRelTree(root) 53 | 54 | println("---") 55 | println(Rewriter.toSqlWithAliases(root, config.database.dialect)) 56 | println("") 57 | } 58 | 59 | class RewriterResult(val root: Relation, config: C) { 60 | /** Emits a SQL query for the given rewritten result. 61 | */ 62 | def toSql(dialect: String = config.database.dialect): String = Rewriter.toSqlWithAliases(root, dialect) 63 | } 64 | } 65 | 66 | object Rewriter { 67 | import com.uber.engsec.dp.rewriting.rules.Operations._ 68 | 69 | /** Transforms the given relation into SQL, preserving any aliases specified by the rewriter. 
70 | */ 71 | def toSqlWithAliases(root: Relation, dialect: String, aliasRelationsInScope: Set[WithTable] = Set.empty): String = { 72 | val withQueries = root.collect{ case Relation(w: WithTable) => w }.toList.distinct.filter(!aliasRelationsInScope.contains(_)).sortBy(_.alias) 73 | 74 | val withClauses = withQueries.zipWithIndex.map { case (w, idx) => 75 | val queryStr = toSqlWithAliases(w.definition, dialect, aliasRelationsInScope ++ withQueries.take(idx)).replace("\n", "\n ") 76 | s"${w.alias} AS (\n ${queryStr}\n)" 77 | } 78 | 79 | val withPrefix = if (withClauses.isEmpty) "" else withClauses.mkString("WITH ", ", ", "\n") 80 | val querySql = RelUtils.relToSql(root.optimize(ProjectMergeRule.INSTANCE), dialect) 81 | withPrefix + querySql 82 | } 83 | } 84 | 85 | /** Dummy class to store relations that are to be defined as WITH clauses in the rewritten query (the relational 86 | * algebra tree has no representation for this since it does not admit aliases). 87 | */ 88 | case class WithTable(definition: Relation, alias: String) extends TableScan( 89 | definition.getCluster, 90 | definition.getCluster.traitSetOf(Convention.NONE), 91 | new RelOptAbstractTable(null, alias, definition.getRowType) {}) { 92 | override def equals(obj: scala.Any): Boolean = { 93 | obj match { 94 | case w: WithTable => this.definition.equals(w.definition) && this.alias.equals(w.alias) 95 | case _ => false 96 | } 97 | } 98 | } 99 | 100 | /** Flags for all rewriters */ 101 | class RewriterConfig(val database: Database) 102 | 103 | /** Flags for differential privacy-based rewriters */ 104 | class DPRewriterConfig( 105 | /** The privacy budget allocated to this query. Callers are responsible for tracking the remaining budget. */ 106 | val epsilon: Double, 107 | 108 | /** The database being queried. */ 109 | override val database: Database, 110 | 111 | /** Should rewriter add logic to automatically insert histogram bins from domain? This flag should be true if 112 | * query results are released directly. 113 | * 114 | * This is necessary for histogram queries since a DP mechanism must return a noisy result for all records in the 115 | * domain - including those not appearing in the output - in order to avoid leaking information via the presence or 116 | * absence of a bin. If this flag is true, missing bins will be populated with noisy empty results, and query 117 | * rewriting will fail if this cannot be achieved. 118 | */ 119 | val fillMissingBins: Boolean) 120 | extends RewriterConfig(database) 121 | 122 | 123 | class RewritingException(val msg: String) extends RuntimeException(msg) -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/rewriting/coverage/CoverageRewriter.scala: -------------------------------------------------------------------------------- 1 | package com.uber.engsec.dp.rewriting.coverage 2 | 3 | import com.uber.engsec.dp.rewriting.rules.ColumnDefinition._ 4 | import com.uber.engsec.dp.rewriting.rules.Expr._ 5 | import com.uber.engsec.dp.rewriting.rules.Operations._ 6 | import com.uber.engsec.dp.rewriting.{Rewriter, RewriterConfig} 7 | import com.uber.engsec.dp.sql.relational_algebra.{RelUtils, Relation} 8 | import org.apache.calcite.rel.logical.{LogicalAggregate, LogicalSort} 9 | import org.apache.calcite.rel.rules.FilterProjectTransposeRule 10 | 11 | /** 12 | * Rewriter that calculates coverage of aggregation queries. 
13 | */ 14 | class CoverageRewriter(config: RewriterConfig) extends Rewriter(config) { 15 | override def rewrite(root: Relation): Relation = { 16 | /** Find first aggregation node (strip away projections and other post-processing of aggregation column). */ 17 | val rootAggNode = root.collectFirst{ case Relation(l: LogicalAggregate) => l }.get 18 | val groupedColumns = RelUtils.getGroupedCols(rootAggNode) 19 | 20 | /** Replace aggregation with a count-histogram, grouping by the same bins of original aggregation. */ 21 | val coverageRelation = Relation(rootAggNode.getInput) 22 | .agg (groupedColumns: _*) (Count(*) AS "coverage") 23 | .optimize(FilterProjectTransposeRule.INSTANCE) 24 | 25 | /** Reconstruct sort node, if present in original query, to preserve ORDER BY and LIMIT clauses. */ 26 | val newRoot = root.unwrap match { 27 | case l: LogicalSort => Relation(LogicalSort.create(coverageRelation, l.getCollation, l.offset, l.fetch)) 28 | case _ => coverageRelation 29 | } 30 | 31 | /** For histogram queries, compute median coverage across all bins */ 32 | val result = if (groupedColumns.nonEmpty) 33 | newRoot.asAlias("_count") 34 | .project(Median(col("coverage")) AS "coverage") 35 | .fetch(1) 36 | else 37 | newRoot 38 | 39 | result 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/rewriting/differential_privacy/ElasticSensitivityRewriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.rewriting.differential_privacy 24 | 25 | import com.uber.engsec.dp.rewriting._ 26 | import com.uber.engsec.dp.schema.Database 27 | import com.uber.engsec.dp.sql.relational_algebra.Relation 28 | import com.uber.engsec.dp.util.ElasticSensitivity 29 | 30 | /** Rewriter that enforces differential privacy using Elastic Sensitivity. 
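A usage sketch for the coverage rewriter defined above; the Database handle and the table and column names are placeholders supplied by the caller's schema configuration:

```scala
import com.uber.engsec.dp.rewriting.RewriterConfig
import com.uber.engsec.dp.rewriting.coverage.CoverageRewriter
import com.uber.engsec.dp.schema.Database

// Turns a histogram query into a query reporting its (median) per-bin coverage.
def coverageSqlFor(database: Database): String = {
  val config = new RewriterConfig(database)
  new CoverageRewriter(config)
    .run("SELECT product_id, AVG(price) FROM orders GROUP BY product_id")
    .toSql()
}
```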
*/ 31 | class ElasticSensitivityRewriter(config: ElasticSensitivityConfig) extends SensitivityRewriter(config) { 32 | def getLaplaceNoiseScale(node: Relation, colIdx: Int): Double = 33 | 2 * ElasticSensitivity.smoothElasticSensitivity(node, config.database, colIdx, config.epsilon, config.delta) / config.epsilon 34 | } 35 | 36 | class ElasticSensitivityConfig( 37 | override val epsilon: Double, 38 | val delta: Double, 39 | override val database: Database, 40 | override val fillMissingBins: Boolean = true) 41 | extends DPRewriterConfig(epsilon, database, fillMissingBins) 42 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/rewriting/differential_privacy/RestrictedSensitivityRewriter.scala: -------------------------------------------------------------------------------- 1 | package com.uber.engsec.dp.rewriting.differential_privacy 2 | 3 | import com.uber.engsec.dp.analysis.differential_privacy.RestrictedSensitivityAnalysis 4 | import com.uber.engsec.dp.rewriting.DPRewriterConfig 5 | import com.uber.engsec.dp.schema.Database 6 | import com.uber.engsec.dp.sql.relational_algebra.Relation 7 | 8 | /** Rewriter that enforces differential privacy using Restricted Sensitivity. */ 9 | class RestrictedSensitivityRewriter(config: RestrictedSensitivityConfig) extends SensitivityRewriter(config) { 10 | def getLaplaceNoiseScale(node: Relation, colIdx: Int): Double = 11 | new RestrictedSensitivityAnalysis().run(node, config.database).colFacts(colIdx).sensitivity.get / config.epsilon 12 | } 13 | 14 | class RestrictedSensitivityConfig( 15 | override val epsilon: Double, 16 | override val database: Database, 17 | override val fillMissingBins: Boolean = true) 18 | extends DPRewriterConfig(epsilon, database, fillMissingBins) -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/rewriting/differential_privacy/SensitivityRewriter.scala: -------------------------------------------------------------------------------- 1 | package com.uber.engsec.dp.rewriting.differential_privacy 2 | 3 | import com.uber.engsec.dp.analysis.histogram.HistogramAnalysis 4 | import com.uber.engsec.dp.dataflow.domain.UnitDomain 5 | import com.uber.engsec.dp.rewriting.rules.ColumnDefinition._ 6 | import com.uber.engsec.dp.rewriting.rules.Operations._ 7 | import com.uber.engsec.dp.rewriting.{DPRewriterConfig, DPUtil, Rewriter} 8 | import com.uber.engsec.dp.sql.relational_algebra.Relation 9 | import org.apache.calcite.rel.core.Aggregate 10 | 11 | 12 | /** Parent class for sensitivity-based mechanisms, which add Laplace noise scaled to each output column's sensitivity. 13 | * Each mechanism has a specific way of computing the scale of this noise for its supported class of queries. 14 | * 15 | * See [ElasticSensitivityRewriter] and [RestrictedSensitivityRewriter]. 16 | */ 17 | abstract class SensitivityRewriter[C <: DPRewriterConfig](config: C) extends Rewriter(config) { 18 | 19 | /** Returns the scale of Laplace noise required for the given column as defined by the mechanism. Implemented by subclasses. 
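An end-to-end sketch of the elastic sensitivity mechanism: build a config with a privacy budget, rewrite a counting query, and emit SQL. The budget values, query text, and schema names are examples only; the Database handle comes from the caller's schema configuration.

```scala
import com.uber.engsec.dp.rewriting.differential_privacy.{ElasticSensitivityConfig, ElasticSensitivityRewriter}
import com.uber.engsec.dp.schema.Database

def rewriteWithElasticSensitivity(database: Database): String = {
  // (epsilon, delta) privacy budget for this single query; budget tracking is the caller's job.
  val config = new ElasticSensitivityConfig(epsilon = 0.1, delta = 1e-8, database = database)
  new ElasticSensitivityRewriter(config)
    .run("SELECT COUNT(*) FROM orders JOIN customers ON orders.customer_id = customers.id")
    .toSql()
}
```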
*/ 20 | def getLaplaceNoiseScale(node: Relation, colIdx: Int): Double 21 | 22 | def rewrite(root: Relation): Relation = { 23 | root.rewriteRecursive(UnitDomain) { (node, orig, _) => 24 | node match { 25 | case Relation(a: Aggregate) => 26 | // For histogram queries, ensure all values from domain appear in result set 27 | val withFilledBins = 28 | if (a.getGroupCount > 0) { 29 | val histogramResults = new HistogramAnalysis().run(node, config.database).colFacts 30 | DPUtil.addBinsFromDomain(a, histogramResults, config) 31 | } 32 | else Relation(a) 33 | 34 | val result = withFilledBins.mapCols { col => 35 | // Compute the scale of Laplace noise for the column. 36 | val laplaceNoiseScale = getLaplaceNoiseScale(node, col.idx) 37 | 38 | if (laplaceNoiseScale == 0) 39 | // No noise added to histogram bins that are marked safe for release. 40 | col 41 | else { 42 | // Rewrite the column expression to add scaled Laplace noise. 43 | val noiseExpr = laplaceNoiseScale * DPUtil.LaplaceSample 44 | (col.expr + noiseExpr) AS col.alias 45 | } 46 | } 47 | 48 | (result, ()) 49 | 50 | case _ => (node, ()) 51 | } 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/rewriting/differential_privacy/WPINQRewriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.rewriting.differential_privacy 24 | 25 | import com.uber.engsec.dp.analysis.histogram.{HistogramAnalysis, QueryType} 26 | import com.uber.engsec.dp.dataflow.AggFunctions.COUNT 27 | import com.uber.engsec.dp.dataflow.domain.UnitDomain 28 | import com.uber.engsec.dp.exception.UnsupportedQueryException 29 | import com.uber.engsec.dp.rewriting.rules.ColumnDefinition._ 30 | import com.uber.engsec.dp.rewriting.rules.Expr.{col, _} 31 | import com.uber.engsec.dp.rewriting.rules.Operations._ 32 | import com.uber.engsec.dp.rewriting.rules._ 33 | import com.uber.engsec.dp.rewriting.{DPRewriterConfig, DPUtil, Rewriter} 34 | import com.uber.engsec.dp.schema.Database 35 | import com.uber.engsec.dp.sql.relational_algebra.{RelUtils, Relation} 36 | import org.apache.calcite.rel.core._ 37 | 38 | /** 39 | * Rewriter for WPINQ. Converts a SQL counting query into a query that returns a noisy count of weights. 
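Mechanisms in this family differ only in how they bound each output column's sensitivity. The sketch below is a hypothetical third mechanism that simply assumes unit global sensitivity for every column (reasonable only for plain counting queries) and therefore uses noise scale 1/epsilon; it illustrates the extension point rather than a mechanism with a proven bound.

```scala
import com.uber.engsec.dp.rewriting.DPRewriterConfig
import com.uber.engsec.dp.rewriting.differential_privacy.SensitivityRewriter
import com.uber.engsec.dp.sql.relational_algebra.Relation

// Hypothetical: every column is assumed to have global sensitivity 1.
class UnitSensitivityRewriter(config: DPRewriterConfig) extends SensitivityRewriter(config) {
  def getLaplaceNoiseScale(node: Relation, colIdx: Int): Double = 1.0 / config.epsilon
}
```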
40 | * 41 | * @see [[https://arxiv.org/abs/1203.3453 Calibrating Data to Sensitivity in Private Data Analysis]] 42 | */ 43 | class WPINQRewriter(config: WPINQConfig) extends Rewriter(config) { 44 | def rewrite(root: Relation): Relation = { 45 | // Reject unsupported queries 46 | val histogramResults = new HistogramAnalysis().runAll(root, config.database) 47 | val queryType = QueryType.getQueryType(histogramResults(root)) 48 | 49 | val isValidQueryType = 50 | Set(QueryType.HISTOGRAM, QueryType.NON_HISTOGRAM_STATISTICAL).contains(queryType) && 51 | histogramResults(root).colFacts.filter(_.isAggregation).forall(_.outermostAggregation.contains(COUNT)) 52 | 53 | if (!isValidQueryType) throw new UnsupportedQueryException("This rewriter only works on counting queries") 54 | 55 | val joinNodes = root.collect{ case Relation(j: Join) => j } 56 | if (joinNodes.exists{ join => RelUtils.extractEquiJoinColumns(join, join.getCondition).isEmpty }) 57 | throw new UnsupportedQueryException("This rewriter only works on queries with equijoins") 58 | 59 | root.rewriteRecursive(UnitDomain) { (node, orig, _) => 60 | node match { 61 | // Add initial weight column to tables. 62 | case Relation(tbl: TableScan) => (node.project(*, (config.initialWeights(tbl): ValueExpr) AS "_weight"), ()) 63 | 64 | // Ensure the weight column is projected through project nodes. 65 | case Relation(p: Project) => (p.reproject(*, col("_weight")), ()) 66 | 67 | case Relation(j: Join) => 68 | val (leftJoinCol, rightJoinCol) = RelUtils.extractEquiJoinColumns(j, j.getCondition).getOrElse(throw new UnsupportedQueryException("This rewriter only supports equijoin conditions.")) 69 | 70 | val A = Relation(j.getLeft).rename(col("_weight") AS "_A_w").asAlias("_A") 71 | val B = Relation(j.getRight).rename(col("_weight") AS "_B_w").asAlias("_B") 72 | 73 | val Ak = A.agg (col(leftJoinCol)) (Sum(col("_A_w")) AS "_A_s") 74 | val Bk = B.agg (col(rightJoinCol)) (Sum(col("_B_w")) AS "_B_s") 75 | 76 | val newNode = node 77 | .replaceInputs(_ => List(A, B)) 78 | .join(Ak, left(leftJoinCol) == right(0)) 79 | .join(Bk, left(leftJoinCol) == right(0)) 80 | .project(*, ((col("_A_w") * col("_B_w")) / (col("_A_s") + col("_B_s"))) AS "_weight") 81 | .remove(col("_A_w"), col("_B_w"), col("_A_s"), col("_B_s")) 82 | 83 | (newNode, ()) 84 | 85 | case Relation(a: Aggregate) => 86 | val groupedCols = RelUtils.getGroupedCols(a) 87 | val origColName = a.getRowType.getFieldNames.get(groupedCols.length) 88 | val weightSumRelation = Relation(a.getInput).agg (groupedCols: _*) (Sum(col("_weight")) AS "_weight_sum") 89 | 90 | // For histogram queries, ensure all values from domain appear in result set, assigning weighted sum 0 to 91 | // absent bins. 92 | val withFilledBins = DPUtil.addBinsFromDomain(weightSumRelation.unwrap.asInstanceOf[Aggregate], histogramResults(orig).colFacts, config) 93 | 94 | // Add noise to weights 95 | val result = withFilledBins 96 | .project(*, col("_weight_sum") + (1.0 / config.epsilon) * DPUtil.LaplaceSample AS origColName) 97 | .remove(col("_weight_sum")) 98 | 99 | (result, ()) 100 | 101 | case _ => (node, ()) 102 | } 103 | } 104 | } 105 | } 106 | 107 | class WPINQConfig( 108 | override val epsilon: Double, 109 | override val database: Database, 110 | override val fillMissingBins: Boolean = true) 111 | extends DPRewriterConfig(epsilon, database, fillMissingBins) { 112 | /** The initial weight assigned to each record in the given table. Default is 1.0. 
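The Join case above implements WPINQ's weight propagation: for an output record formed by matching a record a from the left input A with a record b from the right input B on join key k, the rewritten query computes

```latex
w_{A \bowtie B}(a, b) \;=\; \frac{w_A(a)\,\cdot\,w_B(b)}{s_A(k) + s_B(k)},
\qquad
s_A(k) = \sum_{a' \in A:\, a'.k = k} w_A(a'), \quad
s_B(k) = \sum_{b' \in B:\, b'.k = k} w_B(b')
```

where s_A(k) and s_B(k) are the per-key weight sums produced by the Ak and Bk subqueries. The noisy count for each output bin is then the Laplace-perturbed sum of these record weights, as computed in the Aggregate case.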
*/ 113 | def initialWeights(table: TableScan): Double = 1.0 114 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/rewriting/rules/ColumnDefinition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.rewriting.rules 24 | 25 | import com.uber.engsec.dp.dataflow.column.AbstractColumnAnalysis.ColumnFacts 26 | import com.uber.engsec.dp.dataflow.column.NodeColumnFacts 27 | import com.uber.engsec.dp.rewriting.rules.Expr.ColumnReferenceByName 28 | import com.uber.engsec.dp.sql.relational_algebra.{Relation, Transformer} 29 | import org.apache.calcite.rel.logical.{LogicalProject, LogicalValues} 30 | import org.apache.calcite.tools.Frameworks 31 | 32 | /** Methods for specifying column references and definitions in rewriting operations. 
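 * For example, with the implicits below and the Expr helpers in scope (column names are hypothetical):
 * {{{
 *   import com.uber.engsec.dp.rewriting.rules.ColumnDefinition._
 *   import com.uber.engsec.dp.rewriting.rules.Expr._
 *
 *   val total   = (col("price") * col("quantity")) AS "total"  // expression with an explicit alias
 *   val renamed = col("uid") AS "user_id"                      // rename keeps the same expression
 * }}}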
*/ 33 | 34 | class ColumnDefinition[+T <: Expr](val expr: T) 35 | case class ColumnDefinitionWithAlias[+T <: Expr](override val expr: T, alias: String) extends ColumnDefinition[T](expr) 36 | case class ColumnDefinitionWithOrdinal[+T <: Expr](override val expr: T, alias: String, idx: Int) extends ColumnDefinition[T](expr) 37 | 38 | object ColumnDefinition { 39 | import scala.collection.JavaConverters._ 40 | import scala.language.implicitConversions 41 | 42 | // Automatically cast to column if alias is attached to an expression 43 | implicit class ExprColumnAlias[T <: Expr](expr: T) { 44 | def AS(alias: String): ColumnDefinitionWithAlias[T] = ColumnDefinitionWithAlias[T](expr, alias) 45 | def AS(alias: ColumnReferenceByName): ColumnDefinitionWithAlias[T] = ColumnDefinitionWithAlias[T](expr, alias.name) 46 | } 47 | 48 | // Allow renaming of a column (keeping the same expression) 49 | implicit class ColumnAlias[T <: Expr](col: ColumnDefinition[T]) { 50 | def AS(alias: String): ColumnDefinitionWithAlias[T] = ColumnDefinitionWithAlias[T](col.expr, alias) 51 | } 52 | 53 | // Allow easy lookup of the column fact from an analysis result 54 | implicit class ColumnFactLookup[F](results: ColumnFacts[F]) { 55 | def apply[T <: Expr](col: ColumnDefinitionWithOrdinal[T]): F = results(col.idx) 56 | } 57 | implicit class NodeColumnFactLookup[F](results: NodeColumnFacts[_,F]) { 58 | def apply[T <: Expr](col: ColumnDefinitionWithOrdinal[T]): F = results.colFacts(col.idx) 59 | } 60 | 61 | // Creates a relation from a list of column definitions 62 | def rel(cols: ColumnDefinition[Expr]*): Relation = columnDefsToRelation(cols) 63 | implicit def columnDefsToRelation(cols: Seq[ColumnDefinition[Expr]]): Relation = { 64 | val cluster = new Transformer( 65 | Frameworks.newConfigBuilder 66 | .defaultSchema(Frameworks.createRootSchema(true)) 67 | .build 68 | ).cluster 69 | 70 | val inputRel = LogicalValues.createOneRow(cluster) 71 | val projections = cols.map{ _.expr.toRex(Relation(inputRel)) } 72 | val rowType = Helpers.getRecordType( cols.zip(projections) ) 73 | val result = LogicalProject.create(inputRel, projections.asJava, rowType) 74 | Relation(result) 75 | } 76 | 77 | implicit def columnReferenceToColumnDefinitionWithName(col: ColumnReferenceByName): ColumnDefinitionWithAlias[ColumnReferenceByName] = ColumnDefinitionWithAlias[ColumnReferenceByName](col, col.name) 78 | implicit def columnDefinitionWithAliasToColumnReferenceByName[T <: Expr](col: ColumnDefinitionWithAlias[T]): ColumnReferenceByName = Expr.col(col.alias) 79 | implicit def exprToColumnDefinition[T <: Expr](expr: T): ColumnDefinition[T] = new ColumnDefinition(expr) 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/schema/CachingSchema.scala: -------------------------------------------------------------------------------- 1 | package org.apache.calcite.jdbc 2 | 3 | import org.apache.calcite.schema.{Schema, SchemaPlus} 4 | 5 | object SchemaAdapter { 6 | def toRootSchemaPlus(schema: Schema, name: String): SchemaPlus = new CachingCalciteSchema(null, schema, name).plus() 7 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/schema/DatabaseModel.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.schema 24 | 25 | import com.facebook.presto.sql.tree._ 26 | 27 | /** A class to model differences between SQL database dialects and vendors that are material to query analysis. This 28 | * will be updated over time 29 | */ 30 | object DatabaseModel { 31 | /** Returns the column name assigned implicitly by the database for the given expression, i.e., 32 | * the name of the output column in the absence of an explicit alias. Note this is highly 33 | * database-specific. Logic below is for Vertica. 34 | */ 35 | def getImplicitColumnName(expr: Expression) = expr match { 36 | case q: QualifiedNameReference => q.getName.toString 37 | case d: DereferenceExpression => d.getFieldName 38 | case f: FunctionCall => f.getName.toString 39 | case s: StringLiteral => s.getValue 40 | case _ : ArithmeticBinaryExpression => "?column?" 41 | case _ : AtTimeZone => "timezone" 42 | case _ : SearchedCaseExpression => "case" 43 | case _ : Extract => "date_part" 44 | case _ : CurrentTime => "?column?" 45 | case _ : InPredicate => "?column?" 46 | case _ : CoalesceExpression => "coalesce" 47 | case _ : LongLiteral => "?column?" 48 | case _ : ComparisonExpression => "?column?" 49 | case _ => "?column?" // unknown/default 50 | } 51 | 52 | /** Is the given name a built-in function? If so, all QualifiedNameReference nodes with this value will be interpreted 53 | * as functions rather than column references. 54 | */ 55 | def isBuiltInFunction(name: String): Boolean = { 56 | name == "sysdate" 57 | } 58 | 59 | /** Returns true if the given function's ordinal argument (0-indexed) is known to be a literal value, in which case 60 | * it should be interpreted as a literal value even if parsed as a QualifiedName reference because it may not 61 | * be quoted/escaped in the original query. 62 | */ 63 | def isFunctionArgumentLiteral(functionName: String, argNum: Int): Boolean = { 64 | // TODO: extend this. 65 | (argNum == 0) && (functionName == "datediff" || functionName == "timestampadd") 66 | } 67 | 68 | /** Normalizes the table name to a canonical representation, e.g., by stripping out namespace and/or optional prefixes. 69 | * This canonical table name should match the name provided in the schema config to ensure that schema information 70 | * can be retrieved for the table. 
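 * For example, with the Vertica-oriented logic below (the table name is hypothetical):
 * {{{
 *   DatabaseModel.normalizeTableName("public.trips")  // "trips"
 *   DatabaseModel.normalizeTableName("trips")         // unchanged: "trips"
 * }}}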
71 | */ 72 | def normalizeTableName(tableName: String) = { 73 | tableName.replaceAll("^public.", "") // strip any "public." prefix 74 | } 75 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/sql/AbstractAnalysis.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.sql 24 | 25 | import com.uber.engsec.dp.exception.DPException 26 | import com.uber.engsec.dp.schema.Database 27 | import com.uber.engsec.dp.util.IdentityHashMap 28 | 29 | import scala.collection.mutable 30 | 31 | /** Abstract class for all analyses on parsed SQL queries. 32 | * 33 | * @tparam N The node type for the tree (AST, dataflow graph, or relational algebra) 34 | * @tparam T The return type of the analysis. For column fact analysis, [T] derives from ColumnFacts[_]. For visitor 35 | * analyses, T is any object reference type. For abstract interpretation-based dataflow analyses, [T] 36 | * derives from AbstractDomain. 37 | */ 38 | abstract class AbstractAnalysis[N <: AnyRef, T] extends TreeFunctions[N] { 39 | 40 | /** Allows code to symbolically reference return type of an analysis (e.g., HistogramAnalysis#ResultType) */ 41 | type ResultType = T 42 | 43 | /****************************************************************************************************************** 44 | * Public methods for analysis callers. 45 | *******************************************************************************************************************/ 46 | 47 | /** Runs the analysis on the given query and returns the abstract results at tree root. 48 | */ 49 | final def analyzeQuery(query: String, database: Database): T = { 50 | try { 51 | val treeRoot = parseQueryToTree(query, database) 52 | run(treeRoot, database) 53 | } 54 | catch { 55 | case e: Exception => 56 | // Catch all exceptions that may occur during query parsing and analysis, and wrap in DPException type. 57 | throw new DPException("Error during query analysis", e) 58 | } 59 | } 60 | 61 | /** Runs the analysis on the given parsed representation of the query. 
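 * A usage sketch (HistogramAnalysis is one analysis in this repository; the query string and the
 * `database` handle are placeholders):
 * {{{
 *   val root    = QueryParser.parseToRelTree("SELECT count(*) FROM orders", database)
 *   val results = new HistogramAnalysis().analyzeQuery(root, database)
 * }}}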
62 | */ 63 | final def analyzeQuery(root: N, database: Database): T = { 64 | try { 65 | run(root, database) 66 | } 67 | catch { 68 | case e: Exception => 69 | // Catch all exceptions that may occur during query parsing and analysis, and wrap in DPException type. 70 | throw new DPException("Error during query analysis", e) 71 | } 72 | } 73 | 74 | /** Runs the analysis on the tree and returns the abstract result at the tree root. Subclasses may override this 75 | * method to pre-process the query before analysis begins, but must call super.run(). 76 | */ 77 | def run(root: N, database: Database): T = { 78 | try { 79 | treeRoot = Some(root) 80 | currentDb = Some(database) 81 | resultMap.clear() 82 | this.process(root) 83 | currentNode = None 84 | } 85 | finally { // Print the tree even if analysis throws an exception 86 | if (AbstractAnalysis.DEBUG) { 87 | System.out.println("\n********** " + this.getClass.getSimpleName + " **********") 88 | printTree(treeRoot.get) 89 | } 90 | } 91 | resultMap(root) 92 | } 93 | 94 | /** Returns the current database being queried. */ 95 | def getDatabase: Database = currentDb.get 96 | 97 | /** Runs the analysis on the tree and returns a map from nodes in the tree to analysis state at that node. */ 98 | def runAll(root: N, database: Database): mutable.HashMap[N, T] = { 99 | run(root, database) 100 | resultMap 101 | } 102 | 103 | /****************************************************************************************************************** 104 | * Analysis engine internals. 105 | ******************************************************************************************************************/ 106 | 107 | /** Map from each node in the tree to analysis results at that node. May be inspected by analysis implementations in 108 | * their transfer/join functions; results are guaranteed to exist for all nodes *below* the current node in the tree. 109 | */ 110 | val resultMap: mutable.HashMap[N, T] = new IdentityHashMap[N, T]() 111 | 112 | /** The root node of the tree under analysis. */ 113 | final var treeRoot: Option[N] = None 114 | 115 | /** The database being queried. */ 116 | final var currentDb: Option[Database] = None 117 | 118 | /** The current node being processed. Subclasses should update this variable as the tree is traversed to enable helpful 119 | * debugging when analysis throws an exception. 120 | */ 121 | var currentNode: Option[N] = None 122 | 123 | /** Analysis entry point. Runs analysis and stores results in resultMap. */ 124 | def process(root: N): Unit 125 | } 126 | 127 | object AbstractAnalysis { 128 | // Set the query.debug system property to print all analysis trees along with result state. 129 | val DEBUG: Boolean = System.getProperty("query.debug", "false").toBoolean 130 | } 131 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/sql/QueryParser.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc.
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.sql 24 | 25 | import com.facebook.presto.sql.parser.{SqlParser => PrestoSqlParser} 26 | import com.facebook.presto.sql.tree.{Query, Statement} 27 | import com.uber.engsec.dp.exception.ParsingException 28 | import com.uber.engsec.dp.schema.Database 29 | import com.uber.engsec.dp.sql.ast.{Transformer => ASTTransformer} 30 | import com.uber.engsec.dp.sql.dataflow_graph.Node 31 | import com.uber.engsec.dp.sql.relational_algebra.{Relation, Transformer => RelTransformer} 32 | 33 | /** Utility class for parsing SQL queries into different representations. 34 | */ 35 | object QueryParser { 36 | private val prestoParser: PrestoSqlParser = new PrestoSqlParser 37 | 38 | def printQuery(query: String, treeType: String): Unit = { 39 | if (AbstractAnalysis.DEBUG) { 40 | println(s">>>>>>>>>>>>>>>>>>>>>>>>>>>> Parsing query to ${treeType}:") 41 | println(query) 42 | println("<<<<<<<<<<<<<<<<<<<<<<<<<<<") 43 | } 44 | } 45 | 46 | /** Parse a SQL query into an AST (represented by a Presto tree) 47 | * @param query The SQL query to be parsed 48 | * @return The AST root node representing the query 49 | */ 50 | def parseToPrestoTree(query: String): Query = { 51 | printQuery(query, "presto tree") 52 | 53 | try { 54 | return prestoParser.createStatement(query).asInstanceOf[Query] 55 | } 56 | catch { 57 | case e: Exception => { 58 | // Catch all exceptions that occur during presto parsing and wrap them in our ParsingException exception type. 59 | throw new ParsingException(e.getMessage) 60 | } 61 | } 62 | } 63 | 64 | /** Parse a SQL query and transform it into a dataflow graph 65 | * @param query The SQL query to be parsed 66 | * @return The dataflow graph root node 67 | */ 68 | def parseToDataflowGraph(query: String, database: Database): Node = { 69 | printQuery(query, "dataflow graph") 70 | 71 | val prestoRoot: Statement = parseToPrestoTree(query) 72 | val transform = new ASTTransformer(database) 73 | transform.convertToDataflowGraph(prestoRoot) 74 | } 75 | 76 | /** Parse a SQL query and transform it into a relational algebra representation. 
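 * The three parse targets side by side (the SQL string and `database` handle are placeholders):
 * {{{
 *   val ast = QueryParser.parseToPrestoTree("SELECT count(*) FROM orders")              // Presto AST
 *   val dfg = QueryParser.parseToDataflowGraph("SELECT count(*) FROM orders", database) // dataflow graph
 *   val rel = QueryParser.parseToRelTree("SELECT count(*) FROM orders", database)       // relational algebra tree
 * }}}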
77 | * @param query The SQL query to be parsed 78 | * @return The relational algebra tree root node 79 | */ 80 | def parseToRelTree(query: String, database: Database): Relation = { 81 | printQuery(query, "relational algebra tree") 82 | 83 | val transformer = RelTransformer.create(database) 84 | val root = transformer.convertToRelTree(query) 85 | Relation(root) 86 | } 87 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/sql/TreeFunctions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.sql 24 | 25 | import com.uber.engsec.dp.schema.Database 26 | 27 | /** Common trait for all query representations (AST, dataflow graph, and relational algebra tree). 28 | * 29 | * @tparam N Node type of tree. 30 | */ 31 | abstract trait TreeFunctions[N] { 32 | /** Returns the children for the given node. 33 | */ 34 | def getNodeChildren(node: N): Iterable[N] 35 | 36 | /** Parses and converts the given SQL string query to this tree type. 37 | */ 38 | def parseQueryToTree(query: String, database: Database): N 39 | 40 | /** Prints the tree for debugging. 41 | */ 42 | def printTree(node: N): Unit 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/sql/ast/ASTFunctions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 
13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.sql.ast 24 | 25 | import com.facebook.presto.sql.tree._ 26 | import com.uber.engsec.dp.exception.{TransformationException, UnsupportedConstructException} 27 | import com.uber.engsec.dp.schema.Database 28 | import com.uber.engsec.dp.sql.{AbstractAnalysis, QueryParser, TreeFunctions, TreePrinter} 29 | 30 | trait ASTFunctions extends TreeFunctions[Node] { 31 | this: AbstractAnalysis[Node, _] => 32 | override def getNodeChildren(node: Node): Iterable[Node] = ASTFunctions.getChildren(node) 33 | override def parseQueryToTree(query: String, database: Database): Node = QueryParser.parseToPrestoTree(query) 34 | override def printTree(node: Node) = TreePrinter.printTreePresto(node, resultMap, currentNode) 35 | } 36 | 37 | object ASTFunctions { 38 | /** Returns the children of the given AST node. This is used to traverse ASTs in lieu of Presto's Java visitor interface. 39 | */ 40 | def getChildren(node: Node): Iterable[Node] = { 41 | import scala.collection.JavaConverters._ 42 | val result = node match { 43 | case _: IntervalLiteral => Nil 44 | case _: Literal => Nil 45 | case e: Explain => List(e.getStatement) ++ e.getOptions.asScala 46 | case e: ExistsPredicate => List(e.getSubquery) 47 | case e: Extract => List(e.getExpression) 48 | case c: Cast => List(c.getExpression) 49 | case a: ArithmeticBinaryExpression => List(a.getLeft, a.getRight) 50 | case b: BetweenPredicate => List(b.getMin, b.getMax, b.getValue) 51 | case c: CoalesceExpression => c.getOperands.asScala 52 | case a: AtTimeZone => List(a.getValue, a.getTimeZone) 53 | case a: ArrayConstructor => a.getValues.asScala 54 | case s: SubscriptExpression => List(s.getBase, s.getIndex) 55 | case c: ComparisonExpression => List(c.getLeft, c.getRight) 56 | case q: QualifiedNameReference => Nil 57 | case q: Query => stripOption(q.getWith) ++ List(q.getQueryBody) ++ q.getOrderBy.asScala 58 | case w: With => w.getQueries.asScala 59 | case w: WithQuery => List(w.getQuery) 60 | case s: Select => s.getSelectItems.asScala 61 | case s: SingleColumn => List(s.getExpression) 62 | case w: WhenClause => List(w.getOperand, w.getResult) 63 | case i: InPredicate => List(i.getValue, i.getValueList) 64 | case f: FunctionCall => f.getArguments.asScala ++ stripOption(f.getWindow) 65 | case d: DereferenceExpression => List(d.getBase) 66 | case w: Window => w.getOrderBy.asScala ++ w.getPartitionBy.asScala ++ stripOption(w.getFrame) 67 | case w: WindowFrame => List(w.getStart) ++ stripOption(w.getEnd) 68 | case f: FrameBound => if (f.getValue.isPresent) List(f.getValue.get) else Nil 69 | case s: SimpleCaseExpression => s.getWhenClauses.asScala ++ List(s.getOperand) ++ stripOption(s.getDefaultValue) 70 | case i: InListExpression => i.getValues.asScala 71 | case n: NullIfExpression => List(n.getFirst, n.getSecond) 72 | case i: IfExpression => List(i.getCondition, i.getTrueValue) ++ stripOption(i.getFalseValue) 73 | case t: TryExpression => List(t.getInnerExpression) 74 | case a: ArithmeticUnaryExpression => 
List(a.getValue) 75 | case n: NotExpression => List(n.getValue) 76 | case s: SearchedCaseExpression => s.getWhenClauses.asScala ++ stripOption(s.getDefaultValue) 77 | case l: LikePredicate => List(l.getValue, l.getPattern, l.getEscape) 78 | case i: IsNotNullPredicate => List(i.getValue) 79 | case i: IsNullPredicate => List(i.getValue) 80 | case l: LogicalBinaryExpression => List(l.getRight, l.getLeft) 81 | case s: SubqueryExpression => List(s.getQuery) 82 | case s: SortItem => List(s.getSortKey) 83 | case q: QuerySpecification => List(q.getSelect) ++ stripOption(q.getFrom) ++ stripOption(q.getWhere) ++ stripOption(q.getGroupBy) ++ stripOption(q.getHaving) 84 | case s: SetOperation => s.getRelations.asScala 85 | case v: Values => v.getRows.asScala 86 | case r: Row => r.getItems.asScala 87 | case t: Table => Nil 88 | case t: TableSubquery => List(t.getQuery) 89 | case a: AliasedRelation => List(a.getRelation) 90 | case s: SampledRelation => List(s.getRelation, s.getSamplePercentage) 91 | case j: Join => List(j.getLeft, j.getRight) ++ stripOption(j.getCriteria).collect{ case c: JoinOn => c.getExpression } 92 | case u: Unnest => u.getExpressions.asScala 93 | case g: GroupBy => g.getGroupingElements.asScala 94 | case s: SimpleGroupBy => s.getColumnExpressions.asScala 95 | case g: GroupingElement => g.enumerateGroupingSets.asScala.flatMap{ _.asScala } 96 | case i: Insert => illegalOperation(i) 97 | case d: Delete => illegalOperation(d) 98 | case c: CreateTableAsSelect => illegalOperation(c) 99 | case c: AllColumns => Nil 100 | case c: CurrentTime => Nil 101 | case _ => throw new UnsupportedConstructException("getChildren on unsupported AST node type: " + node.getClass.getSimpleName) 102 | } 103 | result.filter{ _ != null } 104 | } 105 | 106 | private[ast] def illegalOperation(node: Node): Nothing = throw new TransformationException("Found illegal/unsupported operation in query: " + node.getClass.toString) 107 | def stripOption[T](node: java.util.Optional[T]): List[T] = { if (node.isPresent) List(node.get) else Nil } 108 | } 109 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/sql/dataflow_graph/DataflowGraphFunctions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 
21 | */ 22 | 23 | package com.uber.engsec.dp.sql.dataflow_graph 24 | 25 | import com.uber.engsec.dp.schema.Database 26 | import com.uber.engsec.dp.sql.{AbstractAnalysis, QueryParser, TreeFunctions, TreePrinter} 27 | 28 | trait DataflowGraphFunctions extends TreeFunctions[Node] { 29 | this: AbstractAnalysis[Node, _] => 30 | override def getNodeChildren(node: Node): Iterable[Node] = node.children 31 | override def parseQueryToTree(query: String, database: Database): Node = QueryParser.parseToDataflowGraph(query, database) 32 | override def printTree(node: Node) = TreePrinter.printTree(node, resultMap, currentNode) 33 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/sql/dataflow_graph/DataflowGraphUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.sql.dataflow_graph 24 | 25 | import com.uber.engsec.dp.sql.dataflow_graph.reference.{ColumnReference, Function} 26 | import com.uber.engsec.dp.sql.dataflow_graph.relation.Join 27 | 28 | object DataflowGraphUtils { 29 | /** Extracts the left and right column indexes, respectively, used in an equijoin condition, or None if 30 | * the join node uses any other type of join condition (including an empty join condition). 31 | */ 32 | def extractEquiJoinColumns(node: Join): Option[(Int,Int)] = { 33 | node.condition.collect { 34 | case Function("EQUAL", ColumnReference(leftIdx, node.left) :: ColumnReference(rightIdx, node.right) :: Nil) => (leftIdx, rightIdx) 35 | case Function("EQUAL", ColumnReference(rightIdx, node.right) :: ColumnReference(leftIdx, node.left) :: Nil) => (leftIdx, rightIdx) 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/sql/dataflow_graph/Node.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.sql.dataflow_graph 24 | 25 | import com.facebook.presto.sql.tree.{Node => PrestoNode} 26 | 27 | import scala.collection.mutable 28 | 29 | /** A dataflow graph is a custom representation of a SQL query where all data dependencies are explicitly expressed via 30 | * graph edges. It can be used as the basis for both column-based and relation-based dataflow analyses. 31 | * 32 | * This class is the parent type of all dataflow graph nodes. 33 | */ 34 | abstract class Node(val prestoSource: Option[PrestoNode]) extends Traversable[Node] { 35 | 36 | val nodeStr: String 37 | val children: List[Node] 38 | 39 | /** Implementing the foreach method from Traversable gives access to many useful higher-order functions on dataflow 40 | * graphs including fold*, reduce*, exists, collect, etc. 41 | * 42 | * Since dataflow graphs may contain cycles, our implementation of foreach must keep track of which children nodes 43 | * have been traversed already. 44 | */ 45 | override def foreach[U](f: Node => U) = _foreach(f, new mutable.HashSet()) 46 | private def _foreach[U](f: Node => U, visited: mutable.HashSet[Node]): Unit = { 47 | if (visited.contains(this)) 48 | return 49 | visited += this 50 | 51 | f(this) 52 | children.foreach { 53 | _._foreach(f, visited) 54 | } 55 | } 56 | 57 | /** Optimized version of some traversable methods. 58 | */ 59 | override def isEmpty: Boolean = false 60 | override def head: Node = this 61 | // tail is inherited from TraversableLike (and implemented using foreach) 62 | 63 | override val hashCode: Int = super.hashCode 64 | 65 | // We override the equals method to ensure reference equality (by default, Scala uses structural equality for case classes) 66 | override def equals(that: Any): Boolean = 67 | that match { 68 | case ref: AnyRef => this eq ref 69 | case _ => false 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/sql/dataflow_graph/reference/ColumnReference.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.sql.dataflow_graph.reference 24 | 25 | import com.facebook.presto.sql.tree.{Node => PrestoNode} 26 | import com.uber.engsec.dp.sql.dataflow_graph.relation.Relation 27 | 28 | /** ColumnReference: A column reference is a node that reads a specific column (referenced by zero-indexed ordinal) from 29 | * a relation. For example, if my_table has columns [foo, bar, baz] then for query "SELECT baz from my_table", the 30 | * dataflow graph includes a ColumnReference node with .of pointing to relation DataTable[my_table] and .colIndex = 2. 31 | * 32 | * If you want to know the name of the column, you can ask the relation, e.g., this.of.getColumnName(this.colIndex) but 33 | * be aware that a column reference is not uniquely defined by the column name since relations in a dataflow graph can 34 | * have more than one column with the same name. 35 | */ 36 | case class ColumnReference(colIndex: Int, of: Relation)(implicit override val prestoSource: Option[PrestoNode] = None) 37 | extends Reference(prestoSource) { 38 | 39 | override val children = List( of ) 40 | 41 | override val nodeStr = colIndex.toString 42 | 43 | override def toString: String = s"${of.toString}.${colIndex.toString}" 44 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/sql/dataflow_graph/reference/Function.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.sql.dataflow_graph.reference 24 | 25 | import com.facebook.presto.sql.tree.{Node => PrestoNode} 26 | 27 | /** Function: A SQL function application, e.g., COUNT(x). In the most general sense, this node type captures 28 | * any SQL construct that can be modeled as a function of one or more references (the references themselves being 29 | * perhaps subtrees). 30 | */ 31 | case class Function(functionName: String, args: List[Reference] = Nil)(implicit override val prestoSource: Option[PrestoNode] = None) 32 | extends Reference(prestoSource) { 33 | 34 | override val children = args 35 | 36 | override val nodeStr = functionName 37 | 38 | override def toString: String = functionName 39 | } 40 | 41 | object Function { 42 | def apply(functionName: String, arg: Reference)(implicit prestoSource: Option[PrestoNode]) = new Function(functionName, List(arg))(prestoSource) 43 | def apply(functionName: String, args: Reference*)(implicit prestoSource: Option[PrestoNode]) = new Function(functionName, args.toList)(prestoSource) 44 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/sql/dataflow_graph/reference/Reference.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.sql.dataflow_graph.reference 24 | 25 | import com.facebook.presto.sql.tree.{Node => PrestoNode} 26 | import com.uber.engsec.dp.sql.dataflow_graph.Node 27 | 28 | /** Generic parent class for Reference nodes. 29 | * 30 | * Conceptually, a reference node captures a specific and well-defined data dependence into a relation, either by 31 | * direct column reference, e.g., "SELECT a.x from blah", or function application, e.g., "SELECT count(*) from blah". 32 | * In both examples, the part immediately after the SELECT is represented by a specific subclass of this class 33 | * which knows that it is executed w.r.t. relation "blah".
For functions, this is tracked by the 'args' field; for 34 | * ColumnReference, it's tracked by the 'of' field. 35 | */ 36 | abstract class Reference(override val prestoSource: Option[PrestoNode]) extends Node(prestoSource) 37 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/sql/dataflow_graph/reference/UnstructuredReference.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.sql.dataflow_graph.reference 24 | 25 | import com.facebook.presto.sql.tree.{Node => PrestoNode} 26 | import com.uber.engsec.dp.sql.dataflow_graph.Node 27 | 28 | /** An unstructured reference is a type of reference node that stores a relationship between children nodes without any 29 | * other semantic information. It is used as a "catch all" to represent SQL constructs that need no distinct 30 | * representation for our analyses but for which we wish to still capture a coarse data dependency between nodes. 31 | */ 32 | case class UnstructuredReference(refType: String, refChildren: List[Node] = Nil)(implicit override val prestoSource: Option[PrestoNode] = None) 33 | extends Reference(prestoSource) { 34 | 35 | override val children = refChildren 36 | 37 | override val nodeStr: String = refType 38 | 39 | override def toString: String = refType 40 | } 41 | 42 | object UnstructuredReference { 43 | def apply(refType: String, children: Node*)(implicit prestoSource: Option[PrestoNode]) = new UnstructuredReference(refType, children.toList)(prestoSource) 44 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/sql/dataflow_graph/relation/DataTable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.sql.dataflow_graph.relation 24 | 25 | import com.facebook.presto.sql.tree.{Node => PrestoNode} 26 | import com.uber.engsec.dp.schema.{Database, Schema} 27 | 28 | /** A DataTable is a leaf node of the dataflow graph that represents a table in the database. 29 | */ 30 | case class DataTable( 31 | name: String, 32 | database: Database, 33 | override val columnNames: IndexedSeq[String]) 34 | (implicit override val prestoSource: Option[PrestoNode] = None) 35 | extends Relation(columnNames, prestoSource ) { 36 | 37 | override val children = Nil 38 | 39 | override val nodeStr: String = "\"" + name + "\"" 40 | 41 | /** Metadata properties (from the schema config file) for the columns in this table. 42 | */ 43 | lazy val colProperties: IndexedSeq[Map[String,String]] = { 44 | val colMap = Schema.getSchemaMapForTable(database, name) 45 | columnNames.map { colName => colMap.get(colName).fold(Map.empty[String,String])(_.properties) } 46 | } 47 | 48 | override def toString: String = name 49 | } 50 | 51 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/sql/dataflow_graph/relation/Except.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.sql.dataflow_graph.relation 24 | 25 | import com.facebook.presto.sql.tree.{Node => PrestoNode} 26 | 27 | /** A relation created from SQL's EXCEPT clause. 28 | */ 29 | case class Except(val left: Relation, val right: Relation)(implicit override val prestoSource: Option[PrestoNode] = None) 30 | extends Relation(left.columnNames, prestoSource ) { 31 | 32 | override val children = List(left, right) 33 | 34 | override val nodeStr: String = "\"EXCEPT\"" 35 | 36 | override def toString: String = "EXCEPT" 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/sql/dataflow_graph/relation/Join.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.sql.dataflow_graph.relation 24 | 25 | import com.facebook.presto.sql.tree.{Node => PrestoNode} 26 | import com.uber.engsec.dp.exception.AmbiguousColumnReference 27 | import com.uber.engsec.dp.sql.dataflow_graph.reference.Reference 28 | 29 | /** A relation created by JOIN. 
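 * For example, join types are parsed from their SQL names (implicit joins are treated as cross joins):
 * {{{
 *   JoinType.parse("LEFT")      // JoinType.LEFT
 *   JoinType.parse("IMPLICIT")  // JoinType.CROSS
 * }}}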
30 | */ 31 | object JoinType { 32 | sealed trait JoinType 33 | case object CROSS extends JoinType 34 | case object INNER extends JoinType 35 | case object LEFT extends JoinType 36 | case object RIGHT extends JoinType 37 | case object FULL extends JoinType 38 | 39 | def parse(name: String): JoinType = { 40 | name match { 41 | case "CROSS" => CROSS 42 | case "INNER" => INNER 43 | case "LEFT" => LEFT 44 | case "RIGHT" => RIGHT 45 | case "FULL" => FULL 46 | case "IMPLICIT" => CROSS // implicit joins are effectively cross joins 47 | case _ => throw new IllegalArgumentException(s"Unknown join type: $name") 48 | } 49 | } 50 | } 51 | 52 | case class Join( 53 | left: Relation, 54 | right: Relation, 55 | joinType: JoinType.JoinType, 56 | condition: Option[Reference] = None) 57 | (implicit override val prestoSource: Option[PrestoNode] = None) 58 | extends Relation(left.columnNames ++ right.columnNames, prestoSource) { 59 | 60 | val children = List(left, right) ++ (if (condition.isDefined) List(condition.get) else Nil) 61 | 62 | /** Returns the column index for the column appearing in the *specified* inner relation. This is a substitute for the 63 | * Relation.getColumnIndex method for cases where the select item references a joined table by name. For example, 64 | * if table1 and table2 both have a column "uuid", this method can be used to resolve the correct index for the 65 | * query: SELECT table2.uuid from table1 JOIN table2 ... 66 | * 67 | * Returns the index of the specific column, or None if the column doesn't exist. 68 | */ 69 | def getColumnIndexForInnerRelation(colName: String, innerRelation: Relation): Option[Int] = { 70 | 71 | def visitRelation(indexSoFar: Int, relation: Relation): Option[Int] = 72 | 73 | if (relation == innerRelation) { 74 | // We found the specified inner relation. Return. 75 | val result = relation.getColumnIndexes(colName) 76 | result.size match { 77 | case 0 => None // Column not found 78 | case 1 => Some(indexSoFar + result(0)) 79 | case _ => throw new AmbiguousColumnReference("Relation " + this.toString + " has more than one column named " + colName) 80 | } 81 | 82 | } else { 83 | relation match { 84 | case join: Join => 85 | // We are processing a Join node. Call the visitRelation method recursively on both the left and right relations 86 | val leftResult = visitRelation(indexSoFar, join.left) 87 | val rightResult = visitRelation(indexSoFar + join.left.numCols, join.right) 88 | 89 | (leftResult, rightResult) match { 90 | case (Some(a), None) => Some(a) // we found the column in .left 91 | case (None, Some(b)) => Some(b) // we found the column in .right 92 | case (None, None) => None 93 | case (Some(a), Some(b)) => 94 | if (join.left == join.right) // a table is joined with itself (so the graph node is shared, and both sides match). Return the left one; it doesn't matter. 
95 | Some(a) 96 | else // This should never happen 97 | throw new AmbiguousColumnReference("Children of relation " + this.toString + " both match target inner relation node and have a column named " + colName) 98 | } 99 | 100 | case _ => None 101 | } 102 | } 103 | 104 | visitRelation(0, this) 105 | } 106 | 107 | override val nodeStr = joinType.toString 108 | override def toString = joinType.toString + " JOIN" 109 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/sql/dataflow_graph/relation/Relation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.sql.dataflow_graph.relation 24 | 25 | import com.facebook.presto.sql.tree.{Node => PrestoNode} 26 | import com.uber.engsec.dp.sql.dataflow_graph.Node 27 | 28 | /** Generic parent class for all relations in dataflow graphs, which includes: the result of the entire query, named 29 | * tables in the database, and subqueries. 30 | */ 31 | abstract class Relation( 32 | val columnNames: IndexedSeq[String], // Ordered list of columns in this relation. We use IndexedSeq rather 33 | // than List to ensure fast lookups by index, an operation performed 34 | // frequently by analyses. 35 | override val prestoSource: Option[PrestoNode] = None) 36 | extends Node(prestoSource) { 37 | /** Optimization: because we frequently perform lookups by column name, we maintain a map to do the lookup 38 | * without having to loop over the list. 39 | * 40 | * Note that in Vertica, column references by name are case-insensitive, i.e., the following queries are valid: 41 | * WITH t1 as (SELECT a as BLAH) select blah from t1" 42 | * WITH t1 as (SELECT a as BLAH) select Blah from t1" 43 | * so although we preserve case in the schema because it determines output column names, we perform name-to-index 44 | * lookups without considering case. 45 | */ 46 | private val colIndexMap: Map[String, List[Int]] = 47 | columnNames.map{ _.toUpperCase } 48 | .zipWithIndex 49 | .groupBy(_._1) 50 | .map { case (k,v) => (k,v.map(_._2).toList) } 51 | 52 | /** Returns the number of columns in this relation. 
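 * For example, for a relation `rel` with columns ["uuid", "Fare", "fare"] (hypothetical):
 * {{{
 *   rel.numCols                   // 3
 *   rel.getColumnIndexes("FARE")  // List(1, 2) -- name lookups are case-insensitive
 *   rel.getColumnName(0)          // "uuid"
 * }}}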
53 | */ 54 | final def numCols: Int = columnNames.size 55 | 56 | /** Returns the index(es) of the column(s) with the given name, or Nil if this relation does not contain the given column. 57 | */ 58 | def getColumnIndexes(colName: String): List[Int] = colIndexMap.getOrElse(colName.toUpperCase, Nil) 59 | 60 | /** Returns the name of the column at the given ordinal. 61 | */ 62 | def getColumnName(index: Int): String = columnNames(index) 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/sql/dataflow_graph/relation/Select.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.sql.dataflow_graph.relation 24 | 25 | import com.facebook.presto.sql.tree.{SingleColumn, Node => PrestoNode} 26 | import com.uber.engsec.dp.sql.dataflow_graph.reference.Reference 27 | 28 | /** Select: A relation created with SQL's SELECT. Note that in dataflow graphs, a select node has no "from" field, 29 | * as this information is explicitly encoded inside each column reference, which maintains a pointer to the node 30 | * to which it refers. 31 | */ 32 | case class Select( 33 | items: List[SelectItem], 34 | where: Option[Reference] = None, 35 | groupBy: List[Int] = Nil) 36 | (implicit override val prestoSource: Option[PrestoNode] = None) 37 | extends Relation(items.map{ _.as }.toIndexedSeq, prestoSource ) { 38 | 39 | override val children = items.map{ _.ref } ++ List(where).flatten ++ groupBy.map{ items(_).ref } 40 | 41 | override val nodeStr : String = "" 42 | 43 | override def toString: String = items.toString 44 | } 45 | 46 | /** A selection of a single column. 47 | */ 48 | case class SelectItem(as: String, ref: Reference, prestoSource: Option[SingleColumn] = None) 49 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/sql/dataflow_graph/relation/Union.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.sql.dataflow_graph.relation 24 | 25 | import com.facebook.presto.sql.tree.{Node => PrestoNode} 26 | 27 | /** A relation created from the UNION of two or more relations. 28 | */ 29 | case class Union(val relations: List[Relation])(implicit override val prestoSource: Option[PrestoNode] = None) 30 | extends Relation(relations.head.columnNames, prestoSource ) { 31 | 32 | override val children = relations 33 | 34 | override val nodeStr: String = "\"UNION\"" 35 | 36 | override def toString: String = "UNION" 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/sql/relational_algebra/RelOrExpr.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.sql.relational_algebra 24 | 25 | import org.apache.calcite.rel.RelNode 26 | import org.apache.calcite.rex.RexNode 27 | 28 | /** Wrapper for union type RelNode | RexNode, the root node type for relational algebra trees. 
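 *
 * A usage sketch (the values `relNode: RelNode` and `rexNode: RexNode` below are assumed to be Calcite nodes
 * obtained elsewhere, e.g. from the unwrapped result of QueryParser.parseToRelTree):
 * {{{
 *   val rel: RelOrExpr = Relation(relNode)        // wrap a relational operator explicitly...
 *   val expr: RelOrExpr = rexNode                 // ...or rely on the implicit conversions in the companion object
 *   val node: RelNode = Relation(relNode).unwrap  // recover the underlying Calcite node
 *   val subtreeSize = rel.size                    // Traversable ops (size, exists, collect, ...) walk the subtree
 * }}}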
29 | */ 30 | sealed abstract class RelOrExpr extends Traversable[RelOrExpr] { 31 | 32 | /** Implementing the foreach method from Traversable gives access to many useful higher-order functions on relational 33 | * algebra trees fold*, reduce*, exists, collect, etc. 34 | */ 35 | override def foreach[U](f: RelOrExpr => U): Unit = { 36 | f(this) 37 | RelTreeFunctions.getChildren(this).foreach { _.foreach(f) } 38 | } 39 | 40 | /** Optimized version of some traversable methods. 41 | */ 42 | override def isEmpty: Boolean = false 43 | override def head: RelOrExpr = this 44 | // tail is inherited from TraversableLike 45 | 46 | /** Returns the underlying node element. 47 | */ 48 | def unwrap: AnyRef 49 | } 50 | 51 | case class Relation(node: RelNode) extends RelOrExpr { 52 | override def hashCode: Int = System.identityHashCode(node) 53 | override def equals(other: Any): Boolean = other match { 54 | case other: Relation => other.node eq node 55 | case _ => false 56 | } 57 | override def unwrap: RelNode = node 58 | override def toString: String = node.toString 59 | } 60 | 61 | case class Expression(node: RexNode) extends RelOrExpr { 62 | override def hashCode: Int = System.identityHashCode(node) 63 | override def equals(other: Any): Boolean = other match { 64 | case other: Expression => other.node eq node 65 | case _ => false 66 | } 67 | override def unwrap: RexNode = node 68 | override def toString: String = node.toString 69 | } 70 | 71 | /** Conversions to and from RelOrExpr */ 72 | object RelOrExpr { 73 | import scala.language.implicitConversions 74 | implicit def rel2Sum(node: RelNode): RelOrExpr = Relation(node) 75 | implicit def rex2Sum(node: RexNode): RelOrExpr = Expression(node) 76 | 77 | implicit def relIterable2Sum(nodes: Iterable[RelNode]): Iterable[RelOrExpr] = nodes.map{Relation} 78 | implicit def rexIterable2Sum(nodes: Iterable[RexNode]): Iterable[RelOrExpr] = nodes.map{Expression} 79 | 80 | implicit def sum2Rel(rel: Relation): RelNode = rel.node 81 | implicit def sum2Rex(rex: Expression): RexNode = rex.node 82 | } 83 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/sql/relational_algebra/RelTreeFunctions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 
21 | */ 22 | 23 | package com.uber.engsec.dp.sql.relational_algebra 24 | 25 | import com.uber.engsec.dp.schema.Database 26 | import com.uber.engsec.dp.sql.{AbstractAnalysis, QueryParser, TreeFunctions, TreePrinter} 27 | import org.apache.calcite.rel.core._ 28 | import org.apache.calcite.rex._ 29 | 30 | import scala.collection.JavaConverters._ 31 | 32 | /** Common trait for analyses on relational algebra trees. 33 | */ 34 | trait RelTreeFunctions extends TreeFunctions[RelOrExpr] { 35 | this: AbstractAnalysis[RelOrExpr, _] => 36 | override def getNodeChildren(node: RelOrExpr): Iterable[RelOrExpr] = RelTreeFunctions.getChildren(node) 37 | 38 | override def parseQueryToTree(query: String, database: Database): RelOrExpr = { 39 | QueryParser.parseToRelTree(query, database) 40 | } 41 | 42 | override def printTree(node: RelOrExpr): Unit = TreePrinter.printRelTree(node, resultMap, currentNode) 43 | } 44 | 45 | object RelTreeFunctions { 46 | def getChildren(node: RelOrExpr): Iterable[RelOrExpr] = node match { 47 | case Relation(p: Project) => Relation(p.getInput) :: p.getProjects.asScala.map{Expression}.toList 48 | case Relation(a: Aggregate) => List(a.getInput) 49 | case Relation(t: TableScan) => Nil 50 | case Relation(j: Join) => j.getInputs.asScala.map{Relation} ++ List(Expression(j.getCondition)) 51 | case Relation(c: Correlate) => c.getInputs.asScala.map{Relation} 52 | case Relation(f: Filter) => Relation(f.getInput) :: Expression(f.getCondition) :: Nil 53 | case Relation(s: Sort) => (Relation(s.getInput) :: Expression(s.fetch) :: Expression(s.offset) :: Nil).filter{ _.unwrap != null } 54 | case Relation(v: Values) => Nil 55 | case Relation(u: SetOp) => u.getInputs.asScala.map{Relation} 56 | 57 | case Expression(c: RexCall) => c.operands.asScala 58 | case Expression(i: RexInputRef) => Nil 59 | case Expression(l: RexLiteral) => Nil 60 | case Expression(f: RexFieldAccess) => List(f.getReferenceExpr) 61 | case Expression(c: RexCorrelVariable) => Nil 62 | case Expression(e) => throw new RuntimeException("Unimplemented: " + e.getClass.getSimpleName) 63 | case Relation(e) => throw new RuntimeException("Unimplemented: " + e.getClass.getSimpleName) 64 | } 65 | } -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/sql/relational_algebra/RelUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 | * THE SOFTWARE.
21 | */
22 |
23 | package com.uber.engsec.dp.sql.relational_algebra
24 |
25 | import com.uber.engsec.dp.rewriting.rules.Expr.{ColumnReferenceByOrdinal, col}
26 | import com.uber.engsec.dp.schema.{Database, Schema}
27 | import org.apache.calcite.plan.hep.{HepPlanner, HepProgram}
28 | import org.apache.calcite.rel.RelNode
29 | import org.apache.calcite.rel.core.{Aggregate, Join, TableScan}
30 | import org.apache.calcite.rel.rel2sql.RelToSqlConverter
31 | import org.apache.calcite.rel.rules.FilterJoinRule
32 | import org.apache.calcite.rex.{RexCall, RexInputRef, RexNode}
33 | import org.apache.calcite.sql.SqlDialect.DatabaseProduct
34 | import org.apache.calcite.sql.SqlKind
35 |
36 | object RelUtils {
37 | /** Extracts the left and right column indexes, respectively, used in an equijoin condition, or None if the join node
38 | * uses any other type of join condition (including an empty join condition). Note the returned indexes are relative
39 | * to the schemas of the left/right relations rather than the schema of the join.
40 | *
41 | * @param node The join node
42 | * @param condition The clause of the join condition. If desired, caller can decompose AND-clauses in join condition
43 | * using the [decomposeConjunction] method, and call this method on each clause.
44 | * @return The indices of the equijoin columns, or None if the clause is not an equijoin.
45 | */
46 | def extractEquiJoinColumns(node: Join, condition: RexNode): Option[(Int,Int)] = {
47 | condition match {
48 | case c: RexCall if c.op.kind == SqlKind.EQUALS && c.operands.size == 2 =>
49 |
50 | val numColsLeft = node.getLeft.getRowType.getFieldCount
51 |
52 | (c.getOperands.get(0), c.getOperands.get(1)) match {
53 | case (first: RexInputRef, second: RexInputRef) =>
54 | val firstIdx = first.getIndex
55 | val secondIdx = second.getIndex
56 |
57 | if ((firstIdx < numColsLeft) && (secondIdx >= numColsLeft))
58 | Some((firstIdx, secondIdx-numColsLeft))
59 | else if ((secondIdx < numColsLeft) && (firstIdx >= numColsLeft))
60 | Some((secondIdx, firstIdx-numColsLeft))
61 | else
62 | None
63 |
64 | case _ => None
65 | }
66 |
67 | case _ => None
68 | }
69 | }
70 |
71 | /** Decomposes a given expression into a list of conjunctive clauses. */
72 | def decomposeConjunction(expression: RexNode): List[RexNode] = {
73 | import scala.collection.JavaConverters._
74 |
75 | if (expression == null || expression.isAlwaysTrue)
76 | return Nil // no clauses to decompose for a null or always-true condition
77 |
78 | expression match {
79 | case c: RexCall if c.isA(SqlKind.AND) => c.getOperands.asScala.flatMap{ decomposeConjunction }.toList
80 | case _ => List(expression)
81 | }
82 | }
83 |
84 | /** Returns the grouped columns of an aggregation node
85 | *
86 | * @param agg The target aggregation node
87 | * @return A list of column expressions which reference each grouping column in the aggregation.
88 | */
89 | def getGroupedCols(agg: Aggregate): Seq[ColumnReferenceByOrdinal] = {
90 | import scala.collection.JavaConverters._
91 | agg.getGroupSet.asList.asScala.map { col(_) }
92 | }
93 |
94 | /** Converts the given relational algebra tree to a SQL string.
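 *
 * For example (a sketch; `root` is assumed to be a Calcite RelNode, e.g. the unwrapped result of
 * QueryParser.parseToRelTree, and the dialect name must match a Calcite DatabaseProduct such as "hive"):
 * {{{
 *   val sql: String = RelUtils.relToSql(root, "hive")
 * }}}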
95 | */ 96 | def relToSql(rel: RelNode, dialect: String): String = { 97 | val _dialect = DatabaseProduct.valueOf(dialect.toUpperCase).getDialect 98 | val converter = new RelToSqlConverter(_dialect) 99 | converter.visitChild(0, rel).asStatement.toSqlString(_dialect).getSql 100 | } 101 | 102 | /** Returns a new tree with filter predicates pushed down into join nodes (where possible). 103 | * For example, the tree representing the following query: 104 | * 105 | * SELECT * FROM a JOIN b WHERE a.x = b.x 106 | * 107 | * would be transformed into: 108 | * 109 | * SELECT * FROM a JOIN b ON a.x = b.x 110 | */ 111 | def pushFiltersOnJoins(rel: RelNode): RelNode = { 112 | val program = HepProgram.builder.addRuleInstance(FilterJoinRule.FILTER_ON_JOIN).build 113 | val optPlanner = new HepPlanner(program) 114 | optPlanner.setRoot(rel) 115 | optPlanner.findBestExp 116 | } 117 | 118 | /****************************************************************************************************************** 119 | * Helper methods, may be called by analyses and rewriters. 120 | ****************************************************************************************************************/ 121 | 122 | /** Returns the fully qualified name of the table represented by the given TableScan node. 123 | */ 124 | def getQualifiedTableName(node: TableScan): String = { 125 | import scala.collection.JavaConverters._ 126 | node.getTable.getQualifiedName.asScala.mkString(".") 127 | } 128 | 129 | /** Retrieves the config properties for the database table represented by the given TableScan node. 130 | * 131 | * @param node Node representing target table. 132 | * @return Map of table properties, or empty map if no config is defined for the table. 133 | */ 134 | def getTableProperties(node: TableScan, database: Database): Map[String, String] = { 135 | val tableName = getQualifiedTableName(node) 136 | Schema.getTableProperties(database, tableName) 137 | } 138 | 139 | /** Retrieves the config properties for a specific column in the given table. 140 | * 141 | * @param node Node representing target table. 142 | * @param colIdx Column ordinal in target table. 143 | * @return Map of column properties, or empty map if no config is defined for the column. 144 | */ 145 | def getColumnProperties(node: TableScan, colIdx: Int, database: Database): Map[String, String] = { 146 | val tableName = RelUtils.getQualifiedTableName(node) 147 | val colName = node.getRowType.getFieldNames.get(colIdx) 148 | Schema.getSchemaMapForTable(database, tableName).get(colName).map { _.properties }.getOrElse{ Map.empty } 149 | } 150 | 151 | def getColumnProperty[T](propName: String, node: TableScan, colIdx: Int, database: Database): Option[T] = { 152 | val tableName = RelUtils.getQualifiedTableName(node) 153 | val colName = node.getRowType.getFieldNames.get(colIdx) 154 | Schema.getSchemaMapForTable(database, tableName).get(colName).get.get[T](propName) 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/util/ElasticSensitivity.scala: -------------------------------------------------------------------------------- 1 | package com.uber.engsec.dp.util 2 | 3 | import com.uber.engsec.dp.analysis.differential_privacy.ElasticSensitivityAnalysis 4 | import com.uber.engsec.dp.schema.Database 5 | import com.uber.engsec.dp.sql.QueryParser 6 | import com.uber.engsec.dp.sql.relational_algebra.Relation 7 | 8 | /** Utility methods for elastic sensitivity-based differential privacy. 
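 *
 * A typical end-to-end use (a sketch; `query` is a SQL string, `database` comes from Schema.getDatabase, and
 * `trueResult` is the query's non-private answer):
 * {{{
 *   val noisyResult = ElasticSensitivity.addNoise(query, database, trueResult, epsilon = 0.1, delta = 1e-10)
 * }}}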
*/ 9 | object ElasticSensitivity { 10 | /** Generate Laplace noise centered at 0 with the given scale. 11 | * 12 | * @param scale The scale of the noise 13 | * @return A single random number drawn from the distribution 14 | */ 15 | def laplace(scale: Double): Double = { 16 | val u = 0.5 - scala.util.Random.nextDouble() 17 | -math.signum(u) * scale * math.log(1 - 2*math.abs(u)) 18 | } 19 | 20 | /** Compute the elastic sensitivity of the query at distance k. 21 | * 22 | * Note: when calculating elastic sensitivity for sequential values of k (e.g., to use a smoothing function), use the 23 | * stream method below, which caches the query parse tree and is therefore much more efficient. 24 | * 25 | * @param query The input query 26 | * @param k The desired distance from the true database 27 | * @return Elastic sensitivity of query at distance k 28 | */ 29 | def elasticSensitivity(query: Relation, database: Database, k: Int): Double = { 30 | val analysis = new ElasticSensitivityAnalysis() 31 | analysis.setK(k) 32 | 33 | val result = analysis.analyzeQuery(query, database).colFacts 34 | assert (result.size == 1) // this function works for single-column queries. 35 | result.head.sensitivity.get 36 | } 37 | 38 | /** Returns a (lazily evaluated) stream of elastic sensitivities of the given column for the query at every distance k. 39 | * 40 | * @param query The input query 41 | * @return Elastic sensitivities for every distance k from the true database (k = 0, 1, 2, ...) 42 | */ 43 | def elasticSensitivityStream(query: Relation, database: Database, col: Int): Stream[Double] = { 44 | val analysis = new ElasticSensitivityAnalysis() 45 | 46 | Stream.from(0).map{ k => 47 | analysis.setK(k) 48 | val result = analysis.analyzeQuery(query, database).colFacts 49 | result(col).sensitivity.get 50 | } 51 | } 52 | 53 | /** Compute the smoothed elastic sensitivity for a given column of the query with a given epsilon. 54 | * 55 | * @param query The input query 56 | * @param col The index of the target column (0-based) 57 | * @param epsilon The desired privacy budget 58 | * @param delta The value of the delta parameter 59 | * @return The smoothed elastic sensitivity 60 | */ 61 | def smoothElasticSensitivity(query: Relation, database: Database, col: Int, epsilon: Double, delta: Double): Double = { 62 | /** Calculates the smooth elastic sensitivity by recursively computing smooth sensitivity for each value of k 63 | * until the function decreases at k+1. Since elastic sensitivity increases polynomially (at worst) in k while the 64 | * smoothing factor decays exponentially in k, this provides the correct (maximum) smooth sensitivity without 65 | * requiring computation for every k up to the size of the database. 66 | */ 67 | def sensitivityAtDistance(k: Int, prevSensitivity: Double, esStream: Stream[Double]): Double = { 68 | val elasticSensitivityAtK = esStream.head 69 | val beta = epsilon / (2 * Math.log(2 / delta)) 70 | val smoothSensitivity = Math.exp(-k * beta) * elasticSensitivityAtK 71 | 72 | if ((elasticSensitivityAtK == 0) || (smoothSensitivity < prevSensitivity)) prevSensitivity 73 | else sensitivityAtDistance(k+1, smoothSensitivity, esStream.tail) 74 | } 75 | 76 | sensitivityAtDistance(0, 0, elasticSensitivityStream(query, database, col)) 77 | } 78 | 79 | /** Produce a differentially private result for a query given its non-private result and the desired privacy budget. 80 | * 81 | * @param query The input query. It must return a single row and single column. 
82 | * @param result The non-private result of running the query (a single number). 83 | * @param epsilon The desired privacy budget (e.g. 0.1). 84 | * @param delta The desired delta parameter (e.g. 1/n^2) 85 | * @return A differentially private answer to the input query. 86 | */ 87 | def addNoise(query: String, database: Database, result: Double, epsilon: Double, delta: Double): Double = { 88 | val tree = QueryParser.parseToRelTree(query, database) 89 | val sensitivity = ElasticSensitivity.smoothElasticSensitivity(tree, database, 0, epsilon, delta) 90 | result + laplace(2 * sensitivity / epsilon) 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/main/scala/com/uber/engsec/dp/util/IdentityHashMap.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.util 24 | 25 | import scala.collection.generic.CanBuildFrom 26 | import scala.collection.mutable 27 | 28 | /** Identity hash map: compares objects by object ID, not value. 
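 *
 * A small sketch of the semantics (two strings that are == but not eq):
 * {{{
 *   val k1 = new String("key")
 *   val k2 = new String("key")
 *   val m = IdentityHashMap(k1 -> 1)
 *   m.get(k1)   // Some(1)
 *   m.get(k2)   // None: lookups use reference equality (eq), not ==
 * }}}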
29 | */ 30 | final class IdentityHashMap[A <: AnyRef, B]() extends mutable.HashMap[A, B] with mutable.MapLike[A, B, IdentityHashMap[A, B]] { 31 | override protected def elemEquals(key1: A, key2: A): Boolean = key1 eq key2 32 | override protected def elemHashCode(key: A) = System.identityHashCode(key) 33 | override def empty: IdentityHashMap[A, B] = IdentityHashMap.empty 34 | } 35 | 36 | object IdentityHashMap { 37 | type Coll = IdentityHashMap[_, _] 38 | 39 | implicit def canBuildFrom[A <: AnyRef, B] = new CanBuildFrom[Coll, (A, B), IdentityHashMap[A, B]] { 40 | def apply() = newBuilder[A, B] 41 | def apply(from: Coll) = { 42 | val builder = newBuilder[A, B] 43 | builder.sizeHint(from.size) 44 | builder 45 | } 46 | } 47 | 48 | def empty[A <: AnyRef, B]: IdentityHashMap[A, B] = new IdentityHashMap[A, B] 49 | 50 | def newBuilder[A <: AnyRef, B] = new mutable.MapBuilder[A, B, IdentityHashMap[A, B]](empty[A, B]) { 51 | override def +=(x: (A, B)): this.type = { 52 | elems += x 53 | this 54 | } 55 | override def sizeHint(size: Int): Unit = elems.sizeHint(size) 56 | } 57 | 58 | def apply[A <: AnyRef, B](elems: (A, B)*) = (newBuilder[A, B] ++= elems).result() 59 | } -------------------------------------------------------------------------------- /src/main/scala/examples/ElasticSensitivityExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package examples 24 | 25 | import com.uber.engsec.dp.schema.Schema 26 | import com.uber.engsec.dp.util.ElasticSensitivity 27 | 28 | /** A simple differential privacy example using elastic sensitivity. 29 | * 30 | * This example code supports queries that return a single column and single row. The code can be extended to support 31 | * queries returning multiple columns and rows by generating independent noise samples for each cell based the 32 | * appropriate column sensitivity. 33 | * 34 | * Caveats: 35 | * 36 | * Histogram queries (using SQL's GROUP BY) must be handled carefully so as not to leak information in the bin labels. 37 | * The analysis throws an error to warn about this, but this behavior can overridden if you know what you're doing. 38 | * 39 | * This example does not implement a privacy budget management strategy. Each query is executed using the full budget 40 | * value of EPSILON. 
Correct use of differential privacy requires allocating a fixed privacy from which a portion is 41 | * depleted to run each query. A privacy budget strategy depends on the problem domain and threat model and is 42 | * therefore beyond the scope of this tool. 43 | */ 44 | object ElasticSensitivityExample extends App { 45 | // Use the table schemas and metadata defined by the test classes 46 | System.setProperty("schema.config.path", "src/test/resources/schema.yaml") 47 | val database = Schema.getDatabase("test") 48 | 49 | // example query: How many US customers ordered product #1? 50 | val query = """ 51 | SELECT COUNT(*) FROM orders 52 | JOIN customers ON orders.customer_id = customers.customer_id 53 | WHERE orders.product_id = 1 AND customers.address LIKE '%United States%' 54 | """ 55 | 56 | // query result when executed on the database 57 | val QUERY_RESULT = 100000 58 | 59 | // privacy budget 60 | val EPSILON = 0.1 61 | // delta parameter: use 1/n^2, with n = 100000 62 | val DELTA = 1 / (math.pow(100000,2)) 63 | 64 | println(s"Query: $query") 65 | println(s"Private result: $QUERY_RESULT\n") 66 | 67 | (1 to 10).foreach { i => 68 | val noisyResult = ElasticSensitivity.addNoise(query, database, QUERY_RESULT, EPSILON, DELTA) 69 | println(s"Noisy result (run $i): %.0f".format(noisyResult)) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/scala/examples/QueryRewritingExample.scala: -------------------------------------------------------------------------------- 1 | package examples 2 | 3 | import com.uber.engsec.dp.analysis.histogram.HistogramAnalysis 4 | import com.uber.engsec.dp.rewriting.differential_privacy.{ElasticSensitivityConfig, ElasticSensitivityRewriter, SampleAndAggregateConfig, SampleAndAggregateRewriter} 5 | import com.uber.engsec.dp.schema.Schema 6 | import com.uber.engsec.dp.sql.QueryParser 7 | import com.uber.engsec.dp.util.ElasticSensitivity 8 | 9 | /** A simple example demonstrating query rewriting for differential privacy. 10 | */ 11 | object QueryRewritingExample extends App { 12 | // Use the table schemas and metadata defined by the test classes 13 | System.setProperty("schema.config.path", "src/test/resources/schema.yaml") 14 | val database = Schema.getDatabase("test") 15 | 16 | // privacy budget 17 | val EPSILON = 0.1 18 | // delta parameter: use 1/n^2, with n = 100000 19 | val DELTA = 1 / (math.pow(100000,2)) 20 | 21 | // Helper function to print queries with indentation. 22 | def printQuery(query: String) = println(s"\n " + query.replaceAll("\\n", s"\n ") + "\n") 23 | 24 | def elasticSensitivityExample() = { 25 | println("*** Elastic sensitivity example ***") 26 | 27 | // Example query: How many US customers ordered product #1? 28 | val query = """ 29 | |SELECT COUNT(*) FROM orders 30 | |JOIN customers ON orders.customer_id = customers.customer_id 31 | |WHERE orders.product_id = 1 AND customers.address LIKE '%United States%'""" 32 | .stripMargin.stripPrefix("\n") 33 | 34 | // Print the example query and privacy budget 35 | val root = QueryParser.parseToRelTree(query, database) 36 | println("Original query:") 37 | printQuery(query) 38 | println(s"> Epsilon: $EPSILON") 39 | 40 | // Compute mechanism parameter values from the query. Note the rewriter does this automatically; here we calculate 41 | // the values manually so we can print them. 
42 | val elasticSensitivity = ElasticSensitivity.smoothElasticSensitivity(root, database, 0, EPSILON, DELTA) 43 | println(s"> Elastic sensitivity of this query: $elasticSensitivity") 44 | println(s"> Required scale of Laplace noise: 2 * $elasticSensitivity / $EPSILON = ${2 * elasticSensitivity/EPSILON}") 45 | 46 | // Rewrite the original query to enforce differential privacy using Elastic Sensitivity. 47 | println("\nRewritten query:") 48 | val config = new ElasticSensitivityConfig(EPSILON, DELTA, database) 49 | val rewrittenQuery = new ElasticSensitivityRewriter(config).run(query) 50 | printQuery(rewrittenQuery.toSql()) 51 | } 52 | 53 | def sampleAndAggregateExample() = { 54 | println("*** Sample and aggregate example ***") 55 | val LAMBDA = 2.0 56 | 57 | // Example query: What is the average cost of orders for product 1? 58 | val query = """ 59 | |SELECT AVG(order_cost) FROM orders 60 | |WHERE product_id = 1""" 61 | .stripMargin.stripPrefix("\n") 62 | 63 | // Print the example query and privacy budget 64 | val root = QueryParser.parseToRelTree(query, database) 65 | println("Original query:") 66 | printQuery(query) 67 | println(s"> Epsilon: $EPSILON") 68 | 69 | // Compute mechanism parameter values from the query. Note the rewriter does this automatically; here we calculate 70 | // the values manually so we can print them. 71 | val analysisResults = new HistogramAnalysis().run(root, database).colFacts.head 72 | println(s"> Aggregation function applied: ${analysisResults.outermostAggregation}") 73 | val tableName = analysisResults.references.head.table 74 | val approxRowCount = Schema.getTableProperties(database, tableName)("approxRowCount").toLong 75 | 76 | println(s"> Table being queried: $tableName") 77 | println(s"> Approximate cardinality of table '$tableName': $approxRowCount") 78 | println(s"> Number of partitions (default heuristic): $approxRowCount ^ 0.4 = ${math.floor(math.pow(approxRowCount, 0.4)).toInt}") 79 | println(s"> Lambda: $LAMBDA") 80 | 81 | // Rewrite the original query to enforce differential privacy using Sample and Aggregate. 
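    // (In broad terms, sample and aggregate partitions the table, evaluates the aggregation on each partition,
    // and combines the per-partition answers with a differentially private aggregation.)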
82 | println("\nRewritten query:") 83 | val config = new SampleAndAggregateConfig(EPSILON, LAMBDA, database) 84 | val rewrittenQuery = new SampleAndAggregateRewriter(config).run(query) 85 | printQuery(rewrittenQuery.toSql()) 86 | } 87 | 88 | elasticSensitivityExample() 89 | sampleAndAggregateExample() 90 | } 91 | -------------------------------------------------------------------------------- /src/test/resources/schema.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | databases: 3 | - database: "test" 4 | dialect: "hive" 5 | namespace: "public" 6 | tables: 7 | - table: "orders" 8 | approxRowCount: 1000000 9 | columns: 10 | - name: "order_id" 11 | maxFreq: 1 12 | - name: "order_date" 13 | canRelease: true 14 | - name: "customer_id" 15 | maxFreq: 100 16 | - name: "product_id" 17 | maxFreq: 500 18 | canRelease: true 19 | domainSet: "products.product_id" 20 | - name: "quantity" 21 | - name: "order_cost" 22 | - table: "products" 23 | isPublic: true 24 | columns: 25 | - name: "product_id" 26 | maxFreq: 300 27 | - name: "name" 28 | - name: "price" 29 | - table: "customers" 30 | columns: 31 | - name: "customer_id" 32 | maxFreq: 1 33 | - name: "name" 34 | isTainted: true 35 | - name: "address" 36 | isTainted: true 37 | maxFreq: 5 38 | - table: "recommendations" 39 | columns: 40 | - name: "customer_id" 41 | maxFreq: 250 42 | - name: "product_id" 43 | maxFreq: 2000 44 | 45 | - database: "test2" 46 | dialect: "hive" 47 | namespace: "" 48 | tables: 49 | - table: "my_table" 50 | columns: 51 | - name: "my_col" 52 | - name: "structured_col" 53 | fields: 54 | - name: "field1" 55 | - name: "field2" 56 | fields: 57 | - name: "subfield1" 58 | - table: "subschema.tbl" 59 | columns: 60 | - name: "col" 61 | -------------------------------------------------------------------------------- /src/test/scala/com/uber/engsec/dp/analysis/columns_used/ColumnsUsedAnalysisTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 
21 | */ 22 | 23 | package com.uber.engsec.dp.analysis.columns_used 24 | 25 | import com.uber.engsec.dp.schema.Schema 26 | import com.uber.engsec.dp.sql.QueryParser 27 | import junit.framework.TestCase 28 | 29 | class ColumnsUsedAnalysisTest extends TestCase { 30 | val database = Schema.getDatabase("test") 31 | 32 | def checkResult(queryStr: String, expected: List[Set[String]]): Unit = { 33 | val root = QueryParser.parseToDataflowGraph(queryStr, database) 34 | val results = (new ColumnsUsedAnalysis).run(root, database) 35 | TestCase.assertEquals(expected, results.toList) 36 | } 37 | 38 | def testSelectAll() = { 39 | val query = "SELECT * FROM orders" 40 | checkResult(query, List(Set("orders.order_id"), Set("orders.order_date"), Set("orders.customer_id"), Set("orders.product_id"), Set("orders.quantity"), Set("orders.order_cost"))) 41 | } 42 | 43 | def testCountAll() = { 44 | val query = "SELECT count(*) FROM orders" 45 | checkResult(query, List(Set("orders.order_cost", "orders.order_id", "orders.product_id", "orders.order_date", "orders.customer_id", "orders.quantity"))) 46 | } 47 | 48 | def testWithoutWhere() = { 49 | val query = "SELECT order_id FROM orders" 50 | checkResult(query, List(Set("orders.order_id"))) 51 | } 52 | 53 | def testWithWhere() = { 54 | val query = "SELECT order_id FROM orders WHERE product_id = 1" 55 | checkResult(query, List(Set("orders.order_id"))) 56 | } 57 | 58 | def testJoin() = { 59 | val query = "SELECT order_date FROM orders JOIN products ON orders.product_id = products.product_id" 60 | checkResult(query, List(Set("orders.order_date", "orders.product_id", "products.product_id"))) 61 | } 62 | } -------------------------------------------------------------------------------- /src/test/scala/com/uber/engsec/dp/analysis/histogram/HistogramAnalysisTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 
21 | */ 22 | 23 | package com.uber.engsec.dp.analysis.histogram 24 | 25 | import com.uber.engsec.dp.dataflow.AggFunctions._ 26 | import com.uber.engsec.dp.dataflow.domain.{Bottom, Top} 27 | import com.uber.engsec.dp.schema.Schema 28 | import junit.framework.TestCase 29 | 30 | class HistogramAnalysisTest extends TestCase { 31 | val database = Schema.getDatabase("test") 32 | 33 | private def getResults(query: String) = { 34 | val h = new HistogramAnalysis 35 | val results = h.analyzeQuery(query, database) 36 | results.colFacts.toList 37 | } 38 | 39 | def assertHistogramFailure(queryStr: String, errorMsg: String) = { 40 | try { 41 | getResults(queryStr) 42 | TestCase.fail("Unexpected successful transformation (was expecting exception)") 43 | } 44 | catch { 45 | case e: Exception => TestCase.assertEquals(errorMsg, e.getMessage) 46 | } 47 | } 48 | 49 | def testSimpleHistogram() = { 50 | val query = "SELECT product_id, COUNT(*) FROM orders GROUP BY product_id" 51 | val actualResult = getResults(query) 52 | 53 | val expectedResult = List( 54 | AggregationInfo(false, Bottom, Set(QualifiedColumnName("public.orders", "product_id")), false, true), 55 | AggregationInfo(true, Some(COUNT), Set(QualifiedColumnName("public.orders", "*")), true, false) 56 | ) 57 | 58 | TestCase.assertEquals(expectedResult, actualResult) 59 | } 60 | 61 | def testAliasHistogram() = { 62 | val query = "SELECT order_date as bin, COUNT(*) FROM orders GROUP BY bin" 63 | val actualResult = getResults(query) 64 | 65 | val expectedResult = List( 66 | AggregationInfo(false, Bottom, Set(QualifiedColumnName("public.orders", "order_date")), false, true), 67 | AggregationInfo(true, Some(COUNT), Set(QualifiedColumnName("public.orders", "*")), true, false) 68 | ) 69 | 70 | TestCase.assertEquals(expectedResult, actualResult) 71 | } 72 | 73 | def testModifiedHistogramBin() = { 74 | val query = "SELECT order_date+1 as bin, COUNT(*) FROM orders GROUP BY bin" 75 | val actualResult = getResults(query) 76 | 77 | val expectedResult = List( 78 | AggregationInfo(false, Bottom, Set(QualifiedColumnName("public.orders", "order_date")), true, true), 79 | AggregationInfo(true, Some(COUNT), Set(QualifiedColumnName("public.orders", "*")), true, false) 80 | ) 81 | 82 | TestCase.assertEquals(expectedResult, actualResult) 83 | } 84 | 85 | def testRoundFunction() = { 86 | // functions that input only aggregates should return true for isAggregation 87 | val query = "SELECT product_id, ROUND(COUNT(*), 0) FROM orders GROUP BY 1" 88 | val actualResult = getResults(query) 89 | 90 | val expectedResult = List( 91 | AggregationInfo(false, Bottom, Set(QualifiedColumnName("public.orders", "product_id")), false, true), 92 | AggregationInfo(true, Some(COUNT), Set(QualifiedColumnName("public.orders", "*")), true, false) 93 | ) 94 | 95 | TestCase.assertEquals(expectedResult, actualResult) 96 | } 97 | 98 | def testColumnReference() = { 99 | val query = "WITH t1 as (SELECT order_id as a FROM orders) SELECT a FROM t1" 100 | val actualResult = getResults(query) 101 | 102 | val expectedResult = List( 103 | AggregationInfo(false, Bottom, Set(QualifiedColumnName("public.orders", "order_id")), false, false) 104 | ) 105 | 106 | TestCase.assertEquals(expectedResult, actualResult) 107 | } 108 | 109 | def testDivision() = { 110 | // arithmetic of aggregations is still an aggregation, but outermost aggregation is Top since more than one 111 | // aggregation function was applied. 
112 | val query = "SELECT (AVG(price) / COUNT(*)) as \"result\" FROM products" 113 | val actualResult = getResults(query) 114 | 115 | val expectedResult = List( 116 | AggregationInfo(true, Top, Set(QualifiedColumnName("public.products", "price"), QualifiedColumnName("public.products", "*")), true, false) 117 | ) 118 | 119 | TestCase.assertEquals(expectedResult, actualResult) 120 | } 121 | 122 | def testCountStar() = { 123 | val query = "SELECT COUNT(*) FROM orders" 124 | val actualResult = getResults(query) 125 | 126 | val expectedResult = List( 127 | AggregationInfo(true, COUNT, Set.empty, true, false) 128 | ) 129 | 130 | TestCase.assertEquals(expectedResult, actualResult) 131 | } 132 | 133 | // Test that statistics analysis correctly returns applied aggregation functions for simple query. 134 | def testStatistics() = { 135 | val query = "SELECT COUNT(*) as my_count, SUM(price) as my_sum, AVG(price) as my_avg FROM products" 136 | val actualResult = getResults(query) 137 | 138 | val expectedResult = List( 139 | AggregationInfo(true, COUNT, Set(QualifiedColumnName("public.products", "*")), true, false), 140 | AggregationInfo(true, SUM, Set(QualifiedColumnName("public.products", "price")), true, false), 141 | AggregationInfo(true, AVG, Set(QualifiedColumnName("public.products", "price")), true, false) 142 | ) 143 | 144 | TestCase.assertEquals(expectedResult, actualResult) 145 | } 146 | } -------------------------------------------------------------------------------- /src/test/scala/com/uber/engsec/dp/analysis/taint/TaintAnalysisTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.analysis.taint 24 | 25 | import com.uber.engsec.dp.schema.Schema 26 | import com.uber.engsec.dp.sql.QueryParser 27 | import junit.framework.TestCase 28 | 29 | /** Note: the following columns care marked tainted in schema config: 30 | * customers.name 31 | * customers.address 32 | * 33 | * All other columns are untainted. 
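 * (See the isTainted flags on these columns in src/test/resources/schema.yaml.)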
34 | */ 35 | class TaintAnalysisTest extends TestCase { 36 | val database = Schema.getDatabase("test") 37 | 38 | private def getResults(query: String) = { 39 | val root = QueryParser.parseToRelTree(query, database) 40 | new TaintAnalysis().run(root, database).colFacts.toList 41 | } 42 | 43 | def testSimple() = { 44 | val query = "SELECT customer_id, name, address FROM customers" 45 | val actualResult = getResults(query) 46 | val expectedResult = List(false, true, true) 47 | 48 | TestCase.assertEquals(expectedResult, actualResult) 49 | } 50 | 51 | def testAggregation() = { 52 | val query = """ 53 | SELECT customers.name as name, count(*) as "count" 54 | FROM orders JOIN customers ON orders.customer_id = customers.customer_id 55 | GROUP BY 1 56 | """ 57 | 58 | val actualResult = getResults(query) 59 | // "customers.name" is tainted, so output column 'name' should be tainted 60 | val expectedResult = List(true, true) 61 | 62 | TestCase.assertEquals(expectedResult, actualResult) 63 | } 64 | 65 | def testWith() = { 66 | val query = """ 67 | WITH t1 as (SELECT * FROM products), 68 | t2 as (SELECT * FROM customers) 69 | SELECT t1.name as product_name, t2.name as customer_name from t1, t2 70 | """ 71 | 72 | val actualResult = getResults(query) 73 | val expectedResult = List(false, true) 74 | 75 | TestCase.assertEquals(expectedResult, actualResult) 76 | } 77 | 78 | } -------------------------------------------------------------------------------- /src/test/scala/com/uber/engsec/dp/core/SchemaTest.scala: -------------------------------------------------------------------------------- 1 | package com.uber.engsec.dp.core 2 | 3 | import com.uber.engsec.dp.schema.Schema 4 | import com.uber.engsec.dp.sql.QueryParser 5 | import junit.framework.TestCase 6 | 7 | class SchemaTest extends TestCase { 8 | val db1 = Schema.getDatabase("test") 9 | val db2 = Schema.getDatabase("test2") 10 | 11 | def testMultiDatabase(): Unit = { 12 | // Query on db1 13 | val query1 = "SELECT COUNT(*) FROM orders WHERE product_id = 1" 14 | 15 | // Query on db2 16 | val query2 = "SELECT COUNT(*) from my_table WHERE my_col = 1" 17 | 18 | QueryParser.parseToRelTree(query1, db1) 19 | QueryParser.parseToRelTree(query2, db2) 20 | 21 | try { 22 | QueryParser.parseToRelTree(query1, db2) 23 | TestCase.fail() 24 | } catch { 25 | case _: Exception => 26 | } 27 | 28 | try { 29 | Schema.getDatabase("nonexistDb") 30 | TestCase.fail() 31 | } catch { 32 | case _: Exception => 33 | } 34 | } 35 | 36 | def testSchemaWithEmptyNamespace(): Unit = { 37 | QueryParser.parseToRelTree("SELECT my_col FROM my_table", db2) 38 | QueryParser.parseToRelTree("SELECT col FROM subschema.tbl", db2) 39 | } 40 | 41 | def testStructuredColumn(): Unit = { 42 | QueryParser.parseToRelTree("SELECT structured_col FROM my_table", db2) 43 | QueryParser.parseToRelTree("SELECT structured_col.field1 FROM my_table", db2) 44 | QueryParser.parseToRelTree("SELECT structured_col.field2.subfield1 FROM my_table", db2) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/test/scala/com/uber/engsec/dp/rewriting/CoverageRewriterTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.rewriting 24 | 25 | import com.uber.engsec.dp.rewriting.coverage.CoverageRewriter 26 | import com.uber.engsec.dp.schema.Schema 27 | import com.uber.engsec.dp.sql.QueryParser 28 | import junit.framework.TestCase 29 | 30 | class CoverageRewriterTest extends TestCase { 31 | val database = Schema.getDatabase("test") 32 | 33 | def checkResult(query: String, expected: String): Unit = { 34 | val root = QueryParser.parseToRelTree(query, database) 35 | val config = new RewriterConfig(database) 36 | val result = new CoverageRewriter(config).run(root) 37 | TestCase.assertEquals(expected.stripMargin.stripPrefix("\n"), result.toSql()) 38 | } 39 | 40 | def testStatisticalQuery() = { 41 | val query = "SELECT COUNT(*), AVG(order_id) FROM orders" 42 | 43 | checkResult(query, """ 44 | |SELECT COUNT(*) coverage 45 | |FROM public.orders""" 46 | ) 47 | } 48 | 49 | def testHistogramQuery() = { 50 | val query = "SELECT order_id, AVG(order_id) FROM orders WHERE order_id < 10 GROUP BY order_id" 51 | 52 | checkResult(query, """ 53 | |WITH _count AS ( 54 | | SELECT order_id, COUNT(*) coverage 55 | | FROM public.orders 56 | | WHERE order_id < 10 57 | | GROUP BY order_id 58 | |) 59 | |SELECT MEDIAN(coverage) coverage 60 | |FROM _count 61 | |LIMIT 1""" 62 | ) 63 | } 64 | } -------------------------------------------------------------------------------- /src/test/scala/com/uber/engsec/dp/rewriting/ElasticSensitivityRewriterTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 
13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package com.uber.engsec.dp.rewriting 24 | 25 | import com.uber.engsec.dp.rewriting.differential_privacy.{ElasticSensitivityConfig, ElasticSensitivityRewriter} 26 | import com.uber.engsec.dp.schema.Schema 27 | import com.uber.engsec.dp.sql.QueryParser 28 | import junit.framework.TestCase 29 | 30 | class ElasticSensitivityRewriterTest extends TestCase { 31 | val database = Schema.getDatabase("test") 32 | 33 | def checkResult(query: String, epsilon: Double, delta: Double, expected: String, fillMissingBins: Boolean = false): Unit = { 34 | val root = QueryParser.parseToRelTree(query, database) 35 | val config = new ElasticSensitivityConfig(epsilon, delta, database, fillMissingBins) 36 | val result = new ElasticSensitivityRewriter(config).run(root) 37 | TestCase.assertEquals(expected.stripMargin.stripPrefix("\n"), result.toSql()) 38 | } 39 | 40 | def testUnsupportedQueries() = { 41 | // This rewriter calls ElasticSensitivity analysis; see ElasticSensitivityAnalysisTest for tests of unsupported queries 42 | } 43 | 44 | def testCountQueryWithoutJoin() = { 45 | // the sensitivity of this query is 1 46 | val query = """ 47 | SELECT COUNT(*) FROM orders 48 | """ 49 | 50 | // scale of Laplace noise for epsilon 0.1 is 2*(1/0.1) = 20 51 | checkResult(query, 0.1, 1e-8, """ 52 | |SELECT COUNT(*) + 20.0 * (CASE WHEN RAND() - 0.5 < 0 THEN -1.0 ELSE 1.0 END * LN(1 - 2 * ABS(RAND() - 0.5))) 53 | |FROM public.orders""" 54 | ) 55 | 56 | // scale of Laplace noise for epsilon 1 is 2*(1/1) = 2 57 | checkResult(query, 1, 1e-8, """ 58 | |SELECT COUNT(*) + 2.0 * (CASE WHEN RAND() - 0.5 < 0 THEN -1.0 ELSE 1.0 END * LN(1 - 2 * ABS(RAND() - 0.5))) 59 | |FROM public.orders""" 60 | ) 61 | } 62 | 63 | def testCountQueryWithJoin() = { 64 | val query = """ 65 | SELECT COUNT(*) 66 | FROM orders JOIN recommendations ON orders.customer_id = recommendations.customer_id 67 | WHERE orders.product_id = 1 68 | """ 69 | 70 | checkResult(query, 0.1, 1e-8, """ 71 | |SELECT COUNT(*) + 5409.181856298167 * (CASE WHEN RAND() - 0.5 < 0 THEN -1.0 ELSE 1.0 END * LN(1 - 2 * ABS(RAND() - 0.5))) 72 | |FROM (SELECT customer_id, product_id 73 | |FROM public.orders) t 74 | |INNER JOIN (SELECT customer_id 75 | |FROM public.recommendations) t0 ON t.customer_id = t0.customer_id 76 | |WHERE t.product_id = 1""" 77 | ) 78 | } 79 | 80 | def testHistogramQueryWithJoin() = { 81 | val query = """ 82 | SELECT orders.product_id, COUNT(*) 83 | FROM orders JOIN recommendations ON orders.product_id = recommendations.product_id 84 | WHERE orders.product_id = 1 85 | GROUP BY 1 86 | """ 87 | 88 | 89 | checkResult(query, 0.1, 1e-8, """ 90 | |SELECT t.product_id, COUNT(*) + 80000.0 * (CASE WHEN RAND() - 0.5 < 0 THEN -1.0 ELSE 1.0 END * LN(1 - 2 * ABS(RAND() - 0.5))) 91 | |FROM (SELECT product_id 92 | |FROM public.orders) t 93 | |INNER JOIN (SELECT product_id 94 | |FROM public.recommendations) t0 ON t.product_id = t0.product_id 95 | |WHERE t.product_id = 1 96 | |GROUP BY t.product_id""" 97 | ) 98 | 99 | // Test histogram bin enumeration 100 | 
    checkResult(query, 0.1, 1e-8, """
      |WITH _orig AS (
      |  SELECT t.product_id, COUNT(*) _agg
      |  FROM (SELECT product_id
      |  FROM public.orders) t
      |  INNER JOIN (SELECT product_id
      |  FROM public.recommendations) t0 ON t.product_id = t0.product_id
      |  WHERE t.product_id = 1
      |  GROUP BY t.product_id
      |)
      |SELECT t0._domain product_id, CASE WHEN product_id IS NULL THEN 0 ELSE _agg END + 80000.0 * (CASE WHEN RAND() - 0.5 < 0 THEN -1.0 ELSE 1.0 END * LN(1 - 2 * ABS(RAND() - 0.5)))
      |FROM (SELECT product_id, _agg
      |FROM _orig) t
      |RIGHT JOIN (SELECT product_id _domain
      |FROM public.products) t0 ON product_id = t0._domain""",
      true
    )
  }

  def testHistogramQueryWithAggAlias() = {
    val query = """
      SELECT orders.product_id, COUNT(*) AS "mycount"
      FROM orders JOIN recommendations ON orders.product_id = recommendations.product_id
      WHERE orders.product_id = 1
      GROUP BY 1
    """

    // Test histogram bin enumeration when aggregation already has explicit alias
    checkResult(query, 0.1, 1e-8, """
      |WITH _orig AS (
      |  SELECT t.product_id, COUNT(*) mycount
      |  FROM (SELECT product_id
      |  FROM public.orders) t
      |  INNER JOIN (SELECT product_id
      |  FROM public.recommendations) t0 ON t.product_id = t0.product_id
      |  WHERE t.product_id = 1
      |  GROUP BY t.product_id
      |)
      |SELECT t0._domain product_id, CASE WHEN product_id IS NULL THEN 0 ELSE mycount END + 80000.0 * (CASE WHEN RAND() - 0.5 < 0 THEN -1.0 ELSE 1.0 END * LN(1 - 2 * ABS(RAND() - 0.5))) mycount
      |FROM (SELECT product_id, mycount
      |FROM _orig) t
      |RIGHT JOIN (SELECT product_id _domain
      |FROM public.products) t0 ON product_id = t0._domain""",
      true
    )
  }
}
--------------------------------------------------------------------------------
/src/test/scala/com/uber/engsec/dp/rewriting/RestrictedSensitivityRewriterTest.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2017 Uber Technologies, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package com.uber.engsec.dp.rewriting

import com.uber.engsec.dp.rewriting.differential_privacy.{RestrictedSensitivityConfig, RestrictedSensitivityRewriter}
import com.uber.engsec.dp.schema.Schema
import com.uber.engsec.dp.sql.QueryParser
import junit.framework.TestCase

class RestrictedSensitivityRewriterTest extends TestCase {
  val database = Schema.getDatabase("test")

  def checkResult(query: String, epsilon: Double, expected: String, fillMissingBins: Boolean = false): Unit = {
    val root = QueryParser.parseToRelTree(query, database)
    val config = new RestrictedSensitivityConfig(epsilon, database, fillMissingBins)
    val result = new RestrictedSensitivityRewriter(config).run(root)
    TestCase.assertEquals(expected.stripMargin.stripPrefix("\n"), result.toSql())
  }

  def testSimpleHistogram() = {
    val query = "SELECT order_date, COUNT(*) FROM orders GROUP BY 1"

    // Sensitivity of this query is 2.0 so scale of Laplace noise for epsilon 0.1 is (2/0.1) = 20
    checkResult(query, 0.1, """
      |SELECT order_date, COUNT(*) + 20.0 * (CASE WHEN RAND() - 0.5 < 0 THEN -1.0 ELSE 1.0 END * LN(1 - 2 * ABS(RAND() - 0.5)))
      |FROM public.orders
      |GROUP BY order_date"""
    )
  }

  /** Restricted Sensitivity rewriter uses the same code as Elastic Sensitivity rewriter, the only difference being
    * the call to the sensitivity calculation analysis. Hence, see [ElasticSensitivityRewriterTest] and
    * [RestrictedSensitivityAnalysis] for additional test cases relevant to this mechanism.
    */
}
--------------------------------------------------------------------------------
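Note on the noise arithmetic in the expected SQL above (an illustrative sketch, not code from this repository; the object and method names below are hypothetical): each rewritten query adds a term of the form scale * (CASE WHEN RAND() - 0.5 < 0 THEN -1.0 ELSE 1.0 END * LN(1 - 2 * ABS(RAND() - 0.5))), an inverse-CDF draw from a Laplace distribution. Per the test comments, the scale is 2 * (sensitivity / epsilon) for the elastic-sensitivity count query without joins and sensitivity / epsilon for the restricted-sensitivity histogram. The same arithmetic restated in plain Scala:

import scala.util.Random

object LaplaceNoiseSketch {
  // Elastic sensitivity tests above: scale = 2 * sensitivity / epsilon,
  // e.g. 2 * (1 / 0.1) = 20.0 in testCountQueryWithoutJoin.
  def elasticScale(sensitivity: Double, epsilon: Double): Double =
    2 * sensitivity / epsilon

  // Restricted sensitivity test above: scale = sensitivity / epsilon,
  // e.g. 2.0 / 0.1 = 20.0 in testSimpleHistogram.
  def restrictedScale(sensitivity: Double, epsilon: Double): Double =
    sensitivity / epsilon

  // Same shape as the SQL noise expression: sign(u) * ln(1 - 2|u|) * scale with
  // u uniform in (-0.5, 0.5); by symmetry this is a Laplace(0, scale) sample.
  def laplaceSample(scale: Double): Double = {
    val u = Random.nextDouble() - 0.5
    (if (u < 0) -1.0 else 1.0) * math.log(1 - 2 * math.abs(u)) * scale
  }
}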