├── src ├── main │ ├── resources │ │ └── schema.yaml.template │ └── scala │ │ ├── chorus │ │ ├── schema │ │ │ ├── CachingSchema.scala │ │ │ └── DatabaseModel.scala │ │ ├── mechanisms │ │ │ ├── ChorusMechanism.scala │ │ │ ├── BasicMechanisms.scala │ │ │ ├── LaplaceMechClipping.scala │ │ │ ├── ReportNoisyMax.scala │ │ │ ├── ExponentialMechanism.scala │ │ │ ├── PrivacyAccountants.scala │ │ │ ├── AverageMechClipping.scala │ │ │ └── SparseVectorMechanism.scala │ │ ├── rewriting │ │ │ ├── differential_privacy │ │ │ │ ├── RestrictedSensitivityRewriter.scala │ │ │ │ ├── ClippingRewriter.scala │ │ │ │ ├── ElasticSensitivityRewriter.scala │ │ │ │ ├── SensitivityRewriter.scala │ │ │ │ └── WPINQRewriter.scala │ │ │ ├── coverage │ │ │ │ └── CoverageRewriter.scala │ │ │ ├── rules │ │ │ │ └── ColumnDefinition.scala │ │ │ ├── Rewriter.scala │ │ │ └── DPUtil.scala │ │ ├── util │ │ │ ├── DB.scala │ │ │ ├── IdentityHashMap.scala │ │ │ └── ElasticSensitivity.scala │ │ ├── dataflow │ │ │ ├── AggFunctions.scala │ │ │ ├── node │ │ │ │ ├── DFGVisitorAnalysis.scala │ │ │ │ └── ASTDataflowAnalysis.scala │ │ │ ├── domain │ │ │ │ ├── Collection.scala │ │ │ │ ├── Lattice.scala │ │ │ │ ├── AbstractDomain.scala │ │ │ │ ├── Basic.scala │ │ │ │ └── DomainElement.scala │ │ │ ├── AbstractDataflowAnalysis.scala │ │ │ └── column │ │ │ │ ├── AbstractColumnAnalysis.scala │ │ │ │ └── DFGColumnAnalysis.scala │ │ ├── sql │ │ │ ├── dataflow_graph │ │ │ │ ├── relation │ │ │ │ │ ├── Except.scala │ │ │ │ │ ├── Union.scala │ │ │ │ │ ├── DataTable.scala │ │ │ │ │ ├── Select.scala │ │ │ │ │ ├── Relation.scala │ │ │ │ │ └── Join.scala │ │ │ │ ├── DataflowGraphFunctions.scala │ │ │ │ ├── DataflowGraphUtils.scala │ │ │ │ ├── reference │ │ │ │ │ ├── Reference.scala │ │ │ │ │ ├── Function.scala │ │ │ │ │ ├── UnstructuredReference.scala │ │ │ │ │ └── ColumnReference.scala │ │ │ │ └── Node.scala │ │ │ ├── TreeFunctions.scala │ │ │ ├── relational_algebra │ │ │ │ ├── RelTreeFunctions.scala │ │ │ │ └── RelOrExpr.scala │ │ 
│ ├── QueryParser.scala │ │ │ ├── AbstractAnalysis.scala │ │ │ └── ast │ │ │ │ └── ASTFunctions.scala │ │ ├── exception │ │ │ ├── AnalysisException.scala │ │ │ ├── DPException.scala │ │ │ └── TransformationException.scala │ │ └── analysis │ │ │ ├── columns_used │ │ │ └── ColumnsUsedAnalysis.scala │ │ │ ├── taint │ │ │ └── TaintAnalysis.scala │ │ │ ├── differential_privacy │ │ │ ├── StabilityDomain.scala │ │ │ ├── RestrictedSensitivityAnalysis.scala │ │ │ └── SensitivityDomain.scala │ │ │ ├── join │ │ │ └── JoinKeysUsed.scala │ │ │ ├── histogram │ │ │ ├── QueryType.scala │ │ │ └── HistogramAnalysis.scala │ │ │ └── name_resolution │ │ │ └── ReferenceInfo.scala │ │ └── examples │ │ ├── MechanismExamples.scala │ │ ├── ElasticSensitivityExample.scala │ │ └── QueryRewritingExample.scala └── test │ ├── resources │ └── schema.yaml │ └── scala │ ├── com │ └── uber │ │ └── engsec │ │ └── dp │ │ ├── core │ │ └── SchemaTest.scala │ │ ├── rewriting │ │ ├── CoverageRewriterTest.scala │ │ ├── RestrictedSensitivityRewriterTest.scala │ │ └── ElasticSensitivityRewriterTest.scala │ │ └── analysis │ │ ├── columns_used │ │ └── ColumnsUsedAnalysisTest.scala │ │ ├── taint │ │ └── TaintAnalysisTest.scala │ │ └── histogram │ │ └── HistogramAnalysisTest.scala │ └── chorus │ └── ChorusTests.scala ├── LICENSE └── README.md
/src/main/resources/schema.yaml.template:
--------------------------------------------------------------------------------
1 | ---
2 | databases:
3 |   - database: "my_database"
4 |     dialect: "postgres"
5 |     namespace: "public"
6 |     tables:
7 |       - table: "my_table"
8 |         columns:
9 |           - name: "col1"
10 |           - name: "col2"
11 |
12 |
--------------------------------------------------------------------------------
/src/main/scala/chorus/schema/CachingSchema.scala:
--------------------------------------------------------------------------------
1 | package org.apache.calcite.jdbc
2 |
3 | import org.apache.calcite.schema.{Schema, SchemaPlus}
4 |
5 | /** Adapter declared inside Calcite's `org.apache.calcite.jdbc` package
6 |   * (presumably so it can reach `CachingCalciteSchema` - TODO confirm its visibility).
7 |   */
8 | object SchemaAdapter {
9 |   /** Wraps `schema` in a `CachingCalciteSchema` named `name` with a null parent
10 |     * and returns its `SchemaPlus` view.
11 |     */
12 |   def toRootSchemaPlus(schema: Schema, name: String): SchemaPlus = {
13 |     val rootSchema = new CachingCalciteSchema(null, schema, name)
14 |     rootSchema.plus()
15 |   }
16 | }
17 |
--------------------------------------------------------------------------------
/src/main/scala/chorus/mechanisms/ChorusMechanism.scala:
--------------------------------------------------------------------------------
1 | package chorus.mechanisms
2 |
3 | /** Base class for mechanisms that produce a result of type `A` together with a `PrivacyCost`. */
4 | abstract class ChorusMechanism[A] {
5 |   /** Runs the mechanism and returns its result paired with the cost it incurred. */
6 |   def run(): (A, PrivacyCost)
7 |
8 |   /** Runs the mechanism, reports its cost to `accountant` via `addCost`, and returns only the result. */
9 |   def execute(accountant: PrivacyAccountant): A = {
10 |     val outcome = run()
11 |     accountant.addCost(outcome._2)
12 |     outcome._1
13 |   }
14 | }
15 |
--------------------------------------------------------------------------------
/src/main/scala/chorus/mechanisms/BasicMechanisms.scala:
--------------------------------------------------------------------------------
1 | package chorus.mechanisms
2 |
3 | import chorus.util.DB
4 |
5 | /** Building blocks shared by the mechanisms: Laplace noise, noisy rows, argmax and weighted choice. */
6 | object BasicMechanisms {
7 |
8 |   /** Draws one sample from a zero-centred Laplace distribution with the given `scale`,
9 |     * by applying the inverse CDF to a uniform draw.
10 |     */
11 |   @scala.annotation.tailrec
12 |   def laplaceSample(scale: Double): Double = {
13 |     val u = 0.5 - scala.util.Random.nextDouble()
14 |     // nextDouble() can return exactly 0.0, which makes |u| == 0.5 and log(0) == -Infinity: redraw instead.
15 |     if (math.abs(u) >= 0.5) laplaceSample(scale)
16 |     else -math.signum(u) * scale * math.log(1 - 2 * math.abs(u))
17 |   }
18 |
19 |   /** Adds independent Laplace noise to every value of every row.
20 |     *
21 |     * Values are paired with `scales` by position; `zip` stops at the shorter of the two,
22 |     * so surplus values (or surplus scales) are dropped from the output row.
23 |     * Each value must parse with `toDouble`.
24 |     */
25 |   def laplace(vals: List[DB.Row], scales: Seq[Double]): List[DB.Row] =
26 |     vals.map { row =>
27 |       val noisy = row.vals.zip(scales).map { case (v, scale) =>
28 |         (v.toDouble + laplaceSample(scale)).toString
29 |       }
30 |       DB.Row(noisy)
31 |     }
32 |
33 |   /** Index of the largest element of `vals`; `maxBy` throws on an empty list. */
34 |   def argmax(vals: List[Double]): Int =
35 |     vals.view.zipWithIndex.maxBy(_._1)._2
36 |
37 |   /** Picks an index at random, where index `i` is chosen with probability proportional
38 |     * to the weight in `probabilities(i)._2`. Weights need not sum to 1.
39 |     *
40 |     * (The previous implementation was a stub that always returned 0.)
41 |     *
42 |     * @throws IllegalArgumentException if the list is empty, a weight is negative, NaN or
43 |     *                                  infinite, or all weights are zero
44 |     */
45 |   def chooseWithProbability[A](probabilities: List[(A, Double)]): Int = {
46 |     require(probabilities.nonEmpty, "cannot choose from an empty list")
47 |     val weights = probabilities.map(_._2)
48 |     require(weights.forall(w => w >= 0 && !w.isNaN && !w.isInfinity),
49 |       "weights must be finite and non-negative")
50 |     val total = weights.sum
51 |     require(total > 0, "at least one weight must be positive")
52 |
53 |     // Walk the running totals until the uniform draw in [0, total) falls inside an entry's slice.
54 |     val target = scala.util.Random.nextDouble() * total
55 |     val runningTotals = weights.scanLeft(0.0)(_ + _).tail
56 |     val chosen = runningTotals.indexWhere(target < _)
57 |     // Guard against floating-point rounding: fall back to the last entry with positive weight.
58 |     if (chosen >= 0) chosen else weights.lastIndexWhere(_ > 0)
59 |   }
60 | }
61 |
--------------------------------------------------------------------------------
/src/main/scala/chorus/rewriting/differential_privacy/RestrictedSensitivityRewriter.scala:
--------------------------------------------------------------------------------
1 | package chorus.rewriting.differential_privacy
2 |
3 | import chorus.analysis.differential_privacy.RestrictedSensitivityAnalysis
4 | import chorus.rewriting.DPRewriterConfig
5 | import chorus.schema.Database
6 |
import chorus.sql.relational_algebra.Relation
7 |
8 | /** Rewriter that enforces differential privacy using Restricted Sensitivity. */
9 | class RestrictedSensitivityRewriter(config: RestrictedSensitivityConfig) extends SensitivityRewriter(config) {
10 |   /** Laplace noise scale for output column `colIdx` of `node`.
11 |     *
12 |     * Runs a `RestrictedSensitivityAnalysis` on `node` against `config.database`, takes the
13 |     * sensitivity recorded for that column and divides it by `config.epsilon`.
14 |     * `sensitivity.get` fails when no sensitivity is available (presumably an Option - confirm).
15 |     */
16 |   def getLaplaceNoiseScale(node: Relation, colIdx: Int): Double = {
17 |     val analysisResult = new RestrictedSensitivityAnalysis().run(node, config.database)
18 |     val columnSensitivity = analysisResult.colFacts(colIdx).sensitivity.get
19 |     columnSensitivity / config.epsilon
20 |   }
21 | }
22 |
23 | /** Configuration for [[RestrictedSensitivityRewriter]]; forwards every argument to `DPRewriterConfig`. */
24 | class RestrictedSensitivityConfig(
25 |     override val epsilon: Double,
26 |     override val database: Database,
27 |     override val fillMissingBins: Boolean = true)
28 |   extends DPRewriterConfig(epsilon, database, fillMissingBins)
29 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017 Uber Technologies, Inc. and Joseph P. Near
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
--------------------------------------------------------------------------------
/src/main/scala/chorus/mechanisms/LaplaceMechClipping.scala:
--------------------------------------------------------------------------------
1 | package chorus.mechanisms
2 |
3 | import chorus.analysis.differential_privacy.GlobalSensitivityAnalysis
4 | import chorus.schema.Database
5 | import chorus.rewriting.RewriterConfig
6 | import chorus.sql.relational_algebra.Relation
7 | import chorus.rewriting.differential_privacy.ClippingRewriter
8 | import chorus.util.DB
9 |
10 | /** Laplace mechanism applied to a query rewritten by `ClippingRewriter(config, l, u)`.
11 |   *
12 |   * @param epsilon privacy parameter; every column's noise scale is `sensitivity / epsilon`
13 |   * @param l       first bound passed to `ClippingRewriter` (presumably the lower clipping bound)
14 |   * @param u       second bound passed to `ClippingRewriter` (presumably the upper clipping bound)
15 |   * @param root    query to clip, execute and perturb
16 |   * @param config  rewriter configuration; its `database` is used for analysis and execution
17 |   * @throws IllegalArgumentException if `epsilon` is not strictly positive
18 |   */
19 | class LaplaceMechClipping(epsilon: Double, l: Double, u: Double,
20 |                           root: Relation, config: RewriterConfig)
21 |     extends ChorusMechanism[List[DB.Row]] {
22 |
23 |   // A non-positive epsilon would give an infinite or negative noise scale and a non-positive EpsilonDPCost.
24 |   require(epsilon > 0, s"epsilon must be positive, got $epsilon")
25 |
26 |   /** Per-column sensitivities of `query` as computed by `GlobalSensitivityAnalysis`.
27 |     * `sensitivity.get` fails for a column without a sensitivity.
28 |     */
29 |   def getSensitivities(query: Relation, database: Database): Seq[Double] =
30 |     new GlobalSensitivityAnalysis().run(query, database).colFacts.map(_.sensitivity.get)
31 |
32 |   /** Clips the query, executes it and returns the noisy rows together with `EpsilonDPCost(epsilon)`. */
33 |   def run() = {
34 |     val clippedQuery = new ClippingRewriter(config, l, u).run(root).root
35 |
36 |     // One Laplace scale per output column: sensitivity / epsilon.
37 |     val scales = getSensitivities(clippedQuery, config.database).map(_ / epsilon)
38 |
39 |     val rows = DB.execute(clippedQuery, config.database)
40 |     (BasicMechanisms.laplace(rows, scales), EpsilonDPCost(epsilon))
41 |   }
42 |
43 | }
44 |
45 |
--------------------------------------------------------------------------------
/src/main/scala/chorus/mechanisms/ReportNoisyMax.scala:
--------------------------------------------------------------------------------
1 | package chorus.mechanisms
2 |
3 |
4 | import chorus.analysis.differential_privacy.GlobalSensitivityAnalysis
5 | import chorus.schema.Database
6 | import
chorus.rewriting.RewriterConfig
7 | import chorus.rewriting.differential_privacy.ClippingRewriter
8 | import chorus.util.DB
9 | import chorus.sql.relational_algebra.{RelUtils, Relation}
10 | import chorus.dataflow.domain.UnitDomain
11 | import org.apache.calcite.rel.core.Aggregate
12 | import org.apache.calcite.sql.fun.SqlSumAggFunction
13 | import chorus.exception.UnsupportedQueryException
14 |
15 | import chorus.rewriting.rules.ColumnDefinition._
16 | import chorus.rewriting.rules.Operations._
17 | import chorus.rewriting.rules.ValueExpr
18 | import chorus.rewriting.rules.Expr._
19 |
20 |
21 | /** Report-noisy-max: runs every query through `LaplaceMechClipping(epsilon, 0, 1, ...)`
22 |   * and returns the index of the query with the largest noisy answer.
23 |   */
24 | class ReportNoisyMax(epsilon: Double, queries: List[Relation], config: RewriterConfig)
25 |     extends ChorusMechanism[Int] {
26 |
27 |   /** @return the winning query's index in `queries`, paired with `EpsilonDPCost(epsilon)`
28 |     * @throws IllegalArgumentException  if `queries` is empty
29 |     * @throws UnsupportedQueryException if a query does not return exactly one row with one column
30 |     */
31 |   def run() = {
32 |     require(queries.nonEmpty, "ReportNoisyMax needs at least one query")
33 |
34 |     val noisyAnswers: List[Double] = queries.map { (q: Relation) =>
35 |       new LaplaceMechClipping(epsilon, 0, 1, q, config).run()._1 match {
36 |         case List(DB.Row(List(answer))) => answer.toDouble
37 |         // Previously a bare MatchError; name the actual problem instead.
38 |         case other => throw new UnsupportedQueryException(
39 |           s"each query must return a single row with a single column, got: $other")
40 |       }
41 |     }
42 |
43 |     (BasicMechanisms.argmax(noisyAnswers), EpsilonDPCost(epsilon))
44 |   }
45 | }
46 |
47 |
--------------------------------------------------------------------------------
/src/test/resources/schema.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | databases:
3 |   - database: "test"
4 |     dialect: "hive"
5 |     namespace: "public"
6 |     tables:
7 |       - table: "orders"
8 |         approxRowCount: 1000000
9 |         columns:
10 |           - name: "order_id"
11 |             maxFreq: 1
12 |           - name: "order_date"
13 |             canRelease: true
14 |           - name: "customer_id"
15 |             maxFreq: 100
16 |           - name: "product_id"
17 |             maxFreq: 500
18 |             canRelease: true
19 |             domainSet: "products.product_id"
20 |           - name: "quantity"
21 |           - name: "order_cost"
22 |       - table: "products"
23 |         isPublic: true
24 |         columns:
25 |           - name: "product_id"
26 |             maxFreq: 300
27 |           - name: "name"
28 |           - name: "price"
29 |       - table: "customers"
30 |         columns:
31 |           - name: "customer_id"
32 |             maxFreq: 1
33 |           - name: "name"
34 |             isTainted: true
35 |           - name: "address"
36 |             isTainted: true
37 |             maxFreq: 5
38 |       - table: "recommendations"
39 |         columns:
40 |           - name: "customer_id"
41 |             maxFreq: 250
42 |           - name: "product_id"
43 |             maxFreq: 2000
44 |
45 |   - database: "test2"
46 |     dialect: "hive"
47 |     namespace: ""
48 |     tables:
49 |       - table: "my_table"
50 |         columns:
51 |           - name: "my_col"
52 |           - name: "structured_col"
53 |             fields:
54 |               - name: "field1"
55 |               - name: "field2"
56 |                 fields:
57 |                   - name: "subfield1"
58 |       - table: "subschema.tbl"
59 |         columns:
60 |           - name: "col"
61 |
--------------------------------------------------------------------------------
/src/main/scala/chorus/mechanisms/ExponentialMechanism.scala:
--------------------------------------------------------------------------------
1 | package chorus.mechanisms
2 |
3 |
4 | import chorus.analysis.differential_privacy.GlobalSensitivityAnalysis
5 | import chorus.schema.Database
6 | import chorus.rewriting.RewriterConfig
7 | import chorus.rewriting.differential_privacy.ClippingRewriter
8 | import chorus.util.DB
9 | import chorus.sql.relational_algebra.{RelUtils, Relation}
10 | import chorus.dataflow.domain.UnitDomain
11 | import org.apache.calcite.rel.core.Aggregate
12 | import org.apache.calcite.sql.fun.SqlSumAggFunction
13 | import chorus.exception.UnsupportedQueryException
14 |
15 | import chorus.rewriting.rules.ColumnDefinition._
16 | import chorus.rewriting.rules.Operations._
17 | import chorus.rewriting.rules.ValueExpr
18 | import chorus.rewriting.rules.Expr._
19 |
20 |
21 | /** Exponential mechanism over the rows of a scoring query.
22 |   *
23 |   * The scoring query must return rows of the form (candidate, score). A candidate is selected
24 |   * with probability proportional to `exp(epsilon * score / (2 * sensitivity))`, where
25 |   * `sensitivity` is the largest column sensitivity reported by `GlobalSensitivityAnalysis`.
26 |   */
27 | class ExponentialMechanism(epsilon: Double, scoring: Relation, config: RewriterConfig)
28 |     extends ChorusMechanism[Int] {
29 |
30 |   /** @return the index of the selected row in the scoring query's result, with `EpsilonDPCost(epsilon)`
31 |     * @throws UnsupportedQueryException if the scoring query returns no rows or a row that is
32 |     *                                   not a (candidate, score) pair
33 |     */
34 |   def run() = {
35 |     val sensitivity = new GlobalSensitivityAnalysis().run(scoring, config.database)
36 |       .colFacts.map(_.sensitivity.get).max
37 |
38 |     val scores: List[(String, Double)] = DB.execute(scoring, config.database).map {
39 |       case DB.Row(List(candidate, score)) => (candidate, score.toDouble)
40 |       case other => throw new UnsupportedQueryException(
41 |         s"scoring query must return (candidate, score) rows, got: $other")
42 |     }
43 |     if (scores.isEmpty)
44 |       throw new UnsupportedQueryException("scoring query returned no candidates")
45 |
46 |     // The weights must be exponential in the score. The earlier linear weights, normalised by
47 |     // the data-dependent total score, were not the exponential mechanism.
48 |     // Subtracting the best score leaves the distribution unchanged and keeps exp() from overflowing.
49 |     val best = scores.map(_._2).max
50 |     val weights = scores.map { case (candidate, score) =>
51 |       // The best candidate always gets weight 1; this also avoids 0/0 when sensitivity is 0.
52 |       val weight =
53 |         if (score == best) 1.0
54 |         else math.exp(epsilon * (score - best) / (2 * sensitivity))
55 |       (candidate, weight)
56 |     }
57 |
58 |     (BasicMechanisms.chooseWithProbability(weights),
59 |      EpsilonDPCost(epsilon))
60 |   }
61 | }
62 |
--------------------------------------------------------------------------------
/src/test/scala/com/uber/engsec/dp/core/SchemaTest.scala:
--------------------------------------------------------------------------------
1 | package chorus.core
2 |
3 | import chorus.schema.Schema
4 | import chorus.sql.QueryParser
5 | import junit.framework.TestCase
6 |
7 | /** Parses queries against the "test" and "test2" databases of the test schema. */
8 | class SchemaTest extends TestCase {
9 |   val db1 = Schema.getDatabase("test")
10 |   val db2 = Schema.getDatabase("test2")
11 |
12 |   /** Fails the test unless evaluating `body` throws an `Exception`.
13 |     * (`TestCase.fail()` raises an Error, so it is not swallowed by the catch.)
14 |     */
15 |   private def expectException(body: => Any): Unit = {
16 |     try {
17 |       body
18 |       TestCase.fail()
19 |     } catch {
20 |       case _: Exception =>
21 |     }
22 |   }
23 |
24 |   def testMultiDatabase(): Unit = {
25 |     // Query on db1
26 |     val query1 = "SELECT COUNT(*) FROM orders WHERE product_id = 1"
27 |
28 |     // Query on db2
29 |     val query2 = "SELECT COUNT(*) from my_table WHERE my_col = 1"
30 |
31 |     QueryParser.parseToRelTree(query1, db1)
32 |     QueryParser.parseToRelTree(query2, db2)
33 |
34 |     // A db1 query must not parse against db2, and an unknown database must not resolve.
35 |     expectException(QueryParser.parseToRelTree(query1, db2))
36 |     expectException(Schema.getDatabase("nonexistDb"))
37 |   }
38 |
39 |   def testSchemaWithEmptyNamespace(): Unit = {
40 |     List("SELECT my_col FROM my_table", "SELECT col FROM subschema.tbl")
41 |       .foreach(QueryParser.parseToRelTree(_, db2))
42 |   }
43 |
44 |   def testStructuredColumn(): Unit = {
45 |     List(
46 |       "SELECT structured_col FROM my_table",
47 |       "SELECT structured_col.field1 FROM my_table",
48 |       "SELECT structured_col.field2.subfield1 FROM my_table"
49 |     ).foreach(QueryParser.parseToRelTree(_, db2))
50 |   }
51 | }
52 |
--------------------------------------------------------------------------------
/src/main/scala/examples/MechanismExamples.scala:
--------------------------------------------------------------------------------
1 | package examples
2 |
3 | import
chorus.schema.Schema
4 | import chorus.sql.QueryParser
5 | import chorus.mechanisms.LaplaceMechClipping
6 | import chorus.mechanisms.AverageMechClipping
7 | import chorus.mechanisms.EpsilonCompositionAccountant
8 | import chorus.rewriting.RewriterConfig
9 |
10 | /** Example: runs a clipped Laplace sum and a clipped average against the dummy database
11 |   * and prints both results plus the total cost recorded by the accountant.
12 |   */
13 | object MechanismExamples extends App {
14 |   // Settings are passed through JVM system properties.
15 |   System.setProperty("dp.elastic_sensitivity.check_bins_for_release", "false")
16 |   System.setProperty("db.use_dummy_database", "true")
17 |
18 |   // Reuse the table schemas and metadata that the test classes define.
19 |   System.setProperty("schema.config.path", "src/test/resources/schema.yaml")
20 |   val database = Schema.getDatabase("test")
21 |   val config = new RewriterConfig(database)
22 |
23 |   // Two simple test queries, parsed to relational algebra trees.
24 |   val query1 = "SELECT SUM(order_cost) FROM orders WHERE product_id = 1"
25 |   val root1 = QueryParser.parseToRelTree(query1, database)
26 |
27 |   val query2 = "SELECT AVG(order_cost) FROM orders"
28 |   val root2 = QueryParser.parseToRelTree(query2, database)
29 |
30 |   // The accountant accumulates the cost of every mechanism executed through it.
31 |   val accountant = new EpsilonCompositionAccountant()
32 |
33 |   // Both mechanisms use epsilon = 1.0 and bounds 0 and 10.
34 |   val r1 = new LaplaceMechClipping(1.0, 0, 10, root1, config).execute(accountant)
35 |   val r2 = new AverageMechClipping(1.0, 0, 10, root2, config).execute(accountant)
36 |
37 |   println(s"Sum query result: $r1")
38 |   println(s"Average query result: $r2")
39 |
40 |   println(s"Total privacy cost: ${accountant.getTotalCost()}")
41 | }
42 |
--------------------------------------------------------------------------------
/src/test/scala/chorus/ChorusTests.scala:
--------------------------------------------------------------------------------
1 | package chorus
2 |
3 | import chorus.rewriting.differential_privacy.{ElasticSensitivityConfig, ElasticSensitivityRewriter}
4 | import chorus.schema.Schema
5 | import chorus.sql.QueryParser
6 | import junit.framework.TestCase
7 | import chorus.analysis.differential_privacy.GlobalSensitivityAnalysis
8 | import
chorus.rewriting.RewriterConfig
9 | import chorus.sql.relational_algebra.Relation
10 | import chorus.rewriting.differential_privacy.ClippingRewriter
11 | import chorus.exception.{UnsupportedConstructException, UnsupportedQueryException}
12 |
13 | /** Checks the global sensitivities computed for clipped queries against the "test" database. */
14 | class ChorusTests extends TestCase {
15 |   val database = Schema.getDatabase("test")
16 |   val config = new RewriterConfig(database)
17 |
18 |   /** Parses `query`, clips it with `ClippingRewriter(config, 0, 10)` and asserts that the
19 |     * per-column sensitivities of the clipped query equal `expected`.
20 |     */
21 |   def checkResult(query: String, expected: Seq[Double]): Unit = {
22 |     val root = QueryParser.parseToRelTree(query, database)
23 |     val clippedQuery = new ClippingRewriter(config, 0, 10).run(root).root
24 |
25 |     val sensitivities = getSensitivities(clippedQuery)
26 |     // JUnit expects (expected, actual); the swapped order produced misleading failure messages.
27 |     TestCase.assertEquals(expected, sensitivities)
28 |   }
29 |
30 |   /** Per-column sensitivities of `query` according to `GlobalSensitivityAnalysis`. */
31 |   def getSensitivities(query: Relation): Seq[Double] =
32 |     new GlobalSensitivityAnalysis().run(query, database).colFacts.map(_.sensitivity.get)
33 |
34 |
35 |   def testMathExpressions(): Unit = {
36 |     val query1 = "SELECT SUM(order_cost) FROM orders WHERE product_id = 1"
37 |     checkResult(query1, List(10.0))
38 |
39 |     // Division by zero: passes either with sensitivity 0 or with an UnsupportedConstructException.
40 |     val query2 = "SELECT SUM(order_cost/0) FROM orders WHERE product_id = 1"
41 |     try {
42 |       checkResult(query2, List(0))
43 |     } catch {
44 |       case e: UnsupportedConstructException => TestCase.assertTrue(true)
45 |     }
46 |
47 |   }
48 | }
49 |
--------------------------------------------------------------------------------
/src/main/scala/chorus/util/DB.scala:
--------------------------------------------------------------------------------
1 | package chorus.util
2 |
3 | import java.sql.DriverManager
4 | import java.sql.Connection
5 | import scala.collection.mutable.MutableList
6 |
7 | import chorus.sql.relational_algebra.{RelUtils, Relation}
8 | import chorus.schema.Database
9 |
10 | /** Minimal JDBC helper that executes relational-algebra queries and returns string rows. */
11 | object DB {
12 |   /** Connection shared by all calls to `execute`; stays null until the first real query opens it. */
13 |   var connection: Connection = null
14 |
15 |   /** One result row, holding every column as the string returned by `ResultSet.getString`. */
16 |   case class Row(vals: List[String])
17 |
18 |   /** Executes `q` and returns all of its rows.
19 |     *
20 |     * When the system property `db.use_dummy_database` is "true", no database is contacted and
21 |     * the single row `Row(List("1"))` is returned. Otherwise the query is translated with
22 |     * `RelUtils.relToSql` for `database.dialect` and run on the shared connection, which is
23 |     * opened on first use. The statement and result set are closed before returning.
24 |     *
25 |     * @throws IllegalStateException if a connection is needed but `db.driver` or `db.url` is unset
26 |     */
27 |   def execute(q: Relation, database: Database): List[Row] = {
28 |     // Scala's == is null-safe, so an unset property simply compares unequal.
29 |     if (System.getProperty("db.use_dummy_database") == "true") {
30 |       List(Row(List("1")))
31 |     } else {
32 |       if (connection == null)
33 |         connection = openConnection()
34 |
35 |       val sqlQuery = RelUtils.relToSql(q, database.dialect)
36 |
37 |       val statement = connection.createStatement()
38 |       try {
39 |         val resultSet = statement.executeQuery(sqlQuery)
40 |         try {
41 |           val cols = resultSet.getMetaData().getColumnCount()
42 |           val rows = List.newBuilder[Row]
43 |           while (resultSet.next()) {
44 |             // JDBC columns are 1-indexed.
45 |             rows += Row((1 to cols).map(resultSet.getString(_)).toList)
46 |           }
47 |           rows.result()
48 |         } finally {
49 |           resultSet.close()
50 |         }
51 |       } finally {
52 |         statement.close()
53 |       }
54 |     }
55 |   }
56 |
57 |   /** Loads the JDBC driver named by `db.driver` and connects to `db.url` with
58 |     * `db.username` / `db.password` (all system properties).
59 |     *
60 |     * Failures propagate to the caller. Previously they were printed and followed by a
61 |     * NullPointerException on the still-null connection.
62 |     */
63 |   private def openConnection(): Connection = {
64 |     val driver = System.getProperty("db.driver")
65 |     val url = System.getProperty("db.url")
66 |     if (driver == null || url == null)
67 |       throw new IllegalStateException(
68 |         "system properties db.driver and db.url must be set to query a real database")
69 |
70 |     Class.forName(driver)
71 |     DriverManager.getConnection(url, System.getProperty("db.username"), System.getProperty("db.password"))
72 |   }
73 | }
74 |
--------------------------------------------------------------------------------
/src/main/scala/chorus/dataflow/AggFunctions.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2017 Uber Technologies, Inc.
3 |  *
4 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
5 |  * of this software and associated documentation files (the "Software"), to deal
6 |  * in the Software without restriction, including without limitation the rights
7 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 |  * copies of the Software, and to permit persons to whom the Software is
9 |  * furnished to do so, subject to the following conditions:
10 |  *
11 |  * The above copyright notice and this permission notice shall be included in
12 |  * all copies or substantial portions of the Software.
13 |  *
14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 |  * THE SOFTWARE.
21 |  */
22 |
23 | package chorus.dataflow
24 |
25 | /** Enumeration of SQL aggregation functions.
26 |   *
27 |   * Every member is a case object extending the sealed base class `AggFunction`,
28 |   * so matches over `AggFunction` can be checked for exhaustiveness.
29 |   */
30 | object AggFunctions {
31 |   /** Common supertype of all aggregation functions declared in this object. */
32 |   sealed abstract class AggFunction
33 |
34 |   case object COUNT extends AggFunction
35 |   case object SUM extends AggFunction
36 |   case object MIN extends AggFunction
37 |   case object MAX extends AggFunction
38 |   case object AVG extends AggFunction
39 |
40 |   case object VAR extends AggFunction
41 |   case object STDDEV extends AggFunction
42 |
43 |   case object SINGLE_VALUE extends AggFunction
44 | }
45 |
--------------------------------------------------------------------------------
/src/main/scala/chorus/sql/dataflow_graph/relation/Except.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2017 Uber Technologies, Inc.
3 |  *
4 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
5 |  * of this software and associated documentation files (the "Software"), to deal
6 |  * in the Software without restriction, including without limitation the rights
7 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 |  * copies of the Software, and to permit persons to whom the Software is
9 |  * furnished to do so, subject to the following conditions:
10 |  *
11 |  * The above copyright notice and this permission notice shall be included in
12 |  * all copies or substantial portions of the Software.
13 |  *
14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 |  * THE SOFTWARE.
21 |  */
22 |
23 | package chorus.sql.dataflow_graph.relation
24 |
25 | import com.facebook.presto.sql.tree.{Node => PrestoNode}
26 |
27 | /** A relation created from SQL's EXCEPT clause.
28 |   *
29 |   * Its column names are those of `left`; its children are `left` followed by `right`.
30 |   */
31 | case class Except(left: Relation, right: Relation)(implicit override val prestoSource: Option[PrestoNode] = None)
32 |   extends Relation(left.columnNames, prestoSource) {
33 |
34 |   override val children = left :: right :: Nil
35 |
36 |   // The node label carries its own double quotes.
37 |   override val nodeStr: String = "\"EXCEPT\""
38 |
39 |   override def toString: String = "EXCEPT"
40 | }
41 |
--------------------------------------------------------------------------------
/src/main/scala/chorus/sql/dataflow_graph/relation/Union.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2017 Uber Technologies, Inc.
3 |  *
4 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
5 |  * of this software and associated documentation files (the "Software"), to deal
6 |  * in the Software without restriction, including without limitation the rights
7 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 |  * copies of the Software, and to permit persons to whom the Software is
9 |  * furnished to do so, subject to the following conditions:
10 |  *
11 |  * The above copyright notice and this permission notice shall be included in
12 |  * all copies or substantial portions of the Software.
13 |  *
14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 |  * THE SOFTWARE.
21 |  */
22 |
23 | package chorus.sql.dataflow_graph.relation
24 |
25 | import com.facebook.presto.sql.tree.{Node => PrestoNode}
26 |
27 | /** A relation created from the UNION of two or more relations.
28 |   *
29 |   * Column names are taken from the first relation, so `relations` must be non-empty
30 |   * (`relations.head` throws otherwise). The children are `relations` themselves.
31 |   */
32 | case class Union(relations: List[Relation])(implicit override val prestoSource: Option[PrestoNode] = None)
33 |   extends Relation(relations.head.columnNames, prestoSource) {
34 |
35 |   override val children = relations
36 |
37 |   // The node label carries its own double quotes.
38 |   override val nodeStr: String = "\"UNION\""
39 |
40 |   override def toString: String = "UNION"
41 | }
42 |
--------------------------------------------------------------------------------
/src/main/scala/chorus/exception/AnalysisException.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2017 Uber Technologies, Inc.
3 |  *
4 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
5 |  * of this software and associated documentation files (the "Software"), to deal
6 |  * in the Software without restriction, including without limitation the rights
7 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 |  * copies of the Software, and to permit persons to whom the Software is
9 |  * furnished to do so, subject to the following conditions:
10 |  *
11 |  * The above copyright notice and this permission notice shall be included in
12 |  * all copies or substantial portions of the Software.
13 |  *
14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 |  * THE SOFTWARE.
21 |  */
22 |
23 | package chorus.exception
24 |
25 | /** Exception encountered during analysis (as opposed to parsing, tree transformation, etc.)
26 |   *
27 |   * @param msg the message; passed to `RuntimeException` and also exposed as a public val
28 |   */
29 | class AnalysisException(val msg: String) extends RuntimeException(msg)
30 |
31 | /** Indicates that an analysis does not support a given query. */
32 | class UnsupportedQueryException(message: String) extends AnalysisException(message)
33 |
34 | /** Indicates that an analysis does not support a specific construct used in the query. */
35 | class UnsupportedConstructException(message: String) extends UnsupportedQueryException(message)
36 |
--------------------------------------------------------------------------------
/src/main/scala/chorus/sql/dataflow_graph/DataflowGraphFunctions.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2017 Uber Technologies, Inc.
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 
21 |  */
22 |
23 | package chorus.sql.dataflow_graph
24 |
25 | import chorus.schema.Database
26 | import chorus.sql.{AbstractAnalysis, QueryParser, TreeFunctions, TreePrinter}
27 |
28 | /** [[TreeFunctions]] implementation for dataflow-graph nodes.
29 |   *
30 |   * The self-type requires mixing into an `AbstractAnalysis[Node, _]`; `printTree` passes
31 |   * `resultMap` and `currentNode`, which are not defined in this trait, to the tree printer.
32 |   */
33 | trait DataflowGraphFunctions extends TreeFunctions[Node] {
34 |   this: AbstractAnalysis[Node, _] =>
35 |
36 |   /** The children of a dataflow-graph node are its `children` member. */
37 |   override def getNodeChildren(node: Node): Iterable[Node] = node.children
38 |
39 |   /** Delegates parsing to `QueryParser.parseToDataflowGraph`. */
40 |   override def parseQueryToTree(query: String, database: Database): Node =
41 |     QueryParser.parseToDataflowGraph(query, database)
42 |
43 |   /** Prints the tree rooted at `node` through `TreePrinter`. */
44 |   override def printTree(node: Node): Unit =
45 |     TreePrinter.printTree(node, resultMap, currentNode)
46 | }
47 |
--------------------------------------------------------------------------------
/src/main/scala/chorus/analysis/columns_used/ColumnsUsedAnalysis.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2017 Uber Technologies, Inc.
3 |  *
4 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
5 |  * of this software and associated documentation files (the "Software"), to deal
6 |  * in the Software without restriction, including without limitation the rights
7 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 |  * copies of the Software, and to permit persons to whom the Software is
9 |  * furnished to do so, subject to the following conditions:
10 |  *
11 |  * The above copyright notice and this permission notice shall be included in
12 |  * all copies or substantial portions of the Software.
13 |  *
14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
17 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 |  * THE SOFTWARE.
21 |  */
22 |
23 | package chorus.analysis.columns_used
24 | import chorus.dataflow.column.DataflowGraphColumnAnalysis
25 | import chorus.dataflow.domain.SetDomain
26 | import chorus.sql.dataflow_graph.relation.DataTable
27 |
28 | /** Returns a set of all data table columns influencing each output column.
29 |   *
30 |   * Facts are sets of qualified column names of the form `table.column`.
31 |   */
32 | class ColumnsUsedAnalysis extends DataflowGraphColumnAnalysis(new SetDomain[String]) {
33 |   /** Adds the qualified name of column `idx` of data table `d` to the incoming fact. */
34 |   override def transferDataTable(d: DataTable, idx: Int, fact: Set[String]): Set[String] =
35 |     fact + s"${d.name}.${d.getColumnName(idx)}"
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/scala/chorus/sql/TreeFunctions.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2017 Uber Technologies, Inc.
3 |  *
4 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
5 |  * of this software and associated documentation files (the "Software"), to deal
6 |  * in the Software without restriction, including without limitation the rights
7 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 |  * copies of the Software, and to permit persons to whom the Software is
9 |  * furnished to do so, subject to the following conditions:
10 |  *
11 |  * The above copyright notice and this permission notice shall be included in
12 |  * all copies or substantial portions of the Software.
13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.sql 24 | 25 | import chorus.schema.Database 26 | 27 | /** Common trait for all query representations (AST, dataflow graph, and relational algebra tree). 28 | * 29 | * @tparam N Node type of tree. 30 | */ 31 | abstract trait TreeFunctions[N] { 32 | /** Returns the children for the given node. 33 | */ 34 | def getNodeChildren(node: N): Iterable[N] 35 | 36 | /** Parses and converts the given SQL string query to this tree type. 37 | */ 38 | def parseQueryToTree(query: String, database: Database): N 39 | 40 | /** Prints the tree for debugging. 41 | */ 42 | def printTree(node: N): Unit 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/chorus/rewriting/coverage/CoverageRewriter.scala: -------------------------------------------------------------------------------- 1 | package chorus.rewriting.coverage 2 | 3 | import chorus.rewriting.rules.ColumnDefinition._ 4 | import chorus.rewriting.rules.Expr._ 5 | import chorus.rewriting.rules.Operations._ 6 | import chorus.rewriting.{Rewriter, RewriterConfig} 7 | import chorus.sql.relational_algebra.{RelUtils, Relation} 8 | import org.apache.calcite.rel.logical.{LogicalAggregate, LogicalSort} 9 | import org.apache.calcite.rel.rules.FilterProjectTransposeRule 10 | 11 | /** 12 | * Rewriter that calculates coverage of aggregation queries. 
13 | */ 14 | class CoverageRewriter(config: RewriterConfig) extends Rewriter(config) { 15 | override def rewrite(root: Relation): Relation = { 16 | /** Find first aggregation node (strip away projections and other post-processing of aggregation column). */ 17 | val rootAggNode = root.collectFirst{ case Relation(l: LogicalAggregate) => l }.get 18 | val groupedColumns = RelUtils.getGroupedCols(rootAggNode) 19 | 20 | /** Replace aggregation with a count-histogram, grouping by the same bins of original aggregation. */ 21 | val coverageRelation = Relation(rootAggNode.getInput) 22 | .agg (groupedColumns: _*) (Count(*) AS "coverage") 23 | .optimize(FilterProjectTransposeRule.INSTANCE) 24 | 25 | /** Reconstruct sort node, if present in original query, to preserve ORDER BY and LIMIT clauses. */ 26 | val newRoot = root.unwrap match { 27 | case l: LogicalSort => Relation(LogicalSort.create(coverageRelation, l.getCollation, l.offset, l.fetch)) 28 | case _ => coverageRelation 29 | } 30 | 31 | /** For histogram queries, compute median coverage across all bins */ 32 | val result = if (groupedColumns.nonEmpty) 33 | newRoot.asAlias("_count") 34 | .project(Median(col("coverage")) AS "coverage") 35 | .fetch(1) 36 | else 37 | newRoot 38 | 39 | result 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/chorus/mechanisms/PrivacyAccountants.scala: -------------------------------------------------------------------------------- 1 | package chorus.mechanisms 2 | 3 | import scala.collection.mutable.MutableList 4 | 5 | trait PrivacyCost { 6 | def +(other: PrivacyCost): PrivacyCost 7 | } 8 | 9 | case class EpsilonDPCost(epsilon: Double) extends PrivacyCost { 10 | def +(other: PrivacyCost) = other match { 11 | case EpsilonDPCost(otherEpsilon) => EpsilonDPCost(epsilon + otherEpsilon) 12 | } 13 | } 14 | 15 | case class RenyiDPCost(alpha: Int, epsilon: Double) extends PrivacyCost { 16 | def +(other: PrivacyCost) = other match { 17 | 
case RenyiDPCost(otherAlpha, otherEpsilon) => 18 | RenyiDPCost(math.max(alpha, otherAlpha), epsilon + otherEpsilon) 19 | } 20 | } 21 | 22 | case class EpsilonDeltaDPCost(epsilon: Double, delta: Double) extends PrivacyCost { 23 | def +(other: PrivacyCost) = other match { 24 | case EpsilonDPCost(otherEpsilon) => EpsilonDeltaDPCost(epsilon + otherEpsilon, delta) 25 | case EpsilonDeltaDPCost(otherEpsilon, otherDelta) => 26 | EpsilonDeltaDPCost(epsilon + otherEpsilon, delta + otherDelta) 27 | } 28 | } 29 | 30 | 31 | abstract class PrivacyAccountant { 32 | val costs: MutableList[PrivacyCost] = MutableList() 33 | 34 | def getTotalCost(): PrivacyCost 35 | 36 | def addCost(c: PrivacyCost) = costs += c 37 | } 38 | 39 | class EpsilonCompositionAccountant extends PrivacyAccountant { 40 | def getTotalCost() = costs.fold(EpsilonDPCost(0))(_ + _) 41 | } 42 | 43 | class RenyiCompositionAccountant extends PrivacyAccountant { 44 | def getTotalCost() = costs.fold(RenyiDPCost(0, 0))(_ + _) 45 | } 46 | 47 | 48 | class AdvancedCompositionAccountant(delta: Double) extends PrivacyAccountant { 49 | def getTotalCost() = { 50 | val epsilons: Seq[Double] = costs.map { case EpsilonDPCost(eps) => eps } 51 | val totalEpsilon = 2*(epsilons.max)*math.sqrt(2*(epsilons.length)*math.log(1/delta)) 52 | EpsilonDeltaDPCost(totalEpsilon, delta) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/chorus/rewriting/differential_privacy/ClippingRewriter.scala: -------------------------------------------------------------------------------- 1 | package chorus.rewriting.differential_privacy 2 | 3 | import chorus.rewriting.rules.ValueExpr 4 | import chorus.rewriting.{RewriterConfig, DPRewriterConfig, DPUtil, Rewriter} 5 | import chorus.exception.UnsupportedQueryException 6 | import chorus.sql.relational_algebra.{RelUtils, Relation} 7 | 8 | import chorus.rewriting.rules.ColumnDefinition._ 9 | import chorus.rewriting.rules.Operations._ 10 | import 
chorus.rewriting.rules.ValueExpr 11 | import chorus.rewriting.rules.Expr._ 12 | 13 | import chorus.dataflow.domain.UnitDomain 14 | 15 | import org.apache.calcite.rel.core.Aggregate 16 | import org.apache.calcite.sql.fun.SqlSumAggFunction 17 | 18 | class ClippingRewriter[C <: RewriterConfig](config: C, l: Double, u: Double) 19 | extends Rewriter(config) { 20 | val lowerBound = l 21 | val upperBound = u 22 | 23 | def clamp(expr: ValueExpr, min: ValueExpr, max: ValueExpr): ValueExpr = 24 | Case(expr < min, min, Case(expr > max, max, expr)) 25 | 26 | def rewrite(root: Relation): Relation = { 27 | root.rewriteRecursive(UnitDomain) { (node, orig, _) => 28 | 29 | node match { 30 | case Relation(a: Aggregate) => { 31 | val groupedCols = RelUtils.getGroupedCols(a).map {c => c.idx} 32 | 33 | val newR = Relation(a.getInput).mapCols { colDef => 34 | if (groupedCols contains colDef.idx) 35 | col(colDef) AS colDef.alias 36 | else 37 | clamp(col(colDef), lowerBound, upperBound) AS colDef.alias 38 | } 39 | 40 | val finalResult = Relation(a).replaceInputs { rs => 41 | if (rs.length > 1) 42 | throw new UnsupportedQueryException("This rewriter only works on single-table queries") 43 | List(newR) 44 | } 45 | 46 | (finalResult, ()) 47 | } 48 | case _ => (node, ()) 49 | } 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/chorus/dataflow/node/DFGVisitorAnalysis.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.dataflow.node 24 | 25 | import chorus.sql.AbstractAnalysis 26 | import chorus.sql.dataflow_graph.{DataflowGraphFunctions, Node} 27 | 28 | /** Interface for simple DFG analyses that don't require dataflow tracking (e.g., visitor analyses). 29 | * 30 | * @tparam T The result type 31 | */ 32 | trait DFGVisitorAnalysis[T <: Any] extends AbstractAnalysis[Node,T] with DataflowGraphFunctions { 33 | // Handle book keeping for abstract analysis 34 | final override def process(root: Node): Unit = resultMap += (root -> run(root)) 35 | 36 | /** The only method that needs to be implemented by subclasses. Runs the analysis on the given tree and returns 37 | * the result. 
38 | */ 39 | def run(root: Node): T 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/chorus/dataflow/domain/Collection.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.dataflow.domain 24 | 25 | /** An abstract domain that implements a Map of facts, with join defined as map union. If A and B have intersecting 26 | * keys, behavior is undefined (only one value for each key is retained). 
27 | */ 28 | class MapDomain[K,V] extends AbstractDomain[Map[K,V]] { 29 | override val bottom: Map[K,V] = Map.empty 30 | override def leastUpperBound(first: Map[K, V], second: Map[K, V]): Map[K, V] = first ++ second 31 | } 32 | 33 | /** An abstract domain that implements a Set of facts, with leastUpperBound defined as set union. 34 | */ 35 | class SetDomain[T] extends AbstractDomain[Set[T]] { 36 | override val bottom: Set[T] = Set.empty 37 | override def leastUpperBound(first: Set[T], second: Set[T]): Set[T] = first ++ second 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/chorus/exception/DPException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 
21 | */ 22 | 23 | package chorus.exception 24 | 25 | /** This exception type is raised for any exceptional condition encountered during end-to-end analysis of a query (i.e., 26 | * by calls to AbstractAnalysis.analyzeQuery()). This includes parsing exceptions, tree transformation exceptions, and 27 | * analysis runtime errors. 28 | * 29 | * This is a checked exception, requiring callers to explicitly handle errors. Internal code may throw any of 30 | * the unchecked error types defined in [[chorus.exception]]. All public interfaces to this tool should 31 | * catch internal errors and wrap with this exception type. Callers can use the getCause() method to retrieve details 32 | * about the underlying exception. 33 | */ 34 | class DPException(val message: String, val cause: Throwable) extends Exception(message, cause) 35 | -------------------------------------------------------------------------------- /src/main/scala/chorus/sql/dataflow_graph/DataflowGraphUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.sql.dataflow_graph 24 | 25 | import chorus.sql.dataflow_graph.reference.{ColumnReference, Function} 26 | import chorus.sql.dataflow_graph.relation.Join 27 | 28 | object DataflowGraphUtils { 29 | /** Extracts the left and right column indexes, respectively, used in an equijoin condition, or None if 30 | * the join node uses any other type of join condition (including an empty join condition). 31 | */ 32 | def extractEquiJoinColumns(node: Join): Option[(Int,Int)] = { 33 | node.condition.collect { 34 | case Function("EQUAL", ColumnReference(leftIdx, node.left) :: ColumnReference(rightIdx, node.right) :: Nil) => (leftIdx, rightIdx) 35 | case Function("EQUAL", ColumnReference(rightIdx, node.right) :: ColumnReference(leftIdx, node.left) :: Nil) => (leftIdx, rightIdx) 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/chorus/dataflow/node/ASTDataflowAnalysis.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.dataflow.node 24 | 25 | import com.facebook.presto.sql.tree.Node 26 | import chorus.dataflow.AbstractDataflowAnalysis 27 | import chorus.dataflow.domain.AbstractDomain 28 | import chorus.sql.ast.ASTFunctions 29 | 30 | /** Dataflow analysis on AST nodes. For more information see [[AbstractDataflowAnalysis]]. 
31 | */ 32 | abstract class ASTDataflowAnalysis[E, T <: AbstractDomain[E]](domain: AbstractDomain[E]) 33 | extends AbstractDataflowAnalysis[Node, E] 34 | with ASTFunctions { 35 | 36 | override def joinNode(node: Node, children: Iterable[Node]): E = { 37 | if (children.isEmpty) 38 | domain.bottom 39 | else if (children.size == 1) 40 | resultMap(children.head) 41 | else 42 | children.map{ resultMap(_) }.reduce { (first, second) => domain.leastUpperBound(first, second) } 43 | } 44 | 45 | } 46 | 47 | -------------------------------------------------------------------------------- /src/main/scala/chorus/analysis/taint/TaintAnalysis.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 
21 | */ 22 | 23 | package chorus.analysis.taint 24 | 25 | import chorus.dataflow.column.{NodeColumnFacts, RelNodeColumnAnalysis} 26 | import chorus.dataflow.domain.{BooleanDomain, UnitDomain} 27 | import chorus.sql.relational_algebra.RelUtils 28 | import org.apache.calcite.rel.core.TableScan 29 | 30 | /** Returns true for each output column that is derived from a column marked as tainted (isTainted=true in the schema config). Table columns for which the isTainted property lookup yields no value default to false (untainted). 31 | */ 32 | class TaintAnalysis extends RelNodeColumnAnalysis(UnitDomain, BooleanDomain) { 33 | 34 | override def transferTableScan(node: TableScan, state: NodeColumnFacts[Unit, Boolean]) = NodeColumnFacts( 35 | UnitDomain.bottom, 36 | state.colFacts.zipWithIndex.map { case (colState, idx) => 37 | val isTainted = RelUtils.getColumnProperty[Boolean]("isTainted", node, idx, this.getDatabase).getOrElse(false) 38 | isTainted 39 | }) 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/chorus/sql/dataflow_graph/reference/Reference.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software.
13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.sql.dataflow_graph.reference 24 | 25 | import com.facebook.presto.sql.tree.{Node => PrestoNode} 26 | import chorus.sql.dataflow_graph.Node 27 | 28 | /** Generic parent class for Reference nodes. 29 | * 30 | * Conceptually, a reference node captures a specific and well-defined data dependence into a relation, either by 31 | * direct column reference, e.g., "SELECT a.x from blah", or function application, e.g., "SELECT count(*) from blah". 32 | * In both examples, the part immediately after the SELECT is represented by a specific subclass of this class 33 | * which knows that it is executed w.r.t. relation "blah". For functions, this is tracked by the 'args' field; for 34 | * ColumnReference, it's tracked by the 'of' field. 35 | */ 36 | abstract class Reference(override val prestoSource: Option[PrestoNode]) extends Node(prestoSource) 37 | -------------------------------------------------------------------------------- /src/main/scala/chorus/rewriting/differential_privacy/ElasticSensitivityRewriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.rewriting.differential_privacy 24 | 25 | import chorus.rewriting._ 26 | import chorus.schema.Database 27 | import chorus.sql.relational_algebra.Relation 28 | import chorus.util.ElasticSensitivity 29 | 30 | /** Rewriter that enforces differential privacy using Elastic Sensitivity. 
*/ 31 | class ElasticSensitivityRewriter(config: ElasticSensitivityConfig) extends SensitivityRewriter(config) { 32 | def getLaplaceNoiseScale(node: Relation, colIdx: Int): Double = 33 | 2 * ElasticSensitivity.smoothElasticSensitivity(node, config.database, colIdx, config.epsilon, config.delta) / config.epsilon 34 | } 35 | 36 | class ElasticSensitivityConfig( 37 | override val epsilon: Double, 38 | val delta: Double, 39 | override val database: Database, 40 | override val fillMissingBins: Boolean = true) 41 | extends DPRewriterConfig(epsilon, database, fillMissingBins) 42 | -------------------------------------------------------------------------------- /src/main/scala/chorus/dataflow/domain/Lattice.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 
21 | */ 22 | 23 | package chorus.dataflow.domain.lattice 24 | 25 | import chorus.dataflow.domain._ 26 | 27 | /** Models a flat lattice of a finite set of elements of type [E]: 28 | * 29 | * ⊤ 30 | * / | \ 31 | * / | \ 32 | * e1 e2 ... 33 | * \ | / 34 | * \ | / 35 | * ⊥ 36 | */ 37 | class FlatLatticeDomain[E] extends AbstractDomain[DomainElem[E]] { 38 | override val bottom: DomainElem[E] = Bottom 39 | override def leastUpperBound(first: DomainElem[E], second: DomainElem[E]): DomainElem[E] = FlatLatticeDomain.leastUpperBound(first, second) 40 | } 41 | 42 | object FlatLatticeDomain { 43 | def bottom[E]: DomainElem[E] = Bottom 44 | 45 | def leastUpperBound[E](first: DomainElem[E], second: DomainElem[E]): DomainElem[E] = { 46 | (first, second) match { 47 | case (Top, _) | (_, Top) => Top 48 | case (Bottom, _) => second 49 | case (_, Bottom) => first 50 | case (Mid(a), Mid(b)) => if (a == b) first else Top 51 | case _ => Top 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/chorus/sql/dataflow_graph/relation/DataTable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 
13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.sql.dataflow_graph.relation 24 | 25 | import com.facebook.presto.sql.tree.{Node => PrestoNode} 26 | import chorus.schema.{Database, Schema} 27 | 28 | /** A DataTable is a leaf node (it has no children) that represents a table in the database. 29 | */ 30 | case class DataTable( 31 | name: String, 32 | database: Database, 33 | override val columnNames: IndexedSeq[String]) 34 | (implicit override val prestoSource: Option[PrestoNode] = None) 35 | extends Relation(columnNames, prestoSource ) { 36 | 37 | override val children = Nil 38 | 39 | override val nodeStr: String = "\"" + name + "\"" 40 | 41 | /** Metadata properties (from the schema config file) for the columns in this table. 42 | */ 43 | lazy val colProperties: IndexedSeq[Map[String,String]] = { 44 | val colMap = Schema.getSchemaMapForTable(database, name) 45 | columnNames.map { colName => colMap.get(colName).fold(Map.empty[String,String])(_.properties) } 46 | } 47 | 48 | override def toString: String = name 49 | } 50 | 51 | -------------------------------------------------------------------------------- /src/main/scala/chorus/sql/dataflow_graph/reference/Function.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc.
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.sql.dataflow_graph.reference 24 | 25 | import com.facebook.presto.sql.tree.{Node => PrestoNode} 26 | 27 | /** Function: A SQL function application, e.g., COUNT(x). In the most general sense, this node type captures 28 | * any SQL construct that can be modeled as a function of one or more references (the references themselves being 29 | * perhaps subtrees).
30 | */ 31 | case class Function(functionName: String, args: List[Reference] = Nil)(implicit override val prestoSource: Option[PrestoNode] = None) 32 | extends Reference(prestoSource) { 33 | 34 | override val children = args 35 | 36 | override val nodeStr = functionName 37 | 38 | override def toString: String = functionName 39 | } 40 | 41 | object Function { 42 | def apply(functionName: String, arg: Reference)(implicit prestoSource: Option[PrestoNode]) = new Function(functionName, List(arg))(prestoSource) 43 | def apply(functionName: String, args: Reference*)(implicit prestoSource: Option[PrestoNode]) = new Function(functionName, args.toList)(prestoSource) 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/chorus/mechanisms/AverageMechClipping.scala: -------------------------------------------------------------------------------- 
1 | package chorus.mechanisms 2 | 3 | 4 | import chorus.analysis.differential_privacy.GlobalSensitivityAnalysis 5 | import chorus.schema.Database 6 | import chorus.rewriting.RewriterConfig 7 | import chorus.rewriting.differential_privacy.ClippingRewriter 8 | import chorus.util.DB 9 | import chorus.sql.relational_algebra.{RelUtils, Relation} 10 | import chorus.dataflow.domain.UnitDomain 11 | import org.apache.calcite.rel.core.Aggregate 12 | import org.apache.calcite.sql.fun.SqlSumAggFunction 13 | import chorus.exception.UnsupportedQueryException 14 | 15 | import chorus.rewriting.rules.ColumnDefinition._ 16 | import chorus.rewriting.rules.Operations._ 17 | import chorus.rewriting.rules.ValueExpr 18 | import chorus.rewriting.rules.Expr._ 19 | 20 | /** Average mechanism with clipping: answers an average query as the quotient of a SUM query and a COUNT query, 21 | * each answered by its own [[LaplaceMechClipping]] run. 22 | * 23 | * @param epsilon total privacy budget; each of the two runs is given epsilon/2 24 | * @param l lower bound handed to LaplaceMechClipping 25 | * @param u upper bound handed to LaplaceMechClipping 26 | * @param root query whose aggregate is replaced by SUM and by COUNT 27 | * @param config rewriter configuration forwarded to LaplaceMechClipping 28 | */ 29 | class AverageMechClipping(epsilon: Double, l: Double, u: Double, 30 | root: Relation, config: RewriterConfig) 31 | extends ChorusMechanism[List[DB.Row]] { 32 | 33 | /** Rewrites `root`, replacing each Aggregate node matched by the traversal with a single `aggFn` aggregate over 34 | * the same input, grouped by the same columns and keeping the original output alias. 35 | * 36 | * @param aggFn "sum" or "count" 37 | * @throws IllegalArgumentException for any other `aggFn` 38 | */ 39 | def replaceAgg(root: Relation, aggFn: String) = { 40 | root.rewriteRecursive(UnitDomain) { (node, orig, _) => 41 | node match { 42 | case Relation(a: Aggregate) => { 43 | val groupedCols = RelUtils.getGroupedCols(a) 44 | 45 | /* Both names are read at index groupedCols.length, i.e. the field right after the grouping columns. 46 | * NOTE(review): presumably the node carries exactly one aggregate call -- confirm. */ 47 | val origAlias = a.getRowType.getFieldNames.get(groupedCols.length) 48 | val origCol = a.getInput().getRowType.getFieldNames.get(groupedCols.length) 49 | 50 | val finalAgg = aggFn match { 51 | case "sum" => Sum(col(origCol)) AS origAlias 52 | case "count" => Count(col(origCol)) AS origAlias 53 | /* Reject unknown names explicitly instead of failing with an opaque MatchError. */ 54 | case other => throw new IllegalArgumentException(s"Unsupported aggregation function: $other") 55 | } 56 | 57 | val newR = Relation(a.getInput).agg(groupedCols: _*)(finalAgg) 58 | 59 | (newR, ()) 60 | } 61 | case _ => (node, ()) 62 | } 63 | } 64 | } 65 | 66 | /** Runs the SUM and COUNT variants of the query with epsilon/2 each and divides the results. 67 | * 68 | * Rows and columns of the two results are paired positionally with `zip`. NOTE(review): every column is divided, 69 | * which presumably includes grouping-key columns when the query has a GROUP BY -- confirm. 70 | * 71 | * @return the quotient rows and the combined cost `c1 + c2` of the two runs 72 | */ 73 | def run() = { 74 | val sumQuery = replaceAgg(root, "sum") 75 | val countQuery = replaceAgg(root, "count") 76 | 77 | val (r1, c1) = new LaplaceMechClipping(epsilon/2.0, l, u, sumQuery, config).run() 78 | val (r2, c2) = new LaplaceMechClipping(epsilon/2.0, l, u, countQuery, config).run() 79 | 80 | val results = (r1 zip r2).map { 81 | case (DB.Row(vs1), DB.Row(vs2)) => DB.Row((vs1 zip vs2).map { 82 | case (n1, n2) => (n1.toDouble / n2.toDouble).toString 83 | }) 84 | } 85 | 86 | (results, c1 + c2) 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/main/scala/chorus/sql/dataflow_graph/reference/UnstructuredReference.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 
13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.sql.dataflow_graph.reference 24 | 25 | import com.facebook.presto.sql.tree.{Node => PrestoNode} 26 | import chorus.sql.dataflow_graph.Node 27 | 28 | /** An unstructured reference is a type of reference node that stores a relationship between children nodes without any 29 | * other semantic information. It is used as a "catch all" to represent SQL constructs that need no distinct 30 | * representation for our analyses but for which we wish to still capture a coarse data dependency between nodes. 
31 | */ 32 | case class UnstructuredReference(refType: String, refChildren: List[Node] = Nil)(implicit override val prestoSource: Option[PrestoNode] = None) 33 | extends Reference(prestoSource) { 34 | 35 | /* nodeStr and toString both report the reference type. */ 36 | override val nodeStr: String = refType 37 | 38 | /* The child nodes, exactly as passed in. */ 39 | override val children: List[Node] = refChildren 40 | 41 | override def toString: String = refType 42 | } 43 | 44 | object UnstructuredReference { 45 | /** Varargs convenience constructor: collects `children` into a List. */ 46 | def apply(refType: String, children: Node*)(implicit prestoSource: Option[PrestoNode]): UnstructuredReference = { 47 | val childList = children.toList 48 | new UnstructuredReference(refType, childList)(prestoSource) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | This repository contains the updated implementation of the Chorus 4 | system (version 0.1.3) for differential privacy, to accompany the 5 | following paper: 6 | 7 | - [**CHORUS: a Programming Framework for Building Scalable Differential 8 | Privacy Mechanisms.**](https://conferences.computer.org/eurosp/pdfs/EuroSP2020-2psedXWK6U4prXdo7t91Gm/508700a535/508700a535.pdf) Noah Johnson, Joseph P. Near, Joseph 9 | M. Hellerstein, Dawn Song. *EuroS&P 2020*. 10 | 11 | This is an updated release of Chorus (version 0.1.3). The original 12 | release of Chorus is available 13 | [here](https://github.com/uber-archive/sql-differential-privacy); see 14 | the original repository and the paper for more documentation. 15 | 16 | ## Building & Running 17 | 18 | This framework is written in Scala and built using Maven. The code has been tested on Mac OS X and Linux. To build the code: 19 | 20 | ``` 21 | $ mvn package 22 | ``` 23 | 24 | ## Running Examples 25 | 26 | The file `examples/MechanismExamples.scala` contains several examples 27 | from the paper. 
To run the examples, after building Chorus: 28 | 29 | ``` 30 | mvn exec:java -Dexec.mainClass="examples.MechanismExamples" 31 | ``` 32 | 33 | ## Potential Security Issues 34 | 35 | This repo contains a proof-of-concept implementation resulting from a 36 | research project; we did not attempt to mitigate issues resulting from 37 | floating-point arithmetic or side channels. In particular, the 38 | following known attacks might be possible, depending on the deployment 39 | scenario and DBMS you use: 40 | 41 | - [On significance of the least significant bits for differential 42 | privacy](https://dl.acm.org/doi/pdf/10.1145/2382196.2382264). Ilya 43 | Mironov, CCS 2012. 44 | - [Widespread Underestimation of Sensitivity in Differentially Private 45 | Libraries and How to Fix It](https://arxiv.org/abs/2207.10635). 46 | Casacuberta et al., preprint, July 2022. 47 | - [Side-Channel Attacks on Query-Based Data 48 | Anonymization](https://dl.acm.org/doi/pdf/10.1145/3460120.3484751). 49 | Boenisch et al., CCS 2021. 50 | 51 | Please don't use this code in production without considering these 52 | issues first. 53 | 54 | ## License 55 | 56 | This project is released under the MIT License. 57 | 58 | ## Contact Information 59 | 60 | This code is maintained by [Joe Near](http://www.uvm.edu/~jnear/). 61 | -------------------------------------------------------------------------------- /src/main/scala/chorus/sql/dataflow_graph/relation/Select.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.sql.dataflow_graph.relation 24 | 25 | import com.facebook.presto.sql.tree.{SingleColumn, Node => PrestoNode} 26 | import chorus.sql.dataflow_graph.reference.Reference 27 | 28 | /** Select: A relation created with SQL's SELECT. Note that in dataflow graphs, a select node has no "from" field, 29 | * as this information is explicitly encoded inside each column reference, which maintains a pointer to the node 30 | * to which it refers. 
31 | */ 32 | case class Select( 33 | items: List[SelectItem], 34 | where: Option[Reference] = None, 35 | groupBy: List[Int] = Nil) 36 | (implicit override val prestoSource: Option[PrestoNode] = None) 37 | extends Relation(items.map(item => item.as).toIndexedSeq, prestoSource) { 38 | 39 | /* Children, in order: the reference of every item, the WHERE reference when present, and the reference of 40 | * each item whose position is listed in groupBy. */ 41 | override val children: List[Reference] = { 42 | val itemRefs = items.map(item => item.ref) 43 | val whereRefs = where.toList 44 | val groupRefs = groupBy.map(position => items(position).ref) 45 | itemRefs ++ whereRefs ++ groupRefs 46 | } 47 | 48 | override val nodeStr: String = "" 49 | 50 | override def toString: String = items.toString 51 | } 52 | 53 | /** A selection of a single column. 54 | */ 55 | case class SelectItem(as: String, ref: Reference, prestoSource: Option[SingleColumn] = None) 56 | -------------------------------------------------------------------------------- /src/main/scala/chorus/dataflow/domain/AbstractDomain.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.dataflow.domain 24 | 25 | /** Models a domain lattice whose elements are represented by type E and whose partial order is defined (implicitly) by the 26 | * leastUpperBound method. 27 | * 28 | * This is the common interface for abstract domains, which store a particular type of dataflow fact for an analysis. 29 | * A dataflow analysis updates this abstract state by modeling the semantics of nodes in the tree with respect to the 30 | * domain of choice. 31 | * 32 | * Each abstract domain must implement a leastUpperBound operation that computes (or at minimum, over-approximates) the 33 | * lowest domain element that is greater than or equal to both input elements according to the domain's partial order. This method 34 | * is the means by which the analysis framework conservatively combines multiple states at branches in the tree. 35 | * @tparam E the type of the domain's elements */ 36 | trait AbstractDomain[E] { 37 | /** The bottom element for this domain. 38 | */ 39 | val bottom: E 40 | 41 | /** The least upper bound of elements a and b as defined by the partial order of this abstract domain. 42 | */ 43 | def leastUpperBound(a: E, b: E): E 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/chorus/dataflow/domain/Basic.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.dataflow.domain 24 | 25 | /** An abstract domain with just top and bottom values: 26 | * 27 | * ⊤ (true) 28 | * | 29 | * ⊥ (false) 30 | */ 31 | object BooleanDomain extends AbstractDomain[Boolean] { 32 | override val bottom: Boolean = false 33 | 34 | /* The join is ⊤ as soon as either operand is ⊤. */ 35 | override def leastUpperBound(first: Boolean, second: Boolean): Boolean = 36 | if (first) true else second 37 | } 38 | 39 | /** An abstract domain representing an optional *fixed* value, where bottom is None. This lattice has no top element; 40 | * only one element value may be stored. 41 | */ 42 | class OptionDomain[T] extends AbstractDomain[Option[T]] { 43 | override val bottom: Option[T] = None 44 | 45 | /* Joining is only defined for equal operands; any mismatch -- including None joined with a defined value -- is 46 | * reported as an error. */ 47 | override def leastUpperBound(first: Option[T], second: Option[T]): Option[T] = { 48 | if (!first.equals(second)) 49 | throw new java.util.NoSuchElementException("OptionDomain.leastUpperBound with different element values") 50 | first 51 | } 52 | } 53 | 54 | /** The void domain, storing nothing */ 55 | object UnitDomain extends AbstractDomain[Unit] { 56 | override val bottom: Unit = () 57 | override def leastUpperBound(a: Unit, b: Unit): Unit = () 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/chorus/rewriting/differential_privacy/SensitivityRewriter.scala: -------------------------------------------------------------------------------- 
1 | package chorus.rewriting.differential_privacy 2 | 3 | import chorus.analysis.histogram.HistogramAnalysis 4 | import chorus.dataflow.domain.UnitDomain 5 | import chorus.rewriting.rules.ColumnDefinition._ 6 | import chorus.rewriting.rules.Operations._ 7 | import chorus.rewriting.{DPRewriterConfig, DPUtil, Rewriter} 8 | import chorus.sql.relational_algebra.Relation 9 | import org.apache.calcite.rel.core.Aggregate 10 | 11 | 12 | /** Parent class for sensitivity-based mechanisms, which add Laplace noise scaled to each output column's sensitivity. 13 | * Each mechanism has a specific way of computing the scale of this noise for its supported class of queries. 14 | * 15 | * See [ElasticSensitivityRewriter] and [RestrictedSensitivityRewriter]. 16 | */ 17 | abstract class SensitivityRewriter[C <: DPRewriterConfig](config: C) extends Rewriter(config) { 18 | 19 | /** Returns the scale of Laplace noise required for the given column as defined by the mechanism. Implemented by subclasses. 
*/ 20 | def getLaplaceNoiseScale(node: Relation, colIdx: Int): Double 21 | /** Rewrites every Aggregate node matched while traversing `root`. When the aggregate has grouping columns (getGroupCount > 0), bins are first added through DPUtil.addBinsFromDomain using the HistogramAnalysis column facts. Each column whose noise scale is non-zero is then replaced by `expr + scale * LaplaceSample` under its original alias; columns with scale 0 and all non-Aggregate nodes are returned unchanged. */ 22 | def rewrite(root: Relation): Relation = { 23 | root.rewriteRecursive(UnitDomain) { (node, orig, _) => 24 | node match { 25 | case Relation(a: Aggregate) => 26 | // For histogram queries, ensure all values from domain appear in result set 27 | val withFilledBins = 28 | if (a.getGroupCount > 0) { 29 | val histogramResults = new HistogramAnalysis().run(node, config.database).colFacts 30 | DPUtil.addBinsFromDomain(a, histogramResults, config) 31 | } 32 | else Relation(a) 33 | 34 | val result = withFilledBins.mapCols { col => 35 | // Compute the scale of Laplace noise for the column. 36 | val laplaceNoiseScale = getLaplaceNoiseScale(node, col.idx) 37 | 38 | if (laplaceNoiseScale == 0) 39 | // No noise added to histogram bins that are marked safe for release. 40 | col 41 | else { 42 | // Rewrite the column expression to add scaled Laplace noise. 43 | val noiseExpr = laplaceNoiseScale * DPUtil.LaplaceSample 44 | (col.expr + noiseExpr) AS col.alias 45 | } 46 | } 47 | 48 | (result, ()) 49 | 50 | case _ => (node, ()) 51 | } 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/chorus/mechanisms/SparseVectorMechanism.scala: -------------------------------------------------------------------------------- 1 | package chorus.mechanisms 2 | 3 | 4 | import chorus.analysis.differential_privacy.GlobalSensitivityAnalysis 5 | import chorus.schema.Database 6 | import chorus.rewriting.RewriterConfig 7 | import chorus.rewriting.differential_privacy.ClippingRewriter 8 | import chorus.util.DB 9 | import chorus.sql.relational_algebra.{RelUtils, Relation} 10 | import chorus.dataflow.domain.UnitDomain 11 | import org.apache.calcite.rel.core.Aggregate 12 | import org.apache.calcite.sql.fun.SqlSumAggFunction 13 | import chorus.exception.UnsupportedQueryException 14 | 15 | import chorus.rewriting.rules.ColumnDefinition._ 16 | import 
chorus.rewriting.rules.Operations._ 17 | import chorus.rewriting.rules.ValueExpr 18 | import chorus.rewriting.rules.Expr._ 19 | 20 | 21 | /** Sparse vector mechanism: reports the index of the first query whose noisy answer reaches a noisy threshold. 22 | * 23 | * @param epsilon privacy budget charged by a run 24 | * @param queries candidate queries, tested in list order; each must return a single value 25 | * @param threshold the (un-noised) threshold to compare against 26 | * @param config supplies the database on which the queries are analyzed and executed 27 | */ 28 | class SparseVectorMechanism(epsilon: Double, queries: List[Relation], threshold: Double, 29 | config: RewriterConfig) extends ChorusMechanism[Option[Int]] { 30 | 31 | /** Largest per-column sensitivity reported by GlobalSensitivityAnalysis for `query`. */ 32 | def maxSensitivity(query: Relation, database: Database): Double = { 33 | val facts = new GlobalSensitivityAnalysis().run(query, database).colFacts 34 | facts.map(_.sensitivity.get).max 35 | } 36 | 37 | /** @return the index of the first query found at or above the noisy threshold (None if there is none, or if some 38 | * query has sensitivity greater than 1), together with a cost of `epsilon` 39 | * @throws IllegalArgumentException if a query does not return exactly one row with one column 40 | */ 41 | def run(): (Option[Int], EpsilonDPCost) = { 42 | val cost = EpsilonDPCost(epsilon) 43 | 44 | // Require queries to have a maximum sensitivity of 1 45 | /* `exists` also covers an empty query list, for which `max` would throw. */ 46 | val sensitivities = queries.map { (q: Relation) => maxSensitivity(q, config.database) } 47 | if (sensitivities.exists(_ > 1)) { 48 | (None, cost) 49 | } else { 50 | // Generate noisy threshold 51 | val T = threshold + BasicMechanisms.laplaceSample(2/epsilon) 52 | 53 | /* Find the first query exceeding the threshold. `indices` spans 0 until queries.length, so queries(i) is 54 | * always in bounds, and `find` stops at the first hit without a non-local `return`. */ 55 | val firstAbove = queries.indices.find { i => 56 | DB.execute(queries(i), config.database) match { 57 | case List(DB.Row(List(r))) => 58 | r.toDouble + BasicMechanisms.laplaceSample(4/epsilon) >= T 59 | case _ => 60 | throw new IllegalArgumentException(s"SparseVectorMechanism: query $i must return exactly one row with one column") 61 | } 62 | } 63 | 64 | (firstAbove, cost) 65 | } 66 | } 67 | } 68 | 69 | /** Runs [[SparseVectorMechanism]] with half of `epsilon`, then answers the selected query with 70 | * [[LaplaceMechClipping]] using the other half and the fixed clipping bounds 0 and 10. 71 | */ 72 | class SparseVectorMechanismValue(epsilon: Double, queries: List[Relation], 73 | threshold: Double, config: RewriterConfig) extends ChorusMechanism[List[DB.Row]] { 74 | 75 | /** @return the LaplaceMechClipping result for the selected query with the combined cost of both steps, or no 76 | * rows and only the selection cost when no query was selected 77 | */ 78 | def run() = { 79 | val (selected, c1) = new SparseVectorMechanism(epsilon/2, queries, threshold, config).run() 80 | 81 | selected match { 82 | case Some(idx) => 83 | val (result, c2) = new LaplaceMechClipping(epsilon/2, 0, 10, queries(idx), config).run() 84 | (result, c1 + c2) 85 | /* Nothing was selected: release nothing, but still report the budget the selection step consumed. */ 86 | case None => 87 | (List.empty[DB.Row], c1) 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/chorus/sql/dataflow_graph/reference/ColumnReference.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.sql.dataflow_graph.reference 24 | 25 | import com.facebook.presto.sql.tree.{Node => PrestoNode} 26 | import chorus.sql.dataflow_graph.relation.Relation 27 | 28 | /** ColumnReference: A column reference is a node that reads a specific column (referenced by zero-indexed ordinal) from 29 | * a relation. For example, if my_table has columns [foo, bar, baz] then for query "SELECT baz from my_table", the 30 | * dataflow graph includes a ColumnReference node with .of pointing to relation DataTable[my_table] and .colIndex = 2. 31 | * 32 | * If you want to know the name of the column, you can ask the relation, e.g., this.of.getColumnName(this.colIndex), but 33 | * be aware that a column reference is not uniquely defined by the column name, since relations in a dataflow graph can 34 | * have more than one column with the same name. 
35 | */ 36 | case class ColumnReference(colIndex: Int, of: Relation)(implicit override val prestoSource: Option[PrestoNode] = None) 37 | extends Reference(prestoSource) { 38 | 39 | /* The only child is the relation being read. */ 40 | override val children: List[Relation] = of :: Nil 41 | 42 | override val nodeStr: String = colIndex.toString 43 | 44 | /* Rendered as the relation's string form, a dot, and the column ordinal. */ 45 | override def toString: String = of.toString + "." + colIndex.toString 46 | } 47 | -------------------------------------------------------------------------------- /src/test/scala/com/uber/engsec/dp/rewriting/CoverageRewriterTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 
21 | */ 22 | 23 | package chorus.rewriting 24 | 25 | import chorus.rewriting.coverage.CoverageRewriter 26 | import chorus.schema.Schema 27 | import chorus.sql.QueryParser 28 | import junit.framework.TestCase 29 | /** Tests for CoverageRewriter: each case rewrites a query against the "test" database and compares the generated SQL with the expected text. */ 30 | class CoverageRewriterTest extends TestCase { 31 | val database = Schema.getDatabase("test") 32 | /** Parses `query`, runs CoverageRewriter on it and asserts that the resulting SQL equals `expected` after stripMargin, removal of one leading newline and removal of all carriage returns. */ 33 | def checkResult(query: String, expected: String): Unit = { 34 | val root = QueryParser.parseToRelTree(query, database) 35 | val config = new RewriterConfig(database) 36 | val result = new CoverageRewriter(config).run(root) 37 | TestCase.assertEquals(expected.stripMargin.stripPrefix("\n").replaceAll("\r", ""), result.toSql()) 38 | } 39 | /** A query without GROUP BY is expected to be rewritten to a single COUNT(*) named coverage. */ 40 | def testStatisticalQuery() = { 41 | val query = "SELECT COUNT(*), AVG(order_id) FROM orders" 42 | 43 | checkResult(query, """ 44 | |SELECT COUNT(*) coverage 45 | |FROM public.orders""" 46 | ) 47 | } 48 | /** A GROUP BY query is expected to be rewritten to the MEDIAN of the per-group COUNT(*) values. */ 49 | def testHistogramQuery() = { 50 | val query = "SELECT order_id, AVG(order_id) FROM orders WHERE order_id < 10 GROUP BY order_id" 51 | 52 | checkResult(query, """ 53 | |WITH _count AS ( 54 | | SELECT order_id, COUNT(*) coverage 55 | | FROM public.orders 56 | | WHERE order_id < 10 57 | | GROUP BY order_id 58 | |) 59 | |SELECT MEDIAN(coverage) coverage 60 | |FROM _count 61 | |LIMIT 1""" 62 | ) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/chorus/dataflow/AbstractDataflowAnalysis.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.dataflow 24 | 25 | import chorus.sql.AbstractAnalysis 26 | 27 | /** Common trait of dataflow analyses across different representations (AST, dataflow graph, and relational algebra) 28 | */ 29 | trait AbstractDataflowAnalysis[N <: AnyRef, T1] extends AbstractAnalysis[N, T1] { 30 | /** Invokes the transfer function of the analysis implementation and returns a new abstract state. Implemented by 31 | * analysis type subclasses. 32 | */ 33 | def transferNode(node: N, state: T1): T1 34 | 35 | /** Returns the children of the given node. Implemented by analysis type subclasses. 36 | */ 37 | def getNodeChildren(node: N): Iterable[N] 38 | 39 | /** Invokes the join function of the analysis implementation and returns a new abstract state. Implemented by 40 | * analysis type subclasses. 
41 | */ 42 | def joinNode(node: N, children: Iterable[N]): T1 43 | 44 | /** Recursive procedure to visit nodes in the tree and invoke analysis transfer/join methods. 45 | * 46 | * Children are processed before their parent, and a node already present in resultMap is not processed again. 47 | */ 48 | def process(node: N): Unit = { 49 | if (!resultMap.contains(node)) { 50 | val childNodes = getNodeChildren(node) 51 | childNodes.foreach(child => process(child)) 52 | 53 | currentNode = Some(node) 54 | 55 | val joined = joinNode(node, childNodes) 56 | val transferred = transferNode(node, joined) 57 | resultMap += (node -> transferred) 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/chorus/util/IdentityHashMap.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 
21 | */ 22 | 23 | package chorus.util 24 | 25 | import scala.collection.generic.CanBuildFrom 26 | import scala.collection.mutable 27 | 28 | /** Identity hash map: compares objects by object ID, not value. Keys are matched with `eq` and hashed with System.identityHashCode. 29 | */ 30 | final class IdentityHashMap[A <: AnyRef, B]() extends mutable.HashMap[A, B] with mutable.MapLike[A, B, IdentityHashMap[A, B]] { 31 | override protected def elemEquals(key1: A, key2: A): Boolean = key1 eq key2 32 | override protected def elemHashCode(key: A) = System.identityHashCode(key) 33 | override def empty: IdentityHashMap[A, B] = IdentityHashMap.empty 34 | } 35 | /** Companion providing the factory methods and the CanBuildFrom instance for IdentityHashMap. */ 36 | object IdentityHashMap { 37 | type Coll = IdentityHashMap[_, _] 38 | /** Implicit builder factory; the `from` variant additionally passes the source collection's size to the builder as a size hint. */ 39 | implicit def canBuildFrom[A <: AnyRef, B] = new CanBuildFrom[Coll, (A, B), IdentityHashMap[A, B]] { 40 | def apply() = newBuilder[A, B] 41 | def apply(from: Coll) = { 42 | val builder = newBuilder[A, B] 43 | builder.sizeHint(from.size) 44 | builder 45 | } 46 | } 47 | /** A new, empty map. */ 48 | def empty[A <: AnyRef, B]: IdentityHashMap[A, B] = new IdentityHashMap[A, B] 49 | /** Builder that starts from an empty IdentityHashMap and adds each pair with `elems += x`. NOTE(review): presumably overridden so pairs are added to the mutable map in place -- confirm against the MapBuilder implementation of the Scala version in use. */ 50 | def newBuilder[A <: AnyRef, B] = new mutable.MapBuilder[A, B, IdentityHashMap[A, B]](empty[A, B]) { 51 | override def +=(x: (A, B)): this.type = { 52 | elems += x 53 | this 54 | } 55 | override def sizeHint(size: Int): Unit = elems.sizeHint(size) 56 | } 57 | /** Creates a map containing the given key/value pairs. */ 58 | def apply[A <: AnyRef, B](elems: (A, B)*) = (newBuilder[A, B] ++= elems).result() 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/chorus/analysis/differential_privacy/StabilityDomain.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.analysis.differential_privacy 24 | 25 | import chorus.dataflow.domain.AbstractDomain 26 | 27 | /** Abstract domain for relations in elastic sensitivity analysis. 28 | * 29 | * @param stability Stability of the relation as defined by elastic sensitivity. 30 | * @param isPublic Does this relation contain only publicly-derived data (as determined by the isPublic table flag)? 31 | * When public tables are joined with a protected table the entire relation becomes non-public. 32 | * @param ancestors Set of this node's ancestor tables, used to detect self-joins. 
33 | */ 34 | case class RelStability(stability: Double, 35 | isPublic: Boolean, 36 | ancestors: Set[String]) { 37 | override def toString: String = 38 | "stability: " + stability + ", isPublic: " + isPublic + ", ancestors: " + ancestors 39 | } 40 | 41 | /** The abstract domain is a product lattice with pointwise ordering of the element types defined above. 42 | * 43 | * The join takes the larger stability, the conjunction of the isPublic flags and the union of the ancestor sets. 44 | * Note that, because bottom has isPublic = false, joining any element with bottom yields isPublic = false. 45 | */ 46 | object StabilityDomain extends AbstractDomain[RelStability] { 47 | override val bottom: RelStability = RelStability(stability = 1.0, isPublic = false, ancestors = Set.empty) 48 | 49 | override def leastUpperBound(first: RelStability, second: RelStability): RelStability = { 50 | val stability = math.max(first.stability, second.stability) 51 | val isPublic = first.isPublic && second.isPublic 52 | val ancestors = first.ancestors ++ second.ancestors 53 | RelStability(stability, isPublic, ancestors) 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/chorus/analysis/join/JoinKeysUsed.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
/** Analysis that returns the set of all columns used as equi-join keys in a given query.
  *
  * The graph is walked once via `node.foreach`. Names are staged through the intermediate sets of
  * [[JoinKeyDomain]] (tables -> refs -> eqKeys -> joinKeys), each stage being flushed into the next one
  * when the corresponding node kind is encountered, so the result depends on the traversal order.
  */
class JoinKeysUsed extends DFGVisitorAnalysis[JoinKeyDomain] {

  override def run(node: Node): JoinKeyDomain = {
    val result = new JoinKeyDomain()

    node.foreach {
      case table: DataTable =>
        result.tables += table

      case column: ColumnReference =>
        // Qualify the referenced column with every pending table, then reset the pending tables.
        for (table <- result.tables)
          result.refs += (table.name + "." + table.getColumnName(column.colIndex))
        result.tables.clear()

      case function: Function =>
        // Only EQUAL promotes the pending references; any other function leaves the state untouched.
        if (function.functionName == "EQUAL") {
          result.eqKeys ++= result.refs
          result.refs.clear()
        }

      case _: Join =>
        // A join promotes the pending equality operands to join keys.
        result.joinKeys ++= result.eqKeys
        result.eqKeys.clear()

      case _ => ()
    }

    result
  }
}

/** Mutable state for the join keys used analysis.
  *
  * `joinKeys` holds the final answer; `refs`, `eqKeys` and `tables` are working sets filled and cleared by
  * [[JoinKeysUsed]] during the traversal.
  */
class JoinKeyDomain {
  val joinKeys: mutable.HashSet[String] = mutable.HashSet.empty[String]
  val refs: mutable.HashSet[String] = mutable.HashSet.empty[String]
  val eqKeys: mutable.HashSet[String] = mutable.HashSet.empty[String]
  val tables: mutable.HashSet[DataTable] = mutable.HashSet.empty[DataTable]
}
/** Tests for the restricted sensitivity rewriter, run against the "test" database schema. */
class RestrictedSensitivityRewriterTest extends TestCase {
  val database = Schema.getDatabase("test")

  /** Rewrites `query` with [[RestrictedSensitivityRewriter]] and asserts that the generated SQL equals `expected`.
    *
    * @param query           SQL query to rewrite.
    * @param epsilon         Privacy budget passed to the rewriter configuration.
    * @param expected        Expected SQL, written as a margin-delimited (`|`) multi-line string that may start
    *                        with a line break.
    * @param fillMissingBins Forwarded to [[RestrictedSensitivityConfig]].
    */
  def checkResult(query: String, epsilon: Double, expected: String, fillMissingBins: Boolean = false): Unit = {
    val root = QueryParser.parseToRelTree(query, database)
    val config = new RestrictedSensitivityConfig(epsilon, database, fillMissingBins)
    val result = new RestrictedSensitivityRewriter(config).run(root)

    // Remove carriage returns BEFORE stripping the leading line break: when the expected literal carries
    // CRLF line breaks it starts with "\r\n", and stripPrefix("\n") would otherwise not match, leaving a
    // stray leading newline in the expected text.
    val expectedSql = expected.replace("\r", "").stripMargin.stripPrefix("\n")
    TestCase.assertEquals(expectedSql, result.toSql())
  }

  def testSimpleHistogram(): Unit = {
    val query = "SELECT order_date, COUNT(*) FROM orders GROUP BY 1"

    // Sensitivity of this query is 2.0 so scale of Laplace noise for epsilon 0.1 is (2/0.1) = 20
    checkResult(query, 0.1, """
      |SELECT order_date, COUNT(*) + 20.0 * (CASE WHEN RAND() - 0.5 < 0 THEN -1.0 ELSE 1.0 END * LN(1 - 2 * ABS(RAND() - 0.5)))
      |FROM public.orders
      |GROUP BY order_date"""
    )
  }

  /** Restricted Sensitivity rewriter uses the same code as Elastic Sensitivity rewriter, the only difference being
    * the call to the sensitivity calculation analysis. Hence, see [ElasticSensitivityRewriterTest] and
    * [RestrictedSensitivityAnalysis] for additional test cases relevant to this mechanism.
    */
}
/** Tests for the columns-used analysis, run against the "test" database schema. */
class ColumnsUsedAnalysisTest extends TestCase {
  val database = Schema.getDatabase("test")

  /** Runs the analysis on `queryStr` and compares the per-output-column result with `expected`. */
  def checkResult(queryStr: String, expected: List[Set[String]]): Unit = {
    val graph = QueryParser.parseToDataflowGraph(queryStr, database)
    val actual = new ColumnsUsedAnalysis().run(graph, database).toList
    TestCase.assertEquals(expected, actual)
  }

  def testSelectAll(): Unit = {
    // One singleton set per output column, in column order.
    val expected = List(
      Set("orders.order_id"),
      Set("orders.order_date"),
      Set("orders.customer_id"),
      Set("orders.product_id"),
      Set("orders.quantity"),
      Set("orders.order_cost"))
    checkResult("SELECT * FROM orders", expected)
  }

  def testCountAll(): Unit = {
    // A single output column that depends on every column of the table.
    val allColumns = Set(
      "orders.order_cost",
      "orders.order_id",
      "orders.product_id",
      "orders.order_date",
      "orders.customer_id",
      "orders.quantity")
    checkResult("SELECT count(*) FROM orders", List(allColumns))
  }

  def testWithoutWhere(): Unit =
    checkResult("SELECT order_id FROM orders", List(Set("orders.order_id")))

  def testWithWhere(): Unit =
    checkResult("SELECT order_id FROM orders WHERE product_id = 1", List(Set("orders.order_id")))

  def testJoin(): Unit = {
    val query = "SELECT order_date FROM orders JOIN products ON orders.product_id = products.product_id"
    checkResult(query, List(Set("orders.order_date", "orders.product_id", "products.product_id")))
  }
}
/** Note: the following columns are marked tainted in schema config:
  *   customers.name
  *   customers.address
  *
  * All other columns are untainted.
  */
class TaintAnalysisTest extends TestCase {
  val database = Schema.getDatabase("test")

  /** Parses `query`, runs the taint analysis and returns one flag per output column. */
  private def getResults(query: String) = {
    val tree = QueryParser.parseToRelTree(query, database)
    val analysis = new TaintAnalysis()
    analysis.run(tree, database).colFacts.toList
  }

  def testSimple(): Unit = {
    val actual = getResults("SELECT customer_id, name, address FROM customers")
    TestCase.assertEquals(List(false, true, true), actual)
  }

  def testAggregation(): Unit = {
    val query = """
      SELECT customers.name as name, count(*) as "count"
      FROM orders JOIN customers ON orders.customer_id = customers.customer_id
      GROUP BY 1
    """

    // "customers.name" is tainted, so output column 'name' should be tainted
    val actual = getResults(query)
    TestCase.assertEquals(List(true, true), actual)
  }

  def testWith(): Unit = {
    val query = """
      WITH t1 as (SELECT * FROM products),
           t2 as (SELECT * FROM customers)
      SELECT t1.name as product_name, t2.name as customer_name from t1, t2
    """

    val actual = getResults(query)
    TestCase.assertEquals(List(false, true), actual)
  }

}
/** A dataflow graph is a custom representation of a SQL query where all data dependencies are explicitly expressed
  * via graph edges. It can be used as the basis for both column-based and relation-based dataflow analyses.
  *
  * This class is the parent type of all dataflow graph nodes.
  */
abstract class Node(val prestoSource: Option[PrestoNode]) extends Traversable[Node] {

  val nodeStr: String
  val children: List[Node]

  /** Implementing the foreach method from Traversable gives access to many useful higher-order functions on
    * dataflow graphs including fold*, reduce*, exists, collect, etc.
    *
    * Since dataflow graphs may contain cycles, the traversal carries a set of already-visited nodes. Each node is
    * passed to `f` once, before its children are descended into.
    */
  override def foreach[U](f: Node => U): Unit = _foreach(f, mutable.HashSet.empty[Node])

  private def _foreach[U](f: Node => U, visited: mutable.HashSet[Node]): Unit =
    // `add` returns false when the node was already in the set, i.e. it has been visited before.
    if (visited.add(this)) {
      f(this)
      children.foreach(child => child._foreach(f, visited))
    }

  /** Optimized version of some traversable methods: a node always yields at least itself, and itself first.
    */
  override def isEmpty: Boolean = false
  override def head: Node = this
  // tail is inherited from TraversableLike (and implemented using foreach)

  // Hash code of the superclass, evaluated once at construction time and stored.
  override val hashCode: Int = super.hashCode

  // Equality is reference equality (by default, Scala uses structural equality for case classes).
  override def equals(that: Any): Boolean = that match {
    case other: AnyRef => other eq this
    case _ => false
  }
}
/** Classification of queries: histogram, non-histogram statistical, and raw data. */
object QueryType extends Enumeration {
  type QueryType = Value
  val HISTOGRAM, NON_HISTOGRAM_STATISTICAL, RAW_DATA = Value

  /** Inspects results of HistogramAnalysis to categorize the query as statistical (histogram or non-histogram)
    * or raw data.
    *
    * @param results A set of column facts representing the results of a histogram analysis
    * @return The type of the query: histogram, non-histogram statistical, or raw data
    */
  def getQueryType(results: HistogramAnalysis#ResultType): QueryType = {
    val facts = results.colFacts

    val groupedColumns = facts.count(_.isGroupBy)
    val nonGroupedAggregations = facts.count(info => info.isAggregation && !info.isGroupBy)
    val rawColumns = facts.count(info => !info.isAggregation && !info.isGroupBy)

    // A histogram is a query with one or more (non-grouped) aggregations, and all remaining columns grouped.
    val isHistogram = groupedColumns > 0 && nonGroupedAggregations > 0 && rawColumns == 0
    // A statistical query has every column aggregated (this also holds for a result with no columns at all).
    val isStatistical = nonGroupedAggregations == facts.size

    if (isHistogram) HISTOGRAM
    else if (isStatistical) NON_HISTOGRAM_STATISTICAL
    else RAW_DATA // Everything else is "raw data"
  }

  /** Categorize the query using an already parsed tree. */
  def getQueryType(root: RelNode, database: Database): QueryType =
    getQueryType(new HistogramAnalysis().run(root, database))
}
/** Generic parent class for all relations in dataflow graphs, which includes: the result of the entire query, named
  * tables in the database, and subqueries.
  *
  * @param columnNames  Ordered list of columns in this relation. We use IndexedSeq rather than List to ensure fast
  *                     lookups by index, an operation performed frequently by analyses.
  * @param prestoSource Forwarded to [[Node]]; defaults to None.
  */
abstract class Relation(
    val columnNames: IndexedSeq[String],
    override val prestoSource: Option[PrestoNode] = None)
  extends Node(prestoSource) {

  /** Canonical, case-insensitive form of a column name used as the key of the lookup map.
    *
    * Uses Locale.ROOT so the result does not depend on the JVM's default locale: with the no-argument
    * `toUpperCase`, a Turkish default locale upper-cases "id" to "İD", which would not match "ID".
    */
  private def lookupKey(name: String): String = name.toUpperCase(java.util.Locale.ROOT)

  /** Optimization: because we frequently perform lookups by column name, we maintain a map to do the lookup
    * without having to loop over the list.
    *
    * Note that in Vertica, column references by name are case-insensitive, i.e., the following queries are valid:
    *   WITH t1 as (SELECT a as BLAH) select blah from t1
    *   WITH t1 as (SELECT a as BLAH) select Blah from t1
    * so although we preserve case in the schema because it determines output column names, we perform name-to-index
    * lookups without considering case. Several columns may share a name, hence the list of indexes per key.
    */
  private val colIndexMap: Map[String, List[Int]] =
    columnNames.zipWithIndex
      .groupBy { case (name, _) => lookupKey(name) }
      .map { case (key, columns) => key -> columns.map(_._2).toList }

  /** Returns the number of columns in this relation.
    */
  final def numCols: Int = columnNames.size

  /** Returns the index(es) of the column(s) with the given name, or Nil if this relation does not contain the
    * given column. The comparison ignores case.
    */
  def getColumnIndexes(colName: String): List[Int] = colIndexMap.getOrElse(lookupKey(colName), Nil)

  /** Returns the name of the column at the given ordinal.
    */
  def getColumnName(index: Int): String = columnNames(index)
}
/** Common trait for analyses on relational algebra trees.
  */
trait RelTreeFunctions extends TreeFunctions[RelOrExpr] {
  this: AbstractAnalysis[RelOrExpr, _] =>

  override def getNodeChildren(node: RelOrExpr): Iterable[RelOrExpr] =
    RelTreeFunctions.getChildren(node)

  override def parseQueryToTree(query: String, database: Database): RelOrExpr =
    QueryParser.parseToRelTree(query, database)

  override def printTree(node: RelOrExpr): Unit =
    TreePrinter.printRelTree(node, resultMap, currentNode)
}

object RelTreeFunctions {
  /** Returns the direct children of a relational-algebra node: input relations first, followed by any
    * expressions attached to the node. Unsupported node classes raise a RuntimeException.
    *
    * NOTE(review): the Aggregate, RexCall and RexFieldAccess cases return unwrapped values; presumably implicit
    * conversions to RelOrExpr defined elsewhere apply here. Confirm before changing them.
    */
  def getChildren(node: RelOrExpr): Iterable[RelOrExpr] = node match {
    // Relations
    case Relation(project: Project)     => Relation(project.getInput) :: project.getProjects.asScala.map{Expression}.toList
    case Relation(aggregate: Aggregate) => List(aggregate.getInput)
    case Relation(_: TableScan)         => Nil
    case Relation(join: Join)           => join.getInputs.asScala.map{Relation} ++ List(Expression(join.getCondition))
    case Relation(correlate: Correlate) => correlate.getInputs.asScala.map{Relation}
    case Relation(filter: Filter)       => List(Relation(filter.getInput), Expression(filter.getCondition))
    case Relation(sort: Sort) =>
      // fetch and offset are optional; entries wrapping null are dropped.
      List(Relation(sort.getInput), Expression(sort.fetch), Expression(sort.offset)).filter(_.unwrap != null)
    case Relation(_: Values)            => Nil
    case Relation(setOp: SetOp)         => setOp.getInputs.asScala.map{Relation}

    // Expressions
    case Expression(call: RexCall)            => call.operands.asScala
    case Expression(_: RexInputRef)           => Nil
    case Expression(_: RexLiteral)            => Nil
    case Expression(access: RexFieldAccess)   => List(access.getReferenceExpr)
    case Expression(_: RexCorrelVariable)     => Nil

    // Anything not handled above
    case Expression(other) => throw new RuntimeException("Unimplemented: " + other.getClass.getSimpleName)
    case Relation(other)   => throw new RuntimeException("Unimplemented: " + other.getClass.getSimpleName)
  }
}
/** Stores information about references into relations as needed by TreeTransformAnalysis.
  *
  * @param first  The (first) relation the reference may point into.
  * @param second A second candidate relation, when the reference is ambiguous.
  */
class RefOption(val first: Node, val second: Option[Node] = None) {
  def isUnique: Boolean = !hasTwoRelations
  def hasTwoRelations: Boolean = second.isDefined

  /** Returns the single referenced relation; throws TransformationException if there are two candidates. */
  def getOnly: Node =
    if (hasTwoRelations)
      throw new TransformationException("getOnly called on reference with multiple possible relations.")
    else
      first
}

/** @param relation1     First candidate relation of the reference.
  * @param relation2     Optional second candidate relation.
  * @param innerRelation The node representing the "inner relation" being referenced, or None if the reference does
  *                      not specify an inner relation. See comments in NameResolutionDomain for details about inner
  *                      relation references.
  */
case class ReferenceInfo(relation1: Node,
                         relation2: Option[Node] = None,
                         var innerRelation: Option[Node] = None) {

  /** The presto node representing the relation being referenced into. This is usually a single relation, but it may
    * include two relations if the reference might point to either of them and must be resolved using schema
    * information. For example, in query
    *
    *   SELECT blah from a JOIN b ON col1 = col2
    *
    * both col1 and col2 may refer to either a or b, and we can only determine which one it is by consulting the
    * schema (which is only available during tree transformation).
    */
  val ref = new RefOption(relation1, relation2)

  override def toString: String = {
    // Queries are abbreviated to the literal "Query"; every other node uses its own toString.
    def describe(node: Node): String = node match {
      case _: Query => "Query"
      case other    => other.toString
    }

    val refPart = ref.second match {
      case Some(second) => "Refs[1:" + describe(ref.first) + ", 2:" + describe(second) + "]"
      case None         => "Ref[" + describe(ref.first) + "]"
    }
    val innerPart = innerRelation.fold("")(inner => " InnerRelation[" + describe(inner) + "]")

    refPart + innerPart
  }
}
/** Tracks dataflow facts (abstract domains) individually for each column, automatically propagating
  * facts up the tree by figuring out which columns in a relation/reference correspond to which columns of its
  * subrelations. In other words, this analysis tracks data provenance automatically so subclasses need only define
  * methods for updating these facts at appropriate nodes.
  *
  * @tparam N The tree node type
  * @tparam E The result fact type
  * @tparam D The abstract domain for the analysis (i.e., lattice with element type E)
  */
abstract class AbstractColumnAnalysis[N <: AnyRef, E, D <: AbstractDomain[E]]
  extends AbstractDataflowAnalysis[N, ColumnFacts[E]] {

  /** Collects the column facts already recorded in `resultMap` for all `children`, joins them into one fact using
    * `domain`, and returns that fact as a single-column result. The `node` argument is not used here.
    */
  def flattenJoinChildren(domain: AbstractDomain[E], node: N, children: Iterable[N]): ColumnFacts[E] = {
    val allChildFacts = children.flatMap(child => resultMap(child))
    val joined = AbstractColumnAnalysis.joinFacts(domain, allChildFacts)
    IndexedSeq(joined)
  }

  /** Implemented by analysis subclasses.
    */
  override def transferNode(node: N, state: ColumnFacts[E]): ColumnFacts[E]
  override def joinNode(node: N, children: Iterable[N]): ColumnFacts[E]
}

object AbstractColumnAnalysis {
  import scala.language.implicitConversions

  /** Facts for the columns of a node, indexed by column position. */
  type ColumnFacts[+J] = IndexedSeq[J]

  // Conveniences that let analyses return a list, varargs or a single element where ColumnFacts is expected.
  implicit def elemListToColumnFacts[J](elems: List[J]): ColumnFacts[J] = elems.toIndexedSeq
  implicit def elemsToColumnFacts[J](elems: J*): ColumnFacts[J] = elems.toIndexedSeq
  implicit def elemToColumnFacts[J](elem: J): ColumnFacts[J] = IndexedSeq(elem)

  /** Joins `facts` with the domain's least upper bound.
    *
    * Returns `domain.bottom` when there are no facts and the sole fact unchanged when there is exactly one;
    * otherwise the facts are combined left to right.
    */
  def joinFacts[E](domain: AbstractDomain[E], facts: Iterable[E]): E =
    facts
      .reduceOption( (first, second) => domain.leastUpperBound(first, second) )
      .getOrElse(domain.bottom)
}
/** A simple differential privacy example using elastic sensitivity.
  *
  * This example supports queries that return a single column and a single row. It can be extended to
  * queries returning multiple columns and rows by generating an independent noise sample for each cell,
  * based on the appropriate column sensitivity.
  *
  * Caveats:
  *
  * Histogram queries (using SQL's GROUP BY) must be handled carefully so as not to leak information in
  * the bin labels. The analysis throws an error to warn about this, but this behavior can be overridden
  * if you know what you're doing.
  *
  * This example does not implement a privacy budget management strategy: each query is executed using
  * the full budget value of EPSILON. Correct use of differential privacy requires allocating a fixed
  * privacy budget from which a portion is depleted to run each query. A privacy budget strategy depends
  * on the problem domain and threat model and is therefore beyond the scope of this tool.
  */
object ElasticSensitivityExample extends App {
  // Use the table schemas and metadata defined by the test classes
  System.setProperty("schema.config.path", "src/test/resources/schema.yaml")
  val database = Schema.getDatabase("test")

  // example query: How many US customers ordered product #1?
  // NOTE(review): the literal's internal indentation was reconstructed; confirm against the original file.
  val query = """
    SELECT COUNT(*) FROM orders
    JOIN customers ON orders.customer_id = customers.customer_id
    WHERE orders.product_id = 1 AND customers.address LIKE '%United States%'
  """

  // query result when executed on the database
  val QUERY_RESULT = 100000

  // privacy budget
  val EPSILON = 0.1
  // delta parameter: use 1/n^2, with n = 100000
  val DELTA = 1 / math.pow(100000, 2)

  println(s"Query: $query")
  println(s"Private result: $QUERY_RESULT\n")

  // Each run draws fresh noise, so the ten printed values differ.
  for (run <- 1 to 10) {
    val noisyResult = ElasticSensitivity.addNoise(query, database, QUERY_RESULT, EPSILON, DELTA)
    println(s"Noisy result (run $run): %.0f".format(noisyResult))
  }
}
/** Wrapper for the union type RelNode | RexNode, the root node type for relational algebra trees. */
sealed abstract class RelOrExpr extends Traversable[RelOrExpr] {

  /** Visits this node first and then, recursively, each child returned by
    * `RelTreeFunctions.getChildren`. Implementing `foreach` is what gives relational algebra trees the
    * higher-order functions of Traversable (fold*, reduce*, exists, collect, etc.).
    */
  override def foreach[U](f: RelOrExpr => U): Unit = {
    f(this)
    for (child <- RelTreeFunctions.getChildren(this)) {
      child.foreach(f)
    }
  }

  // Shortcuts for Traversable methods: a tree always contains at least its root.
  // tail is inherited from TraversableLike.
  override def head: RelOrExpr = this
  override def isEmpty: Boolean = false

  /** Returns the underlying node element. */
  def unwrap: AnyRef
}

/** A wrapped RelNode. Equality and hashing are by reference identity of the wrapped node. */
case class Relation(node: RelNode) extends RelOrExpr {
  override def unwrap: RelNode = node
  override def toString: String = node.toString

  override def equals(other: Any): Boolean = other match {
    case that: Relation => that.node eq node
    case _ => false
  }
  override def hashCode: Int = System.identityHashCode(node)
}

/** A wrapped RexNode. Equality and hashing are by reference identity of the wrapped node. */
case class Expression(node: RexNode) extends RelOrExpr {
  override def unwrap: RexNode = node
  override def toString: String = node.toString

  override def equals(other: Any): Boolean = other match {
    case that: Expression => that.node eq node
    case _ => false
  }
  override def hashCode: Int = System.identityHashCode(node)
}

/** Conversions to and from RelOrExpr */
object RelOrExpr {
  import scala.language.implicitConversions

  // Single nodes -> wrapper
  implicit def rel2Sum(node: RelNode): RelOrExpr = Relation(node)
  implicit def rex2Sum(node: RexNode): RelOrExpr = Expression(node)

  // Collections of nodes -> collections of wrappers
  implicit def relIterable2Sum(nodes: Iterable[RelNode]): Iterable[RelOrExpr] = nodes.map(Relation(_))
  implicit def rexIterable2Sum(nodes: Iterable[RexNode]): Iterable[RelOrExpr] = nodes.map(Expression(_))

  // Wrapper -> underlying node
  implicit def sum2Rel(rel: Relation): RelNode = rel.node
  implicit def sum2Rex(rex: Expression): RexNode = rex.node
}
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.sql 24 | 25 | import com.facebook.presto.sql.parser.{SqlParser => PrestoSqlParser} 26 | import com.facebook.presto.sql.tree.{Query, Statement} 27 | import chorus.exception.ParsingException 28 | import chorus.schema.Database 29 | import chorus.sql.ast.{Transformer => ASTTransformer} 30 | import chorus.sql.dataflow_graph.Node 31 | import chorus.sql.relational_algebra.{Relation, Transformer => RelTransformer} 32 | 33 | /** Utility class for parsing SQL queries into different representations. 
/** Utility class for parsing SQL queries into different representations. */
object QueryParser {
  private val prestoParser: PrestoSqlParser = new PrestoSqlParser

  /** Prints the query and the target representation, but only when `AbstractAnalysis.DEBUG` is set.
    *
    * @param query    The SQL text about to be parsed
    * @param treeType Human-readable name of the representation being produced
    */
  def printQuery(query: String, treeType: String): Unit = {
    if (AbstractAnalysis.DEBUG) {
      println(s">>>>>>>>>>>>>>>>>>>>>>>>>>>> Parsing query to ${treeType}:")
      println(query)
      println("<<<<<<<<<<<<<<<<<<<<<<<<<<<")
    }
  }

  /** Parse a SQL query into an AST (represented by a Presto tree)
    *
    * @param query The SQL query to be parsed
    * @return The AST root node representing the query
    * @throws ParsingException if Presto fails to parse the text, or if the text parses to a statement
    *                          that is not a `Query`
    */
  def parseToPrestoTree(query: String): Query = {
    printQuery(query, "presto tree")

    val statement: Statement =
      try {
        prestoParser.createStatement(query)
      } catch {
        case e: Exception =>
          // Wrap every parser failure in our own exception type, keeping the original as the cause so
          // the parser's stack trace is not lost.
          val wrapped = new ParsingException(e.getMessage)
          wrapped.initCause(e)
          throw wrapped
      }

    // Checked outside the try block so this ParsingException is not caught and re-wrapped above.
    statement match {
      case q: Query => q
      case other =>
        val kind = Option(other).map(_.getClass.getSimpleName).getOrElse("null")
        throw new ParsingException(s"Expected a query but the statement parsed as: $kind")
    }
  }

  /** Parse a SQL query and transform it into a dataflow graph
    *
    * @param query    The SQL query to be parsed
    * @param database The database passed to the AST transformer
    * @return The dataflow graph root node
    */
  def parseToDataflowGraph(query: String, database: Database): Node = {
    printQuery(query, "dataflow graph")

    val prestoRoot: Statement = parseToPrestoTree(query)
    val transform = new ASTTransformer(database)
    transform.convertToDataflowGraph(prestoRoot)
  }

  /** Parse a SQL query and transform it into a relational algebra representation.
    *
    * @param query    The SQL query to be parsed
    * @param database The database used to create the relational algebra transformer
    * @return The relational algebra tree root node
    */
  def parseToRelTree(query: String, database: Database): Relation = {
    printQuery(query, "relational algebra tree")

    val transformer = RelTransformer.create(database)
    val root = transformer.convertToRelTree(query)
    Relation(root)
  }
}
/** A class to model differences between SQL database dialects and vendors that are material to query
  * analysis. This will be updated over time.
  */
object DatabaseModel {
  /** Returns the column name assigned implicitly by the database for the given expression, i.e.,
    * the name of the output column in the absence of an explicit alias. Note this is highly
    * database-specific. Logic below is for Vertica.
    *
    * Several cases map to the same value as the default; they are listed explicitly to record that
    * the expression type was considered.
    */
  def getImplicitColumnName(expr: Expression) = expr match {
    case q: QualifiedNameReference => q.getName.toString
    case d: DereferenceExpression => d.getFieldName
    case f: FunctionCall => f.getName.toString
    case s: StringLiteral => s.getValue
    case _ : ArithmeticBinaryExpression => "?column?"
    case _ : AtTimeZone => "timezone"
    case _ : SearchedCaseExpression => "case"
    case _ : Extract => "date_part"
    case _ : CurrentTime => "?column?"
    case _ : InPredicate => "?column?"
    case _ : CoalesceExpression => "coalesce"
    case _ : LongLiteral => "?column?"
    case _ : ComparisonExpression => "?column?"
    case _ => "?column?" // unknown/default
  }

  /** Is the given name a built-in function? If so, all QualifiedNameReference nodes with this value
    * will be interpreted as functions rather than column references.
    */
  def isBuiltInFunction(name: String): Boolean = {
    name == "sysdate"
  }

  /** Returns true if the given function's ordinal argument (0-indexed) is known to be a literal value,
    * in which case it should be interpreted as a literal value even if parsed as a QualifiedName
    * reference because it may not be quoted/escaped in the original query.
    */
  def isFunctionArgumentLiteral(functionName: String, argNum: Int): Boolean = {
    // TODO: extend this.
    (argNum == 0) && (functionName == "datediff" || functionName == "timestampadd")
  }

  /** Normalizes the table name to a canonical representation, e.g., by stripping out namespace and/or
    * optional prefixes. This canonical table name should match the name provided in the schema config
    * to ensure that schema information can be retrieved for the table.
    *
    * Only a literal leading "public." is removed. The prefix is matched as plain text rather than as a
    * regular expression, so names that merely start with "public" followed by some other character
    * (e.g. "publications") are returned unchanged.
    */
  def normalizeTableName(tableName: String): String = {
    tableName.stripPrefix("public.") // strip any "public." prefix
  }
}
/** A lattice value: either a concrete element of type E (see `Mid`) or one of the type-less
  * `Top` / `Bottom` elements.
  */
abstract class DomainElem[+E] {
  def isTop: Boolean
  def isBottom: Boolean

  /** Returns the element value, or throws java.util.NoSuchElementException if the lattice element is
    * Top or Bottom.
    */
  def get: E

  /** Returns true if the lattice value contains the given element. Always returns false if the
    * lattice value is Top or Bottom.
    */
  def contains[F >: E](elem: F): Boolean

  /** Retrieves the lattice value as an option, with Bottom returning None and an element of type E
    * returning Some(e). Calling this on Top throws java.util.NoSuchElementException, so it should only
    * be used on semi-bounded lattices that are guaranteed never to hold Top (e.g., SetLattice).
    */
  def asOption: Option[E]
}

/** The greatest lattice element; carries no value. */
case object Top extends DomainElem[Nothing] {
  override def isTop: Boolean = true
  override def isBottom: Boolean = false
  override def get = throw new java.util.NoSuchElementException("Top.get")
  override def contains[F >: Nothing](elem: F): Boolean = false
  override def asOption: Option[Nothing] = throw new java.util.NoSuchElementException("Top.asOption")
}

/** The least lattice element; carries no value. */
case object Bottom extends DomainElem[Nothing] {
  override def isTop: Boolean = false
  override def isBottom: Boolean = true
  override def get = throw new java.util.NoSuchElementException("Bottom.get")
  override def contains[F >: Nothing](elem: F): Boolean = false
  override def asOption: Option[Nothing] = None
}

/** A concrete lattice value. External code shouldn't need to interact directly with this class; the
  * implicit definitions in the `DomainElem` companion convert between this wrapper and the underlying
  * element type.
  */
case class Mid[E](value: E) extends DomainElem[E] {
  override def isTop: Boolean = false
  override def isBottom: Boolean = false
  override def get: E = value
  override def contains[F >: E](elem: F): Boolean = elem.equals(value)
  override def asOption: Option[E] = Some(value)
  override def toString: String = value.toString
}

object DomainElem {
  import scala.language.implicitConversions

  /** Wraps a plain value as a lattice element. */
  implicit def val2DomainElem[E](value: E): DomainElem[E] = Mid(value)

  /** Unwraps a `Mid` back to its underlying value. */
  implicit def elem2Val[E](value: Mid[E]): E = value.value

  /** Convert from Option[E] to lattice element, with Option.None mapped to Bottom. */
  implicit def option2DomainElem[E](value: Option[E]): DomainElem[E] = value match {
    case Some(elem) => Mid(elem)
    case None => Bottom
  }
}
22 | def printQuery(query: String) = println(s"\n " + query.replaceAll("\\n", s"\n ") + "\n") 23 | 24 | def elasticSensitivityExample() = { 25 | println("*** Elastic sensitivity example ***") 26 | 27 | // Example query: How many US customers ordered product #1? 28 | val query = """ 29 | |SELECT COUNT(*) FROM orders 30 | |JOIN customers ON orders.customer_id = customers.customer_id 31 | |WHERE orders.product_id = 1 AND customers.address LIKE '%United States%'""" 32 | .stripMargin.stripPrefix("\n") 33 | 34 | // Print the example query and privacy budget 35 | val root = QueryParser.parseToRelTree(query, database) 36 | println("Original query:") 37 | printQuery(query) 38 | println(s"> Epsilon: $EPSILON") 39 | 40 | // Compute mechanism parameter values from the query. Note the rewriter does this automatically; here we calculate 41 | // the values manually so we can print them. 42 | val elasticSensitivity = ElasticSensitivity.smoothElasticSensitivity(root, database, 0, EPSILON, DELTA) 43 | println(s"> Elastic sensitivity of this query: $elasticSensitivity") 44 | println(s"> Required scale of Laplace noise: 2 * $elasticSensitivity / $EPSILON = ${2 * elasticSensitivity/EPSILON}") 45 | 46 | // Rewrite the original query to enforce differential privacy using Elastic Sensitivity. 47 | println("\nRewritten query:") 48 | val config = new ElasticSensitivityConfig(EPSILON, DELTA, database) 49 | val rewrittenQuery = new ElasticSensitivityRewriter(config).run(query) 50 | printQuery(rewrittenQuery.toSql()) 51 | } 52 | 53 | def sampleAndAggregateExample() = { 54 | println("*** Sample and aggregate example ***") 55 | val LAMBDA = 2.0 56 | 57 | // Example query: What is the average cost of orders for product 1? 
58 | val query = """ 59 | |SELECT AVG(order_cost) FROM orders 60 | |WHERE product_id = 1""" 61 | .stripMargin.stripPrefix("\n") 62 | 63 | // Print the example query and privacy budget 64 | val root = QueryParser.parseToRelTree(query, database) 65 | println("Original query:") 66 | printQuery(query) 67 | println(s"> Epsilon: $EPSILON") 68 | 69 | // Compute mechanism parameter values from the query. Note the rewriter does this automatically; here we calculate 70 | // the values manually so we can print them. 71 | val analysisResults = new HistogramAnalysis().run(root, database).colFacts.head 72 | println(s"> Aggregation function applied: ${analysisResults.outermostAggregation}") 73 | val tableName = analysisResults.references.head.table 74 | val approxRowCount = Schema.getTableProperties(database, tableName)("approxRowCount").toLong 75 | 76 | println(s"> Table being queried: $tableName") 77 | println(s"> Approximate cardinality of table '$tableName': $approxRowCount") 78 | println(s"> Number of partitions (default heuristic): $approxRowCount ^ 0.4 = ${math.floor(math.pow(approxRowCount, 0.4)).toInt}") 79 | println(s"> Lambda: $LAMBDA") 80 | 81 | // Rewrite the original query to enforce differential privacy using Sample and Aggregate. 82 | println("\nRewritten query:") 83 | val config = new SampleAndAggregateConfig(EPSILON, LAMBDA, database) 84 | val rewrittenQuery = new SampleAndAggregateRewriter(config).run(query) 85 | printQuery(rewrittenQuery.toSql()) 86 | } 87 | 88 | elasticSensitivityExample() 89 | sampleAndAggregateExample() 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/chorus/exception/TransformationException.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
/** Thrown when fatal errors are encountered during Presto to dataflow graph transformation. */
class TransformationException(val message: String) extends RuntimeException(message)

/** Thrown when Presto parsing fails, e.g., because the query has a syntax problem or because it uses
  * a dialect of SQL not supported by Presto's SQL grammar.
  */
class ParsingException(message: String) extends TransformationException(message)

/** Thrown when processing a query that uses SELECT * on a relation for which the schema is unknown or
  * incomplete.
  */
class AmbiguousWildcardException(message: String) extends TransformationException(message)

/** Thrown when processing a query that references a column in two or more relations such that the
  * reference is ambiguous. For example, if tables A and B both have column "city_id", this query is
  * ambiguous and would produce a runtime error on the database:
  *
  *   SELECT blah FROM A JOIN B on column_from_a = city_id
  *
  * Note this query would be legal if either: the ambiguous "city_id" is qualified with a dereference
  * expression (e.g., "B.city_id") OR all non-dereference columns are unambiguous by schema (e.g.,
  * "column_from_a" appears only in relation A and "city_id" appears only in relation B).
  */
class AmbiguousColumnReference(message: String) extends TransformationException(message)

/** Thrown when tree transformation detects an infinite loop (e.g., because the tree has a cycle),
  * which would otherwise result in a StackOverflowException.
  */
class InfiniteLoopException(message: String) extends TransformationException(message)

/** Thrown in exceptional cases when processing joins in the query. */
class JoinException(message: String) extends TransformationException(message)

/** Thrown during graph transformation when the schema mode is STRICT and the query references a table
  * whose schema is not defined.
  */
class UndefinedSchemaException(message: String) extends TransformationException(message)

/** Thrown when transforming a query that references a relation in such a way that we cannot determine
  * which columns are accessed. Possible causes: the query is invalid, the schema is invalid or
  * incomplete, or the name resolution analysis has a bug.
  */
class UnknownColumnException(message: String) extends TransformationException(message)

/** Thrown when trying to parse a query that is known to be invalid, for example because the list of
  * returned columns is either empty or "error", or when trying to rewrite a query that is not a
  * supported type (e.g., a raw data query in a differential privacy rewriter).
  */
class InvalidQueryException(message: String) extends TransformationException(message)

/** Thrown when the tree contains a node type that is not recognized or is unsupported. */
class UnrecognizedNodeTypeException(message: String) extends TransformationException(message)
/** Methods for specifying column references and definitions in rewriting operations. */

/** A column defined by an expression, with no alias attached. */
class ColumnDefinition[+T <: Expr](val expr: T)

/** A column defined by an expression together with an alias. */
case class ColumnDefinitionWithAlias[+T <: Expr](override val expr: T, alias: String)
  extends ColumnDefinition[T](expr)

/** A column defined by an expression, an alias, and an index used to look up its fact in analysis results. */
case class ColumnDefinitionWithOrdinal[+T <: Expr](override val expr: T, alias: String, idx: Int)
  extends ColumnDefinition[T](expr)

object ColumnDefinition {
  import scala.collection.JavaConverters._
  import scala.language.implicitConversions

  // Automatically cast to column if alias is attached to an expression
  implicit class ExprColumnAlias[T <: Expr](expr: T) {
    def AS(alias: String): ColumnDefinitionWithAlias[T] = ColumnDefinitionWithAlias[T](expr, alias)
    def AS(alias: ColumnReferenceByName): ColumnDefinitionWithAlias[T] = AS(alias.name)
  }

  // Allow renaming of a column (keeping the same expression)
  implicit class ColumnAlias[T <: Expr](col: ColumnDefinition[T]) {
    def AS(alias: String): ColumnDefinitionWithAlias[T] = ColumnDefinitionWithAlias[T](col.expr, alias)
  }

  // Allow easy lookup of the column fact from an analysis result
  implicit class ColumnFactLookup[F](results: ColumnFacts[F]) {
    def apply[T <: Expr](col: ColumnDefinitionWithOrdinal[T]): F = results(col.idx)
  }
  implicit class NodeColumnFactLookup[F](results: NodeColumnFacts[_,F]) {
    def apply[T <: Expr](col: ColumnDefinitionWithOrdinal[T]): F = results.colFacts(col.idx)
  }

  /** Creates a relation from a list of column definitions. */
  def rel(cols: ColumnDefinition[Expr]*): Relation = columnDefsToRelation(cols)

  /** Builds a LogicalProject over a one-row LogicalValues input, with one projection per column
    * definition, and wraps it as a Relation.
    */
  implicit def columnDefsToRelation(cols: Seq[ColumnDefinition[Expr]]): Relation = {
    val frameworkConfig = Frameworks.newConfigBuilder
      .defaultSchema(Frameworks.createRootSchema(true))
      .build
    val cluster = new Transformer(frameworkConfig).cluster

    val inputRel = LogicalValues.createOneRow(cluster)
    val projections = cols.map { col => col.expr.toRex(Relation(inputRel)) }
    val rowType = Helpers.getRecordType(cols.zip(projections))

    Relation(LogicalProject.create(inputRel, projections.asJava, rowType))
  }

  implicit def columnReferenceToColumnDefinitionWithName(col: ColumnReferenceByName): ColumnDefinitionWithAlias[ColumnReferenceByName] =
    ColumnDefinitionWithAlias[ColumnReferenceByName](col, col.name)

  implicit def columnDefinitionWithAliasToColumnReferenceByName[T <: Expr](col: ColumnDefinitionWithAlias[T]): ColumnReferenceByName =
    Expr.col(col.alias)

  implicit def exprToColumnDefinition[T <: Expr](expr: T): ColumnDefinition[T] =
    new ColumnDefinition[T](expr)
}
differential privacy. */ 9 | object ElasticSensitivity { 10 | /** Generate Laplace noise centered at 0 with the given scale. 11 | * 12 | * @param scale The scale of the noise 13 | * @return A single random number drawn from the distribution 14 | */ 15 | def laplace(scale: Double): Double = { 16 | val u = 0.5 - scala.util.Random.nextDouble() 17 | -math.signum(u) * scale * math.log(1 - 2*math.abs(u)) 18 | } 19 | 20 | /** Compute the elastic sensitivity of the query at distance k. 21 | * 22 | * Note: when calculating elastic sensitivity for sequential values of k (e.g., to use a smoothing function), use the 23 | * stream method below, which caches the query parse tree and is therefore much more efficient. 24 | * 25 | * @param query The input query 26 | * @param k The desired distance from the true database 27 | * @return Elastic sensitivity of query at distance k 28 | */ 29 | def elasticSensitivity(query: Relation, database: Database, k: Int): Double = { 30 | val analysis = new ElasticSensitivityAnalysis() 31 | analysis.setK(k) 32 | 33 | val result = analysis.analyzeQuery(query, database).colFacts 34 | assert (result.size == 1) // this function works for single-column queries. 35 | result.head.sensitivity.get 36 | } 37 | 38 | /** Returns a (lazily evaluated) stream of elastic sensitivities of the given column for the query at every distance k. 39 | * 40 | * @param query The input query 41 | * @return Elastic sensitivities for every distance k from the true database (k = 0, 1, 2, ...) 42 | */ 43 | def elasticSensitivityStream(query: Relation, database: Database, col: Int): Stream[Double] = { 44 | val analysis = new ElasticSensitivityAnalysis() 45 | 46 | Stream.from(0).map{ k => 47 | analysis.setK(k) 48 | val result = analysis.analyzeQuery(query, database).colFacts 49 | result(col).sensitivity.get 50 | } 51 | } 52 | 53 | /** Compute the smoothed elastic sensitivity for a given column of the query with a given epsilon. 
54 | * 55 | * @param query The input query 56 | * @param col The index of the target column (0-based) 57 | * @param epsilon The desired privacy budget 58 | * @param delta The value of the delta parameter 59 | * @return The smoothed elastic sensitivity 60 | */ 61 | def smoothElasticSensitivity(query: Relation, database: Database, col: Int, epsilon: Double, delta: Double): Double = { 62 | /** Calculates the smooth elastic sensitivity by recursively computing smooth sensitivity for each value of k 63 | * until the function decreases at k+1. Since elastic sensitivity increases polynomially (at worst) in k while the 64 | * smoothing factor decays exponentially in k, this provides the correct (maximum) smooth sensitivity without 65 | * requiring computation for every k up to the size of the database. 66 | */ 67 | def sensitivityAtDistance(k: Int, prevSensitivity: Double, esStream: Stream[Double]): Double = { 68 | val elasticSensitivityAtK = esStream.head 69 | val beta = epsilon / (2 * Math.log(2 / delta)) 70 | val smoothSensitivity = Math.exp(-k * beta) * elasticSensitivityAtK 71 | 72 | if ((elasticSensitivityAtK == 0) || (smoothSensitivity < prevSensitivity)) prevSensitivity 73 | else sensitivityAtDistance(k+1, smoothSensitivity, esStream.tail) 74 | } 75 | 76 | sensitivityAtDistance(0, 0, elasticSensitivityStream(query, database, col)) 77 | } 78 | 79 | /** Produce a differentially private result for a query given its non-private result and the desired privacy budget. 80 | * 81 | * @param query The input query. It must return a single row and single column. 82 | * @param result The non-private result of running the query (a single number). 83 | * @param epsilon The desired privacy budget (e.g. 0.1). 84 | * @param delta The desired delta parameter (e.g. 1/n^2) 85 | * @return A differentially private answer to the input query. 
86 | */ 87 | def addNoise(query: String, database: Database, result: Double, epsilon: Double, delta: Double): Double = { 88 | val tree = QueryParser.parseToRelTree(query, database) 89 | val sensitivity = ElasticSensitivity.smoothElasticSensitivity(tree, database, 0, epsilon, delta) 90 | result + laplace(2 * sensitivity / epsilon) 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/main/scala/chorus/analysis/differential_privacy/RestrictedSensitivityAnalysis.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 
21 | */ 22 | 23 | package chorus.analysis.differential_privacy 24 | 25 | import chorus.dataflow.column.NodeColumnFacts 26 | import chorus.exception.{AnalysisException, UnsupportedQueryException} 27 | import chorus.sql.relational_algebra._ 28 | import org.apache.calcite.rel.core.Join 29 | 30 | /** Restricted sensitivity analysis. Calculates the global sensitivity of a query over a restricted class of datasets 31 | * defined by properties of the data model (in particular the max frequency of join keys), which is presumed known by 32 | * the querier. 33 | * 34 | * @see [[https://arxiv.org/abs/1208.4586 Differentially Private Data Analysis of Social Networks via Restricted Sensitivity]] 35 | */ 36 | class RestrictedSensitivityAnalysis extends ElasticSensitivityAnalysis { 37 | 38 | override def transferJoin(node: Join, state: NodeColumnFacts[RelStability,ColSensitivity]): NodeColumnFacts[RelStability,ColSensitivity] = { 39 | /** Update the stability at every join, per restricted sensitivity definition. 
40 | */ 41 | val equijoinColumns = RelUtils.extractEquiJoinColumns(node, node.getCondition) 42 | if (equijoinColumns.isEmpty) 43 | throw new UnsupportedQueryException(s"This analysis only works on single-clause equijoins.") 44 | 45 | val (leftColumnIndex, rightColumnIndex) = equijoinColumns.get 46 | 47 | val leftState = resultMap(Relation(node.getLeft)) 48 | val rightState = resultMap(Relation(node.getRight)) 49 | 50 | val leftStability = leftState.nodeFact.stability 51 | val rightStability = rightState.nodeFact.stability 52 | 53 | // Determine if this is a self-join: get the intersection of ancestors for the left and right relations 54 | // If the intersection is not empty, then this is a self-join (and restricted sensitivity doesn't support it) 55 | val isSelfJoin = (leftState.nodeFact.ancestors intersect rightState.nodeFact.ancestors).nonEmpty 56 | if (isSelfJoin) 57 | throw new UnsupportedQueryException("This analysis does not support self joins") 58 | 59 | // Determine the stability of the join 60 | val leftColFact = leftState.colFacts(leftColumnIndex) 61 | val rightColFact = rightState.colFacts(rightColumnIndex) 62 | 63 | val maxFreqLeftJoinColumn = leftColFact.maxFreq 64 | val maxFreqRightJoinColumn = rightColFact.maxFreq 65 | 66 | val newStability = 67 | (maxFreqLeftJoinColumn, maxFreqRightJoinColumn) match { 68 | case (l, r) if l <= 1.0 => r * leftStability 69 | case (l, r) if r <= 1.0 => l * rightStability 70 | case _ => throw new UnsupportedQueryException("This analysis does not support many-to-many joins") 71 | } 72 | 73 | val newNodeState = state.nodeFact.copy( 74 | stability = newStability 75 | ) 76 | 77 | /** Update the max frequency for every column by a factor of the max frequency of the join key in the opposing 78 | * relation. This models the worst-case situation where each record containing the most-frequent-key is duplicated 79 | * this many times by the join. 
80 | */ 81 | val newColState = 82 | leftState.colFacts.map { x => x.copy(maxFreq = x.maxFreq * maxFreqRightJoinColumn) } ++ 83 | rightState.colFacts.map { x => x.copy(maxFreq = x.maxFreq * maxFreqLeftJoinColumn) } 84 | 85 | NodeColumnFacts(newNodeState, newColState) 86 | } 87 | 88 | override def setK(k: Int): Unit = throw new AnalysisException("This analysis does not use K") 89 | } 90 | -------------------------------------------------------------------------------- /src/main/scala/chorus/analysis/differential_privacy/SensitivityDomain.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 
21 | */ 22 | 23 | package chorus.analysis.differential_privacy 24 | 25 | import chorus.dataflow.domain.AbstractDomain 26 | 27 | /** Abstract domain for columns in elastic sensitivity analysis. 28 | * 29 | * @param sensitivity Elastic sensitivity for this column. Always an upper bound of local sensitivity. 30 | * This is a floating point lattice with bottom (undefined sensitivity) represented by Option.None, 31 | * top (unbounded sensitivity) represented by Some(Infinity), and partial order defined by max. 32 | * @param maxFreq Max frequency of the column. The lattice is defined by the natural ordering. 33 | * @param aggregationApplied Has an aggregation already been applied to this column? 34 | * @param postAggregationArithmeticApplied Was a function/operation applied to post-aggregated result? We track this 35 | * only to print a helpful error message since this results in infinite 36 | * sensitivity. 37 | * @param canRelease Can the values of this column be released without adding noise? This is true for columns 38 | * of public tables and columns in private tables explicitly marked with canRelease=true 39 | * (as well as values derived therefrom). This is used to determine whether histogram bin 40 | * columns are safe for release. Boolean lattice with bottom = true and top = false 41 | */ 42 | case class ColSensitivity(sensitivity: Option[Double], 43 | maxFreq: Double, 44 | aggregationApplied: Boolean, 45 | postAggregationArithmeticApplied: Boolean, 46 | canRelease: Boolean, 47 | upperBound: Option[Double], 48 | lowerBound: Option[Double]) { 49 | override def toString: String = s"sensitivity: $sensitivity, maxFreq: $maxFreq, aggregationApplied: $aggregationApplied, postAggregationArithmeticApplied: $postAggregationArithmeticApplied, canRelease: $canRelease, upperBound: $upperBound, lowerBound: $lowerBound" 50 | } 51 | 52 | /** The abstract domain is a product lattice with pointwise ordering of the element types defined above. 
53 | */ 54 | object SensitivityDomain extends AbstractDomain[ColSensitivity] { 55 | override val bottom: ColSensitivity = 56 | ColSensitivity( 57 | sensitivity = None, // sensitivity is undefined until aggregations are applied 58 | maxFreq = 0.0, 59 | aggregationApplied = false, 60 | postAggregationArithmeticApplied = false, 61 | canRelease = true, 62 | upperBound = None, 63 | lowerBound = None) 64 | 65 | override def leastUpperBound(first: ColSensitivity, second: ColSensitivity): ColSensitivity = 66 | ColSensitivity( 67 | sensitivity = (first.sensitivity ++ second.sensitivity).reduceLeftOption(math.max), 68 | maxFreq = math.max(first.maxFreq, second.maxFreq), 69 | aggregationApplied = first.aggregationApplied || second.aggregationApplied, 70 | postAggregationArithmeticApplied = first.postAggregationArithmeticApplied || second.postAggregationArithmeticApplied, 71 | canRelease = first.canRelease && second.canRelease, 72 | upperBound = (first.upperBound ++ second.upperBound).reduceLeftOption(math.max), 73 | lowerBound = (first.upperBound ++ second.upperBound).reduceLeftOption(math.min)) 74 | } 75 | -------------------------------------------------------------------------------- /src/main/scala/chorus/sql/dataflow_graph/relation/Join.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.sql.dataflow_graph.relation 24 | 25 | import com.facebook.presto.sql.tree.{Node => PrestoNode} 26 | import chorus.exception.AmbiguousColumnReference 27 | import chorus.sql.dataflow_graph.reference.Reference 28 | 29 | /** A relation created by JOIN. 
30 | */ 31 | object JoinType { 32 | sealed trait JoinType 33 | case object CROSS extends JoinType 34 | case object INNER extends JoinType 35 | case object LEFT extends JoinType 36 | case object RIGHT extends JoinType 37 | case object FULL extends JoinType 38 | 39 | def parse(name: String): JoinType = { 40 | name match { 41 | case "CROSS" => CROSS 42 | case "INNER" => INNER 43 | case "LEFT" => LEFT 44 | case "RIGHT" => RIGHT 45 | case "FULL" => FULL 46 | case "IMPLICIT" => CROSS // implicit joins are effectively cross joins 47 | case _ => throw new IllegalArgumentException(s"Unknown join type: $name") 48 | } 49 | } 50 | } 51 | 52 | case class Join( 53 | left: Relation, 54 | right: Relation, 55 | joinType: JoinType.JoinType, 56 | condition: Option[Reference] = None) 57 | (implicit override val prestoSource: Option[PrestoNode] = None) 58 | extends Relation(left.columnNames ++ right.columnNames, prestoSource) { 59 | 60 | val children = List(left, right) ++ (if (condition.isDefined) List(condition.get) else Nil) 61 | 62 | /** Returns the column index for the column appearing in the *specified* inner relation. This is a substitute for the 63 | * Relation.getColumnIndex method for cases where the select item references a joined table by name. For example, 64 | * if table1 and table2 both have a column "uuid", this method can be used to resolve the correct index for the 65 | * query: SELECT table2.uuid from table1 JOIN table2 ... 66 | * 67 | * Returns the index of the specific column, or None if the column doesn't exist. 68 | */ 69 | def getColumnIndexForInnerRelation(colName: String, innerRelation: Relation): Option[Int] = { 70 | 71 | def visitRelation(indexSoFar: Int, relation: Relation): Option[Int] = 72 | 73 | if (relation == innerRelation) { 74 | // We found the specified inner relation. Return. 
75 | val result = relation.getColumnIndexes(colName) 76 | result.size match { 77 | case 0 => None // Column not found 78 | case 1 => Some(indexSoFar + result(0)) 79 | case _ => throw new AmbiguousColumnReference("Relation " + this.toString + " has more than one column named " + colName) 80 | } 81 | 82 | } else { 83 | relation match { 84 | case join: Join => 85 | // We are processing a Join node. Call the visitRelation method recursively on both the left and right relations 86 | val leftResult = visitRelation(indexSoFar, join.left) 87 | val rightResult = visitRelation(indexSoFar + join.left.numCols, join.right) 88 | 89 | (leftResult, rightResult) match { 90 | case (Some(a), None) => Some(a) // we found the column in .left 91 | case (None, Some(b)) => Some(b) // we found the column in .right 92 | case (None, None) => None 93 | case (Some(a), Some(b)) => 94 | if (join.left == join.right) // a table is joined with itself (so the graph node is shared, and both sides match). Return the left one; it doesn't matter. 
95 | Some(a) 96 | else // This should never happen 97 | throw new AmbiguousColumnReference("Children of relation " + this.toString + " both match target inner relation node and have a column named " + colName) 98 | } 99 | 100 | case _ => None 101 | } 102 | } 103 | 104 | visitRelation(0, this) 105 | } 106 | 107 | override val nodeStr = joinType.toString 108 | override def toString = joinType.toString + " JOIN" 109 | } 110 | -------------------------------------------------------------------------------- /src/main/scala/chorus/rewriting/Rewriter.scala: -------------------------------------------------------------------------------- 1 | package chorus.rewriting 2 | 3 | import chorus.schema.Database 4 | import chorus.sql.relational_algebra.{RelUtils, Relation} 5 | import chorus.sql.{AbstractAnalysis, QueryParser, TreePrinter} 6 | import org.apache.calcite.plan.{Convention, RelOptAbstractTable} 7 | import org.apache.calcite.rel.core._ 8 | import org.apache.calcite.rel.rules.ProjectMergeRule 9 | 10 | /** Root class for rewriters. 11 | * 12 | * @tparam C Config for rewriter. 13 | */ 14 | abstract class Rewriter[C <: RewriterConfig](config: C) { 15 | /** Rewrites the given relational algebra tree with the given config. Implemented by subclasses. 16 | */ 17 | def rewrite(root: Relation): Relation 18 | 19 | /** Entry point for rewriting by callers. Rewrites the given query with this rewriter using the given config. 20 | */ 21 | def run(query: String): RewriterResult = { 22 | val root = QueryParser.parseToRelTree(query, config.database) 23 | run(root) 24 | } 25 | 26 | /** Rewrites the given relational algebra tree with this rewriter using the given config. 
27 | */ 28 | def run(root: Relation): RewriterResult = { 29 | if (AbstractAnalysis.DEBUG) { 30 | println("--- Original query ---") 31 | TreePrinter.printRelTree(root) 32 | } 33 | 34 | val rewrittenTree = rewrite(root) 35 | 36 | if (AbstractAnalysis.DEBUG) { 37 | println("--- Rewritten query (${this.getClass.getSimpleName}) ---") 38 | printTreeAndSql(rewrittenTree) 39 | } 40 | 41 | new RewriterResult(rewrittenTree, config) 42 | } 43 | 44 | /** For debugging. */ 45 | def printTreeAndSql(root: Relation): Unit = { 46 | val withQueries = root.collect{ case Relation(w: WithTable) => w }.toSet 47 | withQueries.foreach{ q => 48 | println(s"WITH ${q.alias} AS") 49 | TreePrinter.printRelTree(q.definition) 50 | println("\n") 51 | } 52 | TreePrinter.printRelTree(root) 53 | 54 | println("---") 55 | println(Rewriter.toSqlWithAliases(root, config.database.dialect)) 56 | println("") 57 | } 58 | 59 | class RewriterResult(val root: Relation, config: C) { 60 | /** Emits a SQL query for the given rewritten result. 61 | */ 62 | def toSql(dialect: String = config.database.dialect): String = Rewriter.toSqlWithAliases(root, dialect) 63 | } 64 | } 65 | 66 | object Rewriter { 67 | import chorus.rewriting.rules.Operations._ 68 | 69 | /** Transforms the given relation into SQL, preserving any aliases specified by the rewriter. 
70 | */ 71 | def toSqlWithAliases(root: Relation, dialect: String, aliasRelationsInScope: Set[WithTable] = Set.empty): String = { 72 | val withQueries = root.collect{ case Relation(w: WithTable) => w }.toList.distinct.filter(!aliasRelationsInScope.contains(_)).sortBy(_.alias) 73 | 74 | val withClauses = withQueries.zipWithIndex.map { case (w, idx) => 75 | val queryStr = toSqlWithAliases(w.definition, dialect, aliasRelationsInScope ++ withQueries.take(idx)).replace("\n", "\n ") 76 | s"${w.alias} AS (\n ${queryStr}\n)" 77 | } 78 | 79 | val withPrefix = if (withClauses.isEmpty) "" else withClauses.mkString("WITH ", ", ", "\n") 80 | val querySql = RelUtils.relToSql(root.optimize(ProjectMergeRule.INSTANCE), dialect) 81 | withPrefix + querySql 82 | } 83 | } 84 | 85 | /** Dummy class to store relations that are to be defined as WITH clauses in the rewritten query (the relational 86 | * algebra tree has no representation for this since it does not admit aliases). 87 | */ 88 | case class WithTable(definition: Relation, alias: String) extends TableScan( 89 | definition.getCluster, 90 | definition.getCluster.traitSetOf(Convention.NONE), 91 | new RelOptAbstractTable(null, alias, definition.getRowType) {}) { 92 | override def equals(obj: scala.Any): Boolean = { 93 | obj match { 94 | case w: WithTable => this.definition.equals(w.definition) && this.alias.equals(w.alias) 95 | case _ => false 96 | } 97 | } 98 | } 99 | 100 | /** Flags for all rewriters */ 101 | class RewriterConfig(val database: Database) 102 | 103 | /** Flags for differential privacy-based rewriters */ 104 | class DPRewriterConfig( 105 | /** The privacy budget allocated to this query. Callers are responsible for tracking the remaining budget. */ 106 | val epsilon: Double, 107 | 108 | /** The database being queried. */ 109 | override val database: Database, 110 | 111 | /** Should rewriter add logic to automatically insert histogram bins from domain? 
This flag should be true if 112 | * query results are released directly. 113 | * 114 | * This is necessary for histogram queries since a DP mechanism must return a noisy result for all records in the 115 | * domain - including those not appearing in the output - in order to avoid leaking information via the presence or 116 | * absence of a bin. If this flag is true, missing bins will be populated with noisy empty results, and query 117 | * rewriting will fail if this cannot be achieved. 118 | */ 119 | val fillMissingBins: Boolean) 120 | extends RewriterConfig(database) 121 | 122 | 123 | class RewritingException(val msg: String) extends RuntimeException(msg) 124 | -------------------------------------------------------------------------------- /src/main/scala/chorus/rewriting/DPUtil.scala: -------------------------------------------------------------------------------- 1 | package chorus.rewriting 2 | 3 | import chorus.analysis.histogram.AggregationInfo 4 | import chorus.dataflow.column.AbstractColumnAnalysis.ColumnFacts 5 | import chorus.rewriting.rules.ColumnDefinition._ 6 | import chorus.rewriting.rules.Expr.{Abs, Case, Ln, Rand, _} 7 | import chorus.rewriting.rules.Operations._ 8 | import chorus.rewriting.rules.{Helpers, ValueExpr} 9 | import chorus.schema.Schema 10 | import chorus.sql.relational_algebra.Relation 11 | import org.apache.calcite.rel.core._ 12 | 13 | /** Utilities for differential privacy-based rewriters. */ 14 | object DPUtil { 15 | // Expression to sample a random value from the Laplace distribution. 16 | val LaplaceSample: ValueExpr = Case((Rand-0.5) < 0, -1.0, 1.0) * Ln(1 - (2 * Abs(Rand-0.5))) 17 | 18 | /** Rewrites the relation to add all values in the domain of the binned column that are not present in the result set 19 | * if necessary (as determined by the [fillMissingBins] flag). If the provided aggregation contains no grouped 20 | * columns, returns the original relation. 
21 | * 22 | * To support this feature the schema must define flag 'domainSet' for any database column usable as a histogram bin. 23 | * The value of this flag is a fully qualified column in the same database whose records enumerate all values in that 24 | * column's domain. This flag may point to itself (e.g., if the column's values already span the domain) or it may 25 | * refer to an auxiliary table. If the flag is not defined, this method throws an error since the rewritten query's 26 | * result cannot be safely returned; in such cases the mechanism must either perform additional processing on the 27 | * results or interpose between the results and analyst. 28 | */ 29 | def addBinsFromDomain(node: Aggregate, 30 | histogramResults: ColumnFacts[AggregationInfo], 31 | config: DPRewriterConfig): Relation = { 32 | import scala.collection.JavaConverters._ 33 | 34 | if (!config.fillMissingBins) 35 | return Relation(node) 36 | 37 | val cols = node.getRowType.getFieldList.asScala 38 | val (groupedCols, aggCols) = cols.splitAt(node.getGroupCount) 39 | 40 | if (groupedCols.length > 1) throw new RewritingException("Multi-column grouping in histograms is not yet supported.") 41 | val groupedColIdx = groupedCols.head.getIndex 42 | val groupedColInfo = histogramResults(groupedColIdx) 43 | val groupedColName = cols(groupedColIdx).getName 44 | 45 | if (aggCols.length > 1) throw new RewritingException("Multi-column aggregations in histograms are not yet supported.") 46 | val aggColIdx = aggCols.head.getIndex 47 | val origAggColAlias = cols(aggColIdx).getName 48 | 49 | // Ensure aggregation column has explicit alias in relation, otherwise Calcite will reference it using a derived 50 | // alias (e.g. EXPR$0), which will fail on the actual database. 
51 | val (withAggColAlias, explicitAggAlias) = 52 | if (Helpers.isDerivedAlias(origAggColAlias)) { 53 | val explicitAlias = "_agg" 54 | val rel = Relation(node).mapCols { col => 55 | if (col.idx == aggColIdx) 56 | EnsureAlias(col.expr) AS explicitAlias 57 | else col 58 | } 59 | (rel, explicitAlias) 60 | } 61 | else (Relation(node), origAggColAlias) 62 | 63 | val defaultVal = 0 64 | 65 | // If the value of the histogram bin has been modified prior to grouping, this approach will not work. 66 | if (groupedColInfo.valueModified) throw new RewritingException(s"Histogram column $groupedColName has modified valued.") 67 | 68 | // If the histogram is not derived from exactly one database column, this approach will not work. 69 | if (groupedColInfo.references.size != 1) throw new RewritingException(s"Histogram column must derive its values from exactly one database column.") 70 | 71 | // Figure out which database column contains the domain values for to the histogram column. 72 | val targetCol = groupedColInfo.references.head 73 | val colProperties = Schema.getSchemaMapForTable(config.database, targetCol.table)(targetCol.column).properties 74 | val domainSetFlag = colProperties.getOrElse("domainSet", throw new RewritingException( 75 | s"Column '${targetCol.column}' in table '${targetCol.table}' is used as a histogram bin. " + 76 | "Please define 'domainSet' parameter specifying a table/column that enumerates all values from this column's domain. 
" + 77 | "To disable this check set fillMissingBins = false in rewriter config (if disabled, query results are NOT safe for release)")) 78 | 79 | val (domainSetTable, domainSetCol) = { 80 | val elems = domainSetFlag.split('.') 81 | val (tbl, col) = elems.splitAt(elems.length-1) 82 | (tbl.mkString("."), col.head) 83 | } 84 | 85 | val domainSetRel = table(domainSetTable, config.database).project(col(domainSetCol) AS "_domain") 86 | 87 | withAggColAlias 88 | .asAlias("_orig") 89 | .project(col(groupedColIdx), EnsureAlias(col(explicitAggAlias))) 90 | .join(domainSetRel, left(0) == right(0), JoinRelType.RIGHT) 91 | .project(right(0) AS groupedColName, 92 | Case(IsNull(left(0)), defaultVal, left(1)) AS origAggColAlias) 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/main/scala/chorus/analysis/histogram/HistogramAnalysis.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017 Uber Technologies, Inc. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.analysis.histogram 24 | 25 | import chorus.dataflow.AggFunctions._ 26 | import chorus.dataflow.column.{NodeColumnFacts, RelNodeColumnAnalysis} 27 | import chorus.dataflow.domain._ 28 | import chorus.dataflow.domain.lattice.FlatLatticeDomain 29 | import chorus.sql.relational_algebra.RelUtils 30 | import org.apache.calcite.rel.core.{Aggregate, TableScan} 31 | import org.apache.calcite.rex.{RexNode, RexSlot} 32 | 33 | /** Returns the aggregation status of each output column of a query. The results of this analysis are used to classify 34 | * queries as statistical or raw data, determine which columns contain aggregations, and track the provenance of 35 | * aggregated columns and histogram bins. 
36 | */ 37 | class HistogramAnalysis extends RelNodeColumnAnalysis(UnitDomain, AggregationDomain) { 38 | 39 | override def transferAggregate(node: Aggregate, aggFunctions: IndexedSeq[Option[AggFunction]], state: NodeColumnFacts[Unit, AggregationInfo]) = { 40 | val newColFacts = state.colFacts.zipWithIndex.map { case (state, idx) => 41 | val aggFunction = aggFunctions(idx) 42 | 43 | if (aggFunction.isEmpty) // grouped column 44 | state.copy(isGroupBy = true) 45 | else { 46 | val newReferences: Set[QualifiedColumnName] = aggFunction.get match { 47 | case COUNT => state.references.map{ _.table }.toList.distinct.map{ QualifiedColumnName(_, "*") }.toSet 48 | case _ => state.references 49 | } 50 | 51 | AggregationInfo( 52 | isAggregation = true, 53 | outermostAggregation = aggFunction, 54 | references = newReferences, 55 | valueModified = true, 56 | isGroupBy = false 57 | ) 58 | } 59 | } 60 | 61 | NodeColumnFacts(UnitDomain.bottom, newColFacts) 62 | } 63 | 64 | override def transferExpression(node: RexNode, state: AggregationInfo): AggregationInfo = { 65 | node match { 66 | case _: RexSlot => state 67 | case _ => state.copy(valueModified = true) 68 | } 69 | } 70 | 71 | override def transferTableScan(node: TableScan, state: NodeColumnFacts[Unit, AggregationInfo]) = { 72 | import scala.collection.JavaConverters._ 73 | 74 | val tableName = RelUtils.getQualifiedTableName(node) 75 | val colNames = node.getRowType.getFieldNames.asScala 76 | 77 | val newColFacts = state.colFacts.zip(colNames).map { case (state, colName) => 78 | val qualifiedColName = QualifiedColumnName(tableName, colName) 79 | state.copy(references = Set(qualifiedColName)) 80 | } 81 | 82 | NodeColumnFacts(UnitDomain.bottom, newColFacts) 83 | } 84 | } 85 | 86 | /** Information about the aggregation status of a column 87 | * 88 | * @param isAggregation Is this column any type of aggregation? 
* @param outermostAggregation Outermost aggregation function applied to references
 * @param references Data provenance of the column (i.e., each database column influencing this column's value)
 * @param valueModified Was any function/operation/expression applied to this column? If (and only if) false, values of
 *                      this column are guaranteed to correspond exactly to values in database table [references].
 * @param isGroupBy Is this column grouped?
 */
case class AggregationInfo(
  isAggregation: Boolean,
  outermostAggregation: DomainElem[AggFunction],
  references: Set[QualifiedColumnName],
  valueModified: Boolean,
  isGroupBy: Boolean)

/** Abstract domain over [[AggregationInfo]]: boolean flags are joined with logical OR, provenance sets with set
 * union, and the outermost aggregation with the flat-lattice join.
 */
object AggregationDomain extends AbstractDomain[AggregationInfo] {
  /** Bottom element: not an aggregation, no provenance, value unmodified, not grouped. */
  override val bottom: AggregationInfo = AggregationInfo(
    isAggregation = false,
    outermostAggregation = FlatLatticeDomain.bottom,
    references = Set.empty,
    valueModified = false,
    isGroupBy = false
  )

  /** Joins two facts field by field. */
  override def leastUpperBound(first: AggregationInfo, second: AggregationInfo): AggregationInfo =
    first.copy(
      isAggregation = first.isAggregation || second.isAggregation,
      outermostAggregation = FlatLatticeDomain.leastUpperBound(first.outermostAggregation, second.outermostAggregation),
      references = first.references ++ second.references,
      valueModified = first.valueModified || second.valueModified,
      isGroupBy = first.isGroupBy || second.isGroupBy
    )
}

/** A column name qualified by the name of the table it belongs to. */
case class QualifiedColumnName(table: String, column: String)
--------------------------------------------------------------------------------
/src/main/scala/chorus/rewriting/differential_privacy/WPINQRewriter.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2017 Uber Technologies, Inc.
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.rewriting.differential_privacy 24 | 25 | import chorus.analysis.histogram.{HistogramAnalysis, QueryType} 26 | import chorus.dataflow.AggFunctions.COUNT 27 | import chorus.dataflow.domain.UnitDomain 28 | import chorus.exception.UnsupportedQueryException 29 | import chorus.rewriting.rules.ColumnDefinition._ 30 | import chorus.rewriting.rules.Expr.{col, _} 31 | import chorus.rewriting.rules.Operations._ 32 | import chorus.rewriting.rules._ 33 | import chorus.rewriting.{DPRewriterConfig, DPUtil, Rewriter} 34 | import chorus.schema.Database 35 | import chorus.sql.relational_algebra.{RelUtils, Relation} 36 | import org.apache.calcite.rel.core._ 37 | 38 | /** 39 | * Rewriter for WPINQ. Converts a SQL counting query into a query that returns a noisy count of weights. 
*
 * @see [[https://arxiv.org/abs/1203.3453 Calibrating Data to Sensitivity in Private Data Analysis]]
 */
class WPINQRewriter(config: WPINQConfig) extends Rewriter(config) {

  /** Rewrites a counting query so that it returns noisy sums of record weights.
   *
   * @param root Root of the relational algebra tree to rewrite.
   * @return The rewritten tree.
   * @throws UnsupportedQueryException if the query is not a counting query or contains a join that is not an
   *                                   equijoin.
   */
  def rewrite(root: Relation): Relation = {
    // Reject unsupported queries
    val histogramResults = new HistogramAnalysis().runAll(root, config.database)
    val rootFacts = histogramResults(root)
    val queryType = QueryType.getQueryType(rootFacts)

    val supportedTypes = Set(QueryType.HISTOGRAM, QueryType.NON_HISTOGRAM_STATISTICAL)
    val isCountingQuery =
      supportedTypes.contains(queryType) &&
      rootFacts.colFacts.forall { fact => !fact.isAggregation || fact.outermostAggregation.contains(COUNT) }

    if (!isCountingQuery)
      throw new UnsupportedQueryException("This rewriter only works on counting queries")

    val hasNonEquiJoin = root
      .collect { case Relation(join: Join) => join }
      .exists { join => RelUtils.extractEquiJoinColumns(join, join.getCondition).isEmpty }

    if (hasNonEquiJoin)
      throw new UnsupportedQueryException("This rewriter only works on queries with equijoins")

    root.rewriteRecursive(UnitDomain) { (node, original, _) =>
      node match {
        // Add initial weight column to tables.
        case Relation(scan: TableScan) =>
          (node.project(*, (config.initialWeights(scan): ValueExpr) AS "_weight"), ())

        // Ensure the weight column is projected through project nodes.
        case Relation(projection: Project) =>
          (projection.reproject(*, col("_weight")), ())

        case Relation(join: Join) =>
          val (leftKey, rightKey) = RelUtils.extractEquiJoinColumns(join, join.getCondition).getOrElse(throw new UnsupportedQueryException("This rewriter only supports equijoin conditions."))

          // Rename each side's weight column so both survive the join.
          val weightedLeft = Relation(join.getLeft).rename(col("_weight") AS "_A_w").asAlias("_A")
          val weightedRight = Relation(join.getRight).rename(col("_weight") AS "_B_w").asAlias("_B")

          // Total weight per join key on each side.
          val leftKeyTotals = weightedLeft.agg (col(leftKey)) (Sum(col("_A_w")) AS "_A_s")
          val rightKeyTotals = weightedRight.agg (col(rightKey)) (Sum(col("_B_w")) AS "_B_s")

          // NOTE(review): both key-total joins look up leftKey on the left side; presumably fine because the
          // equijoin makes the two key columns equal in every joined row -- confirm.
          val reweighted = node
            .replaceInputs(_ => List(weightedLeft, weightedRight))
            .join(leftKeyTotals, left(leftKey) == right(0))
            .join(rightKeyTotals, left(leftKey) == right(0))
            .project(*, ((col("_A_w") * col("_B_w")) / (col("_A_s") + col("_B_s"))) AS "_weight")
            .remove(col("_A_w"), col("_B_w"), col("_A_s"), col("_B_s"))

          (reweighted, ())

        case Relation(aggregate: Aggregate) =>
          val groupingCols = RelUtils.getGroupedCols(aggregate)
          // The output field that follows the grouped columns keeps its original name after rewriting.
          val aggregateColName = aggregate.getRowType.getFieldNames.get(groupingCols.length)
          val summedWeights = Relation(aggregate.getInput).agg (groupingCols: _*) (Sum(col("_weight")) AS "_weight_sum")

          // For histogram queries, ensure all values from domain appear in result set, assigning weighted sum 0 to
          // absent bins.
          val binsFilled = DPUtil.addBinsFromDomain(summedWeights.unwrap.asInstanceOf[Aggregate], histogramResults(original).colFacts, config)

          // Add noise to weights
          val noisy = binsFilled
            .project(*, col("_weight_sum") + (1.0 / config.epsilon) * DPUtil.LaplaceSample AS aggregateColName)
            .remove(col("_weight_sum"))

          (noisy, ())

        case _ => (node, ())
      }
    }
  }
}

/** Configuration for [[WPINQRewriter]].
 *
 * @param epsilon         Privacy parameter; the rewriter scales the Laplace sample by `1.0 / epsilon`.
 * @param database        Database being queried.
 * @param fillMissingBins Forwarded to [[DPRewriterConfig]]; defaults to true.
 */
class WPINQConfig(
    override val epsilon: Double,
    override val database: Database,
    override val fillMissingBins: Boolean = true)
  extends DPRewriterConfig(epsilon, database, fillMissingBins) {
  /** The initial weight assigned to each record in the given table. Default is 1.0. */
  def initialWeights(table: TableScan): Double = 1.0
}
--------------------------------------------------------------------------------
/src/main/scala/chorus/sql/AbstractAnalysis.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2017 Uber Technologies, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package chorus.sql

import chorus.exception.DPException
import chorus.schema.Database
import chorus.util.IdentityHashMap

import scala.collection.mutable

/** Abstract class for all analyses on parsed SQL queries.
 *
 * @tparam N The node type for the tree (AST, dataflow graph, or relational algebra)
 * @tparam T The return type of the analysis. For column fact analysis, [T] derives from ColumnFacts[_]. For visitor
 *           analyses, T is any object reference type. For abstract interpretation-based dataflow analyses, [T]
 *           derives from AbstractDomain.
 */
abstract class AbstractAnalysis[N <: AnyRef, T] extends TreeFunctions[N] {

  /** Allows code to symbolically reference return type of an analysis (e.g., HistogramAnalysis#ResultType) */
  type ResultType = T

  /******************************************************************************************************************
   * Public methods for analysis callers.
   *******************************************************************************************************************/

  /** Parses the query, runs the analysis on it and returns the abstract results at tree root. Any exception raised
   * while parsing or analyzing is rethrown wrapped in a DPException.
   */
  final def analyzeQuery(query: String, database: Database): T = wrapFailures {
    val parsed = parseQueryToTree(query, database)
    run(parsed, database)
  }

  /** Runs the analysis on the given parsed representation of the query. Any exception raised during analysis is
   * rethrown wrapped in a DPException.
   */
  final def analyzeQuery(root: N, database: Database): T = wrapFailures {
    run(root, database)
  }

  /** Evaluates `analysis`, wrapping every Exception it throws in a DPException. */
  private def wrapFailures(analysis: => T): T =
    try analysis
    catch {
      // Catch all exceptions that may occur during query parsing and analysis, and wrap in DPException type.
      case e: Exception => throw new DPException("Error during query analysis", e)
    }

  /** Runs the analysis on the tree and returns the abstract result at the tree root. Subclasses may override this
   * method to pre-process the query before analysis begins, but must call super.run().
   */
  def run(root: N, database: Database): T = {
    try {
      treeRoot = Some(root)
      currentDb = Some(database)
      resultMap.clear()
      process(root)
      // Reset only after a successful run, so a failing run leaves the current node available for debugging.
      currentNode = None
    }
    finally { // Print the tree even if analysis throws an exception
      if (AbstractAnalysis.DEBUG) {
        System.out.println(s"\n********** ${getClass.getSimpleName} **********")
        printTree(treeRoot.get)
      }
    }

    resultMap(root)
  }

  /** Returns the current database being queried. */
  def getDatabase: Database = currentDb.get

  /** Runs the analysis on the tree and returns a map from nodes in the tree to analysis state at that node. */
  def runAll(root: N, database: Database): mutable.HashMap[N, T] = {
    run(root, database)
    resultMap
  }

  /******************************************************************************************************************
   * Analysis engine internals.
   ******************************************************************************************************************/

  /** Map from each node in the tree to analysis results at that node. May be inspected by analysis implementations in
   * their transfer/join functions; results are guaranteed to exist for all nodes *below* the current node in the tree.
   */
  val resultMap: mutable.HashMap[N, T] = new IdentityHashMap[N, T]()

  /** The root node of the tree under analysis. */
  final var treeRoot: Option[N] = None

  /** The database being queried. */
  final var currentDb: Option[Database] = None

  /** The current node being processed. Subclasses should update this variable as tree is traversed to enable helpful
   * debugging when analysis throws an exception.
   */
  var currentNode: Option[N] = None

  /** Analysis entry point. Runs analysis and stores results in resultMap. */
  def process(root: N): Unit
}

object AbstractAnalysis {
  // Set this argument to print all analysis trees along with result state.
  val DEBUG: Boolean = System.getProperty("query.debug", "false").toBoolean
}
--------------------------------------------------------------------------------
/src/main/scala/chorus/dataflow/column/DFGColumnAnalysis.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2017 Uber Technologies, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | */ 22 | 23 | package chorus.dataflow.column 24 | 25 | import chorus.dataflow.column.AbstractColumnAnalysis.ColumnFacts 26 | import chorus.dataflow.domain.AbstractDomain 27 | import chorus.exception.AnalysisException 28 | import chorus.sql.dataflow_graph.reference.{ColumnReference, Function, UnstructuredReference} 29 | import chorus.sql.dataflow_graph.relation._ 30 | import chorus.sql.dataflow_graph.{DataflowGraphFunctions, Node} 31 | 32 | /** Column fact analysis on dataflow graphs. For more details see [[AbstractColumnAnalysis]]. 
*/
abstract class DataflowGraphColumnAnalysis[E, D <: AbstractDomain[E]](domain: AbstractDomain[E])
  extends AbstractColumnAnalysis[Node, E, D]
  with DataflowGraphFunctions
  with DataflowGraphColumnAnalysisFunctions[E] {

  /** Applies the node-type-specific transfer function to every column fact of the node.
   *
   * @param node  Node being processed.
   * @param state Column facts produced by joinNode for this node.
   * @return The transferred column facts, one per output column.
   */
  override final def transferNode(node: Node, state: ColumnFacts[E]): ColumnFacts[E] = {

    // Runs a per-column transfer function over all incoming facts, passing each fact's column index.
    def perColumn(transfer: (Int, E) => E): Seq[E] =
      state.zipWithIndex.map { case (fact, idx) => transfer(idx, fact) }

    val transferred: Seq[E] = node match {
      case select: Select =>
        perColumn((idx, fact) => transferSelect(select, idx, fact))

      case reference: ColumnReference =>
        List(transferColumnReference(reference, 0, state.head))

      case function: Function =>
        assert (state.length == 1)
        List(transferFunction(function, 0, state.head))

      case unstructured: UnstructuredReference =>
        assert (state.length == 1)
        List(transferUnstructuredReference(unstructured, 0, state.head))

      // Tables have no incoming facts; every column starts from the domain's bottom element.
      case table: DataTable =>
        (0 until table.numCols).map { idx => transferDataTable(table, idx, domain.bottom) }

      case join: Join =>
        if (state.size != join.numCols) throw new AnalysisException("Schema size mismatch (probably caused by unknown table) in JOIN[" + node.toString + "]. Some columns in this relation have unknown provenance, so analysis results may be incorrect.")
        perColumn((idx, fact) => transferJoin(join, idx, fact))

      case union: Union =>
        perColumn((idx, fact) => transferUnion(union, idx, fact))

      case except: Except =>
        perColumn((idx, fact) => transferExcept(except, idx, fact))
    }

    transferred.toIndexedSeq
  }

  /** Combines the results of a node's children into the incoming column facts for the node.
   *
   * @throws RuntimeException if the node type is not handled.
   */
  override def joinNode(node: Node, children: Iterable[Node]): ColumnFacts[E] = {
    node match {
      /** For Select, take the single fact produced by the reference of each SelectItem.
       */
      case select: Select =>
        select.items.map { item =>
          val itemFacts = resultMap(item.ref)
          assert (itemFacts.size == 1)
          itemFacts.head
        }.toIndexedSeq

      case reference: ColumnReference =>
        IndexedSeq(resultMap(reference.of)(reference.colIndex))

      /** For Function and UnstructuredReference, reduce all columns from all children into a single column fact.
       */
      case _: Function | _: UnstructuredReference =>
        flattenJoinChildren(domain, node, children)

      /** For Join, pass through state from left and right relations, joined with join condition fact.
       */
      case join: Join =>
        val passedThrough = resultMap(join.left) ++ resultMap(join.right)
        join.condition match {
          case Some(condition) =>
            passedThrough.map { fact => domain.leastUpperBound(fact, resultMap(condition).head) }
          case None =>
            passedThrough
        }

      /** For Union and Except, join column facts of corresponding columns from all children (schemas of children are guaranteed to match).
       */
      case _: Union | _: Except =>
        children
          .map { child => resultMap(child) }
          .transpose
          .map { _.reduce( (left, right) => domain.leastUpperBound(left, right) ) }
          .toIndexedSeq

      case _: DataTable => IndexedSeq.empty // we'll initialize the facts to bottom in the transfer function.

      case _ => throw new RuntimeException(s"Unsupported join node type ${node.getClass.getSimpleName}")
    }
  }
}

/** Subclasses may override any of these methods as appropriate.
 * Every default implementation returns the incoming fact unchanged.
 */
trait DataflowGraphColumnAnalysisFunctions[E] {
  def transferSelect(s: Select, idx: Int, fact: E): E = fact
  def transferColumnReference(c: ColumnReference, idx: Int, fact: E): E = fact
  def transferFunction(f: Function, idx: Int, fact: E): E = fact
  def transferUnstructuredReference(u: UnstructuredReference, idx: Int, fact: E): E = fact
  def transferDataTable(d: DataTable, idx: Int, fact: E): E = fact
  def transferJoin(j: Join, idx: Int, fact: E): E = fact
  def transferUnion(u: Union, idx: Int, fact: E): E = fact
  def transferExcept(e: Except, idx: Int, fact: E): E = fact
}
--------------------------------------------------------------------------------
/src/test/scala/com/uber/engsec/dp/analysis/histogram/HistogramAnalysisTest.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2017 Uber Technologies, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package chorus.analysis.histogram

import chorus.dataflow.AggFunctions._
import chorus.dataflow.domain.{Bottom, Top}
import chorus.schema.Schema
import junit.framework.TestCase

/** Tests for [[HistogramAnalysis]]: each case analyzes one query against the "test" database and compares the
 * per-column facts at the query root with the expected facts.
 */
class HistogramAnalysisTest extends TestCase {
  val database = Schema.getDatabase("test")

  /** Runs the analysis on the query and returns the column facts at the root as a list. */
  private def getResults(query: String) = {
    val analysis = new HistogramAnalysis
    analysis.analyzeQuery(query, database).colFacts.toList
  }

  /** Asserts that analyzing `query` yields exactly the given column facts, in order. */
  private def assertColumnFacts(query: String, expected: AggregationInfo*): Unit =
    TestCase.assertEquals(expected.toList, getResults(query))

  /** Asserts that analyzing the query throws an exception carrying exactly the given message. */
  def assertHistogramFailure(queryStr: String, errorMsg: String) = {
    try {
      getResults(queryStr)
      TestCase.fail("Unexpected successful transformation (was expecting exception)")
    }
    catch {
      case e: Exception => TestCase.assertEquals(errorMsg, e.getMessage)
    }
  }

  def testSimpleHistogram() = {
    assertColumnFacts(
      "SELECT product_id, COUNT(*) FROM orders GROUP BY product_id",
      AggregationInfo(false, Bottom, Set(QualifiedColumnName("public.orders", "product_id")), false, true),
      AggregationInfo(true, Some(COUNT), Set(QualifiedColumnName("public.orders", "*")), true, false)
    )
  }

  def testAliasHistogram() = {
    assertColumnFacts(
      "SELECT order_date as bin, COUNT(*) FROM orders GROUP BY bin",
      AggregationInfo(false, Bottom, Set(QualifiedColumnName("public.orders", "order_date")), false, true),
      AggregationInfo(true, Some(COUNT), Set(QualifiedColumnName("public.orders", "*")), true, false)
    )
  }

  def testModifiedHistogramBin() = {
    assertColumnFacts(
      "SELECT order_date+1 as bin, COUNT(*) FROM orders GROUP BY bin",
      AggregationInfo(false, Bottom, Set(QualifiedColumnName("public.orders", "order_date")), true, true),
      AggregationInfo(true, Some(COUNT), Set(QualifiedColumnName("public.orders", "*")), true, false)
    )
  }

  def testRoundFunction() = {
    // functions that input only aggregates should return true for isAggregation
    assertColumnFacts(
      "SELECT product_id, ROUND(COUNT(*), 0) FROM orders GROUP BY 1",
      AggregationInfo(false, Bottom, Set(QualifiedColumnName("public.orders", "product_id")), false, true),
      AggregationInfo(true, Some(COUNT), Set(QualifiedColumnName("public.orders", "*")), true, false)
    )
  }

  def testColumnReference() = {
    assertColumnFacts(
      "WITH t1 as (SELECT order_id as a FROM orders) SELECT a FROM t1",
      AggregationInfo(false, Bottom, Set(QualifiedColumnName("public.orders", "order_id")), false, false)
    )
  }

  def testDivision() = {
    // arithmetic of aggregations is still an aggregation, but outermost aggregation is Top since more than one
    // aggregation function was applied.
    assertColumnFacts(
      "SELECT (AVG(price) / COUNT(*)) as \"result\" FROM products",
      AggregationInfo(true, Top, Set(QualifiedColumnName("public.products", "price"), QualifiedColumnName("public.products", "*")), true, false)
    )
  }

  def testCountStar() = {
    assertColumnFacts(
      "SELECT COUNT(*) FROM orders",
      AggregationInfo(true, COUNT, Set.empty, true, false)
    )
  }

  // Test that statistics analysis correctly returns applied aggregation functions for simple query.
  def testStatistics() = {
    assertColumnFacts(
      "SELECT COUNT(*) as my_count, SUM(price) as my_sum, AVG(price) as my_avg FROM products",
      AggregationInfo(true, COUNT, Set(QualifiedColumnName("public.products", "*")), true, false),
      AggregationInfo(true, SUM, Set(QualifiedColumnName("public.products", "price")), true, false),
      AggregationInfo(true, AVG, Set(QualifiedColumnName("public.products", "price")), true, false)
    )
  }
}
--------------------------------------------------------------------------------
/src/main/scala/chorus/sql/ast/ASTFunctions.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2017 Uber Technologies, Inc.
*
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package chorus.sql.ast

import com.facebook.presto.sql.tree._
import chorus.exception.{TransformationException, UnsupportedConstructException}
import chorus.schema.Database
import chorus.sql.{AbstractAnalysis, QueryParser, TreeFunctions, TreePrinter}

/** Tree functions for analyses that run on the Presto AST. Can only be mixed into an AbstractAnalysis over AST
 * nodes (see the self-type), whose resultMap and currentNode are passed to the tree printer.
 */
trait ASTFunctions extends TreeFunctions[Node] {
  this: AbstractAnalysis[Node, _] =>
  override def getNodeChildren(node: Node): Iterable[Node] = ASTFunctions.getChildren(node)
  override def parseQueryToTree(query: String, database: Database): Node = QueryParser.parseToPrestoTree(query)
  override def printTree(node: Node) = TreePrinter.printTreePresto(node, resultMap, currentNode)
}

object ASTFunctions {
  /** Returns the children of the given AST node. This is used to traverse ASTs in lieu of Presto's Java visitor interface.
   *
   * Null entries are removed from the returned collection.
   *
   * @throws TransformationException       for Insert, Delete and CreateTableAsSelect nodes.
   * @throws UnsupportedConstructException for node types that have no case below.
   */
  def getChildren(node: Node): Iterable[Node] = {
    import scala.collection.JavaConverters._
    // Cases are tried top to bottom, so a subtype must stay above its supertype (e.g. SimpleGroupBy above
    // GroupingElement).
    val result = node match {
      case _: IntervalLiteral | _: Literal => Nil
      case explain: Explain => List(explain.getStatement) ++ explain.getOptions.asScala
      case exists: ExistsPredicate => List(exists.getSubquery)
      case extract: Extract => List(extract.getExpression)
      case cast: Cast => List(cast.getExpression)
      case arith: ArithmeticBinaryExpression => List(arith.getLeft, arith.getRight)
      case between: BetweenPredicate => List(between.getMin, between.getMax, between.getValue)
      case coalesce: CoalesceExpression => coalesce.getOperands.asScala
      case atZone: AtTimeZone => List(atZone.getValue, atZone.getTimeZone)
      case array: ArrayConstructor => array.getValues.asScala
      case subscript: SubscriptExpression => List(subscript.getBase, subscript.getIndex)
      case comparison: ComparisonExpression => List(comparison.getLeft, comparison.getRight)
      case _: QualifiedNameReference => Nil
      case query: Query => stripOption(query.getWith) ++ List(query.getQueryBody) ++ query.getOrderBy.asScala
      case withClause: With => withClause.getQueries.asScala
      case withQuery: WithQuery => List(withQuery.getQuery)
      case select: Select => select.getSelectItems.asScala
      case column: SingleColumn => List(column.getExpression)
      case when: WhenClause => List(when.getOperand, when.getResult)
      case in: InPredicate => List(in.getValue, in.getValueList)
      case call: FunctionCall => call.getArguments.asScala ++ stripOption(call.getWindow)
      case deref: DereferenceExpression => List(deref.getBase)
      case window: Window => window.getOrderBy.asScala ++ window.getPartitionBy.asScala ++ stripOption(window.getFrame)
      case frame: WindowFrame => List(frame.getStart) ++ stripOption(frame.getEnd)
      case bound: FrameBound => stripOption(bound.getValue)
      case simpleCase: SimpleCaseExpression => simpleCase.getWhenClauses.asScala ++ List(simpleCase.getOperand) ++ stripOption(simpleCase.getDefaultValue)
      case inList: InListExpression => inList.getValues.asScala
      case nullIf: NullIfExpression => List(nullIf.getFirst, nullIf.getSecond)
      case ifExpr: IfExpression => List(ifExpr.getCondition, ifExpr.getTrueValue) ++ stripOption(ifExpr.getFalseValue)
      case tryExpr: TryExpression => List(tryExpr.getInnerExpression)
      case unary: ArithmeticUnaryExpression => List(unary.getValue)
      case not: NotExpression => List(not.getValue)
      case searchedCase: SearchedCaseExpression => searchedCase.getWhenClauses.asScala ++ stripOption(searchedCase.getDefaultValue)
      case like: LikePredicate => List(like.getValue, like.getPattern, like.getEscape)
      case notNull: IsNotNullPredicate => List(notNull.getValue)
      case isNull: IsNullPredicate => List(isNull.getValue)
      case logical: LogicalBinaryExpression => List(logical.getRight, logical.getLeft)
      case subquery: SubqueryExpression => List(subquery.getQuery)
      case sortItem: SortItem => List(sortItem.getSortKey)
      case spec: QuerySpecification => List(spec.getSelect) ++ stripOption(spec.getFrom) ++ stripOption(spec.getWhere) ++ stripOption(spec.getGroupBy) ++ stripOption(spec.getHaving)
      case setOp: SetOperation => setOp.getRelations.asScala
      case values: Values => values.getRows.asScala
      case row: Row => row.getItems.asScala
      case _: Table => Nil
      case tableSubquery: TableSubquery => List(tableSubquery.getQuery)
      case aliased: AliasedRelation => List(aliased.getRelation)
      case sampled: SampledRelation => List(sampled.getRelation, sampled.getSamplePercentage)
      case join: Join => List(join.getLeft, join.getRight) ++ stripOption(join.getCriteria).collect{ case on: JoinOn => on.getExpression }
      case unnest: Unnest => unnest.getExpressions.asScala
      case groupBy: GroupBy => groupBy.getGroupingElements.asScala
      case simpleGroupBy: SimpleGroupBy => simpleGroupBy.getColumnExpressions.asScala
      case grouping: GroupingElement => grouping.enumerateGroupingSets.asScala.flatMap{ _.asScala }
      case statement @ (_: Insert | _: Delete | _: CreateTableAsSelect) => illegalOperation(statement)
      case _: AllColumns | _: CurrentTime => Nil
      case _ => throw new UnsupportedConstructException("getChildren on unsupported AST node type: " + node.getClass.getSimpleName)
    }
    result.filterNot{ _ == null }
  }

  /** Always throws a TransformationException naming the class of the offending node. */
  private[ast] def illegalOperation(node: Node): Nothing = throw new TransformationException("Found illegal/unsupported operation in query: " + node.getClass.toString)

  /** Converts a Java Optional into a list holding its value, or an empty list when no value is present. */
  def stripOption[T](node: java.util.Optional[T]): List[T] =
    if (!node.isPresent) Nil else List(node.get)
}
--------------------------------------------------------------------------------
/src/test/scala/com/uber/engsec/dp/rewriting/ElasticSensitivityRewriterTest.scala:
--------------------------------------------------------------------------------
/*
 * Copyright (c) 2017 Uber Technologies, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
21 | */ 22 | 23 | package chorus.rewriting 24 | 25 | import chorus.rewriting.differential_privacy.{ElasticSensitivityConfig, ElasticSensitivityRewriter} 26 | import chorus.schema.Schema 27 | import chorus.sql.QueryParser 28 | import junit.framework.TestCase 29 | 30 | class ElasticSensitivityRewriterTest extends TestCase { 31 | val database = Schema.getDatabase("test") 32 | 33 | def checkResult(query: String, epsilon: Double, delta: Double, expected: String, fillMissingBins: Boolean = false): Unit = { 34 | val root = QueryParser.parseToRelTree(query, database) 35 | val config = new ElasticSensitivityConfig(epsilon, delta, database, fillMissingBins) 36 | val result = new ElasticSensitivityRewriter(config).run(root) 37 | TestCase.assertEquals(expected.stripMargin.stripPrefix("\n").replaceAll("\r", ""), result.toSql()) 38 | } 39 | 40 | def testUnsupportedQueries() = { 41 | // This rewriter calls ElasticSensitivity analysis; see ElasticSensitivityAnalysisTest for tests of unsupported queries 42 | } 43 | 44 | def testCountQueryWithoutJoin() = { 45 | // the sensitivity of this query is 1 46 | val query = """ 47 | SELECT COUNT(*) FROM orders 48 | """ 49 | 50 | // scale of Laplace noise for epsilon 0.1 is 2*(1/0.1) = 20 51 | checkResult(query, 0.1, 1e-8, """ 52 | |SELECT COUNT(*) + 20.0 * (CASE WHEN RAND() - 0.5 < 0 THEN -1.0 ELSE 1.0 END * LN(1 - 2 * ABS(RAND() - 0.5))) 53 | |FROM public.orders""" 54 | ) 55 | 56 | // scale of Laplace noise for epsilon 1 is 2*(1/1) = 2 57 | checkResult(query, 1, 1e-8, """ 58 | |SELECT COUNT(*) + 2.0 * (CASE WHEN RAND() - 0.5 < 0 THEN -1.0 ELSE 1.0 END * LN(1 - 2 * ABS(RAND() - 0.5))) 59 | |FROM public.orders""" 60 | ) 61 | } 62 | 63 | def testCountQueryWithJoin() = { 64 | val query = """ 65 | SELECT COUNT(*) 66 | FROM orders JOIN recommendations ON orders.customer_id = recommendations.customer_id 67 | WHERE orders.product_id = 1 68 | """ 69 | 70 | checkResult(query, 0.1, 1e-8, """ 71 | |SELECT COUNT(*) + 5409.181856298167 * (CASE WHEN 
RAND() - 0.5 < 0 THEN -1.0 ELSE 1.0 END * LN(1 - 2 * ABS(RAND() - 0.5))) 72 | |FROM (SELECT customer_id, product_id 73 | |FROM public.orders) t 74 | |INNER JOIN (SELECT customer_id 75 | |FROM public.recommendations) t0 ON t.customer_id = t0.customer_id 76 | |WHERE t.product_id = 1""" 77 | ) 78 | } 79 | 80 | def testHistogramQueryWithJoin() = { 81 | val query = """ 82 | SELECT orders.product_id, COUNT(*) 83 | FROM orders JOIN recommendations ON orders.product_id = recommendations.product_id 84 | WHERE orders.product_id = 1 85 | GROUP BY 1 86 | """ 87 | 88 | 89 | checkResult(query, 0.1, 1e-8, """ 90 | |SELECT t.product_id, COUNT(*) + 80000.0 * (CASE WHEN RAND() - 0.5 < 0 THEN -1.0 ELSE 1.0 END * LN(1 - 2 * ABS(RAND() - 0.5))) 91 | |FROM (SELECT product_id 92 | |FROM public.orders) t 93 | |INNER JOIN (SELECT product_id 94 | |FROM public.recommendations) t0 ON t.product_id = t0.product_id 95 | |WHERE t.product_id = 1 96 | |GROUP BY t.product_id""" 97 | ) 98 | 99 | // Test histogram bin enumeration 100 | checkResult(query, 0.1, 1e-8, """ 101 | |WITH _orig AS ( 102 | | SELECT t.product_id, COUNT(*) _agg 103 | | FROM (SELECT product_id 104 | | FROM public.orders) t 105 | | INNER JOIN (SELECT product_id 106 | | FROM public.recommendations) t0 ON t.product_id = t0.product_id 107 | | WHERE t.product_id = 1 108 | | GROUP BY t.product_id 109 | |) 110 | |SELECT t0._domain product_id, CASE WHEN product_id IS NULL THEN 0 ELSE _agg END + 80000.0 * (CASE WHEN RAND() - 0.5 < 0 THEN -1.0 ELSE 1.0 END * LN(1 - 2 * ABS(RAND() - 0.5))) 111 | |FROM (SELECT product_id, _agg 112 | |FROM _orig) t 113 | |RIGHT JOIN (SELECT product_id _domain 114 | |FROM public.products) t0 ON product_id = t0._domain""", 115 | true 116 | ) 117 | } 118 | 119 | def testHistogramQueryWithAggAlias() = { 120 | val query = """ 121 | SELECT orders.product_id, COUNT(*) AS "mycount" 122 | FROM orders JOIN recommendations ON orders.product_id = recommendations.product_id 123 | WHERE orders.product_id = 1 124 | GROUP BY 
1 125 | """ 126 | 127 | // Test histogram bin enumeration when aggregation already has explicit alias 128 | checkResult(query, 0.1, 1e-8, """ 129 | |WITH _orig AS ( 130 | | SELECT t.product_id, COUNT(*) mycount 131 | | FROM (SELECT product_id 132 | | FROM public.orders) t 133 | | INNER JOIN (SELECT product_id 134 | | FROM public.recommendations) t0 ON t.product_id = t0.product_id 135 | | WHERE t.product_id = 1 136 | | GROUP BY t.product_id 137 | |) 138 | |SELECT t0._domain product_id, CASE WHEN product_id IS NULL THEN 0 ELSE mycount END + 80000.0 * (CASE WHEN RAND() - 0.5 < 0 THEN -1.0 ELSE 1.0 END * LN(1 - 2 * ABS(RAND() - 0.5))) mycount 139 | |FROM (SELECT product_id, mycount 140 | |FROM _orig) t 141 | |RIGHT JOIN (SELECT product_id _domain 142 | |FROM public.products) t0 ON product_id = t0._domain""", 143 | true 144 | ) 145 | } 146 | } 147 | --------------------------------------------------------------------------------