├── .github └── workflows │ ├── branch_protection.yml │ └── test.yml ├── .gitignore ├── LICENSE ├── README.md ├── build.sbt ├── documentation └── do_not_delete ├── examples ├── README.md ├── build.sbt └── src │ └── main │ └── scala │ └── Quickstart.scala ├── images ├── average_overlap.jpg ├── average_overlap_depth.jpg ├── do_not_delete └── total_uniform_file_count.png ├── notebooks └── databricks │ └── DeltaClusteringMetrics.scala ├── project └── build.properties └── src ├── main └── scala │ └── fr │ └── databeans │ └── lighthouse │ ├── fileStatsIntervalTree │ ├── Interval.scala │ ├── IntervalBoundary.scala │ ├── IntervalTree.scala │ └── Node.scala │ └── metrics │ ├── ClusteringMetrics.scala │ ├── Distribution.scala │ └── delta │ ├── DeltaClusteringMetrics.scala │ └── DeltaClusteringMetricsBase.scala └── test └── scala ├── fr └── databeans │ └── lighthouse │ ├── fileStatsIntervalTree │ ├── IntervalSpec.scala │ ├── IntervalTreeSpec.scala │ └── NodeSpec.scala │ └── metrics │ ├── ClusteringMetricsSpec.scala │ └── delta │ └── DeltaClusteringMetricsSpec.scala └── org └── apache └── spark └── sql └── delta └── test └── DeltaExtendedSparkSession.scala /.github/workflows/branch_protection.yml: -------------------------------------------------------------------------------- 1 | # Will be deleted after account upgrade to team ro Enterprise account. 2 | name: Branch Protection 3 | 4 | on: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | branch-protection: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Enable branch protection 14 | uses: peter-evans/branch-protection-action@v2 15 | with: 16 | token: ${{ secrets.GITHUB_TOKEN }} 17 | branch: main 18 | enforce_admins: false 19 | required_pull_request_reviews: true 20 | required_status_checks: [Lighthouse Tests] 21 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Lighthouse Tests 2 | 3 | on: 4 | [push, pull_request] 5 | 6 | 7 | jobs: 8 | build: 9 | 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | 14 | - uses: actions/checkout@v3 15 | - name: Set up JDK 1.8 16 | uses: actions/setup-java@v3 17 | with: 18 | java-version: '8' 19 | distribution: 'zulu' 20 | cache: 'sbt' 21 | 22 | - name: Run tests 23 | run: sbt test -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | .cache/ 4 | tmp/ 5 | .idea/ 6 | .DS_Store 7 | .bsp/ 8 | spark-warehouse/ 9 | project/target/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Databeans 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Lighthouse
2 | 
3 | ## OVERVIEW
4 | 
5 | Lighthouse is a library developed by DataBeans to optimize Lakehouse performance and reduce its total cost of ownership. It is designed to monitor the health of Lakehouse tables from a data layout perspective and provide valuable insights about how well data is clustered. This information helps users identify when data maintenance operations (vacuum, compaction, clustering …) should be performed, leading to **improvements in query performance** and **reductions in storage costs**.
6 | 
7 | The Lighthouse library can assist in addressing the following questions:
8 | * How well is my data clustered?
9 | * Does my data layout favor skipping based on statistics?
10 | * Is it advisable to Z-order before running a query on a certain column?
11 | * Is my data suffering from the small files problem?
12 | * How frequently should I re-cluster my data to maintain its optimal clustering state?
13 | 
14 | ## BUILDING
15 | 
16 | Lighthouse is compiled using SBT.
17 | 
18 | To compile, run
19 | ```
20 | sbt compile
21 | ```
22 | 
23 | To generate artifacts, run
24 | ```
25 | sbt package
26 | ```
27 | 
28 | To execute tests, run
29 | ```
30 | sbt test
31 | ```
32 | 
33 | ## SETUP INSTRUCTIONS
34 | 
35 | ### Prerequisites
36 | - Apache Spark 3.3.2
37 | - Delta 2.3.0
38 | 
39 | ### Using Spark Shell
40 | 1. Open the terminal and run the following command:
41 | ```
42 | spark-shell \
43 | --packages io.delta:delta-core_2.12:2.3.0,io.github.Databeans:lighthouse_2.12:0.1.0 \
44 | --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" \
45 | --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog"
46 | ```
47 | 
48 | 2. Import the DeltaClusteringMetrics class:
49 | ```
50 | import fr.databeans.lighthouse.metrics.delta.DeltaClusteringMetrics
51 | ```
52 | 
53 | 3. Compute clustering metrics for a given column of the Delta table:
54 | ```
55 | val clusteringMetrics = DeltaClusteringMetrics.forPath("path/to/your/deltaTable", spark).computeForColumn("col_name")
56 | ```
57 | 
58 | 4. Display the computed clustering metrics using the show() method:
59 | ```
60 | clusteringMetrics.show()
61 | ```
62 | 
63 | ### Using spark-submit
64 | Submit the application to a Spark cluster:
65 | ```
66 | spark-submit \
67 | --class com.example.MyApp \
68 | --master <master-url> \
69 | --packages io.delta:delta-core_2.12:2.3.0,io.github.Databeans:lighthouse_2.12:0.1.0 \
70 | <path-to-your-jar>
71 | ```
72 | This command specifies the following options:
73 | - --class: Name of the main class of your application.
74 | - --master: URL of the Spark cluster to use.
75 | - --packages: Maven coordinates of the Delta Lake and Lighthouse libraries to use.
76 | - `<path-to-your-jar>`: Path to your application's JAR file.
77 | 
78 | Example:
79 | ```
80 | spark-submit \
81 | --class Quickstart \
82 | --master local[*] \
83 | --packages io.delta:delta-core_2.12:2.3.0,io.github.Databeans:lighthouse_2.12:0.1.0 \
84 | target/scala-2.12/clustering-metrics-example_2.12-0.1.jar
85 | ```
86 | ### Using Databricks
87 | 1. Install the Lighthouse Maven library on your cluster:
88 | 
89 | Go to `compute` > `cluster` > `Libraries` > `Install New` > set `Source` = **Maven** and `coordinates` = **io.github.Databeans:lighthouse_2.12:0.1.0**
90 | 
91 | (Or add the lighthouse_2.12-0.1.0.jar to your cluster)
92 | 
93 | 2. Download this [notebook](https://github.com/Databeans/lighthouse/blob/main/notebooks/databricks/DeltaClusteringMetrics.scala) and import it to your workspace.
94 | 3. Create a new cell in your notebook and insert ```%run <path-to-notebook>```.
95 | 
96 | **PS:** Replace `<path-to-notebook>` with the actual path to the DeltaClusteringMetrics notebook.
97 | 4. Run the cell.
98 | 
99 | With these steps completed, you'll be able to use the DeltaClusteringMetrics library.
100 | 
101 | ## CLUSTERING METRICS
102 | 
103 | ### Syntax
104 | 
105 | - forName(deltaTable: String, spark: SparkSession): DeltaClusteringMetrics
106 |   * deltaTable: Name of the Delta table
107 |   * spark: SparkSession instance
108 | 
109 | 
110 | - forPath(deltaPath: String, spark: SparkSession): DeltaClusteringMetrics
111 |   * deltaPath: Path of the Delta table
112 |   * spark: SparkSession instance
113 | 
114 | 
115 | - computeForColumn(column: String): DataFrame
116 |   * column: name of the column to compute metrics for
117 | 
118 | 
119 | - computeForColumns(columns: String*): DataFrame
120 |   * columns: list of columns to compute metrics for
121 | 
122 | 
123 | - computeForAllColumns(): DataFrame
124 | 
125 | 
126 | ### Usage:
127 | Assuming that you have a Delta table:
128 | 
129 | Import DeltaClusteringMetrics:
130 | ```
131 | import fr.databeans.lighthouse.metrics.delta.DeltaClusteringMetrics
132 | ```
133 | 
134 | Compute clustering information for a given column:
135 | 
136 | ```
137 | val clusteringMetric = DeltaClusteringMetrics
138 |   .forPath("path/to/deltaTable", spark)
139 |   .computeForColumn("id")
140 | ```
141 | 
142 | Compute clustering information for multiple columns:
143 | 
144 | ```
145 | val clusteringMetrics = DeltaClusteringMetrics
146 |   .forName("DeltaTable", spark)
147 |   .computeForColumns("id", "value")
148 | ```
149 | 
150 | Compute clustering information for all columns of the table:
151 | 
152 | ```
153 | val clusteringMetrics = DeltaClusteringMetrics
154 |   .forName("DeltaTable", spark)
155 |   .computeForAllColumns()
156 | ```
157 | 
158 | ### Output:
159 | The library computes the clustering metrics and returns a DataFrame containing the following columns:
160 | 
161 | | column   | total_file_count | total_uniform_file_count | average_overlap | average_overlap_depth | file_depth_histogram |
162 | |----------|------------------|--------------------------|-----------------|-----------------------|----------------------|
163 | | col_name | 5                | 5                        | 3.0             | 4.0                   | {5.0 -> 0, 10.0 -... |
164 | 
165 | 
166 | ```total_file_count```
167 | Total number of files composing the Delta table.
168 | 
169 | ```total_uniform_file_count```
170 | Number of files in which the min and max values of the given ordering column are equal.
171 | 
172 | ```average_overlap```
173 | Average number of overlapping files for each file in the Delta table.
174 | The higher the average_overlap, the worse the clustering.
175 | 
176 | ```average_overlap_depth```
177 | The average number of files that will be read when an overlap occurs.
178 | The higher the average_overlap_depth, the worse the clustering.
179 | 
180 | ```file_depth_histogram```
181 | A histogram detailing the distribution of the overlap depth across the table, obtained by grouping the table's files by their overlap depth into the following buckets:
182 | * 0 to 16, with increments of 1.
183 | * For buckets larger than 16, increments of twice the width of the previous bucket (e.g. 32, 64, 128, …).
184 | 
185 | ### Use-case:
186 | 
187 | To gain a comprehensive understanding of the library in action, including:
188 | * how to use the lighthouse library to extract metrics
189 | * how to interpret the extracted metrics when planning maintenance operations on your data layout
190 | 
191 | we recommend reading the following blog post:
192 | - [Z-ordering: take the Guesswork out (part2)](https://databeans-blogs.medium.com/delta-z-ordering-take-the-guesswork-out-part2-1bdd03121aec)
193 | 
194 | ## NOTES
195 | 
196 | - Lighthouse cannot compute metrics for a column without statistics: column statistics must be available before clustering metrics can be computed, so if statistics are missing for a column, Lighthouse will not be able to compute metrics for it.
197 | - Clustering metrics cannot be computed for partitioning columns.
198 | - For a column containing only null values, the ```average_overlap``` and ```average_overlap_depth``` metrics are assigned a value of -1, while the ```file_depth_histogram``` metric is assigned a null value.
199 | 
200 | ## LIMITATIONS
201 | 
202 | - Lighthouse currently supports the following data types: Int, Long, Decimal, and String.
203 | - Lighthouse supports only Delta tables and may not work with other table formats.
204 | 
205 | ## TECHNOLOGIES
206 | 
207 | Lighthouse supports:
208 | - Scala 2.12.13
209 | - Spark 3.3.2
210 | - Delta 2.3.0
211 | 
212 | ## CONTRIBUTING
213 | 
214 | Lighthouse is an open-source project, and we welcome contributions from the community. If you have a new feature or improvement in mind, feel free to submit a pull request.
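If you'd like to validate your changes locally before opening a pull request, run the test suite (the same command used by the CI workflow):
```
sbt test
```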
215 | 216 | ## BLOGS 217 | 218 | - [Z-ordering: take the Guesswork out (part1)](https://databeans-blogs.medium.com/z-ordre-take-the-guesswork-out-bad0133d7895) 219 | - [Z-ordering: take the Guesswork out (part2)](https://databeans-blogs.medium.com/delta-z-ordering-take-the-guesswork-out-part2-1bdd03121aec) 220 | 221 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | 2 | name := "lighthouse" 3 | 4 | version := "0.1.0" 5 | 6 | scalaVersion := "2.12.13" 7 | 8 | val scalaTestVersion = "3.1.1" 9 | val sparkVersion = "3.3.2" 10 | val deltaVersion = "2.3.0" 11 | 12 | libraryDependencies += "org.scalactic" %% "scalactic" % scalaTestVersion 13 | libraryDependencies += "org.scalatest" %% "scalatest" % scalaTestVersion % "test" 14 | 15 | libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion % "provided" 16 | libraryDependencies += "io.delta" %% "delta-core" % deltaVersion % "provided" 17 | 18 | libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion % Test classifier "tests" 19 | libraryDependencies += "org.apache.spark" %% "spark-catalyst" % sparkVersion % Test classifier "tests" 20 | libraryDependencies += "org.apache.spark" %% "spark-core" % sparkVersion % Test classifier "tests" 21 | libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion % Test classifier "tests" 22 | libraryDependencies += "org.apache.spark" %% "spark-hive" % sparkVersion % Test classifier "tests" -------------------------------------------------------------------------------- /documentation/do_not_delete: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Databeans/lighthouse/35217ff9e7c1a956ce65793c1f569a15990fe89a/documentation/do_not_delete -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # lighthouse Example 2 | This example demonstrates how to use lighthouse to analyze the data layout of a Delta table. 3 | lighthouse is a library designed to monitor the health of the Lakehouse from a data layout perspective, and provide valuable insights about how well data is clustered. 4 | This example calculates the clustering metrics of a delta table, and prints the results to the console. It can be run if the prerequisites are satisfied. 5 | 6 | ## Prerequisites 7 | - Scala 2.12.13 8 | - Spark 3.3.2 9 | - Delta 2.3.0 10 | - lighthouse_2.12-0.1.0.jar 11 | 12 | ## Instructions 13 | To run the example: 14 | 1. Download or clone the lighthouse project. 15 | 2. run ```sbt compile``` to compile. 16 | 3. run ```sbt package``` to generate the jar file. 17 | 4. run ```mkdir examples/lib/ ``` to create the lib directory. 18 | 5. run ```cp target/scala-2.12/lighthouse_2.12-0.1.0.jar examples/lib/``` to copy the jar in the lib folder. 19 | 6. Navigate to the examples directory: ```cd examples```. 20 | 7. Run ```sbt compile``` to compile the example. 21 | 8. Run ```sbt "runMain Quickstart --master local[*]"``` to execute the example. 22 | 9. The clustering metrics for the specified Delta table will be printed to the console. 23 | 24 | By running this example, you can learn how to use lighthouse to calculate the clustering metrics for a Delta table and interpret the results. 25 | You can also use this example as a starting point for your own projects. 
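For reference, the example boils down to the following pattern (a condensed sketch of `Quickstart.scala`; the `QuickstartSketch` object name is illustrative and the extra example columns are omitted for brevity):

```
import fr.databeans.lighthouse.metrics.delta.DeltaClusteringMetrics
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.IntegerType

object QuickstartSketch {
  def main(args: Array[String]): Unit = {

    // Spark session with the Delta Lake extension and catalog enabled
    val spark: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName("QuickstartSketch")
      .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
      .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
      .getOrCreate()

    // Write a small Delta table to analyze
    spark.range(1, 5).toDF()
      .withColumn("id", col("id").cast(IntegerType))
      .write.mode("overwrite").format("delta").save("deltaTable")

    // Compute and display clustering metrics for the "id" column
    DeltaClusteringMetrics
      .forPath("deltaTable", spark)
      .computeForColumn("id")
      .show()
  }
}
```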
26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /examples/build.sbt: -------------------------------------------------------------------------------- 1 | name := "clustering-metrics-example" 2 | 3 | version := "0.1" 4 | scalaVersion := "2.12.13" 5 | 6 | val sparkVersion = "3.3.2" 7 | val deltaVersion = "2.3.0" 8 | 9 | lazy val root = (project in file(".")) 10 | .settings( 11 | Compile / unmanagedJars += file("lib/lighthouse_2.12-0.1.0.jar") 12 | ) 13 | 14 | libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion 15 | libraryDependencies += "io.delta" %% "delta-core" % deltaVersion -------------------------------------------------------------------------------- /examples/src/main/scala/Quickstart.scala: -------------------------------------------------------------------------------- 1 | import fr.databeans.lighthouse.metrics.delta.DeltaClusteringMetrics 2 | import org.apache.spark.sql.SparkSession 3 | import org.apache.spark.sql.functions.{col, lit} 4 | import org.apache.spark.sql.types.IntegerType 5 | 6 | object Quickstart { 7 | def main(args: Array[String]): Unit = { 8 | 9 | implicit val spark: SparkSession = SparkSession 10 | .builder() 11 | .master("local[*]") 12 | .appName("Quickstart") 13 | .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") 14 | .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") 15 | .getOrCreate() 16 | import spark.implicits._ 17 | 18 | spark.range(1, 5, 1).toDF() 19 | .withColumn("id", col("id").cast(IntegerType)) 20 | .withColumn("keys", lit(1)) 21 | .withColumn("values", col("id") * 3) 22 | .write.mode("overwrite") 23 | .format("delta") 24 | .save("deltaTable") 25 | 26 | val clusteringMetric = DeltaClusteringMetrics 27 | .forPath("deltaTable", spark) 28 | .computeForColumn("id") 29 | clusteringMetric.show() 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /images/average_overlap.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Databeans/lighthouse/35217ff9e7c1a956ce65793c1f569a15990fe89a/images/average_overlap.jpg -------------------------------------------------------------------------------- /images/average_overlap_depth.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Databeans/lighthouse/35217ff9e7c1a956ce65793c1f569a15990fe89a/images/average_overlap_depth.jpg -------------------------------------------------------------------------------- /images/do_not_delete: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Databeans/lighthouse/35217ff9e7c1a956ce65793c1f569a15990fe89a/images/do_not_delete -------------------------------------------------------------------------------- /images/total_uniform_file_count.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Databeans/lighthouse/35217ff9e7c1a956ce65793c1f569a15990fe89a/images/total_uniform_file_count.png -------------------------------------------------------------------------------- /notebooks/databricks/DeltaClusteringMetrics.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | import com.databricks.sql.transaction.tahoe.DeltaLog 3 | import 
fr.databeans.lighthouse.metrics.delta.DeltaClusteringMetricsBase 4 | import org.apache.spark.sql.{DataFrame, SparkSession} 5 | import org.apache.spark.sql.types.StructType 6 | 7 | case class DeltaClusteringMetrics(deltaLog: DeltaLog, spark: SparkSession) extends DeltaClusteringMetricsBase(spark) { 8 | 9 | override def schema: StructType = deltaLog.unsafeVolatileSnapshot.schema 10 | 11 | override def statsSchema: StructType = deltaLog.unsafeVolatileSnapshot.statsSchema 12 | 13 | override def stateWithStats: DataFrame = deltaLog.unsafeVolatileSnapshot.stateDF 14 | 15 | override def allColumns: Seq[String] = deltaLog.unsafeVolatileSnapshot.schema.map(_.name) 16 | 17 | override def partitionColumns: Seq[String] = deltaLog.unsafeVolatileSnapshot.metadata.partitionColumns 18 | } 19 | 20 | object DeltaClusteringMetrics { 21 | 22 | def forName(deltaTable: String, spark: SparkSession): DeltaClusteringMetrics = { 23 | val location = spark.sql(s"describe detail $deltaTable").select("location").collect()(0)(0).toString 24 | val deltaLog = DeltaLog.forTable(spark, location) 25 | DeltaClusteringMetrics(deltaLog, spark) 26 | } 27 | 28 | def forPath(deltaPath: String, spark: SparkSession): DeltaClusteringMetrics = { 29 | val deltaLog = DeltaLog.forTable(spark, deltaPath) 30 | DeltaClusteringMetrics(deltaLog, spark) 31 | } 32 | } 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.4.3 -------------------------------------------------------------------------------- /src/main/scala/fr/databeans/lighthouse/fileStatsIntervalTree/Interval.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.fileStatsIntervalTree 2 | 3 | import org.apache.spark.sql.types.{DataType, DecimalType, IntegerType, LongType} 4 | 5 | case class Interval(start: String, end: String, fileName: String, statsType: DataType) 6 | extends Comparable[Interval] { 7 | 8 | def intersects(min: String, max: String): Boolean = { 9 | greaterThenOrEqual(max, start) && greaterThenOrEqual(end, min) 10 | } 11 | 12 | def exclusiveIntersects(min: String, max: String): Boolean = { 13 | greaterThen(max, start) && greaterThen(end, min) 14 | } 15 | 16 | def greaterThenOrEqual(a: String, b: String): Boolean = { 17 | if (compare(a, b) == -1) false else true 18 | } 19 | 20 | def greaterThen(a: String, b: String): Boolean = { 21 | if (compare(a, b) == 1) true else false 22 | } 23 | 24 | override def compareTo(o: Interval): Int = { 25 | val compareStarts = compare(start, o.start) 26 | if (compareStarts != 0) { 27 | compareStarts 28 | } 29 | else compare(end, o.end) 30 | } 31 | 32 | def compare(a: String, b: String): Int = { 33 | statsType match { 34 | case IntegerType => compare[Int](a.toInt, b.toInt) 35 | case LongType => compare[Long](a.toLong, b.toLong) 36 | case DecimalType() => { 37 | compare[BigDecimal]( 38 | new BigDecimal(new java.math.BigDecimal(a)), 39 | new BigDecimal(new java.math.BigDecimal(b))) 40 | } 41 | case _ => compare[String](a, b) 42 | } 43 | } 44 | 45 | def compare[T: Ordering](a: T, b: T): Int = { 46 | val ord = implicitly[Ordering[T]] 47 | import ord.mkOrderingOps 48 | if (a < b) -1 49 | else if (a > b) 1 50 | else 0 51 | } 52 | 53 | def lowerThenPoint(median: String): Boolean = { 54 | val comp = compare(end, median) 55 | if (comp == -1) true else false 56 | } 57 | 
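  // Holds when the whole interval lies strictly above the given point, i.e. start > median.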
58 | def greaterThenPoint(median: String): Boolean = { 59 | val comp = compare(median, start) 60 | if (comp == -1) true else false 61 | } 62 | 63 | def greaterThenOrEqualPoint(median: String): Boolean = { 64 | val comp = compare(median, start) 65 | if (comp == 1) false else true 66 | } 67 | 68 | def lowerThenPointOrEqual(median: String): Boolean = { 69 | val comp = compare(end, median) 70 | if (comp == 1) false else true 71 | } 72 | 73 | def startsBefore(point: String): Boolean = { 74 | val comp = compare(start, point) 75 | if (comp == 1) false else true 76 | } 77 | 78 | def endsAfter(point: String): Boolean = { 79 | val comp = compare(end, point) 80 | if (comp == 1) true else false 81 | } 82 | 83 | def min(a: String, b: String): String = { 84 | val comp = compare(a, b) 85 | if (comp == 1) b else a 86 | } 87 | 88 | def max(a: String, b: String): String = { 89 | val comp = compare(a, b) 90 | if (comp == 1) a else b 91 | } 92 | } -------------------------------------------------------------------------------- /src/main/scala/fr/databeans/lighthouse/fileStatsIntervalTree/IntervalBoundary.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.fileStatsIntervalTree 2 | 3 | import org.apache.spark.sql.types.{DataType, DecimalType, IntegerType, LongType} 4 | 5 | case class IntervalBoundary(value: String, statsType: DataType) extends Comparable[IntervalBoundary] { 6 | 7 | def greaterThenOrEqual(a: String, b: String): Boolean = { 8 | if (compare(a, b) == -1) false else true 9 | } 10 | 11 | def greaterThenOrEqual(b: String): Boolean = { 12 | if (compare(value, b) == -1) false else true 13 | } 14 | 15 | def greaterThen(b: String): Boolean = { 16 | if (compare(value, b) == 1) true else false 17 | } 18 | 19 | override def compareTo(o: IntervalBoundary): Int = { 20 | compare(value, o.value) 21 | } 22 | 23 | def compare(a: String, b: String): Int = { 24 | statsType match { 25 | case IntegerType => compare[Int](a.toInt, b.toInt) 26 | case LongType => compare[Long](a.toLong, b.toLong) 27 | case DecimalType() => 28 | compare[BigDecimal]( 29 | new BigDecimal(new java.math.BigDecimal(a)), 30 | new BigDecimal(new java.math.BigDecimal(b))) 31 | case _ => compare[String](a, b) 32 | } 33 | } 34 | 35 | def compare[T: Ordering](a: T, b: T): Int = { 36 | val ord = implicitly[Ordering[T]] 37 | import ord.mkOrderingOps 38 | if (a < b) -1 39 | else if (a > b) 1 40 | else 0 41 | } 42 | 43 | def min(a: String, b: String): String = { 44 | val comp = compare(a, b) 45 | if (comp == 1) b else a 46 | } 47 | } -------------------------------------------------------------------------------- /src/main/scala/fr/databeans/lighthouse/fileStatsIntervalTree/IntervalTree.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.fileStatsIntervalTree 2 | 3 | case class IntervalTree(head: Node, intervals: Seq[Interval]) { 4 | 5 | def size: Int = intervals.size 6 | 7 | def isEmpty: Boolean = intervals.isEmpty 8 | 9 | def nonEmpty: Boolean = intervals.nonEmpty 10 | 11 | def getIntervals(i: Interval, inclusive: Boolean = true): List[Interval] = 12 | head.query(i, inclusive) 13 | } 14 | 15 | object IntervalTree { 16 | def apply(intervals: Seq[Interval]): IntervalTree = 17 | IntervalTree(Node(intervals), intervals) 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/fr/databeans/lighthouse/fileStatsIntervalTree/Node.scala: 
-------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.fileStatsIntervalTree 2 | 3 | import org.apache.spark.sql.types.{DecimalType, IntegerType, LongType} 4 | 5 | import scala.collection.SortedMap 6 | import scala.collection.immutable.TreeSet 7 | import scala.collection.mutable.ListBuffer 8 | 9 | case class Node( 10 | center: String, 11 | left: Option[Node], 12 | right: Option[Node], 13 | intervals: SortedMap[Interval, List[Interval]]) { 14 | 15 | 16 | def size: Int = intervals.size 17 | 18 | def isEmpty: Boolean = intervals.isEmpty 19 | 20 | def query(i: Interval, inclusive: Boolean = true): List[Interval] = { 21 | 22 | val result = ListBuffer.empty[Interval] 23 | 24 | intervals.takeWhile { 25 | case (key, list) => 26 | val overlap = if (inclusive) { 27 | key.intersects(i.start, i.end) 28 | } 29 | else { 30 | key.exclusiveIntersects(i.start, i.end) 31 | } 32 | if (overlap) list.foreach(result += _) 33 | if (i.compare(key.start, i.end) == 1) false else true 34 | } 35 | 36 | if (!i.greaterThenPoint(center) && left.isDefined) 37 | result ++= left.get.query(i, inclusive) 38 | 39 | if (!i.lowerThenPoint(center) && right.isDefined) 40 | result ++= right.get.query(i, inclusive) 41 | 42 | result.toList 43 | } 44 | } 45 | 46 | object Node { 47 | 48 | def medianOf(set: TreeSet[_ >: Int with Long with BigDecimal with String]): Option[String] = { 49 | val mid = set.size / 2 50 | 51 | set.zipWithIndex.find(_._2 == mid) match { 52 | case None => None 53 | case Some((point, _)) => Some(point.toString) 54 | } 55 | } 56 | 57 | def getMedian(intervals: Seq[Interval]): Option[String] = { 58 | val statsType = intervals.map(_.statsType).head 59 | statsType match { 60 | case IntegerType => { 61 | var endpoints = TreeSet.empty[Int] 62 | intervals.foreach { interval => 63 | endpoints += interval.start.toInt 64 | endpoints += interval.end.toInt 65 | } 66 | medianOf(endpoints) 67 | } 68 | case LongType => { 69 | var endpoints = TreeSet.empty[Long] 70 | intervals.foreach { interval => 71 | endpoints += interval.start.toLong 72 | endpoints += interval.end.toLong 73 | } 74 | medianOf(endpoints) 75 | } 76 | case DecimalType() => { 77 | var endpoints = TreeSet.empty[BigDecimal] 78 | intervals.foreach { interval => 79 | endpoints += new BigDecimal(new java.math.BigDecimal(interval.start)) 80 | endpoints += new BigDecimal(new java.math.BigDecimal(interval.end)) 81 | } 82 | medianOf(endpoints) 83 | } 84 | 85 | case _ => { 86 | var endpoints = TreeSet.empty[String] 87 | intervals.foreach { interval => 88 | endpoints += interval.start 89 | endpoints += interval.end 90 | } 91 | medianOf(endpoints) 92 | } 93 | } 94 | } 95 | 96 | def apply(intervals: Seq[Interval]): Node = { 97 | 98 | var intervalsMap = SortedMap.empty[Interval, List[Interval]] 99 | val median = getMedian(intervals).get 100 | 101 | var leftNodes = List.empty[Interval] 102 | var rightNodes = List.empty[Interval] 103 | 104 | intervals.foreach { interval => 105 | if (interval.lowerThenPoint(median)) leftNodes ::= interval 106 | else if (interval.greaterThenPoint(median)) rightNodes ::= interval 107 | else intervalsMap ++= Seq(interval -> (interval :: intervalsMap.getOrElse(interval, List.empty))) 108 | } 109 | 110 | if (leftNodes.nonEmpty && rightNodes.nonEmpty) { 111 | Node(median, Some(Node(leftNodes)), Some(Node(rightNodes)), intervalsMap) 112 | } else if (leftNodes.nonEmpty) 113 | Node(median, Some(Node(leftNodes)), None, intervalsMap) 114 | else if (rightNodes.nonEmpty) 115 | Node(median, None, 
Some(Node(rightNodes)), intervalsMap) 116 | else 117 | Node(median, None, None, intervalsMap) 118 | } 119 | } -------------------------------------------------------------------------------- /src/main/scala/fr/databeans/lighthouse/metrics/ClusteringMetrics.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.metrics 2 | 3 | import fr.databeans.lighthouse.fileStatsIntervalTree.{Interval, IntervalBoundary, IntervalTree} 4 | 5 | case class ClusteringMetrics( 6 | column: String, 7 | total_file_count: Long, 8 | total_uniform_file_count: Long, 9 | averageOverlapDepth: Double, 10 | fileDepthHistogram: Map[Double, Int], 11 | averageOverlaps: Double 12 | ) 13 | 14 | 15 | class ClusteringMetricsBuilder { 16 | 17 | def computeMetrics(column: String, intervals: Seq[Interval]): ClusteringMetrics = { 18 | 19 | val uniformFilesCount = countUniformFiles(intervals) 20 | 21 | val filteredIntervals = intervals.filter(i => i.start != null & i.end != null) 22 | 23 | if (filteredIntervals.nonEmpty) { 24 | val representativePoints = filteredIntervals 25 | .flatMap(i => Seq(IntervalBoundary(i.start, i.statsType), IntervalBoundary(i.end, i.statsType))) 26 | .distinct 27 | .sorted 28 | .map(p => Interval(p.value, p.value, p.value, p.statsType)) 29 | 30 | val tree = IntervalTree(filteredIntervals) 31 | var depthPerSubInterval: Seq[(Interval, Int)] = Seq() 32 | var histogramInput: Seq[(Interval, Int)] = Seq() 33 | var i = 0 34 | while (i < representativePoints.length) { 35 | val upperBoundOverlappingIntervals = tree.getIntervals(representativePoints(i)) 36 | val upperBoundDepth = upperBoundOverlappingIntervals.size 37 | if (i > 0) { 38 | val interval = Interval( 39 | representativePoints(i - 1).start, 40 | representativePoints(i).end, 41 | s"]${representativePoints(i - 1).start},${representativePoints(i).end}[", 42 | representativePoints(i - 1).statsType 43 | ) 44 | val overlappingIntervals = tree.getIntervals(interval, false) 45 | val openIntervalDepth = overlappingIntervals.size 46 | 47 | if (openIntervalDepth != depthPerSubInterval.last._2) { 48 | depthPerSubInterval = depthPerSubInterval ++ Seq((interval, openIntervalDepth)) 49 | } 50 | 51 | if (upperBoundDepth != depthPerSubInterval.last._2) { 52 | depthPerSubInterval = depthPerSubInterval ++ Seq((representativePoints(i), upperBoundDepth)) 53 | } 54 | histogramInput = histogramInput ++ 55 | (upperBoundOverlappingIntervals ++ overlappingIntervals) 56 | .distinct 57 | .map(i => (i, Seq(depthPerSubInterval.last._2, openIntervalDepth, upperBoundDepth).max)) 58 | 59 | i = i + 1 60 | } 61 | else { 62 | depthPerSubInterval = depthPerSubInterval ++ Seq((representativePoints(i), upperBoundDepth)) 63 | histogramInput = histogramInput ++ 64 | upperBoundOverlappingIntervals.map(i => (i, upperBoundDepth)) 65 | i = i + 1 66 | } 67 | } 68 | 69 | val fileDepthHistogram = computeDepthHistogram(histogramInput) 70 | val averageOverlapDepth = computeAverageOverlapDepth(depthPerSubInterval) 71 | val averageOverlaps = computeAverageOverlaps(filteredIntervals) 72 | 73 | ClusteringMetrics(column, intervals.size.toLong, uniformFilesCount, averageOverlapDepth, fileDepthHistogram, averageOverlaps) 74 | } 75 | else { 76 | ClusteringMetrics(column, intervals.size.toLong, uniformFilesCount, -1.toDouble, null.asInstanceOf[Map[Double, Int]], -1.toDouble) 77 | } 78 | } 79 | 80 | private def computeAverageOverlapDepth(depthPerSubInterval: Seq[(Interval, Int)]): Double = { 81 | val depths = depthPerSubInterval.filter(_._2 > 1) 
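    // Only sub-intervals covered by more than one file count towards the average; with no overlaps the depth defaults to 1.0.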
82 | if (depths.nonEmpty) { 83 | "%.4f".format(depths.map(_._2).sum.toFloat / depthPerSubInterval.count(_._2 > 1)).toDouble 84 | } 85 | else { 86 | 1.0 87 | } 88 | } 89 | 90 | private def computeDepthHistogram(histogramInput: Seq[(Interval, Int)]): Map[Double, Int] = { 91 | val data = histogramInput.groupBy(_._1).values.map(_.maxBy(_._2)).map(_._2.toDouble).toList 92 | Distribution.histogram(data) 93 | } 94 | 95 | private def computeAverageOverlaps(intervals: Seq[Interval]): Double = { 96 | val tree = IntervalTree(intervals) 97 | val intervalsOverlaps = intervals 98 | .map(i => tree.getIntervals(i).size - 1) 99 | 100 | "%.4f".format(intervalsOverlaps.sum.toFloat / intervalsOverlaps.size).toDouble 101 | } 102 | 103 | private def countUniformFiles(intervals: Seq[Interval]): Int = { 104 | intervals.count(i => i.start == i.end) 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/main/scala/fr/databeans/lighthouse/metrics/Distribution.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.metrics 2 | 3 | object Distribution { 4 | 5 | def roundToPowerOfTwo(element: Int): Double = { 6 | val log = Math.log(element) / Math.log(2); 7 | val roundLog = Math.round(log); 8 | val powerOfTwo = Math.pow(2, roundLog); 9 | if (powerOfTwo > element) { 10 | Math.pow(2, roundLog - 1) 11 | } 12 | else { 13 | Math.pow(2, roundLog) 14 | } 15 | } 16 | 17 | def getBounds(maxBin: Double): List[Double] = { 18 | var powerOfTwo = 32 19 | var bins: List[Int] = List.range(1, 17) 20 | while (powerOfTwo < maxBin) { 21 | bins = bins ++ Seq(powerOfTwo) 22 | powerOfTwo = powerOfTwo * 2 23 | } 24 | bins.map(_.toDouble) 25 | } 26 | 27 | def computePopulatedBuckets(data: List[Double]): Map[Double, Int] = { 28 | data.map(_.floor.toInt).map { e => 29 | if (e > 16) 30 | roundToPowerOfTwo(e) 31 | else e 32 | }.groupBy(identity).mapValues(_.size) 33 | } 34 | 35 | def computeUnPopulatedBuckets(maxBin: Double, populatedBuckets: Map[Double, Int]): Map[Double, Int] = { 36 | getBounds(maxBin).map(e => (e, 0)).toMap.filter(x => !populatedBuckets.keys.toList.contains(x._1)) 37 | } 38 | 39 | def histogram(data: List[Double]): Map[Double, Int] = { 40 | val maxBin = data.max 41 | val populatedBuckets = computePopulatedBuckets(data) 42 | val unPopulatedBuckets = computeUnPopulatedBuckets(maxBin, populatedBuckets) 43 | Map((populatedBuckets ++ unPopulatedBuckets).toSeq.sortBy(_._1): _*) 44 | } 45 | } -------------------------------------------------------------------------------- /src/main/scala/fr/databeans/lighthouse/metrics/delta/DeltaClusteringMetrics.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.metrics.delta 2 | 3 | import org.apache.spark.sql.delta.DeltaLog 4 | import org.apache.spark.sql.types.StructType 5 | import org.apache.spark.sql.{DataFrame, SparkSession} 6 | 7 | 8 | case class DeltaClusteringMetrics(deltaLog: DeltaLog, spark: SparkSession) extends DeltaClusteringMetricsBase(spark) { 9 | 10 | override def schema: StructType = deltaLog.unsafeVolatileSnapshot.schema 11 | 12 | override def statsSchema: StructType = deltaLog.unsafeVolatileSnapshot.statsSchema 13 | 14 | override def stateWithStats: DataFrame = deltaLog.unsafeVolatileSnapshot.stateDF 15 | 16 | override def allColumns: Seq[String] = deltaLog.unsafeVolatileSnapshot.schema.map(_.name) 17 | 18 | override def partitionColumns: Seq[String] = 
deltaLog.unsafeVolatileSnapshot.metadata.partitionColumns 19 | } 20 | 21 | object DeltaClusteringMetrics { 22 | 23 | def forName(deltaTable: String, spark: SparkSession): DeltaClusteringMetrics = { 24 | val location = spark.sql(s"describe detail $deltaTable").select("location").collect()(0)(0).toString 25 | val deltaLog = DeltaLog.forTable(spark, location) 26 | DeltaClusteringMetrics(deltaLog, spark) 27 | } 28 | 29 | def forPath(deltaPath: String, spark: SparkSession): DeltaClusteringMetrics = { 30 | val deltaLog = DeltaLog.forTable(spark, deltaPath) 31 | DeltaClusteringMetrics(deltaLog, spark) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/fr/databeans/lighthouse/metrics/delta/DeltaClusteringMetricsBase.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.metrics.delta 2 | 3 | import fr.databeans.lighthouse.fileStatsIntervalTree.Interval 4 | import fr.databeans.lighthouse.metrics.{ClusteringMetrics, ClusteringMetricsBuilder} 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.types._ 7 | import org.apache.spark.sql.{DataFrame, SparkSession} 8 | 9 | abstract class DeltaClusteringMetricsBase(spark: SparkSession) extends ClusteringMetricsBuilder { 10 | 11 | val STATS_COLUMN = "stats" 12 | val MIN_PREFIX = "minValues" 13 | val MAX_PREFIX = "maxValues" 14 | val FILE_RELATIVE_PATH = "add.path" 15 | 16 | def schema: StructType 17 | 18 | def statsSchema: StructType 19 | 20 | def stateWithStats: DataFrame 21 | 22 | def allColumns: Seq[String] 23 | 24 | def partitionColumns: Seq[String] 25 | 26 | def computeForColumn(column: String): DataFrame = { 27 | import spark.implicits._ 28 | 29 | Seq(column).map(col => compute(col)).toDF() 30 | } 31 | 32 | def computeForColumns(columns: String*): DataFrame = { 33 | import spark.implicits._ 34 | columns.map(col => compute(col)).toDF() 35 | } 36 | 37 | def computeForAllColumns(): DataFrame = { 38 | import spark.implicits._ 39 | val colsWithoutStats = getColumnsWithoutStats() 40 | val omittedCols = partitionColumns.union(colsWithoutStats) 41 | allColumns.diff(omittedCols).map(col => compute(col)).toDF() 42 | } 43 | 44 | private def compute(column: String): ClusteringMetrics = { 45 | val intervals = prepareIntervals(column) 46 | computeMetrics(column, intervals) 47 | } 48 | 49 | private def prepareIntervals(column: String): Seq[Interval] = { 50 | 51 | assert(!isPartitioningColumn(column), 52 | s"'$column' is a partitioning column. 
Clustering metrics cannot be computed for partitioning columns") 53 | 54 | val dataType = getStatsType(column) 55 | 56 | assert(checkIfStatsExists(column), s"no statistics found for column '$column'") 57 | 58 | stateWithStats 59 | .filter(col("add").isNotNull) 60 | .withColumn(STATS_COLUMN, from_json(col(s"add.$STATS_COLUMN"), statsSchema)) 61 | .select( 62 | col(s"$FILE_RELATIVE_PATH"), 63 | col(s"${STATS_COLUMN}.${MIN_PREFIX}.$column").cast(StringType).as("min"), 64 | col(s"${STATS_COLUMN}.${MAX_PREFIX}.$column").cast(StringType).as("max") 65 | ) 66 | .collect() 67 | .map { row => 68 | Interval(row.getString(1), row.getString(2), row.getString(0), dataType) 69 | } 70 | } 71 | 72 | private def getStatsType(column: String): DataType = { 73 | val extractedColumn = schema 74 | .filter(_.name == column) 75 | 76 | assert(extractedColumn.nonEmpty, s"column $column not found in columns ${allColumns.mkString(",")}") 77 | extractedColumn.head.dataType 78 | } 79 | 80 | private def checkIfStatsExists(column: String): Boolean = { 81 | statsSchema.fields.filter(_.name == MIN_PREFIX) 82 | .map(_.dataType) 83 | .flatMap { 84 | case StructType(f) => f 85 | }.map(_.name) 86 | .contains(column) 87 | } 88 | 89 | 90 | private def getColumnsWithoutStats(): Seq[String] ={ 91 | allColumns.filter(col => !checkIfStatsExists(col)) 92 | } 93 | 94 | private def isPartitioningColumn(column: String): Boolean = { 95 | partitionColumns.contains(column) 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/test/scala/fr/databeans/lighthouse/fileStatsIntervalTree/IntervalSpec.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.fileStatsIntervalTree 2 | 3 | import org.scalatest.funspec.AnyFunSpec 4 | import org.scalatest.matchers.should.Matchers 5 | import org.apache.spark.sql.types._ 6 | 7 | final class IntervalSpec extends AnyFunSpec with Matchers { 8 | 9 | describe("An interval [b,c]") { 10 | 11 | val interval = Interval("b", "c", "file1", StringType) 12 | 13 | it("should not intersect with [d,e]") { 14 | interval.intersects("d", "e") shouldBe false 15 | } 16 | 17 | it("should intersect with [b,d]") { 18 | interval.intersects("b", "d") shouldBe true 19 | } 20 | } 21 | 22 | describe("An interval [b,c]") { 23 | 24 | val interval = Interval("b", "c", "file1", StringType) 25 | 26 | it("should be greater than [a,e]") { 27 | val interval2 = Interval("a", "e", "file2", StringType) 28 | interval.compareTo(interval2) shouldBe 1 29 | } 30 | 31 | it("should be lower than [f,g]") { 32 | val interval2 = Interval("f", "g", "file2", StringType) 33 | interval.compareTo(interval2) shouldBe -1 34 | } 35 | 36 | it("should be equal to [b,c]") { 37 | val interval2 = Interval("b", "c", "file2", StringType) 38 | interval.compareTo(interval2) shouldBe 0 39 | } 40 | } 41 | 42 | describe("An interval [1,5]") { 43 | 44 | val interval = Interval("1", "5", "file1", IntegerType) 45 | 46 | it("should not intersect with [-1,0]") { 47 | interval.intersects("-1", "0") shouldBe false 48 | } 49 | 50 | it("should not intersect with [6,17]") { 51 | interval.intersects("6", "17") shouldBe false 52 | } 53 | 54 | it("should intersect with [0,1]") { 55 | interval.intersects("0", "1") shouldBe true 56 | } 57 | 58 | it("should intersect with [5,6]") { 59 | interval.intersects("5", "6") shouldBe true 60 | } 61 | 62 | it("should intersect with [2,3]") { 63 | interval.intersects("2", "3") shouldBe true 64 | } 65 | 66 | it("should intersect with 
[0,4]") { 67 | interval.intersects("0", "4") shouldBe true 68 | } 69 | 70 | it("should intersect with [3,10]") { 71 | interval.intersects("3", "10") shouldBe true 72 | } 73 | 74 | it("should intersect with [0,10]") { 75 | interval.intersects("0", "10") shouldBe true 76 | } 77 | } 78 | 79 | describe("An interval [7,17]") { 80 | val interval = Interval("7", "17", "file1", IntegerType) 81 | 82 | 83 | it("should intersect with [5,20]") { 84 | interval.intersects("5", "20") shouldBe true 85 | } 86 | } 87 | } 88 | 89 | -------------------------------------------------------------------------------- /src/test/scala/fr/databeans/lighthouse/fileStatsIntervalTree/IntervalTreeSpec.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.fileStatsIntervalTree 2 | 3 | import org.apache.spark.sql.types.IntegerType 4 | import org.scalatest.funspec.AnyFunSpec 5 | import org.scalatest.matchers.should.Matchers 6 | 7 | final class IntervalTreeSpec extends AnyFunSpec with Matchers { 8 | 9 | describe("Tree holding a single interval [1, 5]") { 10 | val intervals = Seq[Interval](Interval("1", "5", "file1", IntegerType)) 11 | val tree = IntervalTree(intervals) 12 | 13 | it("should not be empty") { 14 | tree.isEmpty shouldBe false 15 | } 16 | 17 | it("should return 1 result on query interval [4, 8]") { 18 | tree.getIntervals(Interval("4", "8", "file1", IntegerType)).size shouldEqual 1 19 | } 20 | } 21 | 22 | describe("depth.Node holding many intervals") { 23 | val intervals = Seq[Interval]( 24 | Interval("1", "5", "file2", IntegerType), 25 | Interval("6", "9", "file3", IntegerType), 26 | Interval("10", "14", "file4", IntegerType) 27 | ) 28 | 29 | val tree = IntervalTree(intervals) 30 | 31 | it("should not be empty") { 32 | tree.isEmpty shouldBe false 33 | } 34 | 35 | it("should return 2 results on query interval [6, 19]") { 36 | tree.getIntervals(Interval("6", "19", "file5", IntegerType)).size shouldEqual 2 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/test/scala/fr/databeans/lighthouse/fileStatsIntervalTree/NodeSpec.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.fileStatsIntervalTree 2 | 3 | import org.apache.spark.sql.types.IntegerType 4 | import org.scalatest.funspec.AnyFunSpec 5 | import org.scalatest.matchers.should.Matchers 6 | 7 | final class NodeSpec extends AnyFunSpec with Matchers { 8 | 9 | describe("depth.Node holding a single interval [1, 5]") { 10 | val intervals = Seq[Interval](Interval("1", "5", "file1", IntegerType)) 11 | val node = Node(intervals) 12 | 13 | it("should not be empty") { 14 | assert(node.isEmpty === false) 15 | } 16 | 17 | it("should has no left or right children") { 18 | node.left shouldBe None 19 | node.right shouldBe None 20 | } 21 | 22 | it("should return 1 result on query interval [4, 8]") { 23 | node.query(Interval("4", "8", "file2", IntegerType)).size shouldEqual 1 24 | } 25 | 26 | it("should return 1 result on query interval [2, 3]") { 27 | node.query(Interval("2", "3", "file2", IntegerType)).size shouldEqual 1 28 | } 29 | 30 | it("should return 1 result on query interval [0, 1]") { 31 | node.query(Interval("0", "1", "file2", IntegerType)).size shouldEqual 1 32 | } 33 | 34 | it("should return 0 result on query interval [-1, 0]") { 35 | node.query(Interval("-1", "0", "file2", IntegerType)).size shouldEqual 0 36 | } 37 | 38 | it("should return 1 result on query interval 
[6, 7]") { 39 | node.query(Interval("6", "7", "file2", IntegerType)).size shouldEqual 0 40 | } 41 | } 42 | 43 | describe("Node holding many intervals") { 44 | val intervals = Seq( 45 | Interval("1", "5", "file1", IntegerType), 46 | Interval("6", "9", "file2", IntegerType), 47 | Interval("10", "14", "file3", IntegerType) 48 | ) 49 | 50 | val node = Node(intervals) 51 | 52 | it("should not be empty") { 53 | node.isEmpty shouldBe false 54 | } 55 | 56 | it("should has 1 left and 1 right child") { 57 | node.left.get.size shouldEqual 1 58 | node.right.get.size shouldEqual 1 59 | } 60 | 61 | it("should return 2 results on query interval [6, 19]") { 62 | node.query(Interval("6", "19", "file4", IntegerType)).size shouldEqual 2 63 | } 64 | } 65 | 66 | describe("Node holding same interval multiple times") { 67 | val intervals = Seq[Interval]( 68 | Interval("1", "355", "file1", IntegerType), 69 | Interval("1", "355", "file2", IntegerType), 70 | Interval("1", "355", "file3", IntegerType) 71 | ) 72 | 73 | val node = Node(intervals) 74 | 75 | it("should not be empty") { 76 | node.isEmpty shouldBe false 77 | } 78 | 79 | it("should has no left or right children") { 80 | node.left shouldBe None 81 | node.right shouldBe None 82 | } 83 | 84 | it("should return 3 results on query interval [6, 19]") { 85 | node.query(Interval("6", "19", "file4", IntegerType)).size shouldEqual 3 86 | } 87 | 88 | it("should return 3 results on query interval [300, 400]") { 89 | node.query(Interval("300", "400", "file4", IntegerType)).size shouldEqual 3 90 | } 91 | 92 | it("should return 0 results on query interval [360, 400]") { 93 | node.query(Interval("360", "400", "file4", IntegerType)).size shouldEqual 0 94 | } 95 | } 96 | 97 | describe("Node holding sorted intervals") { 98 | val intervals = Seq[Interval]( 99 | Interval("1", "1139", "file1", IntegerType), 100 | Interval("1139", "2368", "file2", IntegerType), 101 | Interval("2368", "3503", "file3", IntegerType), 102 | Interval("3503", "4745", "file4", IntegerType), 103 | Interval("4745", "5999", "file5", IntegerType), 104 | Interval("5999", "7200", "file6", IntegerType) 105 | ) 106 | 107 | val node = Node(intervals) 108 | 109 | it("should not be empty") { 110 | node.isEmpty shouldBe false 111 | } 112 | 113 | it("should has 1 left and 1 right child") { 114 | println(node) 115 | node.left.get.size shouldEqual 2 116 | node.right.get.size shouldEqual 2 117 | } 118 | 119 | it("should return 2 results on the first and last files and 3 else") { 120 | node.query(Interval("1", "1139", "file1", IntegerType)).size shouldEqual 2 121 | node.query(Interval("1139", "2368", "file2", IntegerType)).size shouldEqual 3 122 | node.query(Interval("2368", "3503", "file3", IntegerType)).size shouldEqual 3 123 | node.query(Interval("3503", "4745", "file4", IntegerType)).size shouldEqual 3 124 | node.query(Interval("4745", "5999", "file5", IntegerType)).size shouldEqual 3 125 | node.query(Interval("5999", "7200", "file6", IntegerType)).size shouldEqual 2 126 | } 127 | } 128 | 129 | describe("Bug1 Node holding many intervals") { 130 | val intervals = Seq[Interval]( 131 | Interval("1", "5", "file2", IntegerType), 132 | Interval("0", "7", "file3", IntegerType), 133 | Interval("11", "16", "file3", IntegerType), 134 | Interval("7", "16", "file5", IntegerType), 135 | Interval("5", "9", "file6", IntegerType), 136 | Interval("4", "16", "file6", IntegerType), 137 | Interval("0", "13", "file6", IntegerType), 138 | Interval("9", "12", "file6", IntegerType), 139 | Interval("7", "9", "file6", IntegerType), 140 | 
Interval("20", "30", "file6", IntegerType), 141 | Interval("31", "40", "file6", IntegerType) 142 | ) 143 | 144 | val node = Node(intervals) 145 | 146 | it("should not be empty") { 147 | node.isEmpty shouldBe false 148 | } 149 | 150 | it("should return 2 results on the first and last files and 3 else") { 151 | node.query(Interval("11", "16", "file1", IntegerType)).size shouldEqual 5 152 | } 153 | } 154 | 155 | describe("Node holding many intervals all in one group") { 156 | val intervals = Seq[Interval]( 157 | Interval("16", "32", "file2", IntegerType), 158 | Interval("4", "40", "file3", IntegerType), 159 | Interval("10", "38", "file3", IntegerType), 160 | Interval("2", "24", "file5", IntegerType), 161 | Interval("6", "28", "file6", IntegerType) 162 | ) 163 | 164 | val node = Node(intervals) 165 | 166 | it("should not be empty") { 167 | node.isEmpty shouldBe false 168 | } 169 | 170 | it("should return 2 results on the first and last files and 3 else") { 171 | node.query(Interval("11", "16", "file1", IntegerType)).size shouldEqual 5 172 | } 173 | } 174 | 175 | } 176 | -------------------------------------------------------------------------------- /src/test/scala/fr/databeans/lighthouse/metrics/ClusteringMetricsSpec.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.metrics 2 | 3 | import fr.databeans.lighthouse.fileStatsIntervalTree.{Interval, Node} 4 | import org.apache.spark.sql.types.{DecimalType, IntegerType} 5 | import org.scalatest.funspec.AnyFunSpec 6 | import org.scalatest.matchers.should.Matchers 7 | 8 | class ClusteringMetricsSpec extends AnyFunSpec with Matchers { 9 | 10 | def buildHistogram(maxBin: Int, populatedBuckets: Map[Double, Int]): Map[Double, Int] = { 11 | val missingBins = Distribution.computeUnPopulatedBuckets(maxBin, populatedBuckets) 12 | missingBins ++ populatedBuckets 13 | } 14 | 15 | describe("compute the overlap metrics") { 16 | 17 | it("should return 2.0 as overlap depth") { 18 | 19 | val intervals1 = Seq[Interval]( 20 | Interval("1", "2", "file2", IntegerType), 21 | Interval("3", "4", "file3", IntegerType), 22 | Interval("0", "5", "file5", IntegerType), 23 | Interval("4", "10", "file6", IntegerType), 24 | Interval("14", "15", "file8", IntegerType), 25 | Interval("14", "20", "file9", IntegerType) 26 | ) 27 | 28 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 29 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals1) 30 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 31 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 32 | val averageOverlaps = overlapMetrics.averageOverlaps 33 | 34 | avgOverlapDepth shouldBe 2.2 35 | 36 | overlapDepthHistogram shouldBe 37 | buildHistogram(16, Map((2.0, 3), (3.0, 3))) 38 | 39 | averageOverlaps shouldBe 1.6667 40 | } 41 | 42 | it("should return the number of files when all files have the same min max") { 43 | 44 | val intervals2 = Seq[Interval]( 45 | Interval("1", "2", "file2", IntegerType), 46 | Interval("1", "2", "file3", IntegerType), 47 | Interval("1", "2", "file5", IntegerType), 48 | Interval("1", "2", "file6", IntegerType) 49 | ) 50 | 51 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 52 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals2) 53 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 54 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 55 | val averageOverlaps = overlapMetrics.averageOverlaps 56 | 57 | 
avgOverlapDepth shouldBe 4.0000 58 | 59 | overlapDepthHistogram shouldBe 60 | buildHistogram(16, Map((4.0, 4))) 61 | 62 | averageOverlaps shouldBe 3.0 63 | } 64 | 65 | it("should return 3.7778 as overlap depth and compute the histogram") { 66 | 67 | val intervals3 = Seq[Interval]( 68 | Interval("1", "5", "file2", IntegerType), 69 | Interval("0", "7", "file3", IntegerType), 70 | Interval("11", "16", "file4", IntegerType), 71 | Interval("7", "16", "file5", IntegerType), 72 | Interval("5", "9", "file6", IntegerType), 73 | Interval("4", "16", "file7", IntegerType), 74 | Interval("0", "13", "file8", IntegerType), 75 | Interval("9", "12", "file9", IntegerType), 76 | Interval("7", "9", "file10", IntegerType), 77 | Interval("20", "30", "file11", IntegerType), 78 | Interval("31", "40", "file12", IntegerType) 79 | ) 80 | 81 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 82 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals3) 83 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 84 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 85 | val averageOverlaps = overlapMetrics.averageOverlaps 86 | val total_file_count = overlapMetrics.total_file_count 87 | val total_uniform_file_count = overlapMetrics.total_uniform_file_count 88 | 89 | avgOverlapDepth shouldBe 4.25 90 | 91 | overlapDepthHistogram shouldBe 92 | buildHistogram(16, Map((1.0, 2), (5.0, 2), (6.0, 7))) 93 | 94 | averageOverlaps shouldBe 5.0909 95 | 96 | total_file_count shouldBe 11 97 | 98 | total_uniform_file_count shouldBe 0 99 | } 100 | 101 | it("should return 1 as overlap depth and compute the histogram") { 102 | 103 | val intervals4 = Seq[Interval]( 104 | Interval("1", "2", "file2", IntegerType), 105 | Interval("3", "4", "file3", IntegerType), 106 | Interval("5", "6", "file4", IntegerType), 107 | Interval("7", "8", "file5", IntegerType) 108 | ) 109 | 110 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 111 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals4) 112 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 113 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 114 | val averageOverlaps = overlapMetrics.averageOverlaps 115 | val total_file_count = overlapMetrics.total_file_count 116 | val total_uniform_file_count = overlapMetrics.total_uniform_file_count 117 | 118 | avgOverlapDepth shouldBe 1.0 119 | 120 | overlapDepthHistogram shouldBe 121 | buildHistogram(16, Map((1.0, 4))) 122 | 123 | averageOverlaps shouldBe 0 124 | 125 | total_file_count shouldBe 4 126 | 127 | total_uniform_file_count shouldBe 0 128 | } 129 | 130 | it("BUG: min = max for all intervals") { 131 | 132 | val intervals5 = Seq[Interval]( 133 | Interval("1", "1", "file2", IntegerType), 134 | Interval("1", "1", "file3", IntegerType), 135 | Interval("1", "1", "file5", IntegerType), 136 | Interval("1", "1", "file6", IntegerType) 137 | ) 138 | 139 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 140 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals5) 141 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 142 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 143 | val averageOverlaps = overlapMetrics.averageOverlaps 144 | val total_file_count = overlapMetrics.total_file_count 145 | val total_uniform_file_count = overlapMetrics.total_uniform_file_count 146 | 147 | avgOverlapDepth shouldBe 4.0000 148 | 149 | overlapDepthHistogram shouldBe 150 | buildHistogram(16, Map((4.0, 4))) 
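      // All four intervals are the identical point [1, 1]: every file overlaps the other three and each file is uniform (min == max).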
151 | 152 | averageOverlaps shouldBe 3.0 153 | 154 | total_file_count shouldBe 4 155 | 156 | total_uniform_file_count shouldBe 4 157 | } 158 | 159 | it("example 2") { 160 | 161 | val intervals2 = Seq[Interval]( 162 | Interval("1", "4", "file2", IntegerType), 163 | Interval("2", "6", "file3", IntegerType), 164 | Interval("5", "7", "file5", IntegerType), 165 | Interval("5", "10", "file6", IntegerType) 166 | ) 167 | 168 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 169 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals2) 170 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 171 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 172 | val averageOverlaps = overlapMetrics.averageOverlaps 173 | val total_file_count = overlapMetrics.total_file_count 174 | val total_uniform_file_count = overlapMetrics.total_uniform_file_count 175 | 176 | avgOverlapDepth shouldBe 2.3333 177 | 178 | overlapDepthHistogram shouldBe 179 | buildHistogram(16, Map((2.0, 1), (3.0, 3))) 180 | 181 | averageOverlaps shouldBe 2.0 182 | 183 | total_file_count shouldBe 4 184 | 185 | total_uniform_file_count shouldBe 0 186 | } 187 | 188 | it("example 3") { 189 | 190 | val intervals2 = Seq[Interval]( 191 | Interval("1", "2", "file2", IntegerType), 192 | Interval("3", "5", "file3", IntegerType), 193 | Interval("3", "5", "file5", IntegerType), 194 | Interval("3", "5", "file6", IntegerType) 195 | ) 196 | 197 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 198 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals2) 199 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 200 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 201 | val averageOverlaps = overlapMetrics.averageOverlaps 202 | val total_file_count = overlapMetrics.total_file_count 203 | val total_uniform_file_count = overlapMetrics.total_uniform_file_count 204 | 205 | avgOverlapDepth shouldBe 3 206 | 207 | overlapDepthHistogram shouldBe 208 | buildHistogram(16, Map((1.0, 1), (3.0, 3))) 209 | 210 | averageOverlaps shouldBe 1.5 211 | 212 | total_file_count shouldBe 4 213 | 214 | total_uniform_file_count shouldBe 0 215 | } 216 | 217 | it("example 4") { 218 | 219 | val intervals2 = Seq[Interval]( 220 | Interval("1", "2", "file2", IntegerType), 221 | Interval("3", "5", "file3", IntegerType), 222 | Interval("4", "7", "file5", IntegerType), 223 | Interval("6", "8", "file6", IntegerType) 224 | ) 225 | 226 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 227 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals2) 228 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 229 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 230 | val averageOverlaps = overlapMetrics.averageOverlaps 231 | val total_file_count = overlapMetrics.total_file_count 232 | val total_uniform_file_count = overlapMetrics.total_uniform_file_count 233 | 234 | avgOverlapDepth shouldBe 2 235 | 236 | overlapDepthHistogram shouldBe 237 | buildHistogram(16, Map((1.0, 1), (2.0, 3))) 238 | 239 | averageOverlaps shouldBe 1.0 240 | 241 | total_file_count shouldBe 4 242 | 243 | total_uniform_file_count shouldBe 0 244 | } 245 | 246 | it("intervals have one uniform interval") { 247 | 248 | val intervals = Seq[Interval]( 249 | Interval("1", "5", "file2", IntegerType), 250 | Interval("4", "8", "file3", IntegerType), 251 | Interval("6", "9", "file5", IntegerType), 252 | Interval("7", "7", "file6", IntegerType) 253 | ) 254 | 255 | val 
clusteringMetricsBuilder = new ClusteringMetricsBuilder() 256 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals) 257 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 258 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 259 | val averageOverlaps = overlapMetrics.averageOverlaps 260 | val total_file_count = overlapMetrics.total_file_count 261 | val total_uniform_file_count = overlapMetrics.total_uniform_file_count 262 | 263 | avgOverlapDepth shouldBe 2.25 264 | 265 | overlapDepthHistogram shouldBe 266 | buildHistogram(16, Map((2.0, 1), (3.0, 3))) 267 | 268 | averageOverlaps shouldBe 2.0 269 | 270 | total_file_count shouldBe 4 271 | 272 | total_uniform_file_count shouldBe 1 273 | } 274 | 275 | it("intervals have two uniform intervals") { 276 | 277 | val intervals = Seq[Interval]( 278 | Interval("1", "5", "file2", IntegerType), 279 | Interval("4", "8", "file3", IntegerType), 280 | Interval("6", "9", "file5", IntegerType), 281 | Interval("7", "7", "file6", IntegerType), 282 | Interval("7", "7", "file7", IntegerType) 283 | ) 284 | 285 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 286 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals) 287 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 288 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 289 | val averageOverlaps = overlapMetrics.averageOverlaps 290 | val total_file_count = overlapMetrics.total_file_count 291 | val total_uniform_file_count = overlapMetrics.total_uniform_file_count 292 | 293 | avgOverlapDepth shouldBe 2.5 294 | 295 | overlapDepthHistogram shouldBe 296 | buildHistogram(16, Map((2.0, 1), (4.0, 4))) 297 | 298 | averageOverlaps shouldBe 2.8 299 | 300 | total_file_count shouldBe 5 301 | 302 | total_uniform_file_count shouldBe 2 303 | } 304 | 305 | it("intervals start with uniform interval and have gaps") { 306 | 307 | val intervals = Seq[Interval]( 308 | Interval("0", "0", "file1", IntegerType), 309 | Interval("0", "0", "file2", IntegerType), 310 | Interval("1", "5", "file2", IntegerType), 311 | Interval("4", "8", "file3", IntegerType), 312 | Interval("6", "9", "file5", IntegerType), 313 | Interval("7", "7", "file6", IntegerType), 314 | Interval("7", "7", "file7", IntegerType) 315 | ) 316 | 317 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 318 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals) 319 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 320 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 321 | val averageOverlaps = overlapMetrics.averageOverlaps 322 | val total_file_count = overlapMetrics.total_file_count 323 | val total_uniform_file_count = overlapMetrics.total_uniform_file_count 324 | 325 | avgOverlapDepth shouldBe 2.4 326 | 327 | overlapDepthHistogram shouldBe 328 | buildHistogram(16, Map((2.0, 3), (4.0, 4))) 329 | 330 | averageOverlaps shouldBe 2.2857 331 | 332 | total_file_count shouldBe 7 333 | 334 | total_uniform_file_count shouldBe 4 335 | } 336 | 337 | 338 | } 339 | 340 | describe("decimal type") { 341 | it("BUG: decimal type should be supported for statistics") { 342 | 343 | val intervals = Seq[Interval]( 344 | Interval("-8.00", "-5.00", "file1", DecimalType(5, 2)) 345 | ) 346 | 347 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 348 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals) 349 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 350 | val 
overlapDepthHistogram = overlapMetrics.fileDepthHistogram 351 | val averageOverlaps = overlapMetrics.averageOverlaps 352 | val total_file_count = overlapMetrics.total_file_count 353 | val total_uniform_file_count = overlapMetrics.total_uniform_file_count 354 | 355 | avgOverlapDepth shouldBe 1.0 356 | 357 | overlapDepthHistogram shouldBe 358 | buildHistogram(16, Map((1.0, 1))) 359 | 360 | averageOverlaps shouldBe 0 361 | 362 | total_file_count shouldBe 1 363 | 364 | total_uniform_file_count shouldBe 0 365 | } 366 | } 367 | 368 | 369 | describe("Node holding many intervals all in one group") { 370 | val intervals = Seq[Interval]( 371 | Interval("16", "32", "file2", IntegerType), 372 | Interval("4", "40", "file3", IntegerType), 373 | Interval("10", "38", "file3", IntegerType), 374 | Interval("2", "24", "file5", IntegerType), 375 | Interval("6", "28", "file6", IntegerType) 376 | ) 377 | 378 | val node = Node(intervals) 379 | 380 | it("should not be empty") { 381 | node.isEmpty shouldBe false 382 | } 383 | 384 | it("should return 5 as overlap depth") { 385 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 386 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals) 387 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 388 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 389 | 390 | avgOverlapDepth shouldBe 3.2857 391 | 392 | overlapDepthHistogram shouldBe 393 | buildHistogram(16, Map((5.0, 5))) 394 | } 395 | } 396 | } 397 | -------------------------------------------------------------------------------- /src/test/scala/fr/databeans/lighthouse/metrics/delta/DeltaClusteringMetricsSpec.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.metrics.delta 2 | 3 | import fr.databeans.lighthouse.metrics.Distribution 4 | import org.apache.spark.sql.{QueryTest, Row} 5 | import org.apache.spark.sql.delta.DeltaLog 6 | import org.apache.spark.sql.delta.test.DeltaExtendedSparkSession 7 | import org.apache.spark.sql.functions._ 8 | import org.apache.spark.sql.test.SharedSparkSession 9 | import org.apache.spark.sql.types._ 10 | 11 | class DeltaClusteringMetricsSpec extends QueryTest with SharedSparkSession with DeltaExtendedSparkSession { 12 | 13 | def buildHistogram(maxBin: Int, populatedBuckets: Map[Double, Int]): Map[Double, Int] = { 14 | val missingBins = Distribution.computeUnPopulatedBuckets(maxBin, populatedBuckets) 15 | missingBins ++ populatedBuckets 16 | } 17 | 18 | def getStats(deltaPath: String, column: String) = { 19 | DeltaLog.forTable(spark, deltaPath).snapshot.withStats 20 | .select( 21 | col("path"), 22 | col(s"stats.minValues.$column").as("min"), 23 | col(s"stats.maxValues.$column").as("max") 24 | ) 25 | } 26 | 27 | override def beforeAll(): Unit = { 28 | super.beforeAll() 29 | spark.sparkContext.setLogLevel("ERROR") 30 | } 31 | 32 | test("compute metrics for a delta table with non overlapping files") { 33 | withTempDir { dir => 34 | spark.range(1, 50, 1, 5).toDF() 35 | .write.mode("overwrite") 36 | .format("delta") 37 | .save(dir.toString) 38 | 39 | 40 | val deltaClusteringMetric = DeltaClusteringMetrics.forPath(dir.toString, spark) 41 | val metrics = deltaClusteringMetric.computeForColumn("id") 42 | checkAnswer(metrics, Row("id", 5L, 0L, 1.0, buildHistogram(16, Map((1.0, 5))), 0.0)) 43 | } 44 | } 45 | 46 | test("compute metrics for a delta table with all overlapping files") { 47 | withTempDir { dir => 48 | spark.range(1, 50, 1, 5).toDF() 49 | 
.withColumn("key", lit(1)) 50 | .write.mode("overwrite") 51 | .format("delta") 52 | .save(dir.toString) 53 | 54 | val deltaClusteringMetric = DeltaClusteringMetrics.forPath(dir.toString, spark) 55 | val metrics = deltaClusteringMetric.computeForColumn("key") 56 | checkAnswer(metrics, Row("key", 5L, 5L, 5.0, buildHistogram(16, Map((5.0, 5))), 4.0)) 57 | } 58 | } 59 | 60 | test("compute metrics for table defined by name") { 61 | withTable("deltaTable") { 62 | spark.range(1, 50, 1, 5).toDF() 63 | .write.format("delta").saveAsTable("deltaTable") 64 | 65 | val deltaClusteringMetric = DeltaClusteringMetrics.forName("deltaTable", spark) 66 | val metrics = deltaClusteringMetric.computeForColumn("id") 67 | checkAnswer(metrics, Row("id", 5L, 0L, 1.0, buildHistogram(16, Map((1.0, 5))), 0.0)) 68 | } 69 | } 70 | 71 | test("compute metrics for a column without statistics") { 72 | withTempDir { dir => 73 | val data = spark.range(1, 50, 1, 5).toDF() 74 | .withColumn("value", col("id") * 3) 75 | 76 | data 77 | .filter("1 > 2") 78 | .write.mode("append") 79 | .format("delta").save(dir.toString) 80 | 81 | spark.sql(s"ALTER TABLE delta.`${dir.toString}` SET TBLPROPERTIES ('delta.dataSkippingNumIndexedCols' = '1')") 82 | 83 | data 84 | .write.mode("append") 85 | .format("delta").save(dir.toString) 86 | 87 | val thrown = intercept[AssertionError] { 88 | val deltaClusteringMetric = DeltaClusteringMetrics.forPath(dir.toString, spark) 89 | deltaClusteringMetric.computeForColumn("value") 90 | } 91 | assert(thrown.getMessage === "assertion failed: no statistics found for column 'value'") 92 | } 93 | } 94 | 95 | test("compute metrics for a non existent column") { 96 | withTempDir { dir => 97 | spark.range(1, 50, 1, 5).toDF() 98 | .write.format("delta").save(dir.toString) 99 | 100 | val thrown = intercept[AssertionError] { 101 | val deltaClusteringMetric = DeltaClusteringMetrics.forPath(dir.toString, spark) 102 | deltaClusteringMetric.computeForColumn("non_existent_column") 103 | } 104 | assert(thrown.getMessage.contains("assertion failed: column non_existent_column not found in columns")) 105 | } 106 | } 107 | 108 | test("compute metrics for all columns of the table") { 109 | withTempDir { dir => 110 | spark.range(1, 50, 1, 5).toDF() 111 | .withColumn("id", col("id").cast(IntegerType)) 112 | .withColumn("value", lit(1)) 113 | .write.mode("overwrite") 114 | .format("delta").save(dir.toString) 115 | 116 | val deltaClusteringMetric = DeltaClusteringMetrics.forPath(dir.toString, spark) 117 | val metrics = deltaClusteringMetric.computeForAllColumns() 118 | 119 | checkAnswer( 120 | metrics, 121 | Seq( 122 | Row("id", 5L, 0L, 1.0, buildHistogram(16, Map((1.0, 5))), 0.0), 123 | Row("value", 5L, 5L, 5.0, buildHistogram(16, Map((5.0, 5))), 4.0) 124 | ) 125 | ) 126 | } 127 | } 128 | 129 | test("compute metrics for a subset columns of the table") { 130 | withTempDir { dir => 131 | spark.range(1, 50, 1, 5).toDF() 132 | .withColumn("id", col("id")) 133 | .withColumn("value1", lit(1)) 134 | .withColumn("value2", lit(2)) 135 | .write.format("delta").save(dir.toString) 136 | 137 | val deltaClusteringMetric = DeltaClusteringMetrics.forPath(dir.toString, spark) 138 | val metrics = deltaClusteringMetric.computeForColumns("id", "value1") 139 | 140 | checkAnswer( 141 | metrics, 142 | Seq( 143 | Row("id", 5L, 0L, 1.0, buildHistogram(16, Map((1.0, 5))), 0.0), 144 | Row("value1", 5L, 5L, 5.0, buildHistogram(16, Map((5.0, 5))), 4.0) 145 | ) 146 | ) 147 | } 148 | } 149 | 150 | test("compute metrics for supported Data Types") { 151 | 
withTempDir { dir => 152 | spark.range(1, 50, 1, 5).toDF() 153 | .withColumn("value_int", col("id").cast(IntegerType)) 154 | .withColumn("value_long", col("id").cast(LongType)) 155 | .withColumn("value_decimal", col("id").cast(DecimalType(4, 2))) 156 | .withColumn("value_string", format_string("%02d", col("id"))) 157 | .drop("id") 158 | .write.format("delta").save(dir.toString) 159 | 160 | val deltaClusteringMetric = DeltaClusteringMetrics.forPath(dir.toString, spark) 161 | val metrics = deltaClusteringMetric.computeForAllColumns() 162 | 163 | checkAnswer( 164 | metrics, 165 | Seq( 166 | Row("value_int", 5L, 0L, 1.0, buildHistogram(16, Map((1.0, 5))), 0.0), 167 | Row("value_long", 5L, 0L, 1.0, buildHistogram(16, Map((1.0, 5))), 0.0), 168 | Row("value_decimal", 5L, 0L, 1.0, buildHistogram(16, Map((1.0, 5))), 0.0), 169 | Row("value_string", 5L, 0L, 1.0, buildHistogram(16, Map((1.0, 5))), 0.0) 170 | ) 171 | ) 172 | } 173 | } 174 | 175 | test("compute metrics for a partitioned delta table") { 176 | withTempDir { dir => 177 | spark.range(1, 50, 1, 5).toDF() 178 | .withColumn("part", col("id") % 3) 179 | .write.partitionBy("part").format("delta").save(dir.toString) 180 | 181 | val deltaClusteringMetric = DeltaClusteringMetrics.forPath(dir.toString, spark) 182 | 183 | val errorMessage = "assertion failed: 'part' is a partitioning column. Clustering metrics cannot be computed for partitioning columns" 184 | 185 | // computeForColumn should fail 186 | val thrown1 = intercept[AssertionError] { 187 | deltaClusteringMetric.computeForColumn("part") 188 | } 189 | assert(thrown1.getMessage == errorMessage) 190 | 191 | // computeForColumns should fail 192 | val thrown2 = intercept[AssertionError] { 193 | deltaClusteringMetric.computeForColumns("part", "id") 194 | } 195 | assert(thrown2.getMessage == errorMessage) 196 | 197 | // computeForAllColumns should compute metrics for non partitioning columns only. 198 | val metrics = deltaClusteringMetric.computeForAllColumns() 199 | checkAnswer(metrics, Seq(Row("id", 15L, 0L, 2.3333, buildHistogram(16, Map((3.0, 15))), 2.0))) 200 | } 201 | } 202 | 203 | test("compute metrics for column with null values") { 204 | withTempDir { dir => 205 | spark.range(1, 50, 1, 5).toDF() 206 | .withColumn("value_1", when(col("id") % 10 === 1, null).otherwise(col("id"))) 207 | .withColumn("value_2", when(col("id") < 20, null).otherwise(col("id"))) 208 | .withColumn("value_3", lit(null).cast(StringType)) 209 | .write.format("delta").save(dir.toString) 210 | 211 | val deltaClusteringMetric = DeltaClusteringMetrics.forPath(dir.toString, spark) 212 | 213 | val value1Metrics = deltaClusteringMetric.computeForColumn("value_1") 214 | checkAnswer(value1Metrics, Seq(Row("value_1", 5L, 0L, 1.0, buildHistogram(16, Map((1.0, 5))), 0.0))) 215 | 216 | // null intervals included in total_file_count and total_uniform_file_count but excluded from other metrics. 
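      // The expected rows appear to be laid out as
      // Row(column, total_file_count, total_uniform_file_count, averageOverlapDepth, fileDepthHistogram, averageOverlaps).
      // For value_2, the two files whose stats are entirely null still count toward the two totals
      // (5 files, 2 uniform), while the depth histogram covers only the 3 remaining files.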
217 |       val value2Metrics = deltaClusteringMetric.computeForColumn("value_2")
218 |       checkAnswer(value2Metrics, Seq(Row("value_2", 5L, 2L, 1.0, buildHistogram(16, Map((1.0, 3))), 0.0)))
219 |
220 |       // all intervals are null
221 |       val value3Metrics = deltaClusteringMetric.computeForColumn("value_3")
222 |       checkAnswer(value3Metrics, Seq(Row("value_3", 5L, 5L, -1, null.asInstanceOf[Map[Double, Int]], -1)))
223 |     }
224 |   }
225 |
226 |   test("compute metrics for all columns of a table where statistics do not exist for certain columns") {
227 |     withTempDir { dir =>
228 |       val data = spark.range(1, 50, 1, 5).toDF()
229 |         .withColumn("value", col("id") * 3)
230 |
231 |       data
232 |         .filter("1 > 2")
233 |         .write.mode("append")
234 |         .format("delta").save(dir.toString)
235 |
236 |       spark.sql(s"ALTER TABLE delta.`${dir.toString}` SET TBLPROPERTIES ('delta.dataSkippingNumIndexedCols' = '1')")
237 |
238 |       data
239 |         .write.mode("append")
240 |         .format("delta").save(dir.toString)
241 |
242 |       val deltaClusteringMetric = DeltaClusteringMetrics.forPath(dir.toString, spark)
243 |       // computeForAllColumns should compute metrics only for columns with statistics.
244 |       val metrics = deltaClusteringMetric.computeForAllColumns()
245 |       checkAnswer(metrics, Seq(Row("id", 5L, 0L, 1.0, buildHistogram(16, Map((1.0, 5))), 0.0)))
246 |     }
247 |   }
248 |
249 | }
250 |
251 |
-------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/delta/test/DeltaExtendedSparkSession.scala: --------------------------------------------------------------------------------
1 | package org.apache.spark.sql.delta.test
2 |
3 | import org.apache.spark.sql.delta.catalog.DeltaCatalog
4 | import io.delta.sql.DeltaSparkSessionExtension
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.sql.internal.SQLConf
7 | import org.apache.spark.sql.{SparkSession, SparkSessionExtensions}
8 | import org.apache.spark.sql.test.{SharedSparkSession, TestSparkSession}
9 |
10 | class DeltaTestSparkSession(sparkConf: SparkConf) extends TestSparkSession(sparkConf) {
11 |   override val extensions: SparkSessionExtensions = {
12 |     val extensions = new SparkSessionExtensions
13 |     new DeltaSparkSessionExtension().apply(extensions)
14 |     extensions
15 |   }
16 | }
17 |
18 | trait DeltaExtendedSparkSession { self: SharedSparkSession =>
19 |
20 |   override protected def createSparkSession: TestSparkSession = {
21 |     SparkSession.cleanupAnyExistingSession()
22 |     val session = new DeltaTestSparkSession(sparkConf)
23 |     session.conf.set(SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION.key, classOf[DeltaCatalog].getName)
24 |     session
25 |   }
26 | }
--------------------------------------------------------------------------------
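DeltaClusteringMetricsSpec above shows the intended usage of this trait: mix DeltaExtendedSparkSession into a SharedSparkSession-based suite so the shared session is created with the Delta extension and catalog already wired in. A minimal sketch of such a suite (the suite name and assertion below are illustrative, not files in this repository):

```scala
package org.apache.spark.sql.delta.test

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.test.SharedSparkSession

// Hypothetical example suite: it only checks that the session built by
// DeltaExtendedSparkSession can write and read Delta tables out of the box.
class DeltaSessionSmokeSpec extends QueryTest with SharedSparkSession with DeltaExtendedSparkSession {

  test("shared session supports the delta format") {
    withTempDir { dir =>
      // Write a small Delta table using the Delta-enabled shared session ...
      spark.range(1, 10).toDF().write.format("delta").save(dir.toString)
      // ... and read it back through the same session.
      checkAnswer(spark.read.format("delta").load(dir.toString), spark.range(1, 10).toDF())
    }
  }
}
```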