├── .github └── workflows │ ├── branch_protection.yml │ └── test.yml ├── .gitignore ├── LICENSE ├── README.md ├── build.sbt ├── documentation └── do_not_delete ├── examples ├── README.md ├── build.sbt └── src │ └── main │ └── scala │ └── Quickstart.scala ├── images ├── average_overlap.jpg ├── average_overlap_depth.jpg ├── do_not_delete └── total_uniform_file_count.png ├── notebooks └── databricks │ └── DeltaClusteringMetrics.scala ├── project └── build.properties └── src ├── main └── scala │ └── fr │ └── databeans │ └── lighthouse │ ├── fileStatsIntervalTree │ ├── Interval.scala │ ├── IntervalBoundary.scala │ ├── IntervalTree.scala │ └── Node.scala │ └── metrics │ ├── ClusteringMetrics.scala │ ├── Distribution.scala │ └── delta │ ├── DeltaClusteringMetrics.scala │ └── DeltaClusteringMetricsBase.scala └── test └── scala ├── fr └── databeans │ └── lighthouse │ ├── fileStatsIntervalTree │ ├── IntervalSpec.scala │ ├── IntervalTreeSpec.scala │ └── NodeSpec.scala │ └── metrics │ ├── ClusteringMetricsSpec.scala │ └── delta │ └── DeltaClusteringMetricsSpec.scala └── org └── apache └── spark └── sql └── delta └── test └── DeltaExtendedSparkSession.scala /.github/workflows/branch_protection.yml: -------------------------------------------------------------------------------- 1 | # Will be deleted after account upgrade to team ro Enterprise account. 2 | name: Branch Protection 3 | 4 | on: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | branch-protection: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Enable branch protection 14 | uses: peter-evans/branch-protection-action@v2 15 | with: 16 | token: ${{ secrets.GITHUB_TOKEN }} 17 | branch: main 18 | enforce_admins: false 19 | required_pull_request_reviews: true 20 | required_status_checks: [Lighthouse Tests] 21 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Lighthouse Tests 2 | 3 | on: 4 | [push, pull_request] 5 | 6 | 7 | jobs: 8 | build: 9 | 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | 14 | - uses: actions/checkout@v3 15 | - name: Set up JDK 1.8 16 | uses: actions/setup-java@v3 17 | with: 18 | java-version: '8' 19 | distribution: 'zulu' 20 | cache: 'sbt' 21 | 22 | - name: Run tests 23 | run: sbt test -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | .cache/ 4 | tmp/ 5 | .idea/ 6 | .DS_Store 7 | .bsp/ 8 | spark-warehouse/ 9 | project/target/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Databeans 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Lighthouse
2 | 
3 | ## OVERVIEW
4 | 
5 | Lighthouse is a library developed by DataBeans to optimize Lakehouse performance and reduce its total cost of ownership. It is designed to monitor the health of Lakehouse tables from a data layout perspective and provide valuable insights about how well data is clustered. This information helps users identify when data maintenance operations (vacuum, compaction, clustering …) should be performed, leading to **improvements in query performance** and **reductions in storage costs**.
6 | 
7 | The Lighthouse library can assist in addressing the following questions:
8 | * How well is my data clustered?
9 | * Does my data layout favor skipping based on statistics?
10 | * Is it advisable to Z-order before running a query on a certain column?
11 | * Is my data suffering from the small files problem?
12 | * How frequently should I re-cluster my data to maintain its optimal clustering state?
13 | 
14 | ## BUILDING
15 | 
16 | Lighthouse is compiled using SBT.
17 | 
18 | To compile, run
19 | ```
20 | sbt compile
21 | ```
22 | 
23 | To generate artifacts, run
24 | ```
25 | sbt package
26 | ```
27 | 
28 | To execute tests, run
29 | ```
30 | sbt test
31 | ```
32 | 
33 | ## SETUP INSTRUCTIONS
34 | 
35 | ### Prerequisites
36 | - Apache Spark 3.3.2
37 | - Delta 2.3.0
38 | 
39 | ### Using Spark Shell
40 | 1. Open the terminal and run the following command:
41 | ```
42 | spark-shell \
43 | --packages io.delta:delta-core_2.12:2.3.0,io.github.Databeans:lighthouse_2.12:0.1.0 \
44 | --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" \
45 | --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog"
46 | ```
47 | 
48 | 2. Import the DeltaClusteringMetrics class:
49 | ```
50 | import fr.databeans.lighthouse.metrics.delta.DeltaClusteringMetrics
51 | ```
52 | 
53 | 3. Compute clustering metrics for a given column of the Delta table:
54 | ```
55 | val clusteringMetrics = DeltaClusteringMetrics.forPath("path/to/your/deltaTable", spark).computeForColumn("col_name")
56 | ```
57 | 
58 | 4. Display the computed clustering metrics using the show() method:
59 | ```
60 | clusteringMetrics.show()
61 | ```
62 | 
63 | ### Using spark-submit
64 | Submit the application to a Spark cluster:
65 | ```
66 | spark-submit \
67 | --class com.example.MyApp \
68 | --master <master-url> \
69 | --packages io.delta:delta-core_2.12:2.3.0,io.github.Databeans:lighthouse_2.12:0.1.0 \
70 | <path-to-your-jar>
71 | ```
72 | This command specifies the following options:
73 | - --class: Name of the main class of your application.
74 | - --master: URL of the Spark cluster to use.
75 | - --packages: Maven coordinates of the Delta Lake and Lighthouse libraries to use.
76 | - `<path-to-your-jar>`: Path to your application's JAR file.
77 | 
78 | Example:
79 | ```
80 | spark-submit \
81 | --class Quickstart \
82 | --master local[*] \
83 | --packages io.delta:delta-core_2.12:2.3.0,io.github.Databeans:lighthouse_2.12:0.1.0 \
84 | target/scala-2.12/clustering-metrics-example_2.12-0.1.jar
85 | ```
86 | ### Using Databricks
87 | 1. Install the Lighthouse Maven library on your cluster:
88 | 
89 | Go to `compute` > `cluster` > `Libraries` > `Install New` > set `Source` = **Maven** and `coordinates` = **io.github.Databeans:lighthouse_2.12:0.1.0**
90 | 
91 | (Or add the lighthouse_2.12-0.1.0.jar to your cluster)
92 | 
93 | 2. Download this [notebook](https://github.com/Databeans/lighthouse/blob/main/notebooks/databricks/DeltaClusteringMetrics.scala) and import it to your workspace.
94 | 3. Create a new cell in your notebook and insert ```%run <path-to-notebook>```.
95 | 
96 | **PS:** Replace `<path-to-notebook>` with the actual path to the DeltaClusteringMetrics notebook.
97 | 4. Run the cell.
98 | 
99 | With these steps completed, you'll be able to use the DeltaClusteringMetrics library.
100 | 
101 | ## CLUSTERING METRICS
102 | 
103 | ### Syntax
104 | 
105 | - forName(deltaTable: String, spark: SparkSession): DeltaClusteringMetrics
106 |   * deltaTable: Name of the Delta table
107 |   * spark: SparkSession instance
108 | 
109 | 
110 | - forPath(deltaPath: String, spark: SparkSession): DeltaClusteringMetrics
111 |   * deltaPath: Path of the Delta table
112 |   * spark: SparkSession instance
113 | 
114 | 
115 | - computeForColumn(column: String): DataFrame
116 |   * column: name of the column to compute metrics for
117 | 
118 | 
119 | - computeForColumns(columns: String*): DataFrame
120 |   * columns: list of columns to compute metrics for
121 | 
122 | 
123 | - computeForAllColumns(): DataFrame
124 | 
125 | 
126 | ### Usage:
127 | Assuming that you have a Delta table:
128 | 
129 | Import DeltaClusteringMetrics:
130 | ```
131 | import fr.databeans.lighthouse.metrics.delta.DeltaClusteringMetrics
132 | ```
133 | 
134 | Compute clustering information for a given column:
135 | 
136 | ```
137 | val clusteringMetric = DeltaClusteringMetrics
138 |   .forPath("path/to/deltaTable", spark)
139 |   .computeForColumn("id")
140 | ```
141 | 
142 | Compute clustering information for multiple columns:
143 | 
144 | ```
145 | val clusteringMetrics = DeltaClusteringMetrics
146 |   .forName("DeltaTable", spark)
147 |   .computeForColumns("id", "value")
148 | ```
149 | 
150 | Compute clustering information for all columns of the table:
151 | 
152 | ```
153 | val clusteringMetrics = DeltaClusteringMetrics
154 |   .forName("DeltaTable", spark)
155 |   .computeForAllColumns()
156 | ```
157 | 
158 | ### Output:
159 | The library computes the clustering metrics and returns a DataFrame containing the following columns:
160 | 
161 | | column   | total_file_count | total_uniform_file_count | average_overlap | average_overlap_depth | file_depth_histogram |
162 | |----------|------------------|--------------------------|-----------------|-----------------------|----------------------|
163 | | col_name | 5                | 5                        | 3.0             | 4.0                   | {5.0 -> 0, 10.0 -... |
164 | 
165 | 
166 | ```total_file_count```
167 | Total number of files composing the Delta table.
168 | 
169 | ```total_uniform_file_count```
170 | Number of files in which the min and max values of the given ordering column are equal.
171 | 
172 | ```average_overlap```
173 | Average number of overlapping files for each file in the Delta table.
174 | The higher the average_overlap, the worse the clustering.
175 | 
176 | ```average_overlap_depth```
177 | The average number of files that will be read when an overlap occurs.
178 | The higher the average_overlap_depth, the worse the clustering.
179 | 
180 | ```file_depth_histogram```
181 | A histogram detailing the distribution of the overlap depth across the table, obtained by grouping the table's files by their overlap depth into the following buckets:
182 | * 0 to 16, with increments of 1.
183 | * For buckets larger than 16, increments of twice the width of the previous bucket (e.g. 32, 64, 128, …).
184 | 
185 | ### Use-case:
186 | 
187 | To gain a comprehensive understanding of the library in action, including:
188 | * how to use the lighthouse library to extract metrics
189 | * how to interpret the extracted metrics when planning maintenance operations on your data layout
190 | 
191 | we recommend reading the following blog post:
192 | - [Z-ordering: take the Guesswork out (part2)](https://databeans-blogs.medium.com/delta-z-ordering-take-the-guesswork-out-part2-1bdd03121aec)
193 | 
194 | ## NOTES
195 | 
196 | - Lighthouse cannot compute metrics for a column without statistics: column statistics must be available before clustering metrics can be computed, so if statistics are missing for a column, Lighthouse will not be able to compute metrics for it.
197 | - Clustering metrics cannot be computed for partitioning columns.
198 | - For a column containing only null values, the ```average_overlap``` and ```average_overlap_depth``` metrics are assigned a value of -1, while the ```file_depth_histogram``` metric is assigned a null value.
199 | 
200 | ## LIMITATIONS
201 | 
202 | - Lighthouse currently supports the following data types: Int, Long, Decimal, and String.
203 | - Lighthouse supports only Delta tables and may not work with other table formats.
204 | 
205 | ## TECHNOLOGIES
206 | 
207 | Lighthouse supports:
208 | - Scala 2.12.13
209 | - Spark 3.3.2
210 | - Delta 2.3.0
211 | 
212 | ## CONTRIBUTING
213 | 
214 | Lighthouse is an open-source project, and we welcome contributions from the community. If you have a new feature or improvement in mind, feel free to submit a pull request.
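If you'd like to validate your changes locally before opening a pull request, run the test suite (the same command used by the CI workflow):
```
sbt test
```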
215 | 216 | ## BLOGS 217 | 218 | - [Z-ordering: take the Guesswork out (part1)](https://databeans-blogs.medium.com/z-ordre-take-the-guesswork-out-bad0133d7895) 219 | - [Z-ordering: take the Guesswork out (part2)](https://databeans-blogs.medium.com/delta-z-ordering-take-the-guesswork-out-part2-1bdd03121aec) 220 | 221 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | 2 | name := "lighthouse" 3 | 4 | version := "0.1.0" 5 | 6 | scalaVersion := "2.12.13" 7 | 8 | val scalaTestVersion = "3.1.1" 9 | val sparkVersion = "3.3.2" 10 | val deltaVersion = "2.3.0" 11 | 12 | libraryDependencies += "org.scalactic" %% "scalactic" % scalaTestVersion 13 | libraryDependencies += "org.scalatest" %% "scalatest" % scalaTestVersion % "test" 14 | 15 | libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion % "provided" 16 | libraryDependencies += "io.delta" %% "delta-core" % deltaVersion % "provided" 17 | 18 | libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion % Test classifier "tests" 19 | libraryDependencies += "org.apache.spark" %% "spark-catalyst" % sparkVersion % Test classifier "tests" 20 | libraryDependencies += "org.apache.spark" %% "spark-core" % sparkVersion % Test classifier "tests" 21 | libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion % Test classifier "tests" 22 | libraryDependencies += "org.apache.spark" %% "spark-hive" % sparkVersion % Test classifier "tests" -------------------------------------------------------------------------------- /documentation/do_not_delete: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Databeans/lighthouse/35217ff9e7c1a956ce65793c1f569a15990fe89a/documentation/do_not_delete -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # lighthouse Example 2 | This example demonstrates how to use lighthouse to analyze the data layout of a Delta table. 3 | lighthouse is a library designed to monitor the health of the Lakehouse from a data layout perspective, and provide valuable insights about how well data is clustered. 4 | This example calculates the clustering metrics of a delta table, and prints the results to the console. It can be run if the prerequisites are satisfied. 5 | 6 | ## Prerequisites 7 | - Scala 2.12.13 8 | - Spark 3.3.2 9 | - Delta 2.3.0 10 | - lighthouse_2.12-0.1.0.jar 11 | 12 | ## Instructions 13 | To run the example: 14 | 1. Download or clone the lighthouse project. 15 | 2. run ```sbt compile``` to compile. 16 | 3. run ```sbt package``` to generate the jar file. 17 | 4. run ```mkdir examples/lib/ ``` to create the lib directory. 18 | 5. run ```cp target/scala-2.12/lighthouse_2.12-0.1.0.jar examples/lib/``` to copy the jar in the lib folder. 19 | 6. Navigate to the examples directory: ```cd examples```. 20 | 7. Run ```sbt compile``` to compile the example. 21 | 8. Run ```sbt "runMain Quickstart --master local[*]"``` to execute the example. 22 | 9. The clustering metrics for the specified Delta table will be printed to the console. 23 | 24 | By running this example, you can learn how to use lighthouse to calculate the clustering metrics for a Delta table and interpret the results. 25 | You can also use this example as a starting point for your own projects. 
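For reference, the example boils down to the following pattern (a condensed sketch of `Quickstart.scala`; the `QuickstartSketch` object name is illustrative and the extra example columns are omitted for brevity):

```
import fr.databeans.lighthouse.metrics.delta.DeltaClusteringMetrics
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.IntegerType

object QuickstartSketch {
  def main(args: Array[String]): Unit = {

    // Spark session with the Delta Lake extension and catalog enabled
    val spark: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName("QuickstartSketch")
      .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
      .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
      .getOrCreate()

    // Write a small Delta table to analyze
    spark.range(1, 5).toDF()
      .withColumn("id", col("id").cast(IntegerType))
      .write.mode("overwrite").format("delta").save("deltaTable")

    // Compute and display clustering metrics for the "id" column
    DeltaClusteringMetrics
      .forPath("deltaTable", spark)
      .computeForColumn("id")
      .show()
  }
}
```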
26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /examples/build.sbt: -------------------------------------------------------------------------------- 1 | name := "clustering-metrics-example" 2 | 3 | version := "0.1" 4 | scalaVersion := "2.12.13" 5 | 6 | val sparkVersion = "3.3.2" 7 | val deltaVersion = "2.3.0" 8 | 9 | lazy val root = (project in file(".")) 10 | .settings( 11 | Compile / unmanagedJars += file("lib/lighthouse_2.12-0.1.0.jar") 12 | ) 13 | 14 | libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion 15 | libraryDependencies += "io.delta" %% "delta-core" % deltaVersion -------------------------------------------------------------------------------- /examples/src/main/scala/Quickstart.scala: -------------------------------------------------------------------------------- 1 | import fr.databeans.lighthouse.metrics.delta.DeltaClusteringMetrics 2 | import org.apache.spark.sql.SparkSession 3 | import org.apache.spark.sql.functions.{col, lit} 4 | import org.apache.spark.sql.types.IntegerType 5 | 6 | object Quickstart { 7 | def main(args: Array[String]): Unit = { 8 | 9 | implicit val spark: SparkSession = SparkSession 10 | .builder() 11 | .master("local[*]") 12 | .appName("Quickstart") 13 | .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") 14 | .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") 15 | .getOrCreate() 16 | import spark.implicits._ 17 | 18 | spark.range(1, 5, 1).toDF() 19 | .withColumn("id", col("id").cast(IntegerType)) 20 | .withColumn("keys", lit(1)) 21 | .withColumn("values", col("id") * 3) 22 | .write.mode("overwrite") 23 | .format("delta") 24 | .save("deltaTable") 25 | 26 | val clusteringMetric = DeltaClusteringMetrics 27 | .forPath("deltaTable", spark) 28 | .computeForColumn("id") 29 | clusteringMetric.show() 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /images/average_overlap.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Databeans/lighthouse/35217ff9e7c1a956ce65793c1f569a15990fe89a/images/average_overlap.jpg -------------------------------------------------------------------------------- /images/average_overlap_depth.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Databeans/lighthouse/35217ff9e7c1a956ce65793c1f569a15990fe89a/images/average_overlap_depth.jpg -------------------------------------------------------------------------------- /images/do_not_delete: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Databeans/lighthouse/35217ff9e7c1a956ce65793c1f569a15990fe89a/images/do_not_delete -------------------------------------------------------------------------------- /images/total_uniform_file_count.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Databeans/lighthouse/35217ff9e7c1a956ce65793c1f569a15990fe89a/images/total_uniform_file_count.png -------------------------------------------------------------------------------- /notebooks/databricks/DeltaClusteringMetrics.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | import com.databricks.sql.transaction.tahoe.DeltaLog 3 | import 
fr.databeans.lighthouse.metrics.delta.DeltaClusteringMetricsBase 4 | import org.apache.spark.sql.{DataFrame, SparkSession} 5 | import org.apache.spark.sql.types.StructType 6 | 7 | case class DeltaClusteringMetrics(deltaLog: DeltaLog, spark: SparkSession) extends DeltaClusteringMetricsBase(spark) { 8 | 9 | override def schema: StructType = deltaLog.unsafeVolatileSnapshot.schema 10 | 11 | override def statsSchema: StructType = deltaLog.unsafeVolatileSnapshot.statsSchema 12 | 13 | override def stateWithStats: DataFrame = deltaLog.unsafeVolatileSnapshot.stateDF 14 | 15 | override def allColumns: Seq[String] = deltaLog.unsafeVolatileSnapshot.schema.map(_.name) 16 | 17 | override def partitionColumns: Seq[String] = deltaLog.unsafeVolatileSnapshot.metadata.partitionColumns 18 | } 19 | 20 | object DeltaClusteringMetrics { 21 | 22 | def forName(deltaTable: String, spark: SparkSession): DeltaClusteringMetrics = { 23 | val location = spark.sql(s"describe detail $deltaTable").select("location").collect()(0)(0).toString 24 | val deltaLog = DeltaLog.forTable(spark, location) 25 | DeltaClusteringMetrics(deltaLog, spark) 26 | } 27 | 28 | def forPath(deltaPath: String, spark: SparkSession): DeltaClusteringMetrics = { 29 | val deltaLog = DeltaLog.forTable(spark, deltaPath) 30 | DeltaClusteringMetrics(deltaLog, spark) 31 | } 32 | } 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.4.3 -------------------------------------------------------------------------------- /src/main/scala/fr/databeans/lighthouse/fileStatsIntervalTree/Interval.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.fileStatsIntervalTree 2 | 3 | import org.apache.spark.sql.types.{DataType, DecimalType, IntegerType, LongType} 4 | 5 | case class Interval(start: String, end: String, fileName: String, statsType: DataType) 6 | extends Comparable[Interval] { 7 | 8 | def intersects(min: String, max: String): Boolean = { 9 | greaterThenOrEqual(max, start) && greaterThenOrEqual(end, min) 10 | } 11 | 12 | def exclusiveIntersects(min: String, max: String): Boolean = { 13 | greaterThen(max, start) && greaterThen(end, min) 14 | } 15 | 16 | def greaterThenOrEqual(a: String, b: String): Boolean = { 17 | if (compare(a, b) == -1) false else true 18 | } 19 | 20 | def greaterThen(a: String, b: String): Boolean = { 21 | if (compare(a, b) == 1) true else false 22 | } 23 | 24 | override def compareTo(o: Interval): Int = { 25 | val compareStarts = compare(start, o.start) 26 | if (compareStarts != 0) { 27 | compareStarts 28 | } 29 | else compare(end, o.end) 30 | } 31 | 32 | def compare(a: String, b: String): Int = { 33 | statsType match { 34 | case IntegerType => compare[Int](a.toInt, b.toInt) 35 | case LongType => compare[Long](a.toLong, b.toLong) 36 | case DecimalType() => { 37 | compare[BigDecimal]( 38 | new BigDecimal(new java.math.BigDecimal(a)), 39 | new BigDecimal(new java.math.BigDecimal(b))) 40 | } 41 | case _ => compare[String](a, b) 42 | } 43 | } 44 | 45 | def compare[T: Ordering](a: T, b: T): Int = { 46 | val ord = implicitly[Ordering[T]] 47 | import ord.mkOrderingOps 48 | if (a < b) -1 49 | else if (a > b) 1 50 | else 0 51 | } 52 | 53 | def lowerThenPoint(median: String): Boolean = { 54 | val comp = compare(end, median) 55 | if (comp == -1) true else false 56 | } 57 | 
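  // Holds when the whole interval lies strictly above the given point, i.e. start > median.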
58 | def greaterThenPoint(median: String): Boolean = { 59 | val comp = compare(median, start) 60 | if (comp == -1) true else false 61 | } 62 | 63 | def greaterThenOrEqualPoint(median: String): Boolean = { 64 | val comp = compare(median, start) 65 | if (comp == 1) false else true 66 | } 67 | 68 | def lowerThenPointOrEqual(median: String): Boolean = { 69 | val comp = compare(end, median) 70 | if (comp == 1) false else true 71 | } 72 | 73 | def startsBefore(point: String): Boolean = { 74 | val comp = compare(start, point) 75 | if (comp == 1) false else true 76 | } 77 | 78 | def endsAfter(point: String): Boolean = { 79 | val comp = compare(end, point) 80 | if (comp == 1) true else false 81 | } 82 | 83 | def min(a: String, b: String): String = { 84 | val comp = compare(a, b) 85 | if (comp == 1) b else a 86 | } 87 | 88 | def max(a: String, b: String): String = { 89 | val comp = compare(a, b) 90 | if (comp == 1) a else b 91 | } 92 | } -------------------------------------------------------------------------------- /src/main/scala/fr/databeans/lighthouse/fileStatsIntervalTree/IntervalBoundary.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.fileStatsIntervalTree 2 | 3 | import org.apache.spark.sql.types.{DataType, DecimalType, IntegerType, LongType} 4 | 5 | case class IntervalBoundary(value: String, statsType: DataType) extends Comparable[IntervalBoundary] { 6 | 7 | def greaterThenOrEqual(a: String, b: String): Boolean = { 8 | if (compare(a, b) == -1) false else true 9 | } 10 | 11 | def greaterThenOrEqual(b: String): Boolean = { 12 | if (compare(value, b) == -1) false else true 13 | } 14 | 15 | def greaterThen(b: String): Boolean = { 16 | if (compare(value, b) == 1) true else false 17 | } 18 | 19 | override def compareTo(o: IntervalBoundary): Int = { 20 | compare(value, o.value) 21 | } 22 | 23 | def compare(a: String, b: String): Int = { 24 | statsType match { 25 | case IntegerType => compare[Int](a.toInt, b.toInt) 26 | case LongType => compare[Long](a.toLong, b.toLong) 27 | case DecimalType() => 28 | compare[BigDecimal]( 29 | new BigDecimal(new java.math.BigDecimal(a)), 30 | new BigDecimal(new java.math.BigDecimal(b))) 31 | case _ => compare[String](a, b) 32 | } 33 | } 34 | 35 | def compare[T: Ordering](a: T, b: T): Int = { 36 | val ord = implicitly[Ordering[T]] 37 | import ord.mkOrderingOps 38 | if (a < b) -1 39 | else if (a > b) 1 40 | else 0 41 | } 42 | 43 | def min(a: String, b: String): String = { 44 | val comp = compare(a, b) 45 | if (comp == 1) b else a 46 | } 47 | } -------------------------------------------------------------------------------- /src/main/scala/fr/databeans/lighthouse/fileStatsIntervalTree/IntervalTree.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.fileStatsIntervalTree 2 | 3 | case class IntervalTree(head: Node, intervals: Seq[Interval]) { 4 | 5 | def size: Int = intervals.size 6 | 7 | def isEmpty: Boolean = intervals.isEmpty 8 | 9 | def nonEmpty: Boolean = intervals.nonEmpty 10 | 11 | def getIntervals(i: Interval, inclusive: Boolean = true): List[Interval] = 12 | head.query(i, inclusive) 13 | } 14 | 15 | object IntervalTree { 16 | def apply(intervals: Seq[Interval]): IntervalTree = 17 | IntervalTree(Node(intervals), intervals) 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/fr/databeans/lighthouse/fileStatsIntervalTree/Node.scala: 
-------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.fileStatsIntervalTree 2 | 3 | import org.apache.spark.sql.types.{DecimalType, IntegerType, LongType} 4 | 5 | import scala.collection.SortedMap 6 | import scala.collection.immutable.TreeSet 7 | import scala.collection.mutable.ListBuffer 8 | 9 | case class Node( 10 | center: String, 11 | left: Option[Node], 12 | right: Option[Node], 13 | intervals: SortedMap[Interval, List[Interval]]) { 14 | 15 | 16 | def size: Int = intervals.size 17 | 18 | def isEmpty: Boolean = intervals.isEmpty 19 | 20 | def query(i: Interval, inclusive: Boolean = true): List[Interval] = { 21 | 22 | val result = ListBuffer.empty[Interval] 23 | 24 | intervals.takeWhile { 25 | case (key, list) => 26 | val overlap = if (inclusive) { 27 | key.intersects(i.start, i.end) 28 | } 29 | else { 30 | key.exclusiveIntersects(i.start, i.end) 31 | } 32 | if (overlap) list.foreach(result += _) 33 | if (i.compare(key.start, i.end) == 1) false else true 34 | } 35 | 36 | if (!i.greaterThenPoint(center) && left.isDefined) 37 | result ++= left.get.query(i, inclusive) 38 | 39 | if (!i.lowerThenPoint(center) && right.isDefined) 40 | result ++= right.get.query(i, inclusive) 41 | 42 | result.toList 43 | } 44 | } 45 | 46 | object Node { 47 | 48 | def medianOf(set: TreeSet[_ >: Int with Long with BigDecimal with String]): Option[String] = { 49 | val mid = set.size / 2 50 | 51 | set.zipWithIndex.find(_._2 == mid) match { 52 | case None => None 53 | case Some((point, _)) => Some(point.toString) 54 | } 55 | } 56 | 57 | def getMedian(intervals: Seq[Interval]): Option[String] = { 58 | val statsType = intervals.map(_.statsType).head 59 | statsType match { 60 | case IntegerType => { 61 | var endpoints = TreeSet.empty[Int] 62 | intervals.foreach { interval => 63 | endpoints += interval.start.toInt 64 | endpoints += interval.end.toInt 65 | } 66 | medianOf(endpoints) 67 | } 68 | case LongType => { 69 | var endpoints = TreeSet.empty[Long] 70 | intervals.foreach { interval => 71 | endpoints += interval.start.toLong 72 | endpoints += interval.end.toLong 73 | } 74 | medianOf(endpoints) 75 | } 76 | case DecimalType() => { 77 | var endpoints = TreeSet.empty[BigDecimal] 78 | intervals.foreach { interval => 79 | endpoints += new BigDecimal(new java.math.BigDecimal(interval.start)) 80 | endpoints += new BigDecimal(new java.math.BigDecimal(interval.end)) 81 | } 82 | medianOf(endpoints) 83 | } 84 | 85 | case _ => { 86 | var endpoints = TreeSet.empty[String] 87 | intervals.foreach { interval => 88 | endpoints += interval.start 89 | endpoints += interval.end 90 | } 91 | medianOf(endpoints) 92 | } 93 | } 94 | } 95 | 96 | def apply(intervals: Seq[Interval]): Node = { 97 | 98 | var intervalsMap = SortedMap.empty[Interval, List[Interval]] 99 | val median = getMedian(intervals).get 100 | 101 | var leftNodes = List.empty[Interval] 102 | var rightNodes = List.empty[Interval] 103 | 104 | intervals.foreach { interval => 105 | if (interval.lowerThenPoint(median)) leftNodes ::= interval 106 | else if (interval.greaterThenPoint(median)) rightNodes ::= interval 107 | else intervalsMap ++= Seq(interval -> (interval :: intervalsMap.getOrElse(interval, List.empty))) 108 | } 109 | 110 | if (leftNodes.nonEmpty && rightNodes.nonEmpty) { 111 | Node(median, Some(Node(leftNodes)), Some(Node(rightNodes)), intervalsMap) 112 | } else if (leftNodes.nonEmpty) 113 | Node(median, Some(Node(leftNodes)), None, intervalsMap) 114 | else if (rightNodes.nonEmpty) 115 | Node(median, None, 
Some(Node(rightNodes)), intervalsMap) 116 | else 117 | Node(median, None, None, intervalsMap) 118 | } 119 | } -------------------------------------------------------------------------------- /src/main/scala/fr/databeans/lighthouse/metrics/ClusteringMetrics.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.metrics 2 | 3 | import fr.databeans.lighthouse.fileStatsIntervalTree.{Interval, IntervalBoundary, IntervalTree} 4 | 5 | case class ClusteringMetrics( 6 | column: String, 7 | total_file_count: Long, 8 | total_uniform_file_count: Long, 9 | averageOverlapDepth: Double, 10 | fileDepthHistogram: Map[Double, Int], 11 | averageOverlaps: Double 12 | ) 13 | 14 | 15 | class ClusteringMetricsBuilder { 16 | 17 | def computeMetrics(column: String, intervals: Seq[Interval]): ClusteringMetrics = { 18 | 19 | val uniformFilesCount = countUniformFiles(intervals) 20 | 21 | val filteredIntervals = intervals.filter(i => i.start != null & i.end != null) 22 | 23 | if (filteredIntervals.nonEmpty) { 24 | val representativePoints = filteredIntervals 25 | .flatMap(i => Seq(IntervalBoundary(i.start, i.statsType), IntervalBoundary(i.end, i.statsType))) 26 | .distinct 27 | .sorted 28 | .map(p => Interval(p.value, p.value, p.value, p.statsType)) 29 | 30 | val tree = IntervalTree(filteredIntervals) 31 | var depthPerSubInterval: Seq[(Interval, Int)] = Seq() 32 | var histogramInput: Seq[(Interval, Int)] = Seq() 33 | var i = 0 34 | while (i < representativePoints.length) { 35 | val upperBoundOverlappingIntervals = tree.getIntervals(representativePoints(i)) 36 | val upperBoundDepth = upperBoundOverlappingIntervals.size 37 | if (i > 0) { 38 | val interval = Interval( 39 | representativePoints(i - 1).start, 40 | representativePoints(i).end, 41 | s"]${representativePoints(i - 1).start},${representativePoints(i).end}[", 42 | representativePoints(i - 1).statsType 43 | ) 44 | val overlappingIntervals = tree.getIntervals(interval, false) 45 | val openIntervalDepth = overlappingIntervals.size 46 | 47 | if (openIntervalDepth != depthPerSubInterval.last._2) { 48 | depthPerSubInterval = depthPerSubInterval ++ Seq((interval, openIntervalDepth)) 49 | } 50 | 51 | if (upperBoundDepth != depthPerSubInterval.last._2) { 52 | depthPerSubInterval = depthPerSubInterval ++ Seq((representativePoints(i), upperBoundDepth)) 53 | } 54 | histogramInput = histogramInput ++ 55 | (upperBoundOverlappingIntervals ++ overlappingIntervals) 56 | .distinct 57 | .map(i => (i, Seq(depthPerSubInterval.last._2, openIntervalDepth, upperBoundDepth).max)) 58 | 59 | i = i + 1 60 | } 61 | else { 62 | depthPerSubInterval = depthPerSubInterval ++ Seq((representativePoints(i), upperBoundDepth)) 63 | histogramInput = histogramInput ++ 64 | upperBoundOverlappingIntervals.map(i => (i, upperBoundDepth)) 65 | i = i + 1 66 | } 67 | } 68 | 69 | val fileDepthHistogram = computeDepthHistogram(histogramInput) 70 | val averageOverlapDepth = computeAverageOverlapDepth(depthPerSubInterval) 71 | val averageOverlaps = computeAverageOverlaps(filteredIntervals) 72 | 73 | ClusteringMetrics(column, intervals.size.toLong, uniformFilesCount, averageOverlapDepth, fileDepthHistogram, averageOverlaps) 74 | } 75 | else { 76 | ClusteringMetrics(column, intervals.size.toLong, uniformFilesCount, -1.toDouble, null.asInstanceOf[Map[Double, Int]], -1.toDouble) 77 | } 78 | } 79 | 80 | private def computeAverageOverlapDepth(depthPerSubInterval: Seq[(Interval, Int)]): Double = { 81 | val depths = depthPerSubInterval.filter(_._2 > 1) 
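    // Only sub-intervals covered by more than one file count towards the average; with no overlaps the depth defaults to 1.0.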
82 | if (depths.nonEmpty) { 83 | "%.4f".format(depths.map(_._2).sum.toFloat / depthPerSubInterval.count(_._2 > 1)).toDouble 84 | } 85 | else { 86 | 1.0 87 | } 88 | } 89 | 90 | private def computeDepthHistogram(histogramInput: Seq[(Interval, Int)]): Map[Double, Int] = { 91 | val data = histogramInput.groupBy(_._1).values.map(_.maxBy(_._2)).map(_._2.toDouble).toList 92 | Distribution.histogram(data) 93 | } 94 | 95 | private def computeAverageOverlaps(intervals: Seq[Interval]): Double = { 96 | val tree = IntervalTree(intervals) 97 | val intervalsOverlaps = intervals 98 | .map(i => tree.getIntervals(i).size - 1) 99 | 100 | "%.4f".format(intervalsOverlaps.sum.toFloat / intervalsOverlaps.size).toDouble 101 | } 102 | 103 | private def countUniformFiles(intervals: Seq[Interval]): Int = { 104 | intervals.count(i => i.start == i.end) 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/main/scala/fr/databeans/lighthouse/metrics/Distribution.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.metrics 2 | 3 | object Distribution { 4 | 5 | def roundToPowerOfTwo(element: Int): Double = { 6 | val log = Math.log(element) / Math.log(2); 7 | val roundLog = Math.round(log); 8 | val powerOfTwo = Math.pow(2, roundLog); 9 | if (powerOfTwo > element) { 10 | Math.pow(2, roundLog - 1) 11 | } 12 | else { 13 | Math.pow(2, roundLog) 14 | } 15 | } 16 | 17 | def getBounds(maxBin: Double): List[Double] = { 18 | var powerOfTwo = 32 19 | var bins: List[Int] = List.range(1, 17) 20 | while (powerOfTwo < maxBin) { 21 | bins = bins ++ Seq(powerOfTwo) 22 | powerOfTwo = powerOfTwo * 2 23 | } 24 | bins.map(_.toDouble) 25 | } 26 | 27 | def computePopulatedBuckets(data: List[Double]): Map[Double, Int] = { 28 | data.map(_.floor.toInt).map { e => 29 | if (e > 16) 30 | roundToPowerOfTwo(e) 31 | else e 32 | }.groupBy(identity).mapValues(_.size) 33 | } 34 | 35 | def computeUnPopulatedBuckets(maxBin: Double, populatedBuckets: Map[Double, Int]): Map[Double, Int] = { 36 | getBounds(maxBin).map(e => (e, 0)).toMap.filter(x => !populatedBuckets.keys.toList.contains(x._1)) 37 | } 38 | 39 | def histogram(data: List[Double]): Map[Double, Int] = { 40 | val maxBin = data.max 41 | val populatedBuckets = computePopulatedBuckets(data) 42 | val unPopulatedBuckets = computeUnPopulatedBuckets(maxBin, populatedBuckets) 43 | Map((populatedBuckets ++ unPopulatedBuckets).toSeq.sortBy(_._1): _*) 44 | } 45 | } -------------------------------------------------------------------------------- /src/main/scala/fr/databeans/lighthouse/metrics/delta/DeltaClusteringMetrics.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.metrics.delta 2 | 3 | import org.apache.spark.sql.delta.DeltaLog 4 | import org.apache.spark.sql.types.StructType 5 | import org.apache.spark.sql.{DataFrame, SparkSession} 6 | 7 | 8 | case class DeltaClusteringMetrics(deltaLog: DeltaLog, spark: SparkSession) extends DeltaClusteringMetricsBase(spark) { 9 | 10 | override def schema: StructType = deltaLog.unsafeVolatileSnapshot.schema 11 | 12 | override def statsSchema: StructType = deltaLog.unsafeVolatileSnapshot.statsSchema 13 | 14 | override def stateWithStats: DataFrame = deltaLog.unsafeVolatileSnapshot.stateDF 15 | 16 | override def allColumns: Seq[String] = deltaLog.unsafeVolatileSnapshot.schema.map(_.name) 17 | 18 | override def partitionColumns: Seq[String] = 
deltaLog.unsafeVolatileSnapshot.metadata.partitionColumns 19 | } 20 | 21 | object DeltaClusteringMetrics { 22 | 23 | def forName(deltaTable: String, spark: SparkSession): DeltaClusteringMetrics = { 24 | val location = spark.sql(s"describe detail $deltaTable").select("location").collect()(0)(0).toString 25 | val deltaLog = DeltaLog.forTable(spark, location) 26 | DeltaClusteringMetrics(deltaLog, spark) 27 | } 28 | 29 | def forPath(deltaPath: String, spark: SparkSession): DeltaClusteringMetrics = { 30 | val deltaLog = DeltaLog.forTable(spark, deltaPath) 31 | DeltaClusteringMetrics(deltaLog, spark) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/fr/databeans/lighthouse/metrics/delta/DeltaClusteringMetricsBase.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.metrics.delta 2 | 3 | import fr.databeans.lighthouse.fileStatsIntervalTree.Interval 4 | import fr.databeans.lighthouse.metrics.{ClusteringMetrics, ClusteringMetricsBuilder} 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.types._ 7 | import org.apache.spark.sql.{DataFrame, SparkSession} 8 | 9 | abstract class DeltaClusteringMetricsBase(spark: SparkSession) extends ClusteringMetricsBuilder { 10 | 11 | val STATS_COLUMN = "stats" 12 | val MIN_PREFIX = "minValues" 13 | val MAX_PREFIX = "maxValues" 14 | val FILE_RELATIVE_PATH = "add.path" 15 | 16 | def schema: StructType 17 | 18 | def statsSchema: StructType 19 | 20 | def stateWithStats: DataFrame 21 | 22 | def allColumns: Seq[String] 23 | 24 | def partitionColumns: Seq[String] 25 | 26 | def computeForColumn(column: String): DataFrame = { 27 | import spark.implicits._ 28 | 29 | Seq(column).map(col => compute(col)).toDF() 30 | } 31 | 32 | def computeForColumns(columns: String*): DataFrame = { 33 | import spark.implicits._ 34 | columns.map(col => compute(col)).toDF() 35 | } 36 | 37 | def computeForAllColumns(): DataFrame = { 38 | import spark.implicits._ 39 | val colsWithoutStats = getColumnsWithoutStats() 40 | val omittedCols = partitionColumns.union(colsWithoutStats) 41 | allColumns.diff(omittedCols).map(col => compute(col)).toDF() 42 | } 43 | 44 | private def compute(column: String): ClusteringMetrics = { 45 | val intervals = prepareIntervals(column) 46 | computeMetrics(column, intervals) 47 | } 48 | 49 | private def prepareIntervals(column: String): Seq[Interval] = { 50 | 51 | assert(!isPartitioningColumn(column), 52 | s"'$column' is a partitioning column. 
Clustering metrics cannot be computed for partitioning columns") 53 | 54 | val dataType = getStatsType(column) 55 | 56 | assert(checkIfStatsExists(column), s"no statistics found for column '$column'") 57 | 58 | stateWithStats 59 | .filter(col("add").isNotNull) 60 | .withColumn(STATS_COLUMN, from_json(col(s"add.$STATS_COLUMN"), statsSchema)) 61 | .select( 62 | col(s"$FILE_RELATIVE_PATH"), 63 | col(s"${STATS_COLUMN}.${MIN_PREFIX}.$column").cast(StringType).as("min"), 64 | col(s"${STATS_COLUMN}.${MAX_PREFIX}.$column").cast(StringType).as("max") 65 | ) 66 | .collect() 67 | .map { row => 68 | Interval(row.getString(1), row.getString(2), row.getString(0), dataType) 69 | } 70 | } 71 | 72 | private def getStatsType(column: String): DataType = { 73 | val extractedColumn = schema 74 | .filter(_.name == column) 75 | 76 | assert(extractedColumn.nonEmpty, s"column $column not found in columns ${allColumns.mkString(",")}") 77 | extractedColumn.head.dataType 78 | } 79 | 80 | private def checkIfStatsExists(column: String): Boolean = { 81 | statsSchema.fields.filter(_.name == MIN_PREFIX) 82 | .map(_.dataType) 83 | .flatMap { 84 | case StructType(f) => f 85 | }.map(_.name) 86 | .contains(column) 87 | } 88 | 89 | 90 | private def getColumnsWithoutStats(): Seq[String] ={ 91 | allColumns.filter(col => !checkIfStatsExists(col)) 92 | } 93 | 94 | private def isPartitioningColumn(column: String): Boolean = { 95 | partitionColumns.contains(column) 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/test/scala/fr/databeans/lighthouse/fileStatsIntervalTree/IntervalSpec.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.fileStatsIntervalTree 2 | 3 | import org.scalatest.funspec.AnyFunSpec 4 | import org.scalatest.matchers.should.Matchers 5 | import org.apache.spark.sql.types._ 6 | 7 | final class IntervalSpec extends AnyFunSpec with Matchers { 8 | 9 | describe("An interval [b,c]") { 10 | 11 | val interval = Interval("b", "c", "file1", StringType) 12 | 13 | it("should not intersect with [d,e]") { 14 | interval.intersects("d", "e") shouldBe false 15 | } 16 | 17 | it("should intersect with [b,d]") { 18 | interval.intersects("b", "d") shouldBe true 19 | } 20 | } 21 | 22 | describe("An interval [b,c]") { 23 | 24 | val interval = Interval("b", "c", "file1", StringType) 25 | 26 | it("should be greater than [a,e]") { 27 | val interval2 = Interval("a", "e", "file2", StringType) 28 | interval.compareTo(interval2) shouldBe 1 29 | } 30 | 31 | it("should be lower than [f,g]") { 32 | val interval2 = Interval("f", "g", "file2", StringType) 33 | interval.compareTo(interval2) shouldBe -1 34 | } 35 | 36 | it("should be equal to [b,c]") { 37 | val interval2 = Interval("b", "c", "file2", StringType) 38 | interval.compareTo(interval2) shouldBe 0 39 | } 40 | } 41 | 42 | describe("An interval [1,5]") { 43 | 44 | val interval = Interval("1", "5", "file1", IntegerType) 45 | 46 | it("should not intersect with [-1,0]") { 47 | interval.intersects("-1", "0") shouldBe false 48 | } 49 | 50 | it("should not intersect with [6,17]") { 51 | interval.intersects("6", "17") shouldBe false 52 | } 53 | 54 | it("should intersect with [0,1]") { 55 | interval.intersects("0", "1") shouldBe true 56 | } 57 | 58 | it("should intersect with [5,6]") { 59 | interval.intersects("5", "6") shouldBe true 60 | } 61 | 62 | it("should intersect with [2,3]") { 63 | interval.intersects("2", "3") shouldBe true 64 | } 65 | 66 | it("should intersect with 
[0,4]") { 67 | interval.intersects("0", "4") shouldBe true 68 | } 69 | 70 | it("should intersect with [3,10]") { 71 | interval.intersects("3", "10") shouldBe true 72 | } 73 | 74 | it("should intersect with [0,10]") { 75 | interval.intersects("0", "10") shouldBe true 76 | } 77 | } 78 | 79 | describe("An interval [7,17]") { 80 | val interval = Interval("7", "17", "file1", IntegerType) 81 | 82 | 83 | it("should intersect with [5,20]") { 84 | interval.intersects("5", "20") shouldBe true 85 | } 86 | } 87 | } 88 | 89 | -------------------------------------------------------------------------------- /src/test/scala/fr/databeans/lighthouse/fileStatsIntervalTree/IntervalTreeSpec.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.fileStatsIntervalTree 2 | 3 | import org.apache.spark.sql.types.IntegerType 4 | import org.scalatest.funspec.AnyFunSpec 5 | import org.scalatest.matchers.should.Matchers 6 | 7 | final class IntervalTreeSpec extends AnyFunSpec with Matchers { 8 | 9 | describe("Tree holding a single interval [1, 5]") { 10 | val intervals = Seq[Interval](Interval("1", "5", "file1", IntegerType)) 11 | val tree = IntervalTree(intervals) 12 | 13 | it("should not be empty") { 14 | tree.isEmpty shouldBe false 15 | } 16 | 17 | it("should return 1 result on query interval [4, 8]") { 18 | tree.getIntervals(Interval("4", "8", "file1", IntegerType)).size shouldEqual 1 19 | } 20 | } 21 | 22 | describe("depth.Node holding many intervals") { 23 | val intervals = Seq[Interval]( 24 | Interval("1", "5", "file2", IntegerType), 25 | Interval("6", "9", "file3", IntegerType), 26 | Interval("10", "14", "file4", IntegerType) 27 | ) 28 | 29 | val tree = IntervalTree(intervals) 30 | 31 | it("should not be empty") { 32 | tree.isEmpty shouldBe false 33 | } 34 | 35 | it("should return 2 results on query interval [6, 19]") { 36 | tree.getIntervals(Interval("6", "19", "file5", IntegerType)).size shouldEqual 2 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/test/scala/fr/databeans/lighthouse/fileStatsIntervalTree/NodeSpec.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.fileStatsIntervalTree 2 | 3 | import org.apache.spark.sql.types.IntegerType 4 | import org.scalatest.funspec.AnyFunSpec 5 | import org.scalatest.matchers.should.Matchers 6 | 7 | final class NodeSpec extends AnyFunSpec with Matchers { 8 | 9 | describe("depth.Node holding a single interval [1, 5]") { 10 | val intervals = Seq[Interval](Interval("1", "5", "file1", IntegerType)) 11 | val node = Node(intervals) 12 | 13 | it("should not be empty") { 14 | assert(node.isEmpty === false) 15 | } 16 | 17 | it("should has no left or right children") { 18 | node.left shouldBe None 19 | node.right shouldBe None 20 | } 21 | 22 | it("should return 1 result on query interval [4, 8]") { 23 | node.query(Interval("4", "8", "file2", IntegerType)).size shouldEqual 1 24 | } 25 | 26 | it("should return 1 result on query interval [2, 3]") { 27 | node.query(Interval("2", "3", "file2", IntegerType)).size shouldEqual 1 28 | } 29 | 30 | it("should return 1 result on query interval [0, 1]") { 31 | node.query(Interval("0", "1", "file2", IntegerType)).size shouldEqual 1 32 | } 33 | 34 | it("should return 0 result on query interval [-1, 0]") { 35 | node.query(Interval("-1", "0", "file2", IntegerType)).size shouldEqual 0 36 | } 37 | 38 | it("should return 1 result on query interval 
[6, 7]") { 39 | node.query(Interval("6", "7", "file2", IntegerType)).size shouldEqual 0 40 | } 41 | } 42 | 43 | describe("Node holding many intervals") { 44 | val intervals = Seq( 45 | Interval("1", "5", "file1", IntegerType), 46 | Interval("6", "9", "file2", IntegerType), 47 | Interval("10", "14", "file3", IntegerType) 48 | ) 49 | 50 | val node = Node(intervals) 51 | 52 | it("should not be empty") { 53 | node.isEmpty shouldBe false 54 | } 55 | 56 | it("should has 1 left and 1 right child") { 57 | node.left.get.size shouldEqual 1 58 | node.right.get.size shouldEqual 1 59 | } 60 | 61 | it("should return 2 results on query interval [6, 19]") { 62 | node.query(Interval("6", "19", "file4", IntegerType)).size shouldEqual 2 63 | } 64 | } 65 | 66 | describe("Node holding same interval multiple times") { 67 | val intervals = Seq[Interval]( 68 | Interval("1", "355", "file1", IntegerType), 69 | Interval("1", "355", "file2", IntegerType), 70 | Interval("1", "355", "file3", IntegerType) 71 | ) 72 | 73 | val node = Node(intervals) 74 | 75 | it("should not be empty") { 76 | node.isEmpty shouldBe false 77 | } 78 | 79 | it("should has no left or right children") { 80 | node.left shouldBe None 81 | node.right shouldBe None 82 | } 83 | 84 | it("should return 3 results on query interval [6, 19]") { 85 | node.query(Interval("6", "19", "file4", IntegerType)).size shouldEqual 3 86 | } 87 | 88 | it("should return 3 results on query interval [300, 400]") { 89 | node.query(Interval("300", "400", "file4", IntegerType)).size shouldEqual 3 90 | } 91 | 92 | it("should return 0 results on query interval [360, 400]") { 93 | node.query(Interval("360", "400", "file4", IntegerType)).size shouldEqual 0 94 | } 95 | } 96 | 97 | describe("Node holding sorted intervals") { 98 | val intervals = Seq[Interval]( 99 | Interval("1", "1139", "file1", IntegerType), 100 | Interval("1139", "2368", "file2", IntegerType), 101 | Interval("2368", "3503", "file3", IntegerType), 102 | Interval("3503", "4745", "file4", IntegerType), 103 | Interval("4745", "5999", "file5", IntegerType), 104 | Interval("5999", "7200", "file6", IntegerType) 105 | ) 106 | 107 | val node = Node(intervals) 108 | 109 | it("should not be empty") { 110 | node.isEmpty shouldBe false 111 | } 112 | 113 | it("should has 1 left and 1 right child") { 114 | println(node) 115 | node.left.get.size shouldEqual 2 116 | node.right.get.size shouldEqual 2 117 | } 118 | 119 | it("should return 2 results on the first and last files and 3 else") { 120 | node.query(Interval("1", "1139", "file1", IntegerType)).size shouldEqual 2 121 | node.query(Interval("1139", "2368", "file2", IntegerType)).size shouldEqual 3 122 | node.query(Interval("2368", "3503", "file3", IntegerType)).size shouldEqual 3 123 | node.query(Interval("3503", "4745", "file4", IntegerType)).size shouldEqual 3 124 | node.query(Interval("4745", "5999", "file5", IntegerType)).size shouldEqual 3 125 | node.query(Interval("5999", "7200", "file6", IntegerType)).size shouldEqual 2 126 | } 127 | } 128 | 129 | describe("Bug1 Node holding many intervals") { 130 | val intervals = Seq[Interval]( 131 | Interval("1", "5", "file2", IntegerType), 132 | Interval("0", "7", "file3", IntegerType), 133 | Interval("11", "16", "file3", IntegerType), 134 | Interval("7", "16", "file5", IntegerType), 135 | Interval("5", "9", "file6", IntegerType), 136 | Interval("4", "16", "file6", IntegerType), 137 | Interval("0", "13", "file6", IntegerType), 138 | Interval("9", "12", "file6", IntegerType), 139 | Interval("7", "9", "file6", IntegerType), 140 | 
Interval("20", "30", "file6", IntegerType), 141 | Interval("31", "40", "file6", IntegerType) 142 | ) 143 | 144 | val node = Node(intervals) 145 | 146 | it("should not be empty") { 147 | node.isEmpty shouldBe false 148 | } 149 | 150 | it("should return 2 results on the first and last files and 3 else") { 151 | node.query(Interval("11", "16", "file1", IntegerType)).size shouldEqual 5 152 | } 153 | } 154 | 155 | describe("Node holding many intervals all in one group") { 156 | val intervals = Seq[Interval]( 157 | Interval("16", "32", "file2", IntegerType), 158 | Interval("4", "40", "file3", IntegerType), 159 | Interval("10", "38", "file3", IntegerType), 160 | Interval("2", "24", "file5", IntegerType), 161 | Interval("6", "28", "file6", IntegerType) 162 | ) 163 | 164 | val node = Node(intervals) 165 | 166 | it("should not be empty") { 167 | node.isEmpty shouldBe false 168 | } 169 | 170 | it("should return 2 results on the first and last files and 3 else") { 171 | node.query(Interval("11", "16", "file1", IntegerType)).size shouldEqual 5 172 | } 173 | } 174 | 175 | } 176 | -------------------------------------------------------------------------------- /src/test/scala/fr/databeans/lighthouse/metrics/ClusteringMetricsSpec.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.metrics 2 | 3 | import fr.databeans.lighthouse.fileStatsIntervalTree.{Interval, Node} 4 | import org.apache.spark.sql.types.{DecimalType, IntegerType} 5 | import org.scalatest.funspec.AnyFunSpec 6 | import org.scalatest.matchers.should.Matchers 7 | 8 | class ClusteringMetricsSpec extends AnyFunSpec with Matchers { 9 | 10 | def buildHistogram(maxBin: Int, populatedBuckets: Map[Double, Int]): Map[Double, Int] = { 11 | val missingBins = Distribution.computeUnPopulatedBuckets(maxBin, populatedBuckets) 12 | missingBins ++ populatedBuckets 13 | } 14 | 15 | describe("compute the overlap metrics") { 16 | 17 | it("should return 2.0 as overlap depth") { 18 | 19 | val intervals1 = Seq[Interval]( 20 | Interval("1", "2", "file2", IntegerType), 21 | Interval("3", "4", "file3", IntegerType), 22 | Interval("0", "5", "file5", IntegerType), 23 | Interval("4", "10", "file6", IntegerType), 24 | Interval("14", "15", "file8", IntegerType), 25 | Interval("14", "20", "file9", IntegerType) 26 | ) 27 | 28 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 29 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals1) 30 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 31 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 32 | val averageOverlaps = overlapMetrics.averageOverlaps 33 | 34 | avgOverlapDepth shouldBe 2.2 35 | 36 | overlapDepthHistogram shouldBe 37 | buildHistogram(16, Map((2.0, 3), (3.0, 3))) 38 | 39 | averageOverlaps shouldBe 1.6667 40 | } 41 | 42 | it("should return the number of files when all files have the same min max") { 43 | 44 | val intervals2 = Seq[Interval]( 45 | Interval("1", "2", "file2", IntegerType), 46 | Interval("1", "2", "file3", IntegerType), 47 | Interval("1", "2", "file5", IntegerType), 48 | Interval("1", "2", "file6", IntegerType) 49 | ) 50 | 51 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 52 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals2) 53 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 54 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 55 | val averageOverlaps = overlapMetrics.averageOverlaps 56 | 57 | 
avgOverlapDepth shouldBe 4.0000 58 | 59 | overlapDepthHistogram shouldBe 60 | buildHistogram(16, Map((4.0, 4))) 61 | 62 | averageOverlaps shouldBe 3.0 63 | } 64 | 65 | it("should return 3.7778 as overlap depth and compute the histogram") { 66 | 67 | val intervals3 = Seq[Interval]( 68 | Interval("1", "5", "file2", IntegerType), 69 | Interval("0", "7", "file3", IntegerType), 70 | Interval("11", "16", "file4", IntegerType), 71 | Interval("7", "16", "file5", IntegerType), 72 | Interval("5", "9", "file6", IntegerType), 73 | Interval("4", "16", "file7", IntegerType), 74 | Interval("0", "13", "file8", IntegerType), 75 | Interval("9", "12", "file9", IntegerType), 76 | Interval("7", "9", "file10", IntegerType), 77 | Interval("20", "30", "file11", IntegerType), 78 | Interval("31", "40", "file12", IntegerType) 79 | ) 80 | 81 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 82 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals3) 83 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 84 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 85 | val averageOverlaps = overlapMetrics.averageOverlaps 86 | val total_file_count = overlapMetrics.total_file_count 87 | val total_uniform_file_count = overlapMetrics.total_uniform_file_count 88 | 89 | avgOverlapDepth shouldBe 4.25 90 | 91 | overlapDepthHistogram shouldBe 92 | buildHistogram(16, Map((1.0, 2), (5.0, 2), (6.0, 7))) 93 | 94 | averageOverlaps shouldBe 5.0909 95 | 96 | total_file_count shouldBe 11 97 | 98 | total_uniform_file_count shouldBe 0 99 | } 100 | 101 | it("should return 1 as overlap depth and compute the histogram") { 102 | 103 | val intervals4 = Seq[Interval]( 104 | Interval("1", "2", "file2", IntegerType), 105 | Interval("3", "4", "file3", IntegerType), 106 | Interval("5", "6", "file4", IntegerType), 107 | Interval("7", "8", "file5", IntegerType) 108 | ) 109 | 110 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 111 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals4) 112 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 113 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 114 | val averageOverlaps = overlapMetrics.averageOverlaps 115 | val total_file_count = overlapMetrics.total_file_count 116 | val total_uniform_file_count = overlapMetrics.total_uniform_file_count 117 | 118 | avgOverlapDepth shouldBe 1.0 119 | 120 | overlapDepthHistogram shouldBe 121 | buildHistogram(16, Map((1.0, 4))) 122 | 123 | averageOverlaps shouldBe 0 124 | 125 | total_file_count shouldBe 4 126 | 127 | total_uniform_file_count shouldBe 0 128 | } 129 | 130 | it("BUG: min = max for all intervals") { 131 | 132 | val intervals5 = Seq[Interval]( 133 | Interval("1", "1", "file2", IntegerType), 134 | Interval("1", "1", "file3", IntegerType), 135 | Interval("1", "1", "file5", IntegerType), 136 | Interval("1", "1", "file6", IntegerType) 137 | ) 138 | 139 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 140 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals5) 141 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 142 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 143 | val averageOverlaps = overlapMetrics.averageOverlaps 144 | val total_file_count = overlapMetrics.total_file_count 145 | val total_uniform_file_count = overlapMetrics.total_uniform_file_count 146 | 147 | avgOverlapDepth shouldBe 4.0000 148 | 149 | overlapDepthHistogram shouldBe 150 | buildHistogram(16, Map((4.0, 4))) 
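      // All four intervals are the identical point [1, 1]: every file overlaps the other three and each file is uniform (min == max).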
151 | 152 | averageOverlaps shouldBe 3.0 153 | 154 | total_file_count shouldBe 4 155 | 156 | total_uniform_file_count shouldBe 4 157 | } 158 | 159 | it("example 2") { 160 | 161 | val intervals2 = Seq[Interval]( 162 | Interval("1", "4", "file2", IntegerType), 163 | Interval("2", "6", "file3", IntegerType), 164 | Interval("5", "7", "file5", IntegerType), 165 | Interval("5", "10", "file6", IntegerType) 166 | ) 167 | 168 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 169 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals2) 170 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 171 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 172 | val averageOverlaps = overlapMetrics.averageOverlaps 173 | val total_file_count = overlapMetrics.total_file_count 174 | val total_uniform_file_count = overlapMetrics.total_uniform_file_count 175 | 176 | avgOverlapDepth shouldBe 2.3333 177 | 178 | overlapDepthHistogram shouldBe 179 | buildHistogram(16, Map((2.0, 1), (3.0, 3))) 180 | 181 | averageOverlaps shouldBe 2.0 182 | 183 | total_file_count shouldBe 4 184 | 185 | total_uniform_file_count shouldBe 0 186 | } 187 | 188 | it("example 3") { 189 | 190 | val intervals2 = Seq[Interval]( 191 | Interval("1", "2", "file2", IntegerType), 192 | Interval("3", "5", "file3", IntegerType), 193 | Interval("3", "5", "file5", IntegerType), 194 | Interval("3", "5", "file6", IntegerType) 195 | ) 196 | 197 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 198 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals2) 199 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 200 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 201 | val averageOverlaps = overlapMetrics.averageOverlaps 202 | val total_file_count = overlapMetrics.total_file_count 203 | val total_uniform_file_count = overlapMetrics.total_uniform_file_count 204 | 205 | avgOverlapDepth shouldBe 3 206 | 207 | overlapDepthHistogram shouldBe 208 | buildHistogram(16, Map((1.0, 1), (3.0, 3))) 209 | 210 | averageOverlaps shouldBe 1.5 211 | 212 | total_file_count shouldBe 4 213 | 214 | total_uniform_file_count shouldBe 0 215 | } 216 | 217 | it("example 4") { 218 | 219 | val intervals2 = Seq[Interval]( 220 | Interval("1", "2", "file2", IntegerType), 221 | Interval("3", "5", "file3", IntegerType), 222 | Interval("4", "7", "file5", IntegerType), 223 | Interval("6", "8", "file6", IntegerType) 224 | ) 225 | 226 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 227 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals2) 228 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 229 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 230 | val averageOverlaps = overlapMetrics.averageOverlaps 231 | val total_file_count = overlapMetrics.total_file_count 232 | val total_uniform_file_count = overlapMetrics.total_uniform_file_count 233 | 234 | avgOverlapDepth shouldBe 2 235 | 236 | overlapDepthHistogram shouldBe 237 | buildHistogram(16, Map((1.0, 1), (2.0, 3))) 238 | 239 | averageOverlaps shouldBe 1.0 240 | 241 | total_file_count shouldBe 4 242 | 243 | total_uniform_file_count shouldBe 0 244 | } 245 | 246 | it("intervals have one uniform interval") { 247 | 248 | val intervals = Seq[Interval]( 249 | Interval("1", "5", "file2", IntegerType), 250 | Interval("4", "8", "file3", IntegerType), 251 | Interval("6", "9", "file5", IntegerType), 252 | Interval("7", "7", "file6", IntegerType) 253 | ) 254 | 255 | val 
clusteringMetricsBuilder = new ClusteringMetricsBuilder() 256 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals) 257 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 258 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 259 | val averageOverlaps = overlapMetrics.averageOverlaps 260 | val total_file_count = overlapMetrics.total_file_count 261 | val total_uniform_file_count = overlapMetrics.total_uniform_file_count 262 | 263 | avgOverlapDepth shouldBe 2.25 264 | 265 | overlapDepthHistogram shouldBe 266 | buildHistogram(16, Map((2.0, 1), (3.0, 3))) 267 | 268 | averageOverlaps shouldBe 2.0 269 | 270 | total_file_count shouldBe 4 271 | 272 | total_uniform_file_count shouldBe 1 273 | } 274 | 275 | it("intervals have two uniform intervals") { 276 | 277 | val intervals = Seq[Interval]( 278 | Interval("1", "5", "file2", IntegerType), 279 | Interval("4", "8", "file3", IntegerType), 280 | Interval("6", "9", "file5", IntegerType), 281 | Interval("7", "7", "file6", IntegerType), 282 | Interval("7", "7", "file7", IntegerType) 283 | ) 284 | 285 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 286 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals) 287 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 288 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 289 | val averageOverlaps = overlapMetrics.averageOverlaps 290 | val total_file_count = overlapMetrics.total_file_count 291 | val total_uniform_file_count = overlapMetrics.total_uniform_file_count 292 | 293 | avgOverlapDepth shouldBe 2.5 294 | 295 | overlapDepthHistogram shouldBe 296 | buildHistogram(16, Map((2.0, 1), (4.0, 4))) 297 | 298 | averageOverlaps shouldBe 2.8 299 | 300 | total_file_count shouldBe 5 301 | 302 | total_uniform_file_count shouldBe 2 303 | } 304 | 305 | it("intervals start with uniform interval and have gaps") { 306 | 307 | val intervals = Seq[Interval]( 308 | Interval("0", "0", "file1", IntegerType), 309 | Interval("0", "0", "file2", IntegerType), 310 | Interval("1", "5", "file2", IntegerType), 311 | Interval("4", "8", "file3", IntegerType), 312 | Interval("6", "9", "file5", IntegerType), 313 | Interval("7", "7", "file6", IntegerType), 314 | Interval("7", "7", "file7", IntegerType) 315 | ) 316 | 317 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 318 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals) 319 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 320 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 321 | val averageOverlaps = overlapMetrics.averageOverlaps 322 | val total_file_count = overlapMetrics.total_file_count 323 | val total_uniform_file_count = overlapMetrics.total_uniform_file_count 324 | 325 | avgOverlapDepth shouldBe 2.4 326 | 327 | overlapDepthHistogram shouldBe 328 | buildHistogram(16, Map((2.0, 3), (4.0, 4))) 329 | 330 | averageOverlaps shouldBe 2.2857 331 | 332 | total_file_count shouldBe 7 333 | 334 | total_uniform_file_count shouldBe 4 335 | } 336 | 337 | 338 | } 339 | 340 | describe("decimal type") { 341 | it("BUG: decimal type should be supported for statistics") { 342 | 343 | val intervals = Seq[Interval]( 344 | Interval("-8.00", "-5.00", "file1", DecimalType(5, 2)) 345 | ) 346 | 347 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 348 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals) 349 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 350 | val 
overlapDepthHistogram = overlapMetrics.fileDepthHistogram 351 | val averageOverlaps = overlapMetrics.averageOverlaps 352 | val total_file_count = overlapMetrics.total_file_count 353 | val total_uniform_file_count = overlapMetrics.total_uniform_file_count 354 | 355 | avgOverlapDepth shouldBe 1.0 356 | 357 | overlapDepthHistogram shouldBe 358 | buildHistogram(16, Map((1.0, 1))) 359 | 360 | averageOverlaps shouldBe 0 361 | 362 | total_file_count shouldBe 1 363 | 364 | total_uniform_file_count shouldBe 0 365 | } 366 | } 367 | 368 | 369 | describe("Node holding many intervals all in one group") { 370 | val intervals = Seq[Interval]( 371 | Interval("16", "32", "file2", IntegerType), 372 | Interval("4", "40", "file3", IntegerType), 373 | Interval("10", "38", "file3", IntegerType), 374 | Interval("2", "24", "file5", IntegerType), 375 | Interval("6", "28", "file6", IntegerType) 376 | ) 377 | 378 | val node = Node(intervals) 379 | 380 | it("should not be empty") { 381 | node.isEmpty shouldBe false 382 | } 383 | 384 | it("should return 5 as overlap depth") { 385 | val clusteringMetricsBuilder = new ClusteringMetricsBuilder() 386 | val overlapMetrics = clusteringMetricsBuilder.computeMetrics("colA", intervals) 387 | val avgOverlapDepth = overlapMetrics.averageOverlapDepth 388 | val overlapDepthHistogram = overlapMetrics.fileDepthHistogram 389 | 390 | avgOverlapDepth shouldBe 3.2857 391 | 392 | overlapDepthHistogram shouldBe 393 | buildHistogram(16, Map((5.0, 5))) 394 | } 395 | } 396 | } 397 | -------------------------------------------------------------------------------- /src/test/scala/fr/databeans/lighthouse/metrics/delta/DeltaClusteringMetricsSpec.scala: -------------------------------------------------------------------------------- 1 | package fr.databeans.lighthouse.metrics.delta 2 | 3 | import fr.databeans.lighthouse.metrics.Distribution 4 | import org.apache.spark.sql.{QueryTest, Row} 5 | import org.apache.spark.sql.delta.DeltaLog 6 | import org.apache.spark.sql.delta.test.DeltaExtendedSparkSession 7 | import org.apache.spark.sql.functions._ 8 | import org.apache.spark.sql.test.SharedSparkSession 9 | import org.apache.spark.sql.types._ 10 | 11 | class DeltaClusteringMetricsSpec extends QueryTest with SharedSparkSession with DeltaExtendedSparkSession { 12 | 13 | def buildHistogram(maxBin: Int, populatedBuckets: Map[Double, Int]): Map[Double, Int] = { 14 | val missingBins = Distribution.computeUnPopulatedBuckets(maxBin, populatedBuckets) 15 | missingBins ++ populatedBuckets 16 | } 17 | 18 | def getStats(deltaPath: String, column: String) = { 19 | DeltaLog.forTable(spark, deltaPath).snapshot.withStats 20 | .select( 21 | col("path"), 22 | col(s"stats.minValues.$column").as("min"), 23 | col(s"stats.maxValues.$column").as("max") 24 | ) 25 | } 26 | 27 | override def beforeAll(): Unit = { 28 | super.beforeAll() 29 | spark.sparkContext.setLogLevel("ERROR") 30 | } 31 | 32 | test("compute metrics for a delta table with non overlapping files") { 33 | withTempDir { dir => 34 | spark.range(1, 50, 1, 5).toDF() 35 | .write.mode("overwrite") 36 | .format("delta") 37 | .save(dir.toString) 38 | 39 | 40 | val deltaClusteringMetric = DeltaClusteringMetrics.forPath(dir.toString, spark) 41 | val metrics = deltaClusteringMetric.computeForColumn("id") 42 | checkAnswer(metrics, Row("id", 5L, 0L, 1.0, buildHistogram(16, Map((1.0, 5))), 0.0)) 43 | } 44 | } 45 | 46 | test("compute metrics for a delta table with all overlapping files") { 47 | withTempDir { dir => 48 | spark.range(1, 50, 1, 5).toDF() 49 | 
.withColumn("key", lit(1)) 50 | .write.mode("overwrite") 51 | .format("delta") 52 | .save(dir.toString) 53 | 54 | val deltaClusteringMetric = DeltaClusteringMetrics.forPath(dir.toString, spark) 55 | val metrics = deltaClusteringMetric.computeForColumn("key") 56 | checkAnswer(metrics, Row("key", 5L, 5L, 5.0, buildHistogram(16, Map((5.0, 5))), 4.0)) 57 | } 58 | } 59 | 60 | test("compute metrics for table defined by name") { 61 | withTable("deltaTable") { 62 | spark.range(1, 50, 1, 5).toDF() 63 | .write.format("delta").saveAsTable("deltaTable") 64 | 65 | val deltaClusteringMetric = DeltaClusteringMetrics.forName("deltaTable", spark) 66 | val metrics = deltaClusteringMetric.computeForColumn("id") 67 | checkAnswer(metrics, Row("id", 5L, 0L, 1.0, buildHistogram(16, Map((1.0, 5))), 0.0)) 68 | } 69 | } 70 | 71 | test("compute metrics for a column without statistics") { 72 | withTempDir { dir => 73 | val data = spark.range(1, 50, 1, 5).toDF() 74 | .withColumn("value", col("id") * 3) 75 | 76 | data 77 | .filter("1 > 2") 78 | .write.mode("append") 79 | .format("delta").save(dir.toString) 80 | 81 | spark.sql(s"ALTER TABLE delta.`${dir.toString}` SET TBLPROPERTIES ('delta.dataSkippingNumIndexedCols' = '1')") 82 | 83 | data 84 | .write.mode("append") 85 | .format("delta").save(dir.toString) 86 | 87 | val thrown = intercept[AssertionError] { 88 | val deltaClusteringMetric = DeltaClusteringMetrics.forPath(dir.toString, spark) 89 | deltaClusteringMetric.computeForColumn("value") 90 | } 91 | assert(thrown.getMessage === "assertion failed: no statistics found for column 'value'") 92 | } 93 | } 94 | 95 | test("compute metrics for a non existent column") { 96 | withTempDir { dir => 97 | spark.range(1, 50, 1, 5).toDF() 98 | .write.format("delta").save(dir.toString) 99 | 100 | val thrown = intercept[AssertionError] { 101 | val deltaClusteringMetric = DeltaClusteringMetrics.forPath(dir.toString, spark) 102 | deltaClusteringMetric.computeForColumn("non_existent_column") 103 | } 104 | assert(thrown.getMessage.contains("assertion failed: column non_existent_column not found in columns")) 105 | } 106 | } 107 | 108 | test("compute metrics for all columns of the table") { 109 | withTempDir { dir => 110 | spark.range(1, 50, 1, 5).toDF() 111 | .withColumn("id", col("id").cast(IntegerType)) 112 | .withColumn("value", lit(1)) 113 | .write.mode("overwrite") 114 | .format("delta").save(dir.toString) 115 | 116 | val deltaClusteringMetric = DeltaClusteringMetrics.forPath(dir.toString, spark) 117 | val metrics = deltaClusteringMetric.computeForAllColumns() 118 | 119 | checkAnswer( 120 | metrics, 121 | Seq( 122 | Row("id", 5L, 0L, 1.0, buildHistogram(16, Map((1.0, 5))), 0.0), 123 | Row("value", 5L, 5L, 5.0, buildHistogram(16, Map((5.0, 5))), 4.0) 124 | ) 125 | ) 126 | } 127 | } 128 | 129 | test("compute metrics for a subset columns of the table") { 130 | withTempDir { dir => 131 | spark.range(1, 50, 1, 5).toDF() 132 | .withColumn("id", col("id")) 133 | .withColumn("value1", lit(1)) 134 | .withColumn("value2", lit(2)) 135 | .write.format("delta").save(dir.toString) 136 | 137 | val deltaClusteringMetric = DeltaClusteringMetrics.forPath(dir.toString, spark) 138 | val metrics = deltaClusteringMetric.computeForColumns("id", "value1") 139 | 140 | checkAnswer( 141 | metrics, 142 | Seq( 143 | Row("id", 5L, 0L, 1.0, buildHistogram(16, Map((1.0, 5))), 0.0), 144 | Row("value1", 5L, 5L, 5.0, buildHistogram(16, Map((5.0, 5))), 4.0) 145 | ) 146 | ) 147 | } 148 | } 149 | 150 | test("compute metrics for supported Data Types") { 151 | 
withTempDir { dir => 152 | spark.range(1, 50, 1, 5).toDF() 153 | .withColumn("value_int", col("id").cast(IntegerType)) 154 | .withColumn("value_long", col("id").cast(LongType)) 155 | .withColumn("value_decimal", col("id").cast(DecimalType(4, 2))) 156 | .withColumn("value_string", format_string("%02d", col("id"))) 157 | .drop("id") 158 | .write.format("delta").save(dir.toString) 159 | 160 | val deltaClusteringMetric = DeltaClusteringMetrics.forPath(dir.toString, spark) 161 | val metrics = deltaClusteringMetric.computeForAllColumns() 162 | 163 | checkAnswer( 164 | metrics, 165 | Seq( 166 | Row("value_int", 5L, 0L, 1.0, buildHistogram(16, Map((1.0, 5))), 0.0), 167 | Row("value_long", 5L, 0L, 1.0, buildHistogram(16, Map((1.0, 5))), 0.0), 168 | Row("value_decimal", 5L, 0L, 1.0, buildHistogram(16, Map((1.0, 5))), 0.0), 169 | Row("value_string", 5L, 0L, 1.0, buildHistogram(16, Map((1.0, 5))), 0.0) 170 | ) 171 | ) 172 | } 173 | } 174 | 175 | test("compute metrics for a partitioned delta table") { 176 | withTempDir { dir => 177 | spark.range(1, 50, 1, 5).toDF() 178 | .withColumn("part", col("id") % 3) 179 | .write.partitionBy("part").format("delta").save(dir.toString) 180 | 181 | val deltaClusteringMetric = DeltaClusteringMetrics.forPath(dir.toString, spark) 182 | 183 | val errorMessage = "assertion failed: 'part' is a partitioning column. Clustering metrics cannot be computed for partitioning columns" 184 | 185 | // computeForColumn should fail 186 | val thrown1 = intercept[AssertionError] { 187 | deltaClusteringMetric.computeForColumn("part") 188 | } 189 | assert(thrown1.getMessage == errorMessage) 190 | 191 | // computeForColumns should fail 192 | val thrown2 = intercept[AssertionError] { 193 | deltaClusteringMetric.computeForColumns("part", "id") 194 | } 195 | assert(thrown2.getMessage == errorMessage) 196 | 197 | // computeForAllColumns should compute metrics for non partitioning columns only. 198 | val metrics = deltaClusteringMetric.computeForAllColumns() 199 | checkAnswer(metrics, Seq(Row("id", 15L, 0L, 2.3333, buildHistogram(16, Map((3.0, 15))), 2.0))) 200 | } 201 | } 202 | 203 | test("compute metrics for column with null values") { 204 | withTempDir { dir => 205 | spark.range(1, 50, 1, 5).toDF() 206 | .withColumn("value_1", when(col("id") % 10 === 1, null).otherwise(col("id"))) 207 | .withColumn("value_2", when(col("id") < 20, null).otherwise(col("id"))) 208 | .withColumn("value_3", lit(null).cast(StringType)) 209 | .write.format("delta").save(dir.toString) 210 | 211 | val deltaClusteringMetric = DeltaClusteringMetrics.forPath(dir.toString, spark) 212 | 213 | val value1Metrics = deltaClusteringMetric.computeForColumn("value_1") 214 | checkAnswer(value1Metrics, Seq(Row("value_1", 5L, 0L, 1.0, buildHistogram(16, Map((1.0, 5))), 0.0))) 215 | 216 | // null intervals included in total_file_count and total_uniform_file_count but excluded from other metrics. 
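      // The expected rows appear to be laid out as
      // Row(column, total_file_count, total_uniform_file_count, averageOverlapDepth, fileDepthHistogram, averageOverlaps).
      // For value_2, the two files whose stats are entirely null still count toward the two totals
      // (5 files, 2 uniform), while the depth histogram covers only the 3 remaining files.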
217 |       val value2Metrics = deltaClusteringMetric.computeForColumn("value_2")
218 |       checkAnswer(value2Metrics, Seq(Row("value_2", 5L, 2L, 1.0, buildHistogram(16, Map((1.0, 3))), 0.0)))
219 |
220 |       // all intervals are null
221 |       val value3Metrics = deltaClusteringMetric.computeForColumn("value_3")
222 |       checkAnswer(value3Metrics, Seq(Row("value_3", 5L, 5L, -1, null.asInstanceOf[Map[Double, Int]], -1)))
223 |     }
224 |   }
225 |
226 |   test("compute metrics for all columns of a table where statistics do not exist for certain columns") {
227 |     withTempDir { dir =>
228 |       val data = spark.range(1, 50, 1, 5).toDF()
229 |         .withColumn("value", col("id") * 3)
230 |
231 |       data
232 |         .filter("1 > 2")
233 |         .write.mode("append")
234 |         .format("delta").save(dir.toString)
235 |
236 |       spark.sql(s"ALTER TABLE delta.`${dir.toString}` SET TBLPROPERTIES ('delta.dataSkippingNumIndexedCols' = '1')")
237 |
238 |       data
239 |         .write.mode("append")
240 |         .format("delta").save(dir.toString)
241 |
242 |       val deltaClusteringMetric = DeltaClusteringMetrics.forPath(dir.toString, spark)
243 |       // computeForAllColumns should compute metrics only for columns with statistics.
244 |       val metrics = deltaClusteringMetric.computeForAllColumns()
245 |       checkAnswer(metrics, Seq(Row("id", 5L, 0L, 1.0, buildHistogram(16, Map((1.0, 5))), 0.0)))
246 |     }
247 |   }
248 |
249 | }
250 |
251 |
-------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/sql/delta/test/DeltaExtendedSparkSession.scala: --------------------------------------------------------------------------------
1 | package org.apache.spark.sql.delta.test
2 |
3 | import org.apache.spark.sql.delta.catalog.DeltaCatalog
4 | import io.delta.sql.DeltaSparkSessionExtension
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.sql.internal.SQLConf
7 | import org.apache.spark.sql.{SparkSession, SparkSessionExtensions}
8 | import org.apache.spark.sql.test.{SharedSparkSession, TestSparkSession}
9 |
10 | class DeltaTestSparkSession(sparkConf: SparkConf) extends TestSparkSession(sparkConf) {
11 |   override val extensions: SparkSessionExtensions = {
12 |     val extensions = new SparkSessionExtensions
13 |     new DeltaSparkSessionExtension().apply(extensions)
14 |     extensions
15 |   }
16 | }
17 |
18 | trait DeltaExtendedSparkSession { self: SharedSparkSession =>
19 |
20 |   override protected def createSparkSession: TestSparkSession = {
21 |     SparkSession.cleanupAnyExistingSession()
22 |     val session = new DeltaTestSparkSession(sparkConf)
23 |     session.conf.set(SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION.key, classOf[DeltaCatalog].getName)
24 |     session
25 |   }
26 | }
--------------------------------------------------------------------------------
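DeltaClusteringMetricsSpec above shows the intended usage of this trait: mix DeltaExtendedSparkSession into a SharedSparkSession-based suite so the shared session is created with the Delta extension and catalog already wired in. A minimal sketch of such a suite (the suite name and assertion below are illustrative, not files in this repository):

```scala
package org.apache.spark.sql.delta.test

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.test.SharedSparkSession

// Hypothetical example suite: it only checks that the session built by
// DeltaExtendedSparkSession can write and read Delta tables out of the box.
class DeltaSessionSmokeSpec extends QueryTest with SharedSparkSession with DeltaExtendedSparkSession {

  test("shared session supports the delta format") {
    withTempDir { dir =>
      // Write a small Delta table using the Delta-enabled shared session ...
      spark.range(1, 10).toDF().write.format("delta").save(dir.toString)
      // ... and read it back through the same session.
      checkAnswer(spark.read.format("delta").load(dir.toString), spark.range(1, 10).toDF())
    }
  }
}
```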