├── README.md
├── amazon
│   ├── README.md
│   └── configurations
│       ├── 16nodes.json
│       ├── 1node.json
│       ├── 2nodes.json
│       ├── 4nodes.json
│       ├── 8nodes.json
│       └── README.md
├── geospark
│   ├── DistanceJoin.scala
│   ├── RangeQueries.scala
│   ├── SpatialJoins.scala
│   └── kNNQueries.scala
├── locationspark
│   ├── RangeQueries.scala
│   ├── SpatialJoins.scala
│   ├── kNNJoin.scala
│   └── kNNQueries.scala
├── magellan
│   ├── RangeQueries.scala
│   └── SpatialJoins.scala
├── simba
│   ├── DistanceJoin.scala
│   ├── RangeQueries.scala
│   ├── kNNJoin.scala
│   └── kNNQueries.scala
├── spatialspark
│   ├── DistanceJoin.scala
│   ├── RangeQueries.scala
│   └── SpatialJoins.scala
└── tiger_results
    ├── README.md
    └── images
        ├── distance_join_cost_breakdown.jpg
        ├── distance_join_scalability.jpg
        ├── distance_join_shuffle.jpg
        ├── joins_peak_memory.jpg
        ├── joins_scalability.jpg
        ├── joins_shuffle_reads.jpg
        ├── joins_shuffle_writes.jpg
        ├── knn_join_cost_breakdown.jpg
        ├── knn_join_scalability.jpg
        ├── knn_join_shuffle.jpg
        ├── spatial_joins.jpg
        └── spatial_joins_breakdown.jpg

/README.md:
--------------------------------------------------------------------------------
# Spatial Analytics

The Amazon-related configuration files can be found under the directory amazon.

The experimental files for the various systems can be found under their respective directories.

Note: All the experimental files assume that the datasets are available in the HDFS directory /data/

The datasets are available at http://osm.db.in.tum.de/

## Cite

If you found this work useful, please cite our paper:

```
@article{pandey2018howgood,
  author  = {Varun Pandey and Andreas Kipf and Thomas Neumann and Alfons Kemper},
  title   = {How Good Are Modern Spatial Analytics Systems?},
  journal = {Proc. {VLDB} Endow.},
  volume  = {11},
  number  = {11},
  pages   = {1661--1673},
  year    = {2018},
  url     = {http://www.vldb.org/pvldb/vol11/p1661-pandey.pdf},
  doi     = {10.14778/3236187.3236213}
}
```
--------------------------------------------------------------------------------
/amazon/README.md:
--------------------------------------------------------------------------------
# Amazon Configuration

The Amazon configuration values can be found in the directory configurations.

The configuration parameters do not depend on the Amazon EMR version.

Make sure you launch the cluster on an Amazon EMR version that the relevant system is compatible with, as mentioned in the publication.
--------------------------------------------------------------------------------
/amazon/configurations/16nodes.json:
--------------------------------------------------------------------------------
[
  {
    "classification": "yarn-site",
    "properties": {
      "mapreduce.map.memory.mb": "7552",
      "yarn.nodemanager.resource.memory-mb": "241664",
      "mapreduce.reduce.memory.mb": "15104",
      "mapreduce.map.java.opts": "-Xmx6042m",
      "yarn.scheduler.maximum-allocation-mb": "241664",
      "yarn.scheduler.minimum-allocation-mb": "32",
      "yarn.nodemanager.vmem-check-enabled": "false",
      "mapreduce.reduce.java.opts": "-Xmx12084m",
      "yarn.app.mapreduce.am.resource.mb": "7552"
    },
    "configurations": []
  },
  {
    "classification": "spark-defaults",
    "properties": {
      "spark.local.dir": "/mnt/tmp",
      "spark.executor.instances": "99",
      "spark.executor.memory": "37G",
      "spark.driver.memory": "37G",
      "spark.driver.cores": "5",
      "spark.executor.cores": "5",
      "spark.driver.maxResultSize": "35G",
      "spark.kryo.referenceTracking": "false"
    },
    "configurations": []
  }
]
--------------------------------------------------------------------------------
/amazon/configurations/1node.json:
--------------------------------------------------------------------------------
[
  {
    "classification": "yarn-site",
    "properties": {
      "mapreduce.map.memory.mb": "7552",
      "yarn.nodemanager.resource.memory-mb": "241664",
      "mapreduce.reduce.memory.mb": "15104",
      "mapreduce.map.java.opts": "-Xmx6042m",
      "yarn.scheduler.maximum-allocation-mb": "241664",
      "yarn.scheduler.minimum-allocation-mb": "32",
      "yarn.nodemanager.vmem-check-enabled": "false",
      "mapreduce.reduce.java.opts": "-Xmx12084m",
      "yarn.app.mapreduce.am.resource.mb": "7552"
    },
    "configurations": []
  },
  {
    "classification": "spark-defaults",
    "properties": {
      "spark.local.dir": "/mnt/tmp",
      "spark.executor.instances": "6",
      "spark.executor.memory": "37G",
      "spark.driver.memory": "37G",
      "spark.driver.cores": "5",
      "spark.executor.cores": "5",
      "spark.driver.maxResultSize": "35G",
      "spark.kryo.referenceTracking": "false"
    },
    "configurations": []
  }
]
--------------------------------------------------------------------------------
/amazon/configurations/2nodes.json:
--------------------------------------------------------------------------------
[
  {
    "classification": "yarn-site",
    "properties": {
      "mapreduce.map.memory.mb": "7552",
      "yarn.nodemanager.resource.memory-mb": "241664",
      "mapreduce.reduce.memory.mb": "15104",
      "mapreduce.map.java.opts": "-Xmx6042m",
      "yarn.scheduler.maximum-allocation-mb": "241664",
      "yarn.scheduler.minimum-allocation-mb": "32",
      "yarn.nodemanager.vmem-check-enabled": "false",
      "mapreduce.reduce.java.opts": "-Xmx12084m",
      "yarn.app.mapreduce.am.resource.mb": "7552"
    },
    "configurations": []
  },
  {
    "classification": "spark-defaults",
    "properties": {
      "spark.local.dir": "/mnt/tmp",
      "spark.executor.instances": "12",
      "spark.executor.memory": "37G",
      "spark.driver.memory": "37G",
      "spark.driver.cores": "5",
      "spark.executor.cores": "5",
      "spark.driver.maxResultSize": "35G",
      "spark.kryo.referenceTracking": "false"
    },
    "configurations": []
  }
]
--------------------------------------------------------------------------------
/amazon/configurations/4nodes.json:
--------------------------------------------------------------------------------
[
  {
    "classification": "yarn-site",
    "properties": {
      "mapreduce.map.memory.mb": "7552",
      "yarn.nodemanager.resource.memory-mb": "241664",
      "mapreduce.reduce.memory.mb": "15104",
      "mapreduce.map.java.opts": "-Xmx6042m",
      "yarn.scheduler.maximum-allocation-mb": "241664",
      "yarn.scheduler.minimum-allocation-mb": "32",
      "yarn.nodemanager.vmem-check-enabled": "false",
      "mapreduce.reduce.java.opts": "-Xmx12084m",
      "yarn.app.mapreduce.am.resource.mb": "7552"
    },
    "configurations": []
  },
  {
    "classification": "spark-defaults",
    "properties": {
      "spark.local.dir": "/mnt/tmp",
      "spark.executor.instances": "24",
      "spark.executor.memory": "37G",
      "spark.driver.memory": "37G",
      "spark.driver.cores": "5",
      "spark.executor.cores": "5",
      "spark.driver.maxResultSize": "35G",
      "spark.kryo.referenceTracking": "false"
    },
    "configurations": []
  }
]
--------------------------------------------------------------------------------
/amazon/configurations/8nodes.json:
--------------------------------------------------------------------------------
[
  {
    "classification": "yarn-site",
    "properties": {
      "mapreduce.map.memory.mb": "7552",
      "yarn.nodemanager.resource.memory-mb": "241664",
      "mapreduce.reduce.memory.mb": "15104",
      "mapreduce.map.java.opts": "-Xmx6042m",
      "yarn.scheduler.maximum-allocation-mb": "241664",
      "yarn.scheduler.minimum-allocation-mb": "32",
      "yarn.nodemanager.vmem-check-enabled": "false",
      "mapreduce.reduce.java.opts": "-Xmx12084m",
      "yarn.app.mapreduce.am.resource.mb": "7552"
    },
    "configurations": []
  },
  {
    "classification": "spark-defaults",
    "properties": {
      "spark.local.dir": "/mnt/tmp",
      "spark.executor.instances": "49",
      "spark.executor.memory": "37G",
      "spark.driver.memory": "37G",
      "spark.driver.cores": "5",
      "spark.executor.cores": "5",
      "spark.driver.maxResultSize": "35G",
      "spark.kryo.referenceTracking": "false"
    },
    "configurations": []
  }
]
--------------------------------------------------------------------------------
/amazon/configurations/README.md:
--------------------------------------------------------------------------------
# Amazon EMR Software Configuration Settings
In Amazon EMR, under "Create Cluster - Advanced Options", in "Step 1: Software and Steps", copy and paste the values from the JSON file matching the size of your cluster.

In "Step 2: Hardware" of Create Cluster, make sure the values in the JSON match your hardware configuration.

The default machines we used are m4.xlarge for the master node and r4.8xlarge for the slave nodes; we only count the slave nodes in the cluster size. The yarn-site configurations in the JSON files reflect the task configuration values for r4.8xlarge. If you choose a different machine type for the slave nodes, change these values according to the values here: https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hadoop-task-config.html

Note: We attached 100 GB general-purpose SSDs to each slave node; Amazon mounts these disks at /mnt. The "spark.local.dir" parameter in the JSON files reflects this path. If you intend to run larger queries/datasets, you may want to attach a bigger SSD.
--------------------------------------------------------------------------------
/geospark/DistanceJoin.scala:
--------------------------------------------------------------------------------
package geospark.measurements

import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel
import org.datasyslab.geospark.enums.FileDataSplitter
import org.datasyslab.geospark.enums.GridType
import org.datasyslab.geospark.enums.IndexType
import org.datasyslab.geospark.spatialOperator.JoinQuery
import org.datasyslab.geospark.spatialOperator.KNNQuery
import org.datasyslab.geospark.spatialOperator.RangeQuery
import org.datasyslab.geospark.spatialRDD.CircleRDD
import org.datasyslab.geospark.spatialRDD.PointRDD
import org.datasyslab.geospark.spatialRDD.PolygonRDD
import org.datasyslab.geospark.spatialRDD.LineStringRDD
import org.datasyslab.geospark.spatialRDD.RectangleRDD
import com.vividsolutions.jts.geom.Coordinate
import com.vividsolutions.jts.geom.Envelope
import com.vividsolutions.jts.geom.GeometryFactory
import org.apache.spark.serializer.KryoSerializer
import org.datasyslab.geospark.formatMapper.shapefileParser.ShapefileRDD
import org.datasyslab.geospark.serde.GeoSparkKryoRegistrator


/**
 * Distance join for points.
 */

object DistanceJoin extends App {

  val conf = new SparkConf().setAppName("GeoSpark Distance Join")
  conf.set("spark.serializer", classOf[KryoSerializer].getName)
  conf.set("spark.kryo.registrator", classOf[GeoSparkKryoRegistrator].getName)
  val sc = new SparkContext(conf)
  Logger.getLogger("org").setLevel(Level.INFO)
  Logger.getLogger("akka").setLevel(Level.INFO)

  def time[R](block: => R): R = {
    val t0 = System.nanoTime()
    val result = block // call-by-name
    val t1 = System.nanoTime()
    println("Join time: " + (t1 - t0) / 1E9 + " sec ")
    result
  }

  val geometryFactory = new GeometryFactory()

  distanceJoin()

  sc.stop()

  def distanceJoin() {

    var t0 = 0L
    var t1 = 0L
    var count1 = 0L

    println("************************ POINTS Distance Join **************************************")
    t0 = System.nanoTime()
    val objectRDD = new PointRDD(sc, "/data/points_200M.csv", 0, FileDataSplitter.CSV, false, 1024, StorageLevel.MEMORY_ONLY)
    t1 = System.nanoTime()
    val read_time = ((t1 - t0) / 1E9)

    println("Read Time: " + read_time + " sec")

    t0 = System.nanoTime()

    objectRDD.spatialPartitioning(GridType.QUADTREE)

    objectRDD.buildIndex(IndexType.RTREE, true)

    objectRDD.indexedRDD.persist(StorageLevel.MEMORY_ONLY)

    objectRDD.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)

    val c1 = objectRDD.spatialPartitionedRDD.count()

    val c2 = objectRDD.indexedRDD.count()

    t1 = System.nanoTime()
    val left_time = ((t1 - t0) / 1E9)
    println("Left Partitioning and Indexing Time: " + left_time + " sec")

    objectRDD.rawSpatialRDD.unpersist()

    t0 = System.nanoTime()

    // Simba computes Euclidean distance, so this radius is given in degrees;
    // it is approx. 5 meters in the worst case (1 degree ≈ 110 km at the equator).
    val queryWindow = new CircleRDD(objectRDD, 0.000045027)

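    // Co-partition the circles with the point RDD's partitioner: each circle is
    // placed in the same spatial partition as the points it can possibly match,
    // so the distance join below only compares geometries within a partition.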
queryWindow.spatialPartitioning(objectRDD.getPartitioner) 92 | 93 | queryWindow.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY) 94 | 95 | val c3 = queryWindow.spatialPartitionedRDD.count() 96 | 97 | t1 = System.nanoTime() 98 | val right_time = ((t1 - t0) / 1E9) 99 | println("Right Partitioning and Indexing Time: " + right_time + " sec") 100 | 101 | queryWindow.rawSpatialRDD.unpersist() 102 | 103 | t0 = System.nanoTime() 104 | count1 = time(JoinQuery.DistanceJoinQuery(objectRDD, queryWindow, true, false).count()) 105 | t1 = System.nanoTime() 106 | val join_time = ((t1 - t0) / 1E9) 107 | println("Distance Join Runtime: " + join_time + " sec") 108 | 109 | val total_time = read_time + left_time + right_time + join_time 110 | 111 | println("Total Runtime: " + total_time + " sec") 112 | 113 | println("***********************************************************************************") 114 | println("") 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /geospark/RangeQueries.scala: -------------------------------------------------------------------------------- 1 | package geospark.measurements 2 | 3 | import org.apache.log4j.Level 4 | import org.apache.log4j.Logger 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | import org.apache.spark.storage.StorageLevel 7 | import org.datasyslab.geospark.enums.FileDataSplitter 8 | import org.datasyslab.geospark.enums.GridType 9 | import org.datasyslab.geospark.enums.IndexType 10 | import org.datasyslab.geospark.spatialOperator.JoinQuery 11 | import org.datasyslab.geospark.spatialOperator.KNNQuery 12 | import org.datasyslab.geospark.spatialOperator.RangeQuery 13 | import org.datasyslab.geospark.spatialRDD.CircleRDD 14 | import org.datasyslab.geospark.spatialRDD.PointRDD 15 | import org.datasyslab.geospark.spatialRDD.PolygonRDD 16 | import org.datasyslab.geospark.spatialRDD.LineStringRDD 17 | import org.datasyslab.geospark.spatialRDD.RectangleRDD 18 | import com.vividsolutions.jts.geom.Coordinate 19 | import com.vividsolutions.jts.geom.Envelope 20 | import com.vividsolutions.jts.geom.GeometryFactory 21 | import org.apache.spark.serializer.KryoSerializer 22 | import org.datasyslab.geospark.formatMapper.shapefileParser.ShapefileRDD 23 | import org.datasyslab.geospark.serde.GeoSparkKryoRegistrator 24 | 25 | 26 | /** 27 | * Range Queries For Different Geometric Objects. 
28 | */ 29 | 30 | /** 31 | * rangeQueryWindow1 ==> Selection ratio 0.0001 32 | * rangeQueryWindow2 ==> Selection ratio 0.01 33 | * rangeQueryWindow3 ==> Selection ratio 1.0 34 | * rangeQueryWindow4 ==> Selection ratio 10.0 35 | * rangeQueryWindow5 ==> Selection ratio 50.0 36 | * rangeQueryWindow6 ==> Selection ratio 100.0 37 | */ 38 | 39 | object RangeQueries extends App { 40 | 41 | val conf = new SparkConf().setAppName("GeoSpark Range Queries") 42 | conf.set("spark.serializer", classOf[KryoSerializer].getName) 43 | conf.set("spark.kryo.registrator", classOf[GeoSparkKryoRegistrator].getName) 44 | val sc = new SparkContext(conf) 45 | Logger.getLogger("org").setLevel(Level.WARN) 46 | Logger.getLogger("akka").setLevel(Level.WARN) 47 | 48 | val geometryFactory = new GeometryFactory() 49 | 50 | spatialRangePoint() 51 | spatialRangeLineString() 52 | spatialRangePolygon() 53 | spatialRangeRectangle() 54 | 55 | sc.stop() 56 | 57 | def spatialRangePoint() { 58 | 59 | val nQueries = 100 60 | val rangeQueryWindow1 = new Envelope(-50.3010141441, -24.9526465797, -53.209588996, -30.1096863746) 61 | val rangeQueryWindow2 = new Envelope(-54.4270741441, -24.9526465797, -53.209588996, -30.1096863746) 62 | val rangeQueryWindow3 = new Envelope(-114.4270741441, 42.9526465797, -54.509588996, -27.0106863746) 63 | val rangeQueryWindow4 = new Envelope(-82.7638020000, 42.9526465797, -54.509588996, 38.0106863746) 64 | val rangeQueryWindow5 = new Envelope(-140.99778, 5.7305630159, -52.6480987209, 83.23324) 65 | val rangeQueryWindow6 = new Envelope(-180.0, 180.0, -90.0, 90.0) 66 | 67 | println("************************ POINT Range Queries **************************************") 68 | 69 | val objectRDD = new PointRDD(sc, "/data/points_200M.csv", 0, FileDataSplitter.CSV, false, 1024, StorageLevel.MEMORY_ONLY) 70 | 71 | objectRDD.buildIndex(IndexType.RTREE, false) 72 | objectRDD.indexedRawRDD.persist(StorageLevel.MEMORY_ONLY) 73 | 74 | var t0 = 0L 75 | var t1 = 0L 76 | var count1 = 0L 77 | var count = 0L 78 | 79 | // Materialize IndexedRDD 80 | t0 = System.nanoTime() 81 | for (i <- 1 to 20) { 82 | count = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow6, false, true).count() 83 | } 84 | t1 = System.nanoTime() 85 | 86 | // Actual Measurements 87 | println("Range1: ") 88 | t0 = System.nanoTime() 89 | for (i <- 1 to nQueries) { 90 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow1, false, true).count() 91 | } 92 | t1 = System.nanoTime() 93 | println("Count: " + count1) 94 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 95 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 96 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 97 | t1 = 0L 98 | t0 = 0L 99 | 100 | println("Range2: ") 101 | t0 = System.nanoTime() 102 | for (i <- 1 to nQueries) { 103 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow2, false, true).count() 104 | } 105 | t1 = System.nanoTime() 106 | println("Count: " + count1) 107 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 108 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 109 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 110 | t1 = 0L 111 | t0 = 0L 112 | 113 | println("Range3: ") 114 | t0 = System.nanoTime() 115 | for (i <- 1 to nQueries) { 116 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow3, false, true).count() 117 | } 118 | t1 = System.nanoTime() 119 | println("Count: " + count1) 120 | println("Selection Ratio: " + ((count1 * 100.0) / 
count)) 121 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 122 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 123 | t1 = 0L 124 | t0 = 0L 125 | 126 | println("Range4: ") 127 | t0 = System.nanoTime() 128 | for (i <- 1 to nQueries) { 129 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow4, false, true).count() 130 | } 131 | t1 = System.nanoTime() 132 | println("Count: " + count1) 133 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 134 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 135 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 136 | t1 = 0L 137 | t0 = 0L 138 | 139 | println("Range5: ") 140 | t0 = System.nanoTime() 141 | for (i <- 1 to nQueries) { 142 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow5, false, true).count() 143 | } 144 | t1 = System.nanoTime() 145 | println("Count: " + count1) 146 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 147 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 148 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 149 | t1 = 0L 150 | t0 = 0L 151 | 152 | println("Range6: ") 153 | t0 = System.nanoTime() 154 | for (i <- 1 to nQueries) { 155 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow6, false, true).count() 156 | } 157 | t1 = System.nanoTime() 158 | println("Count: " + count1) 159 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 160 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 161 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 162 | t1 = 0L 163 | t0 = 0L 164 | 165 | objectRDD.indexedRawRDD.unpersist() 166 | objectRDD.rawSpatialRDD.unpersist() 167 | 168 | println("***********************************************************************************") 169 | println("") 170 | } 171 | 172 | def spatialRangeLineString() { 173 | 174 | val nQueries = 100 175 | val rangeQueryWindow1 = new Envelope(-50.204, -24.9526465797, -53.209588996, -30.1096863746) 176 | val rangeQueryWindow2 = new Envelope(-52.1270741441, -24.9526465797, -53.209588996, -30.1096863746) 177 | val rangeQueryWindow3 = new Envelope(-94.4270741441, 22.9526465797, -34.609588996, -27.0106863746) 178 | val rangeQueryWindow4 = new Envelope(-74.0938020000, 42.9526465797, -54.509588996, 38.0106863746) 179 | val rangeQueryWindow5 = new Envelope(-150.99778, 7.2705630159, -52.6480987209, 83.23324) 180 | val rangeQueryWindow6 = new Envelope(-180.0, 180.0, -90.0, 90.0) 181 | 182 | println("************************ LineString Range Queries **************************************") 183 | 184 | val objectRDD = new LineStringRDD(sc, "/data/linestrings_72M.csv", FileDataSplitter.WKT, false, 1024, StorageLevel.MEMORY_ONLY) 185 | 186 | objectRDD.buildIndex(IndexType.RTREE, false) 187 | objectRDD.indexedRawRDD.persist(StorageLevel.MEMORY_ONLY) 188 | 189 | var t0 = 0L 190 | var t1 = 0L 191 | var count1 = 0L 192 | var count = 0L 193 | 194 | // Materialize RDDs 195 | t0 = System.nanoTime() 196 | for (i <- 1 to 20) { 197 | count = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow6, false, true).count() 198 | } 199 | t1 = System.nanoTime() 200 | 201 | // Actual Measurements 202 | println("Range1: ") 203 | t0 = System.nanoTime() 204 | for (i <- 1 to nQueries) { 205 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow1, false, true).count() 206 | } 207 | t1 = System.nanoTime() 208 | println("Count: " + count1) 209 | println("Selection Ratio: " + 
((count1 * 100.0) / count)) 210 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 211 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 212 | t1 = 0L 213 | t0 = 0L 214 | 215 | println("Range2: ") 216 | t0 = System.nanoTime() 217 | for (i <- 1 to nQueries) { 218 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow2, false, true).count() 219 | } 220 | t1 = System.nanoTime() 221 | println("Count: " + count1) 222 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 223 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 224 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 225 | t1 = 0L 226 | t0 = 0L 227 | 228 | println("Range3: ") 229 | t0 = System.nanoTime() 230 | for (i <- 1 to nQueries) { 231 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow3, false, true).count() 232 | } 233 | t1 = System.nanoTime() 234 | println("Count: " + count1) 235 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 236 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 237 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 238 | t1 = 0L 239 | t0 = 0L 240 | 241 | println("Range4: ") 242 | t0 = System.nanoTime() 243 | for (i <- 1 to nQueries) { 244 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow4, false, true).count() 245 | } 246 | t1 = System.nanoTime() 247 | println("Count: " + count1) 248 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 249 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 250 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 251 | t1 = 0L 252 | t0 = 0L 253 | 254 | println("Range5: ") 255 | t0 = System.nanoTime() 256 | for (i <- 1 to nQueries) { 257 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow5, false, true).count() 258 | } 259 | t1 = System.nanoTime() 260 | println("Count: " + count1) 261 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 262 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 263 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 264 | t1 = 0L 265 | t0 = 0L 266 | 267 | println("Range6: ") 268 | t0 = System.nanoTime() 269 | for (i <- 1 to nQueries) { 270 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow6, false, true).count() 271 | } 272 | t1 = System.nanoTime() 273 | println("Count: " + count1) 274 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 275 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 276 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 277 | t1 = 0L 278 | t0 = 0L 279 | 280 | objectRDD.indexedRawRDD.unpersist() 281 | objectRDD.rawSpatialRDD.unpersist() 282 | 283 | println("***********************************************************************************") 284 | println("") 285 | } 286 | 287 | def spatialRangePolygon() { 288 | 289 | val nQueries = 100 290 | val rangeQueryWindow1 = new Envelope(-20.204, 17.9526465797, -53.209588996, -30.1096863746) 291 | val rangeQueryWindow2 = new Envelope(-20.204, 20.4376465797, -53.209588996, -30.1096863746) 292 | val rangeQueryWindow3 = new Envelope(-74.4270741441, 72.9526465797, -34.609588996, -6.5906863746) 293 | val rangeQueryWindow4 = new Envelope(-104.0938020000, 118.9526465797, -54.509588996, 40.2406863746) 294 | val rangeQueryWindow5 = new Envelope(-174.4270741441, 72.9526465797, -34.609588996, 48.4396863746) 295 | val rangeQueryWindow6 = new Envelope(-180.0, 180.0, 
-90.0, 90.0) 296 | 297 | println("************************ POLYGON Range Queries **************************************") 298 | 299 | val objectRDD = new PolygonRDD(sc, "/data/buildings_114M.csv", 0, 8, FileDataSplitter.WKT, false, 1024, StorageLevel.MEMORY_ONLY) 300 | 301 | objectRDD.buildIndex(IndexType.RTREE, false) 302 | objectRDD.indexedRawRDD.persist(StorageLevel.MEMORY_ONLY) 303 | 304 | var t0 = 0L 305 | var t1 = 0L 306 | var count1 = 0L 307 | var count = 0L 308 | 309 | // Materialize RDDs 310 | t0 = System.nanoTime() 311 | for (i <- 1 to 20) { 312 | count = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow6, false, true).count() 313 | } 314 | t1 = System.nanoTime() 315 | 316 | // Actual Measurements 317 | println("Range1: ") 318 | t0 = System.nanoTime() 319 | for (i <- 1 to nQueries) { 320 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow1, false, true).count() 321 | } 322 | t1 = System.nanoTime() 323 | println("Count: " + count1) 324 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 325 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 326 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 327 | t1 = 0L 328 | t0 = 0L 329 | 330 | println("Range2: ") 331 | t0 = System.nanoTime() 332 | for (i <- 1 to nQueries) { 333 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow2, false, true).count() 334 | } 335 | t1 = System.nanoTime() 336 | println("Count: " + count1) 337 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 338 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 339 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 340 | t1 = 0L 341 | t0 = 0L 342 | 343 | println("Range3: ") 344 | t0 = System.nanoTime() 345 | for (i <- 1 to nQueries) { 346 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow3, false, true).count() 347 | } 348 | t1 = System.nanoTime() 349 | println("Count: " + count1) 350 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 351 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 352 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 353 | t1 = 0L 354 | t0 = 0L 355 | 356 | println("Range4: ") 357 | t0 = System.nanoTime() 358 | for (i <- 1 to nQueries) { 359 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow4, false, true).count() 360 | } 361 | t1 = System.nanoTime() 362 | println("Count: " + count1) 363 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 364 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 365 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 366 | t1 = 0L 367 | t0 = 0L 368 | 369 | println("Range5: ") 370 | t0 = System.nanoTime() 371 | for (i <- 1 to nQueries) { 372 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow5, false, true).count() 373 | } 374 | t1 = System.nanoTime() 375 | println("Count: " + count1) 376 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 377 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 378 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 379 | t1 = 0L 380 | t0 = 0L 381 | 382 | println("Range6: ") 383 | t0 = System.nanoTime() 384 | for (i <- 1 to nQueries) { 385 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow6, false, true).count() 386 | } 387 | t1 = System.nanoTime() 388 | println("Count: " + count1) 389 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 390 | println("Total 
Time: " + ((t1 - t0) / 1E9) + " sec") 391 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 392 | t1 = 0L 393 | t0 = 0L 394 | 395 | objectRDD.indexedRawRDD.unpersist() 396 | objectRDD.rawSpatialRDD.unpersist() 397 | 398 | println("***********************************************************************************") 399 | println("") 400 | } 401 | 402 | def spatialRangeRectangle() { 403 | 404 | val nQueries = 100 405 | val rangeQueryWindow1 = new Envelope(-20.204, 17.9526465797, -53.209588996, -30.1096863746) 406 | val rangeQueryWindow2 = new Envelope(-20.204, 20.4376465797, -53.209588996, -30.1096863746) 407 | val rangeQueryWindow3 = new Envelope(-74.4270741441, 72.9526465797, -34.609588996, -6.5906863746) 408 | val rangeQueryWindow4 = new Envelope(-104.0938020000, 118.9526465797, -54.509588996, 40.2406863746) 409 | val rangeQueryWindow5 = new Envelope(-174.4270741441, 72.9526465797, -34.609588996, 48.4396863746) 410 | val rangeQueryWindow6 = new Envelope(-180.0, 180.0, -90.0, 90.0) 411 | 412 | println("************************ Rectangle Range Queries **************************************") 413 | 414 | val objectRDD = new RectangleRDD(sc, "/data/rectangles_114M.csv", FileDataSplitter.WKT, false, 1024, StorageLevel.MEMORY_ONLY) 415 | 416 | objectRDD.buildIndex(IndexType.RTREE, false) 417 | objectRDD.indexedRawRDD.persist(StorageLevel.MEMORY_ONLY) 418 | 419 | var t0 = 0L 420 | var t1 = 0L 421 | var count1 = 0L 422 | var count = 0L 423 | t0 = System.nanoTime() 424 | 425 | // Materialize RDDs 426 | for (i <- 1 to 20) { 427 | count = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow6, false, true).count() 428 | } 429 | t1 = System.nanoTime() 430 | 431 | // Actual Measurements 432 | println("Range1: ") 433 | t0 = System.nanoTime() 434 | for (i <- 1 to nQueries) { 435 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow1, false, true).count() 436 | } 437 | t1 = System.nanoTime() 438 | println("Count: " + count1) 439 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 440 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 441 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 442 | t1 = 0L 443 | t0 = 0L 444 | 445 | println("Range2: ") 446 | t0 = System.nanoTime() 447 | for (i <- 1 to nQueries) { 448 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow2, false, true).count() 449 | } 450 | t1 = System.nanoTime() 451 | println("Count: " + count1) 452 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 453 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 454 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 455 | t1 = 0L 456 | t0 = 0L 457 | 458 | println("Range3: ") 459 | t0 = System.nanoTime() 460 | for (i <- 1 to nQueries) { 461 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow3, false, true).count() 462 | } 463 | t1 = System.nanoTime() 464 | println("Count: " + count1) 465 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 466 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 467 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 468 | t1 = 0L 469 | t0 = 0L 470 | 471 | println("Range4: ") 472 | t0 = System.nanoTime() 473 | for (i <- 1 to nQueries) { 474 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow4, false, true).count() 475 | } 476 | t1 = System.nanoTime() 477 | println("Count: " + count1) 478 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 
479 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 480 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 481 | t1 = 0L 482 | t0 = 0L 483 | 484 | println("Range5: ") 485 | t0 = System.nanoTime() 486 | for (i <- 1 to nQueries) { 487 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow5, false, true).count() 488 | } 489 | t1 = System.nanoTime() 490 | println("Count: " + count1) 491 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 492 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 493 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 494 | t1 = 0L 495 | t0 = 0L 496 | 497 | println("Range6: ") 498 | t0 = System.nanoTime() 499 | for (i <- 1 to nQueries) { 500 | count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow6, false, true).count() 501 | } 502 | t1 = System.nanoTime() 503 | println("Count: " + count1) 504 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 505 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 506 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 507 | t1 = 0L 508 | t0 = 0L 509 | 510 | objectRDD.indexedRawRDD.unpersist() 511 | objectRDD.rawSpatialRDD.unpersist() 512 | 513 | println("***********************************************************************************") 514 | println("") 515 | } 516 | } 517 | -------------------------------------------------------------------------------- /geospark/SpatialJoins.scala: -------------------------------------------------------------------------------- 1 | package geospark.measurements 2 | 3 | import org.apache.log4j.Level 4 | import org.apache.log4j.Logger 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | import org.apache.spark.storage.StorageLevel 7 | import org.datasyslab.geospark.enums.FileDataSplitter 8 | import org.datasyslab.geospark.enums.GridType 9 | import org.datasyslab.geospark.enums.IndexType 10 | import org.datasyslab.geospark.spatialOperator.JoinQuery 11 | import org.datasyslab.geospark.spatialOperator.KNNQuery 12 | import org.datasyslab.geospark.spatialOperator.RangeQuery 13 | import org.datasyslab.geospark.spatialRDD.CircleRDD 14 | import org.datasyslab.geospark.spatialRDD.PointRDD 15 | import org.datasyslab.geospark.spatialRDD.PolygonRDD 16 | import org.datasyslab.geospark.spatialRDD.LineStringRDD 17 | import org.datasyslab.geospark.spatialRDD.RectangleRDD 18 | import com.vividsolutions.jts.geom.Coordinate 19 | import com.vividsolutions.jts.geom.Envelope 20 | import com.vividsolutions.jts.geom.GeometryFactory 21 | import org.datasyslab.geospark.formatMapper.shapefileParser.ShapefileRDD 22 | import org.apache.spark.serializer.KryoSerializer 23 | import org.datasyslab.geospark.serde.GeoSparkKryoRegistrator 24 | 25 | /* 26 | * Spatial Joins between different geometric objects using KDB and Quadtree partitioning 27 | */ 28 | 29 | object SpatialJoins { 30 | 31 | def main(args: Array[String]) { 32 | 33 | val conf = new SparkConf().setAppName("GeoSpark Spatial Joins") 34 | conf.set("spark.serializer", classOf[KryoSerializer].getName) 35 | conf.set("spark.kryo.registrator", classOf[GeoSparkKryoRegistrator].getName) 36 | val sc = new SparkContext(conf) 37 | Logger.getLogger("org").setLevel(Level.INFO) 38 | Logger.getLogger("akka").setLevel(Level.INFO) 39 | 40 | val points = "/data/points_200M.csv" 41 | val polygons = "/data/buildings_114M.csv" 42 | val rectangles = "/data/rectangles_114M.csv" 43 | val linestrings = "/data/linestrings_72M.csv" 44 | 45 | 
val quad = GridType.QUADTREE 46 | val kdb = GridType.KDBTREE 47 | val idx = IndexType.RTREE 48 | val numPartitions = 1024 49 | 50 | /*println("******************************* KDB Partitioning *******************************") 51 | runSpatialJoin(kdb,"point","point") 52 | runSpatialJoin(kdb,"point","linestring") 53 | runSpatialJoin(kdb,"point","polygon") 54 | runSpatialJoin(kdb,"point","rectangle") 55 | runSpatialJoin(kdb,"linestring","linestring") 56 | runSpatialJoin(kdb,"linestring","polygon") 57 | runSpatialJoin(kdb,"linestring","rectangle") 58 | runSpatialJoin(kdb,"rectangle","rectangle") 59 | runSpatialJoin(kdb,"rectangle","polygon") 60 | runSpatialJoin(kdb,"polygon","polygon") 61 | println("*************************** Finished KDB Partitioning ***************************")*/ 62 | println("******************************* quad Partitioning *******************************") 63 | runSpatialJoin(quad, "point", "point") 64 | runSpatialJoin(quad, "point", "linestring") 65 | runSpatialJoin(quad, "point", "polygon") 66 | runSpatialJoin(quad, "point", "rectangle") 67 | runSpatialJoin(quad, "linestring", "linestring") 68 | runSpatialJoin(quad, "linestring", "polygon") 69 | runSpatialJoin(quad, "linestring", "rectangle") 70 | runSpatialJoin(quad, "rectangle", "rectangle") 71 | runSpatialJoin(quad, "rectangle", "polygon") 72 | runSpatialJoin(quad, "polygon", "polygon") 73 | println("*************************** Finished quad Partitioning ***************************") 74 | 75 | def runSpatialJoin(partitioningScheme: org.datasyslab.geospark.enums.GridType, leftrdd: String, rightrdd: String) { 76 | 77 | var count = 0L 78 | val beginTime = System.currentTimeMillis() 79 | var t0 = 0L 80 | var t1 = 0L 81 | 82 | println("******************************** " + leftrdd + " and " + rightrdd + " spatial join ********************************") 83 | 84 | t0 = System.nanoTime() 85 | val leftRDD = leftrdd match { 86 | 87 | case "point" => new PointRDD(sc, points, FileDataSplitter.CSV, false, numPartitions, StorageLevel.MEMORY_ONLY) 88 | case "linestring" => new LineStringRDD(sc, linestrings, FileDataSplitter.WKT, false, numPartitions, StorageLevel.MEMORY_ONLY) 89 | case "rectangle" => new RectangleRDD(sc, rectangles, FileDataSplitter.WKT, false, numPartitions, StorageLevel.MEMORY_ONLY) 90 | case "polygon" => new PolygonRDD(sc, polygons, FileDataSplitter.WKT, false, numPartitions, StorageLevel.MEMORY_ONLY) 91 | 92 | } 93 | 94 | leftRDD.spatialPartitioning(partitioningScheme) 95 | 96 | leftRDD.buildIndex(idx,true) 97 | 98 | leftRDD.indexedRDD.persist(StorageLevel.MEMORY_ONLY) 99 | 100 | leftRDD.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY) 101 | 102 | val c1 = leftRDD.spatialPartitionedRDD.count() 103 | 104 | val c2 = leftRDD.indexedRDD.count() 105 | 106 | leftRDD.rawSpatialRDD.unpersist() 107 | 108 | leftRDD.spatialPartitionedRDD.unpersist() 109 | 110 | val rightRDD = rightrdd match { 111 | 112 | case "point" => new PointRDD(sc, points, FileDataSplitter.CSV, false, numPartitions, StorageLevel.MEMORY_ONLY) 113 | case "linestring" => new LineStringRDD(sc, linestrings, FileDataSplitter.WKT, false, numPartitions, StorageLevel.MEMORY_ONLY) 114 | case "rectangle" => new RectangleRDD(sc, rectangles, FileDataSplitter.WKT, false, numPartitions, StorageLevel.MEMORY_ONLY) 115 | case "polygon" => new PolygonRDD(sc, polygons, FileDataSplitter.WKT, false, numPartitions, StorageLevel.MEMORY_ONLY) 116 | 117 | } 118 | 119 | rightRDD.spatialPartitioning(leftRDD.getPartitioner) 120 | 121 | 
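    // Persisting and counting the partitioned right side forces its materialization,
    // so the read/re-partition cost is charged to rpTime below rather than to the join.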
    rightRDD.spatialPartitionedRDD.persist(StorageLevel.MEMORY_ONLY)

    val c3 = rightRDD.spatialPartitionedRDD.count()

    t1 = System.nanoTime()

    val rpTime = (t1 - t0) / 1E9

    println("Total Reading and Partitioning Time: " + rpTime + " sec")

    rightRDD.rawSpatialRDD.unpersist()

    t0 = System.nanoTime()

    count = JoinQuery.SpatialJoinQuery(leftRDD, rightRDD, true, false).count()

    t1 = System.nanoTime()
    val join_time = (t1 - t0) / 1E9
    println("Join Time: " + join_time + " sec")

    // rpTime already covers reading and partitioning both inputs.
    val total_time = rpTime + join_time

    println("Total Join Time: " + total_time + " sec")

    println("********************************************************************************************")

    leftRDD.spatialPartitionedRDD.unpersist()
    leftRDD.indexedRDD.unpersist()
    rightRDD.spatialPartitionedRDD.unpersist()
    }
  }
}
--------------------------------------------------------------------------------
/geospark/kNNQueries.scala:
--------------------------------------------------------------------------------
package geospark.measurements

import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel
import org.datasyslab.geospark.enums.FileDataSplitter
import org.datasyslab.geospark.enums.GridType
import org.datasyslab.geospark.enums.IndexType
import org.datasyslab.geospark.spatialOperator.JoinQuery
import org.datasyslab.geospark.spatialOperator.KNNQuery
import org.datasyslab.geospark.spatialOperator.RangeQuery
import org.datasyslab.geospark.spatialRDD.CircleRDD
import org.datasyslab.geospark.spatialRDD.PointRDD
import org.datasyslab.geospark.spatialRDD.PolygonRDD
import org.datasyslab.geospark.spatialRDD.LineStringRDD
import org.datasyslab.geospark.spatialRDD.RectangleRDD
import com.vividsolutions.jts.geom.Coordinate
import com.vividsolutions.jts.geom.Envelope
import com.vividsolutions.jts.geom.GeometryFactory
import org.apache.spark.serializer.KryoSerializer
import org.datasyslab.geospark.formatMapper.shapefileParser.ShapefileRDD
import org.datasyslab.geospark.serde.GeoSparkKryoRegistrator
import scala.util.Random


/**
 * kNN queries for different geometric objects.
 */

object kNNQueries extends App {

  val conf = new SparkConf().setAppName("GeoSpark kNN Queries")
  conf.set("spark.serializer", classOf[KryoSerializer].getName)
  conf.set("spark.kryo.registrator", classOf[GeoSparkKryoRegistrator].getName)
  val sc = new SparkContext(conf)
  Logger.getLogger("org").setLevel(Level.WARN)
  Logger.getLogger("akka").setLevel(Level.WARN)

  def time[R](block: => R): R = {
    val t0 = System.nanoTime()
    val result = block // call-by-name
    val t1 = System.nanoTime()
    println("Query time: " + (t1 - t0) / 1E9 + " sec ")
    result
  }

  val geometryFactory = new GeometryFactory()

  knnPoint()
  //knnLineString()
  //knnPolygon()
  //knnRectangle()

  sc.stop()

  def knnPoint() {

    val nQueries = 100
    val rangeQueryWindow6 = new Envelope(-180.0, 180.0, -90.0, 90.0)
    val random = scala.util.Random

    println("************************ POINT KNN Queries **************************************")

    val objectRDD = new PointRDD(sc, "/data/points_200M.csv", 0, FileDataSplitter.CSV, false, 1024, StorageLevel.MEMORY_ONLY)

    objectRDD.buildIndex(IndexType.RTREE, false)
    objectRDD.indexedRawRDD.persist(StorageLevel.MEMORY_ONLY)

    var t0 = 0L
    var t1 = 0L
    var show = 0L

    // Warm-up: materialize the indexed RDD before measuring
    t0 = System.nanoTime()
    for (i <- 1 to 20) {
      val count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow6, false, true).count()
    }
    t1 = System.nanoTime()

    println("k=1")
    t0 = System.nanoTime()
    for (i <- 1 to nQueries) {
      var lat = (random.nextDouble() * 2 - 1) * 90
      var long = (random.nextDouble() * 2 - 1) * 180
      val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat))
      val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 1, true) // flag true to use index
    }
    t1 = System.nanoTime()
    println("Total Time: " + ((t1 - t0) / 1E9) + " sec")
    println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min")
    t1 = 0L
    t0 = 0L

    println("k=5")
    t0 = System.nanoTime() // was missing: without it, the k=5 run was timed from t0 = 0
    for (i <- 1 to nQueries) {
      var lat = (random.nextDouble() * 2 - 1) * 90
      var long = (random.nextDouble() * 2 - 1) * 180
      val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat))
      val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 5, true) // flag true to use index
    }
    t1 = System.nanoTime()
    println("Total Time: " + ((t1 - t0) / 1E9) + " sec")
    println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min")
    t1 = 0L
    t0 = 0L

    println("k=10")
    t0 = System.nanoTime()
    for (i <- 1 to nQueries) {
      var lat = (random.nextDouble() * 2 - 1) * 90
      var long = (random.nextDouble() * 2 - 1) * 180
      val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat))
      val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 10, true) // flag true to use index
    }
    t1 = System.nanoTime()
    println("Total Time: " + ((t1 - t0) / 1E9) + " sec")
    println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min")
    t1 = 0L
    t0 = 0L

    println("k=20")
    t0 = System.nanoTime()
    for (i <- 1 to nQueries) {
      var lat = (random.nextDouble() * 2 - 1) * 90
      var long = (random.nextDouble() * 2 - 1) * 180
      val kNNQueryPoint =
geometryFactory.createPoint(new Coordinate(long, lat)) 128 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 20, true) // flag true to use index 129 | } 130 | t1 = System.nanoTime() 131 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 132 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 133 | t1 = 0L 134 | t0 = 0L 135 | 136 | println("k=30") 137 | t0 = System.nanoTime() 138 | for (i <- 1 to nQueries) { 139 | var lat = (random.nextDouble() * 2 - 1) * 90 140 | var long = (random.nextDouble() * 2 - 1) * 180 141 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 142 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 30, true) // flag true to use index 143 | } 144 | t1 = System.nanoTime() 145 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 146 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 147 | t1 = 0L 148 | t0 = 0L 149 | 150 | println("k=40") 151 | t0 = System.nanoTime() 152 | for (i <- 1 to nQueries) { 153 | var lat = (random.nextDouble() * 2 - 1) * 90 154 | var long = (random.nextDouble() * 2 - 1) * 180 155 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 156 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 40, true) // flag true to use index 157 | } 158 | t1 = System.nanoTime() 159 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 160 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 161 | t1 = 0L 162 | t0 = 0L 163 | 164 | println("k=50") 165 | t0 = System.nanoTime() 166 | for (i <- 1 to nQueries) { 167 | var lat = (random.nextDouble() * 2 - 1) * 90 168 | var long = (random.nextDouble() * 2 - 1) * 180 169 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 170 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 50, true) // flag true to use index 171 | } 172 | t1 = System.nanoTime() 173 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 174 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 175 | t1 = 0L 176 | t0 = 0L 177 | 178 | objectRDD.indexedRawRDD.unpersist() 179 | objectRDD.rawSpatialRDD.unpersist() 180 | 181 | println("***********************************************************************************") 182 | println("") 183 | } 184 | 185 | def knnLineString() { 186 | 187 | val nQueries = 100 188 | var tTime = 0L 189 | val rangeQueryWindow6 = new Envelope(-180.0, 180.0, -90.0, 90.0) 190 | val random = scala.util.Random 191 | 192 | println("************************ LineString KNN Queries **************************************") 193 | 194 | val objectRDD = new LineStringRDD(sc, "/data/linestrings_72M.csv", FileDataSplitter.WKT, false, 1024, StorageLevel.MEMORY_ONLY) 195 | 196 | objectRDD.buildIndex(IndexType.RTREE, false) 197 | objectRDD.indexedRawRDD.persist(StorageLevel.MEMORY_ONLY) 198 | 199 | 200 | var t0 = 0L 201 | var t1 = 0L 202 | 203 | t0 = System.nanoTime() 204 | for (i <- 1 to 20) { 205 | val count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow6, false, true).count() 206 | } 207 | t1 = System.nanoTime() 208 | 209 | println("k=1") 210 | t0 = System.nanoTime() 211 | for (i <- 1 to nQueries) { 212 | var lat = (random.nextDouble() * 2 - 1) * 90 213 | var long = (random.nextDouble() * 2 - 1) * 180 214 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 215 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 1, true) // flag true to use 
index 216 | } 217 | t1 = System.nanoTime() 218 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 219 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 220 | t1 = 0L 221 | t0 = 0L 222 | 223 | println("k=5") 224 | t0 = System.nanoTime() 225 | for (i <- 1 to nQueries) { 226 | var lat = (random.nextDouble() * 2 - 1) * 90 227 | var long = (random.nextDouble() * 2 - 1) * 180 228 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 229 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 5, true) // flag true to use index 230 | } 231 | t1 = System.nanoTime() 232 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 233 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 234 | t1 = 0L 235 | t0 = 0L 236 | 237 | println("k=10") 238 | t0 = System.nanoTime() 239 | for (i <- 1 to nQueries) { 240 | var lat = (random.nextDouble() * 2 - 1) * 90 241 | var long = (random.nextDouble() * 2 - 1) * 180 242 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 243 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 10, true) // flag true to use index 244 | } 245 | t1 = System.nanoTime() 246 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 247 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 248 | t1 = 0L 249 | t0 = 0L 250 | 251 | println("k=20") 252 | t0 = System.nanoTime() 253 | for (i <- 1 to nQueries) { 254 | var lat = (random.nextDouble() * 2 - 1) * 90 255 | var long = (random.nextDouble() * 2 - 1) * 180 256 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 257 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 20, true) // flag true to use index 258 | } 259 | t1 = System.nanoTime() 260 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 261 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 262 | t1 = 0L 263 | t0 = 0L 264 | 265 | println("k=30") 266 | t0 = System.nanoTime() 267 | for (i <- 1 to nQueries) { 268 | var lat = (random.nextDouble() * 2 - 1) * 90 269 | var long = (random.nextDouble() * 2 - 1) * 180 270 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 271 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 30, true) // flag true to use index 272 | } 273 | t1 = System.nanoTime() 274 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 275 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 276 | t1 = 0L 277 | t0 = 0L 278 | 279 | println("k=40") 280 | t0 = System.nanoTime() 281 | for (i <- 1 to nQueries) { 282 | var lat = (random.nextDouble() * 2 - 1) * 90 283 | var long = (random.nextDouble() * 2 - 1) * 180 284 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 285 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 40, true) // flag true to use index 286 | } 287 | t1 = System.nanoTime() 288 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 289 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 290 | t1 = 0L 291 | t0 = 0L 292 | 293 | println("k=50") 294 | t0 = System.nanoTime() 295 | for (i <- 1 to nQueries) { 296 | var lat = (random.nextDouble() * 2 - 1) * 90 297 | var long = (random.nextDouble() * 2 - 1) * 180 298 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 299 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 50, true) // flag true to use 
index 300 | } 301 | t1 = System.nanoTime() 302 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 303 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 304 | 305 | objectRDD.indexedRawRDD.unpersist() 306 | objectRDD.rawSpatialRDD.unpersist() 307 | 308 | println("***********************************************************************************") 309 | println("") 310 | } 311 | 312 | def knnPolygon() { 313 | 314 | val nQueries = 100 315 | var tTime = 0L 316 | val rangeQueryWindow6 = new Envelope(-180.0, 180.0, -90.0, 90.0) 317 | val random = scala.util.Random 318 | 319 | println("************************ POLYGON KNN Queries **************************************") 320 | 321 | val objectRDD = new PolygonRDD(sc, "/data/buildings_114M.csv", 0, 8, FileDataSplitter.WKT, false, 1024, StorageLevel.MEMORY_ONLY) 322 | 323 | objectRDD.buildIndex(IndexType.RTREE, false) 324 | objectRDD.indexedRawRDD.persist(StorageLevel.MEMORY_ONLY) 325 | 326 | var t0 = 0L 327 | var t1 = 0L 328 | 329 | t0 = System.nanoTime() 330 | for (i <- 1 to 20) { 331 | val count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow6, false, true).count() 332 | } 333 | t1 = System.nanoTime() 334 | 335 | println("k=1") 336 | t0 = System.nanoTime() 337 | for (i <- 1 to nQueries) { 338 | var lat = (random.nextDouble() * 2 - 1) * 90 339 | var long = (random.nextDouble() * 2 - 1) * 180 340 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 341 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 1, true) // flag true to use index 342 | } 343 | t1 = System.nanoTime() 344 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 345 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 346 | t1 = 0L 347 | t0 = 0L 348 | 349 | println("k=5") 350 | t0 = System.nanoTime() 351 | for (i <- 1 to nQueries) { 352 | var lat = (random.nextDouble() * 2 - 1) * 90 353 | var long = (random.nextDouble() * 2 - 1) * 180 354 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 355 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 5, true) // flag true to use index 356 | } 357 | t1 = System.nanoTime() 358 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 359 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 360 | t1 = 0L 361 | t0 = 0L 362 | 363 | println("k=10") 364 | t0 = System.nanoTime() 365 | for (i <- 1 to nQueries) { 366 | var lat = (random.nextDouble() * 2 - 1) * 90 367 | var long = (random.nextDouble() * 2 - 1) * 180 368 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 369 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 10, true) // flag true to use index 370 | } 371 | t1 = System.nanoTime() 372 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 373 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 374 | t1 = 0L 375 | t0 = 0L 376 | 377 | println("k=20") 378 | t0 = System.nanoTime() 379 | for (i <- 1 to nQueries) { 380 | var lat = (random.nextDouble() * 2 - 1) * 90 381 | var long = (random.nextDouble() * 2 - 1) * 180 382 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 383 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 20, true) // flag true to use index 384 | } 385 | t1 = System.nanoTime() 386 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 387 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 388 | 
t1 = 0L 389 | t0 = 0L 390 | 391 | println("k=30") 392 | t0 = System.nanoTime() 393 | for (i <- 1 to nQueries) { 394 | var lat = (random.nextDouble() * 2 - 1) * 90 395 | var long = (random.nextDouble() * 2 - 1) * 180 396 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 397 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 30, true) // flag true to use index 398 | } 399 | t1 = System.nanoTime() 400 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 401 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 402 | t1 = 0L 403 | t0 = 0L 404 | 405 | println("k=40") 406 | t0 = System.nanoTime() 407 | for (i <- 1 to nQueries) { 408 | var lat = (random.nextDouble() * 2 - 1) * 90 409 | var long = (random.nextDouble() * 2 - 1) * 180 410 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 411 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 40, true) // flag true to use index 412 | } 413 | t1 = System.nanoTime() 414 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 415 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 416 | t1 = 0L 417 | t0 = 0L 418 | 419 | println("k=50") 420 | t0 = System.nanoTime() 421 | for (i <- 1 to nQueries) { 422 | var lat = (random.nextDouble() * 2 - 1) * 90 423 | var long = (random.nextDouble() * 2 - 1) * 180 424 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 425 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 50, true) // flag true to use index 426 | } 427 | t1 = System.nanoTime() 428 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 429 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 430 | 431 | objectRDD.indexedRawRDD.unpersist() 432 | objectRDD.rawSpatialRDD.unpersist() 433 | 434 | println("***********************************************************************************") 435 | println("") 436 | } 437 | 438 | def knnRectangle() { 439 | 440 | val nQueries = 100 441 | var tTime = 0L 442 | val rangeQueryWindow6 = new Envelope(-180.0, 180.0, -90.0, 90.0) 443 | val random = scala.util.Random 444 | 445 | println("************************ Rectangle KNN Queries **************************************") 446 | 447 | val objectRDD = new RectangleRDD(sc, "/data/rectangles_114M.csv", FileDataSplitter.WKT, false, 1024, StorageLevel.MEMORY_ONLY) 448 | 449 | objectRDD.buildIndex(IndexType.RTREE, false) 450 | objectRDD.indexedRawRDD.persist(StorageLevel.MEMORY_ONLY) 451 | 452 | var t0 = 0L 453 | var t1 = 0L 454 | 455 | t0 = System.nanoTime() 456 | for (i <- 1 to 20) { 457 | val count1 = RangeQuery.SpatialRangeQuery(objectRDD, rangeQueryWindow6, false, true).count() 458 | } 459 | t1 = System.nanoTime() 460 | 461 | println("k=1") 462 | t0 = System.nanoTime() 463 | for (i <- 1 to nQueries) { 464 | var lat = (random.nextDouble() * 2 - 1) * 90 465 | var long = (random.nextDouble() * 2 - 1) * 180 466 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 467 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 1, true) // flag true to use index 468 | } 469 | t1 = System.nanoTime() 470 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 471 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 472 | t1 = 0L 473 | t0 = 0L 474 | 475 | println("k=5") 476 | t0 = System.nanoTime() 477 | for (i <- 1 to nQueries) { 478 | var lat = (random.nextDouble() * 2 - 1) * 90 479 | var long = 
(random.nextDouble() * 2 - 1) * 180 480 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 481 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 5, true) // flag true to use index 482 | } 483 | t1 = System.nanoTime() 484 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 485 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 486 | t1 = 0L 487 | t0 = 0L 488 | 489 | println("k=10") 490 | t0 = System.nanoTime() 491 | for (i <- 1 to nQueries) { 492 | var lat = (random.nextDouble() * 2 - 1) * 90 493 | var long = (random.nextDouble() * 2 - 1) * 180 494 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 495 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 10, true) // flag true to use index 496 | } 497 | t1 = System.nanoTime() 498 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 499 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 500 | t1 = 0L 501 | t0 = 0L 502 | 503 | println("k=20") 504 | t0 = System.nanoTime() 505 | for (i <- 1 to nQueries) { 506 | var lat = (random.nextDouble() * 2 - 1) * 90 507 | var long = (random.nextDouble() * 2 - 1) * 180 508 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 509 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 20, true) // flag true to use index 510 | } 511 | t1 = System.nanoTime() 512 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 513 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 514 | t1 = 0L 515 | t0 = 0L 516 | 517 | println("k=30") 518 | t0 = System.nanoTime() 519 | for (i <- 1 to nQueries) { 520 | var lat = (random.nextDouble() * 2 - 1) * 90 521 | var long = (random.nextDouble() * 2 - 1) * 180 522 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 523 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 30, true) // flag true to use index 524 | } 525 | t1 = System.nanoTime() 526 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 527 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 528 | t1 = 0L 529 | t0 = 0L 530 | 531 | println("k=40") 532 | t0 = System.nanoTime() 533 | for (i <- 1 to nQueries) { 534 | var lat = (random.nextDouble() * 2 - 1) * 90 535 | var long = (random.nextDouble() * 2 - 1) * 180 536 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 537 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 40, true) // flag true to use index 538 | } 539 | t1 = System.nanoTime() 540 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 541 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 542 | t1 = 0L 543 | t0 = 0L 544 | 545 | println("k=50") 546 | t0 = System.nanoTime() 547 | for (i <- 1 to nQueries) { 548 | var lat = (random.nextDouble() * 2 - 1) * 90 549 | var long = (random.nextDouble() * 2 - 1) * 180 550 | val kNNQueryPoint = geometryFactory.createPoint(new Coordinate(long, lat)) 551 | val count1 = KNNQuery.SpatialKnnQuery(objectRDD, kNNQueryPoint, 50, true) // flag true to use index 552 | } 553 | t1 = System.nanoTime() 554 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 555 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 556 | 557 | objectRDD.indexedRawRDD.unpersist() 558 | objectRDD.rawSpatialRDD.unpersist() 559 | 560 | 
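// Note (added): each k above is measured over nQueries = 100 kNN queries at uniformly
// random (long, lat) points, and throughput is reported as (nQueries * 60) / elapsed_seconds,
// i.e. queries per minute. The unlabelled range-query loop right after buildIndex() is a
// warm-up pass that materializes the persisted R-tree before any measurement starts.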
println("***********************************************************************************") 561 | println("") 562 | } 563 | } 564 | -------------------------------------------------------------------------------- /locationspark/RangeQueries.scala: -------------------------------------------------------------------------------- 1 | package locationspark.measurements 2 | 3 | import cs.purdue.edu.spatialrdd.SpatialRDD 4 | import cs.purdue.edu.spatialindex.rtree._ 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.SparkConf 7 | import com.vividsolutions.jts.io.WKTReader 8 | //import cs.purdue.edu.spatialindex.qtree.{Box, Point} 9 | import cs.purdue.edu.spatialindex.rtree.{Box, Entry, Point} 10 | import cs.purdue.edu.spatialrdd.SpatialRDD 11 | import cs.purdue.edu.spatialrdd.impl.Util 12 | import org.apache.spark.{SparkConf, SparkContext} 13 | import scala.util.Try 14 | import org.apache.spark.{SparkConf, SparkContext} 15 | 16 | object RangeQueries { 17 | 18 | def main(args: Array[String]) { 19 | 20 | val conf = new SparkConf().setAppName("LocationSpark Range Queries") 21 | val sc = new SparkContext(conf) 22 | sc.setLogLevel("OFF") 23 | Util.localIndex = "QTREE" 24 | val input = "/data/points_200M_wkt.csv" 25 | 26 | val rangeQueryWindow1 = Box(-50.3010141441f, -53.209588996f, -24.9526465797f, -30.1096863746f) 27 | val rangeQueryWindow2 = Box(-54.4270741441f, -53.209588996f, -24.9526465797f, -30.1096863746f) 28 | val rangeQueryWindow3 = Box(-114.4270741441f, -54.509588996f, 42.9526465797f, -27.0106863746f) 29 | val rangeQueryWindow4 = Box(-82.7638020000f, -54.509588996f, 42.9526465797f, 38.0106863746f) 30 | val rangeQueryWindow5 = Box(-140.99778f, -52.6480987209f, 5.7305630159f, 83.23324f) 31 | val rangeQueryWindow6 = Box(-180.0f, -90.0f, 180.0f, 90.0f) 32 | 33 | val nQueries = 100 34 | var t0 = 0L 35 | var t1 = 0L 36 | 37 | val leftpoints = sc.textFile("/data/points_200M_wkt.csv", 1024).map(x => (Try(new WKTReader().read(x)))).filter(_.isSuccess).map { 38 | case x => 39 | val corrds = x.get.getCoordinates 40 | val p1 = corrds(0) 41 | (Point(p1.x.toFloat, p1.y.toFloat), "1") 42 | } 43 | 44 | val leftLocationRDD = SpatialRDD(leftpoints).cache() 45 | val count = leftLocationRDD.count() 46 | 47 | println("Range1: 0.00001 %") 48 | t0 = System.nanoTime() 49 | for (i <- 1 to nQueries) { 50 | var count1 = leftLocationRDD.rangeFilter(rangeQueryWindow1, (id) => true) 51 | } 52 | t1 = System.nanoTime() 53 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 54 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 55 | 56 | println("Range2: 0.001 %") 57 | t0 = System.nanoTime() 58 | for (i <- 1 to nQueries) { 59 | var count1 = leftLocationRDD.rangeFilter(rangeQueryWindow2, (id) => true) 60 | } 61 | t1 = System.nanoTime() 62 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 63 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 64 | 65 | println("Range3: 1.0 %") 66 | t0 = System.nanoTime() 67 | for (i <- 1 to nQueries) { 68 | var count1 = leftLocationRDD.rangeFilter(rangeQueryWindow3, (id) => true) 69 | } 70 | t1 = System.nanoTime() 71 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 72 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 73 | 74 | println("Range4: 10.0 %") 75 | t0 = System.nanoTime() 76 | for (i <- 1 to nQueries) { 77 | var count1 = leftLocationRDD.rangeFilter(rangeQueryWindow4, (id) => true) 78 | } 79 | t1 = System.nanoTime() 80 | println("Total Time: " + ((t1 - t0) 
/ 1E9) + " sec") 81 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 82 | 83 | println("Range5: 50.0 %") 84 | t0 = System.nanoTime() 85 | for (i <- 1 to nQueries) { 86 | var count1 = leftLocationRDD.rangeFilter(rangeQueryWindow5, (id) => true) 87 | } 88 | t1 = System.nanoTime() 89 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 90 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 91 | 92 | println("Range6: 100.0 %") 93 | t0 = System.nanoTime() 94 | for (i <- 1 to nQueries) { 95 | var count1 = leftLocationRDD.rangeFilter(rangeQueryWindow6, (id) => true) 96 | } 97 | t1 = System.nanoTime() 98 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 99 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 100 | 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /locationspark/SpatialJoins.scala: -------------------------------------------------------------------------------- 1 | package locationspark.measurements 2 | 3 | import cs.purdue.edu.spatialrdd.SpatialRDD 4 | import cs.purdue.edu.spatialindex.rtree._ 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.SparkConf 7 | import com.vividsolutions.jts.io.WKTReader 8 | //import cs.purdue.edu.spatialindex.qtree.{Box, Point} 9 | import cs.purdue.edu.spatialindex.rtree.{Box, Entry, Point} 10 | import cs.purdue.edu.spatialrdd.SpatialRDD 11 | import cs.purdue.edu.spatialrdd.impl.Util 12 | import org.apache.spark.{SparkConf, SparkContext} 13 | import scala.util.Try 14 | 15 | object SpatialJoins { 16 | 17 | def main(args: Array[String]) { 18 | 19 | val conf = new SparkConf().setAppName("LocationSpark Spatial Joins") 20 | val sc = new SparkContext(conf) 21 | var count = 0L 22 | sc.setLogLevel("OFF") 23 | Util.localIndex = "QTREE" 24 | var t0 = 0L 25 | var t1 = 0L 26 | 27 | def aggfunction1[K, V](itr: Iterator[(K, V)]): Int = { 28 | itr.size 29 | } 30 | 31 | def aggfunction2(v1: Int, v2: Int): Int = { 32 | v1 + v2 33 | } 34 | 35 | t0 = System.nanoTime() 36 | val leftpoints = sc.textFile("/data/points_200M_wkt.csv", 1024).map(x => (Try(new WKTReader().read(x)))).filter(_.isSuccess).map { 37 | case x => 38 | val corrds = x.get.getCoordinates 39 | val p1 = corrds(0) 40 | (Point(p1.x.toFloat, p1.y.toFloat), "1") 41 | } 42 | val leftLocationRDD = SpatialRDD(leftpoints).cache() 43 | count = leftLocationRDD.count() 44 | t1 = System.nanoTime() 45 | val leftTime = (t1 - t0) / 1E9 46 | println("Left Indexing Time: " + ((t1 - t0) / 1E9) + " sec") 47 | 48 | t0 = System.nanoTime() 49 | val rightData = sc.textFile("/data/rectangles_114M.csv") 50 | val rightBoxes = rightData.map(x => (Try(new WKTReader().read(x)))).filter(_.isSuccess).map { 51 | case x => 52 | val corrds = x.get.getCoordinates 53 | val p1 = corrds(0) 54 | val p2 = corrds(2) 55 | Box(p1.x.toFloat, p1.y.toFloat, p2.x.toFloat, p2.y.toFloat) 56 | }.cache() 57 | val count1 = rightBoxes.count() 58 | t1 = System.nanoTime() 59 | val rightTime = (t1 - t0) / 1E9 60 | println("Right Indexing Time: " + rightTime + " sec") 61 | 62 | t0 = System.nanoTime() 63 | val joinresultRdd0 = leftLocationRDD.rjoin(rightBoxes)(aggfunction1, aggfunction2) 64 | val tuples0 = joinresultRdd0.map { case (b, v) => (1, v) }.reduceByKey { case (a, b) => { 65 | a + b 66 | } 67 | }.map { case (a, b) => b }.collect() 68 | val count0 = tuples0(0) 69 | t1 = System.nanoTime() 70 | val time0 = (t1 - t0) / 1E9 71 | 72 | val total_time = leftTime + rightTime + time0 73 | 
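// Note (added): the total reported below is left indexing time + right indexing time +
// join execution time, so the one-off cost of building both inputs is charged to the join.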
println("Total Join Time: " + total_time + " sec") 74 | } 75 | } -------------------------------------------------------------------------------- /locationspark/kNNJoin.scala: -------------------------------------------------------------------------------- 1 | package locationspark.measurements 2 | 3 | import cs.purdue.edu.spatialrdd.SpatialRDD 4 | import cs.purdue.edu.spatialindex.rtree._ 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.SparkConf 7 | import com.vividsolutions.jts.io.WKTReader 8 | //import cs.purdue.edu.spatialindex.qtree.{Box, Point} 9 | import cs.purdue.edu.spatialindex.rtree.{Box, Entry, Point} 10 | import cs.purdue.edu.spatialrdd.SpatialRDD 11 | import cs.purdue.edu.spatialrdd.impl.Util 12 | import cs.purdue.edu.spatialrdd.impl.{Util, knnJoinRDD} 13 | import org.apache.spark.{SparkConf, SparkContext} 14 | import scala.util.Try 15 | 16 | object kNNJoin { 17 | 18 | def main(args: Array[String]) { 19 | 20 | val conf = new SparkConf().setAppName("LocationSpark kNN Join") 21 | val sc = new SparkContext(conf) 22 | var count = 0L 23 | sc.setLogLevel("OFF") 24 | Util.localIndex = "QTREE" 25 | var t0 = 0L 26 | var t1 = 0L 27 | val knn = 5 28 | 29 | def aggfunction1[K, V](itr: Iterator[(K, V)]): Int = { 30 | itr.size 31 | } 32 | 33 | def aggfunction2(v1: Int, v2: Int): Int = { 34 | v1 + v2 35 | } 36 | 37 | t0 = System.nanoTime() 38 | /** **********************************************************************************/ 39 | val leftpoints = sc.textFile("/data/points_10M_wkt.csv").map(x => (Try(new WKTReader().read(x)))) 40 | .filter(_.isSuccess).map { 41 | case x => 42 | val corrds = x.get.getCoordinates 43 | val p1 = corrds(0) 44 | (Point(p1.x.toFloat, p1.y.toFloat), "1") 45 | } 46 | val leftLocationRDD = SpatialRDD(leftpoints).cache() 47 | /** **********************************************************************************/ 48 | count = leftLocationRDD.count() 49 | t1 = System.nanoTime() 50 | val leftTime = (t1 - t0) / 1E9 51 | println("Left Indexing Time: " + ((t1 - t0) / 1E9) + " sec") 52 | 53 | t0 = System.nanoTime() 54 | /** **********************************************************************************/ 55 | val rightpoints = sc.textFile("/data/points_10M_wkt.csv").map(x => (Try(new WKTReader().read(x)))) 56 | .filter(_.isSuccess).map { 57 | case x => 58 | val corrds = x.get.getCoordinates 59 | val p1 = corrds(0) 60 | (Point(p1.x.toFloat, p1.y.toFloat)) 61 | }.cache() 62 | /** **********************************************************************************/ 63 | val count1 = rightpoints.count() 64 | t1 = System.nanoTime() 65 | val rightTime = (t1 - t0) / 1E9 66 | println("Right Indexing Time: " + rightTime + " sec") 67 | 68 | 69 | t0 = System.nanoTime() 70 | 71 | val knnjoin = new knnJoinRDD[Point, String](leftLocationRDD, rightpoints, knn, (id) => true, (id) => true) 72 | 73 | val knnjoinresult = knnjoin.rangebasedKnnjoin() 74 | 75 | val tuples = knnjoinresult.map { case (b, v) => (1, v.size) }.reduceByKey { case (a, b) => { 76 | a + b 77 | } 78 | }.map { case (a, b) => b }.collect() 79 | 80 | println("k: " + knn) 81 | println("kNN Join Results Size: " + tuples(0)) 82 | 83 | t1 = System.nanoTime() 84 | val join_time = (t1 - t0) / 1E9 85 | println("kNN Join Time: " + join_time + " sec") 86 | 87 | val total_time = leftTime + rightTime + join_time 88 | //val total_time = 0L 89 | println("Total Join Time: " + total_time + " sec") 90 | } 91 | } -------------------------------------------------------------------------------- 
/locationspark/kNNQueries.scala: -------------------------------------------------------------------------------- 1 | package locationspark.measurements 2 | 3 | import cs.purdue.edu.spatialrdd.SpatialRDD 4 | import cs.purdue.edu.spatialindex.rtree._ 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.SparkConf 7 | import com.vividsolutions.jts.io.WKTReader 8 | //import cs.purdue.edu.spatialindex.qtree.{Box, Point} 9 | import cs.purdue.edu.spatialindex.rtree.{Box, Entry, Point} 10 | import cs.purdue.edu.spatialrdd.SpatialRDD 11 | import cs.purdue.edu.spatialrdd.impl.Util 12 | import org.apache.spark.{SparkConf, SparkContext} 13 | import scala.util.Try 14 | import scala.util.Random 15 | import org.apache.spark.{SparkConf, SparkContext} 16 | 17 | object kNNQueries { 18 | 19 | def main(args: Array[String]) { 20 | 21 | val conf = new SparkConf().setAppName("LocationSpark kNN Queries") 22 | val sc = new SparkContext(conf) 23 | sc.setLogLevel("OFF") 24 | Util.localIndex = "QTREE" 25 | val input = "/data/points_200M_wkt.csv" 26 | val random = scala.util.Random 27 | 28 | println("************************************** LocationSpark kNN Queries **************************************************") 29 | 30 | val rangeQueryWindow1 = Box(-50.3010141441f, -53.209588996f, -24.9526465797f, -30.1096863746f) 31 | val rangeQueryWindow2 = Box(-54.4270741441f, -53.209588996f, -24.9526465797f, -30.1096863746f) 32 | val rangeQueryWindow3 = Box(-114.4270741441f, -54.509588996f, 42.9526465797f, -27.0106863746f) 33 | val rangeQueryWindow4 = Box(-82.7638020000f, -54.509588996f, 42.9526465797f, 38.0106863746f) 34 | val rangeQueryWindow5 = Box(-140.99778f, -52.6480987209f, 5.7305630159f, 83.23324f) 35 | val rangeQueryWindow6 = Box(-180.0f, -90.0f, 180.0f, 90.0f) 36 | 37 | val nQueries = 100 38 | var t0 = 0L 39 | var t1 = 0L 40 | 41 | t0 = System.nanoTime() 42 | val leftpoints = sc.textFile("/data/points_200M_wkt.csv", 1024).map(x => (Try(new WKTReader().read(x)))).filter(_.isSuccess).map { 43 | case x => 44 | val corrds = x.get.getCoordinates 45 | val p1 = corrds(0) 46 | (Point(p1.x.toFloat, p1.y.toFloat), "1") 47 | } 48 | val leftLocationRDD = SpatialRDD(leftpoints).cache() 49 | val size = leftLocationRDD.count() 50 | t1 = System.nanoTime() 51 | var read_time = ((t1 - t0) / 1E9) 52 | println("Left Indexing Time: " + read_time + " sec") 53 | var knnresults = leftLocationRDD.knnFilter(Point(107.32653223f, 44.323999f), 5, (id) => true) 54 | knnresults.size 55 | 56 | t1 = 0L 57 | t0 = 0L 58 | 59 | println("k=10") 60 | t0 = System.nanoTime() 61 | for (i <- 1 to nQueries) { 62 | var lat = (random.nextFloat() * 2 - 1) * 90 63 | var long = (random.nextFloat() * 2 - 1) * 180 64 | var qPoint = Point(long, lat) 65 | var count1 = leftLocationRDD.knnFilter(qPoint, 10, (id) => true) 66 | val count = count1.size 67 | } 68 | t1 = System.nanoTime() 69 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 70 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 71 | t1 = 0L 72 | t0 = 0L 73 | 74 | println("k=20") 75 | t0 = System.nanoTime() 76 | for (i <- 1 to nQueries) { 77 | var lat = (random.nextFloat() * 2 - 1) * 90 78 | var long = (random.nextFloat() * 2 - 1) * 180 79 | var qPoint = Point(long, lat) 80 | var count1 = leftLocationRDD.knnFilter(qPoint, 20, (id) => true) 81 | val count = count1.size 82 | } 83 | t1 = System.nanoTime() 84 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 85 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 86 | t1 = 0L 
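// Note (added): query points for every k above and below are drawn uniformly over the
// full domain: lat = (random.nextFloat() * 2 - 1) * 90 and long = (random.nextFloat() * 2 - 1) * 180,
// so each k value is timed against an independent batch of nQueries = 100 random points.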
87 | t0 = 0L 88 | 89 | println("k=30") 90 | t0 = System.nanoTime() 91 | for (i <- 1 to nQueries) { 92 | var lat = (random.nextFloat() * 2 - 1) * 90 93 | var long = (random.nextFloat() * 2 - 1) * 180 94 | var qPoint = Point(long, lat) 95 | var count1 = leftLocationRDD.knnFilter(qPoint, 30, (id) => true) 96 | val count = count1.size 97 | } 98 | t1 = System.nanoTime() 99 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 100 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 101 | t1 = 0L 102 | t0 = 0L 103 | 104 | println("k=40") 105 | t0 = System.nanoTime() 106 | for (i <- 1 to nQueries) { 107 | var lat = (random.nextFloat() * 2 - 1) * 90 108 | var long = (random.nextFloat() * 2 - 1) * 180 109 | var qPoint = Point(long, lat) 110 | var count1 = leftLocationRDD.knnFilter(qPoint, 40, (id) => true) 111 | val count = count1.size 112 | } 113 | t1 = System.nanoTime() 114 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 115 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 116 | t1 = 0L 117 | t0 = 0L 118 | 119 | println("k=50") 120 | t0 = System.nanoTime() 121 | for (i <- 1 to nQueries) { 122 | var lat = (random.nextFloat() * 2 - 1) * 90 123 | var long = (random.nextFloat() * 2 - 1) * 180 124 | var qPoint = Point(long, lat) 125 | var count1 = leftLocationRDD.knnFilter(qPoint, 50, (id) => true) 126 | val count = count1.size 127 | } 128 | t1 = System.nanoTime() 129 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 130 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 131 | t1 = 0L 132 | t0 = 0L 133 | 134 | println("****************************************************************************************") 135 | println() 136 | 137 | } 138 | } -------------------------------------------------------------------------------- /magellan/RangeQueries.scala: -------------------------------------------------------------------------------- 1 | package magellan.measurements 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import org.apache.spark.sql.SQLContext 5 | import org.apache.spark.util.SizeEstimator 6 | import org.apache.spark.rdd.RDD 7 | import magellan._ 8 | import org.apache.spark.sql.magellan.dsl.expressions._ 9 | import org.apache.hadoop.mapreduce.TaskInputOutputContext 10 | import org.apache.spark.sql.Row 11 | import org.apache.spark.sql.functions._ 12 | import org.apache.spark.sql.types._ 13 | import java.util.UUID 14 | import org.apache.hadoop.mapreduce.TaskInputOutputContext 15 | import magellan.index._ 16 | import fastparse.all._ 17 | import fastparse.core.Parsed.{Failure, Success} 18 | import scala.collection.mutable.ListBuffer 19 | import magellan.{BoundingBox, Point, Polygon, PolygonDeserializer} 20 | 21 | object RangeQueries { 22 | 23 | def main(args: Array[String]) { 24 | 25 | val conf = new SparkConf().setAppName("Magellan Range Queries") 26 | val spark = new SparkContext(conf) 27 | val sqlContext = new org.apache.spark.sql.SQLContext(spark) 28 | import sqlContext.implicits._ 29 | 30 | def time[R](block: => R): R = { 31 | val t0 = System.nanoTime() 32 | val result = block // call-by-name 33 | val t1 = System.nanoTime() 34 | println("Elapsed time: " + (t1 - t0) / 1E9 + " sec ") 35 | result 36 | } 37 | 38 | spatialRangePoint() 39 | spatialRangeLineString() 40 | spatialRangeRectangle() 41 | spatialRangePolygon() 42 | 43 | spark.stop() 44 | 45 | def spatialRangePoint() { 46 | 47 | println("************************ POINT Range Queries 
**************************************") 48 | val nQueries = 100 49 | var count = 0L 50 | val rangeQueryWindow1 = BoundingBox(-50.3010141441, -53.209588996, -24.9526465797, -30.1096863746) 51 | val rangeQueryWindow2 = BoundingBox(-54.4270741441, -53.209588996, -24.9526465797, -30.1096863746) 52 | val rangeQueryWindow3 = BoundingBox(-114.4270741441, -54.509588996, 42.9526465797, -27.0106863746) 53 | val rangeQueryWindow4 = BoundingBox(-82.7638020000, -54.509588996, 42.9526465797, 38.0106863746) 54 | val rangeQueryWindow5 = BoundingBox(-140.99778, -52.6480987209, 5.7305630159, 83.23324) 55 | val rangeQueryWindow6 = BoundingBox(-180.0, -90.0, 180.0, 90.0) 56 | 57 | val rawPoints = spark.textFile("/data/points_200M.csv").map { line => 58 | val parts = line.split(",") 59 | val longitude = parts(0).toDouble 60 | val latitude = parts(1).toDouble 61 | (UUID.randomUUID().toString(), Point(longitude, latitude)) 62 | }.repartition(1024).toDF("id", "point").cache() 63 | 64 | val rawCount = rawPoints.count() 65 | 66 | // Dry run 67 | var t0 = System.nanoTime() 68 | for (i <- 1 to 20) { 69 | count = rawPoints.where($"point" withinRange rangeQueryWindow1).count() 70 | } 71 | var t1 = System.nanoTime() 72 | 73 | // Main measurements 74 | t0 = System.nanoTime() 75 | for (i <- 1 to nQueries) { 76 | count = rawPoints.where($"point" withinRange rangeQueryWindow1).count() 77 | } 78 | t1 = System.nanoTime() 79 | println("Count: " + count) 80 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 81 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 82 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 83 | 84 | t0 = System.nanoTime() 85 | for (i <- 1 to nQueries) { 86 | count = rawPoints.where($"point" withinRange rangeQueryWindow2).count() 87 | } 88 | t1 = System.nanoTime() 89 | println("Count: " + count) 90 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 91 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 92 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 93 | 94 | t0 = System.nanoTime() 95 | for (i <- 1 to nQueries) { 96 | count = rawPoints.where($"point" withinRange rangeQueryWindow3).count() 97 | } 98 | t1 = System.nanoTime() 99 | println("Count: " + count) 100 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 101 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 102 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 103 | 104 | t0 = System.nanoTime() 105 | for (i <- 1 to nQueries) { 106 | count = rawPoints.where($"point" withinRange rangeQueryWindow4).count() 107 | } 108 | t1 = System.nanoTime() 109 | println("Count: " + count) 110 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 111 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 112 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 113 | 114 | 115 | t0 = System.nanoTime() 116 | for (i <- 1 to nQueries) { 117 | count = rawPoints.where($"point" withinRange rangeQueryWindow5).count() 118 | } 119 | t1 = System.nanoTime() 120 | println("Count: " + count) 121 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 122 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 123 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 124 | 125 | t0 = System.nanoTime() 126 | for (i <- 1 to nQueries) { 127 | count = 
rawPoints.where($"point" withinRange rangeQueryWindow6).count() 128 | } 129 | t1 = System.nanoTime() 130 | println("Count: " + count) 131 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 132 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 133 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 134 | 135 | rawPoints.unpersist() 136 | 137 | println("***********************************************************************************") 138 | println("") 139 | } 140 | 141 | def spatialRangePolygon() { 142 | 143 | println("************************ POLYGON Range Queries **************************************") 144 | val nQueries = 100 145 | var count = 0L 146 | val rangeQueryWindow1 = BoundingBox(-20.204, -53.209588996, 17.9526465797, -30.1096863746) 147 | val rangeQueryWindow2 = BoundingBox(-20.204, -53.209588996, 20.4376465797, -30.1096863746) 148 | val rangeQueryWindow3 = BoundingBox(-74.4270741441, -34.609588996, 72.9526465797, -6.5906863746) 149 | val rangeQueryWindow4 = BoundingBox(-104.0938020000, -54.509588996, 118.9526465797, 40.2406863746) 150 | val rangeQueryWindow5 = BoundingBox(-174.4270741441, -34.609588996, 72.9526465797, 48.4396863746) 151 | val rangeQueryWindow6 = BoundingBox(-180.0, -90.0, 180.0, 90.0) 152 | 153 | val readBuildings = spark.textFile("/data/buildings_114M.csv", 1024) 154 | val rawBuildings = readBuildings.toDF("text").withColumn("polygon", wkt($"text")("polygon")).cache() 155 | 156 | val rawCount = rawBuildings.count() 157 | var t0 = System.nanoTime() 158 | for (i <- 1 to nQueries) { 159 | count = rawBuildings.where($"polygon" withinRange rangeQueryWindow1).count() 160 | } 161 | var t1 = System.nanoTime() 162 | 163 | t0 = System.nanoTime() 164 | for (i <- 1 to nQueries) { 165 | count = rawBuildings.where($"polygon" withinRange rangeQueryWindow1).count() 166 | } 167 | t1 = System.nanoTime() 168 | println("Count: " + count) 169 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 170 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 171 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 172 | 173 | 174 | t0 = System.nanoTime() 175 | for (i <- 1 to nQueries) { 176 | count = rawBuildings.where($"polygon" withinRange rangeQueryWindow2).count() 177 | } 178 | t1 = System.nanoTime() 179 | println("Count: " + count) 180 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 181 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 182 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 183 | 184 | t0 = System.nanoTime() 185 | for (i <- 1 to nQueries) { 186 | count = rawBuildings.where($"polygon" withinRange rangeQueryWindow3).count() 187 | } 188 | t1 = System.nanoTime() 189 | println("Count: " + count) 190 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 191 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 192 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 193 | 194 | t0 = System.nanoTime() 195 | for (i <- 1 to nQueries) { 196 | count = rawBuildings.where($"polygon" withinRange rangeQueryWindow4).count() 197 | } 198 | t1 = System.nanoTime() 199 | println("Count: " + count) 200 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 201 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 202 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 203 | 204 | t0 = System.nanoTime() 205 | for (i <- 1 to nQueries) { 206 | count = 
rawBuildings.where($"polygon" withinRange rangeQueryWindow5).count() 207 | } 208 | t1 = System.nanoTime() 209 | println("Count: " + count) 210 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 211 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 212 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 213 | 214 | t0 = System.nanoTime() 215 | for (i <- 1 to nQueries) { 216 | count = rawBuildings.where($"polygon" withinRange rangeQueryWindow6).count() 217 | } 218 | t1 = System.nanoTime() 219 | println("Count: " + count) 220 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 221 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 222 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 223 | 224 | rawBuildings.unpersist() 225 | 226 | println("***********************************************************************************") 227 | println("") 228 | } 229 | 230 | def spatialRangeLineString() { 231 | 232 | println("************************ LineString Range Queries **************************************") 233 | val nQueries = 100 234 | var count = 0L 235 | val rangeQueryWindow1 = BoundingBox(-50.204, -53.209588996, -24.9526465797, -30.1096863746) 236 | val rangeQueryWindow2 = BoundingBox(-52.1270741441, -53.209588996, -24.9526465797, -30.1096863746) 237 | val rangeQueryWindow3 = BoundingBox(-94.4270741441, -34.609588996, 22.9526465797, -27.0106863746) 238 | val rangeQueryWindow4 = BoundingBox(-74.0938020000, -54.509588996, 42.9526465797, 38.0106863746) 239 | val rangeQueryWindow5 = BoundingBox(-150.99778, -52.6480987209, 7.2705630159, 83.23324) 240 | val rangeQueryWindow6 = BoundingBox(-180.0, -90.0, 180.0, 90.0) 241 | 242 | val readRoads = spark.textFile("/data/linestrings_72M.csv", 1024) 243 | val rawRoads = readRoads.toDF("text").withColumn("polyline", wkt($"text")("polyline")).cache() 244 | 245 | val rawCount = rawRoads.count() 246 | var t0 = System.nanoTime() 247 | for (i <- 1 to nQueries) { 248 | count = rawRoads.where($"polyline" withinRange rangeQueryWindow1).count() 249 | } 250 | var t1 = System.nanoTime() 251 | 252 | t0 = System.nanoTime() 253 | for (i <- 1 to nQueries) { 254 | count = rawRoads.where($"polyline" withinRange rangeQueryWindow1).count() 255 | } 256 | t1 = System.nanoTime() 257 | println("Count: " + count) 258 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 259 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 260 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 261 | 262 | 263 | t0 = System.nanoTime() 264 | for (i <- 1 to nQueries) { 265 | count = rawRoads.where($"polyline" withinRange rangeQueryWindow2).count() 266 | } 267 | t1 = System.nanoTime() 268 | println("Count: " + count) 269 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 270 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 271 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 272 | 273 | t0 = System.nanoTime() 274 | for (i <- 1 to nQueries) { 275 | count = rawRoads.where($"polyline" withinRange rangeQueryWindow3).count() 276 | } 277 | t1 = System.nanoTime() 278 | println("Count: " + count) 279 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 280 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 281 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 282 | 283 | t0 = System.nanoTime() 284 | for (i <- 1 to nQueries) { 285 | count = 
rawRoads.where($"polyline" withinRange rangeQueryWindow4).count() 286 | } 287 | t1 = System.nanoTime() 288 | println("Count: " + count) 289 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 290 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 291 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 292 | 293 | t0 = System.nanoTime() 294 | for (i <- 1 to nQueries) { 295 | count = rawRoads.where($"polyline" withinRange rangeQueryWindow5).count() 296 | } 297 | t1 = System.nanoTime() 298 | println("Count: " + count) 299 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 300 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 301 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 302 | 303 | t0 = System.nanoTime() 304 | for (i <- 1 to nQueries) { 305 | count = rawRoads.where($"polyline" withinRange rangeQueryWindow6).count() 306 | } 307 | t1 = System.nanoTime() 308 | println("Count: " + count) 309 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 310 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 311 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 312 | 313 | rawRoads.unpersist() 314 | 315 | println("***********************************************************************************") 316 | println("") 317 | } 318 | 319 | def spatialRangeRectangle() { 320 | 321 | println("************************ Rectangle Range Queries **************************************") 322 | val nQueries = 100 323 | var count = 0L 324 | val rangeQueryWindow1 = BoundingBox(-20.204, -53.209588996, 17.9526465797, -30.1096863746) 325 | val rangeQueryWindow2 = BoundingBox(-20.204, -53.209588996, 20.4376465797, -30.1096863746) 326 | val rangeQueryWindow3 = BoundingBox(-74.4270741441, -34.609588996, 72.9526465797, -6.5906863746) 327 | val rangeQueryWindow4 = BoundingBox(-104.0938020000, -54.509588996, 118.9526465797, 40.2406863746) 328 | val rangeQueryWindow5 = BoundingBox(-174.4270741441, -34.609588996, 72.9526465797, 48.4396863746) 329 | val rangeQueryWindow6 = BoundingBox(-180.0, -90.0, 180.0, 90.0) 330 | 331 | val readRectangles = spark.textFile("/data/rectangles_114M.csv", 1024) 332 | val rawBuildings = readRectangles.toDF("text").withColumn("polygon", wkt($"text")("polygon")).cache() 333 | 334 | val rawCount = rawBuildings.count() 335 | var t0 = System.nanoTime() 336 | for (i <- 1 to nQueries) { 337 | count = rawBuildings.where($"polygon" withinRange rangeQueryWindow1).count() 338 | } 339 | var t1 = System.nanoTime() 340 | 341 | t0 = System.nanoTime() 342 | for (i <- 1 to nQueries) { 343 | count = rawBuildings.where($"polygon" withinRange rangeQueryWindow1).count() 344 | } 345 | t1 = System.nanoTime() 346 | println("Count: " + count) 347 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 348 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 349 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 350 | 351 | 352 | t0 = System.nanoTime() 353 | for (i <- 1 to nQueries) { 354 | count = rawBuildings.where($"polygon" withinRange rangeQueryWindow2).count() 355 | } 356 | t1 = System.nanoTime() 357 | println("Count: " + count) 358 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 359 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 360 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 361 | 362 | t0 = System.nanoTime() 363 | for (i <- 1 to nQueries) { 364 | count 
= rawBuildings.where($"polygon" withinRange rangeQueryWindow3).count() 365 | } 366 | t1 = System.nanoTime() 367 | println("Count: " + count) 368 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 369 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 370 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 371 | 372 | t0 = System.nanoTime() 373 | for (i <- 1 to nQueries) { 374 | count = rawBuildings.where($"polygon" withinRange rangeQueryWindow4).count() 375 | } 376 | t1 = System.nanoTime() 377 | println("Count: " + count) 378 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 379 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 380 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 381 | 382 | t0 = System.nanoTime() 383 | for (i <- 1 to nQueries) { 384 | count = rawBuildings.where($"polygon" withinRange rangeQueryWindow5).count() 385 | } 386 | t1 = System.nanoTime() 387 | println("Count: " + count) 388 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 389 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 390 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 391 | 392 | t0 = System.nanoTime() 393 | for (i <- 1 to nQueries) { 394 | count = rawBuildings.where($"polygon" withinRange rangeQueryWindow6).count() 395 | } 396 | t1 = System.nanoTime() 397 | println("Count: " + count) 398 | println("Selection Ratio: " + ((count * 100.0) / rawCount) + " %") 399 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 400 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / 1E9) + " queries/min") 401 | 402 | rawBuildings.unpersist() 403 | 404 | println("***********************************************************************************") 405 | println("") 406 | } 407 | } 408 | } -------------------------------------------------------------------------------- /magellan/SpatialJoins.scala: -------------------------------------------------------------------------------- 1 | package magellan.measurements 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import org.apache.spark.sql.SQLContext 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.util.SizeEstimator 7 | import org.apache.spark.rdd.RDD 8 | import magellan._ 9 | import org.apache.spark.sql.magellan.dsl.expressions._ 10 | import org.apache.hadoop.mapreduce.TaskInputOutputContext 11 | import org.apache.spark.sql.Row 12 | import org.apache.spark.sql.functions._ 13 | import org.apache.spark.sql.types._ 14 | import java.util.UUID 15 | import org.apache.hadoop.mapreduce.TaskInputOutputContext 16 | import magellan.index._ 17 | import fastparse.all._ 18 | import fastparse.core.Parsed.{Failure, Success} 19 | import scala.collection.mutable.ListBuffer 20 | import magellan.{BoundingBox, Point, Polygon, PolygonDeserializer} 21 | 22 | object SpatialJoins { 23 | 24 | def main(args: Array[String]) { 25 | 26 | val conf = new SparkConf().setAppName("Magellan Spatial Joins") 27 | val spark = new SparkContext(conf) 28 | val sqlContext = new org.apache.spark.sql.SQLContext(spark) 29 | val sparkSession = SparkSession.builder().appName("Magellan Spatial Joins").getOrCreate() 30 | import sqlContext.implicits._ 31 | 32 | def time[R](block: => R): R = { 33 | val t0 = System.nanoTime() 34 | val result = block // call-by-name 35 | val t1 = System.nanoTime() 36 | println("Join time: " + (t1 - t0) / 1E9 + " sec ") 37 | result 38 | } 39 | 40 | pointPoint() 41 | pointLineString() 42 | 
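// Note (added): each join below follows the same shape: index both inputs on a Z-order
// curve at the given precision, equi-join on the curve value, and, where Magellan
// implements the exact predicate, refine candidates with intersects (see per-join comments).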
pointRectangle() 43 | pointPolygon() 44 | linestringLineString() 45 | linestringRectangle() 46 | linestringPolygon() 47 | rectangleRectangle() 48 | polygonRectangle() 49 | polygonPolygon() 50 | 51 | spark.stop() 52 | 53 | def pointLineString() { 54 | 55 | println("*************************** Point-LineString ****************************************") 56 | 57 | val precision = 30 58 | var count = 0L 59 | var t0 = 0L 60 | var t1 = 0L 61 | 62 | val beginTime = System.currentTimeMillis() 63 | magellan.Utils.injectRules(sparkSession) 64 | 65 | val rawPoints = spark.textFile("/data/points_200M.csv").map { line => 66 | val parts = line.split(",") 67 | val longitude = parts(0).toDouble 68 | val latitude = parts(1).toDouble 69 | (UUID.randomUUID().toString(), Point(longitude, latitude)) 70 | }.repartition(1024).toDF("id", "point") 71 | val roads = rawPoints.withColumn("index", $"point" index precision).select($"point", $"index") 72 | val indexedRoads = roads.withColumn("index", explode($"index")).select("point", "index.curve", "index.relation").cache() 73 | val count1 = indexedRoads.count() 74 | 75 | val readBuildings = spark.textFile("/data/linestrings_72M.csv", 1024) 76 | val rawBuildings = readBuildings.toDF("text").withColumn("polyline", wkt($"text")("polyline")) 77 | val buildings = rawBuildings.withColumn("index", $"polyline" index precision).select($"polyline", $"index") 78 | val indexedBuildings = buildings.withColumn("index", explode($"index")).select("polyline", "index.curve", "index.relation").cache() 79 | val count2 = indexedBuildings.count() 80 | 81 | val runtime = System.currentTimeMillis() - beginTime 82 | 83 | println("Indexing Time: " + (runtime) / 1E3 + " sec ") 84 | 85 | t0 = System.nanoTime() 86 | // This is only a filter based on Z-curve value 87 | count = indexedBuildings.join(indexedRoads, indexedRoads("curve") === indexedBuildings("curve")).where((indexedBuildings("relation") === "Intersects")).count() 88 | t1 = System.nanoTime() 89 | val time0 = ((t1 - t0) / 1E9) 90 | println("Join Time: " + time0 + " sec") 91 | 92 | val total_time = (runtime / 1E3) + time0 93 | 94 | println("Total Join Time: " + total_time + " sec") 95 | println("") 96 | 97 | indexedRoads.unpersist() 98 | indexedBuildings.unpersist() 99 | } 100 | 101 | def pointPoint() { 102 | 103 | println("*************************** Point-Point ****************************************") 104 | 105 | val precision = 30 106 | var count = 0L 107 | var t0 = 0L 108 | var t1 = 0L 109 | 110 | val beginTime = System.currentTimeMillis() 111 | magellan.Utils.injectRules(sparkSession) 112 | 113 | val rawPoints = spark.textFile("/data/points_200M.csv").map { line => 114 | val parts = line.split(",") 115 | val longitude = parts(0).toDouble 116 | val latitude = parts(1).toDouble 117 | (UUID.randomUUID().toString(), Point(longitude, latitude)) 118 | }.repartition(1024).toDF("id", "point") 119 | val roads = rawPoints.withColumn("index", $"point" index precision).select($"point", $"index") 120 | val indexedRoads = roads.withColumn("index", explode($"index")).select("point", "index.curve", "index.relation").cache() 121 | val count1 = indexedRoads.count() 122 | 123 | val rawBuildings = spark.textFile("/data/points_200M.csv").map { line => 124 | val parts = line.split(",") 125 | val longitude = parts(0).toDouble 126 | val latitude = parts(1).toDouble 127 | (UUID.randomUUID().toString(), Point(longitude, latitude)) 128 | }.repartition(1024).toDF("id", "point") 129 | val buildings = rawBuildings.withColumn("index", $"point" index 
precision).select($"point", $"index") 130 | val indexedBuildings = buildings.withColumn("index", explode($"index")).select("point", "index.curve", "index.relation").cache() 131 | val count2 = indexedBuildings.count() 132 | 133 | val runtime = System.currentTimeMillis() - beginTime 134 | 135 | println("Indexing Time: " + (runtime) / 1E3 + " sec ") 136 | 137 | t0 = System.nanoTime() 138 | // indexedRoads("point") intersects indexedBuildings("point") -> this enforces exact check 139 | count = indexedRoads.join(indexedBuildings, indexedBuildings("curve") === indexedRoads("curve")).where((indexedRoads("relation") === "Intersects") or (indexedRoads("point") intersects indexedBuildings("point"))).count() 140 | t1 = System.nanoTime() 141 | val time0 = ((t1 - t0) / 1E9) 142 | println("Join Time: " + time0 + " sec") 143 | 144 | val total_time = (runtime / 1E3) + time0 145 | 146 | println("Total Join Time: " + total_time + " sec") 147 | println("") 148 | 149 | indexedRoads.unpersist() 150 | indexedBuildings.unpersist() 151 | } 152 | 153 | def pointPolygon() { 154 | 155 | println("*************************** Point-Polygon ****************************************") 156 | 157 | val precision = 30 158 | var count = 0L 159 | var t0 = 0L 160 | var t1 = 0L 161 | 162 | val beginTime = System.currentTimeMillis() 163 | magellan.Utils.injectRules(sparkSession) 164 | 165 | val rawPoints = spark.textFile("/data/points_200M.csv").map { line => 166 | val parts = line.split(",") 167 | val longitude = parts(0).toDouble 168 | val latitude = parts(1).toDouble 169 | (UUID.randomUUID().toString(), Point(longitude, latitude)) 170 | }.repartition(1024).toDF("id", "point") 171 | val roads = rawPoints.withColumn("index", $"point" index precision).select($"point", $"index") 172 | val indexedRoads = roads.withColumn("index", explode($"index")).select("point", "index.curve", "index.relation").cache() 173 | val count1 = indexedRoads.count() 174 | 175 | 176 | val readBuildings = spark.textFile("/data/buildings_114M.csv", 1024) 177 | val rawBuildings = readBuildings.toDF("text").withColumn("polygon", wkt($"text")("polygon")) 178 | val buildings = rawBuildings.withColumn("index", $"polygon" index precision).select($"polygon", $"index") 179 | val indexedBuildings = buildings.withColumn("index", explode($"index")).select("polygon", "index.curve", "index.relation").cache() 180 | val count2 = indexedBuildings.count() 181 | 182 | val runtime = System.currentTimeMillis() - beginTime 183 | 184 | println("Indexing Time: " + (runtime) / 1E3 + " sec ") 185 | 186 | t0 = System.nanoTime() 187 | // $"point" intersects $"polygon" enforces exact check 188 | count = indexedRoads.join(indexedBuildings, indexedBuildings("curve") === indexedRoads("curve")).where((indexedRoads("relation") === "Intersects") or ($"point" intersects $"polygon")).count() 189 | t1 = System.nanoTime() 190 | val time0 = ((t1 - t0) / 1E9) 191 | println("Join Time: " + time0 + " sec") 192 | 193 | val total_time = (runtime / 1E3) + time0 194 | 195 | println("Total Join Time: " + total_time + " sec") 196 | 197 | println("") 198 | 199 | indexedRoads.unpersist() 200 | indexedBuildings.unpersist() 201 | } 202 | 203 | def pointRectangle() { 204 | 205 | println("*************************** Point-Rectangle ****************************************") 206 | 207 | val precision = 30 208 | var count = 0L 209 | var t0 = 0L 210 | var t1 = 0L 211 | 212 | val beginTime = System.currentTimeMillis() 213 | magellan.Utils.injectRules(sparkSession) 214 | 215 | val rawPoints = 
spark.textFile("/data/points_200M.csv").map { line => 216 | val parts = line.split(",") 217 | val longitude = parts(0).toDouble 218 | val latitude = parts(1).toDouble 219 | (UUID.randomUUID().toString(), Point(longitude, latitude)) 220 | }.repartition(1024).toDF("id", "point") 221 | val roads = rawPoints.withColumn("index", $"point" index precision).select($"point", $"index") 222 | val indexedRoads = roads.withColumn("index", explode($"index")).select("point", "index.curve", "index.relation").cache() 223 | val count1 = indexedRoads.count() 224 | 225 | val readBuildings = spark.textFile("/data/rectangles_114M.csv", 1024) 226 | val rawBuildings = readBuildings.toDF("text").withColumn("polygon", wkt($"text")("polygon")) 227 | val buildings = rawBuildings.withColumn("index", $"polygon" index precision).select($"polygon", $"index") 228 | val indexedBuildings = buildings.withColumn("index", explode($"index")).select("polygon", "index.curve", "index.relation").cache() 229 | val count2 = indexedBuildings.count() 230 | 231 | val runtime = System.currentTimeMillis() - beginTime 232 | 233 | println("Indexing Time: " + (runtime) / 1E3 + " sec ") 234 | 235 | t0 = System.nanoTime() 236 | count = indexedRoads.join(indexedBuildings, indexedBuildings("curve") === indexedRoads("curve")).where((indexedRoads("relation") === "Intersects") or ($"point" intersects $"polygon")).count() 237 | t1 = System.nanoTime() 238 | val time0 = ((t1 - t0) / 1E9) 239 | println("Join Time: " + time0 + " sec") 240 | 241 | val total_time = (runtime / 1E3) + time0 242 | 243 | println("Total Join Time: " + total_time + " sec") 244 | 245 | println("") 246 | 247 | indexedRoads.unpersist() 248 | indexedBuildings.unpersist() 249 | } 250 | 251 | def linestringLineString() { 252 | 253 | println("*************************** LineString-LineString ****************************************") 254 | 255 | val precision = 30 256 | var count = 0L 257 | var t0 = 0L 258 | var t1 = 0L 259 | 260 | val beginTime = System.currentTimeMillis() 261 | magellan.Utils.injectRules(sparkSession) 262 | 263 | val readRoads = spark.textFile("/data/linestrings_72M.csv", 1024) 264 | val rawRoads = readRoads.toDF("text").withColumn("polyline", wkt($"text")("polyline")) 265 | val roads = rawRoads.withColumn("index", $"polyline" index precision).select($"polyline", $"index") 266 | val indexedRoads = roads.withColumn("index", explode($"index")).select("polyline", "index.curve", "index.relation").cache() 267 | val count1 = indexedRoads.count() 268 | 269 | val readBuildings = spark.textFile("/data/linestrings_72M.csv", 1024) 270 | val rawBuildings = readBuildings.toDF("text").withColumn("polyline", wkt($"text")("polyline")) 271 | val buildings = rawBuildings.withColumn("index", $"polyline" index precision).select($"polyline", $"index") 272 | val indexedBuildings = buildings.withColumn("index", explode($"index")).select("polyline", "index.curve", "index.relation").cache() 273 | val count2 = indexedBuildings.count() 274 | 275 | val runtime = System.currentTimeMillis() - beginTime 276 | 277 | println("Indexing Time: " + (runtime) / 1E3 + " sec ") 278 | 279 | t0 = System.nanoTime() 280 | // This is only a filter based on Z-curve value, also intersects predicate has not been implemented for LineString-LineString yet 281 | count = indexedBuildings.join(indexedRoads, indexedRoads("curve") === indexedBuildings("curve")).where((indexedBuildings("relation") === "Contains")).count() 282 | t1 = System.nanoTime() 283 | val time0 = ((t1 - t0) / 1E9) 284 | println("Join Time: " + time0 
+ " sec") 285 | 286 | val total_time = (runtime / 1E3) + time0 287 | 288 | println("Total Join Time: " + total_time + " sec") 289 | 290 | println("") 291 | 292 | indexedRoads.unpersist() 293 | indexedBuildings.unpersist() 294 | } 295 | 296 | def linestringPolygon() { 297 | 298 | println("*************************** LineString-Polygon ****************************************") 299 | 300 | val precision = 30 301 | var count = 0L 302 | var t0 = 0L 303 | var t1 = 0L 304 | 305 | val beginTime = System.currentTimeMillis() 306 | magellan.Utils.injectRules(sparkSession) 307 | 308 | val readRoads = spark.textFile("/data/linestrings_72M.csv", 1024) 309 | val rawRoads = readRoads.toDF("text").withColumn("polyline", wkt($"text")("polyline")) 310 | val roads = rawRoads.withColumn("index", $"polyline" index precision).select($"polyline", $"index") 311 | val indexedRoads = roads.withColumn("index", explode($"index")).select("polyline", "index.curve", "index.relation").cache() 312 | val count1 = indexedRoads.count() 313 | 314 | val readBuildings = spark.textFile("/data/buildings_114M.csv", 1024) 315 | val rawBuildings = readBuildings.toDF("text").withColumn("polygon", wkt($"text")("polygon")) 316 | val buildings = rawBuildings.withColumn("index", $"polygon" index precision).select($"polygon", $"index") 317 | val indexedBuildings = buildings.withColumn("index", explode($"index")).select("polygon", "index.curve", "index.relation").cache() 318 | val count2 = indexedBuildings.count() 319 | 320 | val runtime = System.currentTimeMillis() - beginTime 321 | 322 | println("Indexing Time: " + (runtime) / 1E3 + " sec ") 323 | 324 | t0 = System.nanoTime() 325 | count = indexedRoads.join(indexedBuildings, indexedBuildings("curve") === indexedRoads("curve")).where((indexedRoads("relation") === "Intersects") or ($"polyline" intersects $"polygon")).count() 326 | t1 = System.nanoTime() 327 | val time0 = ((t1 - t0) / 1E9) 328 | println("Join Time: " + time0 + " sec") 329 | 330 | val total_time = (runtime / 1E3) + time0 331 | 332 | println("Total Join Time: " + total_time + " sec") 333 | 334 | println("") 335 | 336 | indexedRoads.unpersist() 337 | indexedBuildings.unpersist() 338 | } 339 | 340 | def linestringRectangle() { 341 | 342 | println("*************************** LineString-Rectangle ****************************************") 343 | 344 | val precision = 30 345 | var count = 0L 346 | var t0 = 0L 347 | var t1 = 0L 348 | 349 | val beginTime = System.currentTimeMillis() 350 | magellan.Utils.injectRules(sparkSession) 351 | 352 | val readRoads = spark.textFile("/data/linestrings_72M.csv", 1024) 353 | val rawRoads = readRoads.toDF("text").withColumn("polyline", wkt($"text")("polyline")) 354 | val roads = rawRoads.withColumn("index", $"polyline" index precision).select($"polyline", $"index") 355 | val indexedRoads = roads.withColumn("index", explode($"index")).select("polyline", "index.curve", "index.relation").cache() 356 | val count1 = indexedRoads.count() 357 | 358 | val readBuildings = spark.textFile("/data/rectangles_114M.csv", 1024) 359 | val rawBuildings = readBuildings.toDF("text").withColumn("polygon", wkt($"text")("polygon")) 360 | val buildings = rawBuildings.withColumn("index", $"polygon" index precision).select($"polygon", $"index") 361 | val indexedBuildings = buildings.withColumn("index", explode($"index")).select("polygon", "index.curve", "index.relation").cache() 362 | val count2 = indexedBuildings.count() 363 | 364 | val runtime = System.currentTimeMillis() - beginTime 365 | 366 | println("Indexing Time: " 
+ (runtime) / 1E3 + " sec ") 367 | 368 | t0 = System.nanoTime() 369 | count = indexedRoads.join(indexedBuildings, indexedBuildings("curve") === indexedRoads("curve")).where((indexedRoads("relation") === "Intersects") or ($"polyline" intersects $"polygon")).count() 370 | t1 = System.nanoTime() 371 | val time0 = ((t1 - t0) / 1E9) 372 | println("Join Time: " + time0 + " sec") 373 | 374 | val total_time = (runtime / 1E3) + time0 375 | 376 | println("Total Join Time: " + total_time + " sec") 377 | 378 | println("") 379 | 380 | indexedRoads.unpersist() 381 | indexedBuildings.unpersist() 382 | } 383 | 384 | def polygonPolygon() { 385 | 386 | println("*************************** Polygon-Polygon ****************************************") 387 | 388 | val precision = 30 389 | var count = 0L 390 | var t0 = 0L 391 | var t1 = 0L 392 | 393 | val beginTime = System.currentTimeMillis() 394 | magellan.Utils.injectRules(sparkSession) 395 | 396 | val readRoads = spark.textFile("/data/buildings_114M.csv", 1024) 397 | val rawRoads = readRoads.toDF("text").withColumn("polygon", wkt($"text")("polygon")) 398 | val roads = rawRoads.withColumn("index", $"polygon" index precision).select($"polygon", $"index") 399 | val indexedRoads = roads.withColumn("index", explode($"index")).select("polygon", "index.curve", "index.relation").cache() 400 | val count1 = indexedRoads.count() 401 | 402 | val readBuildings = spark.textFile("/data/buildings_114M.csv", 1024) 403 | val rawBuildings = readBuildings.toDF("text").withColumn("polygon", wkt($"text")("polygon")) 404 | val buildings = rawBuildings.withColumn("index", $"polygon" index precision).select($"polygon", $"index") 405 | val indexedBuildings = buildings.withColumn("index", explode($"index")).select("polygon", "index.curve", "index.relation").cache() 406 | val count2 = indexedBuildings.count() 407 | 408 | val runtime = System.currentTimeMillis() - beginTime 409 | 410 | println("Indexing Time: " + (runtime) / 1E3 + " sec ") 411 | 412 | t0 = System.nanoTime() 413 | count = indexedRoads.join(indexedBuildings, indexedBuildings("curve") === indexedRoads("curve")).where((indexedRoads("relation") === "Intersects") or (indexedRoads("polygon") intersects indexedBuildings("polygon"))).count() 414 | t1 = System.nanoTime() 415 | val time0 = ((t1 - t0) / 1E9) 416 | println("Join Time: " + time0 + " sec") 417 | 418 | val total_time = (runtime / 1E3) + time0 419 | 420 | println("Total Join Time: " + total_time + " sec") 421 | 422 | println("") 423 | 424 | indexedRoads.unpersist() 425 | indexedBuildings.unpersist() 426 | } 427 | 428 | def polygonRectangle() { 429 | 430 | println("*************************** Rectangle-Polygon ****************************************") 431 | 432 | val precision = 30 433 | var count = 0L 434 | var t0 = 0L 435 | var t1 = 0L 436 | 437 | val beginTime = System.currentTimeMillis() 438 | magellan.Utils.injectRules(sparkSession) 439 | 440 | val readRoads = spark.textFile("/data/rectangles_114M.csv", 1024) 441 | val rawRoads = readRoads.toDF("text").withColumn("polygon", wkt($"text")("polygon")) 442 | val roads = rawRoads.withColumn("index", $"polygon" index precision).select($"polygon", $"index") 443 | val indexedRoads = roads.withColumn("index", explode($"index")).select("polygon", "index.curve", "index.relation").cache() 444 | val count1 = indexedRoads.count() 445 | 446 | val readBuildings = spark.textFile("/data/buildings_114M.csv", 1024) 447 | val rawBuildings = readBuildings.toDF("text").withColumn("polygon", wkt($"text")("polygon")) 448 | val buildings = 
rawBuildings.withColumn("index", $"polygon" index precision).select($"polygon", $"index") 449 | val indexedBuildings = buildings.withColumn("index", explode($"index")).select("polygon", "index.curve", "index.relation").cache() 450 | val count2 = indexedBuildings.count() 451 | 452 | val runtime = System.currentTimeMillis() - beginTime 453 | 454 | println("Indexing Time: " + (runtime) / 1E3 + " sec ") 455 | 456 | t0 = System.nanoTime() 457 | count = indexedRoads.join(indexedBuildings, indexedBuildings("curve") === indexedRoads("curve")).where((indexedRoads("relation") === "Intersects") or (indexedRoads("polygon") intersects indexedBuildings("polygon"))).count() 458 | t1 = System.nanoTime() 459 | val time0 = ((t1 - t0) / 1E9) 460 | println("Join Time: " + time0 + " sec") 461 | 462 | val total_time = (runtime / 1E3) + time0 463 | 464 | println("Total Join Time: " + total_time + " sec") 465 | 466 | println("") 467 | 468 | indexedRoads.unpersist() 469 | indexedBuildings.unpersist() 470 | } 471 | 472 | def rectangleRectangle() { 473 | 474 | println("*************************** Rectangle-Rectangle ****************************************") 475 | 476 | val precision = 30 477 | var count = 0L 478 | var t0 = 0L 479 | var t1 = 0L 480 | 481 | val beginTime = System.currentTimeMillis() 482 | magellan.Utils.injectRules(sparkSession) 483 | 484 | val readRoads = spark.textFile("/data/rectangles_114M.csv", 1024) 485 | val rawRoads = readRoads.toDF("text").withColumn("polygon", wkt($"text")("polygon")) 486 | val roads = rawRoads.withColumn("index", $"polygon" index precision).select($"polygon", $"index") 487 | val indexedRoads = roads.withColumn("index", explode($"index")).select("polygon", "index.curve", "index.relation").cache() 488 | val count1 = indexedRoads.count() 489 | 490 | val readBuildings = spark.textFile("/data/rectangles_114M.csv", 1024) 491 | val rawBuildings = readBuildings.toDF("text").withColumn("polygon", wkt($"text")("polygon")) 492 | val buildings = rawBuildings.withColumn("index", $"polygon" index precision).select($"polygon", $"index") 493 | val indexedBuildings = buildings.withColumn("index", explode($"index")).select("polygon", "index.curve", "index.relation").cache() 494 | val count2 = indexedBuildings.count() 495 | 496 | val runtime = System.currentTimeMillis() - beginTime 497 | 498 | println("Indexing Time: " + (runtime) / 1E3 + " sec ") 499 | 500 | t0 = System.nanoTime() 501 | count = indexedRoads.join(indexedBuildings, indexedBuildings("curve") === indexedRoads("curve")).where((indexedRoads("relation") === "Intersects") or (indexedRoads("polygon") intersects indexedBuildings("polygon"))).count() 502 | t1 = System.nanoTime() 503 | val time0 = ((t1 - t0) / 1E9) 504 | 505 | println("Join Time: " + time0 + " sec") 506 | 507 | val total_time = (runtime / 1E3) + time0 508 | 509 | println("Total Join Time: " + total_time + " sec") 510 | 511 | println("") 512 | 513 | indexedRoads.unpersist() 514 | indexedBuildings.unpersist() 515 | } 516 | } 517 | } 518 | -------------------------------------------------------------------------------- /simba/DistanceJoin.scala: -------------------------------------------------------------------------------- 1 | package simba.measurements 2 | 3 | import org.apache.spark.sql.simba.SimbaSession 4 | import org.apache.spark.sql.simba.{Dataset, SimbaSession} 5 | import org.apache.spark.sql.simba.{Dataset, SimbaSession} 6 | import org.apache.spark.sql.simba.index.{RTreeType, TreapType} 7 | 8 | import org.apache.spark.sql.simba.spatial.Point 9 | import 
org.apache.spark.sql.simba.spatial.Polygon 10 | 11 | object DistanceJoin { 12 | 13 | case class PointData(x: Double, y: Double) 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | val simbaSession = SimbaSession.builder().appName("Simba Distance Join").config("simba.join.partitions", "1024").config("simba.index.partitions", "1024").getOrCreate() 18 | 19 | distanceJoin(simbaSession) 20 | simbaSession.stop() 21 | } 22 | 23 | def distanceJoin(simba: SimbaSession) { 24 | 25 | import simba.implicits._ 26 | import simba.simbaImplicits._ 27 | 28 | var t0 = 0L 29 | var t1 = 0L 30 | t0 = System.nanoTime() 31 | var DS1 = simba.read.textFile("/data/points_200M.csv").map { line => 32 | val parts = line.split(",") 33 | val longitude = parts(0).toDouble 34 | val latitude = parts(1).toDouble 35 | PointData(longitude, latitude) 36 | }.repartition(1024).toDF() 37 | var count1 = DS1.count() 38 | t1 = System.nanoTime() 39 | var left_time = (t1 - t0) / 1E9 40 | println("Left Read time: " + left_time + " seconds") 41 | t0 = System.nanoTime() // reset the timer so the right-side read is measured on its own 42 | var DS2 = simba.read.textFile("/data/points_200M.csv").map { line => 43 | val parts = line.split(",") 44 | val longitude = parts(0).toDouble 45 | val latitude = parts(1).toDouble 46 | PointData(longitude, latitude) 47 | }.repartition(1024).toDF() 48 | count1 = DS2.count() 49 | t1 = System.nanoTime() 50 | var right_time = (t1 - t0) / 1E9 51 | println("Right Read time: " + right_time + " seconds") 52 | 53 | // Embed timers in DJSpark for partitioning time and indexing time 54 | t0 = System.nanoTime() 55 | DS1.distanceJoin(DS2, Array("x", "y"), Array("x", "y"), 0.000045027).show(10) // show(10) returns Unit, so no count is bound here 56 | t1 = System.nanoTime() 57 | val runTime = (t1 - t0) / 1E9 58 | println("Elapsed time: " + runTime + " seconds") 59 | println("Total Time: " + (left_time + right_time + runTime) + " seconds") 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /simba/RangeQueries.scala: -------------------------------------------------------------------------------- 1 | package simba.measurements 2 | 3 | import org.apache.spark.sql.simba.SimbaSession 4 | import org.apache.spark.sql.simba.{Dataset, SimbaSession} 5 | 6 | import org.apache.spark.sql.simba.index.{RTreeType, TreapType} 7 | 8 | import org.apache.spark.sql.simba.spatial.Point 9 | import org.apache.spark.sql.simba.spatial.Polygon 10 | 11 | object RangeQueries { 12 | 13 | case class PointData(x: Double, y: Double) 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | val simbaSession = SimbaSession.builder().appName("Simba Range Queries").config("simba.index.partitions", "1024").getOrCreate() 18 | 19 | range_queries(simbaSession) 20 | simbaSession.stop() 21 | } 22 | 23 | def range_queries(simba: SimbaSession) { 24 | 25 | import simba.implicits._ 26 | import simba.simbaImplicits._ 27 | 28 | var t0 = 0L 29 | var t1 = 0L 30 | val nQueries = 100 31 | var count1 = 0L 32 | 33 | t0 = System.nanoTime() 34 | var DS1 = simba.read.textFile("/data/points_200M.csv").map { line => 35 | val parts = line.split(",") 36 | val pickup_longitude = parts(0).toDouble 37 | val pickup_latitude = parts(1).toDouble 38 | PointData(pickup_longitude, pickup_latitude) 39 | }.repartition(1024).toDF() 40 | DS1.index(RTreeType, "RtreeForDS1", Array("x", "y")) 41 | var count = DS1.count() 42 | t1 = System.nanoTime() 43 | var left_time = (t1 - t0) / 1E9 44 | println("Read time: " + left_time + " seconds") 45 | println() 46 | 47 | // Dry run 48 | t0 = System.nanoTime() 49 |
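// Warm-up phase: the 20 untimed queries below exercise the JIT and the R-tree
// index before measurement. The pattern used throughout these benchmarks is 20
// warm-up runs followed by nQueries timed runs, with throughput reported as
// nQueries * 60 / elapsed-seconds, i.e. queries per minute.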
for (i <- 1 to 20) { 50 | count1 = DS1.range(Array("x", "y"), Array(-50.3010141441, -53.209588996), Array(-24.9526465797, -30.1096863746)).rdd.count() 51 | } 52 | t1 = System.nanoTime() 53 | 54 | // Actual Measurements 55 | println("Range1: ") 56 | t0 = System.nanoTime() 57 | for (i <- 1 to nQueries) { 58 | count1 = DS1.range(Array("x", "y"), Array(-50.3010141441, -53.209588996), Array(-24.9526465797, -30.1096863746)).rdd.count() 59 | } 60 | t1 = System.nanoTime() 61 | println("Count: " + count1) 62 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 63 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 64 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 65 | 66 | println("Range2: ") 67 | t0 = System.nanoTime() 68 | for (i <- 1 to nQueries) { 69 | count1 = DS1.range(Array("x", "y"), Array(-54.4270741441, -53.209588996), Array(-24.9526465797, -30.1096863746)).rdd.count() 70 | } 71 | t1 = System.nanoTime() 72 | println("Count: " + count1) 73 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 74 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 75 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 76 | 77 | println("Range3: ") 78 | t0 = System.nanoTime() 79 | for (i <- 1 to nQueries) { 80 | count1 = DS1.range(Array("x", "y"), Array(-114.4270741441, -54.509588996), Array(42.9526465797, -27.0106863746)).rdd.count() 81 | } 82 | t1 = System.nanoTime() 83 | println("Count: " + count1) 84 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 85 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 86 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 87 | 88 | println("Range4: ") 89 | t0 = System.nanoTime() 90 | for (i <- 1 to nQueries) { 91 | count1 = DS1.range(Array("x", "y"), Array(-82.7638020000, -54.509588996), Array(42.9526465797, 38.0106863746)).rdd.count() 92 | } 93 | t1 = System.nanoTime() 94 | println("Count: " + count1) 95 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 96 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 97 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 98 | 99 | println("Range5: ") 100 | t0 = System.nanoTime() 101 | for (i <- 1 to nQueries) { 102 | count1 = DS1.range(Array("x", "y"), Array(-140.99778, -52.6480987209), Array(5.7305630159, 83.23324)).rdd.count() 103 | } 104 | t1 = System.nanoTime() 105 | println("Count: " + count1) 106 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 107 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 108 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 109 | 110 | println("Range6: ") 111 | t0 = System.nanoTime() 112 | for (i <- 1 to nQueries) { 113 | count1 = DS1.range(Array("x", "y"), Array(-180.0, -90.0), Array(180.0, 90.0)).rdd.count() 114 | } 115 | t1 = System.nanoTime() 116 | println("Count: " + count1) 117 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 118 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 119 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 120 | 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /simba/kNNJoin.scala: -------------------------------------------------------------------------------- 1 | package simba.measurements 2 | 3 | import org.apache.spark.sql.simba.SimbaSession 4 | import org.apache.spark.sql.simba.{Dataset, SimbaSession} 5 | import 
org.apache.spark.sql.simba.{Dataset, SimbaSession} 6 | import org.apache.spark.sql.simba.index.{RTreeType, TreapType} 7 | 8 | import org.apache.spark.sql.simba.spatial.Point 9 | import org.apache.spark.sql.simba.spatial.Polygon 10 | 11 | object kNNJoin { 12 | 13 | case class PointData(x: Double, y: Double) 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | val simbaSession = SimbaSession.builder().appName("Simba kNN Join").config("simba.join.partitions", "50").config("simba.index.partitions", "50").getOrCreate() 18 | knnJoin(simbaSession) 19 | simbaSession.stop() 20 | } 21 | 22 | def knnJoin(simba: SimbaSession) { 23 | 24 | import simba.implicits._ 25 | import simba.simbaImplicits._ 26 | var t0 = 0L 27 | var t1 = 0L 28 | t0 = System.nanoTime() 29 | var DS1 = simba.read.textFile("/data/points_10M.csv").map { line => 30 | val parts = line.split(",") 31 | val longitude = parts(0).toDouble 32 | val latitude = parts(1).toDouble 33 | PointData(longitude, latitude) 34 | }.repartition(1024).toDF() 35 | var count1 = DS1.count() 36 | t1 = System.nanoTime() 37 | var left_time = (t1 - t0) / 1E9 38 | println("Left Read time: " + left_time + " seconds") 39 | 40 | t0 = System.nanoTime() 41 | var DS2 = simba.read.textFile("/data/points_10M.csv").map { line => 42 | val parts = line.split(",") 43 | val longitude = parts(0).toDouble 44 | val latitude = parts(1).toDouble 45 | PointData(longitude, latitude) 46 | }.repartition(1024).toDF() 47 | count1 = DS2.count() 48 | t1 = System.nanoTime() 49 | var right_time = (t1 - t0) / 1E9 50 | println("Right Read time: " + right_time + " seconds") 51 | 52 | //DS2.index(RTreeType, "RtreeForDS2", Array("x", "y")) 53 | 54 | // Embed timers inside RKJSpark for partitioning time and indexing time 55 | t0 = System.nanoTime() 56 | val count = DS1.knnJoin(DS2, Array("x", "y"), Array("x", "y"), 5).count() 57 | t1 = System.nanoTime() 58 | val runTime = (t1 - t0) / 1E9 59 | println("Join time: " + runTime + " seconds") 60 | println("Total Time: " + (left_time + right_time + runTime) + " seconds") 61 | println("Count: " + count) 62 | } 63 | } -------------------------------------------------------------------------------- /simba/kNNQueries.scala: -------------------------------------------------------------------------------- 1 | package simba.measurements 2 | 3 | import org.apache.spark.sql.simba.SimbaSession 4 | import org.apache.spark.sql.simba.{Dataset, SimbaSession} 5 | import org.apache.spark.sql.simba.index.{RTreeType, TreapType} 6 | import org.apache.spark.sql.simba.spatial.Point 7 | import org.apache.spark.sql.simba.spatial.Polygon 8 | import scala.util.Random 9 | 10 | object kNNQueries { 11 | 12 | case class PointData(x: Double, y: Double) 13 | 14 | def main(args: Array[String]): Unit = { 15 | 16 | val simbaSession = SimbaSession.builder().appName("Simba kNN Queries").config("simba.index.partitions", "1024").getOrCreate() 17 | knn_queries(simbaSession) 18 | simbaSession.stop() 19 | } 20 | 21 | def knn_queries(simba: SimbaSession) { 22 | 23 | import simba.implicits._ 24 | import simba.simbaImplicits._ 25 | 26 | val nQueries = 100 27 | var t0 = 0L 28 | var t1 = 0L 29 | var count1 = 0L 30 | val random = scala.util.Random 31 | 32 | t0 = System.nanoTime() 33 | var DS1 = simba.read.textFile("/data/points_200M.csv").map { line => 34 | val parts = line.split(",") 35 | val longitude = parts(0).toDouble 36 | val latitude = parts(1).toDouble 37 | PointData(longitude, latitude) 38 | }.repartition(1024).toDF() 39 | DS1.index(RTreeType, "RtreeForDS1", Array("x", "y")) 40 | var 
count = DS1.count() 41 | t1 = System.nanoTime() 42 | var left_time = (t1 - t0) / 1E9 43 | println("Read time: " + (left_time) + " seconds") 44 | println() 45 | 46 | // Dry run 47 | println("k=5") 48 | t0 = System.nanoTime() 49 | for (i <- 1 to 20) { 50 | var lat = (random.nextDouble() * 2 - 1) * 90 51 | var long = (random.nextDouble() * 2 - 1) * 180 52 | count1 = DS1.knn(Array("x", "y"), Array(long, lat), 5).count() 53 | } 54 | t1 = System.nanoTime() 55 | 56 | // Actual Measurements 57 | println("k=5") 58 | t0 = System.nanoTime() 59 | for (i <- 1 to nQueries) { 60 | var lat = (random.nextDouble() * 2 - 1) * 90 61 | var long = (random.nextDouble() * 2 - 1) * 180 62 | count1 = DS1.knn(Array("x", "y"), Array(long, lat), 5).count() 63 | } 64 | t1 = System.nanoTime() 65 | println("Count: " + count1) 66 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 67 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 68 | 69 | println("k=10") 70 | t0 = System.nanoTime() 71 | for (i <- 1 to nQueries) { 72 | var lat = (random.nextDouble() * 2 - 1) * 90 73 | var long = (random.nextDouble() * 2 - 1) * 180 74 | count1 = DS1.knn(Array("x", "y"), Array(long, lat), 10).count() 75 | } 76 | t1 = System.nanoTime() 77 | println("Count: " + count1) 78 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 79 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 80 | 81 | println("k=20") 82 | t0 = System.nanoTime() 83 | for (i <- 1 to nQueries) { 84 | var lat = (random.nextDouble() * 2 - 1) * 90 85 | var long = (random.nextDouble() * 2 - 1) * 180 86 | count1 = DS1.knn(Array("x", "y"), Array(long, lat), 20).count() 87 | } 88 | t1 = System.nanoTime() 89 | println("Count: " + count1) 90 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 91 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 92 | 93 | println("k=30") 94 | t0 = System.nanoTime() 95 | for (i <- 1 to nQueries) { 96 | var lat = (random.nextDouble() * 2 - 1) * 90 97 | var long = (random.nextDouble() * 2 - 1) * 180 98 | count1 = DS1.knn(Array("x", "y"), Array(long, lat), 30).count() 99 | } 100 | t1 = System.nanoTime() 101 | println("Count: " + count1) 102 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 103 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 104 | 105 | println("k=40") 106 | t0 = System.nanoTime() 107 | for (i <- 1 to nQueries) { 108 | var lat = (random.nextDouble() * 2 - 1) * 90 109 | var long = (random.nextDouble() * 2 - 1) * 180 110 | count1 = DS1.knn(Array("x", "y"), Array(long, lat), 40).count() 111 | } 112 | t1 = System.nanoTime() 113 | println("Count: " + count1) 114 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 115 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 116 | 117 | println("k=50") 118 | t0 = System.nanoTime() 119 | for (i <- 1 to nQueries) { 120 | var lat = (random.nextDouble() * 2 - 1) * 90 121 | var long = (random.nextDouble() * 2 - 1) * 180 122 | count1 = DS1.knn(Array("x", "y"), Array(long, lat), 50).count() 123 | } 124 | t1 = System.nanoTime() 125 | println("Count: " + count1) 126 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 127 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 128 | 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /spatialspark/DistanceJoin.scala: -------------------------------------------------------------------------------- 1 | 
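// Self distance join over the 200M-point WKT dataset via SpatialSpark's
// partitioned join with the WithinD predicate. The threshold of 0.000045027
// degrees corresponds to roughly 5 meters near the equator
// (0.000045027 deg * ~111.32 km/deg ≈ 5 m).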
package spatialspark.measurements 2 | 3 | import spatialspark.index.IndexConf 4 | import spatialspark.index.STIndex 5 | import spatialspark.util.MBR 6 | import com.vividsolutions.jts.geom.{Envelope, GeometryFactory} 7 | import com.vividsolutions.jts.io.WKTReader 8 | import spatialspark.operator.SpatialOperator 9 | import spatialspark.query.RangeQuery 10 | import spatialspark.operator.SpatialOperator 11 | import spatialspark.partition.bsp.BinarySplitPartitionConf 12 | import spatialspark.partition.fgp.FixedGridPartitionConf 13 | import spatialspark.partition.stp.SortTilePartitionConf 14 | import spatialspark.join.{BroadcastSpatialJoin, PartitionedSpatialJoin} 15 | import org.apache.spark.rdd.RDD 16 | import org.apache.spark.{SparkContext, SparkConf} 17 | import scala.util.Try 18 | 19 | object DistanceJoin { 20 | 21 | def main(args: Array[String]) { 22 | 23 | val conf = new SparkConf().setAppName("SpatialSpark Distance Join") 24 | conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 25 | conf.set("spark.kryo.registrator", "spatialspark.util.KyroRegistrator") 26 | val sc = new SparkContext(conf) 27 | 28 | val points = "/data/points_200M_wkt.csv" 29 | 30 | def time[R](block: => R): R = { 31 | val t0 = System.nanoTime() 32 | val result = block // call-by-name 33 | val t1 = System.nanoTime() 34 | println("Query Runtime: " + (t1 - t0) / 1E9 + " sec ") 35 | result 36 | } 37 | 38 | query(points, points, "Points", "Points") 39 | 40 | sc.stop() 41 | 42 | def query(leftInput: String, rightInput: String, leftGeomType: String, rightGeomType: String) { 43 | 44 | var t0 = 0L 45 | var t1 = 0L 46 | var count1 = 0L 47 | var count = 0L 48 | val nQueries = 10 49 | val extentString = "" 50 | val method = "stp" 51 | 52 | val rangeQueryWindow6 = new GeometryFactory().toGeometry(new Envelope(-180.0, 180.0, -90.0, 90.0)) 53 | 54 | println("************************************ " + leftGeomType + "-" + rightGeomType + " Spatial Join ************************************") 55 | 56 | t0 = System.nanoTime() 57 | val leftData = sc.textFile(leftInput, 1024).map(x => x.split("\t")).zipWithIndex() 58 | val leftGeometryById = leftData.map(x => (x._2, Try(new WKTReader().read(x._1.apply(0))))).filter(_._2.isSuccess).map(x => (x._1, x._2.get)).cache() 59 | count = leftGeometryById.count() 60 | t1 = System.nanoTime() 61 | println(leftGeomType + " Read Time: " + ((t1 - t0) / 1E9) + " sec") 62 | val leftTime = (t1 - t0) 63 | 64 | t0 = System.nanoTime() 65 | val rightData = sc.textFile(rightInput, 1024).map(x => x.split("\t")).zipWithIndex() 66 | val rightGeometryById = rightData.map(x => (x._2, Try(new WKTReader().read(x._1.apply(0))))).filter(_._2.isSuccess).map(x => (x._1, x._2.get)).cache() 67 | count = rightGeometryById.count() 68 | t1 = System.nanoTime() 69 | println(rightGeomType + " Read Time: " + ((t1 - t0) / 1E9) + " sec") 70 | val rightTime = (t1 - t0) 71 | val read_time = (leftTime + rightTime) / 1E9 72 | println("Total Read Time: " + read_time + " sec") 73 | 74 | 75 | var matchedPairs: RDD[(Long, Long)] = sc.emptyRDD 76 | t0 = System.nanoTime() 77 | val extent = extentString match { 78 | case "" => 79 | val temp = leftGeometryById.map(x => x._2.getEnvelopeInternal).map(x => (x.getMinX, x.getMinY, x.getMaxX, x.getMaxY)).reduce((a, b) => (a._1 min b._1, a._2 min b._2, a._3 max b._3, a._4 max b._4)) 80 | val temp2 = rightGeometryById.map(x => x._2.getEnvelopeInternal).map(x => (x.getMinX, x.getMinY, x.getMaxX, x.getMaxY)).reduce((a, b) => (a._1 min b._1, a._2 min b._2, a._3 max b._3, a._4 max b._4)) 
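// The two reduces above fold per-geometry envelopes into one global MBR
// (minX, minY, maxX, maxY) per input; the tuple built on the next line is the
// union of the two MBRs and is the extent the spatial partitioner is built over.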
81 | (temp._1 min temp2._1, temp._2 min temp2._2, temp._3 max temp2._3, temp._4 max temp2._4) 82 | case _ => (extentString.split(":").apply(0).toDouble, extentString.split(":").apply(1).toDouble, 83 | extentString.split(":").apply(2).toDouble, extentString.split(":").apply(3).toDouble) 84 | } 85 | 86 | val partConf = method match { 87 | case "stp" => 88 | val dimX = 32 89 | val dimY = 32 90 | val ratio = 0.3 91 | new SortTilePartitionConf(dimX, dimY, new MBR(extent._1, extent._2, extent._3, extent._4), ratio, true) 92 | case "bsp" => 93 | val level = 32 94 | val ratio = 0.1 95 | new BinarySplitPartitionConf(ratio, new MBR(extent._1, extent._2, extent._3, extent._4), level, true) 96 | case _ => 97 | val dimX = 32 98 | val dimY = 32 99 | new FixedGridPartitionConf(dimX, dimY, new MBR(extent._1, extent._2, extent._3, extent._4)) 100 | } 101 | t1 = System.nanoTime() 102 | val extentTime = ((t1 - t0) / 1E9) 103 | println("Extent Time: " + extentTime + " sec") 104 | 105 | // Embed timers inside SpatialSpark PartitionedJoin to get partitioning and indexing time. Subtract the partitioning and indexing time from join time below to get purely join time 106 | t0 = System.nanoTime() 107 | matchedPairs = PartitionedSpatialJoin(sc, leftGeometryById, rightGeometryById, SpatialOperator.WithinD, 0.000045027, partConf) 108 | count = matchedPairs.count() 109 | t1 = System.nanoTime() 110 | val time0 = ((t1 - t0) / 1E9) 111 | 112 | val total_time = time0 + read_time + extentTime 113 | println("Join Time (Including Partitioning and Indexing Time): " + time0 + " sec") 114 | println("Total Join Time: " + total_time + " sec") 115 | 116 | leftGeometryById.unpersist() 117 | rightGeometryById.unpersist() 118 | } 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /spatialspark/RangeQueries.scala: -------------------------------------------------------------------------------- 1 | package spatialspark.measurements 2 | 3 | import spatialspark.index.IndexConf 4 | import spatialspark.index.STIndex 5 | import spatialspark.util.MBR 6 | import com.vividsolutions.jts.geom.{Envelope, GeometryFactory} 7 | import com.vividsolutions.jts.io.WKTReader 8 | import spatialspark.operator.SpatialOperator 9 | import spatialspark.query.RangeQuery 10 | import org.apache.spark.{SparkContext, SparkConf} 11 | 12 | object RangeQueries { 13 | 14 | def main(args: Array[String]) { 15 | 16 | val conf = new SparkConf().setAppName("SpatialSpark Range Queries") 17 | conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 18 | conf.set("spark.kryo.registrator", "spatialspark.util.KyroRegistrator") 19 | val sc = new SparkContext(conf) 20 | 21 | val points = "/data/points_200M_wkt.csv" 22 | val polygons = "/data/buildings_114M.csv" 23 | val rectangles = "/data/rectangles_114M.csv" 24 | val linestrings = "/data/linestrings_72M.csv" 25 | 26 | def time[R](block: => R): R = { 27 | val t0 = System.nanoTime() 28 | val result = block // call-by-name 29 | val t1 = System.nanoTime() 30 | println("Query Runtime: " + (t1 - t0) / 1E9 + " sec ") 31 | result 32 | } 33 | 34 | point(points, "Points") 35 | linestring(linestrings, "Linestrings") 36 | query(polygons, "Polygons") 37 | query(rectangles, "Rectangles") 38 | 39 | sc.stop() 40 | 41 | def point(input: String, geomType: String) { 42 | 43 | val nQueries = 100 44 | var t0 = 0L 45 | var t1 = 0L 46 | var count1 = 0L 47 | 48 | val rangeQueryWindow1 = new GeometryFactory().toGeometry(new Envelope(-50.3010141441, -24.9526465797, -53.209588996,
-30.1096863746)) 49 | val rangeQueryWindow2 = new GeometryFactory().toGeometry(new Envelope(-54.4270741441, -24.9526465797, -53.209588996, -30.1096863746)) 50 | val rangeQueryWindow3 = new GeometryFactory().toGeometry(new Envelope(-114.4270741441, 42.9526465797, -54.509588996, -27.0106863746)) 51 | val rangeQueryWindow4 = new GeometryFactory().toGeometry(new Envelope(-82.7638020000, 42.9526465797, -54.509588996, 38.0106863746)) 52 | val rangeQueryWindow5 = new GeometryFactory().toGeometry(new Envelope(-140.99778, 5.7305630159, -52.6480987209, 83.23324)) 53 | val rangeQueryWindow6 = new GeometryFactory().toGeometry(new Envelope(-180.0, 180.0, -90.0, 90.0)) 54 | 55 | val inputData = sc.textFile(input).map(x => (new WKTReader).read(x.split("\t").apply(0))) 56 | val inputDataWithId = inputData.zipWithIndex().map(_.swap).cache() 57 | val count = inputDataWithId.count() 58 | 59 | // Dry run 60 | t0 = System.nanoTime() 61 | for (i <- 1 to 20) { 62 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow1, SpatialOperator.Within, 0.0).count() 63 | } 64 | t1 = System.nanoTime() 65 | 66 | // Actual Measurements 67 | println("************************************ " + geomType + " range queries ************************************") 68 | 69 | println("Range1: ") 70 | t0 = System.nanoTime() 71 | for (i <- 1 to nQueries) { 72 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow1, SpatialOperator.Within, 0.0).count() 73 | } 74 | t1 = System.nanoTime() 75 | println("Count: " + count1) 76 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 77 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 78 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 79 | t1 = 0L 80 | t0 = 0L 81 | 82 | println("Range2: ") 83 | t0 = System.nanoTime() 84 | for (i <- 1 to nQueries) { 85 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow2, SpatialOperator.Within, 0.0).count() 86 | } 87 | t1 = System.nanoTime() 88 | println("Count: " + count1) 89 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 90 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 91 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 92 | t1 = 0L 93 | t0 = 0L 94 | 95 | println("Range3: ") 96 | t0 = System.nanoTime() 97 | for (i <- 1 to nQueries) { 98 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow3, SpatialOperator.Within, 0.0).count() 99 | } 100 | t1 = System.nanoTime() 101 | println("Count: " + count1) 102 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 103 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 104 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 105 | t1 = 0L 106 | t0 = 0L 107 | 108 | println("Range4: ") 109 | t0 = System.nanoTime() 110 | for (i <- 1 to nQueries) { 111 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow4, SpatialOperator.Within, 0.0).count() 112 | } 113 | t1 = System.nanoTime() 114 | println("Count: " + count1) 115 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 116 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 117 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 118 | t1 = 0L 119 | t0 = 0L 120 | 121 | println("Range5: ") 122 | t0 = System.nanoTime() 123 | for (i <- 1 to nQueries) { 124 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow5, SpatialOperator.Within, 0.0).count() 125 | } 126 | t1 = System.nanoTime() 127 | println("Count: " + count1) 128 | println("Selection Ratio: " + ((count1 * 
100.0) / count)) 129 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 130 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 131 | 132 | println("Range6: ") 133 | t0 = System.nanoTime() 134 | for (i <- 1 to nQueries) { 135 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow6, SpatialOperator.Within, 0.0).count() 136 | } 137 | t1 = System.nanoTime() 138 | println("Count: " + count1) 139 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 140 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 141 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 142 | t1 = 0L 143 | t0 = 0L 144 | 145 | inputDataWithId.unpersist() 146 | } 147 | 148 | def linestring(input: String, geomType: String) { 149 | 150 | val nQueries = 100 151 | var t0 = 0L 152 | var t1 = 0L 153 | var count1 = 0L 154 | 155 | val rangeQueryWindow1 = new GeometryFactory().toGeometry(new Envelope(-50.204, -24.9526465797, -53.209588996, -30.1096863746)) 156 | val rangeQueryWindow2 = new GeometryFactory().toGeometry(new Envelope(-52.1270741441, -24.9526465797, -53.209588996, -30.1096863746)) 157 | val rangeQueryWindow3 = new GeometryFactory().toGeometry(new Envelope(-94.4270741441, 22.9526465797, -34.609588996, -27.0106863746)) 158 | val rangeQueryWindow4 = new GeometryFactory().toGeometry(new Envelope(-74.0938020000, 42.9526465797, -54.509588996, 38.0106863746)) 159 | val rangeQueryWindow5 = new GeometryFactory().toGeometry(new Envelope(-150.99778, 7.2705630159, -52.6480987209, 83.23324)) 160 | val rangeQueryWindow6 = new GeometryFactory().toGeometry(new Envelope(-180.0, 180.0, -90.0, 90.0)) 161 | 162 | val inputData = sc.textFile(input).map(x => (new WKTReader).read(x.split("\t").apply(0))) 163 | val inputDataWithId = inputData.zipWithIndex().map(_.swap).cache() 164 | val count = inputDataWithId.count() 165 | t0 = System.nanoTime() 166 | 167 | // Dry run 168 | for (i <- 1 to 20) { 169 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow1, SpatialOperator.Within, 0.0).count() 170 | } 171 | t1 = System.nanoTime() 172 | 173 | // Actual Measurements 174 | println("************************************ " + geomType + " range queries ************************************") 175 | 176 | println("Range1: ") 177 | t0 = System.nanoTime() 178 | for (i <- 1 to nQueries) { 179 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow1, SpatialOperator.Within, 0.0).count() 180 | } 181 | t1 = System.nanoTime() 182 | println("Count: " + count1) 183 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 184 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 185 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 186 | t1 = 0L 187 | t0 = 0L 188 | 189 | println("Range2: ") 190 | t0 = System.nanoTime() 191 | for (i <- 1 to nQueries) { 192 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow2, SpatialOperator.Within, 0.0).count() 193 | } 194 | t1 = System.nanoTime() 195 | println("Count: " + count1) 196 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 197 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 198 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 199 | t1 = 0L 200 | t0 = 0L 201 | 202 | println("Range3: ") 203 | t0 = System.nanoTime() 204 | for (i <- 1 to nQueries) { 205 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow3, SpatialOperator.Within, 0.0).count() 206 | } 207 | t1 = System.nanoTime() 208 | println("Count: " + count1) 209 | 
println("Selection Ratio: " + ((count1 * 100.0) / count)) 210 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 211 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 212 | t1 = 0L 213 | t0 = 0L 214 | 215 | println("Range4: ") 216 | t0 = System.nanoTime() 217 | for (i <- 1 to nQueries) { 218 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow4, SpatialOperator.Within, 0.0).count() 219 | } 220 | t1 = System.nanoTime() 221 | println("Count: " + count1) 222 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 223 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 224 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 225 | t1 = 0L 226 | t0 = 0L 227 | 228 | println("Range5: ") 229 | t0 = System.nanoTime() 230 | for (i <- 1 to nQueries) { 231 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow5, SpatialOperator.Within, 0.0).count() 232 | } 233 | t1 = System.nanoTime() 234 | println("Count: " + count1) 235 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 236 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 237 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 238 | t1 = 0L 239 | t0 = 0L 240 | 241 | println("Range6: ") 242 | t0 = System.nanoTime() 243 | for (i <- 1 to nQueries) { 244 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow6, SpatialOperator.Within, 0.0).count() 245 | } 246 | t1 = System.nanoTime() 247 | println("Count: " + count1) 248 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 249 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 250 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 251 | 252 | inputDataWithId.unpersist() 253 | } 254 | 255 | def query(input: String, geomType: String) { 256 | 257 | val nQueries = 100 258 | var t0 = 0L 259 | var t1 = 0L 260 | var count1 = 0L 261 | 262 | val rangeQueryWindow1 = new GeometryFactory().toGeometry(new Envelope(-20.204, 17.9526465797, -53.209588996, -30.1096863746)) 263 | val rangeQueryWindow2 = new GeometryFactory().toGeometry(new Envelope(-20.204, 20.4376465797, -53.209588996, -30.1096863746)) 264 | val rangeQueryWindow3 = new GeometryFactory().toGeometry(new Envelope(-74.4270741441, 72.9526465797, -34.609588996, -6.5906863746)) 265 | val rangeQueryWindow4 = new GeometryFactory().toGeometry(new Envelope(-104.0938020000, 118.9526465797, -54.509588996, 40.2406863746)) 266 | val rangeQueryWindow5 = new GeometryFactory().toGeometry(new Envelope(-174.4270741441, 72.9526465797, -34.609588996, 48.4396863746)) 267 | val rangeQueryWindow6 = new GeometryFactory().toGeometry(new Envelope(-180.0, 180.0, -90.0, 90.0)) 268 | 269 | val inputData = sc.textFile(input).map(x => (new WKTReader).read(x.split("\t").apply(0))) 270 | val inputDataWithId = inputData.zipWithIndex().map(_.swap).cache() 271 | val count = inputDataWithId.count() 272 | t0 = System.nanoTime() 273 | 274 | // Dry run 275 | for (i <- 1 to 20) { 276 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow1, SpatialOperator.Within, 0.0).count() 277 | } 278 | t1 = System.nanoTime() 279 | 280 | // Actual Measurements 281 | println("************************************ " + geomType + " range queries ************************************") 282 | 283 | println("Range1: ") 284 | t0 = System.nanoTime() 285 | for (i <- 1 to nQueries) { 286 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow1, SpatialOperator.Within, 0.0).count() 287 | } 288 | t1 = System.nanoTime() 289 | 
println("Count: " + count1) 290 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 291 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 292 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 293 | t1 = 0L 294 | t0 = 0L 295 | 296 | println("Range2: ") 297 | t0 = System.nanoTime() 298 | for (i <- 1 to nQueries) { 299 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow2, SpatialOperator.Within, 0.0).count() 300 | } 301 | t1 = System.nanoTime() 302 | println("Count: " + count1) 303 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 304 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 305 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 306 | t1 = 0L 307 | t0 = 0L 308 | 309 | println("Range3: ") 310 | t0 = System.nanoTime() 311 | for (i <- 1 to nQueries) { 312 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow3, SpatialOperator.Within, 0.0).count() 313 | } 314 | t1 = System.nanoTime() 315 | println("Count: " + count1) 316 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 317 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 318 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 319 | t1 = 0L 320 | t0 = 0L 321 | 322 | println("Range4: ") 323 | t0 = System.nanoTime() 324 | for (i <- 1 to nQueries) { 325 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow4, SpatialOperator.Within, 0.0).count() 326 | } 327 | t1 = System.nanoTime() 328 | println("Count: " + count1) 329 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 330 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 331 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 332 | t1 = 0L 333 | t0 = 0L 334 | 335 | println("Range5: ") 336 | t0 = System.nanoTime() 337 | for (i <- 1 to nQueries) { 338 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow5, SpatialOperator.Within, 0.0).count() 339 | } 340 | t1 = System.nanoTime() 341 | println("Count: " + count1) 342 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 343 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 344 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 345 | t1 = 0L 346 | t0 = 0L 347 | 348 | println("Range6: ") 349 | t0 = System.nanoTime() 350 | for (i <- 1 to nQueries) { 351 | count1 = RangeQuery(sc, inputDataWithId, rangeQueryWindow6, SpatialOperator.Within, 0.0).count() 352 | } 353 | t1 = System.nanoTime() 354 | println("Count: " + count1) 355 | println("Selection Ratio: " + ((count1 * 100.0) / count)) 356 | println("Total Time: " + ((t1 - t0) / 1E9) + " sec") 357 | println("Throughput: " + (nQueries * 60) / ((t1 - t0) / (1E9)) + " queries/min") 358 | 359 | inputDataWithId.unpersist() 360 | } 361 | } 362 | } 363 | -------------------------------------------------------------------------------- /spatialspark/SpatialJoins.scala: -------------------------------------------------------------------------------- 1 | package spatialspark.measurements 2 | 3 | import spatialspark.index.IndexConf 4 | import spatialspark.index.STIndex 5 | import spatialspark.util.MBR 6 | import com.vividsolutions.jts.geom.{Envelope, GeometryFactory} 7 | import com.vividsolutions.jts.io.WKTReader 8 | import spatialspark.operator.SpatialOperator 9 | import spatialspark.query.RangeQuery 10 | import spatialspark.operator.SpatialOperator 11 | import spatialspark.partition.bsp.BinarySplitPartitionConf 12 | import 
spatialspark.partition.fgp.FixedGridPartitionConf 13 | import spatialspark.partition.stp.SortTilePartitionConf 14 | import spatialspark.join.{BroadcastSpatialJoin, PartitionedSpatialJoin} 15 | import org.apache.spark.rdd.RDD 16 | import org.apache.spark.{SparkContext, SparkConf} 17 | import scala.util.Try 18 | 19 | object SpatialJoins { 20 | 21 | def main(args: Array[String]) { 22 | 23 | val conf = new SparkConf().setAppName("SpatialSpark Spatial Joins") 24 | conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 25 | conf.set("spark.kryo.registrator", "spatialspark.util.KyroRegistrator") 26 | val sc = new SparkContext(conf) 27 | 28 | val points = "/data/points_200M_wkt.csv" 29 | val polygons = "/data/buildings_114M.csv" 30 | val rectangles = "/data/rectangles_114M.csv" 31 | val linestrings = "/data/linestrings_72M.csv" 32 | 33 | def time[R](block: => R): R = { 34 | val t0 = System.nanoTime() 35 | val result = block // call-by-name 36 | val t1 = System.nanoTime() 37 | println("Query Runtime: " + (t1 - t0) / 1E9 + " sec ") 38 | result 39 | } 40 | 41 | query(points, points, "Points", "Points") 42 | query(points, linestrings, "Points", "LineStrings") 43 | query(points, rectangles, "Points", "Rectangles") 44 | query(points, polygons, "Points", "Polygons") 45 | query(linestrings, linestrings, "LineStrings", "LineStrings") 46 | query(linestrings, rectangles, "LineStrings", "Rectangles") 47 | query(linestrings, polygons, "LineStrings", "Polygons") 48 | query(rectangles, rectangles, "Rectangles", "Rectangles") 49 | query(rectangles, polygons, "Rectangles", "Polygons") 50 | query(polygons, polygons, "Polygons", "Polygons") 51 | 52 | sc.stop() 53 | 54 | def query(leftInput: String, rightInput: String, leftGeomType: String, rightGeomType: String) { 55 | 56 | var t0 = 0L 57 | var t1 = 0L 58 | var count1 = 0L 59 | var count = 0L 60 | val nQueries = 10 61 | val extentString = "" 62 | val method = "stp" 63 | 64 | val rangeQueryWindow6 = new GeometryFactory().toGeometry(new Envelope(-180.0, 180.0, -90.0, 90.0)) 65 | 66 | println("************************************ " + leftGeomType + "-" + rightGeomType + " Spatial Join ************************************") 67 | 68 | t0 = System.nanoTime() 69 | val leftData = sc.textFile(leftInput, 1024).map(x => x.split("\t")).zipWithIndex() 70 | val leftGeometryById = leftData.map(x => (x._2, Try(new WKTReader().read(x._1.apply(0))))).filter(_._2.isSuccess).map(x => (x._1, x._2.get)).cache() 71 | count = leftGeometryById.count() 72 | t1 = System.nanoTime() 73 | println(leftGeomType + " Read Time: " + ((t1 - t0) / 1E9) + " sec") 74 | val leftTime = (t1 - t0) 75 | 76 | t0 = System.nanoTime() 77 | val rightData = sc.textFile(rightInput, 1024).map(x => x.split("\t")).zipWithIndex() 78 | val rightGeometryById = rightData.map(x => (x._2, Try(new WKTReader().read(x._1.apply(0))))).filter(_._2.isSuccess).map(x => (x._1, x._2.get)).cache() 79 | count = rightGeometryById.count() 80 | t1 = System.nanoTime() 81 | println(rightGeomType + " Read Time: " + ((t1 - t0) / 1E9) + " sec") 82 | val rightTime = (t1 - t0) 83 | val read_time = (leftTime + rightTime) / 1E9 84 | println("Total Reading Time: " + read_time + " sec") 85 | 86 | 87 | var matchedPairs: RDD[(Long, Long)] = sc.emptyRDD 88 | t0 = System.nanoTime() 89 | val extent = extentString match { 90 | case "" => 91 | val temp = leftGeometryById.map(x => x._2.getEnvelopeInternal).map(x => (x.getMinX, x.getMinY, x.getMaxX, x.getMaxY)).reduce((a, b) => (a._1 min b._1, a._2 min b._2, a._3 max b._3, a._4 max b._4)) 
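// With extentString empty, the join extent is derived from the data itself:
// temp above is the left input's global MBR, temp2 below the right input's,
// and their union feeds the partitioner configuration.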
92 | val temp2 = rightGeometryById.map(x => x._2.getEnvelopeInternal).map(x => (x.getMinX, x.getMinY, x.getMaxX, x.getMaxY)).reduce((a, b) => (a._1 min b._1, a._2 min b._2, a._3 max b._3, a._4 max b._4)) 93 | (temp._1 min temp2._1, temp._2 min temp2._2, temp._3 max temp2._3, temp._4 max temp2._4) 94 | case _ => (extentString.split(":").apply(0).toDouble, extentString.split(":").apply(1).toDouble, 95 | extentString.split(":").apply(2).toDouble, extentString.split(":").apply(3).toDouble) 96 | } 97 | 98 | val partConf = method match { 99 | case "stp" => 100 | val dimX = 32 101 | val dimY = 32 102 | val ratio = 0.3 103 | new SortTilePartitionConf(dimX, dimY, new MBR(extent._1, extent._2, extent._3, extent._4), ratio, true) 104 | case "bsp" => 105 | val level = 32 106 | val ratio = 0.1 107 | new BinarySplitPartitionConf(ratio, new MBR(extent._1, extent._2, extent._3, extent._4), level, true) 108 | case _ => 109 | val dimX = 32 110 | val dimY = 32 111 | new FixedGridPartitionConf(dimX, dimY, new MBR(extent._1, extent._2, extent._3, extent._4)) 112 | } 113 | t1 = System.nanoTime() 114 | val extentTime = ((t1 - t0) / 1E9) 115 | println("Extent Time: " + extentTime + " sec") 116 | 117 | // Embed timers inside PartitionedJoin in SpatialSpark to get partitioning and indexing time. Subtract the partitioning and indexing time from join time below to get purely join time 118 | t0 = System.nanoTime() 119 | matchedPairs = PartitionedSpatialJoin(sc, leftGeometryById, rightGeometryById, SpatialOperator.Intersects, 0.0, partConf) 120 | count = matchedPairs.count() 121 | t1 = System.nanoTime() 122 | val time0 = ((t1 - t0) / 1E9) 123 | 124 | val total_time = time0 + read_time + extentTime 125 | println("Join Time (including partitioning and indexing time): " + time0 + " sec") 126 | println("Total Join Time: " + total_time + " sec") 127 | 128 | leftGeometryById.unpersist() 129 | rightGeometryById.unpersist() 130 | } 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /tiger_results/README.md: -------------------------------------------------------------------------------- 1 | 2 | # US Census Tiger Dataset 3 | 4 | We used the LineString dataset from the TIGER dataset, which contains approximately 70 million linestrings in the US. There are other datasets in TIGER, but they are limited in size (less than 2 million spatial objects). To have a larger dataset to join with, we generated a rectangle dataset by computing the bounding boxes of these linestrings (a minimal sketch of this step is shown below). We also sampled 170 million points in the US from the OSM dataset to join with these datasets. 5 |
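A minimal sketch of the bounding-box step above, assuming WKT linestrings as input and JTS (the geometry library already used by the benchmarks) on the classpath; `RectangleDatasetSketch` and `boundingBoxWKT` are hypothetical names, not code from this repository:

```scala
import com.vividsolutions.jts.geom.GeometryFactory
import com.vividsolutions.jts.io.{WKTReader, WKTWriter}

object RectangleDatasetSketch {
  // Bounding-box rectangle of one WKT linestring, returned as WKT.
  def boundingBoxWKT(lineWkt: String): String = {
    val geom = new WKTReader().read(lineWkt)         // parse the WKT linestring
    val env = geom.getEnvelopeInternal               // its minimum bounding rectangle
    val rect = new GeometryFactory().toGeometry(env) // Envelope -> rectangle geometry
    new WKTWriter().write(rect)                      // serialize back to WKT
  }

  def main(args: Array[String]): Unit = {
    // prints POLYGON ((0 0, 0 3, 3 3, 3 0, 0 0))
    println(boundingBoxWKT("LINESTRING (0 0, 2 1, 3 3)"))
  }
}
```

Mapping this helper over the linestring file yields one rectangle per linestring, which the benchmarks then read back through the same WKTReader pipeline as the other WKT inputs.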
6 | ## Distance Join Performance 7 | 8 | 9 | 10 | ![Distance join costs breakdown](images/distance_join_cost_breakdown.jpg) 11 | 12 | Distance join costs breakdown 13 | 14 | 15 | 16 | ![Distance join scalability](images/distance_join_scalability.jpg) 17 | 18 | Distance join scalability 19 | 20 | 21 | 22 | ![Distance join shuffle costs](images/distance_join_shuffle.jpg) 23 | 24 | Distance join shuffle costs 25 | 26 | --- 27 | 28 | 29 | ## Spatial Joins Performance 30 | 31 | 32 | 33 | ![Spatial joins scalability](images/joins_scalability.jpg) 34 | 35 | Spatial joins scalability 36 | 37 | 38 | 39 | ![Spatial joins peak memory](images/joins_peak_memory.jpg) 40 | 41 | Spatial joins peak execution memory consumption 42 | 43 | 44 | 45 | ![Spatial joins shuffle writes](images/joins_shuffle_writes.jpg) 46 | 47 | Spatial joins shuffle write costs 48 | 49 | 50 | 51 | 52 | ![Spatial joins shuffle reads](images/joins_shuffle_reads.jpg) 53 | 54 | Spatial joins shuffle read costs 55 | 56 | 57 | 58 | ![Spatial joins costs breakdown](images/spatial_joins_breakdown.jpg) 59 | 60 | Spatial joins costs breakdown on a single node 61 | 62 | 63 | 64 | ![Point-Rectangle spatial join costs breakdown](images/spatial_joins.jpg)
65 | 66 | Point-Rectangle spatial join costs breakdown scaling up the number of nodes 67 | 68 | --- 69 | 70 | ## kNN Join Performance 71 | 72 |
73 | 74 | ![kNN join costs breakdown](images/knn_join_cost_breakdown.jpg) 75 | 76 | kNN join costs breakdown 77 | 78 | 79 | 80 | ![kNN join scalability](images/knn_join_scalability.jpg) 81 | 82 | kNN join scalability 83 | 84 | 85 | 86 | ![kNN join shuffle costs](images/knn_join_shuffle.jpg)
87 | 88 | kNN join shuffle costs 89 | 90 | 91 | -------------------------------------------------------------------------------- /tiger_results/images/distance_join_cost_breakdown.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varpande/spatialanalytics/1558a6ebf922f6b614bb0742e9a30c98befdf5cf/tiger_results/images/distance_join_cost_breakdown.jpg -------------------------------------------------------------------------------- /tiger_results/images/distance_join_scalability.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varpande/spatialanalytics/1558a6ebf922f6b614bb0742e9a30c98befdf5cf/tiger_results/images/distance_join_scalability.jpg -------------------------------------------------------------------------------- /tiger_results/images/distance_join_shuffle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varpande/spatialanalytics/1558a6ebf922f6b614bb0742e9a30c98befdf5cf/tiger_results/images/distance_join_shuffle.jpg -------------------------------------------------------------------------------- /tiger_results/images/joins_peak_memory.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varpande/spatialanalytics/1558a6ebf922f6b614bb0742e9a30c98befdf5cf/tiger_results/images/joins_peak_memory.jpg -------------------------------------------------------------------------------- /tiger_results/images/joins_scalability.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varpande/spatialanalytics/1558a6ebf922f6b614bb0742e9a30c98befdf5cf/tiger_results/images/joins_scalability.jpg -------------------------------------------------------------------------------- /tiger_results/images/joins_shuffle_reads.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varpande/spatialanalytics/1558a6ebf922f6b614bb0742e9a30c98befdf5cf/tiger_results/images/joins_shuffle_reads.jpg -------------------------------------------------------------------------------- /tiger_results/images/joins_shuffle_writes.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varpande/spatialanalytics/1558a6ebf922f6b614bb0742e9a30c98befdf5cf/tiger_results/images/joins_shuffle_writes.jpg -------------------------------------------------------------------------------- /tiger_results/images/knn_join_cost_breakdown.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varpande/spatialanalytics/1558a6ebf922f6b614bb0742e9a30c98befdf5cf/tiger_results/images/knn_join_cost_breakdown.jpg -------------------------------------------------------------------------------- /tiger_results/images/knn_join_scalability.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varpande/spatialanalytics/1558a6ebf922f6b614bb0742e9a30c98befdf5cf/tiger_results/images/knn_join_scalability.jpg -------------------------------------------------------------------------------- /tiger_results/images/knn_join_shuffle.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/varpande/spatialanalytics/1558a6ebf922f6b614bb0742e9a30c98befdf5cf/tiger_results/images/knn_join_shuffle.jpg -------------------------------------------------------------------------------- /tiger_results/images/spatial_joins.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varpande/spatialanalytics/1558a6ebf922f6b614bb0742e9a30c98befdf5cf/tiger_results/images/spatial_joins.jpg -------------------------------------------------------------------------------- /tiger_results/images/spatial_joins_breakdown.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/varpande/spatialanalytics/1558a6ebf922f6b614bb0742e9a30c98befdf5cf/tiger_results/images/spatial_joins_breakdown.jpg --------------------------------------------------------------------------------