├── .gitignore
├── data
│   └── records.json
├── src
│   ├── main
│   │   └── scala
│   │       └── MainApp.scala
│   └── test
│       └── java
│           └── TestThriftClient.java
├── README.md
├── dependency-reduced-pom.xml
└── pom.xml

/.gitignore:
--------------------------------------------------------------------------------
.idea
target
*.iml
*.log
metastore_db
--------------------------------------------------------------------------------
/data/records.json:
--------------------------------------------------------------------------------
{"id":"305901156","language":"English","content":"Demo SparkSql as JDBC source via Spark ThriftServer"}
--------------------------------------------------------------------------------
/src/main/scala/MainApp.scala:
--------------------------------------------------------------------------------
import org.apache.spark.sql.SparkSession

/**
  * Created by surthi on 28/06/17.
  */
object MainApp {

  def main(args: Array[String]): Unit = {

    println("Hi")

    // Change master from local[2] accordingly (e.g. set it to "yarn" when running on a YARN cluster).
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("SparkSqlExample")
      .config("spark.sql.warehouse.dir", "/beeline/spark-sql-warehouse")
      .config("hive.server2.thrift.port", "10000")
      .config("spark.sql.hive.thriftServer.singleSession", true)
      .enableHiveSupport()
      .getOrCreate()

    // Commonly imported for DataFrame/Dataset conversions.
    import spark.implicits._

    // For simplicity, the data is loaded from a single file here.
    // In reality, this records dataset could be real-time, continuously streaming data.
    val records = spark.read.format("json").load("data/records.json")
    records.show()
    records.write.saveAsTable("records")

    // This loop keeps the session alive so the registered table stays available to clients.
    while (true) {
      Thread.`yield`()
    }
  }
}

// Cloud-based SQL engine using Spark, made available for access as a JDBC/ODBC data source via the Spark Thrift server.
--------------------------------------------------------------------------------
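Note: MainApp above only registers the `records` table and keeps the session alive; it assumes a Spark Thrift server is available to serve that table (for example, one started with Spark's `start-thriftserver.sh` against the same warehouse/metastore). An alternative is to start the Thrift server inside the application itself. The sketch below is an assumption, not part of this repo: the object name is made up, and it would need the `spark-hive-thriftserver_2.11` artifact, which is not in this project's pom.xml.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2

// Hypothetical variant of MainApp that starts the Thrift server in-process.
object InProcessThriftApp {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("InProcessThriftApp")
      .config("hive.server2.thrift.port", "10000")
      .config("spark.sql.hive.thriftServer.singleSession", true)
      .enableHiveSupport()
      .getOrCreate()

    // Register the sample data, as MainApp does.
    spark.read.format("json").load("data/records.json").createOrReplaceTempView("records")

    // Start the Thrift server inside this JVM so it shares this session and sees the temp view.
    HiveThriftServer2.startWithContext(spark.sqlContext)

    // Keep the application (and therefore the server) alive.
    while (true) {
      Thread.`yield`()
    }
  }
}
```
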
/src/test/java/TestThriftClient.java:
--------------------------------------------------------------------------------
/**
 * Created by surthi on 30/06/17.
 */
import java.sql.SQLException;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import java.sql.DriverManager;

/**
 * A test client that connects to Spark's Thrift server over JDBC and verifies that the data is present.
 * Three queries are run: "show tables", "describe <table>" and "select * from <table>".
 */
public class TestThriftClient {
    private static String driverName = "org.apache.hive.jdbc.HiveDriver";

    public static void main(String[] args) throws SQLException {

        // Verify that the Hive JDBC driver is present on the classpath.
        try {
            Class.forName(driverName);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
            System.exit(1);
        }

        // Change the connection URL, username and password accordingly.
        String connUrl = "jdbc:hive2://localhost:10000/default";
        String username = "";
        String password = "";

        Connection con = DriverManager.getConnection(connUrl, username, password);
        Statement stmt = con.createStatement();

        // show tables
        String sql = "show tables";
        System.out.println("Running: " + sql);
        ResultSet res = stmt.executeQuery(sql);
        if (res.next()) {
            System.out.println(res.getString(1));
        }

        // describe table
        String tableName = "records";
        sql = "describe " + tableName;
        System.out.println("Running: " + sql);
        res = stmt.executeQuery(sql);
        while (res.next()) {
            System.out.println(res.getString(1) + "\t" + res.getString(2));
        }

        // select * query
        sql = "select * from " + tableName;
        System.out.println("Running: " + sql);
        res = stmt.executeQuery(sql);
        while (res.next()) {
            System.out.println(res.getString(1) + "\t" + res.getString(2));
        }
    }
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Spark as Cloud-Based SQL Engine
This project shows how to use Spark as a cloud-based SQL engine and expose your big data as a JDBC/ODBC data source via the Spark Thrift server.

## 1. Central Idea
Traditional relational database engines ran into scalability problems with big data, which led to a number of SQL-on-Hadoop frameworks such as Hive, Cloudera Impala, Presto, etc. These frameworks are essentially cloud-based solutions, and they all come with their own advantages and limitations. This project demonstrates how SparkSQL fits in as one more SQL-on-Hadoop framework.

## 2. Architecture
The following illustrates how Apache Spark can be used as a SQL-on-Hadoop framework to serve your big data as a JDBC/ODBC data source via the Spark Thrift server (a minimal sketch follows the list):

- Data from multiple sources can be pushed into Spark and then exposed as SQL tables.
- These tables are then made accessible as a JDBC/ODBC data source via the Spark Thrift server.
- Multiple clients like ```Beeline CLI```, ```JDBC```, ```ODBC``` or ```BI tools like Tableau``` connect to the Spark Thrift server.
- Once the connection is established, the Thrift server contacts the ```SparkSQL engine to access Hive or Spark temp tables and run the SQL queries on the Apache Spark framework```.
- The Spark Thrift server works much like HiveServer2's Thrift service, except that HiveServer2 submits SQL queries as Hive MapReduce jobs, whereas the Spark Thrift server runs them on the Spark SQL engine, which underneath uses the full capabilities of Spark.

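To make the first two bullets concrete, here is a minimal sketch of pushing data into Spark and exposing it as SQL tables. Only `data/records.json` exists in this repo; the CSV path, the `orders` view and the object name are hypothetical.

```scala
import org.apache.spark.sql.SparkSession

// Minimal sketch: push data from multiple sources into Spark and expose it as SQL tables.
object RegisterTablesSketch {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("RegisterTablesSketch")
      .enableHiveSupport()
      .getOrCreate()

    // Source 1: the sample JSON shipped with this project, saved as a persistent table.
    spark.read.json("data/records.json")
      .write.mode("overwrite").saveAsTable("records")

    // Source 2: a hypothetical CSV file, exposed as a session-scoped temp view.
    // A temp view is visible to the Thrift server only when it shares this session
    // (e.g. in singleSession mode or when the server is started in-process).
    spark.read.option("header", "true").csv("data/orders.csv")
      .createOrReplaceTempView("orders")
  }
}
```

Once tables like these are registered, any of the clients listed above can query them over JDBC/ODBC through the Thrift server.
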
#### To know more about this topic, please refer to my blog [here](https://spoddutur.github.io/spark-notes/spark-as-cloud-based-sql-engine-via-thrift-server), where I describe the concept in detail.

## 3. Structure of the project:
- **data:** Contains the input JSON used in MainApp to register sample data with SparkSQL.
- **src/main/scala/MainApp.scala:** Spark 2.1 implementation that starts a SparkSession and registers the data from records.json with SparkSQL. (To keep the Spark session alive, there's a continuous while-loop in there.)
- **src/test/java/TestThriftClient.java:** Java class that demos how to connect to the Thrift server as a JDBC source and query the registered data.

## 4. How to run this project?
This project demos two things:
- 4.1. How to register data with SparkSQL
- 4.2. How to query the registered data via the Spark Thrift server - using **Beeline** and **JDBC**

### 4.1 How to register data with SparkSQL
- Download this project.
- Build it: `mvn clean install`
- Run MainApp: `spark-submit --class MainApp cloud-based-sql-engine-using-spark.jar`. That's it!
- It registers some sample data in a `records` table with SparkSQL.

### 4.2 How to query registered data via Spark Thrift Server using Beeline and JDBC?
For this, first connect to the Spark Thrift server. Once the connection is established, just like with HiveServer2, you can access Hive or Spark temp tables and run SQL queries on the Apache Spark framework. Here are two ways to do this:

1. **Beeline:** Perhaps the simplest way is the beeline command-line tool provided in Spark's bin folder.
```markdown
`$> beeline`
Beeline version 2.1.1-amzn-0 by Apache Hive

// Connect to spark thrift server..
`beeline> !connect jdbc:hive2://localhost:10000`
Connecting to jdbc:hive2://localhost:10000
Enter username for jdbc:hive2://localhost:10000:
Enter password for jdbc:hive2://localhost:10000:

// run your sql queries and access data..
`jdbc:hive2://localhost:10000> show tables;`
```
2. **Java JDBC:** Please refer to this project's test folder, where I've shared a Java example, `TestThriftClient.java`, that demos the same (a Scala sketch of the same flow follows below).

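For completeness, here is a minimal Scala sketch of the same JDBC flow that `TestThriftClient.java` demonstrates. It assumes the Thrift server is running on localhost:10000 and that MainApp has already registered the `records` table; the object name is made up, and the Hive JDBC driver (already a dependency of this project) must be on the classpath.

```scala
import java.sql.DriverManager

// Minimal Scala sketch mirroring TestThriftClient.java.
object ThriftJdbcSketch {

  def main(args: Array[String]): Unit = {
    // Make sure the Hive JDBC driver is loaded (same driver the Java test uses).
    Class.forName("org.apache.hive.jdbc.HiveDriver")

    // Adjust the URL, username and password for your Thrift server.
    val conn = DriverManager.getConnection("jdbc:hive2://localhost:10000/default", "", "")
    val stmt = conn.createStatement()
    try {
      val rs = stmt.executeQuery("select * from records")
      while (rs.next()) {
        println(s"${rs.getString(1)}\t${rs.getString(2)}")
      }
    } finally {
      stmt.close()
      conn.close()
    }
  }
}
```
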
## 5. Requirements
- Spark 2.1.0, Java 1.8 and Scala 2.11

## 6. References:
- The complete guide and references for this project are described in my blog [here](https://spoddutur.github.io/spark-notes/spark-as-cloud-based-sql-engine-via-thrift-server).
--------------------------------------------------------------------------------
/dependency-reduced-pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.spoddutur</groupId>
  <artifactId>cloud-based-sql-engine-using-spark</artifactId>
  <version>1.0-SNAPSHOT</version>
  <build>
    <resources>
      <resource>
        <directory>src/main/resources/${env}</directory>
      </resource>
      <resource>
        <directory>src/main/resources/common</directory>
      </resource>
    </resources>
    <plugins>
      <plugin>
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <version>3.2.0</version>
        <executions>
          <execution>
            <phase>process-sources</phase>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.3</version>
        <configuration>
          <source>${java.version}</source>
          <target>${java.version}</target>
        </configuration>
      </plugin>
      <plugin>
        <artifactId>maven-shade-plugin</artifactId>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <filters>
                <filter>
                  <artifact>*:*</artifact>
                  <excludes>
                    <exclude>META-INF/*.SF</exclude>
                    <exclude>META-INF/*.DSA</exclude>
                    <exclude>META-INF/*.RSA</exclude>
                  </excludes>
                </filter>
              </filters>
              <transformers>
                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                  <mainClass>at.seresunit.lecturemanager_connector.App</mainClass>
                </transformer>
                <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                  <resource>META-INF/spring.handlers</resource>
                </transformer>
                <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                  <resource>META-INF/spring.schemas</resource>
                </transformer>
              </transformers>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
      <exclusions>
        <exclusion>
          <artifactId>hamcrest-core</artifactId>
          <groupId>org.hamcrest</groupId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>org.hamcrest</groupId>
      <artifactId>hamcrest-all</artifactId>
      <version>1.3</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.scalacheck</groupId>
      <artifactId>scalacheck_2.11</artifactId>
      <version>1.11.4</version>
      <scope>test</scope>
      <exclusions>
        <exclusion>
          <artifactId>test-interface</artifactId>
          <groupId>org.scala-sbt</groupId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>org.scalatest</groupId>
      <artifactId>scalatest_2.11</artifactId>
      <version>2.2.0</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
  <properties>
    <scala.version>2.11.8</scala.version>
    <java.version>1.8</java.version>
    <scala.binary.version>2.11</scala.binary.version>
    <aws.version>1.10.50</aws.version>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <spark.version>2.1.0</spark.version>
  </properties>
</project>
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.spoddutur</groupId>
  <artifactId>cloud-based-sql-engine-using-spark</artifactId>
  <version>1.0-SNAPSHOT</version>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <java.version>1.8</java.version>
    <scala.version>2.11.8</scala.version>
    <scala.binary.version>2.11</scala.binary.version>
    <spark.version>2.1.0</spark.version>
    <aws.version>1.10.50</aws.version>
  </properties>

  <build>
    <plugins>
      <plugin>
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <version>3.2.0</version>
        <executions>
          <execution>
            <phase>process-sources</phase>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.3</version>
        <configuration>
          <source>${java.version}</source>
          <target>${java.version}</target>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <filters>
                <filter>
                  <artifact>*:*</artifact>
                  <excludes>
                    <exclude>META-INF/*.SF</exclude>
                    <exclude>META-INF/*.DSA</exclude>
                    <exclude>META-INF/*.RSA</exclude>
                  </excludes>
                </filter>
              </filters>
              <transformers>
                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                  <mainClass>at.seresunit.lecturemanager_connector.App</mainClass>
                </transformer>
                <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                  <resource>META-INF/spring.handlers</resource>
                </transformer>
                <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                  <resource>META-INF/spring.schemas</resource>
                </transformer>
              </transformers>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>

    <resources>
      <resource>
        <directory>src/main/resources/${env}</directory>
      </resource>
      <resource>
        <directory>src/main/resources/common</directory>
      </resource>
    </resources>
  </build>

  <dependencies>
    <dependency>
      <groupId>com.typesafe</groupId>
      <artifactId>config</artifactId>
      <version>1.3.1</version>
    </dependency>
    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-core</artifactId>
      <version>2.6.5</version>
    </dependency>
    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-databind</artifactId>
      <version>2.6.5</version>
    </dependency>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.hamcrest</groupId>
      <artifactId>hamcrest-all</artifactId>
      <version>1.3</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.scalacheck</groupId>
      <artifactId>scalacheck_${scala.binary.version}</artifactId>
      <version>1.11.4</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.scalatest</groupId>
      <artifactId>scalatest_${scala.binary.version}</artifactId>
      <version>2.2.0</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_${scala.binary.version}</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_${scala.binary.version}</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_${scala.binary.version}</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_${scala.binary.version}</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.calcite</groupId>
      <artifactId>calcite-avatica</artifactId>
      <version>1.6.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.calcite</groupId>
      <artifactId>calcite-core</artifactId>
      <version>1.12.0</version>
    </dependency>
    <dependency>
      <groupId>org.spark-project.hive</groupId>
      <artifactId>hive-exec</artifactId>
      <version>1.2.1.spark2</version>
    </dependency>
    <dependency>
      <groupId>org.spark-project.hive</groupId>
      <artifactId>hive-metastore</artifactId>
      <version>1.2.1.spark2</version>
    </dependency>
    <dependency>
      <groupId>org.codehaus.jackson</groupId>
      <artifactId>jackson-mapper-asl</artifactId>
      <version>1.9.13</version>
    </dependency>
    <dependency>
      <groupId>com.amazonaws</groupId>
      <artifactId>aws-java-sdk-core</artifactId>
      <version>${aws.version}</version>
    </dependency>
    <dependency>
      <groupId>com.amazonaws</groupId>
      <artifactId>aws-java-sdk-s3</artifactId>
      <version>${aws.version}</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
      <version>1.7.13</version>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>1.7.13</version>
    </dependency>
    <dependency>
      <groupId>com.google.code.gson</groupId>
      <artifactId>gson</artifactId>
      <version>2.2.4</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-common</artifactId>
      <version>2.1.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-jdbc</artifactId>
      <version>2.1.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-exec</artifactId>
      <version>2.1.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-service</artifactId>
      <version>2.1.1</version>
    </dependency>
  </dependencies>
</project>
--------------------------------------------------------------------------------