├── README.md
├── pom.xml
└── src
    └── com
        └── hackethon
            └── spark
                └── file
                    └── parser
                        ├── constants
                        │   ├── Constants.scala
                        │   ├── FlattenStrategy.scala
                        │   └── package-info.java
                        ├── core
                        │   ├── NestedFileParserFactory.scala
                        │   ├── NestedFileParserTrait.scala
                        │   ├── impl
                        │   │   ├── AVROFileParserImpl.scala
                        │   │   ├── JSONFileParserImpl.scala
                        │   │   ├── TextFileParserImpl.scala
                        │   │   ├── XMLFileParserImpl.scala
                        │   │   └── package-info.java
                        │   └── package-info.java
                        ├── driver
                        │   ├── NestedDataParserBatchDriver.scala
                        │   ├── NestedDataParserStreamDriver.scala
                        │   ├── NestedDataParserWinLocalDriver.scala
                        │   └── package-info.java
                        ├── session
                        │   ├── SparkSessionHandler.scala
                        │   └── package-info.java
                        └── util
                            ├── Utils.scala
                            └── package-info.java

/README.md:
--------------------------------------------------------------------------------
1 | # Spark-Nested-Data-Parser
2 | Nested data (JSON/AVRO/XML) parsing and flattening using Apache Spark.
3 | 
4 | Implementation steps:
5 | 1. Load the JSON/XML input into a Spark DataFrame.
6 | 2. Loop until the nested-element flag is set to false.
7 | 3. Iterate over the schema fields and set the flag to true whenever an ArrayType or StructType is found.
8 | 4. For ArrayType columns, explode them; for StructType columns, pull the inner fields out as separate top-level columns.
9 | 5. The loop exits once all levels are flattened.
10 | 
11 | Flatten Strategy:
12 | The schema fields can be traversed either 1. iteratively or 2. recursively (a worked example of the expected flattened output is sketched after the constants package below).
13 | 
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | 
2 | 4.0.0
3 | Spark-Nested-Data-Parser
4 | Spark-Nested-Data-Parser
5 | 0.0.1-SNAPSHOT
6 | A personal POC project for parsing and flattening nested JSON/AVRO/XML data using Spark DataFrames
7 | 
8 | 1.8
9 | 1.8
10 | UTF-8
11 | 2.4.5
12 | 2.11.8
13 | 2.11
14 | 
15 | 
16 | 
17 | 
18 | org.apache.spark
19 | spark-core_2.11
20 | 2.2.0
21 | 
22 | 
23 | org.apache.spark
24 | spark-sql_2.11
25 | 2.2.0
26 | 
27 | 
28 | 
29 | com.databricks
30 | spark-xml_2.11
31 | 0.9.0
32 | 
33 | 
34 | 
35 | 
36 | com.databricks
37 | spark-avro_2.11
38 | 3.2.0
39 | 
40 | 
41 | 
42 | 
54 | 
55 | com.jayway.jsonpath
56 | json-path
57 | 2.4.0
58 | 
59 | 
60 | 
61 | src
62 | 
63 | 
64 | maven-compiler-plugin
65 | 3.7.0
66 | 
67 | 1.8
68 | 1.8
69 | 
70 | 
71 | 
72 | 
73 | 
--------------------------------------------------------------------------------
/src/com/hackethon/spark/file/parser/constants/Constants.scala:
--------------------------------------------------------------------------------
1 | package com.hackethon.spark.file.parser.constants
2 | /**
3 |  * @author Sai Krishna P
4 |  */
5 | object Constants {
6 |   val XML = "xml"
7 |   val AVRO = "avro"
8 |   val JSON = "json"
9 | }
--------------------------------------------------------------------------------
/src/com/hackethon/spark/file/parser/constants/FlattenStrategy.scala:
--------------------------------------------------------------------------------
1 | package com.hackethon.spark.file.parser.constants
2 | /**
3 |  * @author Sai Krishna P
4 |  */
5 | object FlattenStrategy {
6 |   val SCHEMA_RECURSIVE = "schema_recursive"
7 |   val SCHEMA_ITERATIVE = "schema_iterative"
8 | }
--------------------------------------------------------------------------------
/src/com/hackethon/spark/file/parser/constants/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * 
3 |  */
4 | /**
5 |  * @author Sai Krishna P
6 |  * 
7 |  */
8 | package com.hackethon.spark.file.parser.constants;
--------------------------------------------------------------------------------
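Worked example (illustrative only, not part of the source tree): the snippet below shows what the flattening described in the README is expected to produce for a small hypothetical nested JSON record. The object name, sample data, and the local master are assumptions made for the demo.

// Hypothetical input (one JSON document):
//   {"id": 1, "name": {"first": "Ada", "last": "Lovelace"}, "phones": ["111", "222"]}
// Expected flattened output:
//   id | name_first | name_last | phones
//   1  | Ada        | Lovelace  | 111
//   1  | Ada        | Lovelace  | 222
import org.apache.spark.sql.SparkSession

object FlattenPreview extends App {
  val spark = SparkSession.builder().master("local[*]").appName("flatten-preview").getOrCreate()
  import spark.implicits._
  val df = spark.read.json(Seq(
    """{"id": 1, "name": {"first": "Ada", "last": "Lovelace"}, "phones": ["111", "222"]}"""
  ).toDS())
  // struct fields become parent_child columns, array elements become extra rows
  val flat = df.selectExpr("id", "name.first as name_first", "name.last as name_last", "explode_outer(phones) as phones")
  flat.show(false)
  spark.stop()
}
--------------------------------------------------------------------------------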
/src/com/hackethon/spark/file/parser/core/NestedFileParserFactory.scala:
--------------------------------------------------------------------------------
1 | package com.hackethon.spark.file.parser.core
2 | 
3 | import com.hackethon.spark.file.parser.constants.Constants
4 | import com.hackethon.spark.file.parser.core.impl.JSONFileParserImpl
5 | import com.hackethon.spark.file.parser.core.impl.TextFileParserImpl
6 | import com.hackethon.spark.file.parser.core.impl.AVROFileParserImpl
7 | import com.hackethon.spark.file.parser.core.impl.XMLFileParserImpl
8 | /**
9 |  * @author Sai Krishna P
10 |  */
11 | object NestedFileParserFactory {
12 |   def getParser(fileType:String):NestedFileParserTrait={
13 |     fileType match {
14 |       case Constants.JSON => return new JSONFileParserImpl
15 |       case Constants.AVRO => return new AVROFileParserImpl
16 |       case Constants.XML => return new XMLFileParserImpl
17 |       case _ => println("Parser not available for file type: "+fileType+". Falling back to the default parser: TextFileParser")
18 |                 return new TextFileParserImpl
19 |     }
20 |   }
21 | }
--------------------------------------------------------------------------------
/src/com/hackethon/spark/file/parser/core/NestedFileParserTrait.scala:
--------------------------------------------------------------------------------
1 | package com.hackethon.spark.file.parser.core
2 | 
3 | import org.apache.spark.sql.DataFrame
4 | import org.apache.spark.sql.types.{ArrayType,StructType,StructField}
5 | import org.apache.spark.sql.functions.{col,explode,explode_outer,to_json}
6 | import com.hackethon.spark.file.parser.constants.FlattenStrategy
7 | import org.apache.spark.sql.SaveMode
8 | import org.apache.spark.sql.SparkSession
9 | import org.apache.spark.sql.streaming.StreamingQuery
10 | 
11 | /**
12 |  * @author Sai Krishna P
13 |  */
14 | trait NestedFileParserTrait {
15 | 
16 |   def readFile(path:String,spark:SparkSession):DataFrame
17 | 
18 |   def readFileStream(path:String,spark:SparkSession,schema:StructType):DataFrame
19 | 
20 |   def writeFile(df:DataFrame,path:String){
21 |     df.write.mode(SaveMode.Overwrite).csv(path)
22 |   }
23 | 
24 |   def writeStream(df:DataFrame,path:String):StreamingQuery={
25 |     df.writeStream.start(path)
26 |   }
27 | 
28 |   /**
29 |    * @param df input DataFrame with (possibly) nested columns
30 |    * @param strategy one of the FlattenStrategy values
31 |    * @return the flattened DataFrame
32 |    */
33 |   def flatten(df:DataFrame, strategy:String): DataFrame = {
34 |     strategy match {
35 |       case FlattenStrategy.SCHEMA_ITERATIVE => return flattenIterativeV2(df)
36 |       case FlattenStrategy.SCHEMA_RECURSIVE => return flattenRecursive(df)
37 |       case _ => println("Undefined strategy, the default (iterative) will be applied")
38 |                 return flattenIterative(df)
39 |     }
40 |   }
41 | 
42 |   /**
43 |    * Iterative schema flattening
44 |    * @param dfGlobal input DataFrame
45 |    * @return the flattened DataFrame
46 |    */
47 |   protected def flattenIterative(dfGlobal: DataFrame): DataFrame = {
48 |     var df: DataFrame = dfGlobal
49 |     var schema: StructType = df.schema
50 |     var flag = true //allow first loop
51 |     while(flag){
52 |       flag = false //reset every loop
53 |       schema.fields.foreach {
54 |         elem =>
55 |           elem.dataType match {
56 |             case arrayType: ArrayType => //println("flatten array")
57 |               flag = true
58 |               df = df.withColumn(elem.name + "_temp", explode_outer(col(elem.name)))
59 |                 .drop(col(elem.name))
60 |                 .withColumnRenamed(elem.name + "_temp", elem.name)
61 |             case structType: StructType => //println("flatten struct")
62 |               flag = true
63 |               structType.fields.foreach {
64 |                 inElem =>
65 |                   df = df.withColumn(elem.name + "_" + inElem.name, col(elem.name + "." + inElem.name))
66 |               }
67 |               df = df.drop(col(elem.name))
68 |             case _ => //println("other type")
69 |           }
70 |       }
71 |       schema = df.schema
72 |     }
73 |     return df
74 |   }
75 | 
76 |   /**
77 |    * Iterative schema flattening - Version 2 (selectExpr based)
78 |    * @param dfGlobal input DataFrame
79 |    * @return the flattened DataFrame
80 |    */
81 |   protected def flattenIterativeV2(dfGlobal: DataFrame): DataFrame = {
82 |     var df: DataFrame = dfGlobal
83 |     var flag = true //allow first loop
84 |     while(flag){
85 |       flag = false //reset every loop
86 |       df.schema.fields.foreach {
87 |         elem =>
88 |           var fieldNames = df.schema.fields.map(x => x.name)
89 |           elem.dataType match {
90 |             case arrayType: ArrayType => //println("flatten array")
91 |               flag = true
92 |               fieldNames = fieldNames.filter(_!=elem.name) ++ Array("explode_outer(".concat(elem.name).concat(") as ").concat(elem.name))
93 |               df=df.selectExpr(fieldNames:_*)
94 |             case structType: StructType => //println("flatten struct")
95 |               flag = true
96 |               fieldNames = fieldNames.filter(_!=elem.name) ++
97 |                 structType.fieldNames.map(childname => elem.name.concat(".").concat(childname)
98 |                   .concat(" as ")
99 |                   .concat(elem.name).concat("_").concat(childname))
100 |               df=df.selectExpr(fieldNames:_*)
101 |             case _ => //println("other type")
102 |           }
103 | 
104 |       }
105 |     }
106 |     return df
107 |   }
108 | 
109 |   /**
110 |    * Recursive schema flattening
111 |    * @param df input DataFrame
112 |    * @return the flattened DataFrame
113 |    */
114 |   protected def flattenRecursive(df: DataFrame): DataFrame = {
115 | 
116 |     val fields = df.schema.fields
117 |     val fieldNames = fields.map(x => x.name)
118 |     val length = fields.length
119 | 
120 |     for(i <- 0 to fields.length-1){
121 |       val field = fields(i)
122 |       val fieldtype = field.dataType
123 |       val fieldName = field.name
124 |       fieldtype match {
125 |         case arrayType: ArrayType => //println("flatten array")
126 |           val newfieldNames = fieldNames.filter(_!=fieldName) ++ Array("explode_outer(".concat(fieldName).concat(") as ").concat(fieldName))
127 |           val explodedDf = df.selectExpr(newfieldNames:_*)
128 |           return flattenRecursive(explodedDf)
129 |         case structType: StructType => //println("flatten struct")
130 |           val newfieldNames = fieldNames.filter(_!= fieldName) ++
131 |             structType.fieldNames.map(childname => fieldName.concat(".").concat(childname)
132 |               .concat(" as ")
133 |               .concat(fieldName).concat("_").concat(childname))
134 |           val explodedf = df.selectExpr(newfieldNames:_*)
135 |           return flattenRecursive(explodedf)
136 |         case _ => //println("other type")
137 |       }
138 |     }
139 |     df
140 |   }
141 | 
142 | }
--------------------------------------------------------------------------------
/src/com/hackethon/spark/file/parser/core/impl/AVROFileParserImpl.scala:
--------------------------------------------------------------------------------
1 | package com.hackethon.spark.file.parser.core.impl
2 | 
3 | import org.apache.spark.sql.DataFrame
4 | import com.hackethon.spark.file.parser.core.NestedFileParserTrait
5 | import org.apache.spark.sql.SparkSession
6 | import org.apache.spark.sql.types.StructType
7 | /**
8 |  * @author Sai Krishna P
9 |  */
10 | class AVROFileParserImpl extends NestedFileParserTrait{
11 |   def readFile(path:String,spark:SparkSession):DataFrame={
12 |     return spark.read.format("com.databricks.spark.avro").load(path).repartition(4)
13 |   }
14 |   def readFileStream(path:String,spark:SparkSession,schema:StructType):DataFrame={
15 |     return spark.readStream.schema(schema).option("checkpointLocation", path+"/cp").format("com.databricks.spark.avro").load(path)
16 |   }
17 | }
--------------------------------------------------------------------------------
/src/com/hackethon/spark/file/parser/core/impl/JSONFileParserImpl.scala:
--------------------------------------------------------------------------------
1 | package com.hackethon.spark.file.parser.core.impl
2 | 
3 | import com.hackethon.spark.file.parser.core.NestedFileParserTrait
4 | import org.apache.spark.sql.DataFrame
5 | import org.apache.spark.sql.SparkSession
6 | import org.apache.spark.sql.types.StructType
7 | /**
8 |  * @author Sai Krishna P
9 |  */
10 | class JSONFileParserImpl extends NestedFileParserTrait{
11 |   def readFile(path:String,spark:SparkSession):DataFrame={
12 |     return spark.read.format("json").load(path).repartition(4)
13 |   }
14 |   def readFileStream(path:String,spark:SparkSession,schema:StructType):DataFrame={
15 |     return spark.readStream.schema(schema).option("checkpointLocation", path+"/cp").format("json").option("multiline", true).load(path)
16 |   }
17 | }
--------------------------------------------------------------------------------
/src/com/hackethon/spark/file/parser/core/impl/TextFileParserImpl.scala:
--------------------------------------------------------------------------------
1 | package com.hackethon.spark.file.parser.core.impl
2 | 
3 | import com.hackethon.spark.file.parser.core.NestedFileParserTrait
4 | import org.apache.spark.sql.DataFrame
5 | import org.apache.spark.sql.SparkSession
6 | import org.apache.spark.sql.types.StructType
7 | /**
8 |  * @author Sai Krishna P
9 |  */
10 | class TextFileParserImpl extends NestedFileParserTrait{
11 |   def readFile(path:String,spark:SparkSession):DataFrame={
12 |     return spark.read.text(path).repartition(4)
13 |   }
14 |   def readFileStream(path:String,spark:SparkSession,schema:StructType):DataFrame={
15 |     return spark.readStream.schema(schema).option("checkpointLocation", path+"/cp").text(path)
16 |   }
17 | }
--------------------------------------------------------------------------------
/src/com/hackethon/spark/file/parser/core/impl/XMLFileParserImpl.scala:
--------------------------------------------------------------------------------
1 | package com.hackethon.spark.file.parser.core.impl
2 | 
3 | import org.apache.spark.sql.DataFrame
4 | import com.hackethon.spark.file.parser.core.NestedFileParserTrait
5 | import org.apache.spark.sql.SparkSession
6 | import org.apache.spark.sql.types.StructType
7 | /**
8 |  * @author Sai Krishna P
9 |  */
10 | class XMLFileParserImpl extends NestedFileParserTrait{
11 |   def readFile(path:String,spark:SparkSession):DataFrame={
12 |     return spark.read.format("com.databricks.spark.xml").option("rowTag", "root").load(path).repartition(4)
13 |   }
14 |   def readFileStream(path:String,spark:SparkSession,schema:StructType):DataFrame={
15 |     return spark.readStream.schema(schema).option("checkpointLocation", path+"/cp").format("com.databricks.spark.xml").load(path)
16 |   }
17 | }
--------------------------------------------------------------------------------
/src/com/hackethon/spark/file/parser/core/impl/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * 
3 |  */
4 | /**
5 |  * @author Sai Krishna P
6 |  * 
7 |  */
8 | package com.hackethon.spark.file.parser.core.impl;
--------------------------------------------------------------------------------
/src/com/hackethon/spark/file/parser/core/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * 
3 |  */
4 | /**
5 |  * @author Sai Krishna P
6 |  * 
7 |  */
8 | package com.hackethon.spark.file.parser.core;
--------------------------------------------------------------------------------
/src/com/hackethon/spark/file/parser/driver/NestedDataParserBatchDriver.scala:
--------------------------------------------------------------------------------
1 | package com.hackethon.spark.file.parser.driver
2 | 
3 | import com.hackethon.spark.file.parser.core.NestedFileParserFactory
4 | import com.hackethon.spark.file.parser.constants.FlattenStrategy
5 | import com.hackethon.spark.file.parser.session.SparkSessionHandler
6 | /**
7 |  * @author Sai Krishna P
8 |  */
9 | object NestedDataParserBatchDriver extends App{
10 | 
11 |   if(args.length < 4){
12 |     println("Not enough arguments!")
13 |     System.exit(1)
14 |   }
15 | 
16 |   val fileType = args(0)
17 |   val filePath = args(1)
18 |   val outputPath = args(2)
19 |   val flattenType = args(3)
20 |   println("fileType :"+fileType)
21 |   println("filePath :"+filePath)
22 |   println("outputPath :"+outputPath)
23 |   val spark = SparkSessionHandler.getSparkSession()
24 |   try{
25 |     val parser = NestedFileParserFactory.getParser(fileType)
26 |     val df = parser.readFile(filePath,spark)
27 |     val dfParsed = if(flattenType.equals("1")){parser.flatten(df, FlattenStrategy.SCHEMA_ITERATIVE)}else if(flattenType.equals("2")){parser.flatten(df, FlattenStrategy.SCHEMA_RECURSIVE)}else{parser.flatten(df, FlattenStrategy.SCHEMA_ITERATIVE)}
28 | 
29 |     dfParsed.show()
30 |     println("Final DF record count:"+dfParsed.count())
31 |     parser.writeFile(dfParsed, outputPath)
32 |   }catch{
33 |     case e:Exception=> println("Exception message:"+e.getMessage)
34 |       e.printStackTrace()
35 |   }finally{
36 |     spark.stop()
37 |   }
38 | }
--------------------------------------------------------------------------------
/src/com/hackethon/spark/file/parser/driver/NestedDataParserStreamDriver.scala:
--------------------------------------------------------------------------------
1 | package com.hackethon.spark.file.parser.driver
2 | 
3 | import com.hackethon.spark.file.parser.core.NestedFileParserFactory
4 | import com.hackethon.spark.file.parser.constants.FlattenStrategy
5 | import com.hackethon.spark.file.parser.session.SparkSessionHandler
6 | /**
7 |  * @author Sai Krishna P
8 |  */
9 | object NestedDataParserStreamDriver extends App {
10 |   if(args.length < 5){
11 |     println("Not enough arguments!")
12 |     System.exit(1)
13 |   }
14 | 
15 |   val fileType = args(0)
16 |   val filePath = args(1)
17 |   val outputPath = args(2)
18 |   val flattenType = args(3)
19 |   val sampleData = args(4)
20 |   println("fileType :"+fileType)
21 |   println("filePath :"+filePath)
22 |   println("outputPath :"+outputPath)
23 |   val spark = SparkSessionHandler.getSparkStreamSession()
24 |   val schema = spark.read.json(sampleData).schema
25 |   try{
26 |     val parser = NestedFileParserFactory.getParser(fileType)
27 |     val df = parser.readFileStream(filePath,spark,schema)
28 |     val dfParsed = if(flattenType.equals("1")){parser.flatten(df, FlattenStrategy.SCHEMA_ITERATIVE)}else if(flattenType.equals("2")){parser.flatten(df, FlattenStrategy.SCHEMA_RECURSIVE)}else{parser.flatten(df, FlattenStrategy.SCHEMA_ITERATIVE)}
29 | 
30 |     val qry = parser.writeStream(dfParsed, outputPath)
31 |     qry.awaitTermination()
32 |   }catch{
33 |     case e:Exception=> println("Exception message:"+e.getMessage)
34 |       e.printStackTrace()
35 |   }finally{
36 |     spark.stop()
37 |   }
38 | }
--------------------------------------------------------------------------------
/src/com/hackethon/spark/file/parser/driver/NestedDataParserWinLocalDriver.scala:
--------------------------------------------------------------------------------
1 | package com.hackethon.spark.file.parser.driver
2 | 
3 | import com.hackethon.spark.file.parser.core.NestedFileParserFactory
4 | import com.hackethon.spark.file.parser.constants.FlattenStrategy
5 | import com.hackethon.spark.file.parser.session.SparkSessionHandler
6 | /**
7 |  * @author Sai Krishna P
8 |  */
9 | object NestedDataParserWinLocalDriver extends App {
10 | 
11 |   /*
12 |    * For testing on Windows OS - with Eclipse
13 |    * Steps::
14 |    *
15 |    * Create the following directory structure: "C:\hadoop_home\bin" (or replace "C:\hadoop_home" with whatever you like)
16 |    * Download the following file: http://public-repo-1.hortonworks.com/hdp-win-alpha/winutils.exe
17 |    * Put the file from step 2 into the "bin" directory from step 1.
18 |    * Set the "hadoop.home.dir" system property to "C:\hadoop_home" (or whatever directory you created in step 1, without the "\bin" at the end). Note: declare this property at the beginning of your Spark code.
19 |    */
20 |   sys.props.+=(("hadoop.home.dir", "C:\\hadoop_home"))
21 | 
22 |   if(args.length < 4){
23 |     println("Not enough arguments!")
24 |     System.exit(1)
25 |   }
26 | 
27 |   val fileType = args(0)
28 |   val filePath = args(1)
29 |   val outputPath = args(2)
30 |   val flattenType = args(3)
31 |   println("fileType :"+fileType)
32 |   println("filePath :"+filePath)
33 |   println("outputPath :"+outputPath)
34 |   val spark = SparkSessionHandler.getSparkSessionLocal()
35 |   try{
36 |     val parser = NestedFileParserFactory.getParser(fileType)
37 |     val df = parser.readFile(filePath,spark)
38 |     val dfParsed = if(flattenType.equals("1")){parser.flatten(df, FlattenStrategy.SCHEMA_ITERATIVE)}else if(flattenType.equals("2")){parser.flatten(df, FlattenStrategy.SCHEMA_RECURSIVE)}else{parser.flatten(df, FlattenStrategy.SCHEMA_ITERATIVE)}
39 | 
40 |     dfParsed.show()
41 |     println("Final DF record count:"+dfParsed.count())
42 |     parser.writeFile(dfParsed, outputPath)
43 |   }catch{
44 |     case e:Exception=> println("Exception message:"+e.getMessage)
45 |       e.printStackTrace()
46 |   }finally{
47 |     spark.stop()
48 |   }
49 | 
50 | }
--------------------------------------------------------------------------------
/src/com/hackethon/spark/file/parser/driver/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * 
3 |  */
4 | /**
5 |  * @author Sai Krishna P
6 |  * 
7 |  */
8 | package com.hackethon.spark.file.parser.driver;
--------------------------------------------------------------------------------
/src/com/hackethon/spark/file/parser/session/SparkSessionHandler.scala:
--------------------------------------------------------------------------------
1 | package com.hackethon.spark.file.parser.session
2 | 
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.sql.SparkSession
6 | /**
7 |  * @author Sai Krishna P
8 |  */
9 | object SparkSessionHandler {
10 |   def getSparkSession():SparkSession={
11 |     return SparkSession.builder().appName("SparkNestedDataParser").getOrCreate()
12 |   }
13 | 
14 |   def getSparkStreamSession():SparkSession={
15 |     return SparkSession.builder().appName("SparkNestedDataParser").getOrCreate()
16 |   }
17 | 
18 |   def getSparkSessionLocal():SparkSession={
19 |     return SparkSession.builder().master("local").appName("SparkNestedDataParser").getOrCreate()
20 |   }
21 | 
22 | }
--------------------------------------------------------------------------------
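Note: getSparkSession() and getSparkStreamSession() above are currently identical. If streaming runs ever need different defaults, a possible variant is sketched below; this is an assumption, not part of the project, though both config keys are standard Spark SQL settings and the checkpoint path is hypothetical.

import org.apache.spark.sql.SparkSession

  def getSparkStreamSession():SparkSession={
    SparkSession.builder()
      .appName("SparkNestedDataParser")
      // smaller shuffle width for micro-batches; tune to the workload
      .config("spark.sql.shuffle.partitions", "8")
      // default checkpoint directory for streaming queries (hypothetical path)
      .config("spark.sql.streaming.checkpointLocation", "/tmp/nested-parser/checkpoints")
      .getOrCreate()
  }
--------------------------------------------------------------------------------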
/src/com/hackethon/spark/file/parser/session/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * 
3 |  */
4 | /**
5 |  * @author Sai Krishna P
6 |  * 
7 |  */
8 | package com.hackethon.spark.file.parser.session;
--------------------------------------------------------------------------------
/src/com/hackethon/spark/file/parser/util/Utils.scala:
--------------------------------------------------------------------------------
1 | package com.hackethon.spark.file.parser.util
2 | /**
3 |  * @author Sai Krishna P
4 |  */
5 | object Utils {
6 | 
7 | }
--------------------------------------------------------------------------------
/src/com/hackethon/spark/file/parser/util/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * 
3 |  */
4 | /**
5 |  * @author Sai Krishna P
6 |  * 
7 |  */
8 | package com.hackethon.spark.file.parser.util;
--------------------------------------------------------------------------------
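For reference, an illustrative end-to-end batch invocation of the library is sketched below. The object name and the input/output paths are hypothetical; the batch driver above does the same thing driven by command-line arguments.

import com.hackethon.spark.file.parser.constants.{Constants, FlattenStrategy}
import com.hackethon.spark.file.parser.core.NestedFileParserFactory
import com.hackethon.spark.file.parser.session.SparkSessionHandler

object FlattenJsonExample extends App {
  val spark = SparkSessionHandler.getSparkSessionLocal()
  try {
    val parser = NestedFileParserFactory.getParser(Constants.JSON)    // pick the JSON implementation
    val df     = parser.readFile("data/input/orders.json", spark)     // hypothetical input path
    val flat   = parser.flatten(df, FlattenStrategy.SCHEMA_RECURSIVE) // recursive flattening strategy
    flat.show(false)
    parser.writeFile(flat, "data/output/orders_flat")                 // written as CSV, overwrite mode
  } finally {
    spark.stop()
  }
}
--------------------------------------------------------------------------------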