├── .gitignore ├── .recommenders ├── caches │ ├── identified-project-coordinates.json │ └── manual-mappings.json └── index │ └── http___download_eclipse_org_recommenders_models_photon_ │ ├── _1.fdt │ ├── _1.fdx │ ├── _1.fnm │ ├── _1.frq │ ├── _1.nrm │ ├── _1.prx │ ├── _1.tii │ ├── _1.tis │ ├── segments.gen │ ├── segments_2 │ └── write.lock ├── README.md ├── pom.xml ├── project1 ├── .settings │ ├── org.eclipse.jdt.core.prefs │ └── org.eclipse.m2e.core.prefs ├── pom.xml ├── src │ └── main │ │ ├── java │ │ └── com │ │ │ └── jobreadyprogrammer │ │ │ └── spark │ │ │ └── Application.java │ │ └── resources │ │ └── name_and_comments.txt └── target │ └── .gitignore ├── project2 ├── .classpath ├── .project ├── .settings │ ├── org.eclipse.jdt.core.prefs │ └── org.eclipse.m2e.core.prefs ├── pom.xml ├── src │ └── main │ │ ├── java │ │ └── com │ │ │ └── jobreadyprogrammer │ │ │ └── spark │ │ │ ├── Application.java │ │ │ ├── DefineCSVSchema.java │ │ │ ├── InferCSVSchema.java │ │ │ └── JSONLinesParser.java │ │ └── resources │ │ ├── amazonProducts.txt │ │ ├── multiline.json │ │ └── simple.json └── target │ ├── .gitignore │ └── classes │ ├── amazonProducts.txt │ └── com │ └── jobreadyprogrammer │ └── spark │ ├── Application.class │ ├── DefineCSVSchema.class │ ├── InferCSVSchema.class │ └── JSONLinesParser.class ├── project3 ├── .settings │ ├── org.eclipse.jdt.core.prefs │ └── org.eclipse.m2e.core.prefs ├── pom.xml ├── src │ └── main │ │ ├── java │ │ └── com │ │ │ └── jobreadyprogrammer │ │ │ └── spark │ │ │ ├── Application.java │ │ │ └── ApplicationTest.java │ │ └── resources │ │ ├── durham-parks.json │ │ ├── philadelphia_recreations.csv │ │ └── students.csv └── target │ └── .gitignore ├── project4 ├── .gitignore ├── .settings │ ├── org.eclipse.core.resources.prefs │ ├── org.eclipse.jdt.core.prefs │ └── org.eclipse.m2e.core.prefs ├── pom.xml ├── src │ └── main │ │ ├── java │ │ └── com │ │ │ └── jobreadyprogrammer │ │ │ ├── mappers │ │ │ ├── HouseMapper.java │ │ │ └── LineMapper.java 
│ │ │ ├── pojos │ │ │ └── House.java │ │ │ └── spark │ │ │ ├── Application.java │ │ │ ├── ArrayToDataset.java │ │ │ ├── CsvToDatasetHouseToDataframe.java │ │ │ └── WordCount.java │ │ └── resources │ │ ├── houses.csv │ │ └── shakespeare.txt └── target │ └── .gitignore ├── project5 ├── .gitignore ├── .settings │ ├── org.eclipse.core.resources.prefs │ ├── org.eclipse.jdt.core.prefs │ └── org.eclipse.m2e.core.prefs ├── pom.xml ├── spark-warehouse │ └── grades_view_perm │ │ └── _temporary │ │ └── 0 │ │ └── _temporary │ │ └── attempt_20180912224126_0002_m_000000_0 │ │ ├── .part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc │ │ └── part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet └── src │ └── main │ ├── java │ └── com │ │ └── jobreadyprogrammer │ │ └── spark │ │ ├── Application.java │ │ └── CustomersAndProducts.java │ └── resources │ ├── customers.csv │ ├── grade_chart.csv │ ├── products.csv │ ├── purchases.csv │ └── students.csv ├── project6 ├── .gitignore ├── .settings │ ├── org.eclipse.core.resources.prefs │ ├── org.eclipse.jdt.core.prefs │ └── org.eclipse.m2e.core.prefs ├── pom.xml ├── spark-warehouse │ └── grades_view_perm │ │ └── _temporary │ │ └── 0 │ │ └── _temporary │ │ └── attempt_20180912224126_0002_m_000000_0 │ │ ├── .part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc │ │ └── part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet └── src │ └── main │ └── java │ └── com │ └── jobreadyprogrammer │ └── spark │ ├── Application.java │ └── WordUtils.java ├── project7 ├── .gitignore ├── .settings │ ├── org.eclipse.core.resources.prefs │ ├── org.eclipse.jdt.core.prefs │ └── org.eclipse.m2e.core.prefs ├── pom.xml ├── spark-warehouse │ └── grades_view_perm │ │ └── _temporary │ │ └── 0 │ │ └── _temporary │ │ └── attempt_20180912224126_0002_m_000000_0 │ │ ├── .part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc │ │ └── 
part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet └── src │ └── main │ └── java │ └── com │ └── jobreadyprogrammer │ └── spark │ ├── StreamingFileDirectoryApplication.java │ ├── StreamingKafkaConsumer.java │ └── StreamingSocketApplication.java ├── project8 ├── .gitignore ├── .settings │ ├── org.eclipse.core.resources.prefs │ ├── org.eclipse.jdt.core.prefs │ └── org.eclipse.m2e.core.prefs ├── pom.xml ├── spark-warehouse │ └── grades_view_perm │ │ └── _temporary │ │ └── 0 │ │ └── _temporary │ │ └── attempt_20180912224126_0002_m_000000_0 │ │ ├── .part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc │ │ └── part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet └── src │ └── main │ └── java │ └── com │ └── jobreadyprogrammer │ └── spark │ ├── FlatMapAndFilterRddApp.java │ ├── JoinRddApp.java │ ├── MapAndReduceRddApp.java │ └── TupleAndPairRddApp.java ├── project9 ├── pom.xml ├── src │ └── main │ │ └── java │ │ └── com │ │ └── jobreadyprogrammer │ │ └── spark │ │ ├── KmeansClustering.java │ │ ├── LinearMarketingVsSales.java │ │ ├── LinearMpgRegression.java │ │ └── LogisticRegressionExample.java └── target │ └── classes │ └── com │ └── jobreadyprogrammer │ └── spark │ ├── KmeansClustering.class │ ├── LinearMarketingVsSales.class │ ├── LinearMpgRegression.class │ └── LogisticRegressionExample.class └── test-dev-env /.gitignore: -------------------------------------------------------------------------------- 1 | /.metadata/ 2 | .classpath 3 | .project 4 | -------------------------------------------------------------------------------- /.recommenders/caches/identified-project-coordinates.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /.recommenders/caches/manual-mappings.json: -------------------------------------------------------------------------------- 1 | {} 
-------------------------------------------------------------------------------- /.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.fdt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.fdt -------------------------------------------------------------------------------- /.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.fdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.fdx -------------------------------------------------------------------------------- /.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.fnm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.fnm -------------------------------------------------------------------------------- /.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.frq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.frq -------------------------------------------------------------------------------- /.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.nrm: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.nrm -------------------------------------------------------------------------------- /.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.tii: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.tii -------------------------------------------------------------------------------- /.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.tis: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.tis -------------------------------------------------------------------------------- /.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/segments.gen: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/segments.gen -------------------------------------------------------------------------------- /.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/segments_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/segments_2 
-------------------------------------------------------------------------------- /.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/write.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/write.lock -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is the source code for the course "The Ultimate Apache Spark with Java Course - Hands On" 2 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | com.jobreadyprogrammer 4 | learningspark 5 | 0.0.1-SNAPSHOT 6 | jar 7 | 8 | 9 | 10 | UTF-8 11 | 1.8 12 | 2.11 13 | 2.3.1 14 | 1.8 15 | 1.8 16 | 17 | 18 | 19 | 20 | 21 | org.apache.spark 22 | spark-core_${scala.version} 23 | ${spark.version} 24 | 25 | 26 | 27 | org.apache.spark 28 | spark-sql_${scala.version} 29 | ${spark.version} 30 | 31 | 32 | org.slf4j 33 | slf4j-simple 34 | 35 | 36 | 37 | 38 | 39 | org.apache.spark 40 | spark-mllib_${scala.version} 41 | ${spark.version} 42 | 43 | 44 | org.slf4j 45 | slf4j-log4j12 46 | 47 | 48 | org.slf4j 49 | slf4j-simple 50 | 51 | 52 | 53 | 54 | 55 | junit 56 | junit 57 | 4.11 58 | test 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | org.apache.maven.plugins 68 | maven-dependency-plugin 69 | 70 | 71 | copy-dependencies 72 | prepare-package 73 | 74 | copy-dependencies 75 | 76 | 77 | 78 | ${project.build.directory}/libs 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | org.springframework.boot 88 | spring-boot-maven-plugin 89 | 90 | 91 | 92 | repackage 93 | 94 | 95 | 96 | com.jobreadyprogrammer.spark.Application 97 | 98 | 99 | 100 | 101 | 102 | 
103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /project1/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate 4 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 5 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 6 | org.eclipse.jdt.core.compiler.compliance=1.8 7 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 8 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 9 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 10 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 11 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 12 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 13 | org.eclipse.jdt.core.compiler.release=disabled 14 | org.eclipse.jdt.core.compiler.source=1.8 15 | -------------------------------------------------------------------------------- /project1/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /project1/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | com.jobreadyprogrammer 4 | project1 5 | 0.0.1-SNAPSHOT 6 | jar 7 | 8 | 9 | 10 | 11 | UTF-8 12 | 1.8 13 | 14 | 2.11 15 | 2.3.1 16 | 42.1.4 17 | 18 | 1.8 19 | 1.8 20 | 21 | 22 | 23 | 24 | 25 | 26 | org.apache.spark 27 | spark-core_${scala.version} 28 | ${spark.version} 29 | 30 | 31 | 32 | org.apache.spark 33 | spark-sql_${scala.version} 34 | ${spark.version} 35 | 36 | 37 | 38 | 
org.apache.spark 39 | spark-mllib_${scala.version} 40 | ${spark.version} 41 | 42 | 43 | 44 | junit 45 | junit 46 | 4.11 47 | test 48 | 49 | 50 | 51 | org.postgresql 52 | postgresql 53 | ${postgresql.version} 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | org.apache.maven.plugins 64 | maven-dependency-plugin 65 | 66 | 67 | copy-dependencies 68 | prepare-package 69 | 70 | copy-dependencies 71 | 72 | 73 | 74 | ${project.build.directory}/libs 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | org.springframework.boot 84 | spring-boot-maven-plugin 85 | 86 | 87 | 88 | repackage 89 | 90 | 91 | 92 | com.jobreadyprogrammer.spark.Application 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /project1/src/main/java/com/jobreadyprogrammer/spark/Application.java: -------------------------------------------------------------------------------- 1 | package com.jobreadyprogrammer.spark; 2 | 3 | import static org.apache.spark.sql.functions.concat; 4 | import static org.apache.spark.sql.functions.lit; 5 | 6 | import java.util.Properties; 7 | 8 | import org.apache.spark.sql.Dataset; 9 | import org.apache.spark.sql.Row; 10 | import org.apache.spark.sql.SaveMode; 11 | import org.apache.spark.sql.SparkSession; 12 | 13 | public class Application { 14 | 15 | public static void main(String args[]) throws InterruptedException { 16 | 17 | // Create a session 18 | SparkSession spark = new SparkSession.Builder() 19 | .appName("CSV to DB") 20 | .master("local") 21 | .getOrCreate(); 22 | 23 | // get data 24 | Dataset df = spark.read().format("csv") 25 | .option("header", true) 26 | .load("src/main/resources/name_and_comments.txt"); 27 | 28 | // df.show(3); 29 | 30 | // Transformation 31 | df = df.withColumn("full_name", 32 | concat(df.col("last_name"), lit(", "), df.col("first_name"))) 33 | .filter(df.col("comment").rlike("\\d+")) 34 | .orderBy(df.col("last_name").asc()); 35 | 36 | // Write 
to destination 37 | String dbConnectionUrl = "jdbc:postgresql://localhost/course_data"; // <<- You need to create this database 38 | Properties prop = new Properties(); 39 | prop.setProperty("driver", "org.postgresql.Driver"); 40 | prop.setProperty("user", "postgres"); 41 | prop.setProperty("password", "password"); // <- The password you used while installing Postgres 42 | 43 | df.write() 44 | .mode(SaveMode.Overwrite) 45 | .jdbc(dbConnectionUrl, "project1", prop); 46 | } 47 | } -------------------------------------------------------------------------------- /project1/src/main/resources/name_and_comments.txt: -------------------------------------------------------------------------------- 1 | last_name,first_name,comment 2 | Lon,Jim,There are plenty of people in this world. 3 | Ingram,Milford,I've been using the internet for 10. 4 | Gideon,Elmer,Social media has taken over our lives for good. 5 | Dong,Fen,The body is 70% water so make sure to stay hydrated. -------------------------------------------------------------------------------- /project1/target/.gitignore: -------------------------------------------------------------------------------- 1 | /classes/ 2 | /test-classes/ 3 | -------------------------------------------------------------------------------- /project2/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /project2/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | project2 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.jdt.core.javanature 21 | org.eclipse.m2e.core.maven2Nature 22 | 23 
| 24 | -------------------------------------------------------------------------------- /project2/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate 4 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 5 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 6 | org.eclipse.jdt.core.compiler.compliance=1.8 7 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 8 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 9 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 10 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 11 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 12 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 13 | org.eclipse.jdt.core.compiler.release=disabled 14 | org.eclipse.jdt.core.compiler.source=1.8 15 | -------------------------------------------------------------------------------- /project2/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /project2/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | com.jobreadyprogrammer 4 | project2 5 | 0.0.1-SNAPSHOT 6 | jar 7 | 8 | 9 | 10 | UTF-8 11 | 1.8 12 | 2.11 13 | 2.3.1 14 | 1.8 15 | 1.8 16 | 17 | 18 | 19 | 20 | 21 | org.apache.spark 22 | spark-core_${scala.version} 23 | ${spark.version} 24 | 25 | 26 | 27 | org.apache.spark 28 | spark-sql_${scala.version} 29 | ${spark.version} 30 | 31 | 32 | org.slf4j 33 | slf4j-simple 34 | 35 | 36 | 37 | 38 | 39 | org.apache.spark 40 | 
spark-mllib_${scala.version} 41 | ${spark.version} 42 | 43 | 44 | org.slf4j 45 | slf4j-log4j12 46 | 47 | 48 | org.slf4j 49 | slf4j-simple 50 | 51 | 52 | 53 | 54 | 55 | junit 56 | junit 57 | 4.11 58 | test 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | org.apache.maven.plugins 68 | maven-dependency-plugin 69 | 70 | 71 | copy-dependencies 72 | prepare-package 73 | 74 | copy-dependencies 75 | 76 | 77 | 78 | ${project.build.directory}/libs 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | org.springframework.boot 88 | spring-boot-maven-plugin 89 | 90 | 91 | 92 | repackage 93 | 94 | 95 | 96 | com.jobreadyprogrammer.spark.Application 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /project2/src/main/java/com/jobreadyprogrammer/spark/Application.java: -------------------------------------------------------------------------------- 1 | package com.jobreadyprogrammer.spark; 2 | 3 | public class Application { 4 | 5 | public static void main(String[] args) { 6 | 7 | // InferCSVSchema parser = new InferCSVSchema(); 8 | // parser.printSchema(); 9 | 10 | // DefineCSVSchema parser2 = new DefineCSVSchema(); 11 | // parser2.printDefinedSchema(); 12 | // 13 | JSONLinesParser parser3 = new JSONLinesParser(); 14 | parser3.parseJsonLines(); 15 | 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /project2/src/main/java/com/jobreadyprogrammer/spark/DefineCSVSchema.java: -------------------------------------------------------------------------------- 1 | package com.jobreadyprogrammer.spark; 2 | 3 | import org.apache.spark.sql.Dataset; 4 | import org.apache.spark.sql.Row; 5 | import org.apache.spark.sql.SparkSession; 6 | import org.apache.spark.sql.types.DataTypes; 7 | import org.apache.spark.sql.types.StructField; 8 | import org.apache.spark.sql.types.StructType; 9 | 10 | public class DefineCSVSchema { 11 | 12 | public void 
printDefinedSchema() { 13 | 14 | SparkSession spark = SparkSession.builder() 15 | .appName("Complex CSV with a schema to Dataframe") 16 | .master("local") 17 | .getOrCreate(); 18 | 19 | StructType schema = DataTypes.createStructType(new StructField[] { // 20 | DataTypes.createStructField( 21 | "id", // 22 | DataTypes.IntegerType, // 23 | false), // 24 | DataTypes.createStructField( 25 | "product_id", 26 | DataTypes.IntegerType, 27 | true), 28 | DataTypes.createStructField( 29 | "item_name", 30 | DataTypes.StringType, 31 | false), 32 | DataTypes.createStructField( 33 | "published_on", 34 | DataTypes.DateType, 35 | true), 36 | DataTypes.createStructField( 37 | "url", 38 | DataTypes.StringType, 39 | false) }); 40 | 41 | Dataset df = spark.read().format("csv") 42 | .option("header", "true") 43 | .option("multiline", true) 44 | .option("sep", ";") 45 | .option("dateFormat", "M/d/y") 46 | .option("quote", "^") 47 | .schema(schema) // 48 | .load("src/main/resources/amazonProducts.txt"); 49 | 50 | df.show(5, 15); 51 | df.printSchema(); 52 | 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /project2/src/main/java/com/jobreadyprogrammer/spark/InferCSVSchema.java: -------------------------------------------------------------------------------- 1 | package com.jobreadyprogrammer.spark; 2 | 3 | import org.apache.spark.sql.Dataset; 4 | import org.apache.spark.sql.Row; 5 | import org.apache.spark.sql.SparkSession; 6 | 7 | public class InferCSVSchema { 8 | 9 | public void printSchema() { 10 | SparkSession spark = SparkSession.builder() 11 | .appName("Complex CSV to Dataframe") 12 | .master("local") 13 | .getOrCreate(); 14 | 15 | Dataset df = spark.read().format("csv") // 16 | .option("header", "true") // 17 | .option("multiline", true) // 18 | .option("sep", ";") // 19 | .option("quote", "^") // 20 | .option("dateFormat", "M/d/y") // 21 | .option("inferSchema", true) // 22 | .load("src/main/resources/amazonProducts.txt"); 23 
| 24 | System.out.println("Excerpt of the dataframe content:"); 25 | // df.show(7); 26 | df.show(7, 90); // truncate after 90 chars 27 | System.out.println("Dataframe's schema:"); 28 | df.printSchema(); 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /project2/src/main/java/com/jobreadyprogrammer/spark/JSONLinesParser.java: -------------------------------------------------------------------------------- 1 | package com.jobreadyprogrammer.spark; 2 | 3 | import org.apache.spark.sql.Dataset; 4 | import org.apache.spark.sql.Row; 5 | import org.apache.spark.sql.SparkSession; 6 | 7 | public class JSONLinesParser { 8 | 9 | 10 | 11 | public void parseJsonLines() { 12 | SparkSession spark = SparkSession.builder() 13 | .appName("JSON Lines to Dataframe") 14 | .master("local") 15 | .getOrCreate(); 16 | 17 | // Dataset df = spark.read().format("json") 18 | // .load("src/main/resources/simple.json"); 19 | 20 | Dataset df2 = spark.read().format("json") 21 | .option("multiline", true) 22 | .load("src/main/resources/multiline.json"); 23 | 24 | df2.show(5, 150); 25 | df2.printSchema(); 26 | } 27 | 28 | 29 | } 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /project2/src/main/resources/amazonProducts.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project2/src/main/resources/amazonProducts.txt -------------------------------------------------------------------------------- /project2/src/main/resources/multiline.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "id": "contract-11934", 3 | "buildingKey": "993839c8bh3fdgcc6734624ee8cc351050bn9shf93", 4 | "geo_location": { 5 | "type": "exact", 6 | "coordinates": [ 7 | -78.8922549, 8 | 36.0013755 9 | ] 10 | }, 11 | "properties": { 12 | 
"permit_no": "110138", 13 | "lat_and_lon": [ 14 | 36.0013755, 15 | -78.8922549 16 | ], 17 | "address": "877 W CANAL ST", 18 | "year": "2009" 19 | }, 20 | "timestamp": "2014-02-09T12:28:33-05:00" 21 | }, 22 | { 23 | "id": "contract-11984", 24 | "buildingKey": "8fdn8rh3fdgcc6734624ee89wn350bn9shf93", 25 | "geo_location": { 26 | "type": "exact", 27 | "coordinates": [ 28 | -87.9872323, 29 | 36.0013755 30 | ] 31 | }, 32 | "properties": { 33 | "permit_no": "110138", 34 | "lat_and_lon": [ 35 | 36.0013755, 36 | -78.8922549 37 | ], 38 | "address": "923 YETTI ST", 39 | "year": "2004" 40 | }, 41 | "timestamp": "2014-02-09T12:28:33-05:00" 42 | }] -------------------------------------------------------------------------------- /project2/src/main/resources/simple.json: -------------------------------------------------------------------------------- 1 | {"name": "Top", "owns": [["car", "honda"], ["laptop", "Dell"]]} 2 | {"name": "Frank", "owns": [["laptop", "Macbook"], ["shoes", "Nike"]]} 3 | {"name": "Peter", "owns": []} 4 | {"name": "Samantha", "owns": [["home", "34 Morris Ave."]]} -------------------------------------------------------------------------------- /project2/target/.gitignore: -------------------------------------------------------------------------------- 1 | /classes/ 2 | /test-classes/ 3 | -------------------------------------------------------------------------------- /project2/target/classes/amazonProducts.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project2/target/classes/amazonProducts.txt -------------------------------------------------------------------------------- /project2/target/classes/com/jobreadyprogrammer/spark/Application.class: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project2/target/classes/com/jobreadyprogrammer/spark/Application.class -------------------------------------------------------------------------------- /project2/target/classes/com/jobreadyprogrammer/spark/DefineCSVSchema.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project2/target/classes/com/jobreadyprogrammer/spark/DefineCSVSchema.class -------------------------------------------------------------------------------- /project2/target/classes/com/jobreadyprogrammer/spark/InferCSVSchema.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project2/target/classes/com/jobreadyprogrammer/spark/InferCSVSchema.class -------------------------------------------------------------------------------- /project2/target/classes/com/jobreadyprogrammer/spark/JSONLinesParser.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project2/target/classes/com/jobreadyprogrammer/spark/JSONLinesParser.class -------------------------------------------------------------------------------- /project3/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate 4 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 5 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 6 | org.eclipse.jdt.core.compiler.compliance=1.8 7 | 
org.eclipse.jdt.core.compiler.debug.lineNumber=generate 8 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 9 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 10 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 11 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 12 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 13 | org.eclipse.jdt.core.compiler.release=disabled 14 | org.eclipse.jdt.core.compiler.source=1.8 15 | -------------------------------------------------------------------------------- /project3/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /project3/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | com.jobreadyprogrammer 4 | project3 5 | 0.0.1-SNAPSHOT 6 | jar 7 | 8 | 9 | 10 | UTF-8 11 | 1.8 12 | 2.11 13 | 2.3.1 14 | 1.8 15 | 1.8 16 | 17 | 18 | 19 | 20 | 21 | org.apache.spark 22 | spark-core_${scala.version} 23 | ${spark.version} 24 | 25 | 26 | 27 | org.apache.spark 28 | spark-sql_${scala.version} 29 | ${spark.version} 30 | 31 | 32 | org.slf4j 33 | slf4j-simple 34 | 35 | 36 | 37 | 38 | 39 | org.apache.spark 40 | spark-mllib_${scala.version} 41 | ${spark.version} 42 | 43 | 44 | org.slf4j 45 | slf4j-log4j12 46 | 47 | 48 | org.slf4j 49 | slf4j-simple 50 | 51 | 52 | 53 | 54 | 55 | junit 56 | junit 57 | 4.11 58 | test 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | org.apache.maven.plugins 68 | maven-dependency-plugin 69 | 70 | 71 | copy-dependencies 72 | prepare-package 73 | 74 | copy-dependencies 75 | 76 | 77 | 78 | ${project.build.directory}/libs 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | org.springframework.boot 88 | spring-boot-maven-plugin 89 | 90 | 91 | 92 | repackage 93 | 94 | 95 
| 96 | com.jobreadyprogrammer.spark.Application 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /project3/src/main/java/com/jobreadyprogrammer/spark/Application.java: -------------------------------------------------------------------------------- 1 | package com.jobreadyprogrammer.spark; 2 | 3 | import static org.apache.spark.sql.functions.concat; 4 | import static org.apache.spark.sql.functions.lit; 5 | 6 | import org.apache.spark.Partition; 7 | import org.apache.spark.sql.Dataset; 8 | import org.apache.spark.sql.Row; 9 | import org.apache.spark.sql.SparkSession; 10 | 11 | public class Application { 12 | 13 | public static void main(String[] args) { 14 | 15 | SparkSession spark = SparkSession.builder() 16 | .appName("Combine 2 Datasets") 17 | .master("local") 18 | .getOrCreate(); 19 | 20 | Dataset durhamDf = buildDurhamParksDataFrame(spark); 21 | // durhamDf.printSchema(); 22 | // durhamDf.show(10); 23 | 24 | Dataset philDf = buildPhilParksDataFrame(spark); 25 | // philDf.printSchema(); 26 | // philDf.show(10); 27 | 28 | 29 | combineDataframes(durhamDf, philDf); 30 | 31 | } 32 | 33 | 34 | private static void combineDataframes(Dataset df1, Dataset df2) { 35 | // Match by column names using the unionByName() method. 36 | // if we use just the union() method, it matches the columns based on order. 
37 | Dataset df = df1.unionByName(df2); 38 | df.show(500); 39 | df.printSchema(); 40 | System.out.println("We have " + df.count() + " records."); 41 | 42 | df = df.repartition(5); 43 | 44 | Partition[] partitions = df.rdd().partitions(); 45 | System.out.println("Total number of Partitions: "+ partitions.length); 46 | 47 | } 48 | 49 | 50 | public static Dataset buildDurhamParksDataFrame(SparkSession spark){ 51 | Dataset df = spark.read().format("json").option("multiline", true) 52 | .load("src/main/resources/durham-parks.json"); 53 | 54 | df = df.withColumn("park_id", concat(df.col("datasetid"), lit("_"), 55 | df.col("fields.objectid"), lit("_Durham"))) 56 | .withColumn("park_name", df.col("fields.park_name")) 57 | .withColumn("city", lit("Durham")) 58 | .withColumn("address", df.col("fields.address")) 59 | .withColumn("has_playground", df.col("fields.playground")) 60 | .withColumn("zipcode", df.col("fields.zip")) 61 | .withColumn("land_in_acres", df.col("fields.acres")) 62 | .withColumn("geoX", df.col("geometry.coordinates").getItem(0)) 63 | .withColumn("geoY", df.col("geometry.coordinates").getItem(1)) 64 | .drop("fields").drop("geometry").drop("record_timestamp").drop("recordid") 65 | .drop("datasetid"); 66 | 67 | return df; 68 | } 69 | 70 | 71 | private static Dataset buildPhilParksDataFrame(SparkSession spark) { 72 | Dataset df = spark.read().format("csv").option("multiline", true) 73 | .option("header", true) 74 | .load("src/main/resources/philadelphia_recreations.csv"); 75 | 76 | // df = df.filter(lower(df.col("USE_")).like("%park%")); 77 | df = df.filter("lower(USE_) like '%park%' "); 78 | 79 | df = df.withColumn("park_id", concat(lit("phil_"), df.col("OBJECTID"))) 80 | .withColumnRenamed("ASSET_NAME", "park_name") 81 | .withColumn("city", lit("Philadelphia")) 82 | .withColumnRenamed("ADDRESS", "address") 83 | .withColumn("has_playground", lit("UNKNOWN")) 84 | .withColumnRenamed("ZIPCODE", "zipcode") 85 | .withColumnRenamed("ACREAGE", "land_in_acres") 86 | 
.withColumn("geoX", lit("UNKNONW")) 87 | .withColumn("geoY", lit("UNKNONW")) 88 | .drop("SITE_NAME") 89 | .drop("OBJECTID") 90 | .drop("CHILD_OF") 91 | .drop("TYPE") 92 | .drop("USE_") 93 | .drop("DESCRIPTION") 94 | .drop("SQ_FEET") 95 | .drop("ALLIAS") 96 | .drop("CHRONOLOGY") 97 | .drop("NOTES") 98 | .drop("DATE_EDITED") 99 | .drop("EDITED_BY") 100 | .drop("OCCUPANT") 101 | .drop("TENANT") 102 | .drop("LABEL"); 103 | 104 | 105 | 106 | return df; 107 | } 108 | 109 | 110 | 111 | } 112 | -------------------------------------------------------------------------------- /project3/src/main/java/com/jobreadyprogrammer/spark/ApplicationTest.java: -------------------------------------------------------------------------------- 1 | package com.jobreadyprogrammer.spark; 2 | 3 | import java.util.Arrays; 4 | import java.util.List; 5 | 6 | import org.apache.spark.sql.Dataset; 7 | import org.apache.spark.sql.Encoders; 8 | import org.apache.spark.sql.Row; 9 | import org.apache.spark.sql.SparkSession; 10 | 11 | public class ApplicationTest { 12 | 13 | public static void main(String[] args) { 14 | 15 | 16 | SparkSession spark = SparkSession.builder() 17 | .appName("Learning Spark SQL Dataframe API") 18 | .master("local") 19 | .getOrCreate(); 20 | 21 | String [] stringList = new String[] {"Banana", "Car", "Glass", "Banana", "Banana", "Computer", "Car", "IS", "HE"}; 22 | 23 | List words = Arrays.asList(stringList); 24 | 25 | Dataset wordsDf = spark.createDataset(words, Encoders.STRING()).toDF(); 26 | 27 | String [] bordingWords = new String[] {"this", "is", "he"}; 28 | String filter = "( 'this', 'is', 'he')"; 29 | List bordingList = Arrays.asList(bordingWords); 30 | Dataset boringWordsDf = spark.createDataset(bordingList, Encoders.STRING()).toDF(); 31 | 32 | wordsDf = wordsDf.filter("value not in "+ filter); 33 | 34 | wordsDf.show(); 35 | 36 | 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- 
/project3/src/main/resources/students.csv: -------------------------------------------------------------------------------- 1 | student_id,student_name,State,GPA,favorite_book_title,working 2 | 1100,Royce Piche,NJ,1.5,To Kill a Mockingbird,TRUE 3 | 1120,Alexis Morriss,NJ,3.0,Pride and Prejudice,FALSE 4 | 1130,Len Tarbell,NJ,3.5,The Diary of Anne Frank,FALSE 5 | 1140,Alejandro Dory,NY,2.5,Harry Potter and the Sorcerer's Stone,FALSE 6 | 1150,Ricky Tremaine,NY,3.0,The Lord of the Rings,TRUE 7 | 1160,Monika Gift,NY,3.0,The Great Gatsby,TRUE 8 | 1170,Kristeen Line,CA,4.0,Animal Farm,FALSE 9 | 1180,Sonia Rickard,CA,4.0,Harry Potter and the Sorcerer's Stone,FALSE 10 | 1190,Dan Iacovelli,CA,3.5,The Hunger Games,FALSE 11 | 1200,Ned Alvin,CA,1.0,,TRUE 12 | 1210,Sidney Ducote,FL,1.5,The Secret Garden,FALSE 13 | 1220,Bobbie Shrader,FL,2.0,The Color Purple,FALSE 14 | ,,,,, -------------------------------------------------------------------------------- /project3/target/.gitignore: -------------------------------------------------------------------------------- 1 | /classes/ 2 | /test-classes/ 3 | -------------------------------------------------------------------------------- /project4/.gitignore: -------------------------------------------------------------------------------- 1 | /bin/ 2 | /target/ 3 | -------------------------------------------------------------------------------- /project4/.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//src/main/java=UTF-8 3 | encoding//src/main/resources=UTF-8 4 | encoding/=UTF-8 5 | -------------------------------------------------------------------------------- /project4/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | 
org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate 4 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 5 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 6 | org.eclipse.jdt.core.compiler.compliance=1.8 7 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 8 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 9 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 10 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 11 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 12 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 13 | org.eclipse.jdt.core.compiler.release=disabled 14 | org.eclipse.jdt.core.compiler.source=1.8 15 | -------------------------------------------------------------------------------- /project4/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /project4/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | com.jobreadyprogrammer 4 | project4 5 | 0.0.1-SNAPSHOT 6 | jar 7 | 8 | 9 | 10 | UTF-8 11 | 1.8 12 | 2.11 13 | 2.3.1 14 | 1.8 15 | 1.8 16 | 17 | 18 | 19 | 20 | 21 | org.apache.spark 22 | spark-core_${scala.version} 23 | ${spark.version} 24 | 25 | 26 | 27 | org.apache.spark 28 | spark-sql_${scala.version} 29 | ${spark.version} 30 | 31 | 32 | org.slf4j 33 | slf4j-simple 34 | 35 | 36 | 37 | 38 | 39 | org.apache.spark 40 | spark-mllib_${scala.version} 41 | ${spark.version} 42 | 43 | 44 | org.slf4j 45 | slf4j-log4j12 46 | 47 | 48 | org.slf4j 49 | slf4j-simple 50 | 51 | 52 | 53 | 54 | 55 | junit 56 | junit 57 | 4.11 58 | test 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | org.apache.maven.plugins 68 | maven-dependency-plugin 69 | 70 | 71 | 
package com.jobreadyprogrammer.mappers;

import java.text.SimpleDateFormat;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Row;

import com.jobreadyprogrammer.pojos.House;

/**
 * Maps a CSV-sourced {@link Row} (columns: id, address, sqft, price,
 * vacantBy) into a {@link House} bean.
 */
public class HouseMapper implements MapFunction<Row, House> {

    private static final long serialVersionUID = -2L;

    /**
     * Converts one row into a House.
     *
     * @param value row with the houses.csv schema
     * @return populated House; vacantBy stays null when the column is null
     * @throws Exception if the vacantBy value cannot be parsed as yyyy-MM-dd
     */
    @Override
    public House call(Row value) throws Exception {

        House h = new House();

        h.setId(value.getAs("id"));
        h.setAddress(value.getAs("address"));
        h.setSqft(value.getAs("sqft"));
        h.setPrice(value.getAs("price"));

        // Fetch the raw column value BEFORE calling toString(): the original
        // code dereferenced it first, so its null check could never fire and
        // a null column produced an NPE instead of being skipped.
        Object vacancyDate = value.getAs("vacantBy");

        if (vacancyDate != null) {
            // "MM" is month-of-year; the original pattern "yyyy-mm-dd" used
            // "mm" (minute-of-hour), silently parsing every date into January.
            SimpleDateFormat parser = new SimpleDateFormat("yyyy-MM-dd");
            h.setVacantBy(parser.parse(vacancyDate.toString()));
        }

        return h;
    }

}
package com.jobreadyprogrammer.pojos;

import java.io.Serializable;
import java.util.Date;

/**
 * Simple serializable bean describing a house listing: identifier, street
 * address, square footage, asking price, and the date it becomes vacant.
 */
public class House implements Serializable {

    private static final long serialVersionUID = 1L;

    private int id;        // listing identifier
    private String address; // full street address
    private int sqft;       // living area in square feet
    private double price;   // asking price in dollars
    private Date vacantBy;  // date the house becomes available; may be null

    /** @return the listing identifier */
    public int getId() {
        return id;
    }

    /** @param id the listing identifier */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the street address */
    public String getAddress() {
        return address;
    }

    /** @param address the street address */
    public void setAddress(String address) {
        this.address = address;
    }

    /** @return living area in square feet */
    public int getSqft() {
        return sqft;
    }

    /** @param sqft living area in square feet */
    public void setSqft(int sqft) {
        this.sqft = sqft;
    }

    /** @return asking price */
    public double getPrice() {
        return price;
    }

    /** @param price asking price */
    public void setPrice(double price) {
        this.price = price;
    }

    /** @return availability date, or null when unknown */
    public Date getVacantBy() {
        return vacantBy;
    }

    /** @param vacantBy availability date, or null when unknown */
    public void setVacantBy(Date vacantBy) {
        this.vacantBy = vacantBy;
    }

}
package com.jobreadyprogrammer.spark;

import java.io.Serializable;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

/**
 * Demo: builds a Dataset&lt;String&gt; from a Java array, prefixes each
 * element via map(), then reduces the whole Dataset to one concatenated
 * string. Generic type parameters restored (the originals were raw types).
 */
public class ArrayToDataset {

    public void start() {
        SparkSession spark = new SparkSession.Builder()
                .appName("Array To Dataset")
                .master("local")
                .getOrCreate();

        String[] stringList = new String[] {"Banana", "Car", "Glass", "Banana", "Computer", "Car"};

        List<String> data = Arrays.asList(stringList);

        Dataset<String> ds = spark.createDataset(data, Encoders.STRING());

        // The MapFunction cast disambiguates the lambda for Spark's Java API.
        ds = ds.map((MapFunction<String, String>) row -> "word: " + row, Encoders.STRING());
        ds.show(10);

        String stringValue = ds.reduce(new StringReducer());

        System.out.println(stringValue);
    }

    /** Associative reducer that concatenates two strings; used by reduce() above. */
    static class StringReducer implements ReduceFunction<String>, Serializable {

        private static final long serialVersionUID = 1L;

        @Override
        public String call(String v1, String v2) throws Exception {
            return v1 + v2;
        }

    }

}
/project4/src/main/java/com/jobreadyprogrammer/spark/CsvToDatasetHouseToDataframe.java: -------------------------------------------------------------------------------- 1 | package com.jobreadyprogrammer.spark; 2 | 3 | import org.apache.spark.sql.Dataset; 4 | import org.apache.spark.sql.Encoders; 5 | import org.apache.spark.sql.Row; 6 | import static org.apache.spark.sql.functions.*; 7 | import org.apache.spark.sql.SparkSession; 8 | 9 | import com.jobreadyprogrammer.mappers.HouseMapper; 10 | import com.jobreadyprogrammer.pojos.House; 11 | 12 | 13 | public class CsvToDatasetHouseToDataframe { 14 | 15 | public void start() { 16 | 17 | SparkSession spark = SparkSession.builder() 18 | .appName("CSV to dataframe to Dataset and back") 19 | .master("local") 20 | .getOrCreate(); 21 | 22 | 23 | String filename = "src/main/resources/houses.csv"; 24 | 25 | Dataset df = spark.read().format("csv") 26 | .option("inferSchema", "true") // Make sure to use string version of true 27 | .option("header", true) 28 | .option("sep", ";") 29 | .load(filename); 30 | 31 | System.out.println("House ingested in a dataframe: "); 32 | // df.show(5); 33 | // df.printSchema(); 34 | 35 | Dataset houseDS = df.map(new HouseMapper(), Encoders.bean(House.class)); 36 | 37 | System.out.println("*****House ingested in a dataset: *****"); 38 | 39 | houseDS.show(5); 40 | houseDS.printSchema(); 41 | 42 | Dataset df2 = houseDS.toDF(); 43 | df2 = df2.withColumn("formatedDate", concat(df2.col("vacantBy.date"), lit("_"), df2.col("vacantBy.year"))); 44 | df2.show(10); 45 | } 46 | 47 | 48 | 49 | } 50 | 51 | 52 | -------------------------------------------------------------------------------- /project4/src/main/java/com/jobreadyprogrammer/spark/WordCount.java: -------------------------------------------------------------------------------- 1 | package com.jobreadyprogrammer.spark; 2 | 3 | import org.apache.spark.sql.Dataset; 4 | import org.apache.spark.sql.Encoders; 5 | import org.apache.spark.sql.Row; 6 | import 
org.apache.spark.sql.SparkSession; 7 | 8 | import com.jobreadyprogrammer.mappers.LineMapper; 9 | 10 | public class WordCount { 11 | 12 | public void start() { 13 | 14 | String boringWords = " ('a', 'an', 'and', 'are', 'as', 'at', 'be', 'but', 'by',\r\n" + 15 | "'for', 'if', 'in', 'into', 'is', 'it',\r\n" + 16 | "'no', 'not', 'of', 'on', 'or', 'such',\r\n" + 17 | "'that', 'the', 'their', 'then', 'there', 'these',\r\n" + 18 | "'they', 'this', 'to', 'was', 'will', 'with', 'he', 'she'," + 19 | "'your', 'you', 'I', " 20 | + " 'i','[',']', '[]', 'his', 'him', 'our', 'we') "; 21 | 22 | SparkSession spark = SparkSession.builder() 23 | .appName("unstructured text to flatmap") 24 | .master("local") 25 | .getOrCreate(); 26 | 27 | String filename = "src/main/resources/shakespeare.txt"; 28 | 29 | Dataset df = spark.read().format("text") 30 | .load(filename); 31 | 32 | // df.printSchema(); 33 | // df.show(10); 34 | 35 | Dataset wordsDS = df.flatMap(new LineMapper(), Encoders.STRING()); 36 | 37 | Dataset df2 = wordsDS.toDF(); 38 | 39 | df2 = df2.groupBy("value").count(); 40 | df2 = df2.orderBy(df2.col("count").desc()); 41 | df2 = df2.filter("lower(value) NOT IN " + boringWords); 42 | 43 | df2.show(500); 44 | 45 | 46 | } 47 | 48 | 49 | } 50 | -------------------------------------------------------------------------------- /project4/src/main/resources/houses.csv: -------------------------------------------------------------------------------- 1 | id;address;sqft;price;vacantBy 1;609 Bayway Rd Virginia Beach, VA 23451;1531;300000.00;2018-10-31 2;3220 Kenmore Rd Richmond, VA 23225;2776;125000.00;2019-04-11 3;400 W 29th St Norfolk, VA 23508;2164;54900.00;2019-02-01 4;3223 Park Ave Richmond, VA 23221;1740;390000.00;2019-03-22 5;3645 Barn Swallow Cir Roanoke, VA 24018;1800;212950.00;2019-08-20 6;3020 Scarsborough Dr Richmond, VA 23235;2340;349000.00;2018-11-22 -------------------------------------------------------------------------------- /project4/target/.gitignore: 
-------------------------------------------------------------------------------- 1 | /classes/ 2 | /test-classes/ 3 | -------------------------------------------------------------------------------- /project5/.gitignore: -------------------------------------------------------------------------------- 1 | /bin/ 2 | /target/ 3 | -------------------------------------------------------------------------------- /project5/.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//src/main/java=UTF-8 3 | encoding//src/main/resources=UTF-8 4 | encoding/=UTF-8 5 | -------------------------------------------------------------------------------- /project5/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate 4 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 5 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 6 | org.eclipse.jdt.core.compiler.compliance=1.8 7 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 8 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 9 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 10 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 11 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 12 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 13 | org.eclipse.jdt.core.compiler.release=disabled 14 | org.eclipse.jdt.core.compiler.source=1.8 15 | -------------------------------------------------------------------------------- /project5/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | 
resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /project5/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | com.jobreadyprogrammer 4 | project5 5 | 0.0.1-SNAPSHOT 6 | jar 7 | 8 | 9 | 10 | UTF-8 11 | 1.8 12 | 2.11 13 | 2.3.1 14 | 1.8 15 | 1.8 16 | 17 | 18 | 19 | 20 | 21 | org.apache.spark 22 | spark-core_${scala.version} 23 | ${spark.version} 24 | 25 | 26 | 27 | org.apache.spark 28 | spark-sql_${scala.version} 29 | ${spark.version} 30 | 31 | 32 | org.slf4j 33 | slf4j-simple 34 | 35 | 36 | 37 | 38 | 39 | org.apache.spark 40 | spark-mllib_${scala.version} 41 | ${spark.version} 42 | 43 | 44 | org.slf4j 45 | slf4j-log4j12 46 | 47 | 48 | org.slf4j 49 | slf4j-simple 50 | 51 | 52 | 53 | 54 | 55 | junit 56 | junit 57 | 4.11 58 | test 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | org.apache.maven.plugins 68 | maven-dependency-plugin 69 | 70 | 71 | copy-dependencies 72 | prepare-package 73 | 74 | copy-dependencies 75 | 76 | 77 | 78 | ${project.build.directory}/libs 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | org.springframework.boot 88 | spring-boot-maven-plugin 89 | 90 | 91 | 92 | repackage 93 | 94 | 95 | 96 | com.jobreadyprogrammer.spark.Application 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /project5/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/.part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project5/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/.part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /project5/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project5/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet -------------------------------------------------------------------------------- /project5/src/main/java/com/jobreadyprogrammer/spark/Application.java: -------------------------------------------------------------------------------- 1 | package com.jobreadyprogrammer.spark; 2 | 3 | import org.apache.spark.sql.Dataset; 4 | import org.apache.spark.sql.Row; 5 | import org.apache.spark.sql.SparkSession; 6 | 7 | import static org.apache.spark.sql.functions.*; 8 | 9 | public class Application { 10 | 11 | public static void main(String[] args) { 12 | 13 | SparkSession spark = SparkSession.builder() 14 | .appName("Learning Spark SQL Dataframe API") 15 | .master("local") 16 | .getOrCreate(); 17 | 18 | 19 | String studentsFile = "src/main/resources/students.csv"; 20 | 21 | Dataset studentDf = spark.read().format("csv") 22 | .option("inferSchema", "true") // Make sure to use string version of true 23 | .option("header", true) 24 | .load(studentsFile); 25 | 26 | String gradeChartFile = "src/main/resources/grade_chart.csv"; 27 | 28 | Dataset gradesDf = spark.read().format("csv") 29 | 
.option("inferSchema", "true") // Make sure to use string version of true 30 | .option("header", true) 31 | .load(gradeChartFile); 32 | 33 | Dataset filteredDf = studentDf.join(gradesDf, studentDf.col("GPA").equalTo(gradesDf.col("GPA"))) 34 | .filter(gradesDf.col("gpa").gt(3.0).and(gradesDf.col("gpa").lt(4.5)) 35 | .or(gradesDf.col("gpa").equalTo(1.0))) 36 | .select("student_name", 37 | "favorite_book_title", 38 | "letter_grade"); 39 | 40 | 41 | } 42 | 43 | 44 | 45 | } 46 | -------------------------------------------------------------------------------- /project5/src/main/java/com/jobreadyprogrammer/spark/CustomersAndProducts.java: -------------------------------------------------------------------------------- 1 | package com.jobreadyprogrammer.spark; 2 | 3 | import org.apache.spark.sql.Dataset; 4 | import org.apache.spark.sql.Row; 5 | import org.apache.spark.sql.SparkSession; 6 | 7 | import static org.apache.spark.sql.functions.*; 8 | 9 | public class CustomersAndProducts { 10 | 11 | public static void main(String[] args) { 12 | 13 | 14 | SparkSession spark = SparkSession.builder() 15 | .appName("Learning Spark SQL Dataframe API") 16 | .master("local") 17 | .getOrCreate(); 18 | 19 | String customers_file = "src/main/resources/customers.csv"; 20 | 21 | Dataset customersDf = spark.read().format("csv") 22 | .option("inferSchema", "true") // Make sure to use string version of true 23 | .option("header", true) 24 | .load(customers_file); 25 | 26 | String products_file = "src/main/resources/products.csv"; 27 | 28 | Dataset productsDf = spark.read().format("csv") 29 | .option("inferSchema", "true") // Make sure to use string version of true 30 | .option("header", true) 31 | .load(products_file); 32 | 33 | String purchases_file = "src/main/resources/purchases.csv"; 34 | 35 | 36 | Dataset purchasesDf = spark.read().format("csv") 37 | .option("inferSchema", "true") // Make sure to use string version of true 38 | .option("header", true) 39 | .load(purchases_file); 40 | 41 | 
System.out.println(" Loaded all files into Dataframes "); 42 | System.out.println("----------------------------------"); 43 | 44 | 45 | Dataset joinedData = customersDf.join(purchasesDf, 46 | customersDf.col("customer_id").equalTo(purchasesDf.col("customer_id"))) 47 | .join(productsDf, purchasesDf.col("product_id").equalTo(productsDf.col("product_id"))) 48 | .drop("favorite_website").drop(purchasesDf.col("customer_id")) 49 | .drop(purchasesDf.col("product_id")).drop("product_id"); 50 | 51 | Dataset aggDf = joinedData.groupBy("first_name", "product_name").agg( 52 | count("product_name").as("number_of_purchases"), 53 | max("product_price").as("most_exp_purchase"), 54 | sum("product_price").as("total_spent") 55 | ); 56 | 57 | aggDf = aggDf.drop("number_of_purchases").drop("most_exp_purchase"); 58 | 59 | Dataset initialDf = aggDf; 60 | 61 | for(int i = 0; i < 500; i++ ) { 62 | aggDf = aggDf.union(initialDf); 63 | } 64 | 65 | joinedData.collectAsList(); 66 | 67 | 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /project5/src/main/resources/customers.csv: -------------------------------------------------------------------------------- 1 | customer_id,last_name,first_name,favorite_website 2 | 4000,Jackson,Joe,techonthenet.com 3 | 5000,Smith,Jane,digminecraft.com 4 | 6000,Ferguson,Samantha,bigactivities.com 5 | 7000,Reynolds,Allen,checkyourmath.com 6 | 8000,Anderson,Paige, 7 | 9000,Johnson,Derek,techonthenet.com -------------------------------------------------------------------------------- /project5/src/main/resources/grade_chart.csv: -------------------------------------------------------------------------------- 1 | gpa,letter_grade 2 | 1.0,F 3 | 1.5,D 4 | 2.0,C 5 | 2.5,C+ 6 | 3.0,B 7 | 3.5,B+ 8 | 4.0,A -------------------------------------------------------------------------------- /project5/src/main/resources/products.csv: -------------------------------------------------------------------------------- 1 | 
product_id,product_name,product_price 2 | 1,Pear,0.95 3 | 2,Banana,0.75 4 | 3,Orange,0.75 5 | 4,Apple,0.85 6 | 5,Bread,2.50 7 | 6,Sliced Ham,3.00 8 | 7,Kleenex,4.00 -------------------------------------------------------------------------------- /project5/src/main/resources/purchases.csv: -------------------------------------------------------------------------------- 1 | customer_id,product_id 2 | 7000,1 3 | 7000,1 4 | 7000,2 5 | 5000,6 6 | 5000,7 7 | 5000,7 8 | 5000,6 9 | 8000,2 10 | 8000,3 11 | 8000,3 12 | 4000,3 13 | 6000,1 14 | 6000,1 15 | 6000,3 16 | 6000,6 -------------------------------------------------------------------------------- /project5/src/main/resources/students.csv: -------------------------------------------------------------------------------- 1 | student_id,student_name,State,GPA,favorite_book_title,working 2 | 1100,Royce Piche,NJ,1.5,To Kill a Mockingbird,TRUE 3 | 1120,Alexis Morriss,NJ,3.0,Pride and Prejudice,FALSE 4 | 1130,Len Tarbell,NJ,3.5,The Diary of Anne Frank,FALSE 5 | 1140,Alejandro Dory,NY,2.5,Harry Potter and the Sorcerer's Stone,FALSE 6 | 1150,Ricky Tremaine,NY,3.0,The Lord of the Rings,TRUE 7 | 1160,Monika Gift,NY,3.0,The Great Gatsby,TRUE 8 | 1170,Kristeen Line,CA,4.0,Animal Farm,FALSE 9 | 1180,Sonia Rickard,CA,4.0,Harry Potter and the Sorcerer's Stone,FALSE 10 | 1190,Dan Iacovelli,CA,3.5,The Hunger Games,FALSE 11 | 1200,Ned Alvin,CA,1.0,,TRUE 12 | 1210,Sidney Ducote,FL,1.5,The Secret Garden,FALSE 13 | 1220,Bobbie Shrader,FL,2.0,The Color Purple,FALSE -------------------------------------------------------------------------------- /project6/.gitignore: -------------------------------------------------------------------------------- 1 | /bin/ 2 | /target/ 3 | -------------------------------------------------------------------------------- /project6/.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | 
encoding//src/main/java=UTF-8 3 | encoding//src/main/resources=UTF-8 4 | encoding/=UTF-8 5 | -------------------------------------------------------------------------------- /project6/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate 4 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 5 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 6 | org.eclipse.jdt.core.compiler.compliance=1.8 7 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 8 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 9 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 10 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 11 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 12 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 13 | org.eclipse.jdt.core.compiler.release=disabled 14 | org.eclipse.jdt.core.compiler.source=1.8 15 | -------------------------------------------------------------------------------- /project6/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /project6/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | com.jobreadyprogrammer 4 | project6 5 | 0.0.1-SNAPSHOT 6 | jar 7 | 8 | 9 | 10 | UTF-8 11 | UTF-8 12 | 1.8 13 | 2.11 14 | 2.3.1 15 | 42.1.4 16 | 17 | 18 | 19 | 20 | 21 | 22 | org.apache.spark 23 | spark-core_${scala.version} 24 | ${spark.version} 25 | 26 | 27 | 28 | org.apache.spark 29 | spark-sql_${scala.version} 30 | ${spark.version} 31 | 32 
| 33 | 34 | org.apache.hadoop 35 | hadoop-hdfs 36 | 2.2.0 37 | 38 | 39 | 40 | org.apache.spark 41 | spark-mllib_${scala.version} 42 | ${spark.version} 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | org.apache.maven.plugins 51 | maven-compiler-plugin 52 | 3.5.1 53 | 54 | 1.8 55 | 1.8 56 | 57 | 58 | 59 | 77 | 78 | 79 | 80 | maven-jar-plugin 81 | 3.0.2 82 | 83 | 1.8 84 | 1.8 85 | 86 | 87 | com.jobreadyprogrammer.spark.Application 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /project6/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/.part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project6/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/.part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /project6/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project6/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet -------------------------------------------------------------------------------- /project6/src/main/java/com/jobreadyprogrammer/spark/Application.java: -------------------------------------------------------------------------------- 1 | package com.jobreadyprogrammer.spark; 2 | 3 | import 
java.beans.Encoder; 4 | import java.util.Arrays; 5 | 6 | import org.apache.spark.api.java.function.FlatMapFunction; 7 | import org.apache.spark.ml.feature.StopWordsRemover; 8 | import org.apache.spark.sql.Dataset; 9 | import org.apache.spark.sql.Encoders; 10 | import org.apache.spark.sql.Row; 11 | import org.apache.spark.sql.SparkSession; 12 | import static org.apache.spark.sql.functions.*; 13 | 14 | 15 | public class Application { 16 | 17 | public static void main(String[] args) { 18 | 19 | 20 | SparkSession spark = SparkSession.builder() 21 | .appName("Learning Spark SQL Dataframe API") 22 | .master("local") // <--- need to remove this line to run on a live cluster 23 | .getOrCreate(); 24 | 25 | 26 | // String redditFile = "s3n://your-bucket-name/Reddit_2011-large"; 27 | 28 | String redditFile = "/file/on/your/computer/Reddit_2007-small"; // <- change your file location 29 | 30 | Dataset redditDf = spark.read().format("json") 31 | .option("inferSchema", "true") // Make sure to use string version of true 32 | .option("header", true) 33 | .load(redditFile); 34 | 35 | redditDf = redditDf.select("body"); 36 | Dataset wordsDs = redditDf.flatMap((FlatMapFunction) 37 | r -> Arrays.asList(r.toString().replace("\n", "").replace("\r", "").trim().toLowerCase() 38 | .split(" ")).iterator(), 39 | Encoders.STRING()); 40 | 41 | Dataset wordsDf = wordsDs.toDF(); 42 | 43 | Dataset boringWordsDf = spark.createDataset(Arrays.asList(WordUtils.stopWords), Encoders.STRING()).toDF(); 44 | 45 | // wordsDf = wordsDf.except(boringWordsDf); // <-- This won't work because it removes duplicate words!! 
46 | 47 | wordsDf = wordsDf.join(boringWordsDf, wordsDf.col("value").equalTo(boringWordsDf.col("value")), "leftanti"); 48 | 49 | wordsDf = wordsDf.groupBy("value").count(); 50 | wordsDf.orderBy(desc("count")).show(); 51 | 52 | } 53 | 54 | 55 | 56 | } 57 | -------------------------------------------------------------------------------- /project6/src/main/java/com/jobreadyprogrammer/spark/WordUtils.java: -------------------------------------------------------------------------------- 1 | package com.jobreadyprogrammer.spark; 2 | 3 | public class WordUtils { 4 | 5 | public static String [] stopWords = {"a", 6 | "ability", 7 | "able", 8 | "about", 9 | "above", 10 | "accept", 11 | "according", 12 | "account", 13 | "across", 14 | "act", 15 | "action", 16 | "activity", 17 | "actually", 18 | "add", 19 | "address", 20 | "administration", 21 | "admit", 22 | "adult", 23 | "affect", 24 | "after", 25 | "again", 26 | "against", 27 | "age", 28 | "agency", 29 | "agent", 30 | "ago", 31 | "agree", 32 | "agreement", 33 | "ahead", 34 | "air", 35 | "all", 36 | "allow", 37 | "almost", 38 | "alone", 39 | "along", 40 | "already", 41 | "also", 42 | "although", 43 | "always", 44 | "among", 45 | "amount", 46 | "analysis", 47 | "and", 48 | "animal", 49 | "another", 50 | "answer", 51 | "any", 52 | "anyone", 53 | "anything", 54 | "appear", 55 | "apply", 56 | "approach", 57 | "area", 58 | "argue", 59 | "arm", 60 | "around", 61 | "arrive", 62 | "art", 63 | "article", 64 | "artist", 65 | "as", 66 | "ask", 67 | "assume", 68 | "at", 69 | "attack", 70 | "attention", 71 | "attorney", 72 | "audience", 73 | "author", 74 | "authority", 75 | "available", 76 | "avoid", 77 | "away", 78 | "baby", 79 | "back", 80 | "bad", 81 | "bag", 82 | "ball", 83 | "bank", 84 | "bar", 85 | "base", 86 | "be", 87 | "beat", 88 | "beautiful", 89 | "because", 90 | "become", 91 | "bed", 92 | "before", 93 | "begin", 94 | "behavior", 95 | "behind", 96 | "believe", 97 | "benefit", 98 | "best", 99 | "better", 100 | "between", 101 | 
"beyond", 102 | "big", 103 | "bill", 104 | "billion", 105 | "bit", 106 | "black", 107 | "blood", 108 | "blue", 109 | "board", 110 | "body", 111 | "book", 112 | "born", 113 | "both", 114 | "box", 115 | "boy", 116 | "break", 117 | "bring", 118 | "brother", 119 | "budget", 120 | "build", 121 | "building", 122 | "business", 123 | "but", 124 | "buy", 125 | "by", 126 | "call", 127 | "camera", 128 | "can", 129 | "cancer", 130 | "capital", 131 | "car", 132 | "card", 133 | "care", 134 | "career", 135 | "carry", 136 | "case", 137 | "catch", 138 | "cause", 139 | "cell", 140 | "center", 141 | "central", 142 | "century", 143 | "certain", 144 | "certainly", 145 | "chair", 146 | "challenge", 147 | "chance", 148 | "change", 149 | "character", 150 | "charge", 151 | "check", 152 | "child", 153 | "choice", 154 | "choose", 155 | "church", 156 | "citizen", 157 | "city", 158 | "civil", 159 | "claim", 160 | "class", 161 | "clear", 162 | "clearly", 163 | "close", 164 | "coach", 165 | "cold", 166 | "collection", 167 | "college", 168 | "color", 169 | "come", 170 | "commercial", 171 | "common", 172 | "community", 173 | "company", 174 | "compare", 175 | "computer", 176 | "concern", 177 | "condition", 178 | "conference", 179 | "Congress", 180 | "consider", 181 | "consumer", 182 | "contain", 183 | "continue", 184 | "control", 185 | "cost", 186 | "could", 187 | "country", 188 | "couple", 189 | "course", 190 | "court", 191 | "cover", 192 | "create", 193 | "cultural", 194 | "culture", 195 | "cup", 196 | "current", 197 | "customer", 198 | "cut", 199 | "dark", 200 | "data", 201 | "daughter", 202 | "day", 203 | "dead", 204 | "deal", 205 | "death", 206 | "debate", 207 | "decade", 208 | "decide", 209 | "decision", 210 | "deep", 211 | "defense", 212 | "degree", 213 | "describe", 214 | "design", 215 | "despite", 216 | "detail", 217 | "determine", 218 | "develop", 219 | "development", 220 | "die", 221 | "difference", 222 | "different", 223 | "difficult", 224 | "dinner", 225 | "direction", 226 | 
"director", 227 | "discover", 228 | "discuss", 229 | "discussion", 230 | "disease", 231 | "do", 232 | "doctor", 233 | "dog", 234 | "door", 235 | "down", 236 | "draw", 237 | "dream", 238 | "drive", 239 | "drop", 240 | "drug", 241 | "during", 242 | "each", 243 | "early", 244 | "east", 245 | "easy", 246 | "eat", 247 | "edge", 248 | "education", 249 | "effect", 250 | "effort", 251 | "eight", 252 | "either", 253 | "election", 254 | "else", 255 | "employee", 256 | "end", 257 | "energy", 258 | "enjoy", 259 | "enough", 260 | "enter", 261 | "entire", 262 | "environment", 263 | "environmental", 264 | "especially", 265 | "establish", 266 | "even", 267 | "evening", 268 | "event", 269 | "ever", 270 | "every", 271 | "everybody", 272 | "everyone", 273 | "everything", 274 | "evidence", 275 | "exactly", 276 | "example", 277 | "executive", 278 | "exist", 279 | "expect", 280 | "experience", 281 | "expert", 282 | "explain", 283 | "eye", 284 | "face", 285 | "fact", 286 | "factor", 287 | "fail", 288 | "fall", 289 | "family", 290 | "far", 291 | "fast", 292 | "father", 293 | "fear", 294 | "federal", 295 | "feel", 296 | "feeling", 297 | "few", 298 | "field", 299 | "fight", 300 | "figure", 301 | "fill", 302 | "film", 303 | "final", 304 | "finally", 305 | "financial", 306 | "find", 307 | "fine", 308 | "finger", 309 | "finish", 310 | "fire", 311 | "firm", 312 | "first", 313 | "fish", 314 | "five", 315 | "floor", 316 | "fly", 317 | "focus", 318 | "follow", 319 | "food", 320 | "foot", 321 | "for", 322 | "force", 323 | "foreign", 324 | "forget", 325 | "form", 326 | "former", 327 | "forward", 328 | "four", 329 | "free", 330 | "friend", 331 | "from", 332 | "front", 333 | "full", 334 | "fund", 335 | "future", 336 | "game", 337 | "garden", 338 | "gas", 339 | "general", 340 | "generation", 341 | "get", 342 | "girl", 343 | "give", 344 | "glass", 345 | "go", 346 | "goal", 347 | "good", 348 | "great", 349 | "green", 350 | "ground", 351 | "group", 352 | "grow", 353 | "growth", 354 | "guess", 355 | "gun", 
356 | "guy", 357 | "hair", 358 | "half", 359 | "hand", 360 | "hang", 361 | "happen", 362 | "happy", 363 | "hard", 364 | "have", 365 | "he", 366 | "head", 367 | "health", 368 | "hear", 369 | "heart", 370 | "heat", 371 | "heavy", 372 | "help", 373 | "her", 374 | "here", 375 | "herself", 376 | "high", 377 | "him", 378 | "himself", 379 | "his", 380 | "history", 381 | "hit", 382 | "hold", 383 | "home", 384 | "hope", 385 | "hospital", 386 | "hot", 387 | "hotel", 388 | "hour", 389 | "house", 390 | "how", 391 | "however", 392 | "huge", 393 | "human", 394 | "hundred", 395 | "husband", 396 | "I", 397 | "idea", 398 | "identify", 399 | "if", 400 | "image", 401 | "imagine", 402 | "impact", 403 | "important", 404 | "improve", 405 | "in", 406 | "include", 407 | "including", 408 | "increase", 409 | "indeed", 410 | "indicate", 411 | "individual", 412 | "industry", 413 | "information", 414 | "inside", 415 | "instead", 416 | "institution", 417 | "interest", 418 | "interesting", 419 | "international", 420 | "interview", 421 | "into", 422 | "investment", 423 | "involve", 424 | "issue", 425 | "it", 426 | "item", 427 | "its", 428 | "itself", 429 | "job", 430 | "join", 431 | "just", 432 | "keep", 433 | "key", 434 | "kid", 435 | "kill", 436 | "kind", 437 | "kitchen", 438 | "know", 439 | "knowledge", 440 | "land", 441 | "language", 442 | "large", 443 | "last", 444 | "late", 445 | "later", 446 | "laugh", 447 | "law", 448 | "lawyer", 449 | "lay", 450 | "lead", 451 | "leader", 452 | "learn", 453 | "least", 454 | "leave", 455 | "left", 456 | "leg", 457 | "legal", 458 | "less", 459 | "let", 460 | "letter", 461 | "level", 462 | "lie", 463 | "life", 464 | "light", 465 | "like", 466 | "likely", 467 | "line", 468 | "list", 469 | "listen", 470 | "little", 471 | "live", 472 | "local", 473 | "long", 474 | "look", 475 | "lose", 476 | "loss", 477 | "lot", 478 | "love", 479 | "low", 480 | "machine", 481 | "magazine", 482 | "main", 483 | "maintain", 484 | "major", 485 | "majority", 486 | "make", 487 | 
"man", 488 | "manage", 489 | "management", 490 | "manager", 491 | "many", 492 | "market", 493 | "marriage", 494 | "material", 495 | "matter", 496 | "may", 497 | "maybe", 498 | "me", 499 | "mean", 500 | "measure", 501 | "media", 502 | "medical", 503 | "meet", 504 | "meeting", 505 | "member", 506 | "memory", 507 | "mention", 508 | "message", 509 | "method", 510 | "middle", 511 | "might", 512 | "million", 513 | "mind", 514 | "minute", 515 | "miss", 516 | "mission", 517 | "model", 518 | "modern", 519 | "moment", 520 | "money", 521 | "month", 522 | "more", 523 | "morning", 524 | "most", 525 | "mother", 526 | "mouth", 527 | "move", 528 | "movement", 529 | "movie", 530 | "Mr", 531 | "Mrs", 532 | "much", 533 | "music", 534 | "must", 535 | "my", 536 | "myself", 537 | "name", 538 | "nation", 539 | "national", 540 | "natural", 541 | "nature", 542 | "near", 543 | "nearly", 544 | "necessary", 545 | "need", 546 | "network", 547 | "never", 548 | "new", 549 | "news", 550 | "newspaper", 551 | "next", 552 | "nice", 553 | "night", 554 | "no", 555 | "none", 556 | "nor", 557 | "north", 558 | "not", 559 | "note", 560 | "nothing", 561 | "notice", 562 | "now", 563 | "n't", 564 | "number", 565 | "occur", 566 | "of", 567 | "off", 568 | "offer", 569 | "office", 570 | "officer", 571 | "official", 572 | "often", 573 | "oh", 574 | "oil", 575 | "ok", 576 | "old", 577 | "on", 578 | "once", 579 | "one", 580 | "only", 581 | "onto", 582 | "open", 583 | "operation", 584 | "opportunity", 585 | "option", 586 | "or", 587 | "order", 588 | "organization", 589 | "other", 590 | "others", 591 | "our", 592 | "out", 593 | "outside", 594 | "over", 595 | "own", 596 | "owner", 597 | "page", 598 | "pain", 599 | "painting", 600 | "paper", 601 | "parent", 602 | "part", 603 | "participant", 604 | "particular", 605 | "particularly", 606 | "partner", 607 | "party", 608 | "pass", 609 | "past", 610 | "patient", 611 | "pattern", 612 | "pay", 613 | "peace", 614 | "people", 615 | "per", 616 | "perform", 617 | "performance", 
618 | "perhaps", 619 | "period", 620 | "person", 621 | "personal", 622 | "phone", 623 | "physical", 624 | "pick", 625 | "picture", 626 | "piece", 627 | "fucking", 628 | "place", 629 | "plan", 630 | "plant", 631 | "play", 632 | "player", 633 | "PM", 634 | "point", 635 | "police", 636 | "policy", 637 | "political", 638 | "politics", 639 | "poor", 640 | "popular", 641 | "population", 642 | "position", 643 | "positive", 644 | "possible", 645 | "power", 646 | "practice", 647 | "prepare", 648 | "present", 649 | "president", 650 | "pressure", 651 | "pretty", 652 | "prevent", 653 | "price", 654 | "private", 655 | "probably", 656 | "problem", 657 | "process", 658 | "shit", 659 | "produce", 660 | "product", 661 | "production", 662 | "professional", 663 | "professor", 664 | "program", 665 | "project", 666 | "property", 667 | "protect", 668 | "prove", 669 | "provide", 670 | "public", 671 | "pull", 672 | "fuck", 673 | "purpose", 674 | "push", 675 | "put", 676 | "quality", 677 | "question", 678 | "quickly", 679 | "quite", 680 | "race", 681 | "radio", 682 | "raise", 683 | "range", 684 | "rate", 685 | "rather", 686 | "reach", 687 | "read", 688 | "ready", 689 | "real", 690 | "reality", 691 | "realize", 692 | "really", 693 | "reason", 694 | "receive", 695 | "recent", 696 | "recently", 697 | "recognize", 698 | "record", 699 | "red", 700 | "reduce", 701 | "reflect", 702 | "region", 703 | "relate", 704 | "relationship", 705 | "religious", 706 | "remain", 707 | "remember", 708 | "remove", 709 | "report", 710 | "represent", 711 | "Republican", 712 | "require", 713 | "research", 714 | "resource", 715 | "respond", 716 | "response", 717 | "responsibility", 718 | "rest", 719 | "result", 720 | "return", 721 | "reveal", 722 | "rich", 723 | "right", 724 | "rise", 725 | "risk", 726 | "road", 727 | "rock", 728 | "role", 729 | "room", 730 | "rule", 731 | "run", 732 | "safe", 733 | "same", 734 | "save", 735 | "say", 736 | "scene", 737 | "school", 738 | "score", 739 | "sea", 740 | "season", 741 | 
"seat", 742 | "second", 743 | "section", 744 | "security", 745 | "see", 746 | "seek", 747 | "seem", 748 | "sell", 749 | "send", 750 | "senior", 751 | "sense", 752 | "series", 753 | "serious", 754 | "serve", 755 | "service", 756 | "set", 757 | "seven", 758 | "several", 759 | "sex", 760 | "sexual", 761 | "shake", 762 | "share", 763 | "she", 764 | "shoot", 765 | "short", 766 | "shot", 767 | "should", 768 | "shoulder", 769 | "show", 770 | "side", 771 | "sign", 772 | "significant", 773 | "similar", 774 | "simple", 775 | "simply", 776 | "since", 777 | "sing", 778 | "single", 779 | "sister", 780 | "sit", 781 | "site", 782 | "situation", 783 | "six", 784 | "size", 785 | "skill", 786 | "skin", 787 | "small", 788 | "smile", 789 | "so", 790 | "social", 791 | "society", 792 | "soldier", 793 | "some", 794 | "somebody", 795 | "someone", 796 | "something", 797 | "sometimes", 798 | "son", 799 | "song", 800 | "soon", 801 | "sort", 802 | "sound", 803 | "source", 804 | "south", 805 | "southern", 806 | "space", 807 | "speak", 808 | "special", 809 | "specific", 810 | "speech", 811 | "spend", 812 | "sport", 813 | "spring", 814 | "staff", 815 | "stage", 816 | "stand", 817 | "standard", 818 | "star", 819 | "start", 820 | "state", 821 | "statement", 822 | "station", 823 | "stay", 824 | "step", 825 | "still", 826 | "stock", 827 | "stop", 828 | "store", 829 | "story", 830 | "strategy", 831 | "street", 832 | "strong", 833 | "structure", 834 | "student", 835 | "study", 836 | "stuff", 837 | "style", 838 | "subject", 839 | "success", 840 | "successful", 841 | "such", 842 | "suddenly", 843 | "suffer", 844 | "suggest", 845 | "summer", 846 | "support", 847 | "sure", 848 | "surface", 849 | "system", 850 | "table", 851 | "take", 852 | "talk", 853 | "task", 854 | "tax", 855 | "teach", 856 | "teacher", 857 | "team", 858 | "technology", 859 | "television", 860 | "tell", 861 | "ten", 862 | "tend", 863 | "term", 864 | "test", 865 | "than", 866 | "thank", 867 | "that", 868 | "the", 869 | "their", 870 | 
"them", 871 | "themselves", 872 | "then", 873 | "theory", 874 | "there", 875 | "these", 876 | "they", 877 | "thing", 878 | "think", 879 | "third", 880 | "this", 881 | "those", 882 | "though", 883 | "thought", 884 | "thousand", 885 | "threat", 886 | "three", 887 | "through", 888 | "throughout", 889 | "throw", 890 | "thus", 891 | "time", 892 | "to", 893 | "today", 894 | "together", 895 | "tonight", 896 | "too", 897 | "top", 898 | "total", 899 | "tough", 900 | "toward", 901 | "town", 902 | "trade", 903 | "traditional", 904 | "training", 905 | "travel", 906 | "treat", 907 | "treatment", 908 | "tree", 909 | "trial", 910 | "trip", 911 | "trouble", 912 | "true", 913 | "truth", 914 | "try", 915 | "turn", 916 | "TV", 917 | "two", 918 | "type", 919 | "under", 920 | "understand", 921 | "unit", 922 | "until", 923 | "up", 924 | "upon", 925 | "us", 926 | "use", 927 | "usually", 928 | "value", 929 | "various", 930 | "very", 931 | "victim", 932 | "view", 933 | "violence", 934 | "visit", 935 | "voice", 936 | "vote", 937 | "wait", 938 | "walk", 939 | "wall", 940 | "want", 941 | "war", 942 | "watch", 943 | "water", 944 | "way", 945 | "we", 946 | "weapon", 947 | "wear", 948 | "week", 949 | "weight", 950 | "well", 951 | "west", 952 | "western", 953 | "what", 954 | "whatever", 955 | "when", 956 | "where", 957 | "whether", 958 | "which", 959 | "while", 960 | "white", 961 | "who", 962 | "whole", 963 | "whom", 964 | "whose", 965 | "why", 966 | "wide", 967 | "wife", 968 | "will", 969 | "win", 970 | "wind", 971 | "window", 972 | "wish", 973 | "with", 974 | "within", 975 | "without", 976 | "woman", 977 | "wonder", 978 | "word", 979 | "work", 980 | "worker", 981 | "world", 982 | "worry", 983 | "would", 984 | "write", 985 | "writer", 986 | "wrong", 987 | "yard", 988 | "yeah", 989 | "year", 990 | "yes", 991 | "yet", 992 | "you", 993 | "young", 994 | "your", 995 | "yourself", 996 | "a", "about", "above", "after", "again", "against", "all", "am", "an", 997 | "and", "any", "are", "as", "at", "be", 
"because", "been", "before", "being", "below", 998 | "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", 999 | "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", 1000 | "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", 1001 | "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", 1002 | "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", 1003 | "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", 1004 | "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", 1005 | "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", 1006 | "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", 1007 | "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", 1008 | "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 1009 | "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", 1010 | "yours", "yourself", "yourselves", "[[removed]]", "[[deleted]]", "doesn't", "don't", "not", 1011 | "just", "The", "can", "can't", "no","", 1012 | "[i", 1013 | "]", 1014 | "-", 1015 | "going", 1016 | "[", 1017 | "[the", 1018 | "[>", 1019 | "didn't", 1020 | "isn't", 1021 | "things", 1022 | "it.", 1023 | "got", 1024 | "said", 1025 | "years", 1026 | "used", 1027 | "made", 1028 | "makes", 1029 | "paul", 1030 | "it,", 1031 | "[you", 1032 | "saying", 1033 | "getting", 1034 | "[this", 1035 | "ron", 1036 | "using", 1037 | "seems", 1038 | "trying", 1039 | "making", 1040 | "reddit", 1041 | "wouldn't", 1042 | "won't", 1043 | "wasn't", 1044 | "[i'm"}; 1045 | } 1046 | -------------------------------------------------------------------------------- /project7/.gitignore: 
-------------------------------------------------------------------------------- 1 | /bin/ 2 | /target/ 3 | -------------------------------------------------------------------------------- /project7/.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//src/main/java=UTF-8 3 | encoding//src/main/resources=UTF-8 4 | encoding/=UTF-8 5 | -------------------------------------------------------------------------------- /project7/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate 4 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 5 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 6 | org.eclipse.jdt.core.compiler.compliance=1.8 7 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 8 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 9 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 10 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 11 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 12 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 13 | org.eclipse.jdt.core.compiler.release=disabled 14 | org.eclipse.jdt.core.compiler.source=1.8 15 | -------------------------------------------------------------------------------- /project7/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /project7/pom.xml: -------------------------------------------------------------------------------- 1 
| 2 | 4.0.0 3 | com.jobreadyprogrammer 4 | project7 5 | 0.0.1-SNAPSHOT 6 | jar 7 | 8 | 9 | 10 | UTF-8 11 | UTF-8 12 | 1.8 13 | 2.11 14 | 2.3.0 15 | 42.1.4 16 | 17 | 18 | 19 | 20 | 21 | 22 | org.apache.spark 23 | spark-core_${scala.version} 24 | ${spark.version} 25 | 26 | 27 | 28 | 29 | org.apache.spark 30 | spark-sql_${scala.version} 31 | ${spark.version} 32 | 33 | 34 | 35 | org.apache.hadoop 36 | hadoop-hdfs 37 | 2.2.0 38 | 39 | 40 | 41 | org.apache.spark 42 | spark-mllib_${scala.version} 43 | ${spark.version} 44 | 45 | 46 | 47 | org.apache.spark 48 | spark-sql-kafka-0-10_2.11 49 | 2.3.0 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | org.apache.maven.plugins 60 | maven-compiler-plugin 61 | 3.5.1 62 | 63 | 1.8 64 | 1.8 65 | 66 | 67 | 68 | 86 | 87 | 88 | 89 | maven-jar-plugin 90 | 3.0.2 91 | 92 | 1.8 93 | 1.8 94 | 95 | 96 | com.jobreadyprogrammer.spark.Application 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /project7/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/.part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project7/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/.part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /project7/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project7/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet -------------------------------------------------------------------------------- /project7/src/main/java/com/jobreadyprogrammer/spark/StreamingFileDirectoryApplication.java: -------------------------------------------------------------------------------- 1 | package com.jobreadyprogrammer.spark; 2 | 3 | import org.apache.spark.sql.Dataset; 4 | import org.apache.spark.sql.Row; 5 | import org.apache.spark.sql.SparkSession; 6 | import org.apache.spark.sql.streaming.StreamingQuery; 7 | import org.apache.spark.sql.streaming.StreamingQueryException; 8 | import org.apache.spark.sql.types.StructType; 9 | import static org.apache.spark.sql.functions.*; 10 | 11 | public class StreamingFileDirectoryApplication { 12 | 13 | public static void main(String[] args) throws StreamingQueryException { 14 | 15 | 16 | SparkSession spark = SparkSession.builder() 17 | .appName("StreamingFileDirectoryWordCount") 18 | .master("local") 19 | .getOrCreate(); 20 | 21 | // Read all the csv files written atomically in a directory 22 | StructType userSchema = new StructType().add("date", "string").add("value", "float"); 23 | 24 | Dataset stockData = spark 25 | .readStream() 26 | .option("sep", ",") 27 | .schema(userSchema) // Specify schema of the csv files 28 | .csv("/Users/imtiazahmad/Desktop/SparkCourse/data/IncomingStockFiles"); // Equivalent to format("csv").load("/path/to/directory") 29 | 30 | 31 | Dataset resultDf = stockData.groupBy("date").agg(avg(stockData.col("value"))); 32 | 33 | StreamingQuery query = resultDf.writeStream() 34 | .outputMode("complete") 35 | .format("console") 36 | .start(); 37 | 38 | query.awaitTermination(); 39 | 40 | } 41 | 42 | 43 | } 44 | 
-------------------------------------------------------------------------------- /project7/src/main/java/com/jobreadyprogrammer/spark/StreamingKafkaConsumer.java: -------------------------------------------------------------------------------- 1 | package com.jobreadyprogrammer.spark; 2 | 3 | import java.util.Arrays; 4 | import java.util.Properties; 5 | 6 | import org.apache.spark.api.java.function.FlatMapFunction; 7 | import org.apache.spark.sql.Dataset; 8 | import org.apache.spark.sql.Encoders; 9 | import org.apache.spark.sql.Row; 10 | import org.apache.spark.sql.SaveMode; 11 | import org.apache.spark.sql.SparkSession; 12 | import org.apache.spark.sql.streaming.StreamingQuery; 13 | import org.apache.spark.sql.streaming.StreamingQueryException; 14 | 15 | public class StreamingKafkaConsumer { 16 | 17 | public static void main(String[] args) throws StreamingQueryException { 18 | 19 | SparkSession spark = SparkSession.builder() 20 | .appName("StreamingKafkaConsumer") 21 | .master("local") 22 | .getOrCreate(); 23 | 24 | // Kafka Consumer 25 | Dataset messagesDf = spark.readStream() 26 | .format("kafka") 27 | .option("kafka.bootstrap.servers", "localhost:9092") 28 | .option("subscribe", "test") 29 | .load() 30 | .selectExpr("CAST(value AS STRING)"); // lines.selectExpr("CAST key AS STRING", "CAST value AS STRING") For key value 31 | 32 | // message.show() // <-- Can't do this when streaming! 
33 | Dataset words = messagesDf 34 | .as(Encoders.STRING()) 35 | .flatMap((FlatMapFunction) x -> Arrays.asList(x.split(" ")).iterator(), Encoders.STRING()); 36 | 37 | Dataset wordCounts = words.groupBy("value").count(); 38 | // 39 | 40 | 41 | StreamingQuery query = wordCounts.writeStream() 42 | .outputMode("complete") 43 | .format("console") 44 | .start(); 45 | // 46 | query.awaitTermination(); 47 | 48 | 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /project7/src/main/java/com/jobreadyprogrammer/spark/StreamingSocketApplication.java: -------------------------------------------------------------------------------- 1 | package com.jobreadyprogrammer.spark; 2 | 3 | import java.util.Arrays; 4 | 5 | import org.apache.spark.api.java.function.FlatMapFunction; 6 | import org.apache.spark.sql.Dataset; 7 | import org.apache.spark.sql.Encoders; 8 | import org.apache.spark.sql.Row; 9 | import org.apache.spark.sql.SparkSession; 10 | import org.apache.spark.sql.streaming.StreamingQuery; 11 | import org.apache.spark.sql.streaming.StreamingQueryException; 12 | 13 | 14 | public class StreamingSocketApplication { 15 | 16 | public static void main(String[] args) throws StreamingQueryException { 17 | 18 | 19 | // First start a socket connection at 9999 using this: nc -lk 9999 20 | SparkSession spark = SparkSession.builder() 21 | .appName("StreamingSocketWordCount") 22 | .master("local") 23 | .getOrCreate(); 24 | 25 | // Create DataFrame representing the stream of input lines from connection to localhost:9999 26 | 27 | Dataset lines = spark 28 | .readStream() 29 | .format("socket") 30 | .option("host", "localhost") 31 | .option("port", 9999) 32 | .load(); 33 | 34 | Dataset words = lines 35 | .as(Encoders.STRING()) 36 | .flatMap((FlatMapFunction) x -> Arrays.asList(x.split(" ")).iterator(), Encoders.STRING()); 37 | 38 | Dataset wordCounts = words.groupBy("value").count(); 39 | 40 | StreamingQuery query = wordCounts.writeStream() 
41 | .outputMode("append") 42 | .format("console") 43 | .start(); 44 | 45 | query.awaitTermination(); 46 | 47 | 48 | 49 | 50 | 51 | } 52 | 53 | 54 | 55 | } 56 | -------------------------------------------------------------------------------- /project8/.gitignore: -------------------------------------------------------------------------------- 1 | /bin/ 2 | /target/ 3 | -------------------------------------------------------------------------------- /project8/.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//src/main/java=UTF-8 3 | encoding//src/main/resources=UTF-8 4 | encoding/=UTF-8 5 | -------------------------------------------------------------------------------- /project8/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate 4 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 5 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 6 | org.eclipse.jdt.core.compiler.compliance=1.8 7 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 8 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 9 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 10 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 11 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 12 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 13 | org.eclipse.jdt.core.compiler.release=disabled 14 | org.eclipse.jdt.core.compiler.source=1.8 15 | -------------------------------------------------------------------------------- /project8/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | 
eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /project8/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | com.jobreadyprogrammer 4 | project8 5 | 0.0.1-SNAPSHOT 6 | jar 7 | 8 | 9 | 10 | UTF-8 11 | UTF-8 12 | 1.8 13 | 2.11 14 | 2.3.0 15 | 42.1.4 16 | 17 | 18 | 19 | 20 | 21 | 22 | org.apache.spark 23 | spark-core_${scala.version} 24 | ${spark.version} 25 | 26 | 27 | 28 | 29 | org.apache.spark 30 | spark-sql_${scala.version} 31 | ${spark.version} 32 | 33 | 34 | 35 | org.apache.hadoop 36 | hadoop-hdfs 37 | 2.2.0 38 | 39 | 40 | 41 | org.apache.spark 42 | spark-mllib_${scala.version} 43 | ${spark.version} 44 | 45 | 46 | 47 | org.apache.spark 48 | spark-sql-kafka-0-10_2.11 49 | 2.3.0 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | org.apache.maven.plugins 60 | maven-compiler-plugin 61 | 3.5.1 62 | 63 | 1.8 64 | 1.8 65 | 66 | 67 | 68 | 86 | 87 | 88 | 89 | maven-jar-plugin 90 | 3.0.2 91 | 92 | 1.8 93 | 1.8 94 | 95 | 96 | com.jobreadyprogrammer.spark.Application 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /project8/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/.part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project8/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/.part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc -------------------------------------------------------------------------------- 
/project8/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project8/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet
--------------------------------------------------------------------------------
/project8/src/main/java/com/jobreadyprogrammer/spark/FlatMapAndFilterRddApp.java:
--------------------------------------------------------------------------------
package com.jobreadyprogrammer.spark;

// Placeholder for the flatMap/filter RDD exercise; no logic has been added yet.
public class FlatMapAndFilterRddApp {

}
--------------------------------------------------------------------------------
/project8/src/main/java/com/jobreadyprogrammer/spark/JoinRddApp.java:
--------------------------------------------------------------------------------
package com.jobreadyprogrammer.spark;

// Placeholder for the RDD join exercise; main() is still an empty stub.
public class JoinRddApp {

    public static void main(String[] args) {
        // TODO implement the join example
    }

}
--------------------------------------------------------------------------------
/project8/src/main/java/com/jobreadyprogrammer/spark/MapAndReduceRddApp.java:
--------------------------------------------------------------------------------
package com.jobreadyprogrammer.spark;

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

/**
 * Demonstrates the map and reduce RDD operations: square-roots a list of
 * doubles, then counts the elements with a map-to-1 / reduce-by-sum pattern.
 */
public class MapAndReduceRddApp {

    public static void main(String[] args) {

        // Typed list — the generic parameters were stripped in the original rendering.
        List<Double> inputData = new ArrayList<>();
        inputData.add(9.00);
        inputData.add(4.00);
        inputData.add(83.00);
        inputData.add(142.00);
        inputData.add(75.00);
        inputData.add(25.00);

        SparkConf conf = new SparkConf().setAppName("RddMapReduce").setMaster("local[*]");

        // try-with-resources stops the context even if a job throws.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {

            JavaRDD<Double> myRdd = sc.parallelize(inputData);

            // Map function:
            JavaRDD<Double> squareRootRdd = myRdd.map(v -> Math.sqrt(v)); // Math::sqrt
            squareRootRdd.foreach(v -> System.out.println(v)); // System.out::println

            // count number of elements using map and reduce functions
            JavaRDD<Integer> counterRdd = squareRootRdd.map(v1 -> 1);
            int count = counterRdd.reduce((v1, v2) -> v1 + v2);
            System.out.println(count);
        }

    }

}
--------------------------------------------------------------------------------
/project8/src/main/java/com/jobreadyprogrammer/spark/TupleAndPairRddApp.java:
--------------------------------------------------------------------------------
package com.jobreadyprogrammer.spark;

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;
import scala.Tuple3;

/**
 * Demonstrates tuples and pair RDDs: maps log lines to (logLevel, 1) pairs
 * and reduces by key to count how often each log level occurs.
 */
public class TupleAndPairRddApp {

    public static void main(String[] args) {

        // List<Integer> inputData = new ArrayList<>();
        //
        // inputData.add(10);
        // inputData.add(20);
        // inputData.add(142);
        // inputData.add(49);
        // inputData.add(25);
        // inputData.add(16);

        SparkConf conf = new SparkConf().setAppName("tupleExample").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        ////
        // JavaRDD<Integer> inputNumbersRdd = sc.parallelize(inputData);

        // JavaRDD<IntegerWithRoot> twoColumnRdd = inputNumbersRdd.map(v -> new IntegerWithRoot(v));
        // twoColumnRdd.foreach(v -> System.out.println(v.number + ", " + v.squareRoot));

        // JavaRDD<Tuple3<Integer, Double, String>> squaredTupleRdd = inputNumbersRdd.map(
        //     v -> new Tuple3<>(v, Math.sqrt(v), "This is 3rd arg")
        // );
        // squaredTupleRdd.foreach(v -> System.out.println(v));

        List<String> inputData = new ArrayList<>();

        inputData.add("WARN: client stopped connection");
        inputData.add("FATAL: GET request failed");
        inputData.add("WARN: client stopped connection");
        inputData.add("ERROR: Incorrect URL");
        inputData.add("ERROR: POST request failed");
        inputData.add("FATAL: File does not exist");
        inputData.add("ERROR: File does not exist");

        JavaRDD<String> logRdd = sc.parallelize(inputData);

        // Key each line by its level ("WARN"/"ERROR"/"FATAL") with a count of 1.
        // The message part after the colon is not needed for the count.
        JavaPairRDD<String, Long> pairRdd = logRdd.mapToPair(v -> {
            String[] columns = v.split(":");
            String logLevel = columns[0];
            return new Tuple2<>(logLevel, 1L);
        });

        JavaPairRDD<String, Long> logLevelCountsRdd = pairRdd.reduceByKey((v1, v2) -> v1 + v2);
        logLevelCountsRdd.foreach(v -> System.out.println(v._1 + ": " + v._2));

        // Release local executors before exiting.
        sc.close();

    }

}

/**
 * Simple two-column value type pairing an int with its square root.
 */
class IntegerWithRoot {

    int number;
    double squareRoot;

    public IntegerWithRoot(int i) {
        this.number = i;
        this.squareRoot = Math.sqrt(i);
    }
}
--------------------------------------------------------------------------------
/project9/pom.xml:
--------------------------------------------------------------------------------
1 | 2 | 4.0.0 3 | com.jobreadyprogrammer 4 | project9 5 | 0.0.1-SNAPSHOT 6 | jar 7 | 8 | 9 | 10 | UTF-8 11 | UTF-8 12 | 1.8 13 | 2.11 14 | 2.3.0 15 | 42.1.4 16 | 17 | 18 | 19 | 20 | 21 | 22 | org.apache.spark 23 | spark-core_${scala.version} 24 | ${spark.version} 25 | 26 | 27 | 28 | 29 | org.apache.spark 30 | spark-sql_${scala.version} 31 | ${spark.version} 32 | 33 | 34 | 35 | org.apache.hadoop 36 | hadoop-hdfs 37 | 2.2.0 38 | 39 | 40 | 41 | org.apache.spark 42 | spark-mllib_${scala.version} 43 | ${spark.version} 44 | 45 |
46 | 47 | org.apache.spark 48 | spark-sql-kafka-0-10_2.11 49 | 2.3.0 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | org.apache.maven.plugins 61 | maven-dependency-plugin 62 | 63 | 64 | copy-dependencies 65 | prepare-package 66 | 67 | copy-dependencies 68 | 69 | 70 | 71 | ${project.build.directory}/libs 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | org.springframework.boot 81 | spring-boot-maven-plugin 82 | 83 | 84 | 85 | repackage 86 | 87 | 88 | 89 | com.jobreadyprogrammer.spark.Application 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /project9/src/main/java/com/jobreadyprogrammer/spark/KmeansClustering.java: -------------------------------------------------------------------------------- 1 | package com.jobreadyprogrammer.spark; 2 | 3 | import org.apache.log4j.Level; 4 | import org.apache.log4j.Logger; 5 | import org.apache.spark.ml.clustering.KMeans; 6 | import org.apache.spark.ml.clustering.KMeansModel; 7 | import org.apache.spark.ml.feature.VectorAssembler; 8 | import org.apache.spark.sql.Dataset; 9 | import org.apache.spark.sql.Row; 10 | import org.apache.spark.sql.SparkSession; 11 | 12 | public class KmeansClustering { 13 | 14 | public static void main(String[] args) { 15 | 16 | Logger.getLogger("org").setLevel(Level.ERROR); 17 | Logger.getLogger("akka").setLevel(Level.ERROR); 18 | 19 | SparkSession spark = new SparkSession.Builder() 20 | .appName("kmeans Clustering") 21 | .master("local") 22 | .getOrCreate(); 23 | 24 | Dataset wholeSaleDf = spark.read() 25 | .option("header", "true") 26 | .option("inferSchema", "true") 27 | .format("csv") 28 | .load("/Users/imtiazahmad/Desktop/SparkCourse/data/Wholesale customers data.csv"); 29 | wholeSaleDf.show(); 30 | Dataset featuresDf = wholeSaleDf.select("channel", "fresh", "milk", "grocery", "frozen", "detergents_paper", "delicassen"); 31 | 32 | VectorAssembler assembler = new VectorAssembler(); 33 | 
assembler = assembler.setInputCols(new String[] {"channel", "fresh", "milk", "grocery", "frozen", "detergents_paper", "delicassen"}) 34 | .setOutputCol("features"); 35 | 36 | Dataset trainingData = assembler.transform(featuresDf).select("features"); 37 | 38 | KMeans kmeans = new KMeans().setK(10); 39 | 40 | KMeansModel model = kmeans.fit(trainingData); 41 | 42 | System.out.println(model.computeCost(trainingData)); 43 | model.summary().predictions().show(); 44 | 45 | 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /project9/src/main/java/com/jobreadyprogrammer/spark/LinearMarketingVsSales.java: -------------------------------------------------------------------------------- 1 | package com.jobreadyprogrammer.spark; 2 | 3 | import org.apache.log4j.Level; 4 | import org.apache.log4j.Logger; 5 | import org.apache.spark.ml.feature.VectorAssembler; 6 | import org.apache.spark.ml.regression.LinearRegression; 7 | import org.apache.spark.ml.regression.LinearRegressionModel; 8 | import org.apache.spark.sql.Dataset; 9 | import org.apache.spark.sql.Row; 10 | import org.apache.spark.sql.SparkSession; 11 | 12 | public class LinearMarketingVsSales { 13 | 14 | public static void main(String[] args) { 15 | 16 | Logger.getLogger("org").setLevel(Level.ERROR); 17 | Logger.getLogger("akka").setLevel(Level.ERROR); 18 | 19 | SparkSession spark = new SparkSession.Builder() 20 | .appName("LinearRegressionExample") 21 | .master("local") 22 | .getOrCreate(); 23 | 24 | Dataset markVsSalesDf = spark.read() 25 | .option("header", "true") 26 | .option("inferSchema", "true") 27 | .format("csv") 28 | .load("/Users/imtiazahmad/Desktop/SparkCourse/data/marketing_vs_sales.csv"); 29 | markVsSalesDf.show(); 30 | 31 | // go through the lecture first and then start un-commenting the code below 32 | /** 33 | Dataset mldf = markVsSalesDf.withColumnRenamed("sales", "label") 34 | .select("label", "marketing_spend","bad_day"); 35 | 36 | String[] 
featureColumns = {"marketing_spend", "bad_day"}; 37 | 38 | VectorAssembler assember = new VectorAssembler() 39 | .setInputCols(featureColumns) 40 | .setOutputCol("features"); 41 | 42 | Dataset lblFeaturesDf = assember.transform(mldf).select("label", "features"); 43 | lblFeaturesDf = lblFeaturesDf.na().drop(); 44 | lblFeaturesDf.show(); 45 | 46 | // next we need to create a linear regression model object 47 | LinearRegression lr = new LinearRegression(); 48 | LinearRegressionModel learningModel = lr.fit(lblFeaturesDf); 49 | 50 | learningModel.summary().predictions().show(); 51 | 52 | System.out.println("R Squared: "+ learningModel.summary().r2()); 53 | 54 | **/ 55 | 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /project9/src/main/java/com/jobreadyprogrammer/spark/LinearMpgRegression.java: -------------------------------------------------------------------------------- 1 | package com.jobreadyprogrammer.spark; 2 | 3 | import org.apache.log4j.Level; 4 | import org.apache.log4j.Logger; 5 | import org.apache.spark.ml.Pipeline; 6 | import org.apache.spark.ml.PipelineModel; 7 | import org.apache.spark.ml.PipelineStage; 8 | import org.apache.spark.ml.feature.VectorAssembler; 9 | import org.apache.spark.ml.regression.LinearRegression; 10 | import org.apache.spark.ml.regression.LinearRegressionModel; 11 | import org.apache.spark.sql.Dataset; 12 | import org.apache.spark.sql.Row; 13 | import org.apache.spark.sql.SparkSession; 14 | 15 | public class LinearMpgRegression { 16 | 17 | public static void main(String[] args) { 18 | 19 | Logger.getLogger("org").setLevel(Level.ERROR); 20 | Logger.getLogger("akka").setLevel(Level.ERROR); 21 | 22 | SparkSession spark = new SparkSession.Builder() 23 | .appName("LinearRegressionMpgExample") 24 | .master("local") 25 | .getOrCreate(); 26 | 27 | Dataset autoMpgDf = spark.read() 28 | .option("header", "true") 29 | .option("inferSchema", "true") 30 | .format("csv") 31 | 
.load("/Users/imtiazahmad/Desktop/SparkCourse/data/auto_mpg.csv"); 32 | 33 | autoMpgDf = autoMpgDf.withColumnRenamed("mpg", "label") 34 | .drop("acceleration") 35 | .drop("modelYear") 36 | .drop("origin") 37 | .drop("carName") 38 | .drop("displacement"); 39 | 40 | autoMpgDf = autoMpgDf.na().drop(); 41 | 42 | String[] featureColumns = {"cylinders", "horsePower", "weight"}; 43 | 44 | VectorAssembler assembler = new VectorAssembler() 45 | .setInputCols(featureColumns) 46 | .setOutputCol("features"); 47 | 48 | autoMpgDf = assembler.transform(autoMpgDf).select("label", "features"); 49 | 50 | LinearRegression lr = new LinearRegression(); 51 | LinearRegressionModel lrm = lr.fit(autoMpgDf); 52 | 53 | Pipeline pl = new Pipeline() 54 | .setStages(new PipelineStage[] {lrm}); 55 | 56 | Dataset [] splitData = autoMpgDf.randomSplit(new double[] {0.7, 0.3}); 57 | 58 | Dataset trainingData = splitData[0]; 59 | Dataset testData = splitData[1]; 60 | 61 | PipelineModel model = pl.fit(trainingData); 62 | 63 | Dataset result = model.transform(testData); 64 | result.show(); 65 | 66 | } 67 | 68 | 69 | 70 | } 71 | 72 | 73 | -------------------------------------------------------------------------------- /project9/src/main/java/com/jobreadyprogrammer/spark/LogisticRegressionExample.java: -------------------------------------------------------------------------------- 1 | package com.jobreadyprogrammer.spark; 2 | 3 | import org.apache.log4j.Level; 4 | import org.apache.log4j.Logger; 5 | import org.apache.spark.ml.Pipeline; 6 | import org.apache.spark.ml.PipelineModel; 7 | import org.apache.spark.ml.PipelineStage; 8 | import org.apache.spark.ml.classification.LogisticRegression; 9 | import org.apache.spark.ml.feature.StringIndexer; 10 | import org.apache.spark.ml.feature.VectorAssembler; 11 | import org.apache.spark.sql.Dataset; 12 | import org.apache.spark.sql.Row; 13 | import org.apache.spark.sql.SparkSession; 14 | 15 | public class LogisticRegressionExample { 16 | 17 | public static void 
main(String[] args) { 18 | 19 | Logger.getLogger("org").setLevel(Level.ERROR); 20 | Logger.getLogger("akka").setLevel(Level.ERROR); 21 | 22 | SparkSession spark = new SparkSession.Builder() 23 | .appName("LogisticRegressionExample") 24 | .master("local") 25 | .getOrCreate(); 26 | 27 | Dataset treatmentDf = spark.read() 28 | .option("header", "true") 29 | .option("inferSchema", "true") 30 | .format("csv") 31 | .load("/Users/imtiazahmad/Desktop/SparkCourse/data/cryotherapy.csv"); 32 | 33 | Dataset lblFeatureDf = treatmentDf.withColumnRenamed("Result_of_Treatment", "label") 34 | .select("label", "sex","age","time","number_of_warts","type","area"); 35 | 36 | lblFeatureDf = lblFeatureDf.na().drop(); 37 | 38 | StringIndexer genderIndexer = new StringIndexer() 39 | .setInputCol("sex").setOutputCol("sexIndex"); 40 | 41 | VectorAssembler assembler = new VectorAssembler() 42 | .setInputCols(new String [] {"sexIndex", "age", "time", "number_of_warts", "type", "area"}) 43 | .setOutputCol("features"); 44 | 45 | 46 | Dataset [] splitData = lblFeatureDf.randomSplit(new double[] {.7, .3}); 47 | Dataset trainingDf = splitData[0]; 48 | Dataset testingDf = splitData[1]; 49 | 50 | LogisticRegression logReg = new LogisticRegression(); 51 | 52 | Pipeline pl = new Pipeline(); 53 | pl.setStages(new PipelineStage [] {genderIndexer, assembler, logReg}); 54 | 55 | PipelineModel model = pl.fit(trainingDf); 56 | Dataset results = model.transform(testingDf); 57 | 58 | results.show(); 59 | 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /project9/target/classes/com/jobreadyprogrammer/spark/KmeansClustering.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project9/target/classes/com/jobreadyprogrammer/spark/KmeansClustering.class 
--------------------------------------------------------------------------------
/project9/target/classes/com/jobreadyprogrammer/spark/LinearMarketingVsSales.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project9/target/classes/com/jobreadyprogrammer/spark/LinearMarketingVsSales.class
--------------------------------------------------------------------------------
/project9/target/classes/com/jobreadyprogrammer/spark/LinearMpgRegression.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project9/target/classes/com/jobreadyprogrammer/spark/LinearMpgRegression.class
--------------------------------------------------------------------------------
/project9/target/classes/com/jobreadyprogrammer/spark/LogisticRegressionExample.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project9/target/classes/com/jobreadyprogrammer/spark/LogisticRegressionExample.class
--------------------------------------------------------------------------------
/test-dev-env:
--------------------------------------------------------------------------------
package com.jobreadyprogrammer.spark;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

/**
 * Environment smoke test: runs a DataFrame word count over a small text file
 * to confirm the local Spark setup works.
 */
public class Application {

    public static void main(String[] args) {

        SparkSession spark = SparkSession.builder().appName("Name").master("local").getOrCreate();

        Dataset<Row> df = spark.read().format("text").load("src/main/resources/wordsList.txt");
        df.groupBy("value").count().show();

        // Equivalent RDD-based word count, kept for reference:
        // SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("Name");
        // JavaSparkContext sc = new JavaSparkContext(sparkConf);
        // JavaRDD<String> textFile = sc.textFile("src/main/resources/wordsList.txt");
        // JavaPairRDD<String, Integer> counts = textFile
        //     .flatMap(s -> Arrays.asList(s.split(" ")).iterator())
        //     .mapToPair(word -> new Tuple2<>(word, 1))
        //     .reduceByKey((a, b) -> a + b);
        // counts.take(10);
        // System.out.println(counts.collect());

        // Shut the session down cleanly before the JVM exits.
        spark.stop();
    }

}
--------------------------------------------------------------------------------