├── .gitignore
├── .recommenders
├── caches
│ ├── identified-project-coordinates.json
│ └── manual-mappings.json
└── index
│ └── http___download_eclipse_org_recommenders_models_photon_
│ ├── _1.fdt
│ ├── _1.fdx
│ ├── _1.fnm
│ ├── _1.frq
│ ├── _1.nrm
│ ├── _1.prx
│ ├── _1.tii
│ ├── _1.tis
│ ├── segments.gen
│ ├── segments_2
│ └── write.lock
├── README.md
├── pom.xml
├── project1
├── .settings
│ ├── org.eclipse.jdt.core.prefs
│ └── org.eclipse.m2e.core.prefs
├── pom.xml
├── src
│ └── main
│ │ ├── java
│ │ └── com
│ │ │ └── jobreadyprogrammer
│ │ │ └── spark
│ │ │ └── Application.java
│ │ └── resources
│ │ └── name_and_comments.txt
└── target
│ └── .gitignore
├── project2
├── .classpath
├── .project
├── .settings
│ ├── org.eclipse.jdt.core.prefs
│ └── org.eclipse.m2e.core.prefs
├── pom.xml
├── src
│ └── main
│ │ ├── java
│ │ └── com
│ │ │ └── jobreadyprogrammer
│ │ │ └── spark
│ │ │ ├── Application.java
│ │ │ ├── DefineCSVSchema.java
│ │ │ ├── InferCSVSchema.java
│ │ │ └── JSONLinesParser.java
│ │ └── resources
│ │ ├── amazonProducts.txt
│ │ ├── multiline.json
│ │ └── simple.json
└── target
│ ├── .gitignore
│ └── classes
│ ├── amazonProducts.txt
│ └── com
│ └── jobreadyprogrammer
│ └── spark
│ ├── Application.class
│ ├── DefineCSVSchema.class
│ ├── InferCSVSchema.class
│ └── JSONLinesParser.class
├── project3
├── .settings
│ ├── org.eclipse.jdt.core.prefs
│ └── org.eclipse.m2e.core.prefs
├── pom.xml
├── src
│ └── main
│ │ ├── java
│ │ └── com
│ │ │ └── jobreadyprogrammer
│ │ │ └── spark
│ │ │ ├── Application.java
│ │ │ └── ApplicationTest.java
│ │ └── resources
│ │ ├── durham-parks.json
│ │ ├── philadelphia_recreations.csv
│ │ └── students.csv
└── target
│ └── .gitignore
├── project4
├── .gitignore
├── .settings
│ ├── org.eclipse.core.resources.prefs
│ ├── org.eclipse.jdt.core.prefs
│ └── org.eclipse.m2e.core.prefs
├── pom.xml
├── src
│ └── main
│ │ ├── java
│ │ └── com
│ │ │ └── jobreadyprogrammer
│ │ │ ├── mappers
│ │ │ ├── HouseMapper.java
│ │ │ └── LineMapper.java
│ │ │ ├── pojos
│ │ │ └── House.java
│ │ │ └── spark
│ │ │ ├── Application.java
│ │ │ ├── ArrayToDataset.java
│ │ │ ├── CsvToDatasetHouseToDataframe.java
│ │ │ └── WordCount.java
│ │ └── resources
│ │ ├── houses.csv
│ │ └── shakespeare.txt
└── target
│ └── .gitignore
├── project5
├── .gitignore
├── .settings
│ ├── org.eclipse.core.resources.prefs
│ ├── org.eclipse.jdt.core.prefs
│ └── org.eclipse.m2e.core.prefs
├── pom.xml
├── spark-warehouse
│ └── grades_view_perm
│ │ └── _temporary
│ │ └── 0
│ │ └── _temporary
│ │ └── attempt_20180912224126_0002_m_000000_0
│ │ ├── .part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc
│ │ └── part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet
└── src
│ └── main
│ ├── java
│ └── com
│ │ └── jobreadyprogrammer
│ │ └── spark
│ │ ├── Application.java
│ │ └── CustomersAndProducts.java
│ └── resources
│ ├── customers.csv
│ ├── grade_chart.csv
│ ├── products.csv
│ ├── purchases.csv
│ └── students.csv
├── project6
├── .gitignore
├── .settings
│ ├── org.eclipse.core.resources.prefs
│ ├── org.eclipse.jdt.core.prefs
│ └── org.eclipse.m2e.core.prefs
├── pom.xml
├── spark-warehouse
│ └── grades_view_perm
│ │ └── _temporary
│ │ └── 0
│ │ └── _temporary
│ │ └── attempt_20180912224126_0002_m_000000_0
│ │ ├── .part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc
│ │ └── part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet
└── src
│ └── main
│ └── java
│ └── com
│ └── jobreadyprogrammer
│ └── spark
│ ├── Application.java
│ └── WordUtils.java
├── project7
├── .gitignore
├── .settings
│ ├── org.eclipse.core.resources.prefs
│ ├── org.eclipse.jdt.core.prefs
│ └── org.eclipse.m2e.core.prefs
├── pom.xml
├── spark-warehouse
│ └── grades_view_perm
│ │ └── _temporary
│ │ └── 0
│ │ └── _temporary
│ │ └── attempt_20180912224126_0002_m_000000_0
│ │ ├── .part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc
│ │ └── part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet
└── src
│ └── main
│ └── java
│ └── com
│ └── jobreadyprogrammer
│ └── spark
│ ├── StreamingFileDirectoryApplication.java
│ ├── StreamingKafkaConsumer.java
│ └── StreamingSocketApplication.java
├── project8
├── .gitignore
├── .settings
│ ├── org.eclipse.core.resources.prefs
│ ├── org.eclipse.jdt.core.prefs
│ └── org.eclipse.m2e.core.prefs
├── pom.xml
├── spark-warehouse
│ └── grades_view_perm
│ │ └── _temporary
│ │ └── 0
│ │ └── _temporary
│ │ └── attempt_20180912224126_0002_m_000000_0
│ │ ├── .part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc
│ │ └── part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet
└── src
│ └── main
│ └── java
│ └── com
│ └── jobreadyprogrammer
│ └── spark
│ ├── FlatMapAndFilterRddApp.java
│ ├── JoinRddApp.java
│ ├── MapAndReduceRddApp.java
│ └── TupleAndPairRddApp.java
├── project9
├── pom.xml
├── src
│ └── main
│ │ └── java
│ │ └── com
│ │ └── jobreadyprogrammer
│ │ └── spark
│ │ ├── KmeansClustering.java
│ │ ├── LinearMarketingVsSales.java
│ │ ├── LinearMpgRegression.java
│ │ └── LogisticRegressionExample.java
└── target
│ └── classes
│ └── com
│ └── jobreadyprogrammer
│ └── spark
│ ├── KmeansClustering.class
│ ├── LinearMarketingVsSales.class
│ ├── LinearMpgRegression.class
│ └── LogisticRegressionExample.class
└── test-dev-env
/.gitignore:
--------------------------------------------------------------------------------
1 | /.metadata/
2 | .classpath
3 | .project
4 |
--------------------------------------------------------------------------------
/.recommenders/caches/identified-project-coordinates.json:
--------------------------------------------------------------------------------
1 | {}
--------------------------------------------------------------------------------
/.recommenders/caches/manual-mappings.json:
--------------------------------------------------------------------------------
1 | {}
--------------------------------------------------------------------------------
/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.fdt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.fdt
--------------------------------------------------------------------------------
/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.fdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.fdx
--------------------------------------------------------------------------------
/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.fnm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.fnm
--------------------------------------------------------------------------------
/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.frq:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.frq
--------------------------------------------------------------------------------
/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.nrm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.nrm
--------------------------------------------------------------------------------
/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.tii:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.tii
--------------------------------------------------------------------------------
/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.tis:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/_1.tis
--------------------------------------------------------------------------------
/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/segments.gen:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/segments.gen
--------------------------------------------------------------------------------
/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/segments_2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/segments_2
--------------------------------------------------------------------------------
/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/write.lock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/.recommenders/index/http___download_eclipse_org_recommenders_models_photon_/write.lock
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This is the source code for the course "The Ultimate Apache Spark with Java Course - Hands On"
2 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 | com.jobreadyprogrammer
4 | learningspark
5 | 0.0.1-SNAPSHOT
6 | jar
7 |
8 |
9 |
10 | UTF-8
11 | 1.8
12 | 2.11
13 | 2.3.1
14 | 1.8
15 | 1.8
16 |
17 |
18 |
19 |
20 |
21 | org.apache.spark
22 | spark-core_${scala.version}
23 | ${spark.version}
24 |
25 |
26 |
27 | org.apache.spark
28 | spark-sql_${scala.version}
29 | ${spark.version}
30 |
31 |
32 | org.slf4j
33 | slf4j-simple
34 |
35 |
36 |
37 |
38 |
39 | org.apache.spark
40 | spark-mllib_${scala.version}
41 | ${spark.version}
42 |
43 |
44 | org.slf4j
45 | slf4j-log4j12
46 |
47 |
48 | org.slf4j
49 | slf4j-simple
50 |
51 |
52 |
53 |
54 |
55 | junit
56 | junit
57 | 4.11
58 | test
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 | org.apache.maven.plugins
68 | maven-dependency-plugin
69 |
70 |
71 | copy-dependencies
72 | prepare-package
73 |
74 | copy-dependencies
75 |
76 |
77 |
78 | ${project.build.directory}/libs
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 | org.springframework.boot
88 | spring-boot-maven-plugin
89 |
90 |
91 |
92 | repackage
93 |
94 |
95 |
96 | com.jobreadyprogrammer.spark.Application
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
--------------------------------------------------------------------------------
/project1/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate
4 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
5 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
6 | org.eclipse.jdt.core.compiler.compliance=1.8
7 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
8 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
9 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
10 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
11 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
12 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
13 | org.eclipse.jdt.core.compiler.release=disabled
14 | org.eclipse.jdt.core.compiler.source=1.8
15 |
--------------------------------------------------------------------------------
/project1/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 |
--------------------------------------------------------------------------------
/project1/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 | com.jobreadyprogrammer
4 | project1
5 | 0.0.1-SNAPSHOT
6 | jar
7 |
8 |
9 |
10 |
11 | UTF-8
12 | 1.8
13 |
14 | 2.11
15 | 2.3.1
16 | 42.1.4
17 |
18 | 1.8
19 | 1.8
20 |
21 |
22 |
23 |
24 |
25 |
26 | org.apache.spark
27 | spark-core_${scala.version}
28 | ${spark.version}
29 |
30 |
31 |
32 | org.apache.spark
33 | spark-sql_${scala.version}
34 | ${spark.version}
35 |
36 |
37 |
38 | org.apache.spark
39 | spark-mllib_${scala.version}
40 | ${spark.version}
41 |
42 |
43 |
44 | junit
45 | junit
46 | 4.11
47 | test
48 |
49 |
50 |
51 | org.postgresql
52 | postgresql
53 | ${postgresql.version}
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 | org.apache.maven.plugins
64 | maven-dependency-plugin
65 |
66 |
67 | copy-dependencies
68 | prepare-package
69 |
70 | copy-dependencies
71 |
72 |
73 |
74 | ${project.build.directory}/libs
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 | org.springframework.boot
84 | spring-boot-maven-plugin
85 |
86 |
87 |
88 | repackage
89 |
90 |
91 |
92 | com.jobreadyprogrammer.spark.Application
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
--------------------------------------------------------------------------------
/project1/src/main/java/com/jobreadyprogrammer/spark/Application.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
3 | import static org.apache.spark.sql.functions.concat;
4 | import static org.apache.spark.sql.functions.lit;
5 |
6 | import java.util.Properties;
7 |
8 | import org.apache.spark.sql.Dataset;
9 | import org.apache.spark.sql.Row;
10 | import org.apache.spark.sql.SaveMode;
11 | import org.apache.spark.sql.SparkSession;
12 |
13 | public class Application {
14 |
15 | public static void main(String args[]) throws InterruptedException {
16 |
17 | // Create a session
18 | SparkSession spark = new SparkSession.Builder()
19 | .appName("CSV to DB")
20 | .master("local")
21 | .getOrCreate();
22 |
23 | // get data
24 | Dataset df = spark.read().format("csv")
25 | .option("header", true)
26 | .load("src/main/resources/name_and_comments.txt");
27 |
28 | // df.show(3);
29 |
30 | // Transformation
31 | df = df.withColumn("full_name",
32 | concat(df.col("last_name"), lit(", "), df.col("first_name")))
33 | .filter(df.col("comment").rlike("\\d+"))
34 | .orderBy(df.col("last_name").asc());
35 |
36 | // Write to destination
37 | String dbConnectionUrl = "jdbc:postgresql://localhost/course_data"; // <<- You need to create this database
38 | Properties prop = new Properties();
39 | prop.setProperty("driver", "org.postgresql.Driver");
40 | prop.setProperty("user", "postgres");
41 | prop.setProperty("password", "password"); // <- The password you used while installing Postgres
42 |
43 | df.write()
44 | .mode(SaveMode.Overwrite)
45 | .jdbc(dbConnectionUrl, "project1", prop);
46 | }
47 | }
--------------------------------------------------------------------------------
/project1/src/main/resources/name_and_comments.txt:
--------------------------------------------------------------------------------
1 | last_name,first_name,comment
2 | Lon,Jim,There are plenty of people in this world.
3 | Ingram,Milford,I've been using the internet for 10.
4 | Gideon,Elmer,Social media has taken over our lives for good.
5 | Dong,Fen,The body is 70% water so make sure to stay hydrated.
--------------------------------------------------------------------------------
/project1/target/.gitignore:
--------------------------------------------------------------------------------
1 | /classes/
2 | /test-classes/
3 |
--------------------------------------------------------------------------------
/project2/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/project2/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | project2
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.jdt.core.javabuilder
10 |
11 |
12 |
13 |
14 | org.eclipse.m2e.core.maven2Builder
15 |
16 |
17 |
18 |
19 |
20 | org.eclipse.jdt.core.javanature
21 | org.eclipse.m2e.core.maven2Nature
22 |
23 |
24 |
--------------------------------------------------------------------------------
/project2/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate
4 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
5 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
6 | org.eclipse.jdt.core.compiler.compliance=1.8
7 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
8 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
9 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
10 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
11 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
12 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
13 | org.eclipse.jdt.core.compiler.release=disabled
14 | org.eclipse.jdt.core.compiler.source=1.8
15 |
--------------------------------------------------------------------------------
/project2/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 |
--------------------------------------------------------------------------------
/project2/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 | com.jobreadyprogrammer
4 | project2
5 | 0.0.1-SNAPSHOT
6 | jar
7 |
8 |
9 |
10 | UTF-8
11 | 1.8
12 | 2.11
13 | 2.3.1
14 | 1.8
15 | 1.8
16 |
17 |
18 |
19 |
20 |
21 | org.apache.spark
22 | spark-core_${scala.version}
23 | ${spark.version}
24 |
25 |
26 |
27 | org.apache.spark
28 | spark-sql_${scala.version}
29 | ${spark.version}
30 |
31 |
32 | org.slf4j
33 | slf4j-simple
34 |
35 |
36 |
37 |
38 |
39 | org.apache.spark
40 | spark-mllib_${scala.version}
41 | ${spark.version}
42 |
43 |
44 | org.slf4j
45 | slf4j-log4j12
46 |
47 |
48 | org.slf4j
49 | slf4j-simple
50 |
51 |
52 |
53 |
54 |
55 | junit
56 | junit
57 | 4.11
58 | test
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 | org.apache.maven.plugins
68 | maven-dependency-plugin
69 |
70 |
71 | copy-dependencies
72 | prepare-package
73 |
74 | copy-dependencies
75 |
76 |
77 |
78 | ${project.build.directory}/libs
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 | org.springframework.boot
88 | spring-boot-maven-plugin
89 |
90 |
91 |
92 | repackage
93 |
94 |
95 |
96 | com.jobreadyprogrammer.spark.Application
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
--------------------------------------------------------------------------------
/project2/src/main/java/com/jobreadyprogrammer/spark/Application.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
3 | public class Application {
4 |
5 | public static void main(String[] args) {
6 |
7 | // InferCSVSchema parser = new InferCSVSchema();
8 | // parser.printSchema();
9 |
10 | // DefineCSVSchema parser2 = new DefineCSVSchema();
11 | // parser2.printDefinedSchema();
12 | //
13 | JSONLinesParser parser3 = new JSONLinesParser();
14 | parser3.parseJsonLines();
15 |
16 | }
17 |
18 | }
19 |
--------------------------------------------------------------------------------
/project2/src/main/java/com/jobreadyprogrammer/spark/DefineCSVSchema.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
3 | import org.apache.spark.sql.Dataset;
4 | import org.apache.spark.sql.Row;
5 | import org.apache.spark.sql.SparkSession;
6 | import org.apache.spark.sql.types.DataTypes;
7 | import org.apache.spark.sql.types.StructField;
8 | import org.apache.spark.sql.types.StructType;
9 |
10 | public class DefineCSVSchema {
11 |
12 | public void printDefinedSchema() {
13 |
14 | SparkSession spark = SparkSession.builder()
15 | .appName("Complex CSV with a schema to Dataframe")
16 | .master("local")
17 | .getOrCreate();
18 |
19 | StructType schema = DataTypes.createStructType(new StructField[] { //
20 | DataTypes.createStructField(
21 | "id", //
22 | DataTypes.IntegerType, //
23 | false), //
24 | DataTypes.createStructField(
25 | "product_id",
26 | DataTypes.IntegerType,
27 | true),
28 | DataTypes.createStructField(
29 | "item_name",
30 | DataTypes.StringType,
31 | false),
32 | DataTypes.createStructField(
33 | "published_on",
34 | DataTypes.DateType,
35 | true),
36 | DataTypes.createStructField(
37 | "url",
38 | DataTypes.StringType,
39 | false) });
40 |
41 | Dataset df = spark.read().format("csv")
42 | .option("header", "true")
43 | .option("multiline", true)
44 | .option("sep", ";")
45 | .option("dateFormat", "M/d/y")
46 | .option("quote", "^")
47 | .schema(schema) //
48 | .load("src/main/resources/amazonProducts.txt");
49 |
50 | df.show(5, 15);
51 | df.printSchema();
52 |
53 | }
54 |
55 | }
56 |
--------------------------------------------------------------------------------
/project2/src/main/java/com/jobreadyprogrammer/spark/InferCSVSchema.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
3 | import org.apache.spark.sql.Dataset;
4 | import org.apache.spark.sql.Row;
5 | import org.apache.spark.sql.SparkSession;
6 |
7 | public class InferCSVSchema {
8 |
9 | public void printSchema() {
10 | SparkSession spark = SparkSession.builder()
11 | .appName("Complex CSV to Dataframe")
12 | .master("local")
13 | .getOrCreate();
14 |
15 | Dataset df = spark.read().format("csv") //
16 | .option("header", "true") //
17 | .option("multiline", true) //
18 | .option("sep", ";") //
19 | .option("quote", "^") //
20 | .option("dateFormat", "M/d/y") //
21 | .option("inferSchema", true) //
22 | .load("src/main/resources/amazonProducts.txt");
23 |
24 | System.out.println("Excerpt of the dataframe content:");
25 | // df.show(7);
26 | df.show(7, 90); // truncate after 90 chars
27 | System.out.println("Dataframe's schema:");
28 | df.printSchema();
29 | }
30 |
31 | }
32 |
--------------------------------------------------------------------------------
/project2/src/main/java/com/jobreadyprogrammer/spark/JSONLinesParser.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
3 | import org.apache.spark.sql.Dataset;
4 | import org.apache.spark.sql.Row;
5 | import org.apache.spark.sql.SparkSession;
6 |
7 | public class JSONLinesParser {
8 |
9 |
10 |
11 | public void parseJsonLines() {
12 | SparkSession spark = SparkSession.builder()
13 | .appName("JSON Lines to Dataframe")
14 | .master("local")
15 | .getOrCreate();
16 |
17 | // Dataset df = spark.read().format("json")
18 | // .load("src/main/resources/simple.json");
19 |
20 | Dataset df2 = spark.read().format("json")
21 | .option("multiline", true)
22 | .load("src/main/resources/multiline.json");
23 |
24 | df2.show(5, 150);
25 | df2.printSchema();
26 | }
27 |
28 |
29 | }
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/project2/src/main/resources/amazonProducts.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project2/src/main/resources/amazonProducts.txt
--------------------------------------------------------------------------------
/project2/src/main/resources/multiline.json:
--------------------------------------------------------------------------------
1 | [{
2 | "id": "contract-11934",
3 | "buildingKey": "993839c8bh3fdgcc6734624ee8cc351050bn9shf93",
4 | "geo_location": {
5 | "type": "exact",
6 | "coordinates": [
7 | -78.8922549,
8 | 36.0013755
9 | ]
10 | },
11 | "properties": {
12 | "permit_no": "110138",
13 | "lat_and_lon": [
14 | 36.0013755,
15 | -78.8922549
16 | ],
17 | "address": "877 W CANAL ST",
18 | "year": "2009"
19 | },
20 | "timestamp": "2014-02-09T12:28:33-05:00"
21 | },
22 | {
23 | "id": "contract-11984",
24 | "buildingKey": "8fdn8rh3fdgcc6734624ee89wn350bn9shf93",
25 | "geo_location": {
26 | "type": "exact",
27 | "coordinates": [
28 | -87.9872323,
29 | 36.0013755
30 | ]
31 | },
32 | "properties": {
33 | "permit_no": "110138",
34 | "lat_and_lon": [
35 | 36.0013755,
36 | -78.8922549
37 | ],
38 | "address": "923 YETTI ST",
39 | "year": "2004"
40 | },
41 | "timestamp": "2014-02-09T12:28:33-05:00"
42 | }]
--------------------------------------------------------------------------------
/project2/src/main/resources/simple.json:
--------------------------------------------------------------------------------
1 | {"name": "Top", "owns": [["car", "honda"], ["laptop", "Dell"]]}
2 | {"name": "Frank", "owns": [["laptop", "Macbook"], ["shoes", "Nike"]]}
3 | {"name": "Peter", "owns": []}
4 | {"name": "Samantha", "owns": [["home", "34 Morris Ave."]]}
--------------------------------------------------------------------------------
/project2/target/.gitignore:
--------------------------------------------------------------------------------
1 | /classes/
2 | /test-classes/
3 |
--------------------------------------------------------------------------------
/project2/target/classes/amazonProducts.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project2/target/classes/amazonProducts.txt
--------------------------------------------------------------------------------
/project2/target/classes/com/jobreadyprogrammer/spark/Application.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project2/target/classes/com/jobreadyprogrammer/spark/Application.class
--------------------------------------------------------------------------------
/project2/target/classes/com/jobreadyprogrammer/spark/DefineCSVSchema.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project2/target/classes/com/jobreadyprogrammer/spark/DefineCSVSchema.class
--------------------------------------------------------------------------------
/project2/target/classes/com/jobreadyprogrammer/spark/InferCSVSchema.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project2/target/classes/com/jobreadyprogrammer/spark/InferCSVSchema.class
--------------------------------------------------------------------------------
/project2/target/classes/com/jobreadyprogrammer/spark/JSONLinesParser.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project2/target/classes/com/jobreadyprogrammer/spark/JSONLinesParser.class
--------------------------------------------------------------------------------
/project3/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate
4 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
5 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
6 | org.eclipse.jdt.core.compiler.compliance=1.8
7 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
8 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
9 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
10 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
11 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
12 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
13 | org.eclipse.jdt.core.compiler.release=disabled
14 | org.eclipse.jdt.core.compiler.source=1.8
15 |
--------------------------------------------------------------------------------
/project3/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 |
--------------------------------------------------------------------------------
/project3/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 | com.jobreadyprogrammer
4 | project3
5 | 0.0.1-SNAPSHOT
6 | jar
7 |
8 |
9 |
10 | UTF-8
11 | 1.8
12 | 2.11
13 | 2.3.1
14 | 1.8
15 | 1.8
16 |
17 |
18 |
19 |
20 |
21 | org.apache.spark
22 | spark-core_${scala.version}
23 | ${spark.version}
24 |
25 |
26 |
27 | org.apache.spark
28 | spark-sql_${scala.version}
29 | ${spark.version}
30 |
31 |
32 | org.slf4j
33 | slf4j-simple
34 |
35 |
36 |
37 |
38 |
39 | org.apache.spark
40 | spark-mllib_${scala.version}
41 | ${spark.version}
42 |
43 |
44 | org.slf4j
45 | slf4j-log4j12
46 |
47 |
48 | org.slf4j
49 | slf4j-simple
50 |
51 |
52 |
53 |
54 |
55 | junit
56 | junit
57 | 4.11
58 | test
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 | org.apache.maven.plugins
68 | maven-dependency-plugin
69 |
70 |
71 | copy-dependencies
72 | prepare-package
73 |
74 | copy-dependencies
75 |
76 |
77 |
78 | ${project.build.directory}/libs
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 | org.springframework.boot
88 | spring-boot-maven-plugin
89 |
90 |
91 |
92 | repackage
93 |
94 |
95 |
96 | com.jobreadyprogrammer.spark.Application
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
--------------------------------------------------------------------------------
/project3/src/main/java/com/jobreadyprogrammer/spark/Application.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
3 | import static org.apache.spark.sql.functions.concat;
4 | import static org.apache.spark.sql.functions.lit;
5 |
6 | import org.apache.spark.Partition;
7 | import org.apache.spark.sql.Dataset;
8 | import org.apache.spark.sql.Row;
9 | import org.apache.spark.sql.SparkSession;
10 |
11 | public class Application {
12 |
13 | public static void main(String[] args) {
14 |
15 | SparkSession spark = SparkSession.builder()
16 | .appName("Combine 2 Datasets")
17 | .master("local")
18 | .getOrCreate();
19 |
20 | Dataset durhamDf = buildDurhamParksDataFrame(spark);
21 | // durhamDf.printSchema();
22 | // durhamDf.show(10);
23 |
24 | Dataset philDf = buildPhilParksDataFrame(spark);
25 | // philDf.printSchema();
26 | // philDf.show(10);
27 |
28 |
29 | combineDataframes(durhamDf, philDf);
30 |
31 | }
32 |
33 |
34 | private static void combineDataframes(Dataset df1, Dataset df2) {
35 | // Match by column names using the unionByName() method.
36 | // if we use just the union() method, it matches the columns based on order.
37 | Dataset df = df1.unionByName(df2);
38 | df.show(500);
39 | df.printSchema();
40 | System.out.println("We have " + df.count() + " records.");
41 |
42 | df = df.repartition(5);
43 |
44 | Partition[] partitions = df.rdd().partitions();
45 | System.out.println("Total number of Partitions: "+ partitions.length);
46 |
47 | }
48 |
49 |
50 | public static Dataset buildDurhamParksDataFrame(SparkSession spark){
51 | Dataset df = spark.read().format("json").option("multiline", true)
52 | .load("src/main/resources/durham-parks.json");
53 |
54 | df = df.withColumn("park_id", concat(df.col("datasetid"), lit("_"),
55 | df.col("fields.objectid"), lit("_Durham")))
56 | .withColumn("park_name", df.col("fields.park_name"))
57 | .withColumn("city", lit("Durham"))
58 | .withColumn("address", df.col("fields.address"))
59 | .withColumn("has_playground", df.col("fields.playground"))
60 | .withColumn("zipcode", df.col("fields.zip"))
61 | .withColumn("land_in_acres", df.col("fields.acres"))
62 | .withColumn("geoX", df.col("geometry.coordinates").getItem(0))
63 | .withColumn("geoY", df.col("geometry.coordinates").getItem(1))
64 | .drop("fields").drop("geometry").drop("record_timestamp").drop("recordid")
65 | .drop("datasetid");
66 |
67 | return df;
68 | }
69 |
70 |
71 | private static Dataset buildPhilParksDataFrame(SparkSession spark) {
72 | Dataset df = spark.read().format("csv").option("multiline", true)
73 | .option("header", true)
74 | .load("src/main/resources/philadelphia_recreations.csv");
75 |
76 | // df = df.filter(lower(df.col("USE_")).like("%park%"));
77 | df = df.filter("lower(USE_) like '%park%' ");
78 |
79 | df = df.withColumn("park_id", concat(lit("phil_"), df.col("OBJECTID")))
80 | .withColumnRenamed("ASSET_NAME", "park_name")
81 | .withColumn("city", lit("Philadelphia"))
82 | .withColumnRenamed("ADDRESS", "address")
83 | .withColumn("has_playground", lit("UNKNOWN"))
84 | .withColumnRenamed("ZIPCODE", "zipcode")
85 | .withColumnRenamed("ACREAGE", "land_in_acres")
86 | .withColumn("geoX", lit("UNKNONW"))
87 | .withColumn("geoY", lit("UNKNONW"))
88 | .drop("SITE_NAME")
89 | .drop("OBJECTID")
90 | .drop("CHILD_OF")
91 | .drop("TYPE")
92 | .drop("USE_")
93 | .drop("DESCRIPTION")
94 | .drop("SQ_FEET")
95 | .drop("ALLIAS")
96 | .drop("CHRONOLOGY")
97 | .drop("NOTES")
98 | .drop("DATE_EDITED")
99 | .drop("EDITED_BY")
100 | .drop("OCCUPANT")
101 | .drop("TENANT")
102 | .drop("LABEL");
103 |
104 |
105 |
106 | return df;
107 | }
108 |
109 |
110 |
111 | }
112 |
--------------------------------------------------------------------------------
/project3/src/main/java/com/jobreadyprogrammer/spark/ApplicationTest.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
3 | import java.util.Arrays;
4 | import java.util.List;
5 |
6 | import org.apache.spark.sql.Dataset;
7 | import org.apache.spark.sql.Encoders;
8 | import org.apache.spark.sql.Row;
9 | import org.apache.spark.sql.SparkSession;
10 |
11 | public class ApplicationTest {
12 |
13 | public static void main(String[] args) {
14 |
15 |
16 | SparkSession spark = SparkSession.builder()
17 | .appName("Learning Spark SQL Dataframe API")
18 | .master("local")
19 | .getOrCreate();
20 |
21 | String [] stringList = new String[] {"Banana", "Car", "Glass", "Banana", "Banana", "Computer", "Car", "IS", "HE"};
22 |
23 | List words = Arrays.asList(stringList);
24 |
25 | Dataset wordsDf = spark.createDataset(words, Encoders.STRING()).toDF();
26 |
27 | String [] bordingWords = new String[] {"this", "is", "he"};
28 | String filter = "( 'this', 'is', 'he')";
29 | List bordingList = Arrays.asList(bordingWords);
30 | Dataset boringWordsDf = spark.createDataset(bordingList, Encoders.STRING()).toDF();
31 |
32 | wordsDf = wordsDf.filter("value not in "+ filter);
33 |
34 | wordsDf.show();
35 |
36 |
37 | }
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/project3/src/main/resources/students.csv:
--------------------------------------------------------------------------------
1 | student_id,student_name,State,GPA,favorite_book_title,working
2 | 1100,Royce Piche,NJ,1.5,To Kill a Mockingbird,TRUE
3 | 1120,Alexis Morriss,NJ,3.0,Pride and Prejudice,FALSE
4 | 1130,Len Tarbell,NJ,3.5,The Diary of Anne Frank,FALSE
5 | 1140,Alejandro Dory,NY,2.5,Harry Potter and the Sorcerer's Stone,FALSE
6 | 1150,Ricky Tremaine,NY,3.0,The Lord of the Rings,TRUE
7 | 1160,Monika Gift,NY,3.0,The Great Gatsby,TRUE
8 | 1170,Kristeen Line,CA,4.0,Animal Farm,FALSE
9 | 1180,Sonia Rickard,CA,4.0,Harry Potter and the Sorcerer's Stone,FALSE
10 | 1190,Dan Iacovelli,CA,3.5,The Hunger Games,FALSE
11 | 1200,Ned Alvin,CA,1.0,,TRUE
12 | 1210,Sidney Ducote,FL,1.5,The Secret Garden,FALSE
13 | 1220,Bobbie Shrader,FL,2.0,The Color Purple,FALSE
14 | ,,,,,
--------------------------------------------------------------------------------
/project3/target/.gitignore:
--------------------------------------------------------------------------------
1 | /classes/
2 | /test-classes/
3 |
--------------------------------------------------------------------------------
/project4/.gitignore:
--------------------------------------------------------------------------------
1 | /bin/
2 | /target/
3 |
--------------------------------------------------------------------------------
/project4/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | encoding//src/main/java=UTF-8
3 | encoding//src/main/resources=UTF-8
4 | encoding/=UTF-8
5 |
--------------------------------------------------------------------------------
/project4/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate
4 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
5 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
6 | org.eclipse.jdt.core.compiler.compliance=1.8
7 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
8 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
9 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
10 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
11 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
12 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
13 | org.eclipse.jdt.core.compiler.release=disabled
14 | org.eclipse.jdt.core.compiler.source=1.8
15 |
--------------------------------------------------------------------------------
/project4/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 |
--------------------------------------------------------------------------------
/project4/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 | com.jobreadyprogrammer
4 | project4
5 | 0.0.1-SNAPSHOT
6 | jar
7 |
8 |
9 |
10 | UTF-8
11 | 1.8
12 | 2.11
13 | 2.3.1
14 | 1.8
15 | 1.8
16 |
17 |
18 |
19 |
20 |
21 | org.apache.spark
22 | spark-core_${scala.version}
23 | ${spark.version}
24 |
25 |
26 |
27 | org.apache.spark
28 | spark-sql_${scala.version}
29 | ${spark.version}
30 |
31 |
32 | org.slf4j
33 | slf4j-simple
34 |
35 |
36 |
37 |
38 |
39 | org.apache.spark
40 | spark-mllib_${scala.version}
41 | ${spark.version}
42 |
43 |
44 | org.slf4j
45 | slf4j-log4j12
46 |
47 |
48 | org.slf4j
49 | slf4j-simple
50 |
51 |
52 |
53 |
54 |
55 | junit
56 | junit
57 | 4.11
58 | test
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 | org.apache.maven.plugins
68 | maven-dependency-plugin
69 |
70 |
71 | copy-dependencies
72 | prepare-package
73 |
74 | copy-dependencies
75 |
76 |
77 |
78 | ${project.build.directory}/libs
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 | org.springframework.boot
88 | spring-boot-maven-plugin
89 |
90 |
91 |
92 | repackage
93 |
94 |
95 |
96 | com.jobreadyprogrammer.spark.Application
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
--------------------------------------------------------------------------------
/project4/src/main/java/com/jobreadyprogrammer/mappers/HouseMapper.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.mappers;
2 |
3 | import java.text.SimpleDateFormat;
4 |
5 | import org.apache.spark.api.java.function.MapFunction;
6 | import org.apache.spark.sql.Row;
7 |
8 | import com.jobreadyprogrammer.pojos.House;
9 |
10 | public class HouseMapper implements MapFunction{
11 |
12 | /**
13 | *
14 | */
15 | private static final long serialVersionUID = -2L;
16 |
17 |
18 | @Override
19 | public House call(Row value) throws Exception {
20 |
21 | House h = new House();
22 |
23 | h.setId(value.getAs("id"));
24 | h.setAddress(value.getAs("address"));
25 | h.setSqft(value.getAs("sqft"));
26 | h.setPrice(value.getAs("price"));
27 |
28 | String vacancyDateString = value.getAs("vacantBy").toString();
29 |
30 | if(vacancyDateString != null) {
31 | SimpleDateFormat parser = new SimpleDateFormat("yyyy-mm-dd");
32 | h.setVacantBy(parser.parse(vacancyDateString));
33 | }
34 |
35 | return h;
36 |
37 | }
38 |
39 | }
--------------------------------------------------------------------------------
/project4/src/main/java/com/jobreadyprogrammer/mappers/LineMapper.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.mappers;
2 |
3 | import java.util.Arrays;
4 | import java.util.Iterator;
5 |
6 | import org.apache.spark.api.java.function.FlatMapFunction;
7 | import org.apache.spark.sql.Row;
8 |
9 | public class LineMapper implements FlatMapFunction{
10 |
11 | /**
12 | *
13 | */
14 | private static final long serialVersionUID = 1L;
15 |
16 | @Override
17 | public Iterator call(Row row) throws Exception {
18 | return Arrays.asList(row.toString().split(" ")).iterator();
19 | }
20 |
21 | }
22 |
--------------------------------------------------------------------------------
/project4/src/main/java/com/jobreadyprogrammer/pojos/House.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.pojos;
2 |
3 | import java.io.Serializable;
4 | import java.util.Date;
5 |
/**
 * Mutable bean describing a house listing: identifier, street address,
 * square footage, asking price, and the date it becomes vacant.
 *
 * Serializable so Spark can ship instances between executors.
 */
public class House implements Serializable {

    private static final long serialVersionUID = 1L;

    private int id;          // listing identifier
    private String address;  // full street address
    private int sqft;        // living area in square feet
    private double price;    // asking price in dollars
    private Date vacantBy;   // date the property becomes vacant; may be null

    /** @return the listing identifier */
    public int getId() {
        return id;
    }

    /** @param id the listing identifier */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the full street address */
    public String getAddress() {
        return address;
    }

    /** @param address the full street address */
    public void setAddress(String address) {
        this.address = address;
    }

    /** @return living area in square feet */
    public int getSqft() {
        return sqft;
    }

    /** @param sqft living area in square feet */
    public void setSqft(int sqft) {
        this.sqft = sqft;
    }

    /** @return asking price in dollars */
    public double getPrice() {
        return price;
    }

    /** @param price asking price in dollars */
    public void setPrice(double price) {
        this.price = price;
    }

    /** @return date the property becomes vacant, or null if unknown */
    public Date getVacantBy() {
        return vacantBy;
    }

    /** @param vacantBy date the property becomes vacant; may be null */
    public void setVacantBy(Date vacantBy) {
        this.vacantBy = vacantBy;
    }

}
--------------------------------------------------------------------------------
/project4/src/main/java/com/jobreadyprogrammer/spark/Application.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
3 | public class Application {
4 |
5 | public static void main(String[] args) {
6 |
7 | // ArrayToDataset app = new ArrayToDataset();
8 | // app.start();
9 |
10 | CsvToDatasetHouseToDataframe app = new CsvToDatasetHouseToDataframe();
11 | app.start();
12 |
13 | // WordCount wc = new WordCount();
14 | // wc.start();
15 |
16 | }
17 |
18 |
19 | }
20 |
--------------------------------------------------------------------------------
/project4/src/main/java/com/jobreadyprogrammer/spark/ArrayToDataset.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
3 | import java.io.Serializable;
4 | import java.util.Arrays;
5 | import java.util.List;
6 |
7 | import org.apache.spark.api.java.function.MapFunction;
8 | import org.apache.spark.api.java.function.ReduceFunction;
9 | import org.apache.spark.sql.Dataset;
10 | import org.apache.spark.sql.Encoders;
11 | import org.apache.spark.sql.SparkSession;
12 |
13 | public class ArrayToDataset {
14 |
15 | public void start() {
16 | SparkSession spark = new SparkSession.Builder()
17 | .appName("Array To Dataset")
18 | .master("local")
19 | .getOrCreate();
20 |
21 | String [] stringList = new String[] {"Banana", "Car", "Glass", "Banana", "Computer", "Car"};
22 |
23 | List data = Arrays.asList(stringList);
24 |
25 | Dataset ds = spark.createDataset(data, Encoders.STRING());
26 |
27 | ds = ds.map((MapFunction) row -> "word: " + row, Encoders.STRING());
28 | ds.show(10);
29 |
30 | String stringValue = ds.reduce(new StringReducer());
31 |
32 | System.out.println(stringValue);
33 |
34 | }
35 |
36 |
37 | static class StringReducer implements ReduceFunction, Serializable {
38 |
39 | /**
40 | *
41 | */
42 | private static final long serialVersionUID = 1L;
43 |
44 | @Override
45 | public String call(String v1, String v2) throws Exception {
46 | return v1 + v2;
47 | }
48 |
49 | }
50 |
51 | }
52 |
--------------------------------------------------------------------------------
/project4/src/main/java/com/jobreadyprogrammer/spark/CsvToDatasetHouseToDataframe.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
3 | import org.apache.spark.sql.Dataset;
4 | import org.apache.spark.sql.Encoders;
5 | import org.apache.spark.sql.Row;
6 | import static org.apache.spark.sql.functions.*;
7 | import org.apache.spark.sql.SparkSession;
8 |
9 | import com.jobreadyprogrammer.mappers.HouseMapper;
10 | import com.jobreadyprogrammer.pojos.House;
11 |
12 |
13 | public class CsvToDatasetHouseToDataframe {
14 |
15 | public void start() {
16 |
17 | SparkSession spark = SparkSession.builder()
18 | .appName("CSV to dataframe to Dataset and back")
19 | .master("local")
20 | .getOrCreate();
21 |
22 |
23 | String filename = "src/main/resources/houses.csv";
24 |
25 | Dataset df = spark.read().format("csv")
26 | .option("inferSchema", "true") // Make sure to use string version of true
27 | .option("header", true)
28 | .option("sep", ";")
29 | .load(filename);
30 |
31 | System.out.println("House ingested in a dataframe: ");
32 | // df.show(5);
33 | // df.printSchema();
34 |
35 | Dataset houseDS = df.map(new HouseMapper(), Encoders.bean(House.class));
36 |
37 | System.out.println("*****House ingested in a dataset: *****");
38 |
39 | houseDS.show(5);
40 | houseDS.printSchema();
41 |
42 | Dataset df2 = houseDS.toDF();
43 | df2 = df2.withColumn("formatedDate", concat(df2.col("vacantBy.date"), lit("_"), df2.col("vacantBy.year")));
44 | df2.show(10);
45 | }
46 |
47 |
48 |
49 | }
50 |
51 |
52 |
--------------------------------------------------------------------------------
/project4/src/main/java/com/jobreadyprogrammer/spark/WordCount.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
3 | import org.apache.spark.sql.Dataset;
4 | import org.apache.spark.sql.Encoders;
5 | import org.apache.spark.sql.Row;
6 | import org.apache.spark.sql.SparkSession;
7 |
8 | import com.jobreadyprogrammer.mappers.LineMapper;
9 |
10 | public class WordCount {
11 |
12 | public void start() {
13 |
14 | String boringWords = " ('a', 'an', 'and', 'are', 'as', 'at', 'be', 'but', 'by',\r\n" +
15 | "'for', 'if', 'in', 'into', 'is', 'it',\r\n" +
16 | "'no', 'not', 'of', 'on', 'or', 'such',\r\n" +
17 | "'that', 'the', 'their', 'then', 'there', 'these',\r\n" +
18 | "'they', 'this', 'to', 'was', 'will', 'with', 'he', 'she'," +
19 | "'your', 'you', 'I', "
20 | + " 'i','[',']', '[]', 'his', 'him', 'our', 'we') ";
21 |
22 | SparkSession spark = SparkSession.builder()
23 | .appName("unstructured text to flatmap")
24 | .master("local")
25 | .getOrCreate();
26 |
27 | String filename = "src/main/resources/shakespeare.txt";
28 |
29 | Dataset df = spark.read().format("text")
30 | .load(filename);
31 |
32 | // df.printSchema();
33 | // df.show(10);
34 |
35 | Dataset wordsDS = df.flatMap(new LineMapper(), Encoders.STRING());
36 |
37 | Dataset df2 = wordsDS.toDF();
38 |
39 | df2 = df2.groupBy("value").count();
40 | df2 = df2.orderBy(df2.col("count").desc());
41 | df2 = df2.filter("lower(value) NOT IN " + boringWords);
42 |
43 | df2.show(500);
44 |
45 |
46 | }
47 |
48 |
49 | }
50 |
--------------------------------------------------------------------------------
/project4/src/main/resources/houses.csv:
--------------------------------------------------------------------------------
1 | id;address;sqft;price;vacantBy
1;609 Bayway Rd Virginia Beach, VA 23451;1531;300000.00;2018-10-31
2;3220 Kenmore Rd Richmond, VA 23225;2776;125000.00;2019-04-11
3;400 W 29th St Norfolk, VA 23508;2164;54900.00;2019-02-01
4;3223 Park Ave Richmond, VA 23221;1740;390000.00;2019-03-22
5;3645 Barn Swallow Cir Roanoke, VA 24018;1800;212950.00;2019-08-20
6;3020 Scarsborough Dr Richmond, VA 23235;2340;349000.00;2018-11-22
--------------------------------------------------------------------------------
/project4/target/.gitignore:
--------------------------------------------------------------------------------
1 | /classes/
2 | /test-classes/
3 |
--------------------------------------------------------------------------------
/project5/.gitignore:
--------------------------------------------------------------------------------
1 | /bin/
2 | /target/
3 |
--------------------------------------------------------------------------------
/project5/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | encoding//src/main/java=UTF-8
3 | encoding//src/main/resources=UTF-8
4 | encoding/=UTF-8
5 |
--------------------------------------------------------------------------------
/project5/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate
4 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
5 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
6 | org.eclipse.jdt.core.compiler.compliance=1.8
7 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
8 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
9 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
10 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
11 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
12 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
13 | org.eclipse.jdt.core.compiler.release=disabled
14 | org.eclipse.jdt.core.compiler.source=1.8
15 |
--------------------------------------------------------------------------------
/project5/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 |
--------------------------------------------------------------------------------
/project5/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 | com.jobreadyprogrammer
4 | project5
5 | 0.0.1-SNAPSHOT
6 | jar
7 |
8 |
9 |
10 | UTF-8
11 | 1.8
12 | 2.11
13 | 2.3.1
14 | 1.8
15 | 1.8
16 |
17 |
18 |
19 |
20 |
21 | org.apache.spark
22 | spark-core_${scala.version}
23 | ${spark.version}
24 |
25 |
26 |
27 | org.apache.spark
28 | spark-sql_${scala.version}
29 | ${spark.version}
30 |
31 |
32 | org.slf4j
33 | slf4j-simple
34 |
35 |
36 |
37 |
38 |
39 | org.apache.spark
40 | spark-mllib_${scala.version}
41 | ${spark.version}
42 |
43 |
44 | org.slf4j
45 | slf4j-log4j12
46 |
47 |
48 | org.slf4j
49 | slf4j-simple
50 |
51 |
52 |
53 |
54 |
55 | junit
56 | junit
57 | 4.11
58 | test
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 | org.apache.maven.plugins
68 | maven-dependency-plugin
69 |
70 |
71 | copy-dependencies
72 | prepare-package
73 |
74 | copy-dependencies
75 |
76 |
77 |
78 | ${project.build.directory}/libs
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 | org.springframework.boot
88 | spring-boot-maven-plugin
89 |
90 |
91 |
92 | repackage
93 |
94 |
95 |
96 | com.jobreadyprogrammer.spark.Application
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
--------------------------------------------------------------------------------
/project5/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/.part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project5/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/.part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc
--------------------------------------------------------------------------------
/project5/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project5/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet
--------------------------------------------------------------------------------
/project5/src/main/java/com/jobreadyprogrammer/spark/Application.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
3 | import org.apache.spark.sql.Dataset;
4 | import org.apache.spark.sql.Row;
5 | import org.apache.spark.sql.SparkSession;
6 |
7 | import static org.apache.spark.sql.functions.*;
8 |
9 | public class Application {
10 |
11 | public static void main(String[] args) {
12 |
13 | SparkSession spark = SparkSession.builder()
14 | .appName("Learning Spark SQL Dataframe API")
15 | .master("local")
16 | .getOrCreate();
17 |
18 |
19 | String studentsFile = "src/main/resources/students.csv";
20 |
21 | Dataset studentDf = spark.read().format("csv")
22 | .option("inferSchema", "true") // Make sure to use string version of true
23 | .option("header", true)
24 | .load(studentsFile);
25 |
26 | String gradeChartFile = "src/main/resources/grade_chart.csv";
27 |
28 | Dataset gradesDf = spark.read().format("csv")
29 | .option("inferSchema", "true") // Make sure to use string version of true
30 | .option("header", true)
31 | .load(gradeChartFile);
32 |
33 | Dataset filteredDf = studentDf.join(gradesDf, studentDf.col("GPA").equalTo(gradesDf.col("GPA")))
34 | .filter(gradesDf.col("gpa").gt(3.0).and(gradesDf.col("gpa").lt(4.5))
35 | .or(gradesDf.col("gpa").equalTo(1.0)))
36 | .select("student_name",
37 | "favorite_book_title",
38 | "letter_grade");
39 |
40 |
41 | }
42 |
43 |
44 |
45 | }
46 |
--------------------------------------------------------------------------------
/project5/src/main/java/com/jobreadyprogrammer/spark/CustomersAndProducts.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
3 | import org.apache.spark.sql.Dataset;
4 | import org.apache.spark.sql.Row;
5 | import org.apache.spark.sql.SparkSession;
6 |
7 | import static org.apache.spark.sql.functions.*;
8 |
9 | public class CustomersAndProducts {
10 |
11 | public static void main(String[] args) {
12 |
13 |
14 | SparkSession spark = SparkSession.builder()
15 | .appName("Learning Spark SQL Dataframe API")
16 | .master("local")
17 | .getOrCreate();
18 |
19 | String customers_file = "src/main/resources/customers.csv";
20 |
21 | Dataset customersDf = spark.read().format("csv")
22 | .option("inferSchema", "true") // Make sure to use string version of true
23 | .option("header", true)
24 | .load(customers_file);
25 |
26 | String products_file = "src/main/resources/products.csv";
27 |
28 | Dataset productsDf = spark.read().format("csv")
29 | .option("inferSchema", "true") // Make sure to use string version of true
30 | .option("header", true)
31 | .load(products_file);
32 |
33 | String purchases_file = "src/main/resources/purchases.csv";
34 |
35 |
36 | Dataset purchasesDf = spark.read().format("csv")
37 | .option("inferSchema", "true") // Make sure to use string version of true
38 | .option("header", true)
39 | .load(purchases_file);
40 |
41 | System.out.println(" Loaded all files into Dataframes ");
42 | System.out.println("----------------------------------");
43 |
44 |
45 | Dataset joinedData = customersDf.join(purchasesDf,
46 | customersDf.col("customer_id").equalTo(purchasesDf.col("customer_id")))
47 | .join(productsDf, purchasesDf.col("product_id").equalTo(productsDf.col("product_id")))
48 | .drop("favorite_website").drop(purchasesDf.col("customer_id"))
49 | .drop(purchasesDf.col("product_id")).drop("product_id");
50 |
51 | Dataset aggDf = joinedData.groupBy("first_name", "product_name").agg(
52 | count("product_name").as("number_of_purchases"),
53 | max("product_price").as("most_exp_purchase"),
54 | sum("product_price").as("total_spent")
55 | );
56 |
57 | aggDf = aggDf.drop("number_of_purchases").drop("most_exp_purchase");
58 |
59 | Dataset initialDf = aggDf;
60 |
61 | for(int i = 0; i < 500; i++ ) {
62 | aggDf = aggDf.union(initialDf);
63 | }
64 |
65 | joinedData.collectAsList();
66 |
67 |
68 | }
69 |
70 | }
71 |
--------------------------------------------------------------------------------
/project5/src/main/resources/customers.csv:
--------------------------------------------------------------------------------
1 | customer_id,last_name,first_name,favorite_website
2 | 4000,Jackson,Joe,techonthenet.com
3 | 5000,Smith,Jane,digminecraft.com
4 | 6000,Ferguson,Samantha,bigactivities.com
5 | 7000,Reynolds,Allen,checkyourmath.com
6 | 8000,Anderson,Paige,
7 | 9000,Johnson,Derek,techonthenet.com
--------------------------------------------------------------------------------
/project5/src/main/resources/grade_chart.csv:
--------------------------------------------------------------------------------
1 | gpa,letter_grade
2 | 1.0,F
3 | 1.5,D
4 | 2.0,C
5 | 2.5,C+
6 | 3.0,B
7 | 3.5,B+
8 | 4.0,A
--------------------------------------------------------------------------------
/project5/src/main/resources/products.csv:
--------------------------------------------------------------------------------
1 | product_id,product_name,product_price
2 | 1,Pear,0.95
3 | 2,Banana,0.75
4 | 3,Orange,0.75
5 | 4,Apple,0.85
6 | 5,Bread,2.50
7 | 6,Sliced Ham,3.00
8 | 7,Kleenex,4.00
--------------------------------------------------------------------------------
/project5/src/main/resources/purchases.csv:
--------------------------------------------------------------------------------
1 | customer_id,product_id
2 | 7000,1
3 | 7000,1
4 | 7000,2
5 | 5000,6
6 | 5000,7
7 | 5000,7
8 | 5000,6
9 | 8000,2
10 | 8000,3
11 | 8000,3
12 | 4000,3
13 | 6000,1
14 | 6000,1
15 | 6000,3
16 | 6000,6
--------------------------------------------------------------------------------
/project5/src/main/resources/students.csv:
--------------------------------------------------------------------------------
1 | student_id,student_name,State,GPA,favorite_book_title,working
2 | 1100,Royce Piche,NJ,1.5,To Kill a Mockingbird,TRUE
3 | 1120,Alexis Morriss,NJ,3.0,Pride and Prejudice,FALSE
4 | 1130,Len Tarbell,NJ,3.5,The Diary of Anne Frank,FALSE
5 | 1140,Alejandro Dory,NY,2.5,Harry Potter and the Sorcerer's Stone,FALSE
6 | 1150,Ricky Tremaine,NY,3.0,The Lord of the Rings,TRUE
7 | 1160,Monika Gift,NY,3.0,The Great Gatsby,TRUE
8 | 1170,Kristeen Line,CA,4.0,Animal Farm,FALSE
9 | 1180,Sonia Rickard,CA,4.0,Harry Potter and the Sorcerer's Stone,FALSE
10 | 1190,Dan Iacovelli,CA,3.5,The Hunger Games,FALSE
11 | 1200,Ned Alvin,CA,1.0,,TRUE
12 | 1210,Sidney Ducote,FL,1.5,The Secret Garden,FALSE
13 | 1220,Bobbie Shrader,FL,2.0,The Color Purple,FALSE
--------------------------------------------------------------------------------
/project6/.gitignore:
--------------------------------------------------------------------------------
1 | /bin/
2 | /target/
3 |
--------------------------------------------------------------------------------
/project6/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | encoding//src/main/java=UTF-8
3 | encoding//src/main/resources=UTF-8
4 | encoding/=UTF-8
5 |
--------------------------------------------------------------------------------
/project6/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate
4 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
5 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
6 | org.eclipse.jdt.core.compiler.compliance=1.8
7 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
8 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
9 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
10 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
11 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
12 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
13 | org.eclipse.jdt.core.compiler.release=disabled
14 | org.eclipse.jdt.core.compiler.source=1.8
15 |
--------------------------------------------------------------------------------
/project6/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 |
--------------------------------------------------------------------------------
/project6/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 | com.jobreadyprogrammer
4 | project6
5 | 0.0.1-SNAPSHOT
6 | jar
7 |
8 |
9 |
10 | UTF-8
11 | UTF-8
12 | 1.8
13 | 2.11
14 | 2.3.1
15 | 42.1.4
16 |
17 |
18 |
19 |
20 |
21 |
22 | org.apache.spark
23 | spark-core_${scala.version}
24 | ${spark.version}
25 |
26 |
27 |
28 | org.apache.spark
29 | spark-sql_${scala.version}
30 | ${spark.version}
31 |
32 |
33 |
34 | org.apache.hadoop
35 | hadoop-hdfs
36 | 2.2.0
37 |
38 |
39 |
40 | org.apache.spark
41 | spark-mllib_${scala.version}
42 | ${spark.version}
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 | org.apache.maven.plugins
51 | maven-compiler-plugin
52 | 3.5.1
53 |
54 | 1.8
55 | 1.8
56 |
57 |
58 |
59 |
77 |
78 |
79 |
80 | maven-jar-plugin
81 | 3.0.2
82 |
83 | 1.8
84 | 1.8
85 |
86 |
87 | com.jobreadyprogrammer.spark.Application
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
--------------------------------------------------------------------------------
/project6/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/.part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project6/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/.part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc
--------------------------------------------------------------------------------
/project6/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project6/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet
--------------------------------------------------------------------------------
/project6/src/main/java/com/jobreadyprogrammer/spark/Application.java:
--------------------------------------------------------------------------------
package com.jobreadyprogrammer.spark;

import java.beans.Encoder; // NOTE(review): unused, and the wrong Encoder (java.beans, not Spark) — safe to delete
import java.util.Arrays;

import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.ml.feature.StopWordsRemover; // NOTE(review): unused — stop words are removed via the left-anti join below
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import static org.apache.spark.sql.functions.*;

/**
 * Word count over the "body" field of a Reddit comment dump.
 *
 * Pipeline: read JSON-lines comments -> keep only the comment text ->
 * tokenize into lowercase words -> drop stop words (WordUtils.stopWords)
 * with a left-anti join -> group, count, and print in descending order.
 */
public class Application {

    public static void main(String[] args) {

        SparkSession spark = SparkSession.builder()
                .appName("Learning Spark SQL Dataframe API")
                .master("local") // <--- need to remove this line to run on a live cluster
                .getOrCreate();

        // String redditFile = "s3n://your-bucket-name/Reddit_2011-large";
        String redditFile = "/file/on/your/computer/Reddit_2007-small"; // <- change your file location

        // One JSON object per line; schema is inferred from the data.
        Dataset<Row> redditDf = spark.read().format("json")
                .option("inferSchema", "true") // Make sure to use string version of true
                .option("header", true)
                .load(redditFile);

        // Keep only the comment text, then split each comment into words.
        redditDf = redditDf.select("body");
        Dataset<String> wordsDs = redditDf.flatMap((FlatMapFunction<Row, String>)
                r -> Arrays.asList(r.toString().replace("\n", "").replace("\r", "").trim().toLowerCase()
                        .split(" ")).iterator(),
                Encoders.STRING());

        Dataset<Row> wordsDf = wordsDs.toDF();

        Dataset<Row> boringWordsDf = spark.createDataset(Arrays.asList(WordUtils.stopWords), Encoders.STRING()).toDF();

        // wordsDf = wordsDf.except(boringWordsDf); // <-- This won't work because it removes duplicate words!!

        // Left-anti join keeps every occurrence of each word that is NOT a stop word.
        wordsDf = wordsDf.join(boringWordsDf, wordsDf.col("value").equalTo(boringWordsDf.col("value")), "leftanti");

        wordsDf = wordsDf.groupBy("value").count();
        wordsDf.orderBy(desc("count")).show();
    }

}
--------------------------------------------------------------------------------
/project6/src/main/java/com/jobreadyprogrammer/spark/WordUtils.java:
--------------------------------------------------------------------------------
package com.jobreadyprogrammer.spark;

import java.util.Arrays;
import java.util.LinkedHashSet;

/**
 * Stop-word list used by Application to filter common/noise words out of the
 * Reddit word count (via a left-anti join).
 *
 * The raw list below is two stop-word lists concatenated (a frequency-based
 * list plus an NLTK-style list with contractions and Reddit artifacts such as
 * "[[removed]]"), so it contained many duplicate entries. {@link #stopWords}
 * is now deduplicated at class-initialization time while keeping the original
 * public interface ({@code public static String[] stopWords}) unchanged.
 */
public class WordUtils {

	// Raw, as-curated word list; duplicates are tolerated here and removed below.
	// Includes intentional non-word tokens ("", "[i", "]", "-", "[[deleted]]", ...)
	// that appear in Reddit comment dumps and should be filtered from counts.
	private static final String[] RAW = {
		"a", "ability", "able", "about", "above", "accept", "according", "account", "across", "act",
		"action", "activity", "actually", "add", "address", "administration", "admit", "adult", "affect", "after",
		"again", "against", "age", "agency", "agent", "ago", "agree", "agreement", "ahead", "air",
		"all", "allow", "almost", "alone", "along", "already", "also", "although", "always", "among",
		"amount", "analysis", "and", "animal", "another", "answer", "any", "anyone", "anything", "appear",
		"apply", "approach", "area", "argue", "arm", "around", "arrive", "art", "article", "artist",
		"as", "ask", "assume", "at", "attack", "attention", "attorney", "audience", "author", "authority",
		"available", "avoid", "away", "baby", "back", "bad", "bag", "ball", "bank", "bar",
		"base", "be", "beat", "beautiful", "because", "become", "bed", "before", "begin", "behavior",
		"behind", "believe", "benefit", "best", "better", "between", "beyond", "big", "bill", "billion",
		"bit", "black", "blood", "blue", "board", "body", "book", "born", "both", "box",
		"boy", "break", "bring", "brother", "budget", "build", "building", "business", "but", "buy",
		"by", "call", "camera", "can", "cancer", "capital", "car", "card", "care", "career",
		"carry", "case", "catch", "cause", "cell", "center", "central", "century", "certain", "certainly",
		"chair", "challenge", "chance", "change", "character", "charge", "check", "child", "choice", "choose",
		"church", "citizen", "city", "civil", "claim", "class", "clear", "clearly", "close", "coach",
		"cold", "collection", "college", "color", "come", "commercial", "common", "community", "company", "compare",
		"computer", "concern", "condition", "conference", "Congress", "consider", "consumer", "contain", "continue", "control",
		"cost", "could", "country", "couple", "course", "court", "cover", "create", "cultural", "culture",
		"cup", "current", "customer", "cut", "dark", "data", "daughter", "day", "dead", "deal",
		"death", "debate", "decade", "decide", "decision", "deep", "defense", "degree", "describe", "design",
		"despite", "detail", "determine", "develop", "development", "die", "difference", "different", "difficult", "dinner",
		"direction", "director", "discover", "discuss", "discussion", "disease", "do", "doctor", "dog", "door",
		"down", "draw", "dream", "drive", "drop", "drug", "during", "each", "early", "east",
		"easy", "eat", "edge", "education", "effect", "effort", "eight", "either", "election", "else",
		"employee", "end", "energy", "enjoy", "enough", "enter", "entire", "environment", "environmental", "especially",
		"establish", "even", "evening", "event", "ever", "every", "everybody", "everyone", "everything", "evidence",
		"exactly", "example", "executive", "exist", "expect", "experience", "expert", "explain", "eye", "face",
		"fact", "factor", "fail", "fall", "family", "far", "fast", "father", "fear", "federal",
		"feel", "feeling", "few", "field", "fight", "figure", "fill", "film", "final", "finally",
		"financial", "find", "fine", "finger", "finish", "fire", "firm", "first", "fish", "five",
		"floor", "fly", "focus", "follow", "food", "foot", "for", "force", "foreign", "forget",
		"form", "former", "forward", "four", "free", "friend", "from", "front", "full", "fund",
		"future", "game", "garden", "gas", "general", "generation", "get", "girl", "give", "glass",
		"go", "goal", "good", "great", "green", "ground", "group", "grow", "growth", "guess",
		"gun", "guy", "hair", "half", "hand", "hang", "happen", "happy", "hard", "have",
		"he", "head", "health", "hear", "heart", "heat", "heavy", "help", "her", "here",
		"herself", "high", "him", "himself", "his", "history", "hit", "hold", "home", "hope",
		"hospital", "hot", "hotel", "hour", "house", "how", "however", "huge", "human", "hundred",
		"husband", "I", "idea", "identify", "if", "image", "imagine", "impact", "important", "improve",
		"in", "include", "including", "increase", "indeed", "indicate", "individual", "industry", "information", "inside",
		"instead", "institution", "interest", "interesting", "international", "interview", "into", "investment", "involve", "issue",
		"it", "item", "its", "itself", "job", "join", "just", "keep", "key", "kid",
		"kill", "kind", "kitchen", "know", "knowledge", "land", "language", "large", "last", "late",
		"later", "laugh", "law", "lawyer", "lay", "lead", "leader", "learn", "least", "leave",
		"left", "leg", "legal", "less", "let", "letter", "level", "lie", "life", "light",
		"like", "likely", "line", "list", "listen", "little", "live", "local", "long", "look",
		"lose", "loss", "lot", "love", "low", "machine", "magazine", "main", "maintain", "major",
		"majority", "make", "man", "manage", "management", "manager", "many", "market", "marriage", "material",
		"matter", "may", "maybe", "me", "mean", "measure", "media", "medical", "meet", "meeting",
		"member", "memory", "mention", "message", "method", "middle", "might", "million", "mind", "minute",
		"miss", "mission", "model", "modern", "moment", "money", "month", "more", "morning", "most",
		"mother", "mouth", "move", "movement", "movie", "Mr", "Mrs", "much", "music", "must",
		"my", "myself", "name", "nation", "national", "natural", "nature", "near", "nearly", "necessary",
		"need", "network", "never", "new", "news", "newspaper", "next", "nice", "night", "no",
		"none", "nor", "north", "not", "note", "nothing", "notice", "now", "n't", "number",
		"occur", "of", "off", "offer", "office", "officer", "official", "often", "oh", "oil",
		"ok", "old", "on", "once", "one", "only", "onto", "open", "operation", "opportunity",
		"option", "or", "order", "organization", "other", "others", "our", "out", "outside", "over",
		"own", "owner", "page", "pain", "painting", "paper", "parent", "part", "participant", "particular",
		"particularly", "partner", "party", "pass", "past", "patient", "pattern", "pay", "peace", "people",
		"per", "perform", "performance", "perhaps", "period", "person", "personal", "phone", "physical", "pick",
		"picture", "piece", "fucking", "place", "plan", "plant", "play", "player", "PM", "point",
		"police", "policy", "political", "politics", "poor", "popular", "population", "position", "positive", "possible",
		"power", "practice", "prepare", "present", "president", "pressure", "pretty", "prevent", "price", "private",
		"probably", "problem", "process", "shit", "produce", "product", "production", "professional", "professor", "program",
		"project", "property", "protect", "prove", "provide", "public", "pull", "fuck", "purpose", "push",
		"put", "quality", "question", "quickly", "quite", "race", "radio", "raise", "range", "rate",
		"rather", "reach", "read", "ready", "real", "reality", "realize", "really", "reason", "receive",
		"recent", "recently", "recognize", "record", "red", "reduce", "reflect", "region", "relate", "relationship",
		"religious", "remain", "remember", "remove", "report", "represent", "Republican", "require", "research", "resource",
		"respond", "response", "responsibility", "rest", "result", "return", "reveal", "rich", "right", "rise",
		"risk", "road", "rock", "role", "room", "rule", "run", "safe", "same", "save",
		"say", "scene", "school", "score", "sea", "season", "seat", "second", "section", "security",
		"see", "seek", "seem", "sell", "send", "senior", "sense", "series", "serious", "serve",
		"service", "set", "seven", "several", "sex", "sexual", "shake", "share", "she", "shoot",
		"short", "shot", "should", "shoulder", "show", "side", "sign", "significant", "similar", "simple",
		"simply", "since", "sing", "single", "sister", "sit", "site", "situation", "six", "size",
		"skill", "skin", "small", "smile", "so", "social", "society", "soldier", "some", "somebody",
		"someone", "something", "sometimes", "son", "song", "soon", "sort", "sound", "source", "south",
		"southern", "space", "speak", "special", "specific", "speech", "spend", "sport", "spring", "staff",
		"stage", "stand", "standard", "star", "start", "state", "statement", "station", "stay", "step",
		"still", "stock", "stop", "store", "story", "strategy", "street", "strong", "structure", "student",
		"study", "stuff", "style", "subject", "success", "successful", "such", "suddenly", "suffer", "suggest",
		"summer", "support", "sure", "surface", "system", "table", "take", "talk", "task", "tax",
		"teach", "teacher", "team", "technology", "television", "tell", "ten", "tend", "term", "test",
		"than", "thank", "that", "the", "their", "them", "themselves", "then", "theory", "there",
		"these", "they", "thing", "think", "third", "this", "those", "though", "thought", "thousand",
		"threat", "three", "through", "throughout", "throw", "thus", "time", "to", "today", "together",
		"tonight", "too", "top", "total", "tough", "toward", "town", "trade", "traditional", "training",
		"travel", "treat", "treatment", "tree", "trial", "trip", "trouble", "true", "truth", "try",
		"turn", "TV", "two", "type", "under", "understand", "unit", "until", "up", "upon",
		"us", "use", "usually", "value", "various", "very", "victim", "view", "violence", "visit",
		"voice", "vote", "wait", "walk", "wall", "want", "war", "watch", "water", "way",
		"we", "weapon", "wear", "week", "weight", "well", "west", "western", "what", "whatever",
		"when", "where", "whether", "which", "while", "white", "who", "whole", "whom", "whose",
		"why", "wide", "wife", "will", "win", "wind", "window", "wish", "with", "within",
		"without", "woman", "wonder", "word", "work", "worker", "world", "worry", "would", "write",
		"writer", "wrong", "yard", "yeah", "year", "yes", "yet", "you", "young", "your",
		"yourself",
		// Second (NLTK-style) list — overlaps heavily with the list above.
		"a", "about", "above", "after", "again", "against", "all", "am", "an",
		"and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below",
		"between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during",
		"each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd",
		"he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his",
		"how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's",
		"its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once",
		"only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same",
		"she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's",
		"the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they",
		"they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under",
		"until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what",
		"what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
		"why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your",
		"yours", "yourself", "yourselves", "[[removed]]", "[[deleted]]", "doesn't", "don't", "not",
		"just", "The", "can", "can't", "no", "",
		// Reddit-specific artifacts and misc additions.
		"[i", "]", "-", "going", "[", "[the", "[>", "didn't", "isn't", "things",
		"it.", "got", "said", "years", "used", "made", "makes", "paul", "it,", "[you",
		"saying", "getting", "[this", "ron", "using", "seems", "trying", "making", "reddit",
		"wouldn't", "won't", "wasn't", "[i'm"};

	// Public interface preserved (same name and type). Deduplicated with a
	// LinkedHashSet so first-occurrence order is kept and downstream joins/
	// broadcasts don't carry redundant rows.
	public static String[] stopWords =
			new LinkedHashSet<>(Arrays.asList(RAW)).toArray(new String[0]);
}
--------------------------------------------------------------------------------
/project7/.gitignore:
--------------------------------------------------------------------------------
1 | /bin/
2 | /target/
3 |
--------------------------------------------------------------------------------
/project7/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | encoding//src/main/java=UTF-8
3 | encoding//src/main/resources=UTF-8
4 | encoding/=UTF-8
5 |
--------------------------------------------------------------------------------
/project7/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate
4 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
5 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
6 | org.eclipse.jdt.core.compiler.compliance=1.8
7 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
8 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
9 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
10 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
11 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
12 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
13 | org.eclipse.jdt.core.compiler.release=disabled
14 | org.eclipse.jdt.core.compiler.source=1.8
15 |
--------------------------------------------------------------------------------
/project7/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 |
--------------------------------------------------------------------------------
/project7/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 | com.jobreadyprogrammer
4 | project7
5 | 0.0.1-SNAPSHOT
6 | jar
7 |
8 |
9 |
10 | UTF-8
11 | UTF-8
12 | 1.8
13 | 2.11
14 | 2.3.0
15 | 42.1.4
16 |
17 |
18 |
19 |
20 |
21 |
22 | org.apache.spark
23 | spark-core_${scala.version}
24 | ${spark.version}
25 |
26 |
27 |
28 |
29 | org.apache.spark
30 | spark-sql_${scala.version}
31 | ${spark.version}
32 |
33 |
34 |
35 | org.apache.hadoop
36 | hadoop-hdfs
37 | 2.2.0
38 |
39 |
40 |
41 | org.apache.spark
42 | spark-mllib_${scala.version}
43 | ${spark.version}
44 |
45 |
46 |
47 | org.apache.spark
48 | spark-sql-kafka-0-10_2.11
49 | 2.3.0
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 | org.apache.maven.plugins
60 | maven-compiler-plugin
61 | 3.5.1
62 |
63 | 1.8
64 | 1.8
65 |
66 |
67 |
68 |
86 |
87 |
88 |
89 | maven-jar-plugin
90 | 3.0.2
91 |
92 | 1.8
93 | 1.8
94 |
95 |
96 | com.jobreadyprogrammer.spark.Application
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
--------------------------------------------------------------------------------
/project7/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/.part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project7/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/.part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc
--------------------------------------------------------------------------------
/project7/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project7/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet
--------------------------------------------------------------------------------
/project7/src/main/java/com/jobreadyprogrammer/spark/StreamingFileDirectoryApplication.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
3 | import org.apache.spark.sql.Dataset;
4 | import org.apache.spark.sql.Row;
5 | import org.apache.spark.sql.SparkSession;
6 | import org.apache.spark.sql.streaming.StreamingQuery;
7 | import org.apache.spark.sql.streaming.StreamingQueryException;
8 | import org.apache.spark.sql.types.StructType;
9 | import static org.apache.spark.sql.functions.*;
10 |
11 | public class StreamingFileDirectoryApplication {
12 |
13 | public static void main(String[] args) throws StreamingQueryException {
14 |
15 |
16 | SparkSession spark = SparkSession.builder()
17 | .appName("StreamingFileDirectoryWordCount")
18 | .master("local")
19 | .getOrCreate();
20 |
21 | // Read all the csv files written atomically in a directory
22 | StructType userSchema = new StructType().add("date", "string").add("value", "float");
23 |
24 | Dataset stockData = spark
25 | .readStream()
26 | .option("sep", ",")
27 | .schema(userSchema) // Specify schema of the csv files
28 | .csv("/Users/imtiazahmad/Desktop/SparkCourse/data/IncomingStockFiles"); // Equivalent to format("csv").load("/path/to/directory")
29 |
30 |
31 | Dataset resultDf = stockData.groupBy("date").agg(avg(stockData.col("value")));
32 |
33 | StreamingQuery query = resultDf.writeStream()
34 | .outputMode("complete")
35 | .format("console")
36 | .start();
37 |
38 | query.awaitTermination();
39 |
40 | }
41 |
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/project7/src/main/java/com/jobreadyprogrammer/spark/StreamingKafkaConsumer.java:
--------------------------------------------------------------------------------
package com.jobreadyprogrammer.spark;

import java.util.Arrays;
import java.util.Properties; // NOTE(review): unused — safe to delete
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode; // NOTE(review): unused — safe to delete
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;

/**
 * Structured Streaming word count over a Kafka topic.
 *
 * Subscribes to topic "test" on localhost:9092, splits each message value on
 * spaces, and prints a running count per word to the console.
 * Requires a local Kafka broker to be running.
 */
public class StreamingKafkaConsumer {

    public static void main(String[] args) throws StreamingQueryException {

        SparkSession spark = SparkSession.builder()
                .appName("StreamingKafkaConsumer")
                .master("local")
                .getOrCreate();

        // Kafka consumer: each record arrives as binary key/value; cast the
        // value to a string column so it can be processed as text.
        Dataset<Row> messagesDf = spark.readStream()
                .format("kafka")
                .option("kafka.bootstrap.servers", "localhost:9092")
                .option("subscribe", "test")
                .load()
                .selectExpr("CAST(value AS STRING)"); // lines.selectExpr("CAST key AS STRING", "CAST value AS STRING") For key value

        // messagesDf.show() // <-- Can't do this when streaming!
        Dataset<String> words = messagesDf
                .as(Encoders.STRING())
                .flatMap((FlatMapFunction<String, String>) x -> Arrays.asList(x.split(" ")).iterator(), Encoders.STRING());

        // Running count per word across the whole stream.
        Dataset<Row> wordCounts = words.groupBy("value").count();

        // "complete" mode re-emits the full counts table on every trigger.
        StreamingQuery query = wordCounts.writeStream()
                .outputMode("complete")
                .format("console")
                .start();

        // Block forever (until the query is stopped or fails).
        query.awaitTermination();
    }

}
--------------------------------------------------------------------------------
/project7/src/main/java/com/jobreadyprogrammer/spark/StreamingSocketApplication.java:
--------------------------------------------------------------------------------
package com.jobreadyprogrammer.spark;

import java.util.Arrays;

import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;

/**
 * Structured Streaming word count over a TCP socket.
 *
 * Reads lines from localhost:9999, splits each line on spaces, and prints
 * newly-arriving word counts to the console ("append" output mode).
 */
public class StreamingSocketApplication {

    public static void main(String[] args) throws StreamingQueryException {

        // First start a socket connection at 9999 using this: nc -lk 9999
        SparkSession spark = SparkSession.builder()
                .appName("StreamingSocketWordCount")
                .master("local")
                .getOrCreate();

        // Create DataFrame representing the stream of input lines from connection to localhost:9999
        Dataset<Row> lines = spark
                .readStream()
                .format("socket")
                .option("host", "localhost")
                .option("port", 9999)
                .load();

        // One output row per whitespace-separated token.
        Dataset<String> words = lines
                .as(Encoders.STRING())
                .flatMap((FlatMapFunction<String, String>) x -> Arrays.asList(x.split(" ")).iterator(), Encoders.STRING());

        Dataset<Row> wordCounts = words.groupBy("value").count();

        // NOTE(review): "append" with a streaming aggregation only emits rows once
        // a watermark finalizes them; with no watermark here, Spark versions that
        // enforce this will reject the query — "complete" is the usual choice.
        StreamingQuery query = wordCounts.writeStream()
                .outputMode("append")
                .format("console")
                .start();

        // Block forever (until the query is stopped or fails).
        query.awaitTermination();
    }

}
--------------------------------------------------------------------------------
/project8/.gitignore:
--------------------------------------------------------------------------------
1 | /bin/
2 | /target/
3 |
--------------------------------------------------------------------------------
/project8/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | encoding//src/main/java=UTF-8
3 | encoding//src/main/resources=UTF-8
4 | encoding/=UTF-8
5 |
--------------------------------------------------------------------------------
/project8/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate
4 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
5 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
6 | org.eclipse.jdt.core.compiler.compliance=1.8
7 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
8 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
9 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
10 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
11 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
12 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
13 | org.eclipse.jdt.core.compiler.release=disabled
14 | org.eclipse.jdt.core.compiler.source=1.8
15 |
--------------------------------------------------------------------------------
/project8/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 |
--------------------------------------------------------------------------------
/project8/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 | com.jobreadyprogrammer
4 | project8
5 | 0.0.1-SNAPSHOT
6 | jar
7 |
8 |
9 |
10 | UTF-8
11 | UTF-8
12 | 1.8
13 | 2.11
14 | 2.3.0
15 | 42.1.4
16 |
17 |
18 |
19 |
20 |
21 |
22 | org.apache.spark
23 | spark-core_${scala.version}
24 | ${spark.version}
25 |
26 |
27 |
28 |
29 | org.apache.spark
30 | spark-sql_${scala.version}
31 | ${spark.version}
32 |
33 |
34 |
35 | org.apache.hadoop
36 | hadoop-hdfs
37 | 2.2.0
38 |
39 |
40 |
41 | org.apache.spark
42 | spark-mllib_${scala.version}
43 | ${spark.version}
44 |
45 |
46 |
47 | org.apache.spark
48 | spark-sql-kafka-0-10_2.11
49 | 2.3.0
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 | org.apache.maven.plugins
60 | maven-compiler-plugin
61 | 3.5.1
62 |
63 | 1.8
64 | 1.8
65 |
66 |
67 |
68 |
86 |
87 |
88 |
89 | maven-jar-plugin
90 | 3.0.2
91 |
92 | 1.8
93 | 1.8
94 |
95 |
96 | com.jobreadyprogrammer.spark.Application
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
--------------------------------------------------------------------------------
/project8/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/.part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project8/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/.part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet.crc
--------------------------------------------------------------------------------
/project8/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project8/spark-warehouse/grades_view_perm/_temporary/0/_temporary/attempt_20180912224126_0002_m_000000_0/part-00000-71756340-146d-4fb3-8e31-a959a9a52fcc-c000.snappy.parquet
--------------------------------------------------------------------------------
/project8/src/main/java/com/jobreadyprogrammer/spark/FlatMapAndFilterRddApp.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
// Placeholder for a flatMap-and-filter RDD example; no implementation yet.
public class FlatMapAndFilterRddApp {

}
6 |
--------------------------------------------------------------------------------
/project8/src/main/java/com/jobreadyprogrammer/spark/JoinRddApp.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
/** Placeholder for an RDD join example; the demo has not been written yet. */
public class JoinRddApp {

    /** Entry point — intentionally empty until the join example is added. */
    public static void main(String[] args) {
        // No-op.
    }

}
11 |
--------------------------------------------------------------------------------
/project8/src/main/java/com/jobreadyprogrammer/spark/MapAndReduceRddApp.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | import org.apache.spark.SparkConf;
7 | import org.apache.spark.api.java.JavaRDD;
8 | import org.apache.spark.api.java.JavaSparkContext;
9 |
10 | public class MapAndReduceRddApp {
11 |
12 | public static void main(String[] args) {
13 |
14 | List inputData = new ArrayList<>();
15 |
16 | inputData.add(9.00);
17 | inputData.add(4.00);
18 | inputData.add(83.00);
19 | inputData.add(142.00);
20 | inputData.add(75.00);
21 | inputData.add(25.00);
22 |
23 | SparkConf conf = new SparkConf().setAppName("RddMapReduce").setMaster("local[*]");
24 | JavaSparkContext sc = new JavaSparkContext(conf);
25 |
26 | JavaRDD myRdd = sc.parallelize(inputData);
27 |
28 | // Map function:
29 | JavaRDD squareRootRdd = myRdd.map(v -> Math.sqrt(v));
30 | squareRootRdd.foreach(v -> System.out.println(v)); // System.out::println
31 |
32 | // count number of elements using map and reduce functions
33 | JavaRDD counterRdd = squareRootRdd.map(v1 -> 1);
34 | int count = counterRdd.reduce((v1, v2) -> v1 + v2);
35 | System.out.println(count);
36 |
37 | }
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/project8/src/main/java/com/jobreadyprogrammer/spark/TupleAndPairRddApp.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | import org.apache.spark.SparkConf;
7 | import org.apache.spark.api.java.JavaPairRDD;
8 | import org.apache.spark.api.java.JavaRDD;
9 | import org.apache.spark.api.java.JavaSparkContext;
10 |
11 | import scala.Tuple2;
12 | import scala.Tuple3;
13 |
14 | public class TupleAndPairRddApp {
15 |
16 | public static void main(String[] args) {
17 |
18 | // List inputData = new ArrayList<>();
19 | //
20 | // inputData.add(10);
21 | // inputData.add(20);
22 | // inputData.add(142);
23 | // inputData.add(49);
24 | // inputData.add(25);
25 | // inputData.add(16);
26 |
27 |
28 |
29 | SparkConf conf = new SparkConf().setAppName("tupleExample").setMaster("local");
30 | JavaSparkContext sc = new JavaSparkContext(conf);
31 | ////
32 | // JavaRDD inputNumbersRdd = sc.parallelize(inputData);
33 |
34 | // JavaRDD twoColumnRdd = inputNumbersRdd.map(v -> new IntegerWithRoot(v));
35 | // twoColumnRdd.foreach(v -> System.out.println(v.number + ", " + v.squareRoot));
36 |
37 | // JavaRDD> squaredTupleRdd = inputNumbersRdd.map(
38 | // v -> new Tuple3<>(v, Math.sqrt(v), "This is 3rd arg")
39 | // );
40 | // squaredTupleRdd.foreach(v -> System.out.println(v));
41 |
42 |
43 | List inputData = new ArrayList<>();
44 |
45 | inputData.add("WARN: client stopped connection");
46 | inputData.add("FATAL: GET request failed");
47 | inputData.add("WARN: client stopped connection");
48 | inputData.add("ERROR: Incorrect URL");
49 | inputData.add("ERROR: POST request failed");
50 | inputData.add("FATAL: File does not exist");
51 | inputData.add("ERROR: File does not exist");
52 |
53 | JavaRDD logRdd = sc.parallelize(inputData);
54 |
55 | JavaPairRDD pairRdd = logRdd.mapToPair(v -> {
56 | String [] columns = v.split(":");
57 | String logLevel = columns[0];
58 | String message = columns[1];
59 |
60 | return new Tuple2(logLevel, 1L);
61 | });
62 |
63 | JavaPairRDD logLevelCountsRdd = pairRdd.reduceByKey((v1, v2) -> v1+ v2);
64 | logLevelCountsRdd.foreach(v -> System.out.println(v._1 + ": " + v._2));
65 |
66 |
67 | }
68 |
69 | }
70 |
71 | class IntegerWithRoot{
72 |
73 | int number;
74 | double squareRoot;
75 |
76 | public IntegerWithRoot(int i) {
77 | this.number = i;
78 | this.squareRoot = Math.sqrt(i);
79 | }
80 | }
81 |
82 |
--------------------------------------------------------------------------------
/project9/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 | com.jobreadyprogrammer
4 | project9
5 | 0.0.1-SNAPSHOT
6 | jar
7 |
8 |
9 |
10 | UTF-8
11 | UTF-8
12 | 1.8
13 | 2.11
14 | 2.3.0
15 | 42.1.4
16 |
17 |
18 |
19 |
20 |
21 |
22 | org.apache.spark
23 | spark-core_${scala.version}
24 | ${spark.version}
25 |
26 |
27 |
28 |
29 | org.apache.spark
30 | spark-sql_${scala.version}
31 | ${spark.version}
32 |
33 |
34 |
35 | org.apache.hadoop
36 | hadoop-hdfs
37 | 2.2.0
38 |
39 |
40 |
41 | org.apache.spark
42 | spark-mllib_${scala.version}
43 | ${spark.version}
44 |
45 |
46 |
47 | org.apache.spark
48 | spark-sql-kafka-0-10_2.11
49 | 2.3.0
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 | org.apache.maven.plugins
61 | maven-dependency-plugin
62 |
63 |
64 | copy-dependencies
65 | prepare-package
66 |
67 | copy-dependencies
68 |
69 |
70 |
71 | ${project.build.directory}/libs
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 | org.springframework.boot
81 | spring-boot-maven-plugin
82 |
83 |
84 |
85 | repackage
86 |
87 |
88 |
89 | com.jobreadyprogrammer.spark.Application
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
--------------------------------------------------------------------------------
/project9/src/main/java/com/jobreadyprogrammer/spark/KmeansClustering.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
3 | import org.apache.log4j.Level;
4 | import org.apache.log4j.Logger;
5 | import org.apache.spark.ml.clustering.KMeans;
6 | import org.apache.spark.ml.clustering.KMeansModel;
7 | import org.apache.spark.ml.feature.VectorAssembler;
8 | import org.apache.spark.sql.Dataset;
9 | import org.apache.spark.sql.Row;
10 | import org.apache.spark.sql.SparkSession;
11 |
12 | public class KmeansClustering {
13 |
14 | public static void main(String[] args) {
15 |
16 | Logger.getLogger("org").setLevel(Level.ERROR);
17 | Logger.getLogger("akka").setLevel(Level.ERROR);
18 |
19 | SparkSession spark = new SparkSession.Builder()
20 | .appName("kmeans Clustering")
21 | .master("local")
22 | .getOrCreate();
23 |
24 | Dataset wholeSaleDf = spark.read()
25 | .option("header", "true")
26 | .option("inferSchema", "true")
27 | .format("csv")
28 | .load("/Users/imtiazahmad/Desktop/SparkCourse/data/Wholesale customers data.csv");
29 | wholeSaleDf.show();
30 | Dataset featuresDf = wholeSaleDf.select("channel", "fresh", "milk", "grocery", "frozen", "detergents_paper", "delicassen");
31 |
32 | VectorAssembler assembler = new VectorAssembler();
33 | assembler = assembler.setInputCols(new String[] {"channel", "fresh", "milk", "grocery", "frozen", "detergents_paper", "delicassen"})
34 | .setOutputCol("features");
35 |
36 | Dataset trainingData = assembler.transform(featuresDf).select("features");
37 |
38 | KMeans kmeans = new KMeans().setK(10);
39 |
40 | KMeansModel model = kmeans.fit(trainingData);
41 |
42 | System.out.println(model.computeCost(trainingData));
43 | model.summary().predictions().show();
44 |
45 |
46 | }
47 |
48 | }
49 |
--------------------------------------------------------------------------------
/project9/src/main/java/com/jobreadyprogrammer/spark/LinearMarketingVsSales.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
3 | import org.apache.log4j.Level;
4 | import org.apache.log4j.Logger;
5 | import org.apache.spark.ml.feature.VectorAssembler;
6 | import org.apache.spark.ml.regression.LinearRegression;
7 | import org.apache.spark.ml.regression.LinearRegressionModel;
8 | import org.apache.spark.sql.Dataset;
9 | import org.apache.spark.sql.Row;
10 | import org.apache.spark.sql.SparkSession;
11 |
12 | public class LinearMarketingVsSales {
13 |
14 | public static void main(String[] args) {
15 |
16 | Logger.getLogger("org").setLevel(Level.ERROR);
17 | Logger.getLogger("akka").setLevel(Level.ERROR);
18 |
19 | SparkSession spark = new SparkSession.Builder()
20 | .appName("LinearRegressionExample")
21 | .master("local")
22 | .getOrCreate();
23 |
24 | Dataset markVsSalesDf = spark.read()
25 | .option("header", "true")
26 | .option("inferSchema", "true")
27 | .format("csv")
28 | .load("/Users/imtiazahmad/Desktop/SparkCourse/data/marketing_vs_sales.csv");
29 | markVsSalesDf.show();
30 |
31 | // go through the lecture first and then start un-commenting the code below
32 | /**
33 | Dataset mldf = markVsSalesDf.withColumnRenamed("sales", "label")
34 | .select("label", "marketing_spend","bad_day");
35 |
36 | String[] featureColumns = {"marketing_spend", "bad_day"};
37 |
38 | VectorAssembler assember = new VectorAssembler()
39 | .setInputCols(featureColumns)
40 | .setOutputCol("features");
41 |
42 | Dataset lblFeaturesDf = assember.transform(mldf).select("label", "features");
43 | lblFeaturesDf = lblFeaturesDf.na().drop();
44 | lblFeaturesDf.show();
45 |
46 | // next we need to create a linear regression model object
47 | LinearRegression lr = new LinearRegression();
48 | LinearRegressionModel learningModel = lr.fit(lblFeaturesDf);
49 |
50 | learningModel.summary().predictions().show();
51 |
52 | System.out.println("R Squared: "+ learningModel.summary().r2());
53 |
54 | **/
55 |
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/project9/src/main/java/com/jobreadyprogrammer/spark/LinearMpgRegression.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
3 | import org.apache.log4j.Level;
4 | import org.apache.log4j.Logger;
5 | import org.apache.spark.ml.Pipeline;
6 | import org.apache.spark.ml.PipelineModel;
7 | import org.apache.spark.ml.PipelineStage;
8 | import org.apache.spark.ml.feature.VectorAssembler;
9 | import org.apache.spark.ml.regression.LinearRegression;
10 | import org.apache.spark.ml.regression.LinearRegressionModel;
11 | import org.apache.spark.sql.Dataset;
12 | import org.apache.spark.sql.Row;
13 | import org.apache.spark.sql.SparkSession;
14 |
15 | public class LinearMpgRegression {
16 |
17 | public static void main(String[] args) {
18 |
19 | Logger.getLogger("org").setLevel(Level.ERROR);
20 | Logger.getLogger("akka").setLevel(Level.ERROR);
21 |
22 | SparkSession spark = new SparkSession.Builder()
23 | .appName("LinearRegressionMpgExample")
24 | .master("local")
25 | .getOrCreate();
26 |
27 | Dataset autoMpgDf = spark.read()
28 | .option("header", "true")
29 | .option("inferSchema", "true")
30 | .format("csv")
31 | .load("/Users/imtiazahmad/Desktop/SparkCourse/data/auto_mpg.csv");
32 |
33 | autoMpgDf = autoMpgDf.withColumnRenamed("mpg", "label")
34 | .drop("acceleration")
35 | .drop("modelYear")
36 | .drop("origin")
37 | .drop("carName")
38 | .drop("displacement");
39 |
40 | autoMpgDf = autoMpgDf.na().drop();
41 |
42 | String[] featureColumns = {"cylinders", "horsePower", "weight"};
43 |
44 | VectorAssembler assembler = new VectorAssembler()
45 | .setInputCols(featureColumns)
46 | .setOutputCol("features");
47 |
48 | autoMpgDf = assembler.transform(autoMpgDf).select("label", "features");
49 |
50 | LinearRegression lr = new LinearRegression();
51 | LinearRegressionModel lrm = lr.fit(autoMpgDf);
52 |
53 | Pipeline pl = new Pipeline()
54 | .setStages(new PipelineStage[] {lrm});
55 |
56 | Dataset [] splitData = autoMpgDf.randomSplit(new double[] {0.7, 0.3});
57 |
58 | Dataset trainingData = splitData[0];
59 | Dataset testData = splitData[1];
60 |
61 | PipelineModel model = pl.fit(trainingData);
62 |
63 | Dataset result = model.transform(testData);
64 | result.show();
65 |
66 | }
67 |
68 |
69 |
70 | }
71 |
72 |
73 |
--------------------------------------------------------------------------------
/project9/src/main/java/com/jobreadyprogrammer/spark/LogisticRegressionExample.java:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 |
3 | import org.apache.log4j.Level;
4 | import org.apache.log4j.Logger;
5 | import org.apache.spark.ml.Pipeline;
6 | import org.apache.spark.ml.PipelineModel;
7 | import org.apache.spark.ml.PipelineStage;
8 | import org.apache.spark.ml.classification.LogisticRegression;
9 | import org.apache.spark.ml.feature.StringIndexer;
10 | import org.apache.spark.ml.feature.VectorAssembler;
11 | import org.apache.spark.sql.Dataset;
12 | import org.apache.spark.sql.Row;
13 | import org.apache.spark.sql.SparkSession;
14 |
15 | public class LogisticRegressionExample {
16 |
17 | public static void main(String[] args) {
18 |
19 | Logger.getLogger("org").setLevel(Level.ERROR);
20 | Logger.getLogger("akka").setLevel(Level.ERROR);
21 |
22 | SparkSession spark = new SparkSession.Builder()
23 | .appName("LogisticRegressionExample")
24 | .master("local")
25 | .getOrCreate();
26 |
27 | Dataset treatmentDf = spark.read()
28 | .option("header", "true")
29 | .option("inferSchema", "true")
30 | .format("csv")
31 | .load("/Users/imtiazahmad/Desktop/SparkCourse/data/cryotherapy.csv");
32 |
33 | Dataset lblFeatureDf = treatmentDf.withColumnRenamed("Result_of_Treatment", "label")
34 | .select("label", "sex","age","time","number_of_warts","type","area");
35 |
36 | lblFeatureDf = lblFeatureDf.na().drop();
37 |
38 | StringIndexer genderIndexer = new StringIndexer()
39 | .setInputCol("sex").setOutputCol("sexIndex");
40 |
41 | VectorAssembler assembler = new VectorAssembler()
42 | .setInputCols(new String [] {"sexIndex", "age", "time", "number_of_warts", "type", "area"})
43 | .setOutputCol("features");
44 |
45 |
46 | Dataset [] splitData = lblFeatureDf.randomSplit(new double[] {.7, .3});
47 | Dataset trainingDf = splitData[0];
48 | Dataset testingDf = splitData[1];
49 |
50 | LogisticRegression logReg = new LogisticRegression();
51 |
52 | Pipeline pl = new Pipeline();
53 | pl.setStages(new PipelineStage [] {genderIndexer, assembler, logReg});
54 |
55 | PipelineModel model = pl.fit(trainingDf);
56 | Dataset results = model.transform(testingDf);
57 |
58 | results.show();
59 |
60 | }
61 |
62 | }
63 |
--------------------------------------------------------------------------------
/project9/target/classes/com/jobreadyprogrammer/spark/KmeansClustering.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project9/target/classes/com/jobreadyprogrammer/spark/KmeansClustering.class
--------------------------------------------------------------------------------
/project9/target/classes/com/jobreadyprogrammer/spark/LinearMarketingVsSales.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project9/target/classes/com/jobreadyprogrammer/spark/LinearMarketingVsSales.class
--------------------------------------------------------------------------------
/project9/target/classes/com/jobreadyprogrammer/spark/LinearMpgRegression.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project9/target/classes/com/jobreadyprogrammer/spark/LinearMpgRegression.class
--------------------------------------------------------------------------------
/project9/target/classes/com/jobreadyprogrammer/spark/LogisticRegressionExample.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imtiazahmad007/sparkwithjava/8816e7d62dc7f0f5a0c9bf077fca709714a51dbb/project9/target/classes/com/jobreadyprogrammer/spark/LogisticRegressionExample.class
--------------------------------------------------------------------------------
/test-dev-env:
--------------------------------------------------------------------------------
1 | package com.jobreadyprogrammer.spark;
2 | import org.apache.spark.sql.Dataset;
3 | import org.apache.spark.sql.Row;
4 | import org.apache.spark.sql.SparkSession;
5 |
6 | public class Application {
7 |
8 | public static void main(String args[]) {
9 |
10 | SparkSession spark = SparkSession.builder().appName("Name").master("local").getOrCreate();
11 |
12 | Dataset df = spark.read().format("text").load("src/main/resources/wordsList.txt");
13 | df.groupBy("value").count().show();
14 |
15 | // SparkConf sparkConf = new SparkConf().setMaster("local").setAppName("Name");
16 | // JavaSparkContext sc = new JavaSparkContext(sparkConf);
17 | // JavaRDD textFile = sc.textFile("src/main/resources/wordsList.txt");
18 | // JavaPairRDD counts = textFile
19 | // .flatMap(s -> Arrays.asList(s.split(" ")).iterator())
20 | // .mapToPair(word -> new Tuple2<>(word, 1))
21 | // .reduceByKey((a, b) -> a + b);
22 | // counts.take(10);
23 | // System.out.println(counts.collect());
24 |
25 |
26 | }
27 |
28 | }
29 |
--------------------------------------------------------------------------------