├── .idea
├── compiler.xml
├── encodings.xml
├── hydra.xml
├── libraries
│ └── R_User_Library.xml
├── misc.xml
├── sbt.xml
├── scala_compiler.xml
├── uiDesigner.xml
├── vcs.xml
└── workspace.xml
├── ReadMe.md
├── pom.xml
├── src
└── main
│ ├── resources
│ ├── create_table.sql
│ ├── log4j2.properties
│ └── 代码结构.png
│ └── scala
│ └── com
│ └── jmx
│ ├── demos
│ ├── DemoMainApp.scala
│ ├── Entry.scala
│ └── SchemaLoader.scala
│ ├── metrics
│ ├── BestFilmsByOverallRating.scala
│ ├── GenresByAverageRating.scala
│ └── MostRatedFilms.scala
│ └── util
│ └── JDBCUtil.scala
└── target
└── classes
├── com
└── jmx
│ ├── demos
│ ├── DemoMainApp$.class
│ ├── DemoMainApp.class
│ ├── Entry.class
│ ├── Movies$.class
│ ├── Movies.class
│ ├── Ratings$.class
│ ├── Ratings.class
│ ├── SchemaLoader.class
│ ├── tenGreatestMoviesByAverageRating$.class
│ ├── tenGreatestMoviesByAverageRating.class
│ ├── tenMostRatedFilms$.class
│ ├── tenMostRatedFilms.class
│ ├── topGenresByAverageRating$.class
│ └── topGenresByAverageRating.class
│ ├── metrics
│ ├── BestFilmsByOverallRating$$anonfun$run$1$$anonfun$apply$1.class
│ ├── BestFilmsByOverallRating$$anonfun$run$1.class
│ ├── BestFilmsByOverallRating$$typecreator4$1.class
│ ├── BestFilmsByOverallRating.class
│ ├── GenresByAverageRating$$anonfun$run$1$$anonfun$apply$1.class
│ ├── GenresByAverageRating$$anonfun$run$1.class
│ ├── GenresByAverageRating$$typecreator4$1.class
│ ├── GenresByAverageRating.class
│ ├── MostRatedFilms$$anonfun$run$1$$anonfun$apply$1.class
│ ├── MostRatedFilms$$anonfun$run$1.class
│ ├── MostRatedFilms$$typecreator4$1.class
│ └── MostRatedFilms.class
│ └── util
│ ├── JDBCUtil$.class
│ └── JDBCUtil.class
├── create_table.sql
└── log4j2.properties
/.idea/compiler.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/hydra.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/.idea/libraries/R_User_Library.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/.idea/sbt.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/scala_compiler.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/.idea/uiDesigner.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | -
6 |
7 |
8 | -
9 |
10 |
11 | -
12 |
13 |
14 | -
15 |
16 |
17 | -
18 |
19 |
20 |
21 |
22 |
23 | -
24 |
25 |
26 |
27 |
28 |
29 | -
30 |
31 |
32 |
33 |
34 |
35 | -
36 |
37 |
38 |
39 |
40 |
41 | -
42 |
43 |
44 |
45 |
46 | -
47 |
48 |
49 |
50 |
51 | -
52 |
53 |
54 |
55 |
56 | -
57 |
58 |
59 |
60 |
61 | -
62 |
63 |
64 |
65 |
66 | -
67 |
68 |
69 |
70 |
71 | -
72 |
73 |
74 | -
75 |
76 |
77 |
78 |
79 | -
80 |
81 |
82 |
83 |
84 | -
85 |
86 |
87 |
88 |
89 | -
90 |
91 |
92 |
93 |
94 | -
95 |
96 |
97 |
98 |
99 | -
100 |
101 |
102 | -
103 |
104 |
105 | -
106 |
107 |
108 | -
109 |
110 |
111 | -
112 |
113 |
114 |
115 |
116 | -
117 |
118 |
119 | -
120 |
121 |
122 |
123 |
124 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 | true
163 | DEFINITION_ORDER
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 |
347 |
348 |
349 |
352 |
353 |
354 |
355 |
356 |
357 |
358 |
359 |
360 | 1605769582185
361 |
362 |
363 | 1605769582185
364 |
365 |
366 |
367 |
368 |
369 |
370 |
371 |
372 |
373 |
374 |
375 |
376 |
377 |
378 |
379 |
380 |
381 |
382 |
383 |
384 |
385 |
386 |
387 |
388 |
389 |
390 |
391 |
392 |
393 |
394 |
395 |
396 |
397 |
398 |
399 |
400 |
401 |
402 |
403 |
404 |
405 |
406 |
407 |
408 |
409 |
410 |
411 |
412 |
413 |
414 |
415 |
416 |
417 |
418 |
419 |
420 |
421 |
422 |
423 |
424 |
425 |
426 |
427 |
428 |
429 |
430 |
431 |
432 |
433 |
434 |
435 |
436 |
437 |
438 |
439 |
440 |
441 |
442 |
443 |
444 |
445 |
446 |
447 |
448 |
449 |
450 |
451 |
452 |
453 |
454 |
455 |
456 |
457 |
458 |
459 |
460 |
461 |
462 |
463 |
464 |
465 |
466 |
467 |
468 |
469 |
470 |
471 |
472 |
473 |
474 |
475 |
476 |
477 |
478 |
479 |
480 |
481 |
482 |
483 |
484 |
485 |
486 |
487 |
488 |
489 |
490 |
491 |
492 |
493 |
494 |
495 |
496 |
497 |
498 |
499 |
500 |
501 |
502 |
503 |
504 |
505 |
506 |
507 |
508 |
509 |
510 |
511 |
512 |
513 |
514 |
515 |
516 |
517 |
518 |
519 |
520 |
521 |
522 |
523 |
524 |
525 |
526 |
527 |
528 |
529 |
530 |
531 |
532 |
533 |
534 |
535 |
536 |
537 |
538 |
539 |
540 |
541 |
542 |
543 |
544 |
545 |
546 |
547 |
548 |
549 |
550 |
551 |
552 |
553 |
554 |
555 |
556 |
557 |
558 |
559 |
560 |
561 |
562 |
563 |
564 |
565 |
566 |
567 |
568 |
569 |
570 |
571 |
572 |
573 |
574 |
575 |
576 |
577 |
578 |
579 |
580 |
581 |
582 |
583 |
584 |
585 |
586 |
587 |
588 |
589 |
590 |
591 |
592 |
593 |
594 |
595 |
596 |
597 |
598 |
599 |
600 |
601 |
602 |
603 |
604 |
605 |
606 |
607 |
608 |
609 |
610 |
611 |
612 |
613 |
614 |
615 |
616 |
617 |
618 |
619 |
620 |
621 |
622 |
623 |
624 |
625 |
626 |
627 |
628 |
629 |
630 |
631 |
632 |
633 |
634 |
635 |
636 |
637 |
638 |
639 |
640 |
641 |
642 |
643 |
644 |
645 |
646 |
647 |
648 |
649 |
650 |
651 |
652 |
653 |
654 |
655 |
656 |
657 |
658 |
659 |
660 |
661 |
662 | 1.8
663 |
664 |
665 |
666 |
667 |
668 |
669 |
670 |
671 |
672 |
673 |
674 |
--------------------------------------------------------------------------------
/ReadMe.md:
--------------------------------------------------------------------------------
1 |
2 | ## 项目介绍
3 |
4 | ### 数据集介绍
5 |
6 | 使用MovieLens的名称为ml-25m.zip的数据集,使用的文件是**movies.csv**和**ratings.csv**,上述文件的下载地址为:
7 |
8 | ```bash
9 | http://files.grouplens.org/datasets/movielens/ml-25m.zip
10 | ```
11 |
12 | - **movies.csv**
13 |
14 | 该文件是电影数据,对应的为维表数据,大小为2.89MB,包括6万多部电影,其数据格式为[movieId,title,genres],分别对应**[电影id,电影名称,电影所属分类]**,样例数据如下所示:逗号分隔
15 |
16 | ```bash
17 | 1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
18 | ```
19 |
20 | - **ratings.csv**
21 |
22 | 该文件为电影评分数据,对应为事实表数据,大小为646MB,其数据格式为:[userId,movieId,rating,timestamp],分别对应**[用户id,电影id,评分,时间戳]**,样例数据如下所示:逗号分隔
23 |
24 | ```bash
25 | 1,296,5,1147880044
26 | ```
27 |
28 | ### 项目代码结构
29 |
30 | 
31 |
32 | ## 需求分析
33 |
34 | - 需求1:查找电影评分个数超过5000,且平均评分较高的前十部电影名称及其对应的平均评分
35 |
36 |
37 | - 需求2:查找每个电影类别及其对应的平均评分
38 |
39 |
40 | - 需求3:查找被评分次数较多的前十部电影
41 |
42 | ## 代码讲解
43 |
44 | - **DemoMainApp**
45 |
46 | 该类是程序执行的入口,主要是获取数据源,转换成DataFrame,并调用封装好的业务逻辑类。
47 |
48 | ```scala
49 | object DemoMainApp {
50 | // 文件路径
51 | private val MOVIES_CSV_FILE_PATH = "file:///e:/movies.csv"
52 | private val RATINGS_CSV_FILE_PATH = "file:///e:/ratings.csv"
53 |
54 | def main(args: Array[String]): Unit = {
55 | // 创建spark session
56 | val spark = SparkSession
57 | .builder
58 | .master("local[4]")
59 | .getOrCreate
60 | // schema信息
61 | val schemaLoader = new SchemaLoader
62 | // 读取Movie数据集
63 | val movieDF = readCsvIntoDataSet(spark, MOVIES_CSV_FILE_PATH, schemaLoader.getMovieSchema)
64 | // 读取Rating数据集
65 | val ratingDF = readCsvIntoDataSet(spark, RATINGS_CSV_FILE_PATH, schemaLoader.getRatingSchema)
66 |
67 | // 需求1:查找电影评分个数超过5000,且平均评分较高的前十部电影名称及其对应的平均评分
68 | val bestFilmsByOverallRating = new BestFilmsByOverallRating
69 | //bestFilmsByOverallRating.run(movieDF, ratingDF, spark)
70 |
71 | // 需求2:查找每个电影类别及其对应的平均评分
72 | val genresByAverageRating = new GenresByAverageRating
73 | //genresByAverageRating.run(movieDF, ratingDF, spark)
74 |
75 | // 需求3:查找被评分次数较多的前十部电影
76 | val mostRatedFilms = new MostRatedFilms
77 | mostRatedFilms.run(movieDF, ratingDF, spark)
78 |
79 | spark.close()
80 |
81 | }
82 | /**
83 | * 读取数据文件,转成DataFrame
84 | *
85 | * @param spark
86 | * @param path
87 | * @param schema
88 | * @return
89 | */
90 | def readCsvIntoDataSet(spark: SparkSession, path: String, schema: StructType) = {
91 |
92 | val dataSet = spark.read
93 | .format("csv")
94 | .option("header", "true")
95 | .schema(schema)
96 | .load(path)
97 | dataSet
98 | }
99 | }
100 | ```
101 |
102 | - **Entry**
103 |
104 | 该类为实体类,封装了数据源的样例类和结果表的样例类
105 |
106 | ```scala
107 | class Entry {
108 |
109 | }
110 |
111 | case class Movies(
112 | movieId: String, // 电影的id
113 | title: String, // 电影的标题
114 | genres: String // 电影类别
115 | )
116 |
117 | case class Ratings(
118 | userId: String, // 用户的id
119 | movieId: String, // 电影的id
120 | rating: String, // 用户评分
121 | timestamp: String // 时间戳
122 | )
123 |
124 | // 需求1MySQL结果表
125 | case class tenGreatestMoviesByAverageRating(
126 | movieId: String, // 电影的id
127 | title: String, // 电影的标题
128 | avgRating: String // 电影平均评分
129 | )
130 |
131 | // 需求2MySQL结果表
132 | case class topGenresByAverageRating(
133 | genres: String, //电影类别
134 | avgRating: String // 平均评分
135 | )
136 |
137 | // 需求3MySQL结果表
138 | case class tenMostRatedFilms(
139 | movieId: String, // 电影的id
140 | title: String, // 电影的标题
141 | ratingCnt: String // 电影被评分的次数
142 | )
143 | ```
144 |
145 | - **SchemaLoader**
146 |
147 | 该类封装了数据集的schema信息,主要用于读取数据源时指定schema信息
148 |
149 | ```scala
150 | class SchemaLoader {
151 | // movies数据集schema信息
152 | private val movieSchema = new StructType()
153 | .add("movieId", DataTypes.StringType, false)
154 | .add("title", DataTypes.StringType, false)
155 | .add("genres", DataTypes.StringType, false)
156 | // ratings数据集schema信息
157 | private val ratingSchema = new StructType()
158 | .add("userId", DataTypes.StringType, false)
159 | .add("movieId", DataTypes.StringType, false)
160 | .add("rating", DataTypes.StringType, false)
161 | .add("timestamp", DataTypes.StringType, false)
162 |
163 | def getMovieSchema: StructType = movieSchema
164 |
165 | def getRatingSchema: StructType = ratingSchema
166 | }
167 | ```
168 |
169 | - **JDBCUtil**
170 |
171 | 该类封装了连接MySQL的逻辑,主要用于连接MySQL,在业务逻辑代码中会使用该工具类获取MySQL连接,将结果数据写入到MySQL中。
172 |
173 | ```scala
174 | object JDBCUtil {
175 | val dataSource = new ComboPooledDataSource()
176 | val user = "root"
177 | val password = "123qwe"
178 | val url = "jdbc:mysql://localhost:3306/mydb"
179 |
180 | dataSource.setUser(user)
181 | dataSource.setPassword(password)
182 | dataSource.setDriverClass("com.mysql.jdbc.Driver")
183 | dataSource.setJdbcUrl(url)
184 | dataSource.setAutoCommitOnClose(false)
185 | // 获取连接
186 | def getQueryRunner(): Option[QueryRunner]={
187 | try {
188 | Some(new QueryRunner(dataSource))
189 | }catch {
190 | case e:Exception =>
191 | e.printStackTrace()
192 | None
193 | }
194 | }
195 | }
196 | ```
197 |
198 | ### 需求1实现
199 |
200 | - **BestFilmsByOverallRating**
201 |
202 | 需求1实现的业务逻辑封装。该类有一个run()方法,主要是封装计算逻辑。
203 |
204 | ```scala
205 | /**
206 | * 需求1:查找电影评分个数超过5000,且平均评分较高的前十部电影名称及其对应的平均评分
207 | */
208 | class BestFilmsByOverallRating extends Serializable {
209 |
210 | def run(moviesDataset: DataFrame, ratingsDataset: DataFrame, spark: SparkSession) = {
211 | import spark.implicits._
212 |
213 | // 将moviesDataset注册成表
214 | moviesDataset.createOrReplaceTempView("movies")
215 | // 将ratingsDataset注册成表
216 | ratingsDataset.createOrReplaceTempView("ratings")
217 |
218 | // 查询SQL语句
219 | val ressql1 =
220 | """
221 | |WITH ratings_filter_cnt AS (
222 | |SELECT
223 | | movieId,
224 | | count( * ) AS rating_cnt,
225 | | avg( rating ) AS avg_rating
226 | |FROM
227 | | ratings
228 | |GROUP BY
229 | | movieId
230 | |HAVING
231 | | count( * ) >= 5000
232 | |),
233 | |ratings_filter_score AS (
234 | |SELECT
235 | | movieId, -- 电影id
236 | | avg_rating -- 电影平均评分
237 | |FROM ratings_filter_cnt
238 | |ORDER BY avg_rating DESC -- 平均评分降序排序
239 | |LIMIT 10 -- 平均分较高的前十部电影
240 | |)
241 | |SELECT
242 | | m.movieId,
243 | | m.title,
244 | | r.avg_rating AS avgRating
245 | |FROM
246 | | ratings_filter_score r
247 | |JOIN movies m ON m.movieId = r.movieId
248 | """.stripMargin
249 |
250 | val resultDS = spark.sql(ressql1).as[tenGreatestMoviesByAverageRating]
251 | // 打印数据
252 | resultDS.show(10)
253 | resultDS.printSchema()
254 | // 写入MySQL
255 | resultDS.foreachPartition(par => par.foreach(insert2Mysql(_)))
256 | }
257 |
258 | /**
259 | * 获取连接,调用写入MySQL数据的方法
260 | *
261 | * @param res
262 | */
263 | private def insert2Mysql(res: tenGreatestMoviesByAverageRating): Unit = {
264 | lazy val conn = JDBCUtil.getQueryRunner()
265 | conn match {
266 | case Some(connection) => {
267 | upsert(res, connection)
268 | }
269 | case None => {
270 | println("Mysql连接失败")
271 | System.exit(-1)
272 | }
273 | }
274 | }
275 |
276 | /**
277 | * 封装将结果写入MySQL的方法
278 | * 执行写入操作
279 | *
280 | * @param r
281 | * @param conn
282 | */
283 | private def upsert(r: tenGreatestMoviesByAverageRating, conn: QueryRunner): Unit = {
284 | try {
285 | val sql =
286 | s"""
287 | |REPLACE INTO `ten_movies_averagerating`(
288 | |movieId,
289 | |title,
290 | |avgRating
291 | |)
292 | |VALUES
293 | |(?,?,?)
294 | """.stripMargin
295 | // 执行insert操作
296 | conn.update(
297 | sql,
298 | r.movieId,
299 | r.title,
300 | r.avgRating
301 | )
302 | } catch {
303 | case e: Exception => {
304 | e.printStackTrace()
305 | System.exit(-1)
306 | }
307 | }
308 | }
309 | }
310 | ```
311 |
312 | ### 需求1结果
313 |
314 | - 结果表建表语句
315 |
316 | ```sql
317 | CREATE TABLE IF NOT EXISTS `ten_movies_averagerating` ( -- IF NOT EXISTS keeps the statement re-runnable
318 |   `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增id',
319 |   `movieId` int(11) NOT NULL COMMENT '电影id',
320 |   `title` varchar(100) NOT NULL COMMENT '电影名称',
321 |   `avgRating` decimal(10,2) NOT NULL COMMENT '平均评分',
322 |   `update_time` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间', -- also refreshed by plain UPDATEs
323 |   PRIMARY KEY (`id`),
324 |   UNIQUE KEY `movie_id_UNIQUE` (`movieId`) -- REPLACE INTO uses this key to overwrite a movie's existing row
325 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
326 | ```
327 |
328 | - 统计结果
329 |
330 | 平均评分最高的前十部电影如下:
331 |
332 | | movieId | title | avgRating |
333 | | ------- | ------------------------------------------- | --------- |
334 | | 318 | Shawshank Redemption, The (1994) | 4.41 |
335 | | 858 | Godfather, The (1972) | 4.32 |
336 | | 50 | Usual Suspects, The (1995) | 4.28 |
337 | | 1221 | Godfather: Part II, The (1974) | 4.26 |
338 | | 527 | Schindler's List (1993) | 4.25 |
339 | | 2019 | Seven Samurai (Shichinin no samurai) (1954) | 4.25 |
340 | | 904 | Rear Window (1954) | 4.24 |
341 | | 1203 | 12 Angry Men (1957) | 4.24 |
342 | | 2959 | Fight Club (1999) | 4.23 |
343 | | 1193 | One Flew Over the Cuckoo's Nest (1975) | 4.22 |
344 |
345 | 上述电影评分对应的电影中文名称为:
346 |
347 | | 英文名称 | 中文名称 |
348 | | -------------------------------------------- | ------------ |
349 | | Shawshank Redemption, The (1994) | 肖申克的救赎 |
350 | | Godfather, The (1972) | 教父1 |
351 | | Usual Suspects, The (1995) | 非常嫌疑犯 |
352 | | Godfather: Part II, The (1974) | 教父2 |
353 | | Schindler's List (1993) | 辛德勒的名单 |
354 | | Seven Samurai (Shichinin no samurai) (1954) | 七武士 |
355 | | Rear Window (1954) | 后窗 |
356 | | 12 Angry Men (1957) | 十二怒汉 |
357 | | Fight Club (1999) | 搏击俱乐部 |
358 | | One Flew Over the Cuckoo's Nest (1975) | 飞越疯人院 |
359 |
360 | ### 需求2实现
361 |
362 | - **GenresByAverageRating**
363 |
364 | 需求2实现的业务逻辑封装。该类有一个run()方法,主要是封装计算逻辑。
365 |
366 | ```scala
367 | /**
368 | * 需求2:查找每个电影类别及其对应的平均评分
369 | */
370 | class GenresByAverageRating extends Serializable {
371 | def run(moviesDataset: DataFrame, ratingsDataset: DataFrame, spark: SparkSession) = {
372 | import spark.implicits._
373 | // 将moviesDataset注册成表
374 | moviesDataset.createOrReplaceTempView("movies")
375 | // 将ratingsDataset注册成表
376 | ratingsDataset.createOrReplaceTempView("ratings")
377 |
378 | val ressql2 =
379 | """
380 | |WITH explode_movies AS (
381 | |SELECT
382 | | movieId,
383 | | title,
384 | | category
385 | |FROM
386 | | movies lateral VIEW explode ( split ( genres, "\\|" ) ) temp AS category
387 | |)
388 | |SELECT
389 | | m.category AS genres,
390 | | avg( r.rating ) AS avgRating
391 | |FROM
392 | | explode_movies m
393 | | JOIN ratings r ON m.movieId = r.movieId
394 | |GROUP BY
395 | | m.category
396 | | """.stripMargin
397 |
398 | val resultDS = spark.sql(ressql2).as[topGenresByAverageRating]
399 |
400 | // 打印数据
401 | resultDS.show(10)
402 | resultDS.printSchema()
403 | // 写入MySQL
404 | resultDS.foreachPartition(par => par.foreach(insert2Mysql(_)))
405 |
406 | }
407 |
408 | /**
409 | * 获取连接,调用写入MySQL数据的方法
410 | *
411 | * @param res
412 | */
413 | private def insert2Mysql(res: topGenresByAverageRating): Unit = {
414 | lazy val conn = JDBCUtil.getQueryRunner()
415 | conn match {
416 | case Some(connection) => {
417 | upsert(res, connection)
418 | }
419 | case None => {
420 | println("Mysql连接失败")
421 | System.exit(-1)
422 | }
423 | }
424 | }
425 |
426 | /**
427 | * 封装将结果写入MySQL的方法
428 | * 执行写入操作
429 | *
430 | * @param r
431 | * @param conn
432 | */
433 | private def upsert(r: topGenresByAverageRating, conn: QueryRunner): Unit = {
434 | try {
435 | val sql =
436 | s"""
437 | |REPLACE INTO `genres_average_rating`(
438 | |genres,
439 | |avgRating
440 | |)
441 | |VALUES
442 | |(?,?)
443 | """.stripMargin
444 | // 执行insert操作
445 | conn.update(
446 | sql,
447 | r.genres,
448 | r.avgRating
449 | )
450 | } catch {
451 | case e: Exception => {
452 | e.printStackTrace()
453 | System.exit(-1)
454 | }
455 | }
456 | }
457 | }
458 | ```
459 |
460 | ### 需求2结果
461 |
462 | - 结果表建表语句
463 |
464 | ```sql
465 | CREATE TABLE IF NOT EXISTS `genres_average_rating` ( -- IF NOT EXISTS keeps the statement re-runnable
466 |   `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增id',
467 |   `genres` varchar(100) NOT NULL COMMENT '电影类别',
468 |   `avgRating` decimal(10,2) NOT NULL COMMENT '电影类别平均评分',
469 |   `update_time` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间', -- also refreshed by plain UPDATEs
470 |   PRIMARY KEY (`id`),
471 |   UNIQUE KEY `genres_UNIQUE` (`genres`) -- REPLACE INTO uses this key to overwrite a genre's existing row
472 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
473 | ```
474 |
475 | - 统计结果
476 |
477 | 共有20个电影分类,每个电影分类的平均评分为:
478 |
479 | | genres | avgRating |
480 | | ------------------ | --------- |
481 | | Film-Noir | 3.93 |
482 | | War | 3.79 |
483 | | Documentary | 3.71 |
484 | | Crime | 3.69 |
485 | | Drama | 3.68 |
486 | | Mystery | 3.67 |
487 | | Animation | 3.61 |
488 | | IMAX | 3.6 |
489 | | Western | 3.59 |
490 | | Musical | 3.55 |
491 | | Romance | 3.54 |
492 | | Adventure | 3.52 |
493 | | Thriller | 3.52 |
494 | | Fantasy | 3.51 |
495 | | Sci-Fi | 3.48 |
496 | | Action | 3.47 |
497 | | Children | 3.43 |
498 | | Comedy | 3.42 |
499 | | (no genres listed) | 3.33 |
500 | | Horror | 3.29 |
501 |
502 | 电影分类对应的中文名称为:
503 |
504 | | 分类 | 中文名称 |
505 | | ------------------ | -------- |
506 | | Film-Noir | 黑色电影 |
507 | | War | 战争 |
508 | | Documentary | 纪录片 |
509 | | Crime | 犯罪 |
510 | | Drama | 剧情片 |
511 | | Mystery | 推理 |
512 | | Animation | 动画片 |
513 | | IMAX | 巨幕电影 |
514 | | Western | 西部电影 |
515 | | Musical | 音乐 |
516 | | Romance | 浪漫 |
517 | | Adventure | 冒险 |
518 | | Thriller | 惊悚片 |
519 | | Fantasy | 魔幻电影 |
520 | | Sci-Fi | 科幻 |
521 | | Action | 动作 |
522 | | Children | 儿童 |
523 | | Comedy | 喜剧 |
524 | | (no genres listed) | 未分类 |
525 | | Horror | 恐怖 |
526 |
527 | ### 需求3实现
528 |
529 | - **MostRatedFilms**
530 |
531 | 需求3实现的业务逻辑封装。该类有一个run()方法,主要是封装计算逻辑。
532 |
533 | ```scala
534 | /**
535 | * 需求3:查找被评分次数较多的前十部电影.
536 | */
537 | class MostRatedFilms extends Serializable {
538 | def run(moviesDataset: DataFrame, ratingsDataset: DataFrame,spark: SparkSession) = {
539 |
540 | import spark.implicits._
541 |
542 | // 将moviesDataset注册成表
543 | moviesDataset.createOrReplaceTempView("movies")
544 | // 将ratingsDataset注册成表
545 | ratingsDataset.createOrReplaceTempView("ratings")
546 |
547 | val ressql3 =
548 | """
549 | |WITH rating_group AS (
550 | | SELECT
551 | | movieId,
552 | | count( * ) AS ratingCnt
553 | | FROM ratings
554 | | GROUP BY movieId
555 | |),
556 | |rating_filter AS (
557 | | SELECT
558 | | movieId,
559 | | ratingCnt
560 | | FROM rating_group
561 | | ORDER BY ratingCnt DESC
562 | | LIMIT 10
563 | |)
564 | |SELECT
565 | | m.movieId,
566 | | m.title,
567 | | r.ratingCnt
568 | |FROM
569 | | rating_filter r
570 | |JOIN movies m ON r.movieId = m.movieId
571 | |
572 | """.stripMargin
573 |
574 | val resultDS = spark.sql(ressql3).as[tenMostRatedFilms]
575 | // 打印数据
576 | resultDS.show(10)
577 | resultDS.printSchema()
578 | // 写入MySQL
579 | resultDS.foreachPartition(par => par.foreach(insert2Mysql(_)))
580 |
581 | }
582 |
583 | /**
584 | * 获取连接,调用写入MySQL数据的方法
585 | *
586 | * @param res
587 | */
588 | private def insert2Mysql(res: tenMostRatedFilms): Unit = {
589 | lazy val conn = JDBCUtil.getQueryRunner()
590 | conn match {
591 | case Some(connection) => {
592 | upsert(res, connection)
593 | }
594 | case None => {
595 | println("Mysql连接失败")
596 | System.exit(-1)
597 | }
598 | }
599 | }
600 |
601 | /**
602 | * 封装将结果写入MySQL的方法
603 | * 执行写入操作
604 | *
605 | * @param r
606 | * @param conn
607 | */
608 | private def upsert(r: tenMostRatedFilms, conn: QueryRunner): Unit = {
609 | try {
610 | val sql =
611 | s"""
612 | |REPLACE INTO `ten_most_rated_films`(
613 | |movieId,
614 | |title,
615 | |ratingCnt
616 | |)
617 | |VALUES
618 | |(?,?,?)
619 | """.stripMargin
620 | // 执行insert操作
621 | conn.update(
622 | sql,
623 | r.movieId,
624 | r.title,
625 | r.ratingCnt
626 | )
627 | } catch {
628 | case e: Exception => {
629 | e.printStackTrace()
630 | System.exit(-1)
631 | }
632 | }
633 | }
634 |
635 | }
636 |
637 | ```
638 |
639 | ### 需求3结果
640 |
641 | - 结果表创建语句
642 |
643 | ```sql
644 | CREATE TABLE IF NOT EXISTS `ten_most_rated_films` ( -- IF NOT EXISTS keeps the statement re-runnable
645 |   `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增id',
646 |   `movieId` int(11) NOT NULL COMMENT '电影Id',
647 |   `title` varchar(100) NOT NULL COMMENT '电影名称',
648 |   `ratingCnt` int(11) NOT NULL COMMENT '电影被评分的次数',
649 |   `update_time` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间', -- also refreshed by plain UPDATEs
650 |   PRIMARY KEY (`id`),
651 |   UNIQUE KEY `movie_id_UNIQUE` (`movieId`) -- REPLACE INTO uses this key to overwrite a movie's existing row
652 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
653 | ```
654 |
655 | - 统计结果
656 |
657 | | movieId | title | ratingCnt |
658 | | ------- | ----------------------------------------- | --------- |
659 | | 356 | Forrest Gump (1994) | 81491 |
660 | | 318 | Shawshank Redemption, The (1994) | 81482 |
661 | | 296 | Pulp Fiction (1994) | 79672 |
662 | | 593 | Silence of the Lambs, The (1991) | 74127 |
663 | | 2571 | Matrix, The (1999) | 72674 |
664 | | 260 | Star Wars: Episode IV - A New Hope (1977) | 68717 |
665 | | 480 | Jurassic Park (1993) | 64144 |
666 | | 527 | Schindler's List (1993) | 60411 |
667 | | 110 | Braveheart (1995) | 59184 |
668 | | 2959 | Fight Club (1999) | 58773 |
669 |
670 | 评分次数较多的电影对应的中文名称为:
671 |
672 | | 英文名称 | 中文名称 |
673 | | ----------------------------------------- | ------------ |
674 | | Forrest Gump (1994) | 阿甘正传 |
675 | | Shawshank Redemption, The (1994) | 肖申克的救赎 |
676 | | Pulp Fiction (1994) | 低俗小说 |
677 | | Silence of the Lambs, The (1991) | 沉默的羔羊 |
678 | | Matrix, The (1999) | 黑客帝国 |
679 | | Star Wars: Episode IV - A New Hope (1977) | 星球大战 |
680 | | Jurassic Park (1993) | 侏罗纪公园 |
681 | | Schindler's List (1993) | 辛德勒的名单 |
682 | | Braveheart (1995) | 勇敢的心 |
683 | | Fight Club (1999) | 搏击俱乐部 |
684 |
685 |
686 |
687 |
688 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 | 4.0.0
6 |
7 | com.jmx
8 | sparkproject
9 | jar
10 | 1.0-SNAPSHOT
11 |
12 | 2008
13 |
14 |
15 | 2.11.8
16 |
17 |
18 |
19 |
20 | scala-tools.org
21 | Scala-Tools Maven2 Repository
22 | http://scala-tools.org/repo-releases
23 |
24 |
25 |
26 |
27 |
28 | scala-tools.org
29 | Scala-Tools Maven2 Repository
30 | http://scala-tools.org/repo-releases
31 |
32 |
33 |
34 |
35 |
36 | org.scala-lang
37 | scala-library
38 | ${scala.version}
39 |
40 |
41 | c3p0
42 | c3p0
43 | 0.9.1.2
44 |
45 |
46 |
47 | commons-dbutils
48 | commons-dbutils
49 | 1.6
50 |
51 |
52 | junit
53 | junit
54 | 4.12
55 | test
56 |
57 |
58 | org.specs
59 | specs
60 | 1.2.5
61 | test
62 |
63 |
64 |
65 | org.apache.spark
66 | spark-core_2.11
67 | 2.3.0
68 |
69 |
70 |
71 | org.apache.spark
72 | spark-sql_2.11
73 | 2.3.0
74 |
75 |
76 |
77 | org.apache.spark
78 | spark-streaming_2.11
79 | 2.3.0
80 |
81 |
82 |
83 |
84 | org.apache.spark
85 | spark-mllib_2.11
86 | 2.3.0
87 |
88 |
89 |
90 |
91 | org.apache.spark
92 | spark-streaming-kafka-0-10_2.11
93 | 2.3.0
94 |
95 |
96 |
97 | org.apache.spark
98 | spark-hive_2.11
99 | 2.3.0
100 |
101 |
102 |
103 |
104 | mysql
105 | mysql-connector-java
106 | 5.1.39
107 |
108 |
109 |
110 | org.apache.hadoop
111 | hadoop-common
112 | 2.7.7
113 |
114 |
115 |
116 | org.apache.hadoop
117 | hadoop-client
118 | 2.7.7
119 |
120 |
121 |
122 | org.apache.hadoop
123 | hadoop-hdfs
124 | 2.7.7
125 |
126 |
127 | org.apache.avro
128 | avro-tools
129 | 1.8.1
130 |
131 |
132 |
133 |
134 | org.apache.hive
135 | hive-cli
136 | 2.3.4
137 |
138 |
139 |
140 | org.apache.hive
141 | hive-exec
142 | 2.3.4
143 |
144 |
145 | org.apache.commons
146 | commons-dbcp2
147 | 2.1.1
148 |
149 |
150 | redis.clients
151 | jedis
152 | 2.8.0
153 |
154 |
155 | ru.yandex.clickhouse
156 | clickhouse-jdbc
157 | 0.2.4
158 |
159 |
160 |
161 | com.google.guava
162 | guava
163 | 28.0-jre
164 |
165 |
166 |
167 |
168 |
169 | src/main/scala
170 | src/test/scala
171 |
172 |
173 | org.scala-tools
174 | maven-scala-plugin
175 |
176 |
177 |
178 | compile
179 | testCompile
180 |
181 |
182 |
183 |
184 | ${scala.version}
185 |
186 | -target:jvm-1.5
187 |
188 |
189 |
190 |
191 | org.apache.maven.plugins
192 | maven-eclipse-plugin
193 |
194 | true
195 |
196 | ch.epfl.lamp.sdt.core.scalabuilder
197 |
198 |
199 | ch.epfl.lamp.sdt.core.scalanature
200 |
201 |
202 | org.eclipse.jdt.launching.JRE_CONTAINER
203 | ch.epfl.lamp.sdt.launching.SCALA_CONTAINER
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 | org.scala-tools
213 | maven-scala-plugin
214 |
215 | ${scala.version}
216 |
217 |
218 |
219 |
220 |
221 |
222 |
--------------------------------------------------------------------------------
/src/main/resources/create_table.sql:
--------------------------------------------------------------------------------
1 | -- Result tables for the three Spark jobs (MySQL).
2 | --
3 | -- The jobs write their rows with REPLACE INTO, which relies on the UNIQUE key of
4 | -- each table to overwrite the row already stored for a movie or genre.
5 | -- IF NOT EXISTS keeps this script re-runnable, and ON UPDATE CURRENT_TIMESTAMP
6 | -- keeps update_time accurate for in-place UPDATEs as well as for REPLACE INTO.
7 |
8 | -- Requirement 1: ten best movies by average rating.
9 | CREATE TABLE IF NOT EXISTS `ten_movies_averagerating` (
10 |   `id` INT(11) NOT NULL AUTO_INCREMENT COMMENT '自增id',
11 |   `movieId` INT(11) NOT NULL COMMENT '电影id',
12 |   `title` VARCHAR(100) NOT NULL COMMENT '电影名称',
13 |   `avgRating` DECIMAL(10,2) NOT NULL COMMENT '平均评分',
14 |   `update_time` DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
15 |   PRIMARY KEY (`id`),
16 |   UNIQUE KEY `movie_id_UNIQUE` (`movieId`)
17 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
18 |
19 | -- -------------------------------------------------------------------------------
20 |
21 | -- Requirement 2: average rating per genre.
22 | CREATE TABLE IF NOT EXISTS `genres_average_rating` (
23 |   `id` INT(11) NOT NULL AUTO_INCREMENT COMMENT '自增id',
24 |   `genres` VARCHAR(100) NOT NULL COMMENT '电影类别',
25 |   `avgRating` DECIMAL(10,2) NOT NULL COMMENT '电影类别平均评分',
26 |   `update_time` DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
27 |   PRIMARY KEY (`id`),
28 |   UNIQUE KEY `genres_UNIQUE` (`genres`)
29 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
30 |
31 | -- -------------------------------------------------------------------------------
32 |
33 | -- Requirement 3: ten movies with the most ratings.
34 | CREATE TABLE IF NOT EXISTS `ten_most_rated_films` (
35 |   `id` INT(11) NOT NULL AUTO_INCREMENT COMMENT '自增id',
36 |   `movieId` INT(11) NOT NULL COMMENT '电影Id',
37 |   `title` VARCHAR(100) NOT NULL COMMENT '电影名称',
38 |   `ratingCnt` INT(11) NOT NULL COMMENT '电影被评分的次数',
39 |   `update_time` DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
40 |   PRIMARY KEY (`id`),
41 |   UNIQUE KEY `movie_id_UNIQUE` (`movieId`)
42 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
43 |
44 | -- -------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/src/main/resources/log4j2.properties:
--------------------------------------------------------------------------------
1 | # Console logging configuration in Log4j 2 properties syntax.
2 | # One console appender, named "out", formatting events with a pattern layout.
3 | appender.out.type = Console
4 | appender.out.name = out
5 | appender.out.layout.type = PatternLayout
6 | # Pattern: [thread name, fixed 30 chars] logger short name (30 chars) level (5 chars) message newline.
7 | appender.out.layout.pattern = [%30.30t] %-30.30c{1} %-5p %m%n
8 | # Loggers under org.springframework are limited to WARN and above.
9 | logger.springframework.name = org.springframework
10 | logger.springframework.level = WARN
11 | # Everything else logs at INFO and goes to the "out" appender.
12 | rootLogger.level = INFO
13 | rootLogger.appenderRef.out.ref = out
14 |
--------------------------------------------------------------------------------
/src/main/resources/代码结构.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/src/main/resources/代码结构.png
--------------------------------------------------------------------------------
/src/main/scala/com/jmx/demos/DemoMainApp.scala:
--------------------------------------------------------------------------------
1 | package com.jmx.demos
2 |
3 | import com.jmx.metrics.{BestFilmsByOverallRating, GenresByAverageRating, MostRatedFilms}
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.types.StructType
6 |
7 | /**
8 |  * Driver for the MovieLens analysis demo.
9 |  *
10 |  * Reads the movies and ratings CSV files into DataFrames and passes them to the
11 |  * three metric jobs from com.jmx.metrics, one after another.
12 |  *
13 |  * @Created with IntelliJ IDEA.
14 |  * @author : jmx
15 |  * @Date: 2020/11/19
16 |  * @Time: 16:27
17 |  */
18 | object DemoMainApp {
19 |   // Default input locations, used when no path is passed on the command line.
20 |   private val MOVIES_CSV_FILE_PATH = "file:///e:/movies.csv"
21 |   private val RATINGS_CSV_FILE_PATH = "file:///e:/ratings.csv"
22 |
23 |   /**
24 |    * Runs the three analyses on the movies and ratings data sets.
25 |    *
26 |    * @param args optional overrides: args(0) is the movies CSV path, args(1) the
27 |    *             ratings CSV path; a missing argument falls back to the default above
28 |    */
29 |   def main(args: Array[String]): Unit = {
30 |     val moviesPath = if (args.length > 0) args(0) else MOVIES_CSV_FILE_PATH
31 |     val ratingsPath = if (args.length > 1) args(1) else RATINGS_CSV_FILE_PATH
32 |
33 |     // Create the Spark session; the master is fixed to local mode with four threads.
34 |     val spark = SparkSession
35 |       .builder
36 |       .master("local[4]")
37 |       .getOrCreate
38 |
39 |     // try/finally so the session is closed even when one of the jobs throws.
40 |     try {
41 |       // Schema definitions for both CSV files.
42 |       val schemaLoader = new SchemaLoader
43 |       // Load the movies data set.
44 |       val movieDF = readCsvIntoDataSet(spark, moviesPath, schemaLoader.getMovieSchema)
45 |       // Load the ratings data set.
46 |       val ratingDF = readCsvIntoDataSet(spark, ratingsPath, schemaLoader.getRatingSchema)
47 |
48 |       movieDF.printSchema()
49 |       ratingDF.printSchema()
50 |
51 |       // Requirement 1: the ten movies with the highest average rating (title and
52 |       // average) among movies that meet the 5000-rating threshold.
53 |       val bestFilmsByOverallRating = new BestFilmsByOverallRating
54 |       bestFilmsByOverallRating.run(movieDF, ratingDF, spark)
55 |
56 |       // Requirement 2: the average rating of every movie genre.
57 |       val genresByAverageRating = new GenresByAverageRating
58 |       genresByAverageRating.run(movieDF, ratingDF, spark)
59 |
60 |       // Requirement 3: the ten movies that were rated most often.
61 |       val mostRatedFilms = new MostRatedFilms
62 |       mostRatedFilms.run(movieDF, ratingDF, spark)
63 |     } finally {
64 |       spark.close()
65 |     }
66 |   }
67 |
68 |   /**
69 |    * Reads a CSV file with a header row into a DataFrame, applying the given schema.
70 |    *
71 |    * @param spark  Spark session used for reading
72 |    * @param path   location of the CSV file
73 |    * @param schema schema applied to the file's columns
74 |    * @return the file contents as a DataFrame
75 |    */
76 |   def readCsvIntoDataSet(spark: SparkSession, path: String, schema: StructType): org.apache.spark.sql.DataFrame =
77 |     spark.read
78 |       .format("csv")
79 |       .option("header", "true")
80 |       .schema(schema)
81 |       .load(path)
82 | }
83 |
--------------------------------------------------------------------------------
/src/main/scala/com/jmx/demos/Entry.scala:
--------------------------------------------------------------------------------
1 | package com.jmx.demos
2 |
3 | /**
4 | * @Created with IntelliJ IDEA.
5 | * @author : jmx
6 | * @Date: 2020/11/19
7 | * @Time: 15:57
8 | * */
9 | /** Empty placeholder class; it declares no members. */
10 | class Entry
11 |
12 | /**
13 |  * One row of the movies data set.
14 |  *
15 |  * @param movieId movie id
16 |  * @param title   movie title
17 |  * @param genres  movie genres
18 |  */
19 | case class Movies(movieId: String, title: String, genres: String)
20 |
21 | /**
22 |  * One row of the ratings data set.
23 |  *
24 |  * @param userId    user id
25 |  * @param movieId   movie id
26 |  * @param rating    rating given by the user
27 |  * @param timestamp timestamp of the rating
28 |  */
29 | case class Ratings(userId: String, movieId: String, rating: String, timestamp: String)
30 |
31 | /**
32 |  * Row of the MySQL result table for requirement 1.
33 |  *
34 |  * @param movieId   movie id
35 |  * @param title     movie title
36 |  * @param avgRating average rating of the movie
37 |  */
38 | case class tenGreatestMoviesByAverageRating(movieId: String, title: String, avgRating: String)
39 |
40 | /**
41 |  * Row of the MySQL result table for requirement 2.
42 |  *
43 |  * @param genres    movie genre
44 |  * @param avgRating average rating
45 |  */
46 | case class topGenresByAverageRating(genres: String, avgRating: String)
47 |
48 | /**
49 |  * Row of the MySQL result table for requirement 3.
50 |  *
51 |  * @param movieId   movie id
52 |  * @param title     movie title
53 |  * @param ratingCnt number of times the movie was rated
54 |  */
55 | case class tenMostRatedFilms(movieId: String, title: String, ratingCnt: String)
--------------------------------------------------------------------------------
/src/main/scala/com/jmx/demos/SchemaLoader.scala:
--------------------------------------------------------------------------------
1 | package com.jmx.demos
2 | import org.apache.spark.sql.types.{DataTypes, StructType}
3 | /**
4 | * @Created with IntelliJ IDEA.
5 | * @author : jmx
6 | * @Date: 2020/11/19
7 | * @Time: 15:17
8 | * */
9 |
10 |
11 | /** Provides the schemas used to read the movies and ratings CSV files. */
12 | class SchemaLoader {
13 |
14 |   /**
15 |    * Builds a schema from the given column names, in order; every column is a
16 |    * string declared as non-nullable.
17 |    */
18 |   private def stringColumns(names: String*): StructType =
19 |     names.foldLeft(new StructType()) { (schema, name) =>
20 |       schema.add(name, DataTypes.StringType, false)
21 |     }
22 |
23 |   private val movieSchema = stringColumns("movieId", "title", "genres")
24 |
25 |   private val ratingSchema = stringColumns("userId", "movieId", "rating", "timestamp")
26 |
27 |   /** @return schema of the movies file: movieId, title, genres */
28 |   def getMovieSchema: StructType = movieSchema
29 |
30 |   /** @return schema of the ratings file: userId, movieId, rating, timestamp */
31 |   def getRatingSchema: StructType = ratingSchema
32 | }
--------------------------------------------------------------------------------
/src/main/scala/com/jmx/metrics/BestFilmsByOverallRating.scala:
--------------------------------------------------------------------------------
1 | package com.jmx.metrics
2 |
3 | import com.jmx.demos.tenGreatestMoviesByAverageRating
4 | import com.jmx.util.JDBCUtil
5 | import org.apache.commons.dbutils.QueryRunner
6 | import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
7 |
8 |
9 | /**
10 |  * Requirement 1: among movies with at least 5000 ratings, find the ten with the highest
11 |  * average rating, together with their titles and average ratings.
12 |  */
13 | class BestFilmsByOverallRating extends Serializable {
14 |
15 |   /**
16 |    * Registers both inputs as temp views, runs the query, prints the result and writes
17 |    * every result row to MySQL.
18 |    *
19 |    * @param moviesDataset  movie rows, registered as temp view `movies`
20 |    * @param ratingsDataset rating rows, registered as temp view `ratings`
21 |    * @param spark          session that runs the SQL
22 |    */
23 |   def run(moviesDataset: DataFrame, ratingsDataset: DataFrame, spark: SparkSession): Unit = {
24 |     import spark.implicits._
25 |
26 |     moviesDataset.createOrReplaceTempView("movies")
27 |     ratingsDataset.createOrReplaceTempView("ratings")
28 |
29 |     // ratings_filter_cnt keeps movies with >= 5000 ratings; ratings_filter_score takes the
30 |     // ten highest averages; the final join adds the title.
31 |     val ressql1 =
32 |       """
33 |         |WITH ratings_filter_cnt AS (
34 |         |SELECT
35 |         | movieId,
36 |         | count( * ) AS rating_cnt,
37 |         | avg( rating ) AS avg_rating
38 |         |FROM
39 |         | ratings
40 |         |GROUP BY
41 |         | movieId
42 |         |HAVING
43 |         | count( * ) >= 5000
44 |         |),
45 |         |ratings_filter_score AS (
46 |         |SELECT
47 |         | movieId, -- 电影id
48 |         | avg_rating -- 电影平均评分
49 |         |FROM ratings_filter_cnt
50 |         |ORDER BY avg_rating DESC -- 平均评分降序排序
51 |         |LIMIT 10 -- 平均分较高的前十部电影
52 |         |)
53 |         |SELECT
54 |         | m.movieId,
55 |         | m.title,
56 |         | r.avg_rating AS avgRating
57 |         |FROM
58 |         | ratings_filter_score r
59 |         |JOIN movies m ON m.movieId = r.movieId
60 |       """.stripMargin
61 |
62 |     val resultDS = spark.sql(ressql1).as[tenGreatestMoviesByAverageRating]
63 |     resultDS.show(10)
64 |     resultDS.printSchema()
65 |
66 |     // Typed function value so the Scala overload of foreachPartition is selected.
67 |     val writePartition: Iterator[tenGreatestMoviesByAverageRating] => Unit =
68 |       rows => savePartition(rows)
69 |     resultDS.foreachPartition(writePartition)
70 |   }
71 |
72 |   /**
73 |    * Writes one partition of result rows to MySQL.
74 |    *
75 |    * Failures are thrown rather than handled with System.exit: exiting inside task code
76 |    * stops the whole JVM, whereas an exception fails the task and is reported by Spark.
77 |    *
78 |    * @param rows result rows of one partition
79 |    */
80 |   private def savePartition(rows: Iterator[tenGreatestMoviesByAverageRating]): Unit = {
81 |     if (rows.hasNext) {
82 |       // One QueryRunner per partition instead of one per row.
83 |       val runner = JDBCUtil.getQueryRunner().getOrElse {
84 |         throw new IllegalStateException("Mysql连接失败")
85 |       }
86 |       rows.foreach(row => upsert(row, runner))
87 |     }
88 |   }
89 |
90 |   /**
91 |    * Inserts or replaces one result row in `ten_movies_averagerating`.
92 |    *
93 |    * @param r    row to store
94 |    * @param conn runner used to execute the statement; errors raised by it propagate
95 |    */
96 |   private def upsert(r: tenGreatestMoviesByAverageRating, conn: QueryRunner): Unit = {
97 |     val sql =
98 |       """
99 |         |REPLACE INTO `ten_movies_averagerating`(
100 |         |movieId,
101 |         |title,
102 |         |avgRating
103 |         |)
104 |         |VALUES
105 |         |(?,?,?)
106 |       """.stripMargin
107 |     conn.update(sql, r.movieId, r.title, r.avgRating)
108 |   }
109 | }
110 |
--------------------------------------------------------------------------------
/src/main/scala/com/jmx/metrics/GenresByAverageRating.scala:
--------------------------------------------------------------------------------
1 | package com.jmx.metrics
2 |
3 | /**
4 | * @Created with IntelliJ IDEA.
5 | * @author : jmx
6 | * @Date: 2020/11/19
7 | * @Time: 15:21
8 | * */
9 |
10 | import com.jmx.demos.topGenresByAverageRating
11 | import com.jmx.util.JDBCUtil
12 | import org.apache.commons.dbutils.QueryRunner
13 | import org.apache.spark.sql.{DataFrame, SparkSession}
14 |
15 | /**
16 |  * Requirement 2: find every movie genre and its average rating.
17 |  */
18 | class GenresByAverageRating extends Serializable {
19 |
20 |   /**
21 |    * Registers both inputs as temp views, runs the query, prints the result and writes
22 |    * every genre to MySQL. A movie's `genres` value is split on `|`, so a movie listed
23 |    * under several genres counts towards each of them.
24 |    *
25 |    * @param moviesDataset  movie rows, registered as temp view `movies`
26 |    * @param ratingsDataset rating rows, registered as temp view `ratings`
27 |    * @param spark          session that runs the SQL
28 |    */
29 |   def run(moviesDataset: DataFrame, ratingsDataset: DataFrame, spark: SparkSession): Unit = {
30 |     import spark.implicits._
31 |
32 |     moviesDataset.createOrReplaceTempView("movies")
33 |     ratingsDataset.createOrReplaceTempView("ratings")
34 |
35 |     val ressql2 =
36 |       """
37 |         |WITH explode_movies AS (
38 |         |SELECT
39 |         | movieId,
40 |         | title,
41 |         | category
42 |         |FROM
43 |         | movies lateral VIEW explode ( split ( genres, "\\|" ) ) temp AS category
44 |         |)
45 |         |SELECT
46 |         | m.category AS genres,
47 |         | avg( r.rating ) AS avgRating
48 |         |FROM
49 |         | explode_movies m
50 |         | JOIN ratings r ON m.movieId = r.movieId
51 |         |GROUP BY
52 |         | m.category
53 |         | """.stripMargin
54 |
55 |     val resultDS = spark.sql(ressql2).as[topGenresByAverageRating]
56 |     resultDS.show(10)
57 |     resultDS.printSchema()
58 |
59 |     // Typed function value so the Scala overload of foreachPartition is selected.
60 |     val writePartition: Iterator[topGenresByAverageRating] => Unit =
61 |       rows => savePartition(rows)
62 |     resultDS.foreachPartition(writePartition)
63 |   }
64 |
65 |   /**
66 |    * Writes one partition of result rows to MySQL.
67 |    *
68 |    * Failures are thrown rather than handled with System.exit: exiting inside task code
69 |    * stops the whole JVM, whereas an exception fails the task and is reported by Spark.
70 |    *
71 |    * @param rows result rows of one partition
72 |    */
73 |   private def savePartition(rows: Iterator[topGenresByAverageRating]): Unit = {
74 |     if (rows.hasNext) {
75 |       // One QueryRunner per partition instead of one per row.
76 |       val runner = JDBCUtil.getQueryRunner().getOrElse {
77 |         throw new IllegalStateException("Mysql连接失败")
78 |       }
79 |       rows.foreach(row => upsert(row, runner))
80 |     }
81 |   }
82 |
83 |   /**
84 |    * Inserts or replaces one result row in `genres_average_rating`.
85 |    *
86 |    * @param r    row to store
87 |    * @param conn runner used to execute the statement; errors raised by it propagate
88 |    */
89 |   private def upsert(r: topGenresByAverageRating, conn: QueryRunner): Unit = {
90 |     val sql =
91 |       """
92 |         |REPLACE INTO `genres_average_rating`(
93 |         |genres,
94 |         |avgRating
95 |         |)
96 |         |VALUES
97 |         |(?,?)
98 |       """.stripMargin
99 |     conn.update(sql, r.genres, r.avgRating)
100 |   }
101 | }
102 |
--------------------------------------------------------------------------------
/src/main/scala/com/jmx/metrics/MostRatedFilms.scala:
--------------------------------------------------------------------------------
1 | package com.jmx.metrics
2 |
3 | import com.jmx.demos.tenMostRatedFilms
4 | import com.jmx.util.JDBCUtil
5 | import org.apache.commons.dbutils.QueryRunner
6 | import org.apache.spark.sql.{DataFrame, SparkSession}
7 | /**
8 | * @Created with IntelliJ IDEA.
9 | * @author : jmx
10 | * @Date: 2020/11/19
11 | * @Time: 15:23
12 | * */
13 | /**
14 |  * Requirement 3: find the ten movies that received the most ratings.
15 |  */
16 | class MostRatedFilms extends Serializable {
17 |
18 |   /**
19 |    * Registers both inputs as temp views, runs the query, prints the result and writes
20 |    * every result row to MySQL.
21 |    *
22 |    * @param moviesDataset  movie rows, registered as temp view `movies`
23 |    * @param ratingsDataset rating rows, registered as temp view `ratings`
24 |    * @param spark          session that runs the SQL
25 |    */
26 |   def run(moviesDataset: DataFrame, ratingsDataset: DataFrame, spark: SparkSession): Unit = {
27 |     import spark.implicits._
28 |
29 |     moviesDataset.createOrReplaceTempView("movies")
30 |     ratingsDataset.createOrReplaceTempView("ratings")
31 |
32 |     // rating_group counts ratings per movie; rating_filter keeps the ten largest counts;
33 |     // the final join adds the title.
34 |     val ressql3 =
35 |       """
36 |         |WITH rating_group AS (
37 |         | SELECT
38 |         | movieId,
39 |         | count( * ) AS ratingCnt
40 |         | FROM ratings
41 |         | GROUP BY movieId
42 |         |),
43 |         |rating_filter AS (
44 |         | SELECT
45 |         | movieId,
46 |         | ratingCnt
47 |         | FROM rating_group
48 |         | ORDER BY ratingCnt DESC
49 |         | LIMIT 10
50 |         |)
51 |         |SELECT
52 |         | m.movieId,
53 |         | m.title,
54 |         | r.ratingCnt
55 |         |FROM
56 |         | rating_filter r
57 |         |JOIN movies m ON r.movieId = m.movieId
58 |         |
59 |       """.stripMargin
60 |
61 |     val resultDS = spark.sql(ressql3).as[tenMostRatedFilms]
62 |     resultDS.show(10)
63 |     resultDS.printSchema()
64 |
65 |     // Typed function value so the Scala overload of foreachPartition is selected.
66 |     val writePartition: Iterator[tenMostRatedFilms] => Unit =
67 |       rows => savePartition(rows)
68 |     resultDS.foreachPartition(writePartition)
69 |   }
70 |
71 |   /**
72 |    * Writes one partition of result rows to MySQL.
73 |    *
74 |    * Failures are thrown rather than handled with System.exit: exiting inside task code
75 |    * stops the whole JVM, whereas an exception fails the task and is reported by Spark.
76 |    *
77 |    * @param rows result rows of one partition
78 |    */
79 |   private def savePartition(rows: Iterator[tenMostRatedFilms]): Unit = {
80 |     if (rows.hasNext) {
81 |       // One QueryRunner per partition instead of one per row.
82 |       val runner = JDBCUtil.getQueryRunner().getOrElse {
83 |         throw new IllegalStateException("Mysql连接失败")
84 |       }
85 |       rows.foreach(row => upsert(row, runner))
86 |     }
87 |   }
88 |
89 |   /**
90 |    * Inserts or replaces one result row in `ten_most_rated_films`.
91 |    *
92 |    * @param r    row to store
93 |    * @param conn runner used to execute the statement; errors raised by it propagate
94 |    */
95 |   private def upsert(r: tenMostRatedFilms, conn: QueryRunner): Unit = {
96 |     val sql =
97 |       """
98 |         |REPLACE INTO `ten_most_rated_films`(
99 |         |movieId,
100 |         |title,
101 |         |ratingCnt
102 |         |)
103 |         |VALUES
104 |         |(?,?,?)
105 |       """.stripMargin
106 |     conn.update(sql, r.movieId, r.title, r.ratingCnt)
107 |   }
108 | }
109 |
--------------------------------------------------------------------------------
/src/main/scala/com/jmx/util/JDBCUtil.scala:
--------------------------------------------------------------------------------
1 | package com.jmx.util
2 |
3 | import com.jmx.demos.tenGreatestMoviesByAverageRating
4 | import com.mchange.v2.c3p0.ComboPooledDataSource
5 | import org.apache.commons.dbutils.QueryRunner
6 |
7 | /**
8 | * @Created with IntelliJ IDEA.
9 | * @author : jmx
10 | * @Date: 2020/11/19
11 | * @Time: 17:54
12 | * */
13 | /** Holds the shared c3p0 connection pool and hands out QueryRunner instances backed by it. */
14 | object JDBCUtil {
15 |   val dataSource = new ComboPooledDataSource()
16 |
17 |   // Connection settings. Each one can be overridden with a JVM system property
18 |   // (jdbc.user / jdbc.password / jdbc.url) or, failing that, an environment variable
19 |   // (JDBC_USER / JDBC_PASSWORD / JDBC_URL); the literals are the previous hard-coded values.
20 |   // NOTE(review): credentials should not be kept in source control - supply them externally.
21 |   val user = setting("jdbc.user", "JDBC_USER", "root")
22 |   val password = setting("jdbc.password", "JDBC_PASSWORD", "920724")
23 |   val url = setting("jdbc.url", "JDBC_URL", "jdbc:mysql://localhost:3306/mydb")
24 |
25 |   dataSource.setUser(user)
26 |   dataSource.setPassword(password)
27 |   dataSource.setDriverClass("com.mysql.jdbc.Driver")
28 |   dataSource.setJdbcUrl(url)
29 |   dataSource.setAutoCommitOnClose(false)
30 |
31 |   /**
32 |    * Resolves one setting: system property first, then environment variable, then default.
33 |    *
34 |    * @param property name of the JVM system property
35 |    * @param envVar   name of the environment variable
36 |    * @param default  value used when neither is set
37 |    */
38 |   private def setting(property: String, envVar: String, default: String): String =
39 |     sys.props.get(property).orElse(sys.env.get(envVar)).getOrElse(default)
40 |
41 |   /**
42 |    * Creates a QueryRunner on top of the shared data source.
43 |    *
44 |    * @return `Some(runner)`, or `None` when creating the runner throws an exception
45 |    */
46 |   def getQueryRunner(): Option[QueryRunner] = {
47 |     try {
48 |       Some(new QueryRunner(dataSource))
49 |     } catch {
50 |       case e: Exception =>
51 |         e.printStackTrace()
52 |         None
53 |     }
54 |   }
55 | }
56 |
--------------------------------------------------------------------------------
/target/classes/com/jmx/demos/DemoMainApp$.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/DemoMainApp$.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/demos/DemoMainApp.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/DemoMainApp.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/demos/Entry.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/Entry.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/demos/Movies$.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/Movies$.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/demos/Movies.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/Movies.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/demos/Ratings$.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/Ratings$.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/demos/Ratings.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/Ratings.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/demos/SchemaLoader.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/SchemaLoader.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/demos/tenGreatestMoviesByAverageRating$.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/tenGreatestMoviesByAverageRating$.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/demos/tenGreatestMoviesByAverageRating.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/tenGreatestMoviesByAverageRating.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/demos/tenMostRatedFilms$.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/tenMostRatedFilms$.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/demos/tenMostRatedFilms.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/tenMostRatedFilms.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/demos/topGenresByAverageRating$.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/topGenresByAverageRating$.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/demos/topGenresByAverageRating.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/topGenresByAverageRating.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/metrics/BestFilmsByOverallRating$$anonfun$run$1$$anonfun$apply$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/BestFilmsByOverallRating$$anonfun$run$1$$anonfun$apply$1.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/metrics/BestFilmsByOverallRating$$anonfun$run$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/BestFilmsByOverallRating$$anonfun$run$1.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/metrics/BestFilmsByOverallRating$$typecreator4$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/BestFilmsByOverallRating$$typecreator4$1.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/metrics/BestFilmsByOverallRating.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/BestFilmsByOverallRating.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/metrics/GenresByAverageRating$$anonfun$run$1$$anonfun$apply$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/GenresByAverageRating$$anonfun$run$1$$anonfun$apply$1.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/metrics/GenresByAverageRating$$anonfun$run$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/GenresByAverageRating$$anonfun$run$1.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/metrics/GenresByAverageRating$$typecreator4$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/GenresByAverageRating$$typecreator4$1.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/metrics/GenresByAverageRating.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/GenresByAverageRating.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/metrics/MostRatedFilms$$anonfun$run$1$$anonfun$apply$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/MostRatedFilms$$anonfun$run$1$$anonfun$apply$1.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/metrics/MostRatedFilms$$anonfun$run$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/MostRatedFilms$$anonfun$run$1.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/metrics/MostRatedFilms$$typecreator4$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/MostRatedFilms$$typecreator4$1.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/metrics/MostRatedFilms.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/MostRatedFilms.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/util/JDBCUtil$.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/util/JDBCUtil$.class
--------------------------------------------------------------------------------
/target/classes/com/jmx/util/JDBCUtil.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/util/JDBCUtil.class
--------------------------------------------------------------------------------
/target/classes/create_table.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE `ten_movies_averagerating` ( -- result rows written by BestFilmsByOverallRating
2 |     `id`          INT(11)       NOT NULL AUTO_INCREMENT COMMENT '自增id',
3 |     `movieId`     INT(11)       NOT NULL COMMENT '电影id',
4 |     `title`       VARCHAR(100)  NOT NULL COMMENT '电影名称',
5 |     `avgRating`   DECIMAL(10,2) NOT NULL COMMENT '平均评分',
6 |     `update_time` DATETIME      DEFAULT CURRENT_TIMESTAMP COMMENT '更新时间',
7 |     PRIMARY KEY (`id`),
8 |     UNIQUE KEY `movie_id_UNIQUE` (`movieId`) -- REPLACE INTO keys on movieId
9 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
10 |
11 |
12 | -- -------------------------------------------------------------------------------
13 |
14 | CREATE TABLE `genres_average_rating` ( -- result rows written by GenresByAverageRating
15 |     `id`          INT(11)       NOT NULL AUTO_INCREMENT COMMENT '自增id',
16 |     `genres`      VARCHAR(100)  NOT NULL COMMENT '电影类别',
17 |     `avgRating`   DECIMAL(10,2) NOT NULL COMMENT '电影类别平均评分',
18 |     `update_time` DATETIME      DEFAULT CURRENT_TIMESTAMP COMMENT '更新时间',
19 |     PRIMARY KEY (`id`),
20 |     UNIQUE KEY `genres_UNIQUE` (`genres`) -- REPLACE INTO keys on genres
21 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
22 |
23 | -- ------------------------------------------------------------------------------
24 |
25 |
26 | CREATE TABLE `ten_most_rated_films` ( -- result rows written by MostRatedFilms
27 |     `id`          INT(11)      NOT NULL AUTO_INCREMENT COMMENT '自增id',
28 |     `movieId`     INT(11)      NOT NULL COMMENT '电影Id',
29 |     `title`       VARCHAR(100) NOT NULL COMMENT '电影名称',
30 |     `ratingCnt`   INT(11)      NOT NULL COMMENT '电影被评分的次数',
31 |     `update_time` DATETIME     DEFAULT CURRENT_TIMESTAMP COMMENT '更新时间',
32 |     PRIMARY KEY (`id`),
33 |     UNIQUE KEY `movie_id_UNIQUE` (`movieId`) -- REPLACE INTO keys on movieId
34 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
35 |
36 | -- ------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/classes/log4j2.properties:
--------------------------------------------------------------------------------
1 | # Log4j 2 setup: console appender "out" with a PatternLayout; org.springframework loggers at WARN; root logger at INFO, routed to "out".
2 | appender.out.type = Console
3 | appender.out.name = out
4 | appender.out.layout.type = PatternLayout
5 | appender.out.layout.pattern = [%30.30t] %-30.30c{1} %-5p %m%n
6 | logger.springframework.name = org.springframework
7 | logger.springframework.level = WARN
8 | rootLogger.level = INFO
9 | rootLogger.appenderRef.out.ref = out
10 |
--------------------------------------------------------------------------------