├── .idea
│   ├── compiler.xml
│   ├── encodings.xml
│   ├── hydra.xml
│   ├── libraries
│   │   └── R_User_Library.xml
│   ├── misc.xml
│   ├── sbt.xml
│   ├── scala_compiler.xml
│   ├── uiDesigner.xml
│   ├── vcs.xml
│   └── workspace.xml
├── ReadMe.md
├── pom.xml
├── src
│   └── main
│       ├── resources
│       │   ├── create_table.sql
│       │   ├── log4j2.properties
│       │   └── 代码结构.png
│       └── scala
│           └── com
│               └── jmx
│                   ├── demos
│                   │   ├── DemoMainApp.scala
│                   │   ├── Entry.scala
│                   │   └── SchemaLoader.scala
│                   ├── metrics
│                   │   ├── BestFilmsByOverallRating.scala
│                   │   ├── GenresByAverageRating.scala
│                   │   └── MostRatedFilms.scala
│                   └── util
│                       └── JDBCUtil.scala
└── target
    └── classes
        ├── com
        │   └── jmx
        │       ├── demos
        │       │   ├── DemoMainApp$.class
        │       │   ├── DemoMainApp.class
        │       │   ├── Entry.class
        │       │   ├── Movies$.class
        │       │   ├── Movies.class
        │       │   ├── Ratings$.class
        │       │   ├── Ratings.class
        │       │   ├── SchemaLoader.class
        │       │   ├── tenGreatestMoviesByAverageRating$.class
        │       │   ├── tenGreatestMoviesByAverageRating.class
        │       │   ├── tenMostRatedFilms$.class
        │       │   ├── tenMostRatedFilms.class
        │       │   ├── topGenresByAverageRating$.class
        │       │   └── topGenresByAverageRating.class
        │       ├── metrics
        │       │   ├── BestFilmsByOverallRating$$anonfun$run$1$$anonfun$apply$1.class
        │       │   ├── BestFilmsByOverallRating$$anonfun$run$1.class
        │       │   ├── BestFilmsByOverallRating$$typecreator4$1.class
        │       │   ├── BestFilmsByOverallRating.class
        │       │   ├── GenresByAverageRating$$anonfun$run$1$$anonfun$apply$1.class
        │       │   ├── GenresByAverageRating$$anonfun$run$1.class
        │       │   ├── GenresByAverageRating$$typecreator4$1.class
        │       │   ├── GenresByAverageRating.class
        │       │   ├── MostRatedFilms$$anonfun$run$1$$anonfun$apply$1.class
        │       │   ├── MostRatedFilms$$anonfun$run$1.class
        │       │   ├── MostRatedFilms$$typecreator4$1.class
        │       │   └── MostRatedFilms.class
        │       └── util
        │           ├── JDBCUtil$.class
        │           └── JDBCUtil.class
        ├── create_table.sql
        └── log4j2.properties

--------------------------------------------------------------------------------
/ReadMe.md:
--------------------------------------------------------------------------------

## Project Overview

### Dataset

The project uses the MovieLens ml-25m.zip dataset; the files used are **movies.csv** and **ratings.csv**. The dataset can be downloaded from:

```bash
http://files.grouplens.org/datasets/movielens/ml-25m.zip
```

- **movies.csv**

Movie data, used as the dimension table. It is about 2.89 MB and covers more than 60,000 movies. Each comma-separated record has the format [movieId,title,genres], i.e. **[movie id, movie title, movie genres]**. Sample row:

```bash
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
```

- **ratings.csv**

Movie rating data, used as the fact table. It is about 646 MB. Each comma-separated record has the format [userId,movieId,rating,timestamp], i.e. **[user id, movie id, rating, timestamp]**. Sample row:

```bash
1,296,5,1147880044
```

### Project Code Structure

![](https://github.com/jiamx/spark_project_practise/blob/master/src/main/resources/%E4%BB%A3%E7%A0%81%E7%BB%93%E6%9E%84.png)

## Requirements

- Requirement 1: find the ten movies with the highest average rating among those rated at least 5,000 times, returning each title and its average rating

- Requirement 2: compute the average rating for each movie genre

- Requirement 3: find the ten most frequently rated movies

## Code Walkthrough

- **DemoMainApp**

The entry point of the program. It loads the data sources, converts them to DataFrames, and invokes the encapsulated business-logic classes.

```scala
object DemoMainApp {
  // input file paths
  private val MOVIES_CSV_FILE_PATH = "file:///e:/movies.csv"
  private val RATINGS_CSV_FILE_PATH = "file:///e:/ratings.csv"

  def main(args: Array[String]): Unit = {
    // create the SparkSession
    val spark = SparkSession
      .builder
      .master("local[4]")
      .getOrCreate
    // schema definitions
    val schemaLoader = new SchemaLoader
    // load the movies dataset
    val movieDF = readCsvIntoDataSet(spark, MOVIES_CSV_FILE_PATH, schemaLoader.getMovieSchema)
    // load the ratings dataset
    val ratingDF = readCsvIntoDataSet(spark, RATINGS_CSV_FILE_PATH, schemaLoader.getRatingSchema)

    // Requirement 1: ten movies with the highest average rating among those rated at least 5,000 times
    val bestFilmsByOverallRating = new BestFilmsByOverallRating
    //bestFilmsByOverallRating.run(movieDF, ratingDF, spark)

    // Requirement 2: average rating per movie genre
    val genresByAverageRating = new GenresByAverageRating
    //genresByAverageRating.run(movieDF, ratingDF, spark)

    // Requirement 3: ten most frequently rated movies
    val mostRatedFilms = new MostRatedFilms
    mostRatedFilms.run(movieDF, ratingDF, spark)

    spark.close()
  }

  /**
    * Read a CSV data file into a DataFrame
    *
    * @param spark
    * @param path
    * @param schema
    * @return
    */
  def readCsvIntoDataSet(spark: SparkSession, path: String, schema: StructType) = {

    val dataSet = spark.read
      .format("csv")
      .option("header", "true")
      .schema(schema)
      .load(path)
    dataSet
  }
}
```
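
The CSV paths above are hard-coded to a local Windows drive. As a variation (a sketch only — `ConfigurableMainApp` is not part of the project), the paths can be taken from the command line so the same build also runs against HDFS, while the master is left to `spark-submit`:

```scala
import com.jmx.demos.SchemaLoader
import com.jmx.metrics.MostRatedFilms
import org.apache.spark.sql.SparkSession

object ConfigurableMainApp {
  def main(args: Array[String]): Unit = {
    // expected arguments: <moviesPath> <ratingsPath>
    require(args.length >= 2, "usage: ConfigurableMainApp <moviesPath> <ratingsPath>")
    val Array(moviesPath, ratingsPath) = args.take(2)

    // no hard-coded master: pass --master local[4] (or yarn) to spark-submit instead
    val spark = SparkSession.builder.appName("movie-metrics").getOrCreate()

    val schemaLoader = new SchemaLoader
    val movieDF = spark.read.format("csv").option("header", "true")
      .schema(schemaLoader.getMovieSchema).load(moviesPath)
    val ratingDF = spark.read.format("csv").option("header", "true")
      .schema(schemaLoader.getRatingSchema).load(ratingsPath)

    new MostRatedFilms().run(movieDF, ratingDF, spark)

    spark.close()
  }
}
```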

- **Entry**

Entity definitions: the case classes for the source datasets and for the MySQL result tables.

```scala
class Entry {

}

case class Movies(
    movieId: String, // movie id
    title: String, // movie title
    genres: String // movie genres
)

case class Ratings(
    userId: String, // user id
    movieId: String, // movie id
    rating: String, // rating given by the user
    timestamp: String // timestamp
)

// MySQL result table for requirement 1
case class tenGreatestMoviesByAverageRating(
    movieId: String, // movie id
    title: String, // movie title
    avgRating: String // average rating
)

// MySQL result table for requirement 2
case class topGenresByAverageRating(
    genres: String, // movie genre
    avgRating: String // average rating
)

// MySQL result table for requirement 3
case class tenMostRatedFilms(
    movieId: String, // movie id
    title: String, // movie title
    ratingCnt: String // number of times the movie was rated
)
```

- **SchemaLoader**

Encapsulates the schema information of the two datasets; the schemas are supplied when the data sources are read.

```scala
class SchemaLoader {
  // schema of the movies dataset
  private val movieSchema = new StructType()
    .add("movieId", DataTypes.StringType, false)
    .add("title", DataTypes.StringType, false)
    .add("genres", DataTypes.StringType, false)
  // schema of the ratings dataset
  private val ratingSchema = new StructType()
    .add("userId", DataTypes.StringType, false)
    .add("movieId", DataTypes.StringType, false)
    .add("rating", DataTypes.StringType, false)
    .add("timestamp", DataTypes.StringType, false)

  def getMovieSchema: StructType = movieSchema

  def getRatingSchema: StructType = ratingSchema
}
```

- **JDBCUtil**

Encapsulates the MySQL connection logic. The business-logic classes use this utility to obtain a connection and write their result data to MySQL.

```scala
object JDBCUtil {
  val dataSource = new ComboPooledDataSource()
  val user = "root"
  val password = "123qwe"
  val url = "jdbc:mysql://localhost:3306/mydb"

  dataSource.setUser(user)
  dataSource.setPassword(password)
  dataSource.setDriverClass("com.mysql.jdbc.Driver")
  dataSource.setJdbcUrl(url)
  dataSource.setAutoCommitOnClose(false)

  // obtain a QueryRunner backed by the connection pool
  def getQueryRunner(): Option[QueryRunner] = {
    try {
      Some(new QueryRunner(dataSource))
    } catch {
      case e: Exception =>
        e.printStackTrace()
        None
    }
  }
}
```
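
getQueryRunner() returns an Option, so callers have to handle the no-connection case explicitly. A minimal usage sketch (the row values here are purely illustrative):

```scala
import com.jmx.util.JDBCUtil

object JdbcSmokeTest {
  def main(args: Array[String]): Unit = {
    JDBCUtil.getQueryRunner() match {
      case Some(runner) =>
        // QueryRunner.update takes the SQL text plus positional parameters
        val affected = runner.update(
          "REPLACE INTO `ten_most_rated_films`(movieId, title, ratingCnt) VALUES (?,?,?)",
          "1", "Toy Story (1995)", "0")
        println(s"rows affected: $affected")
      case None =>
        println("could not obtain a MySQL connection")
        System.exit(-1)
    }
  }
}
```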

### Requirement 1: Implementation

- **BestFilmsByOverallRating**

Business-logic wrapper for requirement 1. The class has a single run() method that encapsulates the computation.

```scala
/**
  * Requirement 1: find the ten movies with the highest average rating among those rated at least 5,000 times
  */
class BestFilmsByOverallRating extends Serializable {

  def run(moviesDataset: DataFrame, ratingsDataset: DataFrame, spark: SparkSession) = {
    import spark.implicits._

    // register moviesDataset as a temporary view
    moviesDataset.createOrReplaceTempView("movies")
    // register ratingsDataset as a temporary view
    ratingsDataset.createOrReplaceTempView("ratings")

    // query SQL
    val ressql1 =
      """
        |WITH ratings_filter_cnt AS (
        |SELECT
        |    movieId,
        |    count( * ) AS rating_cnt,
        |    avg( rating ) AS avg_rating
        |FROM
        |    ratings
        |GROUP BY
        |    movieId
        |HAVING
        |    count( * ) >= 5000
        |),
        |ratings_filter_score AS (
        |SELECT
        |    movieId, -- movie id
        |    avg_rating -- average rating
        |FROM ratings_filter_cnt
        |ORDER BY avg_rating DESC -- order by average rating, descending
        |LIMIT 10 -- top ten by average rating
        |)
        |SELECT
        |    m.movieId,
        |    m.title,
        |    r.avg_rating AS avgRating
        |FROM
        |    ratings_filter_score r
        |JOIN movies m ON m.movieId = r.movieId
      """.stripMargin

    val resultDS = spark.sql(ressql1).as[tenGreatestMoviesByAverageRating]
    // print the result
    resultDS.show(10)
    resultDS.printSchema()
    // write to MySQL
    resultDS.foreachPartition(par => par.foreach(insert2Mysql(_)))
  }

  /**
    * Obtain a connection and delegate to the MySQL write
    *
    * @param res
    */
  private def insert2Mysql(res: tenGreatestMoviesByAverageRating): Unit = {
    lazy val conn = JDBCUtil.getQueryRunner()
    conn match {
      case Some(connection) => {
        upsert(res, connection)
      }
      case None => {
        println("Mysql连接失败")
        System.exit(-1)
      }
    }
  }

  /**
    * Write one result row to MySQL
    *
    * @param r
    * @param conn
    */
  private def upsert(r: tenGreatestMoviesByAverageRating, conn: QueryRunner): Unit = {
    try {
      val sql =
        s"""
           |REPLACE INTO `ten_movies_averagerating`(
           |movieId,
           |title,
           |avgRating
           |)
           |VALUES
           |(?,?,?)
        """.stripMargin
      // execute the upsert
      conn.update(
        sql,
        r.movieId,
        r.title,
        r.avgRating
      )
    } catch {
      case e: Exception => {
        e.printStackTrace()
        System.exit(-1)
      }
    }
  }
}
```
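
For comparison, the same metric can also be expressed with the DataFrame API instead of SQL. This is only a sketch of an equivalent query (not code from the project), assuming the movieDF/ratingDF frames produced by DemoMainApp:

```scala
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

def tenBestRatedFilms(movieDF: DataFrame, ratingDF: DataFrame): DataFrame = {
  ratingDF
    .groupBy("movieId")
    .agg(count("movieId").as("rating_cnt"), avg("rating").as("avgRating"))
    .filter(col("rating_cnt") >= 5000)   // same threshold as the HAVING clause
    .orderBy(col("avgRating").desc)
    .limit(10)
    .join(movieDF, Seq("movieId"))       // attach the title
    .select("movieId", "title", "avgRating")
}
```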

### Requirement 1: Results

- DDL for the result table

```sql
CREATE TABLE `ten_movies_averagerating` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增id',
  `movieId` int(11) NOT NULL COMMENT '电影id',
  `title` varchar(100) NOT NULL COMMENT '电影名称',
  `avgRating` decimal(10,2) NOT NULL COMMENT '平均评分',
  `update_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '更新时间',
  PRIMARY KEY (`id`),
  UNIQUE KEY `movie_id_UNIQUE` (`movieId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
```

- Results

The ten movies with the highest average rating:

| movieId | title                                       | avgRating |
| ------- | ------------------------------------------- | --------- |
| 318     | Shawshank Redemption, The (1994)            | 4.41      |
| 858     | Godfather, The (1972)                       | 4.32      |
| 50      | Usual Suspects, The (1995)                  | 4.28      |
| 1221    | Godfather: Part II, The (1974)              | 4.26      |
| 527     | Schindler's List (1993)                     | 4.25      |
| 2019    | Seven Samurai (Shichinin no samurai) (1954) | 4.25      |
| 904     | Rear Window (1954)                          | 4.24      |
| 1203    | 12 Angry Men (1957)                         | 4.24      |
| 2959    | Fight Club (1999)                           | 4.23      |
| 1193    | One Flew Over the Cuckoo's Nest (1975)      | 4.22      |

The Chinese titles of the movies above:

| English title                               | Chinese title |
| ------------------------------------------- | ------------- |
| Shawshank Redemption, The (1994)            | 肖申克的救赎   |
| Godfather, The (1972)                       | 教父1         |
| Usual Suspects, The (1995)                  | 非常嫌疑犯     |
| Godfather: Part II, The (1974)              | 教父2         |
| Schindler's List (1993)                     | 辛德勒的名单   |
| Seven Samurai (Shichinin no samurai) (1954) | 七武士        |
| Rear Window (1954)                          | 后窗          |
| 12 Angry Men (1957)                         | 十二怒汉       |
| Fight Club (1999)                           | 搏击俱乐部     |
| One Flew Over the Cuckoo's Nest (1975)      | 飞越疯人院     |

### Requirement 2: Implementation

- **GenresByAverageRating**

Business-logic wrapper for requirement 2. The class has a single run() method that encapsulates the computation.

```scala
/**
  * Requirement 2: compute the average rating for each movie genre
  */
class GenresByAverageRating extends Serializable {
  def run(moviesDataset: DataFrame, ratingsDataset: DataFrame, spark: SparkSession) = {
    import spark.implicits._
    // register moviesDataset as a temporary view
    moviesDataset.createOrReplaceTempView("movies")
    // register ratingsDataset as a temporary view
    ratingsDataset.createOrReplaceTempView("ratings")

    val ressql2 =
      """
        |WITH explode_movies AS (
        |SELECT
        |    movieId,
        |    title,
        |    category
        |FROM
        |    movies lateral VIEW explode ( split ( genres, "\\|" ) ) temp AS category
        |)
        |SELECT
        |    m.category AS genres,
        |    avg( r.rating ) AS avgRating
        |FROM
        |    explode_movies m
        |    JOIN ratings r ON m.movieId = r.movieId
        |GROUP BY
        |    m.category
      """.stripMargin

    val resultDS = spark.sql(ressql2).as[topGenresByAverageRating]

    // print the result
    resultDS.show(10)
    resultDS.printSchema()
    // write to MySQL
    resultDS.foreachPartition(par => par.foreach(insert2Mysql(_)))

  }

  /**
    * Obtain a connection and delegate to the MySQL write
    *
    * @param res
    */
  private def insert2Mysql(res: topGenresByAverageRating): Unit = {
    lazy val conn = JDBCUtil.getQueryRunner()
    conn match {
      case Some(connection) => {
        upsert(res, connection)
      }
      case None => {
        println("Mysql连接失败")
        System.exit(-1)
      }
    }
  }

  /**
    * Write one result row to MySQL
    *
    * @param r
    * @param conn
    */
  private def upsert(r: topGenresByAverageRating, conn: QueryRunner): Unit = {
    try {
      val sql =
        s"""
           |REPLACE INTO `genres_average_rating`(
           |genres,
           |avgRating
           |)
           |VALUES
           |(?,?)
        """.stripMargin
      // execute the upsert
      conn.update(
        sql,
        r.genres,
        r.avgRating
      )
    } catch {
      case e: Exception => {
        e.printStackTrace()
        System.exit(-1)
      }
    }
  }
}
```
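
The lateral VIEW explode in ressql2 turns the pipe-separated genres string into one row per genre before joining with the ratings. The same step with DataFrame functions, as an illustrative sketch:

```scala
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

def genresByAvgRating(movieDF: DataFrame, ratingDF: DataFrame): DataFrame = {
  movieDF
    // "Adventure|Animation|Children" -> one row per genre
    .withColumn("genres", explode(split(col("genres"), "\\|")))
    .select("movieId", "genres")
    .join(ratingDF, Seq("movieId"))
    .groupBy("genres")
    .agg(avg("rating").as("avgRating"))
    .orderBy(col("avgRating").desc)
}
```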

### Requirement 2: Results

- DDL for the result table

```sql
CREATE TABLE genres_average_rating (
  `id` INT ( 11 ) NOT NULL AUTO_INCREMENT COMMENT '自增id',
  `genres` VARCHAR ( 100 ) NOT NULL COMMENT '电影类别',
  `avgRating` DECIMAL ( 10, 2 ) NOT NULL COMMENT '电影类别平均评分',
  `update_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '更新时间',
  PRIMARY KEY ( `id` ),
  UNIQUE KEY `genres_UNIQUE` ( `genres` )
) ENGINE = INNODB DEFAULT CHARSET = utf8;
```

- Results

There are 20 genres in total; the average rating of each genre is:

| genres             | avgRating |
| ------------------ | --------- |
| Film-Noir          | 3.93      |
| War                | 3.79      |
| Documentary        | 3.71      |
| Crime              | 3.69      |
| Drama              | 3.68      |
| Mystery            | 3.67      |
| Animation          | 3.61      |
| IMAX               | 3.6       |
| Western            | 3.59      |
| Musical            | 3.55      |
| Romance            | 3.54      |
| Adventure          | 3.52      |
| Thriller           | 3.52      |
| Fantasy            | 3.51      |
| Sci-Fi             | 3.48      |
| Action             | 3.47      |
| Children           | 3.43      |
| Comedy             | 3.42      |
| (no genres listed) | 3.33      |
| Horror             | 3.29      |

The Chinese names of the genres:

| Genre              | Chinese name |
| ------------------ | ------------ |
| Film-Noir          | 黑色电影      |
| War                | 战争         |
| Documentary        | 纪录片        |
| Crime              | 犯罪         |
| Drama              | 历史剧        |
| Mystery            | 推理         |
| Animation          | 动画片        |
| IMAX               | 巨幕电影      |
| Western            | 西部电影      |
| Musical            | 音乐         |
| Romance            | 浪漫         |
| Adventure          | 冒险         |
| Thriller           | 惊悚片        |
| Fantasy            | 魔幻电影      |
| Sci-Fi             | 科幻         |
| Action             | 动作         |
| Children           | 儿童         |
| Comedy             | 喜剧         |
| (no genres listed) | 未分类        |
| Horror             | 恐怖         |

### Requirement 3: Implementation

- **MostRatedFilms**

Business-logic wrapper for requirement 3. The class has a single run() method that encapsulates the computation.

```scala
/**
  * Requirement 3: find the ten most frequently rated movies.
  */
class MostRatedFilms extends Serializable {
  def run(moviesDataset: DataFrame, ratingsDataset: DataFrame, spark: SparkSession) = {

    import spark.implicits._

    // register moviesDataset as a temporary view
    moviesDataset.createOrReplaceTempView("movies")
    // register ratingsDataset as a temporary view
    ratingsDataset.createOrReplaceTempView("ratings")

    val ressql3 =
      """
        |WITH rating_group AS (
        |    SELECT
        |        movieId,
        |        count( * ) AS ratingCnt
        |    FROM ratings
        |    GROUP BY movieId
        |),
        |rating_filter AS (
        |    SELECT
        |        movieId,
        |        ratingCnt
        |    FROM rating_group
        |    ORDER BY ratingCnt DESC
        |    LIMIT 10
        |)
        |SELECT
        |    m.movieId,
        |    m.title,
        |    r.ratingCnt
        |FROM
        |    rating_filter r
        |JOIN movies m ON r.movieId = m.movieId
        |
      """.stripMargin

    val resultDS = spark.sql(ressql3).as[tenMostRatedFilms]
    // print the result
    resultDS.show(10)
    resultDS.printSchema()
    // write to MySQL
    resultDS.foreachPartition(par => par.foreach(insert2Mysql(_)))

  }

  /**
    * Obtain a connection and delegate to the MySQL write
    *
    * @param res
    */
  private def insert2Mysql(res: tenMostRatedFilms): Unit = {
    lazy val conn = JDBCUtil.getQueryRunner()
    conn match {
      case Some(connection) => {
        upsert(res, connection)
      }
      case None => {
        println("Mysql连接失败")
        System.exit(-1)
      }
    }
  }

  /**
    * Write one result row to MySQL
    *
    * @param r
    * @param conn
    */
  private def upsert(r: tenMostRatedFilms, conn: QueryRunner): Unit = {
    try {
      val sql =
        s"""
           |REPLACE INTO `ten_most_rated_films`(
           |movieId,
           |title,
           |ratingCnt
           |)
           |VALUES
           |(?,?,?)
        """.stripMargin
      // execute the upsert
      conn.update(
        sql,
        r.movieId,
        r.title,
        r.ratingCnt
      )
    } catch {
      case e: Exception => {
        e.printStackTrace()
        System.exit(-1)
      }
    }
  }

}
```
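
All three jobs write row by row through foreachPartition plus REPLACE INTO. When idempotent re-runs are not needed, Spark's built-in JDBC sink can write a whole result DataFrame in one call — a sketch only, reusing the connection settings from JDBCUtil (note that it appends or overwrites; it has no REPLACE INTO semantics):

```scala
import java.util.Properties
import org.apache.spark.sql.{DataFrame, SaveMode}

def writeResultToMySql(result: DataFrame, table: String): Unit = {
  val props = new Properties()
  props.setProperty("user", "root")
  props.setProperty("password", "123qwe")
  props.setProperty("driver", "com.mysql.jdbc.Driver")

  result.write
    .mode(SaveMode.Append)   // Overwrite would drop and recreate the table, losing the DDL below
    .jdbc("jdbc:mysql://localhost:3306/mydb", table, props)
}
```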

### Requirement 3: Results

- DDL for the result table

```sql
CREATE TABLE ten_most_rated_films (
  `id` INT ( 11 ) NOT NULL AUTO_INCREMENT COMMENT '自增id',
  `movieId` INT ( 11 ) NOT NULL COMMENT '电影Id',
  `title` varchar(100) NOT NULL COMMENT '电影名称',
  `ratingCnt` INT(11) NOT NULL COMMENT '电影被评分的次数',
  `update_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '更新时间',
  PRIMARY KEY ( `id` ),
  UNIQUE KEY `movie_id_UNIQUE` ( `movieId` )
) ENGINE = INNODB DEFAULT CHARSET = utf8;
```

- Results

The ten most frequently rated movies:

| movieId | title                                     | ratingCnt |
| ------- | ----------------------------------------- | --------- |
| 356     | Forrest Gump (1994)                       | 81491     |
| 318     | Shawshank Redemption, The (1994)          | 81482     |
| 296     | Pulp Fiction (1994)                       | 79672     |
| 593     | Silence of the Lambs, The (1991)          | 74127     |
| 2571    | Matrix, The (1999)                        | 72674     |
| 260     | Star Wars: Episode IV - A New Hope (1977) | 68717     |
| 480     | Jurassic Park (1993)                      | 64144     |
| 527     | Schindler's List (1993)                   | 60411     |
| 110     | Braveheart (1995)                         | 59184     |
| 2959    | Fight Club (1999)                         | 58773     |

The Chinese titles of the most-rated movies:

| English title                             | Chinese title |
| ----------------------------------------- | ------------- |
| Forrest Gump (1994)                       | 阿甘正传      |
| Shawshank Redemption, The (1994)          | 肖申克的救赎   |
| Pulp Fiction (1994)                       | 低俗小说      |
| Silence of the Lambs, The (1991)          | 沉默的羔羊     |
| Matrix, The (1999)                        | 黑客帝国      |
| Star Wars: Episode IV - A New Hope (1977) | 星球大战      |
| Jurassic Park (1993)                      | 侏罗纪公园     |
| Schindler's List (1993)                   | 辛德勒的名单   |
| Braveheart (1995)                         | 勇敢的心      |
| Fight Club (1999)                         | 搏击俱乐部     |

--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | 2 | 4 | 5 | 4.0.0 6 | 7 | com.jmx 8 | sparkproject 9 | jar 10 | 1.0-SNAPSHOT 11 | 12 | 2008 13 | 14 | 15 | 2.11.8 16 | 17 | 18 | 19 | 20 | scala-tools.org 21 | Scala-Tools Maven2 Repository 22 | http://scala-tools.org/repo-releases 23 | 24 | 25 | 26 | 27 | 28 | scala-tools.org 29 | Scala-Tools Maven2 Repository 30 | http://scala-tools.org/repo-releases 31 | 32 | 33 | 34 | 35 | 36 | org.scala-lang 37 | scala-library 38 | ${scala.version} 39 | 40 | 41 | c3p0 42 | c3p0 43 | 0.9.1.2 44 | 45 | 46 | 47 | commons-dbutils 48 | commons-dbutils 49 | 1.6 50 | 51 | 52 | junit 53 | junit 54 | 4.12 55 | test 56 | 57 | 58 | org.specs 59 | specs 60 | 1.2.5 61 | test 62 | 63 | 64 | 65 | org.apache.spark 66 | spark-core_2.11 67 | 2.3.0 68 | 69 | 70 | 71 | org.apache.spark 72 | spark-sql_2.11 73 | 2.3.0 74 | 75 | 76 | 77 | org.apache.spark 78 | spark-streaming_2.11 79 | 2.3.0 80 | 81 | 82 | 83 | 84 | org.apache.spark 85 | spark-mllib_2.11 86 | 2.3.0 87 | 88 | 89 | 90 | 91 | org.apache.spark 92 | spark-streaming-kafka-0-10_2.11 93 | 2.3.0 94 | 95 | 96 | 97 | org.apache.spark 98 | spark-hive_2.11 99 | 2.3.0 100 | 101 | 102 | 103 | 104 | mysql 105 | mysql-connector-java 106 | 5.1.39 107 | 108 | 109 | 110 | org.apache.hadoop 111 | hadoop-common 112 | 2.7.7 113 | 114 | 115 | 116 | org.apache.hadoop 117 | hadoop-client 118 | 2.7.7 119 | 120 | 121 | 122 |
org.apache.hadoop 123 | hadoop-hdfs 124 | 2.7.7 125 | 126 | 127 | org.apache.avro 128 | avro-tools 129 | 1.8.1 130 | 131 | 132 | 133 | 134 | org.apache.hive 135 | hive-cli 136 | 2.3.4 137 | 138 | 139 | 140 | org.apache.hive 141 | hive-exec 142 | 2.3.4 143 | 144 | 145 | org.apache.commons 146 | commons-dbcp2 147 | 2.1.1 148 | 149 | 150 | redis.clients 151 | jedis 152 | 2.8.0 153 | 154 | 155 | ru.yandex.clickhouse 156 | clickhouse-jdbc 157 | 0.2.4 158 | 159 | 160 | 161 | com.google.guava 162 | guava 163 | 28.0-jre 164 | 165 | 166 | 167 | 168 | 169 | src/main/scala 170 | src/test/scala 171 | 172 | 173 | org.scala-tools 174 | maven-scala-plugin 175 | 176 | 177 | 178 | compile 179 | testCompile 180 | 181 | 182 | 183 | 184 | ${scala.version} 185 | 186 | -target:jvm-1.5 187 | 188 | 189 | 190 | 191 | org.apache.maven.plugins 192 | maven-eclipse-plugin 193 | 194 | true 195 | 196 | ch.epfl.lamp.sdt.core.scalabuilder 197 | 198 | 199 | ch.epfl.lamp.sdt.core.scalanature 200 | 201 | 202 | org.eclipse.jdt.launching.JRE_CONTAINER 203 | ch.epfl.lamp.sdt.launching.SCALA_CONTAINER 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | org.scala-tools 213 | maven-scala-plugin 214 | 215 | ${scala.version} 216 | 217 | 218 | 219 | 220 | 221 | 222 | -------------------------------------------------------------------------------- /src/main/resources/create_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE `ten_movies_averagerating` ( 2 | `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增id', 3 | `movieId` int(11) NOT NULL COMMENT '电影id', 4 | `title` varchar(100) NOT NULL COMMENT '电影名称', 5 | `avgRating` decimal(10,2) NOT NULL COMMENT '平均评分', 6 | `update_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '更新时间', 7 | PRIMARY KEY (`id`), 8 | UNIQUE KEY `movie_id_UNIQUE` (`movieId`) 9 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 10 | 11 | 12 | -- ------------------------------------------------------------------------------- 13 | 14 | CREATE TABLE genres_average_rating ( 15 | `id` INT ( 11 ) NOT NULL AUTO_INCREMENT COMMENT '自增id', 16 | `genres` VARCHAR ( 100 ) NOT NULL COMMENT '电影类别', 17 | `avgRating` DECIMAL ( 10, 2 ) NOT NULL COMMENT '电影类别平均评分', 18 | `update_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '更新时间', 19 | PRIMARY KEY ( `id` ), 20 | UNIQUE KEY `genres_UNIQUE` ( `genres` ) 21 | ) ENGINE = INNODB DEFAULT CHARSET = utf8; 22 | 23 | -- ------------------------------------------------------------------------------ 24 | 25 | 26 | CREATE TABLE ten_most_rated_films ( 27 | `id` INT ( 11 ) NOT NULL AUTO_INCREMENT COMMENT '自增id', 28 | `movieId` INT ( 11 ) NOT NULL COMMENT '电影Id', 29 | `title` varchar(100) NOT NULL COMMENT '电影名称', 30 | `ratingCnt` INT(11) NOT NULL COMMENT '电影被评分的次数', 31 | `update_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '更新时间', 32 | PRIMARY KEY ( `id` ), 33 | UNIQUE KEY `movie_id_UNIQUE` ( `movieId` ) 34 | ) ENGINE = INNODB DEFAULT CHARSET = utf8; 35 | 36 | -- ------------------------------------------------------------------------------ -------------------------------------------------------------------------------- /src/main/resources/log4j2.properties: -------------------------------------------------------------------------------- 1 | 2 | appender.out.type = Console 3 | appender.out.name = out 4 | appender.out.layout.type = PatternLayout 5 | appender.out.layout.pattern = [%30.30t] %-30.30c{1} %-5p %m%n 6 | logger.springframework.name = org.springframework 7 | logger.springframework.level = WARN 8 | rootLogger.level = INFO 9 | 
rootLogger.appenderRef.out.ref = out 10 | -------------------------------------------------------------------------------- /src/main/resources/代码结构.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/src/main/resources/代码结构.png -------------------------------------------------------------------------------- /src/main/scala/com/jmx/demos/DemoMainApp.scala: -------------------------------------------------------------------------------- 1 | package com.jmx.demos 2 | 3 | import com.jmx.metrics.{BestFilmsByOverallRating, GenresByAverageRating, MostRatedFilms} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.types.StructType 6 | 7 | /** 8 | *  @Created with IntelliJ IDEA. 9 | *  @author : jmx 10 | *  @Date: 2020/11/19 11 | *  @Time: 16:27 12 | *  */ 13 | object DemoMainApp { 14 | // 文件路径 15 | private val MOVIES_CSV_FILE_PATH = "file:///e:/movies.csv" 16 | private val RATINGS_CSV_FILE_PATH = "file:///e:/ratings.csv" 17 | 18 | 19 | def main(args: Array[String]): Unit = { 20 | // 创建spark session 21 | val spark = SparkSession 22 | .builder 23 | .master("local[4]") 24 | .getOrCreate 25 | // schema信息 26 | val schemaLoader = new SchemaLoader 27 | // 读取Movie数据集 28 | val movieDF = readCsvIntoDataSet(spark, MOVIES_CSV_FILE_PATH, schemaLoader.getMovieSchema) 29 | 30 | // 读取Rating数据集 31 | val ratingDF = readCsvIntoDataSet(spark, RATINGS_CSV_FILE_PATH, schemaLoader.getRatingSchema) 32 | 33 | movieDF.printSchema() 34 | ratingDF.printSchema() 35 | 36 | // 需求1:查找电影评分个数超过5000,且平均评分较高的前十部电影名称及其对应的平均评分 37 | val bestFilmsByOverallRating = new BestFilmsByOverallRating 38 | bestFilmsByOverallRating.run(movieDF, ratingDF, spark) 39 | 40 | // 需求2:查找每个电影类别及其对应的平均评分 41 | val genresByAverageRating = new GenresByAverageRating 42 | genresByAverageRating.run(movieDF, ratingDF, spark) 43 | 44 | // 需求3:查找被评分次数较多的前十部电影 45 | val mostRatedFilms = new MostRatedFilms 46 | mostRatedFilms.run(movieDF, ratingDF, spark) 47 | 48 | 49 | spark.close() 50 | 51 | } 52 | 53 | /** 54 | * 读取数据文件,转成DataFrame 55 | * 56 | * @param spark 57 | * @param path 58 | * @param schema 59 | * @return 60 | */ 61 | def readCsvIntoDataSet(spark: SparkSession, path: String, schema: StructType) = { 62 | 63 | val dataSet = spark.read 64 | .format("csv") 65 | .option("header", "true") 66 | .schema(schema) 67 | .load(path) 68 | dataSet 69 | 70 | } 71 | 72 | 73 | } 74 | -------------------------------------------------------------------------------- /src/main/scala/com/jmx/demos/Entry.scala: -------------------------------------------------------------------------------- 1 | package com.jmx.demos 2 | 3 | /** 4 | *  @Created with IntelliJ IDEA. 
5 | *  @author : jmx 6 | *  @Date: 2020/11/19 7 | *  @Time: 15:57 8 | *  */ 9 | class Entry { 10 | 11 | } 12 | 13 | case class Movies( 14 | movieId: String, // 电影的id 15 | title: String, // 电影的标题 16 | genres: String // 电影类别 17 | ) 18 | 19 | case class Ratings( 20 | userId: String, // 用户的id 21 | movieId: String, // 电影的id 22 | rating: String, // 用户评分 23 | timestamp: String // 时间戳 24 | ) 25 | 26 | // 需求1MySQL结果表 27 | case class tenGreatestMoviesByAverageRating( 28 | movieId: String, // 电影的id 29 | title: String, // 电影的标题 30 | avgRating: String // 电影平均评分 31 | ) 32 | 33 | // 需求2MySQL结果表 34 | case class topGenresByAverageRating( 35 | genres: String, //电影类别 36 | avgRating: String // 平均评分 37 | ) 38 | 39 | // 需求3MySQL结果表 40 | case class tenMostRatedFilms( 41 | movieId: String, // 电影的id 42 | title: String, // 电影的标题 43 | ratingCnt: String // 电影被评分的次数 44 | ) -------------------------------------------------------------------------------- /src/main/scala/com/jmx/demos/SchemaLoader.scala: -------------------------------------------------------------------------------- 1 | package com.jmx.demos 2 | import org.apache.spark.sql.types.{DataTypes, StructType} 3 | /** 4 | *  @Created with IntelliJ IDEA. 5 | *  @author : jmx 6 | *  @Date: 2020/11/19 7 | *  @Time: 15:17 8 | *  */ 9 | 10 | 11 | class SchemaLoader { 12 | 13 | private val movieSchema = new StructType() 14 | .add("movieId", DataTypes.StringType, false) 15 | .add("title", DataTypes.StringType, false) 16 | .add("genres", DataTypes.StringType, false) 17 | 18 | private val ratingSchema = new StructType() 19 | .add("userId", DataTypes.StringType, false) 20 | .add("movieId", DataTypes.StringType, false) 21 | .add("rating", DataTypes.StringType, false) 22 | .add("timestamp", DataTypes.StringType, false) 23 | 24 | def getMovieSchema: StructType = movieSchema 25 | 26 | def getRatingSchema: StructType = ratingSchema 27 | } -------------------------------------------------------------------------------- /src/main/scala/com/jmx/metrics/BestFilmsByOverallRating.scala: -------------------------------------------------------------------------------- 1 | package com.jmx.metrics 2 | 3 | import com.jmx.demos.tenGreatestMoviesByAverageRating 4 | import com.jmx.util.JDBCUtil 5 | import org.apache.commons.dbutils.QueryRunner 6 | import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} 7 | 8 | 9 | /** 10 | * 需求1:查找电影评分个数超过5000,且平均评分较高的前十部电影名称及其对应的平均评分 11 | */ 12 | class BestFilmsByOverallRating extends Serializable { 13 | 14 | def run(moviesDataset: DataFrame, ratingsDataset: DataFrame, spark: SparkSession) = { 15 | import spark.implicits._ 16 | 17 | // 将moviesDataset注册成表 18 | moviesDataset.createOrReplaceTempView("movies") 19 | // 将ratingsDataset注册成表 20 | ratingsDataset.createOrReplaceTempView("ratings") 21 | 22 | // 查询SQL语句 23 | val ressql1 = 24 | """ 25 | |WITH ratings_filter_cnt AS ( 26 | |SELECT 27 | | movieId, 28 | | count( * ) AS rating_cnt, 29 | | avg( rating ) AS avg_rating 30 | |FROM 31 | | ratings 32 | |GROUP BY 33 | | movieId 34 | |HAVING 35 | | count( * ) >= 5000 36 | |), 37 | |ratings_filter_score AS ( 38 | |SELECT 39 | | movieId, -- 电影id 40 | | avg_rating -- 电影平均评分 41 | |FROM ratings_filter_cnt 42 | |ORDER BY avg_rating DESC -- 平均评分降序排序 43 | |LIMIT 10 -- 平均分较高的前十部电影 44 | |) 45 | |SELECT 46 | | m.movieId, 47 | | m.title, 48 | | r.avg_rating AS avgRating 49 | |FROM 50 | | ratings_filter_score r 51 | |JOIN movies m ON m.movieId = r.movieId 52 | """.stripMargin 53 | 54 | val resultDS = spark.sql(ressql1).as[tenGreatestMoviesByAverageRating] 55 | // 
打印数据 56 | resultDS.show(10) 57 | resultDS.printSchema() 58 | // 写入MySQL 59 | resultDS.foreachPartition(par => par.foreach(insert2Mysql(_))) 60 | } 61 | 62 | /** 63 | * 获取连接,调用写入MySQL数据的方法 64 | * 65 | * @param res 66 | */ 67 | private def insert2Mysql(res: tenGreatestMoviesByAverageRating): Unit = { 68 | lazy val conn = JDBCUtil.getQueryRunner() 69 | conn match { 70 | case Some(connection) => { 71 | upsert(res, connection) 72 | } 73 | case None => { 74 | println("Mysql连接失败") 75 | System.exit(-1) 76 | } 77 | } 78 | } 79 | 80 | /** 81 | * 封装将结果写入MySQL的方法 82 | * 执行写入操作 83 | * 84 | * @param r 85 | * @param conn 86 | */ 87 | private def upsert(r: tenGreatestMoviesByAverageRating, conn: QueryRunner): Unit = { 88 | try { 89 | val sql = 90 | s""" 91 | |REPLACE INTO `ten_movies_averagerating`( 92 | |movieId, 93 | |title, 94 | |avgRating 95 | |) 96 | |VALUES 97 | |(?,?,?) 98 | """.stripMargin 99 | // 执行insert操作 100 | conn.update( 101 | sql, 102 | r.movieId, 103 | r.title, 104 | r.avgRating 105 | ) 106 | } catch { 107 | case e: Exception => { 108 | e.printStackTrace() 109 | System.exit(-1) 110 | } 111 | } 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/main/scala/com/jmx/metrics/GenresByAverageRating.scala: -------------------------------------------------------------------------------- 1 | package com.jmx.metrics 2 | 3 | /** 4 | *  @Created with IntelliJ IDEA. 5 | *  @author : jmx 6 | *  @Date: 2020/11/19 7 | *  @Time: 15:21 8 | *  */ 9 | 10 | import com.jmx.demos.topGenresByAverageRating 11 | import com.jmx.util.JDBCUtil 12 | import org.apache.commons.dbutils.QueryRunner 13 | import org.apache.spark.sql.{DataFrame, SparkSession} 14 | 15 | /** 16 | * 需求2:查找每个电影类别及其对应的平均评分 17 | */ 18 | class GenresByAverageRating extends Serializable { 19 | def run(moviesDataset: DataFrame, ratingsDataset: DataFrame, spark: SparkSession) = { 20 | import spark.implicits._ 21 | // 将moviesDataset注册成表 22 | moviesDataset.createOrReplaceTempView("movies") 23 | // 将ratingsDataset注册成表 24 | ratingsDataset.createOrReplaceTempView("ratings") 25 | 26 | val ressql2 = 27 | """ 28 | |WITH explode_movies AS ( 29 | |SELECT 30 | | movieId, 31 | | title, 32 | | category 33 | |FROM 34 | | movies lateral VIEW explode ( split ( genres, "\\|" ) ) temp AS category 35 | |) 36 | |SELECT 37 | | m.category AS genres, 38 | | avg( r.rating ) AS avgRating 39 | |FROM 40 | | explode_movies m 41 | | JOIN ratings r ON m.movieId = r.movieId 42 | |GROUP BY 43 | | m.category 44 | | """.stripMargin 45 | 46 | val resultDS = spark.sql(ressql2).as[topGenresByAverageRating] 47 | 48 | // 打印数据 49 | resultDS.show(10) 50 | resultDS.printSchema() 51 | // 写入MySQL 52 | resultDS.foreachPartition(par => par.foreach(insert2Mysql(_))) 53 | 54 | 55 | } 56 | 57 | /** 58 | * 获取连接,调用写入MySQL数据的方法 59 | * 60 | * @param res 61 | */ 62 | private def insert2Mysql(res: topGenresByAverageRating): Unit = { 63 | lazy val conn = JDBCUtil.getQueryRunner() 64 | conn match { 65 | case Some(connection) => { 66 | upsert(res, connection) 67 | } 68 | case None => { 69 | println("Mysql连接失败") 70 | System.exit(-1) 71 | } 72 | } 73 | } 74 | 75 | /** 76 | * 封装将结果写入MySQL的方法 77 | * 执行写入操作 78 | * 79 | * @param r 80 | * @param conn 81 | */ 82 | private def upsert(r: topGenresByAverageRating, conn: QueryRunner): Unit = { 83 | try { 84 | val sql = 85 | s""" 86 | |REPLACE INTO `genres_average_rating`( 87 | |genres, 88 | |avgRating 89 | |) 90 | |VALUES 91 | |(?,?) 
92 | """.stripMargin 93 | // 执行insert操作 94 | conn.update( 95 | sql, 96 | r.genres, 97 | r.avgRating 98 | ) 99 | } catch { 100 | case e: Exception => { 101 | e.printStackTrace() 102 | System.exit(-1) 103 | } 104 | } 105 | } 106 | 107 | 108 | } 109 | -------------------------------------------------------------------------------- /src/main/scala/com/jmx/metrics/MostRatedFilms.scala: -------------------------------------------------------------------------------- 1 | package com.jmx.metrics 2 | 3 | import com.jmx.demos.tenMostRatedFilms 4 | import com.jmx.util.JDBCUtil 5 | import org.apache.commons.dbutils.QueryRunner 6 | import org.apache.spark.sql.{DataFrame, SparkSession} 7 | /** 8 | *  @Created with IntelliJ IDEA. 9 | *  @author : jmx 10 | *  @Date: 2020/11/19 11 | *  @Time: 15:23 12 | *  */ 13 | /** 14 | * 需求3:查找被评分次数较多的前十部电影. 15 | */ 16 | class MostRatedFilms extends Serializable { 17 | def run(moviesDataset: DataFrame, ratingsDataset: DataFrame,spark: SparkSession) = { 18 | 19 | import spark.implicits._ 20 | 21 | // 将moviesDataset注册成表 22 | moviesDataset.createOrReplaceTempView("movies") 23 | // 将ratingsDataset注册成表 24 | ratingsDataset.createOrReplaceTempView("ratings") 25 | 26 | val ressql3 = 27 | """ 28 | |WITH rating_group AS ( 29 | | SELECT 30 | | movieId, 31 | | count( * ) AS ratingCnt 32 | | FROM ratings 33 | | GROUP BY movieId 34 | |), 35 | |rating_filter AS ( 36 | | SELECT 37 | | movieId, 38 | | ratingCnt 39 | | FROM rating_group 40 | | ORDER BY ratingCnt DESC 41 | | LIMIT 10 42 | |) 43 | |SELECT 44 | | m.movieId, 45 | | m.title, 46 | | r.ratingCnt 47 | |FROM 48 | | rating_filter r 49 | |JOIN movies m ON r.movieId = m.movieId 50 | | 51 | """.stripMargin 52 | 53 | val resultDS = spark.sql(ressql3).as[tenMostRatedFilms] 54 | // 打印数据 55 | resultDS.show(10) 56 | resultDS.printSchema() 57 | // 写入MySQL 58 | resultDS.foreachPartition(par => par.foreach(insert2Mysql(_))) 59 | 60 | } 61 | 62 | /** 63 | * 获取连接,调用写入MySQL数据的方法 64 | * 65 | * @param res 66 | */ 67 | private def insert2Mysql(res: tenMostRatedFilms): Unit = { 68 | lazy val conn = JDBCUtil.getQueryRunner() 69 | conn match { 70 | case Some(connection) => { 71 | upsert(res, connection) 72 | } 73 | case None => { 74 | println("Mysql连接失败") 75 | System.exit(-1) 76 | } 77 | } 78 | } 79 | 80 | /** 81 | * 封装将结果写入MySQL的方法 82 | * 执行写入操作 83 | * 84 | * @param r 85 | * @param conn 86 | */ 87 | private def upsert(r: tenMostRatedFilms, conn: QueryRunner): Unit = { 88 | try { 89 | val sql = 90 | s""" 91 | |REPLACE INTO `ten_most_rated_films`( 92 | |movieId, 93 | |title, 94 | |ratingCnt 95 | |) 96 | |VALUES 97 | |(?,?,?) 98 | """.stripMargin 99 | // 执行insert操作 100 | conn.update( 101 | sql, 102 | r.movieId, 103 | r.title, 104 | r.ratingCnt 105 | ) 106 | } catch { 107 | case e: Exception => { 108 | e.printStackTrace() 109 | System.exit(-1) 110 | } 111 | } 112 | } 113 | 114 | } 115 | -------------------------------------------------------------------------------- /src/main/scala/com/jmx/util/JDBCUtil.scala: -------------------------------------------------------------------------------- 1 | package com.jmx.util 2 | 3 | import com.jmx.demos.tenGreatestMoviesByAverageRating 4 | import com.mchange.v2.c3p0.ComboPooledDataSource 5 | import org.apache.commons.dbutils.QueryRunner 6 | 7 | /** 8 | *  @Created with IntelliJ IDEA. 
9 | *  @author : jmx 10 | *  @Date: 2020/11/19 11 | *  @Time: 17:54 12 | *  */ 13 | object JDBCUtil { 14 | val dataSource = new ComboPooledDataSource() 15 | 16 | val user = "root" 17 | val password = "920724" 18 | val url = "jdbc:mysql://localhost:3306/mydb" 19 | 20 | dataSource.setUser(user) 21 | dataSource.setPassword(password) 22 | dataSource.setDriverClass("com.mysql.jdbc.Driver") 23 | dataSource.setJdbcUrl(url) 24 | dataSource.setAutoCommitOnClose(false) 25 | 26 | 27 | def getQueryRunner(): Option[QueryRunner]={ 28 | try { 29 | Some(new QueryRunner(dataSource)) 30 | }catch { 31 | case e:Exception => 32 | e.printStackTrace() 33 | None 34 | 35 | } 36 | } 37 | 38 | 39 | 40 | } 41 | -------------------------------------------------------------------------------- /target/classes/com/jmx/demos/DemoMainApp$.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/DemoMainApp$.class -------------------------------------------------------------------------------- /target/classes/com/jmx/demos/DemoMainApp.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/DemoMainApp.class -------------------------------------------------------------------------------- /target/classes/com/jmx/demos/Entry.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/Entry.class -------------------------------------------------------------------------------- /target/classes/com/jmx/demos/Movies$.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/Movies$.class -------------------------------------------------------------------------------- /target/classes/com/jmx/demos/Movies.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/Movies.class -------------------------------------------------------------------------------- /target/classes/com/jmx/demos/Ratings$.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/Ratings$.class -------------------------------------------------------------------------------- /target/classes/com/jmx/demos/Ratings.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/Ratings.class -------------------------------------------------------------------------------- /target/classes/com/jmx/demos/SchemaLoader.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/SchemaLoader.class 
-------------------------------------------------------------------------------- /target/classes/com/jmx/demos/tenGreatestMoviesByAverageRating$.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/tenGreatestMoviesByAverageRating$.class -------------------------------------------------------------------------------- /target/classes/com/jmx/demos/tenGreatestMoviesByAverageRating.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/tenGreatestMoviesByAverageRating.class -------------------------------------------------------------------------------- /target/classes/com/jmx/demos/tenMostRatedFilms$.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/tenMostRatedFilms$.class -------------------------------------------------------------------------------- /target/classes/com/jmx/demos/tenMostRatedFilms.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/tenMostRatedFilms.class -------------------------------------------------------------------------------- /target/classes/com/jmx/demos/topGenresByAverageRating$.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/topGenresByAverageRating$.class -------------------------------------------------------------------------------- /target/classes/com/jmx/demos/topGenresByAverageRating.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/demos/topGenresByAverageRating.class -------------------------------------------------------------------------------- /target/classes/com/jmx/metrics/BestFilmsByOverallRating$$anonfun$run$1$$anonfun$apply$1.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/BestFilmsByOverallRating$$anonfun$run$1$$anonfun$apply$1.class -------------------------------------------------------------------------------- /target/classes/com/jmx/metrics/BestFilmsByOverallRating$$anonfun$run$1.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/BestFilmsByOverallRating$$anonfun$run$1.class -------------------------------------------------------------------------------- /target/classes/com/jmx/metrics/BestFilmsByOverallRating$$typecreator4$1.class: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/BestFilmsByOverallRating$$typecreator4$1.class -------------------------------------------------------------------------------- /target/classes/com/jmx/metrics/BestFilmsByOverallRating.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/BestFilmsByOverallRating.class -------------------------------------------------------------------------------- /target/classes/com/jmx/metrics/GenresByAverageRating$$anonfun$run$1$$anonfun$apply$1.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/GenresByAverageRating$$anonfun$run$1$$anonfun$apply$1.class -------------------------------------------------------------------------------- /target/classes/com/jmx/metrics/GenresByAverageRating$$anonfun$run$1.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/GenresByAverageRating$$anonfun$run$1.class -------------------------------------------------------------------------------- /target/classes/com/jmx/metrics/GenresByAverageRating$$typecreator4$1.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/GenresByAverageRating$$typecreator4$1.class -------------------------------------------------------------------------------- /target/classes/com/jmx/metrics/GenresByAverageRating.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/GenresByAverageRating.class -------------------------------------------------------------------------------- /target/classes/com/jmx/metrics/MostRatedFilms$$anonfun$run$1$$anonfun$apply$1.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/MostRatedFilms$$anonfun$run$1$$anonfun$apply$1.class -------------------------------------------------------------------------------- /target/classes/com/jmx/metrics/MostRatedFilms$$anonfun$run$1.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/MostRatedFilms$$anonfun$run$1.class -------------------------------------------------------------------------------- /target/classes/com/jmx/metrics/MostRatedFilms$$typecreator4$1.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/MostRatedFilms$$typecreator4$1.class 
-------------------------------------------------------------------------------- /target/classes/com/jmx/metrics/MostRatedFilms.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/metrics/MostRatedFilms.class -------------------------------------------------------------------------------- /target/classes/com/jmx/util/JDBCUtil$.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/util/JDBCUtil$.class -------------------------------------------------------------------------------- /target/classes/com/jmx/util/JDBCUtil.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiamx/spark_project_practise/6e33177c302e20ed2004623e988f65b2d3b73a48/target/classes/com/jmx/util/JDBCUtil.class -------------------------------------------------------------------------------- /target/classes/create_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE `ten_movies_averagerating` ( 2 | `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '自增id', 3 | `movieId` int(11) NOT NULL COMMENT '电影id', 4 | `title` varchar(100) NOT NULL COMMENT '电影名称', 5 | `avgRating` decimal(10,2) NOT NULL COMMENT '平均评分', 6 | `update_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '更新时间', 7 | PRIMARY KEY (`id`), 8 | UNIQUE KEY `movie_id_UNIQUE` (`movieId`) 9 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 10 | 11 | 12 | -- ------------------------------------------------------------------------------- 13 | 14 | CREATE TABLE genres_average_rating ( 15 | `id` INT ( 11 ) NOT NULL AUTO_INCREMENT COMMENT '自增id', 16 | `genres` VARCHAR ( 100 ) NOT NULL COMMENT '电影类别', 17 | `avgRating` DECIMAL ( 10, 2 ) NOT NULL COMMENT '电影类别平均评分', 18 | `update_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '更新时间', 19 | PRIMARY KEY ( `id` ), 20 | UNIQUE KEY `genres_UNIQUE` ( `genres` ) 21 | ) ENGINE = INNODB DEFAULT CHARSET = utf8; 22 | 23 | -- ------------------------------------------------------------------------------ 24 | 25 | 26 | CREATE TABLE ten_most_rated_films ( 27 | `id` INT ( 11 ) NOT NULL AUTO_INCREMENT COMMENT '自增id', 28 | `movieId` INT ( 11 ) NOT NULL COMMENT '电影Id', 29 | `title` varchar(100) NOT NULL COMMENT '电影名称', 30 | `ratingCnt` INT(11) NOT NULL COMMENT '电影被评分的次数', 31 | `update_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '更新时间', 32 | PRIMARY KEY ( `id` ), 33 | UNIQUE KEY `movie_id_UNIQUE` ( `movieId` ) 34 | ) ENGINE = INNODB DEFAULT CHARSET = utf8; 35 | 36 | -- ------------------------------------------------------------------------------ -------------------------------------------------------------------------------- /target/classes/log4j2.properties: -------------------------------------------------------------------------------- 1 | 2 | appender.out.type = Console 3 | appender.out.name = out 4 | appender.out.layout.type = PatternLayout 5 | appender.out.layout.pattern = [%30.30t] %-30.30c{1} %-5p %m%n 6 | logger.springframework.name = org.springframework 7 | logger.springframework.level = WARN 8 | rootLogger.level = INFO 9 | rootLogger.appenderRef.out.ref = out 10 | --------------------------------------------------------------------------------