├── .gitignore ├── README.md ├── data ├── China.games.json └── China.reviews_official.json ├── pic ├── 好评数量.png ├── 数据格式.png ├── 数据流.png ├── 游戏动力对比.png ├── 游戏用户关系图.png ├── 游戏用户关系图2.png ├── 游玩时长分析.png ├── 用户游戏推荐.png ├── 用户社群聚合图.png ├── 用户社群聚合图2.png ├── 用户社群聚合图3.png ├── 评论情感分析.png ├── 评论能力对比.png └── 词云图.png ├── ppt ├── GraphX_报告.pdf ├── MLlib_报告.pdf └── Streaming_报告.pdf ├── scripts ├── generate_kaggle_log.py ├── generate_log.py ├── read_mongodb.py ├── steam-recommend.py └── write_log.sh ├── spark-graphx ├── pom.xml └── src │ └── main │ ├── resources │ ├── follows.txt │ ├── log4j.properties │ ├── output │ │ ├── graph.gexf │ │ ├── graphWeapon.gexf │ │ ├── isolate.txt │ │ └── minDegrees.gexf │ ├── steam │ │ ├── hours_3_10W.gexf │ │ ├── hours_5_20W.gexf │ │ ├── hours_6_30W.gexf │ │ ├── hours_7_30W.gexf │ │ ├── steam_3_10W.gexf │ │ ├── steam_5_20W.gexf │ │ ├── steam_6_30W.gexf │ │ └── steam_7_30W.gexf │ └── user.txt │ └── scala │ └── cn │ └── edu │ └── nju │ ├── GraphExample.scala │ ├── GraphExample2.scala │ ├── GraphExample3.scala │ ├── GraphProcess.scala │ ├── GraphProcessTest.scala │ └── MongoDBProcess.scala ├── spark-mllib ├── pom.xml └── src │ └── main │ ├── resources │ ├── game_content.txt │ ├── neg.txt │ ├── pos.txt │ └── recommend_validate │ └── scala │ └── cn │ └── edu │ └── nju │ ├── DataProcessing.scala │ ├── EmotionAnalysis.scala │ └── SteamGameRecommendation.scala ├── spark-streaming ├── pom.xml └── src │ └── main │ ├── java │ └── cn │ │ └── edu │ │ └── nju │ │ ├── ApiReturnUtil.java │ │ ├── DemoMessageController.java │ │ ├── SteamserverdemoApplication.java │ │ ├── Test.java │ │ ├── TimeFieldObject.java │ │ ├── WebSocketConfig.java │ │ ├── WebSocketServer.java │ │ ├── api │ │ ├── ApiReturnObject.java │ │ └── TagReturnObject.java │ │ ├── domain │ │ ├── GameObject.java │ │ └── TagObject.java │ │ ├── encoder │ │ ├── ApiObjectEncoder.java │ │ └── TagObjectEncoder.java │ │ └── utils │ │ ├── DbPool.java │ │ ├── HBaseUtils.java │ │ └── Test.java │ ├── resources │ ├── RollupCSV │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000-4a85a460-3bdb-48ad-bc81-9b68057082a8-c000.csv.crc │ │ └── _SUCCESS │ ├── game.json │ ├── gameAll.json │ ├── gameDetail.json │ ├── log4j.properties │ ├── test.txt │ └── update.sql │ └── scala │ └── cn │ └── edu │ └── nju │ ├── BatchProcess.scala │ ├── HDFSProcess.scala │ ├── KafkaProcess.scala │ ├── MongoDBProcess.scala │ ├── MySQLProcess.scala │ ├── SteamProcess.scala │ ├── dao │ ├── CourseClickCountDAO.scala │ ├── CourseSearchClickCountDAO.scala │ ├── RollUpDAO.scala │ └── TagDAO.scala │ ├── domain │ ├── ClickLog.scala │ ├── CommentLog.scala │ ├── CourseClickCount.scala │ ├── CourseSearchClickCount.scala │ ├── DouBanLog.scala │ ├── GameDetail.scala │ ├── ReviewsChart.scala │ ├── RollUp.scala │ ├── SteamLog.scala │ ├── Tag.scala │ └── UserData.scala │ ├── test │ ├── DateTest.scala │ ├── HDFSProcessTest.scala │ ├── JsonTest.scala │ ├── StatStreaming.scala │ └── TransformTest.scala │ └── utils │ ├── DateUtils.scala │ └── MySQLUtils.scala └── spider ├── spark-graphx ├── steam-reviews-official.py └── steam-reviews.py └── spark-streaming ├── steam-games-multithread-queue.py └── steam-hotN.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, 
see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | 25 | *.iml 26 | *.idea 27 | target 28 | *.csv 29 | model 30 | 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 云计算大作业 2 | 3 | ## 1. 云计算作业介绍 4 | 5 | - **本人只负责逻辑层面的业务处理,所以代码大部分只负责到处理数据后落地,前端展示基本使用ECharts** 6 | - **详细流程参考PPT文件下三次汇报PPT** 7 | - **单机业务流程就可以跑通,集群搭建步骤请参考:[集群搭建](https://github.com/Thpffcj/BigData-Getting-Started/blob/master/%E9%9B%86%E7%BE%A4%E6%90%AD%E5%BB%BA.md)** 8 | - 实践作业分为三个部分:Spark Streaming 计算模拟、Spark GraphX 计算和基于 Spark MLlib 的计算 9 | - Spark Streaming 10 | - 要求针对DStream数据开展的计算中至少使用到5个Transformation操作,可以 11 | 是多个业务问题;必须使用到至少1个全局统计的量;结果展示不少于2类图示。Streaming 程序监听的必须是HDFS文件夹。原始数据存储在MongoDB中,模拟数据流时,从MongoDB 中读取数据,写入HDFS中 12 | - Spark GraphX 13 | - 要求必须使用边和点的RDD构造图;用于业务计算的图中不少于1 万个点和1万条边;对于图的计算使用不少于6个GraphX的API调用,可以是解决多个业务问题; 至少使用1次聚合操作或者关联操作;结果展示不少于2类图示。从MongoDB中读取图数据,结果存回 MongoDB中 14 | - Spark MLlib 15 | - 展示不仅包括实验结果,还需包括数据的相关分析 16 | 17 | *** 18 | 19 | ## 2. spark-streaming:Steam数据爬取和流模拟 20 | 21 | ### 1. 研究问题 22 | 23 | - 截至目前那些游戏最火爆 24 | - 玩家游戏时长的分布 25 | - 哪些类型的游戏最受欢迎 26 | 27 | ### 2. 数据 28 | 29 | - /data/China.games.json 30 | 31 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E6%95%B0%E6%8D%AE%E6%A0%BC%E5%BC%8F.png) 32 | 33 | - /data/steam.csv 34 | 35 | 36 | userId,gameName,behavior,duration,none 37 | 151603712,"The Elder Scrolls V Skyrim",purchase,1.0,0 38 | 151603712,"The Elder Scrolls V Skyrim",play,273.0,0 39 | 151603712,"Fallout 4",purchase,1.0,0 40 | 151603712,"Fallout 4",play,87.0,0 41 | 151603712,"Spore",purchase,1.0,0 42 | 151603712,"Spore",play,14.9,0 43 | 151603712,"Fallout New Vegas",purchase,1.0,0 44 | 151603712,"Fallout New Vegas",play,12.1,0 45 | 151603712,"Left 4 Dead 2",purchase,1.0,0 46 | 151603712,"Left 4 Dead 2",play,8.9,0 47 | 48 | 49 | ### 3. 数据流 50 | 51 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E6%95%B0%E6%8D%AE%E6%B5%81.png) 52 | 53 | ### 4. 展示效果 54 | 55 | - 游戏销量动态排名图 56 | 57 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E5%A5%BD%E8%AF%84%E6%95%B0%E9%87%8F.png) 58 | 59 | - 动态词云图 60 | 61 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E8%AF%8D%E4%BA%91%E5%9B%BE.png) 62 | 63 | - 游玩时长分布图 64 | 65 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E6%B8%B8%E7%8E%A9%E6%97%B6%E9%95%BF%E5%88%86%E6%9E%90.png) 66 | 67 | *** 68 | 69 | ## 3. spark-graphx 70 | 71 | ### 1. 研究问题 72 | 73 | - 游戏的口碑和热度 74 | - 用户社群 75 | - 游戏对市场的占有力和用户粘性 76 | - 游戏间的竞争关系 77 | 78 | **相关指标** 79 | 80 | - 游戏评论 81 | - 玩家评论游戏数 82 | - 游戏所受评论数 83 | - 玩家游戏时长 84 | 85 | ### 2. 
展示效果 86 | 87 | - 游戏用户关系图 88 | 89 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E6%B8%B8%E6%88%8F%E7%94%A8%E6%88%B7%E5%85%B3%E7%B3%BB%E5%9B%BE.png) 90 | 91 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E6%B8%B8%E6%88%8F%E7%94%A8%E6%88%B7%E5%85%B3%E7%B3%BB%E5%9B%BE2.png) 92 | 93 | - 用户社群聚合图 94 | 95 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E7%94%A8%E6%88%B7%E7%A4%BE%E7%BE%A4%E8%81%9A%E5%90%88%E5%9B%BE.png) 96 | 97 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E7%94%A8%E6%88%B7%E7%A4%BE%E7%BE%A4%E8%81%9A%E5%90%88%E5%9B%BE2.png) 98 | 99 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E7%94%A8%E6%88%B7%E7%A4%BE%E7%BE%A4%E8%81%9A%E5%90%88%E5%9B%BE3.png) 100 | 101 | *** 102 | 103 | ## 4. spark-mllib 104 | 105 | ### 1. 研究问题 106 | 107 | - 玩家间有哪些社群? 108 | - 各社群特点? 109 | - 可能感兴趣的游戏? 110 | 111 | ### 2. 用户游戏推荐 112 | 113 | - 我们想利用某个兴趣相投、拥有共同经验群体的喜好来推荐感兴趣的游戏给玩家 114 | - 协同过滤技术旨在补充用户 - 商品关联矩阵中所缺失的部分 115 | - 我们并没有直观的用户对游戏的评分,于是用户的游玩时长代替用户对游戏的评价,为了消除游戏本身游玩时长的影响,我们将每款游戏的游玩时长映射到0 – 10之间代替用户对该游戏的评分 116 | 117 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E7%94%A8%E6%88%B7%E6%B8%B8%E6%88%8F%E6%8E%A8%E8%8D%90.png) 118 | 119 | ### 3. 评论情感分析 120 | 121 | - 用户对游戏的评论通常有一个标签:推荐/不推荐 122 | - 我们想通过对评论的情感分析,判断一条评论是推荐这个游戏还是不推荐 123 | 124 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E8%AF%84%E8%AE%BA%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90.png) 125 | 126 | ### 4. 社群聚类分析 127 | 128 | - 聚类特征 129 | - 玩家拥有游戏数 130 | - 玩家总评论数 131 | - 游玩时长 132 | - 是否推荐游戏 133 | - 被认为评论有用 134 | - 被认为评论欢乐数 135 | - 被回复数 136 | 137 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E6%B8%B8%E6%88%8F%E5%8A%A8%E5%8A%9B%E5%AF%B9%E6%AF%94.png) 138 | 139 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E8%AF%84%E8%AE%BA%E8%83%BD%E5%8A%9B%E5%AF%B9%E6%AF%94.png) 140 | 141 | *** 142 | 143 | ## 5. 
云计算作业介绍 144 | 145 | - 由于本人只负责Spark计算的过程,所以每部分代码可能都不是完整的业务流程,主要记录学习Spark过程 146 | 147 | 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /pic/好评数量.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/好评数量.png -------------------------------------------------------------------------------- /pic/数据格式.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/数据格式.png -------------------------------------------------------------------------------- /pic/数据流.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/数据流.png -------------------------------------------------------------------------------- /pic/游戏动力对比.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/游戏动力对比.png -------------------------------------------------------------------------------- /pic/游戏用户关系图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/游戏用户关系图.png -------------------------------------------------------------------------------- /pic/游戏用户关系图2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/游戏用户关系图2.png -------------------------------------------------------------------------------- /pic/游玩时长分析.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/游玩时长分析.png -------------------------------------------------------------------------------- /pic/用户游戏推荐.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/用户游戏推荐.png -------------------------------------------------------------------------------- /pic/用户社群聚合图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/用户社群聚合图.png -------------------------------------------------------------------------------- /pic/用户社群聚合图2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/用户社群聚合图2.png -------------------------------------------------------------------------------- /pic/用户社群聚合图3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/用户社群聚合图3.png -------------------------------------------------------------------------------- /pic/评论情感分析.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/评论情感分析.png -------------------------------------------------------------------------------- /pic/评论能力对比.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/评论能力对比.png -------------------------------------------------------------------------------- /pic/词云图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/词云图.png -------------------------------------------------------------------------------- /ppt/GraphX_报告.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/ppt/GraphX_报告.pdf -------------------------------------------------------------------------------- /ppt/MLlib_报告.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/ppt/MLlib_报告.pdf -------------------------------------------------------------------------------- /ppt/Streaming_报告.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/ppt/Streaming_报告.pdf -------------------------------------------------------------------------------- /scripts/generate_kaggle_log.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # Created by thpffcj on 2019/10/19. 3 | 4 | import time 5 | import pandas as pd 6 | 7 | pd.set_option('display.max_columns', 40) 8 | pd.set_option('display.width', 1000) 9 | 10 | 11 | def generate_log(count=200000): 12 | 13 | data = pd.read_csv("/Users/thpffcj/Public/file/steam.csv") 14 | f = open("/Users/thpffcj/Public/file/user_data.log", "a") 15 | 16 | flag = 0 17 | position = 0 18 | while count >= 1: 19 | log = data.loc[position:position] 20 | query_log = "{user_id}\t{game_name}\t{behavior}\t{duration}".format( 21 | user_id=log["userId"].values.max(), game_name=log["gameName"].values.max(), 22 | behavior=log["behavior"].values.max(), duration=log["duration"].values.max()) 23 | 24 | f.write(query_log + "\n") 25 | print(query_log) 26 | 27 | if flag % 500 == 0: 28 | time.sleep(5) 29 | 30 | count = count - 1 31 | position = position + 1 32 | 33 | 34 | if __name__ == '__main__': 35 | generate_log() 36 | -------------------------------------------------------------------------------- /scripts/generate_log.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # Created by thpffcj on 2019/10/2. 
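#
# Simulates the data source for the Spark Streaming job described in the README:
# reads game documents from the remote MongoDB collection China.games, formats
# each one into a tab-separated log line, accumulates batches of `count` lines
# in a local test.log file, and then runs write_log.sh to push each batch into
# the HDFS directory the Streaming program is watching.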
3 | 4 | import pymongo 5 | import time 6 | import os 7 | 8 | 9 | # 连接数据库 10 | client = pymongo.MongoClient("***.***.***.***", 27017) 11 | 12 | db = client['steam_db'] 13 | db.authenticate("steam", "steam") 14 | 15 | table = db['China.games'] 16 | 17 | data = table.find().limit(1000) 18 | print("数据加载完成...") 19 | # 65175 20 | print(data.count()) 21 | 22 | 23 | def generate_log(count=200): 24 | print("进入方法...") 25 | flag = 0 26 | steam_log = "" 27 | for game_data in data: 28 | query_log = "{img_src}\t{game_detail}\t{original_price}\t{price}\t{review_summary}\t{date}\t{name}".format( 29 | img_src=game_data["img_src"], 30 | game_detail=str(game_data["game_detail"]), 31 | original_price=game_data["original_price"], 32 | price=game_data["price"], 33 | review_summary=game_data["review_summary"], 34 | date=game_data["date"], 35 | name=game_data["name"]) 36 | 37 | steam_log = steam_log + query_log + "\n" 38 | flag = flag + 1 39 | if flag % 200 == 0: 40 | print("flag:" + str(flag)) 41 | 42 | if flag == count: 43 | print("写日志...") 44 | f = open("/Users/thpffcj/Public/local-repository/Python-Learning/cloud-computing/utils/test.log", "w") 45 | f.write(steam_log) 46 | time.sleep(2) 47 | 48 | # 上传 49 | print("上传日志...") 50 | os.system("./write_log.sh") 51 | 52 | flag = 0 53 | steam_log = "" 54 | f.close() 55 | time.sleep(3) 56 | 57 | print("结束...") 58 | 59 | 60 | def write_log(): 61 | print("进入方法...") 62 | flag = 0 63 | f = open("/Users/thpffcj/Public/local-repository/Python-Learning/cloud-computing/utils/test.log", "a") 64 | for game_data in data: 65 | query_log = "{img_src}\t{game_detail}\t{original_price}\t{price}\t{review_summary}\t{date}\t{name}".format( 66 | img_src=game_data["img_src"], 67 | game_detail=str(game_data["game_detail"]), 68 | original_price=game_data["original_price"], 69 | price=game_data["price"], 70 | review_summary=game_data["review_summary"], 71 | date=game_data["date"], 72 | name=game_data["name"]) 73 | 74 | flag = flag + 1 75 | if flag % 200 == 0: 76 | print("flag:" + str(flag)) 77 | 78 | f.write(query_log + "\n") 79 | 80 | f.close() 81 | print("结束...") 82 | 83 | 84 | def clean(): 85 | f = open("/Users/thpffcj/Public/local-repository/Python-Learning/cloud-computing/utils/test.log", "w") 86 | f.write("") 87 | f.close() 88 | 89 | 90 | if __name__ == '__main__': 91 | generate_log() 92 | -------------------------------------------------------------------------------- /scripts/read_mongodb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # Created by thpffcj on 2019/10/30. 
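#
# Copies up to 300,000 review documents from the remote MongoDB collection
# China.reviews_official into a local collection China.reviews_official_30W,
# which is the input that the GraphX job (GraphProcess.scala) reads from.
# Note: collection.insert() is deprecated in PyMongo 3.x and removed in 4.x;
# collection.insert_many() is the usual replacement.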
3 | 4 | import pymongo 5 | 6 | 7 | # 连接数据库 8 | client1 = pymongo.MongoClient("***.***.***.***", 27017) 9 | 10 | db1 = client1['steam_db'] 11 | db1.authenticate("steam", "steam") 12 | 13 | table = db1['China.reviews_official'] 14 | 15 | data = table.find().limit(300000) 16 | print("数据加载完成...") 17 | # 65175 18 | # for d in data: 19 | # print(d["game"]) 20 | 21 | 22 | # Python写MongoDB 23 | client2 = pymongo.MongoClient("127.0.0.1", 27017) 24 | # 库名inventory 25 | db2 = client2['test'] 26 | # 集合名items 27 | collection = db2['China.reviews_official_30W'] 28 | 29 | # 插入一个文档,item是一个字典{} 30 | collection.insert(data) -------------------------------------------------------------------------------- /scripts/steam-recommend.py: -------------------------------------------------------------------------------- 1 | from pyspark.ml.clustering import KMeans 2 | from pyspark.ml.evaluation import ClusteringEvaluator 3 | from pyspark.sql import SparkSession 4 | from pyspark.ml.linalg import Vectors 5 | import pymongo 6 | 7 | # db.addUser("steam",{roles:[ {role:"root",db:"steam_db"} ]}) 8 | #mongodb连接 9 | client = pymongo.MongoClient('mongodb://steam:steam@***.***.***.***:27017/steam_db') 10 | db = client.steam_db 11 | regions = db.China 12 | test_collection = regions.test_collection 13 | train_collection = regions.train_collection 14 | print(train_collection.find()[0]) 15 | 16 | # kmeans_path = "./kmeans" 17 | model_path = "./kmeans_model" 18 | 19 | def getData(collection): 20 | return map(lambda r: (Vectors.dense([r["author"]["num_games_owned"], 21 | r["author"]["num_reviews"], 22 | r["author"]["playtime_forever"], 23 | # r["review"], 24 | r["voted_up"], 25 | r["votes_up"], 26 | r["votes_funny"], 27 | r["comment_count"]]),), collection.find()) 28 | 29 | spark = SparkSession\ 30 | .builder\ 31 | .appName("KMeansExample")\ 32 | .getOrCreate() 33 | train_data = getData(train_collection) 34 | test_data = getData(test_collection) 35 | 36 | # Loads data. 37 | # dataset = spark.read.format("libsvm").load("sample_kmeans_data.txt") 38 | # data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),), 39 | # (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)] 40 | # data = [(Vectors.dense([0.0, 0.0, 0.0]),), (Vectors.dense([0.1, 0.1, 0.1]),), (Vectors.dense([0.2, 0.2, 0.2]),), 41 | # (Vectors.dense([9.0, 9.0, 9.0]),), (Vectors.dense([9.1, 9.1, 9.1]),), (Vectors.dense([9.2, 9.2, 9.2]),)] 42 | train_dataset = spark.createDataFrame(train_data, ["features"]) 43 | test_dataset = spark.createDataFrame(test_data, ["features"]) 44 | 45 | # Trains a k-means model. 46 | kmeans = KMeans().setK(5).setSeed(1) 47 | # kmeans = KMeans.load(kmeans_path) 48 | model = kmeans.fit(train_dataset) 49 | # model = KMeansModel.load(model_path) 50 | clusterSizes = model.summary.clusterSizes 51 | print(clusterSizes) 52 | 53 | # print("点(-3 -3)所属族:" + model.predict((Vectors.dense([1, 1, 1, 1, 1, 1, 1]),))) 54 | 55 | # Make predictions 56 | predictions = model.transform(test_dataset) 57 | # print(predictions.collect()) 58 | 59 | # Evaluate clustering by computing Silhouette score 60 | evaluator = ClusteringEvaluator() 61 | 62 | silhouette = evaluator.evaluate(predictions) #轮廓系数 silhouette coefficient 0.8758693672037696 63 | print("Silhouette with squared euclidean distance = " + str(silhouette)) 64 | 65 | # Shows the result. 
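# Each cluster center is a vector in the original (unscaled) feature order used
# above: num_games_owned, num_reviews, playtime_forever, voted_up, votes_up,
# votes_funny, comment_count. Below, every center is written to the
# kmeans_centers collection together with the size of its cluster.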
66 | centers = model.clusterCenters() 67 | print("Cluster Centers: ") 68 | for center in centers: 69 | print(center) 70 | 71 | # kmeans.save(kmeans_path) 72 | # model.save(model_path) 73 | 74 | kmeans_centers = regions.kmeans_centers 75 | kmeans_centers.drop() 76 | i = 0 77 | for center in centers: 78 | json = { 79 | "num_games_owned":center[0], 80 | "num_reviews":center[1], 81 | "playtime_forever":center[2], 82 | "voted_up":center[3], 83 | "votes_up":center[4], 84 | "votes_funny":center[5], 85 | "comment_count":center[6], 86 | "num_of_reviews":clusterSizes[i], 87 | } 88 | i+=1 89 | kmeans_centers.insert_one(json) 90 | 91 | spark.stop() -------------------------------------------------------------------------------- /scripts/write_log.sh: -------------------------------------------------------------------------------- 1 | # HDFS命令 2 | HDFS="hadoop fs" 3 | 4 | # Streaming监听的文件目录,要与Streaming程序中保持一致 5 | streaming_dir="/cloud-computing" 6 | 7 | # 清空旧数据 8 | $HDFS -rm "${streaming_dir}"'/tmp/*'>/dev/null 2>&1 9 | $HDFS -rm "${streaming_dir}"'/*'>/dev/null 2>&1 10 | $HDFS -mkdir ${streaming_dir}/tmp 11 | 12 | # 生成日志 13 | 14 | # 加上时间戳,防止重名 15 | templog="access.`date +'%s'`.log" 16 | # 先将日志放到临时目录,再移动到Streaming监听目录,确保原子性 17 | $HDFS -put test.log ${streaming_dir}/tmp/$templog 18 | $HDFS -mv ${streaming_dir}/tmp/$templog ${streaming_dir}/ -------------------------------------------------------------------------------- /spark-graphx/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 4.0.0 6 | 7 | cn.edu.nju 8 | mf1932063 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 2.11.8 13 | 2.2.0 14 | 2.6.0-cdh5.16.2 15 | 1.8 16 | 1.8 17 | 18 | 19 | 20 | 21 | 22 | cloudera 23 | https://repository.cloudera.com/artifactory/cloudera-repos 24 | 25 | 26 | central 27 | aliyun maven 28 | http://maven.aliyun.com/nexus/content/groups/public/ 29 | default 30 | 31 | 32 | 33 | 34 | 35 | org.projectlombok 36 | lombok 37 | 1.16.18 38 | 39 | 40 | 41 | org.scala-lang 42 | scala-library 43 | ${scala.version} 44 | 45 | 46 | 47 | org.mongodb.spark 48 | mongo-spark-connector_2.11 49 | ${spark.version} 50 | 51 | 52 | 53 | org.apache.spark 54 | spark-graphx_2.11 55 | ${spark.version} 56 | 57 | 58 | 59 | org.apache.commons 60 | commons-lang3 61 | 3.5 62 | 63 | 64 | 65 | 66 | org.apache.spark 67 | spark-sql_2.11 68 | ${spark.version} 69 | 70 | 71 | 72 | com.fasterxml.jackson.module 73 | jackson-module-scala_2.11 74 | 2.6.5 75 | 76 | 77 | 78 | com.alibaba 79 | fastjson 80 | 1.2.47 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | net.jpountz.lz4 91 | lz4 92 | 1.3.0 93 | 94 | 95 | 96 | org.codehaus.janino 97 | janino 98 | 3.0.8 99 | 100 | 101 | 102 | mysql 103 | mysql-connector-java 104 | 5.1.38 105 | 106 | 107 | 108 | io.netty 109 | netty-all 110 | 4.0.42.Final 111 | 112 | 113 | 114 | 115 | 116 | 120 | 121 | 122 | org.scala-tools 123 | maven-scala-plugin 124 | 125 | 126 | 127 | compile 128 | testCompile 129 | 130 | 131 | 132 | 133 | ${scala.version} 134 | 135 | -target:jvm-1.8 136 | 137 | 138 | 139 | 140 | org.apache.maven.plugins 141 | maven-eclipse-plugin 142 | 143 | true 144 | 145 | ch.epfl.lamp.sdt.core.scalabuilder 146 | 147 | 148 | ch.epfl.lamp.sdt.core.scalanature 149 | 150 | 151 | org.eclipse.jdt.launching.JRE_CONTAINER 152 | ch.epfl.lamp.sdt.launching.SCALA_CONTAINER 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | org.scala-tools 162 | maven-scala-plugin 163 | 164 | ${scala.version} 165 | 166 | 167 | 168 | 169 | 170 | 
-------------------------------------------------------------------------------- /spark-graphx/src/main/resources/follows.txt: -------------------------------------------------------------------------------- 1 | 1 2 2 | 2 3 3 | 3 5 4 | 4 6 5 | 7 6 6 | 6 7 -------------------------------------------------------------------------------- /spark-graphx/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=INFO, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Set the default spark-shell log level to WARN. When running the spark-shell, the 9 | # log level for this class is used to overwrite the root logger's log level, so that 10 | # the user can have different defaults for the shell and regular Spark apps. 11 | log4j.logger.org.apache.spark.repl.Main=WARN 12 | 13 | # Settings to quiet third party logs that are too verbose 14 | log4j.logger.org.spark_project.jetty=WARN 15 | log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR 16 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 17 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 18 | log4j.logger.org.apache.parquet=ERROR 19 | log4j.logger.parquet=ERROR 20 | 21 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 22 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 23 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR -------------------------------------------------------------------------------- /spark-graphx/src/main/resources/output/graphWeapon.gexf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | -------------------------------------------------------------------------------- /spark-graphx/src/main/resources/output/isolate.txt: -------------------------------------------------------------------------------- 1 | CompactBuffer(15, 103) 2 | CompactBuffer(39, 189, 96, 81, 153, 156, 66, 138, 171, 36, 111, 57, 75, 45, 132, 63, 72, 90, 18, 12, 9, 183, 144, 159, 21, 120, 0, 42, 102, 186, 69, 123, 174, 147, 19, 34, 52, 151, 4, 16, 82, 130, 28, 79, 127, 64, 175, 37, 133, 154, 1, 70, 109, 10, 145, 100, 115, 160, 187, 178, 76, 112, 43, 169, 25, 166, 46, 73, 172, 124, 40, 163, 7, 58, 88, 119, 155, 71, 80, 107, 98, 125, 
65, 170, 14, 50, 35, 110, 161, 104, 146, 188, 17, 173, 20, 167, 122, 41, 47, 77, 95, 59, 128, 182, 62, 113, 86, 176, 26, 68) 3 | CompactBuffer(177, 168, 150, 180, 54, 105, 30, 24, 51, 108, 78, 99, 162, 84, 48, 117, 27, 93, 33, 126, 141, 6, 3, 135, 165, 60, 114, 87, 129, 13, 55, 121, 157, 106, 49, 94, 148, 61, 139, 184, 97, 22, 142, 181, 118, 67, 85, 136, 91, 31, 101, 137, 134, 158, 29, 11, 92, 152, 149, 140, 185, 74, 83, 89, 179, 38, 56, 53, 116, 131, 32, 23, 164, 143, 8, 44, 5, 2) -------------------------------------------------------------------------------- /spark-graphx/src/main/resources/output/minDegrees.gexf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | -------------------------------------------------------------------------------- /spark-graphx/src/main/resources/user.txt: -------------------------------------------------------------------------------- 1 | 1,Thpffcj1 2 | 2,Thpffcj2 3 | 3,Thpffcj3 4 | 4,Thpffcj4 5 | 5,Thpffcj5 6 | 6,Thpffcj6 7 | 7,Thpffcj7 -------------------------------------------------------------------------------- /spark-graphx/src/main/scala/cn/edu/nju/GraphExample.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import org.apache.spark.graphx.{Edge, Graph} 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | 7 | /** 8 | * Created by thpffcj on 2019/10/3. 
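 * A minimal GraphX example: builds a property graph from a vertex RDD and an
 * edge RDD, then counts the "postdoc" vertices and the edges with srcId < dstId.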
9 | */ 10 | object GraphExample { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | val conf = new SparkConf().setAppName("GraphTest").setMaster("local") 15 | val sc = new SparkContext(conf) 16 | 17 | // 构建顶点 返回的这个Long其实是VertexId类型,都是一样的 18 | val users: RDD[(Long, (String, String))] = 19 | sc.parallelize( 20 | Array((3L, ("rxin", "student")), 21 | (7L, ("jgonzal", "postdoc")), 22 | (5L, ("franklin", "prof")), 23 | (2L, ("istoica", "prof")))) 24 | 25 | // 构建边 (边有个独特的类Edge,某种程度讲代表的就是一些关系) 26 | val relationships: RDD[Edge[String]] = 27 | sc.parallelize( 28 | Array(Edge(3L, 7L, "collab"), 29 | Edge(5L, 3L, "advisor"), 30 | Edge(2L, 5L, "colleague"), 31 | Edge(5L, 7L, "pi"))) 32 | 33 | // 顶点和边,这样就构建了我们的图 34 | val graph = Graph(users, relationships) 35 | 36 | // .vertices获取到这个图中所有的顶点 37 | val count = graph.vertices.filter { 38 | case (id, (name, pos)) => { 39 | // 计算我们这个图中有多少个postdoc博士后 40 | pos == "postdoc" 41 | } 42 | }.count() 43 | 44 | // 1 45 | println(count) 46 | 47 | //.edges获取到这个图中所有的边,过滤出 源ID<目标ID 的数量 48 | val count1 = graph.edges.filter(e => e.srcId < e.dstId).count() 49 | 50 | // 3 51 | println(count1) 52 | 53 | sc.stop() 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /spark-graphx/src/main/scala/cn/edu/nju/GraphExample2.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import org.apache.spark.graphx.{Edge, Graph, VertexId, VertexRDD} 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | 7 | /** 8 | * Created by thpffcj on 2019/10/31. 9 | */ 10 | object GraphExample2 { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | // 设置运行环境 15 | val conf = new SparkConf().setAppName("GraphExample2").setMaster("local") 16 | val sc = new SparkContext(conf) 17 | 18 | // 设置顶点和边,注意顶点和边都是用元组定义的Array 19 | // 顶点的数据类型是VD:(String, Int) 20 | val vertexArray = Array( 21 | (1L, ("Alice", 28)), 22 | (2L, ("Bob", 27)), 23 | (3L, ("Charlie", 65)), 24 | (4L, ("David", 42)), 25 | (5L, ("Ed", 55)), 26 | (6L, ("Fran", 50)) 27 | ) 28 | 29 | // 边的数据类型ED:Int 30 | val edgeArray = Array( 31 | Edge(2L, 1L, 7), 32 | Edge(2L, 4L, 2), 33 | Edge(3L, 2L, 4), 34 | Edge(3L, 6L, 3), 35 | Edge(4L, 1L, 1), 36 | Edge(5L, 2L, 2), 37 | Edge(5L, 3L, 8), 38 | Edge(5L, 6L, 3) 39 | ) 40 | 41 | // 构造vertexRDD和edgeRDD 42 | val vertexRDD: RDD[(Long, (String, Int))] = sc.parallelize(vertexArray) 43 | val edgeRDD: RDD[Edge[Int]] = sc.parallelize(edgeArray) 44 | 45 | // 构造图Graph[VD,ED] 46 | val graph: Graph[(String, Int), Int] = Graph(vertexRDD, edgeRDD) 47 | 48 | // 找出图中年龄大于30的顶点 49 | graph.vertices.filter { case (id, (name, age)) => age > 30 }.collect.foreach { 50 | case (id, (name, age)) => println(s"$name is $age") 51 | } 52 | 53 | // 边操作:找出图中属性大于5的边 54 | graph.edges.filter(e => e.attr > 5) 55 | .collect.foreach(e => println(s"${e.srcId} to ${e.dstId} att ${e.attr}")) 56 | 57 | // triplets操作,((srcId, srcAttr), (dstId, dstAttr), attr) 58 | // 列出边属性 >5 的tripltes 59 | for (triplet <- graph.triplets.filter(t => t.attr > 5).collect) { 60 | println(s"${triplet.srcAttr._1} likes ${triplet.dstAttr._1}") 61 | } 62 | 63 | // Degrees操作 64 | // 找出图中最大的出度、入度、度数 65 | def max(a: (VertexId, Int), b: (VertexId, Int)): (VertexId, Int) = { 66 | if (a._2 > b._2) a else b 67 | } 68 | 69 | println("max of outDegrees:" + graph.outDegrees.reduce(max) + " max of inDegrees:" + graph.inDegrees.reduce(max) + " max of Degrees:" + graph.degrees.reduce(max)) 70 | 71 | // 转换操作 72 | // 
顶点的转换操作,顶点age + 10 73 | graph.mapVertices { case (id, (name, age)) => (id, (name, age + 10)) } 74 | .vertices.collect.foreach(v => println(s"${v._2._1} is ${v._2._2}")) 75 | 76 | // 边的转换操作,边的属性*2 77 | graph.mapEdges(e => e.attr * 2) 78 | .edges.collect.foreach(e => println(s"${e.srcId} to ${e.dstId} att ${e.attr}")) 79 | 80 | // 顶点年纪>30的子图 81 | val subGraph = graph.subgraph(vpred = (id, vd) => vd._2 >= 30) 82 | subGraph.vertices.collect.foreach(v => println(s"${v._2._1} is ${v._2._2}")) 83 | 84 | // 子图所有边 85 | subGraph.edges.collect.foreach(e => println(s"${e.srcId} to ${e.dstId} att ${e.attr}")) 86 | 87 | // 连接操作 88 | val inDegrees: VertexRDD[Int] = graph.inDegrees 89 | case class User(name: String, age: Int, inDeg: Int, outDeg: Int) 90 | 91 | // 创建一个新图,顶点VD的数据类型为User,并从graph做类型转换 92 | val initialUserGraph: Graph[User, Int] = graph.mapVertices { 93 | case (id, (name, age)) => User(name, age, 0, 0)} 94 | 95 | // initialUserGraph与inDegrees、outDegrees(RDD)进行连接,并修改initialUserGraph中inDeg值、outDeg值 96 | val userGraph = initialUserGraph.outerJoinVertices(initialUserGraph.inDegrees) { 97 | case (id, u, inDegOpt) => User(u.name, u.age, inDegOpt.getOrElse(0), u.outDeg) 98 | }.outerJoinVertices(initialUserGraph.outDegrees) { 99 | case (id, u, outDegOpt) => User(u.name, u.age, u.inDeg,outDegOpt.getOrElse(0)) 100 | } 101 | 102 | // 连接图的属性 103 | userGraph.vertices.collect.foreach(v => println(s"${v._2.name} inDeg: ${v._2.inDeg} outDeg: ${v._2.outDeg}")) 104 | 105 | // 出度和入读相同的人员 106 | userGraph.vertices.filter { 107 | case (id, u) => u.inDeg == u.outDeg 108 | }.collect.foreach { 109 | case (id, property) => println(property.name) 110 | } 111 | 112 | // 聚合操作 113 | // 找出年纪最大的追求者 114 | // val oldestFollower: VertexRDD[(String, Int)] = userGraph.mapReduceTriplets[(String, Int)]( 115 | // 116 | // // 将源顶点的属性发送给目标顶点,map过程 117 | // edge => Iterator((edge.dstId, (edge.srcAttr.name, edge.srcAttr.age))), 118 | // 119 | // // 得到最大追求者,reduce过程 120 | // (a, b) => if (a._2 > b._2) a else b 121 | // ) 122 | 123 | // userGraph.vertices.leftJoin(oldestFollower) { (id, user, optOldestFollower) => 124 | // optOldestFollower match { 125 | // case None => s"${user.name} does not have any followers." 126 | // case Some((name, age)) => s"${name} is the oldest follower of ${user.name}." 127 | // } 128 | // }.collect.foreach { case (id, str) => println(str)} 129 | 130 | 131 | } 132 | 133 | 134 | } 135 | -------------------------------------------------------------------------------- /spark-graphx/src/main/scala/cn/edu/nju/GraphExample3.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import org.apache.spark.graphx.GraphLoader 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | 6 | /** 7 | * Created by thpffcj on 2019/10/3. 
8 | * 9 | * 图计算官网案例示范 10 | * 主要解决项目中遇到的 把同一个用户识别出来,如果是同一个用户就合并到一起 11 | */ 12 | object GraphExample3 { 13 | 14 | def main(args: Array[String]): Unit = { 15 | 16 | // graphx 基于RDD 17 | val conf = new SparkConf().setMaster("local").setAppName("ConnectedComponentsExample") 18 | val sc = new SparkContext(conf) 19 | 20 | // 构建出来图有多种方式 21 | val graph = GraphLoader.edgeListFile(sc, "src/main/resources/follows.txt") 22 | /** 23 | * 就是把所有的数字作为key,value都写为1 24 | * (4,1) 25 | * (1,1) 26 | * (6,1) 27 | * (3,1) 28 | * (7,1) 29 | * (5,1) 30 | * (2,1) 31 | */ 32 | graph.vertices.foreach(println(_)) 33 | 34 | /** 35 | * .connectedComponents()计算每个顶点的连接组件成员,并返回带有顶点的图形 36 | * 包含该顶点的连通组件中包含最低顶点id的值。 37 | */ 38 | val cc = graph.connectedComponents().vertices 39 | /** 40 | * (4,4) 41 | * (1,1) 42 | * (6,4) 43 | * (3,1) 44 | * (7,4) 45 | * (5,1) 46 | * (2,1) 47 | */ 48 | cc.foreach(println(_)) 49 | 50 | val users = sc.textFile("src/main/resources/user.txt").map(line => { 51 | // 因为要join,所以要变成kv形式 52 | val fields = line.split(",") 53 | (fields(0).toLong, fields(1)) 54 | }) 55 | 56 | // (1,Thpffcj1) join (1,1) 57 | // (1,(Thpffcj1,1)) 代表的是同一个好友的那个id 58 | users.join(cc).map { 59 | case (id, (username, cclastid)) => (cclastid, username) 60 | }.reduceByKey((x: String, y: String) => x + "," + y) 61 | .foreach(tuple => { 62 | /** 63 | * Thpffcj4,Thpffcj6,Thpffcj7 64 | * Thpffcj1,Thpffcj3,Thpffcj5,Thpffcj2 65 | */ 66 | println(tuple._2) 67 | }) 68 | 69 | sc.stop() 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /spark-graphx/src/main/scala/cn/edu/nju/GraphProcess.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import java.io.PrintWriter 4 | import java.util 5 | import java.util.concurrent.ConcurrentHashMap 6 | 7 | import com.alibaba.fastjson.JSON 8 | import com.mongodb.spark.MongoSpark 9 | import org.apache.spark.{SparkConf, SparkContext} 10 | import org.apache.spark.graphx.{Edge, Graph, VertexId, VertexRDD} 11 | import org.apache.spark.rdd.RDD 12 | import org.apache.spark.sql.{DataFrame, SparkSession} 13 | import org.bson.Document 14 | 15 | import scala.util.Random 16 | 17 | /** 18 | * Created by thpffcj on 2019/11/2. 
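 * Builds a bipartite user–game graph from the Steam review documents stored in
 * MongoDB, derives a review-edge graph and a playtime-edge graph, exports both
 * as .gexf files and writes the surviving vertices back to MongoDB. The figures
 * listed below appear to be the sizes of the exported graphs.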
19 | * hours_3_10W 53 160 20 | * steam_3_10W 53 160 21 | * hours_5_20W 98 464 22 | * steam_5_20W 96 452 23 | * hours_7_30W 103 535 24 | * steam_7_30W 103 535 25 | * hours_6_30W 178 1060 26 | * steam_6_30W 175 1039 27 | * 28 | */ 29 | object GraphProcess { 30 | 31 | // 点集,根据用户id或游戏名找点id 32 | // user_76561198380840992 1L 33 | // game_CODE VEIN 2L 34 | val pointMap = new ConcurrentHashMap[String, Long]() 35 | 36 | // 评论边 37 | val edgeMap1 = new ConcurrentHashMap[(Long, Long), String]() 38 | 39 | // 时长边 40 | val edgeMap2 = new ConcurrentHashMap[(Long, Long), String]() 41 | 42 | // 点权重map,根据图中点id得到权重 43 | // 1L 10 44 | val weightMap = new ConcurrentHashMap[Long, Int]() 45 | 46 | def main(args: Array[String]): Unit = { 47 | 48 | val conf = new SparkConf().setMaster("local[4]").setAppName("GraphProcess") 49 | conf.set("spark.mongodb.input.uri", "mongodb://localhost:27017/test.China.reviews_official_30W") 50 | conf.set("spark.mongodb.input.partitioner", "MongoPaginateBySizePartitioner") 51 | conf.set("spark.mongodb.output.uri", "mongodb://localhost:27017/test.steam.graph_vertice") 52 | 53 | val spark = SparkSession.builder().config(conf).getOrCreate() 54 | 55 | val frame: DataFrame = MongoSpark.load(spark) 56 | 57 | var key = 0L 58 | 59 | // 需要collect到一个节点上,key递增 60 | frame.collect().foreach(row => { 61 | 62 | val gameArray = row.getAs("game").toString.split(",") 63 | var game = gameArray(0) 64 | game = game.substring(1, game.length) 65 | // 过滤前端无法识别非法字符,比如表情等 66 | val gameNamePatterns = "[^\\u4e00-\\u9fa5a-zA-Z0-9 ]".r 67 | // 游戏名称 68 | game = gameNamePatterns.replaceAllIn(game, "") 69 | 70 | val jsonAuthor = JSON.parse(row.getAs("author").toString) 71 | val authorArray = jsonAuthor.toString.split(",") 72 | // 用户id 73 | val userId = authorArray(4) 74 | // 游玩时长 75 | val hours = authorArray(5) 76 | 77 | // 评论 78 | var review = row.getAs("review").toString 79 | val reviewPatterns = "[^\\u4e00-\\u9fa5a-zA-Z0-9 ]".r 80 | review = reviewPatterns.replaceAllIn(review, "") 81 | 82 | // 玩家顶点 83 | val playerKey = "user_" + userId 84 | var playerPoint = 0L 85 | if (pointMap.containsKey(playerKey)) { 86 | playerPoint = pointMap.get(playerKey) 87 | // 权重+1 88 | weightMap.put(playerPoint, weightMap.get(playerPoint) + 1) 89 | } else { 90 | // 点id递增 91 | this.synchronized { 92 | key = key + 1 93 | playerPoint = key 94 | } 95 | pointMap.put(playerKey, playerPoint) 96 | // 权重赋予1 97 | weightMap.put(playerPoint, 1) 98 | } 99 | 100 | // 游戏顶点 101 | val gameKey = "game_" + game 102 | var gamePoint = 0L 103 | if (pointMap.containsKey(gameKey)) { 104 | gamePoint = pointMap.get(gameKey) 105 | // 权重+1 106 | weightMap.put(gamePoint, weightMap.get(gamePoint) + 1) 107 | } else { 108 | this.synchronized { 109 | key = key + 1 110 | gamePoint = key 111 | } 112 | pointMap.put(gameKey, gamePoint) 113 | // 权重赋予1 114 | weightMap.put(gamePoint, 1) 115 | } 116 | 117 | edgeMap1.put((playerPoint, gamePoint), review) 118 | edgeMap2.put((playerPoint, gamePoint), hours) 119 | }) 120 | 121 | println("foreach 结束") 122 | 123 | // 点集 124 | var vertexArray = Seq((0L, ("test", "test"))) 125 | // 评论边 126 | var edgeArray1 = Seq(Edge(0L, 0L, "")) 127 | // 时长边 128 | var edgeArray2 = Seq(Edge(0L, 0L, "")) 129 | 130 | // 添加点 131 | val pointSet = pointMap.keySet() 132 | // TODO 遍历迭代map,这个阶段非常耗时,如何改进? 
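    // One possible answer to the TODO above (a sketch only, assuming
    // `import scala.collection.JavaConverters._` is added): build the whole
    // sequence in a single pass instead of appending to an immutable Seq with
    // :+, which copies the sequence on every append and makes this loop O(n^2).
    //
    //   val vertexSeq: Seq[(Long, (String, String))] =
    //     pointMap.asScala.toSeq.map { case (key, id) =>
    //       val Array(types, name) = key.split("_", 2)  // split once, keep "_" inside game names
    //       (id, (types, name))
    //     }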
133 | val point_iter = pointSet.iterator 134 | while (point_iter.hasNext) { 135 | val key = point_iter.next 136 | val name = key.split("_") 137 | vertexArray = vertexArray :+ (pointMap.get(key), (name(0), name(1))) 138 | } 139 | 140 | println("遍历点集结束") 141 | 142 | // 添加评论边 143 | val edgeSet1 = edgeMap1.keySet() 144 | // 遍历迭代map 145 | val edge_iter1 = edgeSet1.iterator 146 | while (edge_iter1.hasNext) { 147 | val key = edge_iter1.next 148 | edgeArray1 = edgeArray1 :+ Edge(key._1, key._2, edgeMap1.get(key)) 149 | } 150 | 151 | println("遍历评论边结束") 152 | 153 | // 添加时长边 154 | val edgeSet2 = edgeMap2.keySet() 155 | // 遍历迭代map 156 | val edge_iter2 = edgeSet2.iterator 157 | while (edge_iter2.hasNext) { 158 | val key = edge_iter2.next 159 | edgeArray2 = edgeArray2 :+ Edge(key._1, key._2, edgeMap2.get(key)) 160 | } 161 | 162 | println("遍历结束") 163 | 164 | // 构造vertexRDD和edgeRDD 165 | val vertexRDD: RDD[(Long, (String, String))] = spark.sparkContext.parallelize(vertexArray) 166 | val edgeRDD1: RDD[Edge[String]] = spark.sparkContext.parallelize(edgeArray1) 167 | val edgeRDD2: RDD[Edge[String]] = spark.sparkContext.parallelize(edgeArray2) 168 | 169 | // 构造图Graph[VD,ED] 170 | var contentGraph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD1) 171 | 172 | println("构造contentGraph结束") 173 | 174 | // 构建子图,过滤评论为空的边 175 | contentGraph = contentGraph.subgraph(epred = e => !e.attr.equals("")) 176 | // 构建子图,过滤游戏权重大于10的 177 | contentGraph = contentGraph.subgraph(vpred = (id, vd) => { 178 | ((vd._1.equals("game") & weightMap.get(id) > 10) | (vd._1.equals("user"))) 179 | }) 180 | 181 | val degreeThreshold = 6 182 | // 度数>degreeThreshold的点集 183 | val contentDegreeArray = contentGraph.degrees.filter(_._2 > degreeThreshold).map(_._1).collect() 184 | 185 | // 保留度数符合规定的点 186 | contentGraph = contentGraph.subgraph(vpred = (id, vd) => { 187 | contentDegreeArray.contains(id) 188 | }) 189 | 190 | // 边的转换操作,去除前端无法识别的字符,如评论表情等 191 | val reviewPatterns = "[^\\u4e00-\\u9fa5a-zA-Z0-9 ]".r 192 | contentGraph.mapEdges(e => e.attr = reviewPatterns.replaceAllIn(e.attr, "")) 193 | 194 | println("处理contentGraph结束") 195 | 196 | // 时长图 197 | var hourGraph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD2) 198 | 199 | println("构造hourGraph结束") 200 | 201 | // TODO 顶点的转换操作,根据用户id寻找用户名称 202 | hourGraph = hourGraph.mapVertices { 203 | case (id, (types, name)) => (types, name) 204 | } 205 | 206 | hourGraph = hourGraph.subgraph(vpred = (id, vd) => { 207 | ((vd._1.equals("game") & weightMap.get(id) > 10) | (vd._1.equals("user"))) 208 | }) 209 | 210 | // 度数>0的点集 211 | val hourDegreeArray = hourGraph.degrees.filter(_._2 > degreeThreshold).map(_._1).collect() 212 | 213 | // 去除孤立的点 214 | hourGraph = hourGraph.subgraph(vpred = (id, vd) => { 215 | hourDegreeArray.contains(id) 216 | }) 217 | 218 | println("处理hourGraph结束") 219 | 220 | // 独立群体检测 221 | hourGraph.connectedComponents 222 | .vertices 223 | .map(_.swap) 224 | .groupByKey() 225 | .map(_._2) 226 | .foreach(println) 227 | 228 | /** 229 | * 将点数据写入MongoDB 230 | * Spark的算子是在executor上执行的,数据也是放在executor上。executor和driver并不在同一个jvm(local[*]除外), 231 | * 所以算子是不能访问在driver上的SparkSession对象 232 | * 如果一定要“在算子里访问SparkSession”,那你只能把数据collect回Driver,然后用Scala 集合的算子去做。这种情况下只能适 233 | * 用于数据量不大(多大取决于你分配给Driver的内存) 234 | */ 235 | hourGraph.vertices.collect.foreach(v => { 236 | 237 | val id = v._1.toString 238 | val name = v._2.toString 239 | 240 | writeVerticesToMongodb(spark, id, name) 241 | }) 242 | 243 | 244 | // 输出到文件 245 | val outputPath = "src/main/resources/" 246 | val pw1 = new 
PrintWriter(outputPath + "steam/hours_6_30W.gexf") 247 | pw1.write(hoursToGexf(hourGraph)) 248 | pw1.close() 249 | 250 | val pw2 = new PrintWriter(outputPath + "steam/steam_6_30W.gexf") 251 | pw2.write(gameToGexf(contentGraph)) 252 | pw2.close() 253 | 254 | spark.close() 255 | } 256 | 257 | /** 258 | * 点数据写入MongoDB 259 | */ 260 | def writeVerticesToMongodb(spark: SparkSession, id: String, name: String) = { 261 | 262 | val document = new Document() 263 | document.append("verticeId", id).append("name", name) 264 | 265 | val seq = Seq(document) 266 | val df = spark.sparkContext.parallelize(seq) 267 | 268 | // 将数据写入mongo 269 | MongoSpark.save(df) 270 | } 271 | 272 | /** 273 | * 边据写入MongoDB 274 | */ 275 | def writeEdgesToMongodb(spark: SparkSession, srcId: String, dstId: String, attr: String) = { 276 | 277 | val document = new Document() 278 | document.append("srcId", srcId).append("dstId", dstId).append("attr", attr) 279 | 280 | val seq = Seq(document) 281 | val df = spark.sparkContext.parallelize(seq) 282 | 283 | // 将数据写入mongo 284 | MongoSpark.save(df) 285 | } 286 | 287 | /** 288 | * 用户-游戏图,相比底下的图需要指定x,y的坐标 289 | * 290 | * @param graph 291 | * @tparam VD 292 | * @tparam ED 293 | * @return 294 | */ 295 | def gameToGexf[VD, ED](graph: Graph[VD, ED]) = { 296 | 297 | "\n" + 298 | "\n" + 299 | "\n" + 300 | "\n" + 301 | "\n" + 302 | "\n" + 303 | "\n " + 304 | graph.vertices.map(v => { 305 | // 根据类别填充颜色和attvalue 306 | val types = v._2.toString.split(",")(0).replace("(", "") 307 | val name = v._2.toString.split(",")(1).replace(")", "") 308 | var color = "" 309 | var attvalue = 0 310 | if (types.equals("user")) { 311 | color = "r=\"236\" g=\"81\" b=\"72\"" 312 | attvalue = 1 313 | } else { 314 | color = "r=\"236\" g=\"181\" b=\"72\"" 315 | attvalue = 0 316 | } 317 | "\n" + 318 | "\n" + 319 | "\n" + 320 | "\n" + 321 | "\n" + 322 | // (x, y) 坐标 323 | "\n" + 324 | "\n" + 325 | "\n" 326 | }).collect().mkString + 327 | "\n " + 328 | "\n" + 329 | graph.edges.map(e => { 330 | "\n" 331 | }).collect().mkString + 332 | "\n" + 333 | "\n" + 334 | "" 335 | } 336 | 337 | /** 338 | * 时间输出为指定gexf格式 339 | * 340 | * @param graph :图 341 | * @tparam VD 342 | * @tparam ED 343 | * @return 344 | */ 345 | def hoursToGexf[VD, ED](graph: Graph[VD, ED]) = { 346 | 347 | "\n" + 348 | "\n" + 349 | "\n" + 350 | "\n" + 351 | "\n" + 352 | "\n" + 353 | "\n " + 354 | graph.vertices.map(v => { 355 | // 根据类别填充颜色和attvalue 356 | val types = v._2.toString.split(",")(0).replace("(", "") 357 | val name = v._2.toString.split(",")(1).replace(")", "") 358 | var color = "" 359 | var attvalue = 0 360 | if (types.equals("user")) { 361 | color = "r=\"236\" g=\"81\" b=\"72\"" 362 | attvalue = 1 363 | } else { 364 | color = "r=\"236\" g=\"181\" b=\"72\"" 365 | attvalue = 0 366 | } 367 | "\n" + 368 | "\n" + 369 | "\n" + 370 | "\n" + 371 | "\n" + 372 | "\n" + 373 | "\n" 374 | }).collect().mkString + 375 | "\n " + 376 | "\n" + 377 | graph.edges.map(e => { 378 | "\n" 379 | }).collect().mkString + 380 | "\n" + 381 | "\n" + 382 | "" 383 | } 384 | } 385 | -------------------------------------------------------------------------------- /spark-graphx/src/main/scala/cn/edu/nju/GraphProcessTest.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import java.io.PrintWriter 4 | import java.util 5 | 6 | import com.mongodb.spark.MongoSpark 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.graphx.{Edge, Graph} 9 | import org.apache.spark.rdd.RDD 10 | import 
org.apache.spark.sql.{DataFrame, SparkSession} 11 | import org.bson.Document 12 | 13 | import scala.util.Random 14 | 15 | /** 16 | * Created by thpffcj on 2019/11/2. 17 | */ 18 | object GraphProcessTest { 19 | 20 | val pointMap = new util.HashMap[String, Long]() 21 | // 评论边 22 | val edgeMap1 = new util.HashMap[(Long, Long), String]() 23 | // 时长边 24 | val edgeMap2 = new util.HashMap[(Long, Long), String]() 25 | // 点权重map,根据id得到权重 26 | val weightMap = new util.HashMap[Long, Int]() 27 | // 点权重map,根据id得到权重 28 | val topGameSet = new util.HashSet[Long]() 29 | 30 | def main(args: Array[String]): Unit = { 31 | 32 | val conf = new SparkConf().setMaster("local[4]").setAppName("GraphProcess") 33 | conf.set("spark.mongodb.input.uri", "mongodb://localhost:27017/test.China.reviews") 34 | conf.set("spark.mongodb.input.partitioner", "MongoPaginateBySizePartitioner") 35 | 36 | val spark = SparkSession.builder().config(conf).getOrCreate() 37 | 38 | val frame: DataFrame = MongoSpark.load(spark) 39 | 40 | var key = 0L 41 | 42 | frame.foreach(row => { 43 | 44 | /** 45 | * 用户信息 46 | * 过滤非法输入符号 47 | */ 48 | val jsonPlayer = row.getAs("user").toString.split(",") 49 | var player = "" 50 | if (jsonPlayer.length > 2) { 51 | player = jsonPlayer(jsonPlayer.length - 1) 52 | player = player.substring(0, player.length - 1) 53 | } else if (jsonPlayer(0).contains("帐户内")) { 54 | player = jsonPlayer(1) 55 | player = player.substring(0, player.length - 1) 56 | } else { 57 | player = jsonPlayer(0) 58 | player = player.substring(1, player.length) 59 | } 60 | 61 | // 过滤前端无法识别非法字符,比如表情等 62 | val namePatterns1 = "[`~!@#$%^&*()+=|{}':;',\\[\\]<>/?~!@#� \uE009\uF8F5¥%……& amp;*()——+|{}【】‘;:”“’。,、?]".r 63 | val namePatterns2 = "[^\\u4e00-\\u9fa5a-zA-Z0-9]".r 64 | player = namePatterns1.replaceAllIn(player, "") 65 | player = namePatterns2.replaceAllIn(player, "") 66 | if (player.length == 0) { 67 | player = "anonymous" 68 | } 69 | 70 | // 过滤用户名过滤后为空的数据 71 | if (!player.equals("anonymous")) { 72 | // 游戏信息 73 | val jsonGame = row.getAs("game").toString.split(",") 74 | var game = jsonGame(0).substring(1) 75 | game = namePatterns1.replaceAllIn(game, "") 76 | 77 | // 评论 78 | var content = row.getAs("content").toString.replace("
", "") 79 | val contentPatterns = "[^\\u4e00-\\u9fa5a-zA-Z0-9 ]".r 80 | content = contentPatterns.replaceAllIn(content, "") 81 | 82 | // 游玩时长 83 | val patterns = "[\\u4e00-\\u9fa5]".r // 匹配汉字 84 | val hours = patterns.replaceAllIn(row.getAs("hours").toString, "") 85 | 86 | // 玩家顶点 87 | val playerKey = "user_" + player 88 | var playerPoint = 0L 89 | if (pointMap.containsKey(playerKey)) { 90 | playerPoint = pointMap.get(playerKey) 91 | // 权重+1 92 | weightMap.put(playerPoint, weightMap.get(playerPoint) + 1) 93 | } else { 94 | key = key + 1 95 | playerPoint = key 96 | pointMap.put(playerKey, playerPoint) 97 | // 权重赋予1 98 | weightMap.put(playerPoint, 1) 99 | } 100 | 101 | // 游戏顶点 102 | val gameKey = "game_" + game 103 | var gamePoint = 0L 104 | if (pointMap.containsKey(gameKey)) { 105 | gamePoint = pointMap.get(gameKey) 106 | // 权重+1 107 | weightMap.put(gamePoint, weightMap.get(gamePoint) + 1) 108 | } else { 109 | key = key + 1 110 | gamePoint = key 111 | pointMap.put(gameKey, gamePoint) 112 | // 权重赋予1 113 | weightMap.put(gamePoint, 1) 114 | } 115 | 116 | edgeMap1.put((playerPoint, gamePoint), content) 117 | edgeMap2.put((playerPoint, gamePoint), hours) 118 | } 119 | 120 | // KurokaneSS CODE VEIN 带妹子也就图一乐,打架还得靠云哥 121 | // println(player + " " + game + " " + content) 122 | }) 123 | 124 | // 点集 125 | var vertexArray = Seq((0L, ("test", "test"))) 126 | // 评论边 127 | var edgeArray1 = Seq(Edge(0L, 0L, "")) 128 | // 时长边 129 | var edgeArray2 = Seq(Edge(0L, 0L, "")) 130 | 131 | // 添加点 132 | val pointSet = pointMap.keySet() 133 | // 遍历迭代map 134 | val point_iter = pointSet.iterator 135 | while (point_iter.hasNext) { 136 | val key = point_iter.next 137 | // println(key) 138 | vertexArray = vertexArray :+ (pointMap.get(key), (key.split("_")(0), key.split("_")(1))) 139 | } 140 | 141 | // 添加边 142 | val edgeSet1 = edgeMap1.keySet() 143 | // 遍历迭代map 144 | val edge_iter1 = edgeSet1.iterator 145 | while (edge_iter1.hasNext) { 146 | val key = edge_iter1.next 147 | edgeArray1 = edgeArray1 :+ Edge(key._1, key._2, edgeMap1.get(key)) 148 | } 149 | 150 | // 添加边 151 | val edgeSet2 = edgeMap2.keySet() 152 | // 遍历迭代map 153 | val edge_iter2 = edgeSet2.iterator 154 | while (edge_iter2.hasNext) { 155 | val key = edge_iter2.next 156 | edgeArray2 = edgeArray2 :+ Edge(key._1, key._2, edgeMap2.get(key)) 157 | } 158 | 159 | // 构造vertexRDD和edgeRDD 160 | val vertexRDD: RDD[(Long, (String, String))] = spark.sparkContext.parallelize(vertexArray) 161 | val edgeRDD1: RDD[Edge[String]] = spark.sparkContext.parallelize(edgeArray1) 162 | val edgeRDD2: RDD[Edge[String]] = spark.sparkContext.parallelize(edgeArray2) 163 | 164 | // 构造图Graph[VD,ED] 165 | var contentGraph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD1) 166 | // 构建子图,过滤评论为空的边 167 | contentGraph = contentGraph.subgraph(epred = e => !e.attr.equals("")) 168 | // 构建子图,过滤游戏权重大于15的 169 | contentGraph = contentGraph.subgraph(vpred = (id, vd) => { 170 | ((vd._1.equals("game") & weightMap.get(id) > 15) | (vd._1.equals("user"))) 171 | }) 172 | 173 | contentGraph.vertices.foreach(v => { 174 | if (v._2._1.equals("game")) { 175 | topGameSet.add(v._1) 176 | } 177 | }) 178 | 179 | 180 | 181 | // 经过过滤后有些顶点是没有边,所以采用leftOuterJoin将这部分顶点去除 182 | // val vertices = contentGraph.vertices.leftOuterJoin(vertex).map(x => (x._1, x._2._2.getOrElse(""))) 183 | // val newGraph: Graph[(String, String), String] = Graph(vertices, edge) 184 | 185 | 186 | val hourGraph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD2) 187 | 188 | contentGraph.vertices.foreach(println(_)) 189 | // 
println(hourGraph.toString) 190 | 191 | // 输出到文件 192 | val outputPath = "src/main/resources/" 193 | // val pw1 = new PrintWriter(outputPath + "hours.xml") 194 | // pw1.write(hoursToGexf(hourGraph)) 195 | // pw1.close() 196 | 197 | val pw2 = new PrintWriter(outputPath + "steam.gexf") 198 | pw2.write(gameToGexf(contentGraph)) 199 | pw2.close() 200 | 201 | spark.close() 202 | } 203 | 204 | /** 205 | * 数据写入MongoDB 206 | */ 207 | def writeToMongodb() = { 208 | 209 | val spark = SparkSession.builder() 210 | .master("local") 211 | .appName("MongoDBProcess") 212 | .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/test.China.graph") 213 | .getOrCreate() 214 | 215 | // 设置log级别 216 | spark.sparkContext.setLogLevel("WARN") 217 | 218 | val document1 = new Document() 219 | document1.append("name", "sunshangxiang").append("age", 18).append("sex", "female") 220 | 221 | val seq = Seq(document1) 222 | val df = spark.sparkContext.parallelize(seq) 223 | 224 | // 将数据写入mongo 225 | MongoSpark.save(df) 226 | 227 | spark.stop() 228 | } 229 | 230 | /** 231 | * 用户-游戏图,相比底下的图需要指定x,y的坐标 232 | * @param graph 233 | * @tparam VD 234 | * @tparam ED 235 | * @return 236 | */ 237 | def gameToGexf[VD, ED](graph: Graph[VD, ED]) = { 238 | 239 | "\n" + 240 | "\n" + 241 | "\n" + 242 | "\n" + 243 | "\n" + 244 | "\n" + 245 | "\n " + 246 | graph.vertices.map(v => { 247 | // 根据类别填充颜色和attvalue 248 | val types = v._2.toString.split(",")(0).replace("(", "") 249 | val name = v._2.toString.split(",")(1).replace(")", "") 250 | var color = "" 251 | var attvalue = 0 252 | if (types.equals("user")) { 253 | color = "r=\"236\" g=\"81\" b=\"72\"" 254 | attvalue = 1 255 | } else { 256 | color = "r=\"236\" g=\"181\" b=\"72\"" 257 | attvalue = 0 258 | } 259 | "\n" + 260 | "\n" + 261 | "\n" + 262 | "\n" + 263 | "\n" + 264 | // (x, y) 坐标 265 | "\n" + 266 | "\n" + 267 | "\n" 268 | }).collect().mkString + 269 | "\n " + 270 | "\n" + 271 | graph.edges.map(e => { 272 | "\n" 273 | }).collect().mkString + 274 | "\n" + 275 | "\n" + 276 | "" 277 | } 278 | 279 | /** 280 | * 输出为指定gexf格式 281 | * 282 | * @param graph :图 283 | * @tparam VD 284 | * @tparam ED 285 | * @return 286 | */ 287 | def hoursToGexf[VD, ED](graph: Graph[VD, ED]) = { 288 | 289 | "\n" + 290 | "\n" + 291 | "\n" + 292 | "\n" + 293 | "\n" + 294 | "\n" + 295 | "\n " + 296 | graph.vertices.map(v => { 297 | // 根据类别填充颜色和attvalue 298 | val types = v._2.toString.split(",")(0).replace("(", "") 299 | val name = v._2.toString.split(",")(1).replace(")", "") 300 | var color = "" 301 | var attvalue = 0 302 | if (types.equals("user")) { 303 | color = "r=\"236\" g=\"81\" b=\"72\"" 304 | attvalue = 1 305 | } else { 306 | color = "r=\"236\" g=\"181\" b=\"72\"" 307 | attvalue = 0 308 | } 309 | "\n" + 310 | "\n" + 311 | "\n" + 312 | "\n" + 313 | "\n" + 314 | "\n" + 315 | "\n" 316 | }).collect().mkString + 317 | "\n " + 318 | "\n" + 319 | graph.edges.map(e => { 320 | "\n" 321 | }).collect().mkString + 322 | "\n" + 323 | "\n" + 324 | "" 325 | } 326 | } 327 | -------------------------------------------------------------------------------- /spark-graphx/src/main/scala/cn/edu/nju/MongoDBProcess.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import com.mongodb.spark.config.{ReadConfig, WriteConfig} 4 | import com.mongodb.spark.sql._ 5 | import org.apache.spark.sql.SparkSession 6 | 7 | /** 8 | * Created by thpffcj on 2019/10/31. 
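 * Reads the df collection from two MongoDB instances (URIs taken from the
 * command-line arguments) and writes each DataFrame back to its source
 * database in overwrite mode.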
9 | */ 10 | object MongoDBProcess { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | val MongoUri1 = args(0).toString 15 | val MongoUri2 = args(1).toString 16 | val SparkMasterUri = args(2).toString 17 | 18 | def makeMongoURI(uri: String, database: String, collection: String) = (s"${uri}/${database}.${collection}") 19 | 20 | val mongoURI1 = s"mongodb://${MongoUri1}:27017" 21 | val mongoURI2 = s"mongodb://${MongoUri2}:27017" 22 | 23 | val CONFdb1 = makeMongoURI(s"${mongoURI1}", "MyColletion1", "df") 24 | val CONFdb2 = makeMongoURI(s"${mongoURI2}", "MyColletion2", "df") 25 | 26 | val WRITEdb1: WriteConfig = WriteConfig(scala.collection.immutable.Map("uri" -> CONFdb1)) 27 | val READdb1: ReadConfig = ReadConfig(Map("uri" -> CONFdb1)) 28 | 29 | val WRITEdb2: WriteConfig = WriteConfig(scala.collection.immutable.Map("uri" -> CONFdb2)) 30 | val READdb2: ReadConfig = ReadConfig(Map("uri" -> CONFdb2)) 31 | 32 | val spark = SparkSession 33 | .builder 34 | .appName("AppMongo") 35 | .config("spark.worker.cleanup.enabled", "true") 36 | .config("spark.scheduler.mode", "FAIR") 37 | .getOrCreate() 38 | 39 | val df1 = spark.read.mongo(READdb1) 40 | val df2 = spark.read.mongo(READdb2) 41 | df1.write.mode("overwrite").mongo(WRITEdb1) 42 | df2.write.mode("overwrite").mongo(WRITEdb2) 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /spark-mllib/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | cn.edu.nju 7 | mf1932063 8 | 1.0-SNAPSHOT 9 | 10 | 11 | 2.11.8 12 | 2.4.0 13 | 1.8 14 | 1.8 15 | 16 | 17 | 18 | 19 | 20 | cloudera 21 | https://repository.cloudera.com/artifactory/cloudera-repos 22 | 23 | 24 | 25 | 26 | 27 | 28 | org.scala-lang 29 | scala-library 30 | ${scala.version} 31 | 32 | 33 | 34 | org.apache.spark 35 | spark-mllib_2.11 36 | ${spark.version} 37 | 38 | 39 | 40 | com.fasterxml.jackson.module 41 | jackson-module-scala_2.11 42 | 2.6.5 43 | 44 | 45 | 46 | io.netty 47 | netty-all 48 | 4.1.42.Final 49 | 50 | 51 | 52 | 53 | 54 | 58 | 59 | 60 | org.scala-tools 61 | maven-scala-plugin 62 | 63 | 64 | 65 | compile 66 | testCompile 67 | 68 | 69 | 70 | 71 | ${scala.version} 72 | 73 | -target:jvm-1.8 74 | 75 | 76 | 77 | 78 | org.apache.maven.plugins 79 | maven-eclipse-plugin 80 | 81 | true 82 | 83 | ch.epfl.lamp.sdt.core.scalabuilder 84 | 85 | 86 | ch.epfl.lamp.sdt.core.scalanature 87 | 88 | 89 | org.eclipse.jdt.launching.JRE_CONTAINER 90 | ch.epfl.lamp.sdt.launching.SCALA_CONTAINER 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | org.scala-tools 100 | maven-scala-plugin 101 | 102 | ${scala.version} 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /spark-mllib/src/main/resources/game_content.txt: -------------------------------------------------------------------------------- 1 | Best hunting game ever created. 2 | The best game in Assassin's creed franchise in my opinion. The story is so good! 3 | very shooty much nice 4 | It's a good game. However Horde mode slightly lets me down. But the shooting is still very nice ! 5 | Really fun game with plenty to do about, it would be even better if there would be more modding for the game. 6 | It is quite a good game. Add mods, and it becomes better. 7 | best game made me want to break things but in a good way 8 | Play this with your dates and get your knob wet! Or play this with your friends and get yelled at 9 | Best Tom Clancy game I've ever played. 
I never get tired of playing this game. 10 | Classic style and very challenging, but tons of fun. I can't even beat the entire game. I would still highly recommend this game to anyone. 11 | Not worth it 12 | Bad data handling policies at Paradox prevent me from recommending any of their games. CK2 in particular is an especially egregious example. The original purchase agreement was altered from opt out data collection to forced data collection with no compensation given. Changing the base game to free did not help at all since those who bought it already paid for the privilege of having their data stolen all the time and local data sometimes deleted despite settings preventing updates. 13 | I really wanted to like this game. Unfortunately, I found it frenetic, and the opening levels didn't give the sense of achievement that helps you pursue success at more difficult levels. Having to go around counters which are in your way right from the beginning, is very frustrating. No chef in their right mind would set up such a crappy working layout, and it drove me batty. 14 | Don't play this game. 15 | I only recommend for hardcore AS fans just for story purposes not like the story is very good to begin with. Although it is set in one of my favourite historic time periods, The French Revolution, it doesn't make me feel very invested because the story just wasn't interesting to me. Game does and sill has bad bugs even after it's disastrous release back when it first released. 16 | I downloaded the Trial version a few months back as it was free. Prior to downloading, I checked that my computer and GPU was adequate enough to run it (an 8gb i7-4770 with a 2gb GTX 750ti was above it's minimum specs) and spent an hour downloading it. Upon playing it however, it was choppy and slow throughout and regardless of whatever low-medium range settings I used to try to remedy these issues (I certainly wasn't expecting high settings at 1080p given my setup anyway) it was still unplayable and was uninstalled around 7 minutes later. So unless the performance issues are patched, I'd best steer clear until these are rectified. 17 | I really wanted to like this game. Unfortunately, I found it frenetic, and the opening levels didn't give the sense of achievement that helps you pursue success at more difficult levels. Having to go around counters which are in your way right from the beginning, is very frustrating. No chef in their right mind would set up such a crappy working layout, and it drove me batty. :( 18 | Terrible boring game, saving the game doesn't work, you always spawn at a checkpoint. Same enemies over and over again. Your basically stuck in a cave all the damn time. Weak story. 19 | horrible ai dont lots of bugs still eq dosent have enough power frame rate suck also waste yur money like i did like on 1/2 steam games i have 20 | The mechanics make no sense. 
Strategy like this should be turn-based or else its just a race of who clicks faster 21 | -------------------------------------------------------------------------------- /spark-mllib/src/main/resources/recommend_validate: -------------------------------------------------------------------------------- 1 | userId,gameId,gameName,rating,random 2 | 1,22,Dota 2,8.0,0.1 3 | 1,40,Counter-Strike Global Offensive,3.0,0.2 4 | 1,5,Left 4 Dead 2,5.0,0.3 5 | 1,10,Team Fortress 2,6.0,0.4 6 | 1,29,Sid Meier's Civilization V,7.0,0.5 7 | 1,8,Poly Bridge,9.0,0.6 8 | 1,875,Assassin's Creed IV,9.0,0.7 9 | 1,412,Cities Skylines,8.0,0.8 10 | 1,2,Fallout 4,9.0,0.9 11 | 1,6,HuniePop,3.0,1.0 -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/cn/edu/nju/DataProcessing.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import java.util 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.sql.{SaveMode, SparkSession} 7 | 8 | import scala.util.Random 9 | 10 | /** 11 | * Created by thpffcj on 2019/11/18. 12 | */ 13 | object DataProcessing { 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | getStreamRating() 18 | } 19 | 20 | def getStreamRating() = { 21 | 22 | val gameMap = new util.HashMap[String, Int]() 23 | // 游戏出现次数 24 | val gameNumber = new util.HashMap[String, Int]() 25 | val maxTimeMap = new util.HashMap[String, Double] 26 | 27 | val conf = new SparkConf().setMaster("local").setAppName("DataProcessing") 28 | val spark = SparkSession.builder().config(conf).getOrCreate() 29 | 30 | var data = spark.read.format("csv") 31 | .option("header", "true") 32 | .option("inferSchema", "true") 33 | .load("src/main/resources/steam.csv") 34 | .select("userId", "gameName", "behavior", "duration", "gameId") 35 | 36 | data = data.filter(row => row.getAs("behavior").equals("play")) 37 | 38 | var key = 1 39 | data.collect().foreach(row => { 40 | 41 | val gameName = row.getAs("gameName").toString 42 | val duration = row.getAs("duration").toString.toDouble 43 | 44 | if (!gameMap.containsKey(gameName)) { 45 | gameMap.put(gameName, key) 46 | key = key + 1 47 | } 48 | 49 | if (gameNumber.containsKey(gameName)) { 50 | gameNumber.put(gameName, gameNumber.get(gameName) + 1) 51 | } else { 52 | gameNumber.put(gameName, 1) 53 | } 54 | 55 | if (maxTimeMap.containsKey(gameName)) { 56 | if (duration > maxTimeMap.get(gameName)) { 57 | maxTimeMap.put(gameName, duration) 58 | } 59 | } else { 60 | maxTimeMap.put(gameName, duration) 61 | } 62 | 63 | }) 64 | 65 | import spark.implicits._ 66 | val rand = new Random() 67 | val cleanData = data.filter(row => { 68 | gameNumber.get(row.getAs("gameName").toString) > 2 69 | }).map(row => { 70 | 71 | val userId = row.getAs("userId").toString 72 | val gameName = row.getAs("gameName").toString 73 | var duration = (row.getAs("duration").toString.toDouble / maxTimeMap.get(gameName) * 10).formatted("%.2f") 74 | if (duration.toDouble < 1.0) { 75 | duration = "1.0" 76 | } 77 | val gameId = gameMap.get(gameName) 78 | val random = rand.nextDouble() 79 | 80 | (userId, gameId, gameName, duration, random) 81 | }) 82 | 83 | cleanData.repartition(1).write.format("com.databricks.spark.csv") 84 | .option("header", "false") 85 | .option("delimiter", ",") 86 | .mode(SaveMode.Overwrite) 87 | .save("src/main/resources/steam_rating.csv") 88 | 89 | spark.stop() 90 | } 91 | } 92 | -------------------------------------------------------------------------------- 
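getStreamRating above turns raw play time into a 1.0–10.0 rating by scaling each record against the longest session recorded for that game and flooring the result at 1.0. A minimal standalone sketch of that normalization, kept separate from the project code; the object and method names are illustrative only:

object RatingUtil {

  // Scale a play duration to a 1.0–10.0 rating against the game's maximum observed duration,
  // rounded to two decimals, mirroring the formatted("%.2f") / floor-at-1.0 logic above.
  def toRating(duration: Double, maxDuration: Double): Double = {
    val scaled = duration / maxDuration * 10
    val rounded = math.round(scaled * 100) / 100.0
    math.max(rounded, 1.0)
  }
}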
/spark-mllib/src/main/scala/cn/edu/nju/EmotionAnalysis.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.ml.classification.{NaiveBayes, NaiveBayesModel} 5 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 6 | import org.apache.spark.ml.feature.{HashingTF, IDF, IDFModel} 7 | import org.apache.spark.sql.SparkSession 8 | 9 | import scala.util.Random 10 | 11 | /** 12 | * Created by thpffcj on 2019/11/20. 13 | */ 14 | 15 | object EmotionAnalysis { 16 | 17 | def main(args: Array[String]): Unit = { 18 | test() 19 | } 20 | 21 | /** 22 | * (31806,32974,64780) 23 | * accuracy is 0.6932404540763674 24 | */ 25 | def train() = { 26 | 27 | val conf = new SparkConf().setMaster("local").setAppName("EmotionAnalysis") 28 | val spark = SparkSession.builder().config(conf).getOrCreate() 29 | // 日志级别 30 | spark.sparkContext.setLogLevel("WARN") 31 | 32 | val rand = new Random() 33 | 34 | import spark.implicits._ 35 | // 数据预处理 36 | val neg = spark.read.textFile("src/main/resources/neg.txt").map(line => { 37 | // 分词 38 | (line.split(" ").filter(!_.equals(" ")), 0, rand.nextDouble()) 39 | }).toDF("words", "value", "random") 40 | 41 | val pos = spark.read.textFile("src/main/resources/pos.txt").map(line => { 42 | (line.split(" ").filter(!_.equals(" ")), 1, rand.nextDouble()) 43 | }).toDF("words", "value", "random") // 思考:这里把inner function提出重用来如何操作 44 | 45 | // 合并乱序 46 | val data = neg.union(pos).sort("random") 47 | println(neg.count(), pos.count(), data.count()) // 合并 48 | 49 | // 文本特征抽取(TF-IDF) 50 | val hashingTf = new HashingTF() 51 | .setInputCol("words") 52 | .setOutputCol("hashing") 53 | .transform(data) 54 | 55 | val idfModel = new IDF() 56 | .setInputCol("hashing") 57 | .setOutputCol("tfidf") 58 | .fit(hashingTf) 59 | 60 | val transformedData = idfModel.transform(hashingTf) 61 | val Array(training, test) = transformedData 62 | .randomSplit(Array(0.7, 0.3)) 63 | 64 | // 根据抽取到的文本特征,使用分类器进行分类,这是一个二分类问题 65 | // 分类器是可替换的 66 | val bayes = new NaiveBayes() 67 | .setFeaturesCol("tfidf") // X 68 | .setLabelCol("value") // y 0:消极,1:积极 69 | .fit(training) 70 | 71 | // 交叉验证 72 | val result = bayes.transform(test) 73 | // result.show(false) 74 | 75 | // 评估模型的准确率 76 | val evaluator = new MulticlassClassificationEvaluator() 77 | .setLabelCol("value") 78 | .setPredictionCol("prediction") 79 | .setMetricName("accuracy") 80 | 81 | val accuracy = evaluator.evaluate(result) 82 | println(s"""accuracy is $accuracy""") 83 | 84 | // idfModel.save("src/main/resources/model/IDFModel.model") 85 | // bayes.save("src/main/resources/model/content_emotion.model") 86 | 87 | // 重构思考: 88 | // 尝试用pipeline重构代码 89 | // 尝试用模型预测随便属于一句话的情感,例如: 90 | // You are a bad girl,I hate you. 
^_^ 91 | 92 | spark.stop() 93 | } 94 | 95 | def test() = { 96 | 97 | val conf = new SparkConf().setMaster("local").setAppName("EmotionAnalysis") 98 | val spark = SparkSession.builder().config(conf).getOrCreate() 99 | 100 | import spark.implicits._ 101 | val content = spark.read.textFile("src/main/resources/game_content.txt").map(line => { 102 | (line.split(" ").filter(!_.equals(" "))) 103 | }).toDF("words") 104 | 105 | // 文本特征抽取(TF-IDF) 106 | val hashingTf = new HashingTF() 107 | .setInputCol("words") 108 | .setOutputCol("hashing") 109 | .transform(content) 110 | 111 | val idfModel = IDFModel.load("src/main/resources/model/IDFModel.model") 112 | 113 | val transformedData = idfModel.transform(hashingTf) 114 | 115 | val bayes = NaiveBayesModel.load("src/main/resources/model/content_emotion.model") 116 | 117 | val result = bayes.transform(transformedData) 118 | result.show() 119 | 120 | spark.stop() 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/cn/edu/nju/SteamGameRecommendation.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.ml.evaluation.RegressionEvaluator 5 | import org.apache.spark.ml.recommendation.{ALS, ALSModel} 6 | import org.apache.spark.sql.SparkSession 7 | 8 | /** 9 | * Created by thpffcj on 2019/11/16. 10 | */ 11 | object SteamGameRecommendation { 12 | 13 | def main(args: Array[String]): Unit = { 14 | 15 | test() 16 | } 17 | 18 | def train() = { 19 | 20 | val conf = new SparkConf().setMaster("local[4]").setAppName("SteamGameRecommendation") 21 | val spark = SparkSession.builder().config(conf).getOrCreate() 22 | spark.sparkContext.setLogLevel("WARN") 23 | 24 | val data = spark.read.format("csv") 25 | .option("header", "true") 26 | .option("inferSchema", "true") 27 | .load("src/main/resources/steam_rating.csv") 28 | .select("userId", "gameId", "gameName", "rating", "random") 29 | .sort("random") 30 | .select("userId", "gameId", "rating") 31 | 32 | val Array(train, test) = data.randomSplit(Array(0.7, 0.3)) 33 | 34 | val als = new ALS() 35 | .setMaxIter(20) 36 | .setUserCol("userId") 37 | .setItemCol("gameId") 38 | .setRatingCol("rating") 39 | // 正则化参数 40 | .setRegParam(0.01) 41 | 42 | val model = als.fit(train) 43 | 44 | // 冷启动策略 45 | model.setColdStartStrategy("drop") 46 | 47 | val predictions = model.transform(test) 48 | // 根据(userId, gameId)预测rating 49 | predictions.show(false) 50 | 51 | // 模型评估 52 | val evaluator = new RegressionEvaluator() 53 | .setMetricName("rmse") 54 | .setLabelCol("rating") 55 | .setPredictionCol("prediction") 56 | 57 | val rmse = evaluator.evaluate(predictions) 58 | println(s"Root-mean-square error is $rmse \n") 59 | 60 | // Spark机器学习模型的持久化 61 | // 模型保存 62 | model.save("src/main/resources/model/game_recommendation.model") 63 | 64 | spark.stop() 65 | } 66 | 67 | def test() = { 68 | 69 | val conf = new SparkConf().setMaster("local").setAppName("SteamGameRecommendation") 70 | val spark = SparkSession.builder().config(conf).getOrCreate() 71 | spark.sparkContext.setLogLevel("WARN") 72 | 73 | // 模型加载 74 | val model = ALSModel.load("src/main/resources/model/game_recommendation.model") 75 | 76 | import spark.implicits._ 77 | val users = spark.createDataset(Array(1)).toDF("userId") 78 | users.show(false) 79 | 80 | model.recommendForUserSubset(users, 20).show(false) 81 | 82 | spark.stop() 83 | } 84 | } 85 | 
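EmotionAnalysis.scala above ends with a note suggesting the HashingTF / IDF / NaiveBayes steps be rebuilt with a spark.ml Pipeline. A sketch of that refactor, assuming the training DataFrame keeps the same words and value columns as in train(); the function name and save path are illustrative, not the project's actual code:

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.feature.{HashingTF, IDF}
import org.apache.spark.sql.DataFrame

// data: columns "words" (tokenized review) and "value" (0 = negative, 1 = positive)
def trainWithPipeline(data: DataFrame) = {

  val hashingTf = new HashingTF().setInputCol("words").setOutputCol("hashing")
  val idf = new IDF().setInputCol("hashing").setOutputCol("tfidf")
  val bayes = new NaiveBayes().setFeaturesCol("tfidf").setLabelCol("value")

  // A single Pipeline replaces the three hand-wired stages
  val pipeline = new Pipeline().setStages(Array(hashingTf, idf, bayes))

  val Array(training, test) = data.randomSplit(Array(0.7, 0.3))
  val model = pipeline.fit(training)   // fits IDF and NaiveBayes in one call
  val result = model.transform(test)   // adds the prediction column used by the evaluator

  // model.write.overwrite().save("src/main/resources/model/emotion_pipeline.model")
  (model, result)
}

Loading it back with PipelineModel.load would then replace the separate IDFModel.load / NaiveBayesModel.load calls in test().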
-------------------------------------------------------------------------------- /spark-streaming/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 4.0.0 6 | cn.edu.nju 7 | spark-streaming 8 | 1.0 9 | 10 | org.springframework.boot 11 | spring-boot-starter-parent 12 | 2.2.0.RELEASE 13 | 14 | 15 | 16 | 2.11.8 17 | 2.2.0 18 | 2.4.0 19 | 2.6.0-cdh5.16.2 20 | 1.2.0-cdh5.16.2 21 | 1.8 22 | 1.8 23 | 24 | 25 | 26 | 27 | 28 | cloudera 29 | https://repository.cloudera.com/artifactory/cloudera-repos 30 | 31 | 32 | central 33 | aliyun maven 34 | http://maven.aliyun.com/nexus/content/groups/public/ 35 | default 36 | 37 | 38 | 39 | 40 | 41 | org.springframework.boot 42 | spring-boot-starter-web 43 | 44 | 45 | ch.qos.logback 46 | logback-classic 47 | 48 | 49 | 50 | 51 | org.springframework.boot 52 | spring-boot-starter-websocket 53 | 54 | 55 | org.springframework.session 56 | spring-session-core 57 | 58 | 59 | cn.hutool 60 | hutool-log 61 | 4.1.1 62 | 63 | 64 | 65 | org.springframework.boot 66 | spring-boot-starter-test 67 | test 68 | 69 | 70 | org.junit.vintage 71 | junit-vintage-engine 72 | 73 | 74 | 75 | 76 | 77 | org.projectlombok 78 | lombok 79 | 1.16.18 80 | 81 | 82 | 83 | org.scala-lang 84 | scala-library 85 | ${scala.version} 86 | 87 | 88 | 89 | org.mongodb.spark 90 | mongo-spark-connector_2.11 91 | ${spark.version} 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | org.apache.hadoop 104 | hadoop-client 105 | ${hadoop.version} 106 | 107 | 108 | 109 | 110 | org.apache.hbase 111 | hbase-client 112 | ${hbase.version} 113 | 114 | 115 | 116 | org.apache.hbase 117 | hbase-server 118 | ${hbase.version} 119 | 120 | 121 | 122 | 123 | org.apache.spark 124 | spark-streaming_2.11 125 | ${spark.version} 126 | 127 | 128 | 129 | org.apache.spark 130 | spark-graphx_2.11 131 | ${spark.version} 132 | 133 | 134 | 135 | org.apache.spark 136 | spark-streaming-kafka-0-10_2.11 137 | ${spark.version} 138 | 139 | 140 | 141 | org.apache.commons 142 | commons-lang3 143 | 3.5 144 | 145 | 146 | 147 | 148 | org.apache.spark 149 | spark-sql_2.11 150 | ${spark.version} 151 | 152 | 153 | 154 | com.fasterxml.jackson.module 155 | jackson-module-scala_2.11 156 | 2.6.5 157 | 158 | 159 | 160 | org.elasticsearch 161 | elasticsearch-spark-20_2.11 162 | 6.5.4 163 | 164 | 165 | 166 | com.alibaba 167 | fastjson 168 | 1.2.47 169 | 170 | 171 | 172 | com.fasterxml.jackson.core 173 | jackson-databind 174 | 2.9.10.1 175 | 176 | 177 | 178 | net.jpountz.lz4 179 | lz4 180 | 1.3.0 181 | 182 | 183 | 184 | org.codehaus.janino 185 | janino 186 | 3.0.8 187 | 188 | 189 | 190 | mysql 191 | mysql-connector-java 192 | 5.1.38 193 | 194 | 195 | 196 | io.netty 197 | netty-all 198 | 4.1.42.Final 199 | 200 | 201 | 202 | com.mchange 203 | c3p0 204 | 0.9.5.2 205 | 206 | 207 | 208 | 209 | 210 | 214 | 215 | 216 | org.scala-tools 217 | maven-scala-plugin 218 | 219 | 220 | 221 | compile 222 | testCompile 223 | 224 | 225 | 226 | 227 | ${scala.version} 228 | 229 | -target:jvm-1.8 230 | 231 | 232 | 233 | 234 | org.apache.maven.plugins 235 | maven-eclipse-plugin 236 | 237 | true 238 | 239 | ch.epfl.lamp.sdt.core.scalabuilder 240 | 241 | 242 | ch.epfl.lamp.sdt.core.scalanature 243 | 244 | 245 | org.eclipse.jdt.launching.JRE_CONTAINER 246 | ch.epfl.lamp.sdt.launching.SCALA_CONTAINER 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | org.scala-tools 256 | maven-scala-plugin 257 | 258 | ${scala.version} 259 | 260 | 261 | 262 | 263 | 
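The Spring Boot classes that follow expose the push side of the dashboard: DemoMessageController maps /websocket/socket/push/{cid} and hands the message parameter to WebSocketServer.sendInfo. A minimal sketch of how an external job could trigger such a push over HTTP; the host, port and sid are assumptions for illustration, not values taken from the project:

import java.net.{HttpURLConnection, URL, URLEncoder}
import scala.io.Source

// Hypothetical caller: pushes a message to the sockets registered under sid "dashboard"
// on a server assumed to run at localhost:8080.
def pushToWebSocket(message: String): String = {
  val encoded = URLEncoder.encode(message, "UTF-8")
  val url = new URL(s"http://localhost:8080/websocket/socket/push/dashboard?message=$encoded")
  val conn = url.openConnection().asInstanceOf[HttpURLConnection]
  conn.setRequestMethod("GET")
  val body = Source.fromInputStream(conn.getInputStream).mkString   // JSON built by ApiReturnUtil
  conn.disconnect()
  body
}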
-------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/ApiReturnUtil.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import cn.edu.nju.api.ApiReturnObject; 4 | import cn.hutool.log.Log; 5 | import cn.hutool.log.LogFactory; 6 | 7 | public class ApiReturnUtil { 8 | 9 | static Log log = LogFactory.get(WebSocketServer.class); 10 | 11 | public static ApiReturnObject error(String s) { 12 | log.error(s); 13 | return new ApiReturnObject(null); 14 | } 15 | 16 | public static ApiReturnObject success(String cid) { 17 | log.info("success:" + cid); 18 | return new ApiReturnObject(null); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/DemoMessageController.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import cn.edu.nju.api.ApiReturnObject; 4 | import org.springframework.stereotype.Controller; 5 | import org.springframework.web.bind.annotation.GetMapping; 6 | import org.springframework.web.bind.annotation.PathVariable; 7 | import org.springframework.web.bind.annotation.RequestMapping; 8 | import org.springframework.web.bind.annotation.ResponseBody; 9 | 10 | import java.io.IOException; 11 | 12 | @Controller 13 | @RequestMapping("/websocket") 14 | public class DemoMessageController { 15 | 16 | //页面请求 17 | @GetMapping("/") 18 | public String index() { 19 | return "index"; 20 | } 21 | //推送数据接口 22 | @ResponseBody 23 | @RequestMapping("/socket/push/{cid}") 24 | public ApiReturnObject pushToWeb(@PathVariable String cid, String message) { 25 | try { 26 | WebSocketServer.sendInfo(message,cid); 27 | } catch (IOException e) { 28 | e.printStackTrace(); 29 | return ApiReturnUtil.error(cid+"#"+e.getMessage()); 30 | } 31 | return ApiReturnUtil.success(cid); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/SteamserverdemoApplication.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import org.springframework.boot.SpringApplication; 4 | import org.springframework.boot.autoconfigure.SpringBootApplication; 5 | 6 | @SpringBootApplication 7 | public class SteamserverdemoApplication { 8 | 9 | public static void main(String[] args) { 10 | SpringApplication.run(SteamserverdemoApplication.class, args); 11 | } 12 | 13 | } 14 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/Test.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import cn.edu.nju.api.ApiReturnObject; 4 | import cn.edu.nju.api.TagReturnObject; 5 | import cn.edu.nju.utils.DateUtils; 6 | 7 | /** 8 | * Created by thpffcj on 2019/10/24. 
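 * Manual check of MySQLProcess: fetches the time-series data (getTimeFieldData) and the tag data (getTagData)
 * that WebSocketServer pushes to the front end.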
9 | */ 10 | public class Test { 11 | 12 | public static void main(String[] args) { 13 | 14 | MySQLProcess mySQLProcess = new MySQLProcess(); 15 | 16 | ApiReturnObject apiReturnObject = mySQLProcess.getTimeFieldData(DateUtils.getSteamDates()); 17 | 18 | TagReturnObject tagReturnObject = mySQLProcess.getTagData(2); 19 | 20 | System.out.println("hello"); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/TimeFieldObject.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import cn.edu.nju.domain.GameObject; 4 | import org.springframework.beans.factory.annotation.Autowired; 5 | 6 | import java.io.Serializable; 7 | import java.util.ArrayList; 8 | 9 | public class TimeFieldObject implements Serializable{ 10 | 11 | @Autowired 12 | private String name; 13 | 14 | @Autowired 15 | private ArrayList values; 16 | 17 | public TimeFieldObject(String name, ArrayList values) { 18 | this.name = name; 19 | this.values = values; 20 | } 21 | 22 | public String getName() { 23 | return name; 24 | } 25 | 26 | public void setName(String name) { 27 | this.name = name; 28 | } 29 | 30 | public ArrayList getValues() { 31 | return values; 32 | } 33 | 34 | public void setValues(ArrayList values) { 35 | this.values = values; 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/WebSocketConfig.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import org.springframework.context.annotation.Bean; 4 | import org.springframework.context.annotation.Configuration; 5 | import org.springframework.web.socket.server.standard.ServerEndpointExporter; 6 | 7 | @Configuration 8 | public class WebSocketConfig { 9 | 10 | @Bean 11 | public ServerEndpointExporter serverEndpointExporter(){ 12 | return new ServerEndpointExporter(); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/WebSocketServer.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.concurrent.CopyOnWriteArraySet; 6 | 7 | import javax.websocket.*; 8 | import javax.websocket.server.PathParam; 9 | import javax.websocket.server.ServerEndpoint; 10 | 11 | import cn.edu.nju.api.ApiReturnObject; 12 | import cn.edu.nju.api.TagReturnObject; 13 | import cn.edu.nju.domain.GameObject; 14 | import cn.edu.nju.encoder.ApiObjectEncoder; 15 | import cn.edu.nju.utils.DateUtils; 16 | import org.springframework.stereotype.Component; 17 | import cn.hutool.log.Log; 18 | import cn.hutool.log.LogFactory; 19 | 20 | @ServerEndpoint(value = "/websocket/{sid}", encoders = {ApiObjectEncoder.class}) 21 | @Component 22 | public class WebSocketServer { 23 | 24 | static Log log = LogFactory.get(WebSocketServer.class); 25 | 26 | // 静态变量,用来记录当前在线连接数。应该把它设计成线程安全的。 27 | private static int onlineCount = 0; 28 | 29 | // concurrent包的线程安全Set,用来存放每个客户端对应的MyWebSocket对象。 30 | private static CopyOnWriteArraySet webSocketSet = new CopyOnWriteArraySet(); 31 | 32 | //与某个客户端的连接会话,需要通过它来给客户端发送数据 33 | private Session session; 34 | 35 | //接收sid 36 | private String sid = ""; 37 | 38 | /** 39 | * 连接建立成功调用的方法 40 | */ 41 | @OnOpen 42 | public void onOpen(Session session, 
@PathParam("sid") String sid) { 43 | this.session = session; 44 | webSocketSet.add(this); //加入set中 45 | addOnlineCount(); //在线数加1 46 | log.info("有新窗口开始监听:" + sid + ",当前在线人数为" + getOnlineCount()); 47 | this.sid = sid; 48 | // GameObject gameObject1 = new GameObject("edge", "just so so", 2200, "blue"); 49 | // GameObject gameObject2 = new GameObject("fire fox", "good", 900, "green"); 50 | // GameObject gameObject3 = new GameObject("chrome", "excellent", 3800, "red"); 51 | // GameObject gameObject4 = new GameObject("edge", "just so so", 1500, "blue"); 52 | // GameObject gameObject5 = new GameObject("fire fox", "good", 1900, "green"); 53 | // GameObject gameObject6 = new GameObject("chrome", "excellent", 2800, "red"); 54 | // GameObject gameObject7 = new GameObject("edge", "just so so", 2600, "blue"); 55 | // GameObject gameObject8 = new GameObject("fire fox", "good", 2200, "green"); 56 | // GameObject gameObject9 = new GameObject("chrome", "excellent", 1800, "red"); 57 | // ArrayList gameObjects1 = new ArrayList<>(); 58 | // ArrayList gameObjects2 = new ArrayList<>(); 59 | // ArrayList gameObjects3 = new ArrayList<>(); 60 | // gameObjects1.add(gameObject1); 61 | // gameObjects1.add(gameObject2); 62 | // gameObjects1.add(gameObject3); 63 | // gameObjects2.add(gameObject4); 64 | // gameObjects2.add(gameObject5); 65 | // gameObjects2.add(gameObject6); 66 | // gameObjects3.add(gameObject7); 67 | // gameObjects3.add(gameObject8); 68 | // gameObjects3.add(gameObject9); 69 | // TimeFieldObject timeFieldObject1 = new TimeFieldObject("2017", gameObjects1); 70 | // TimeFieldObject timeFieldObject2 = new TimeFieldObject("2018", gameObjects2); 71 | // TimeFieldObject timeFieldObject3 = new TimeFieldObject("2019", gameObjects3); 72 | // ArrayList timeFieldObjects = new ArrayList<>(); 73 | // timeFieldObjects.add(timeFieldObject1); 74 | // timeFieldObjects.add(timeFieldObject2); 75 | // timeFieldObjects.add(timeFieldObject3); 76 | MySQLProcess mySQLProcess = new MySQLProcess(); 77 | ApiReturnObject apiReturnObject = mySQLProcess.getTimeFieldData(DateUtils.getSteamDates()); 78 | try { 79 | sendData(apiReturnObject); 80 | for (int i = 1; i <= 7; i++) { 81 | TagReturnObject tagReturnObject = mySQLProcess.getTagData(i); 82 | sendTagData(tagReturnObject); 83 | Thread.sleep(5000); 84 | } 85 | } catch (IOException | EncodeException | InterruptedException e) { 86 | log.error("websocket IO异常"+e.getMessage()); 87 | } 88 | } 89 | 90 | /** 91 | * 连接关闭调用的方法 92 | */ 93 | @OnClose 94 | public void onClose() { 95 | webSocketSet.remove(this); //从set中删除 96 | subOnlineCount(); //在线数减1 97 | log.info("有一连接关闭!当前在线人数为" + getOnlineCount()); 98 | } 99 | 100 | /** 101 | * 收到客户端消息后调用的方法 102 | * 103 | * @param message 客户端发送过来的消息 104 | */ 105 | @OnMessage 106 | public void onMessage(String message, Session session) { 107 | log.info("收到来自窗口" + sid + "的信息:" + message); 108 | //群发消息 109 | for (WebSocketServer item : webSocketSet) { 110 | try { 111 | item.sendMessage(message); 112 | } catch (IOException e) { 113 | e.printStackTrace(); 114 | } 115 | } 116 | } 117 | 118 | /** 119 | * @param session 120 | * @param error 121 | */ 122 | @OnError 123 | public void onError(Session session, Throwable error) { 124 | log.error("发生错误"); 125 | error.printStackTrace(); 126 | } 127 | 128 | /** 129 | * 实现服务器主动推送 130 | */ 131 | public void sendMessage(String message) throws IOException { 132 | this.session.getBasicRemote().sendText(message); 133 | } 134 | 135 | /** 136 | * 实现服务器主动推送 137 | */ 138 | public void sendData(ApiReturnObject data) throws 
IOException, EncodeException { 139 | this.session.getBasicRemote().sendObject(data); 140 | } 141 | 142 | public void sendTagData(TagReturnObject data) throws IOException, EncodeException { 143 | this.session.getBasicRemote().sendObject(data); 144 | } 145 | 146 | /** 147 | * 群发自定义消息 148 | */ 149 | public static void sendInfo(String message, @PathParam("sid") String sid) throws IOException { 150 | log.info("推送消息到窗口" + sid + ",推送内容:" + message); 151 | for (WebSocketServer item : webSocketSet) { 152 | try { 153 | //这里可以设定只推送给这个sid的,为null则全部推送 154 | if (sid == null) { 155 | item.sendMessage(message); 156 | } else if (item.sid.equals(sid)) { 157 | item.sendMessage(message); 158 | } 159 | } catch (IOException e) { 160 | continue; 161 | } 162 | } 163 | } 164 | 165 | public static synchronized int getOnlineCount() { 166 | return onlineCount; 167 | } 168 | 169 | public static synchronized void addOnlineCount() { 170 | WebSocketServer.onlineCount++; 171 | } 172 | 173 | public static synchronized void subOnlineCount() { 174 | WebSocketServer.onlineCount--; 175 | } 176 | } 177 | 178 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/api/ApiReturnObject.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.api; 2 | 3 | import cn.edu.nju.TimeFieldObject; 4 | 5 | import java.io.Serializable; 6 | import java.util.ArrayList; 7 | 8 | public class ApiReturnObject implements Serializable { 9 | 10 | private ArrayList timeFieldObjects; 11 | 12 | public ApiReturnObject(ArrayList timeFieldObjects) { 13 | this.timeFieldObjects = timeFieldObjects; 14 | } 15 | 16 | public ArrayList getTimeFieldObjects() { 17 | return timeFieldObjects; 18 | } 19 | 20 | public void setTimeFieldObjects(ArrayList timeFieldObjects) { 21 | this.timeFieldObjects = timeFieldObjects; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/api/TagReturnObject.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.api; 2 | 3 | import cn.edu.nju.domain.TagObject; 4 | import lombok.AllArgsConstructor; 5 | import lombok.Data; 6 | import lombok.NoArgsConstructor; 7 | 8 | import java.io.Serializable; 9 | import java.util.ArrayList; 10 | 11 | /** 12 | * Created by thpffcj on 2019/10/25. 
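 * WebSocket payload wrapping the list of TagObject entries that WebSocketServer.sendTagData pushes to the client.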
13 | */ 14 | public class TagReturnObject implements Serializable { 15 | 16 | private ArrayList tagObjects; 17 | 18 | public TagReturnObject(ArrayList tagObjects) { 19 | this.tagObjects = tagObjects; 20 | } 21 | 22 | public ArrayList getTagObjects() { 23 | return tagObjects; 24 | } 25 | 26 | public void setTagObjects(ArrayList tagObjects) { 27 | this.tagObjects = tagObjects; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/domain/GameObject.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain; 2 | 3 | import org.springframework.beans.factory.annotation.Autowired; 4 | 5 | import java.io.Serializable; 6 | 7 | public class GameObject implements Serializable { 8 | 9 | @Autowired 10 | private String id; 11 | 12 | @Autowired 13 | private String label; 14 | 15 | @Autowired 16 | private int value; 17 | 18 | @Autowired 19 | private String color; 20 | 21 | public GameObject(String id, String label, int value, String color) { 22 | this.id = id; 23 | this.label = label; 24 | this.value = value; 25 | this.color = color; 26 | } 27 | 28 | public String getId() { 29 | return id; 30 | } 31 | 32 | public void setId(String id) { 33 | this.id = id; 34 | } 35 | 36 | public String getLabel() { 37 | return label; 38 | } 39 | 40 | public void setLabel(String label) { 41 | this.label = label; 42 | } 43 | 44 | public int getValue() { 45 | return value; 46 | } 47 | 48 | public void setValue(int value) { 49 | this.value = value; 50 | } 51 | 52 | public String getColor() { 53 | return color; 54 | } 55 | 56 | public void setColor(String color) { 57 | this.color = color; 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/domain/TagObject.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain; 2 | 3 | import lombok.AllArgsConstructor; 4 | import lombok.Data; 5 | import lombok.NoArgsConstructor; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | 8 | import java.io.Serializable; 9 | 10 | /** 11 | * Created by thpffcj on 2019/10/25. 
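 * A single tag entry (label plus integer value), serialized to JSON as part of a TagReturnObject by TagObjectEncoder.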
12 | */ 13 | public class TagObject implements Serializable { 14 | 15 | @Autowired 16 | private String label; 17 | 18 | @Autowired 19 | private int value; 20 | 21 | public TagObject() { 22 | } 23 | 24 | public TagObject(String label, int value) { 25 | this.label = label; 26 | this.value = value; 27 | } 28 | 29 | public String getLabel() { 30 | return label; 31 | } 32 | 33 | public void setLabel(String label) { 34 | this.label = label; 35 | } 36 | 37 | public int getValue() { 38 | return value; 39 | } 40 | 41 | public void setValue(int value) { 42 | this.value = value; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/encoder/ApiObjectEncoder.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.encoder; 2 | 3 | import javax.websocket.EncodeException; 4 | import javax.websocket.Encoder; 5 | import javax.websocket.EndpointConfig; 6 | 7 | import cn.edu.nju.api.ApiReturnObject; 8 | import com.alibaba.fastjson.JSON; 9 | import com.alibaba.fastjson.serializer.SerializerFeature; 10 | import com.alibaba.fastjson.serializer.SimplePropertyPreFilter; 11 | 12 | public class ApiObjectEncoder implements Encoder.Text { 13 | 14 | @Override 15 | public String encode(ApiReturnObject apiReturnObject) throws EncodeException { 16 | SimplePropertyPreFilter filter = new SimplePropertyPreFilter( 17 | ApiReturnObject.class, "timeFieldObjects"); 18 | return JSON.toJSONString(apiReturnObject,filter,SerializerFeature.DisableCircularReferenceDetect); 19 | } 20 | 21 | @Override 22 | public void init(EndpointConfig endpointConfig) { 23 | 24 | } 25 | 26 | @Override 27 | public void destroy() { 28 | 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/encoder/TagObjectEncoder.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.encoder; 2 | 3 | import cn.edu.nju.api.TagReturnObject; 4 | import com.alibaba.fastjson.JSON; 5 | import com.alibaba.fastjson.serializer.SerializerFeature; 6 | import com.alibaba.fastjson.serializer.SimplePropertyPreFilter; 7 | 8 | import javax.websocket.EncodeException; 9 | import javax.websocket.Encoder; 10 | import javax.websocket.EndpointConfig; 11 | 12 | /** 13 | * Created by thpffcj on 2019/10/26. 
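 * WebSocket Encoder that serializes a TagReturnObject with fastjson, keeping only the tagObjects property
 * and disabling circular-reference detection.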
14 | */ 15 | public class TagObjectEncoder implements Encoder.Text { 16 | 17 | @Override 18 | public String encode(TagReturnObject tagReturnObject) throws EncodeException { 19 | SimplePropertyPreFilter filter = new SimplePropertyPreFilter( 20 | TagReturnObject.class, "tagObjects"); 21 | return JSON.toJSONString(tagReturnObject,filter, SerializerFeature.DisableCircularReferenceDetect); 22 | } 23 | 24 | @Override 25 | public void init(EndpointConfig endpointConfig) { 26 | 27 | } 28 | 29 | @Override 30 | public void destroy() { 31 | 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/utils/DbPool.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.utils; 2 | 3 | import cn.edu.nju.domain.TagObject; 4 | import com.mchange.v2.c3p0.ComboPooledDataSource; 5 | 6 | import java.sql.Connection; 7 | import java.sql.SQLException; 8 | import java.util.ArrayList; 9 | import java.sql.PreparedStatement; 10 | import java.sql.ResultSet; 11 | import java.util.List; 12 | 13 | /** 14 | * Created by thpffcj on 2020/2/26. 15 | */ 16 | public class DbPool { 17 | 18 | private static DbPool instance; 19 | 20 | private ComboPooledDataSource ds; 21 | 22 | private DbPool() throws Exception { 23 | ds = new ComboPooledDataSource(); 24 | ds.setDriverClass("oracle.jdbc.driver.OracleDriver"); //驱动 25 | ds.setJdbcUrl("jdbc:oracle:thin:@localhost:1521:orcl"); //地址 26 | ds.setUser("test0816"); //数据库用户名 27 | ds.setPassword("934617699"); //数据库用户密码 28 | 29 | // 初始化时获取三个连接,取值应在minPoolSize与maxPoolSize之间。Default: 5 initialPoolSize 30 | ds.setInitialPoolSize(5); 31 | // 连接池中保留的最大连接数。Default: 20 maxPoolSize 32 | ds.setMaxPoolSize(20); 33 | // 连接池中保留的最小连接数。 34 | ds.setMinPoolSize(1); 35 | // 当连接池中的连接耗尽的时候c3p0一次同时获取的连接数。Default: 5 acquireIncrement 36 | ds.setAcquireIncrement(10); 37 | } 38 | 39 | // 用来返回该对象 40 | public static final DbPool getInstance() { 41 | 42 | if (instance == null) { 43 | try { 44 | instance = new DbPool(); 45 | } catch (Exception e) { 46 | e.printStackTrace(); 47 | } 48 | } 49 | return instance; 50 | } 51 | 52 | // 返回一个连接 53 | public synchronized final Connection getConnection() { 54 | try { 55 | return ds.getConnection(); 56 | } catch (SQLException e) { 57 | e.printStackTrace(); 58 | } 59 | return null; 60 | } 61 | 62 | public static void main(String[] args) { 63 | DbPool dbPool = DbPool.getInstance() ; 64 | 65 | List list = new ArrayList<>(); 66 | 67 | Connection connection = dbPool.getConnection(); 68 | String sql = "select * from person " ; 69 | 70 | try { 71 | PreparedStatement pt = connection.prepareStatement(sql) ; 72 | ResultSet rt = pt.executeQuery() ; 73 | 74 | while(rt.next()) { 75 | TagObject tag = new TagObject(); 76 | tag.setLabel(rt.getString("label")); 77 | tag.setValue(rt.getInt("value")); 78 | list.add(tag) ; 79 | } 80 | 81 | for(TagObject tag : list) { 82 | System.out.println(tag); 83 | } 84 | } catch (SQLException e) { 85 | e.printStackTrace(); 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/utils/HBaseUtils.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.utils; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.hbase.client.HBaseAdmin; 5 | import org.apache.hadoop.hbase.client.HTable; 6 | import org.apache.hadoop.hbase.client.Put; 7 | import 
org.apache.hadoop.hbase.util.Bytes; 8 | 9 | import java.io.IOException; 10 | 11 | /** 12 | * Created by thpffcj on 2019/10/17. 13 | */ 14 | public class HBaseUtils { 15 | 16 | HBaseAdmin admin = null; 17 | Configuration configuration = null; 18 | 19 | /** 20 | * 私有改造方法 21 | */ 22 | private HBaseUtils() { 23 | configuration = new Configuration(); 24 | configuration.set("hbase.zookeeper.quorum", "192.168.92.130:2181"); 25 | configuration.set("hbase.rootdir", "hdfs://192.168.92.130:8020/hbase"); 26 | 27 | try { 28 | admin = new HBaseAdmin(configuration); 29 | } catch (IOException e) { 30 | e.printStackTrace(); 31 | } 32 | } 33 | 34 | private static HBaseUtils instance = null; 35 | 36 | public static synchronized HBaseUtils getInstance() { 37 | if (null == instance) { 38 | instance = new HBaseUtils(); 39 | } 40 | return instance; 41 | } 42 | 43 | 44 | /** 45 | * 根据表名获取到HTable实例 46 | */ 47 | public HTable getTable(String tableName) { 48 | 49 | HTable table = null; 50 | 51 | try { 52 | table = new HTable(configuration, tableName); 53 | } catch (IOException e) { 54 | e.printStackTrace(); 55 | } 56 | 57 | return table; 58 | } 59 | 60 | /** 61 | * 添加一条记录到HBase表 62 | * 63 | * @param tableName HBase表名 64 | * @param rowkey HBase表的rowkey 65 | * @param cf HBase表的columnfamily 66 | * @param column HBase表的列 67 | * @param value 写入HBase表的值 68 | */ 69 | public void put(String tableName, String rowkey, String cf, String column, String value) { 70 | HTable table = getTable(tableName); 71 | 72 | Put put = new Put(Bytes.toBytes(rowkey)); 73 | put.add(Bytes.toBytes(cf), Bytes.toBytes(column), Bytes.toBytes(value)); 74 | 75 | try { 76 | table.put(put); 77 | } catch (IOException e) { 78 | e.printStackTrace(); 79 | } 80 | } 81 | 82 | public static void main(String[] args) { 83 | 84 | // HTable table = HBaseUtils.getInstance().getTable("imooc_course_clickcount"); 85 | // System.out.println(table.getName().getNameAsString()); 86 | 87 | String tableName = "imooc_course_clickcount"; 88 | String rowkey = "20171111_88"; 89 | String cf = "info"; 90 | String column = "click_count"; 91 | String value = "2"; 92 | HBaseUtils.getInstance().put(tableName, rowkey, cf, column, value); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/utils/Test.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.utils; 2 | 3 | import java.sql.Connection; 4 | import java.sql.DriverManager; 5 | import java.sql.ResultSet; 6 | import java.sql.Statement; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | /** 11 | * Created by thpffcj on 2019/10/28. 
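 * One-off script that rewrites roll_up so recommendations_up becomes a running total: for each game and month
 * it adds the previous (already accumulated) month's value and updates the row, so rerunning it would inflate the totals.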
12 | */ 13 | public class Test { 14 | 15 | public static void main(String[] args) { 16 | 17 | try { 18 | int[] time = new int[]{1483228800 19 | ,1485907200 20 | ,1488326400 21 | ,1491004800 22 | ,1493596800 23 | ,1496275200 24 | ,1498867200 25 | ,1501545600 26 | ,1504224000 27 | ,1506816000 28 | ,1509494400 29 | ,1512086400 30 | ,1514764800 31 | ,1517443200 32 | ,1519862400 33 | ,1522540800 34 | ,1525132800 35 | ,1527811200 36 | ,1530403200 37 | ,1533081600 38 | ,1535760000 39 | ,1538352000 40 | ,1541030400 41 | ,1543622400 42 | ,1546300800 43 | ,1548979200 44 | ,1551398400 45 | ,1554076800 46 | ,1556668800 47 | ,1559347200 48 | ,1561939200 49 | ,1564617600 50 | ,1567296000 51 | ,1569888000}; 52 | //调用Class.forName()方法加载驱动程序 53 | Class.forName("com.mysql.jdbc.Driver"); 54 | System.out.println("成功加载MySQL驱动!"); 55 | 56 | String url = "jdbc:mysql://172.19.240.128:3306/steam"; //JDBC的URL 57 | Connection conn; 58 | 59 | conn = DriverManager.getConnection(url, "root", "root"); 60 | 61 | Statement stmt = conn.createStatement(); 62 | System.out.println("成功连接到数据库!"); 63 | 64 | String sql = "select distinct name from roll_up"; 65 | ResultSet rs = stmt.executeQuery(sql); 66 | List gameName = new ArrayList<>(); 67 | while (rs.next()) { 68 | gameName.add(rs.getString(1)); 69 | } 70 | 71 | for (int i = 1; i < time.length; i++) { 72 | for (int j = 0; j < gameName.size(); j++) { 73 | 74 | sql = "select recommendations_up from roll_up where name = '" + gameName.get(j) + "' and time = " + time[i]; 75 | rs = stmt.executeQuery(sql); 76 | int up1 = 0; 77 | while (rs.next()) { 78 | up1 = rs.getInt(1); 79 | } 80 | 81 | sql = "select recommendations_up from roll_up where name = '" + gameName.get(j) + "' and time = " + time[i - 1]; 82 | rs = stmt.executeQuery(sql); 83 | int up2 = 0; 84 | while (rs.next()) { 85 | up2 = rs.getInt(1); 86 | } 87 | 88 | System.out.println(up1 + " " + up2); 89 | int up = up1 + up2; 90 | sql = "update roll_up set recommendations_up = " + up + " where name = '" + gameName.get(j) + "' and time = " + time[i]; 91 | System.out.println(sql); 92 | stmt.executeUpdate(sql); 93 | } 94 | } 95 | 96 | rs.close(); 97 | stmt.close(); 98 | conn.close(); 99 | } catch (Exception e) { 100 | e.printStackTrace(); 101 | } 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/RollupCSV/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/RollupCSV/.part-00000-4a85a460-3bdb-48ad-bc81-9b68057082a8-c000.csv.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/spark-streaming/src/main/resources/RollupCSV/.part-00000-4a85a460-3bdb-48ad-bc81-9b68057082a8-c000.csv.crc -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/RollupCSV/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/spark-streaming/src/main/resources/RollupCSV/_SUCCESS -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/game.json: -------------------------------------------------------------------------------- 1 
| { 2 | "img_src": "https://media.st.dl.bscstorage.net/steam/apps/1085660/capsule_sm_120.jpg?t=1570039639", 3 | "game_detail": { 4 | "support_tags": [ 5 | "单人", 6 | "在线多人" 7 | ], 8 | "user_reviews": { 9 | "发行商:": "Bungie", 10 | "发行日期:": "2019年10月1日", 11 | "开发商:": "Bungie" 12 | }, 13 | "user_tags": [ 14 | "第一人称射击", 15 | "多人" 16 | ], 17 | "reviewsChart": { 18 | "weeks": [], 19 | "rollup_type": "week", 20 | "end_date": 1571529600, 21 | "recent": [ 22 | { 23 | "date": 1569888000, 24 | "recommendations_up": 5205, 25 | "recommendations_down": 1467 26 | }, 27 | { 28 | "date": 1569974400, 29 | "recommendations_up": 3881, 30 | "recommendations_down": 1616 31 | } 32 | ], 33 | "rollups": [ 34 | { 35 | "date": 1569888000, 36 | "recommendations_up": 16003, 37 | "recommendations_down": 6234 38 | } 39 | ], 40 | "start_date": 1569888000 41 | } 42 | }, 43 | "original_price": "免费开玩", 44 | "review_summary": "多半好评
30,477 篇用户的游戏评测中有 72% 为好评。", 45 | "price": "免费开玩", 46 | "date": "2019年10月1日", 47 | "name": "Destiny 2", 48 | "page": 1, 49 | "href": "https://store.steampowered.com/app/1085660/Destiny_2/?snr=1_7_7_230_150_1" 50 | } -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/gameAll.json: -------------------------------------------------------------------------------- 1 | { 2 | "page": 1, 3 | "date": "2019年9月26日", 4 | "href": "https://store.steampowered.com/app/678960/CODE_VEIN/?snr=1_7_7_230_150_1", 5 | "review_summary": "特别好评
4,945 篇用户的游戏评测中有 84% 为好评。", 6 | "img_src": "https://media.st.dl.bscstorage.net/steam/apps/678960/capsule_sm_120.jpg?t=1570113292", 7 | "name": "CODE VEIN", 8 | "game_detail": { 9 | "user_reviews": { 10 | "开发商:": "BANDAI NAMCO Studios", 11 | "发行商:": "BANDAI NAMCO Entertainment", 12 | "发行日期:": "2019年9月26日" 13 | }, 14 | "support_tags": [ 15 | "单人", 16 | "在线合作", 17 | "Steam 成就", 18 | "Steam 集换式卡牌", 19 | "部分支持控制器", 20 | "Steam 云" 21 | ], 22 | "user_tags": [ 23 | "动漫", 24 | "角色定制", 25 | "类魂系列", 26 | "角色扮演", 27 | "动作", 28 | "合作", 29 | "日系角色扮演", 30 | "第三人称视角", 31 | "吸血鬼", 32 | "暴力", 33 | "多人", 34 | "黑暗奇幻", 35 | "困难", 36 | "血腥", 37 | "动作角色扮演", 38 | "单人", 39 | "末日", 40 | "砍杀", 41 | "冒险", 42 | "好评原声音轨" 43 | ], 44 | "reviewsChart": { 45 | "rollups": [ 46 | { 47 | "recommendations_up": 2680, 48 | "date": 1569456000, 49 | "recommendations_down": 549 50 | }, 51 | { 52 | "recommendations_up": 907, 53 | "date": 1570060800, 54 | "recommendations_down": 164 55 | }, 56 | { 57 | "recommendations_up": 437, 58 | "date": 1570665600, 59 | "recommendations_down": 74 60 | }, 61 | { 62 | "recommendations_up": 167, 63 | "date": 1571270400, 64 | "recommendations_down": 34 65 | } 66 | ], 67 | "weeks": [], 68 | "start_date": 1569456000, 69 | "rollup_type": "week", 70 | "end_date": 1571616000, 71 | "recent": [ 72 | { 73 | "recommendations_up": 29, 74 | "date": 1569456000, 75 | "recommendations_down": 9 76 | }, 77 | { 78 | "recommendations_up": 918, 79 | "date": 1569542400, 80 | "recommendations_down": 160 81 | }, 82 | { 83 | "recommendations_up": 448, 84 | "date": 1569628800, 85 | "recommendations_down": 131 86 | }, 87 | { 88 | "recommendations_up": 397, 89 | "date": 1569715200, 90 | "recommendations_down": 88 91 | }, 92 | { 93 | "recommendations_up": 374, 94 | "date": 1569801600, 95 | "recommendations_down": 77 96 | }, 97 | { 98 | "recommendations_up": 344, 99 | "date": 1569888000, 100 | "recommendations_down": 48 101 | }, 102 | { 103 | "recommendations_up": 170, 104 | "date": 1569974400, 105 | "recommendations_down": 36 106 | }, 107 | { 108 | "recommendations_up": 197, 109 | "date": 1570060800, 110 | "recommendations_down": 35 111 | }, 112 | { 113 | "recommendations_up": 136, 114 | "date": 1570147200, 115 | "recommendations_down": 36 116 | }, 117 | { 118 | "recommendations_up": 151, 119 | "date": 1570233600, 120 | "recommendations_down": 37 121 | }, 122 | { 123 | "recommendations_up": 131, 124 | "date": 1570320000, 125 | "recommendations_down": 23 126 | }, 127 | { 128 | "recommendations_up": 121, 129 | "date": 1570406400, 130 | "recommendations_down": 17 131 | }, 132 | { 133 | "recommendations_up": 94, 134 | "date": 1570492800, 135 | "recommendations_down": 8 136 | }, 137 | { 138 | "recommendations_up": 77, 139 | "date": 1570579200, 140 | "recommendations_down": 8 141 | }, 142 | { 143 | "recommendations_up": 68, 144 | "date": 1570665600, 145 | "recommendations_down": 11 146 | }, 147 | { 148 | "recommendations_up": 62, 149 | "date": 1570752000, 150 | "recommendations_down": 21 151 | }, 152 | { 153 | "recommendations_up": 68, 154 | "date": 1570838400, 155 | "recommendations_down": 11 156 | }, 157 | { 158 | "recommendations_up": 79, 159 | "date": 1570924800, 160 | "recommendations_down": 12 161 | }, 162 | { 163 | "recommendations_up": 67, 164 | "date": 1571011200, 165 | "recommendations_down": 5 166 | }, 167 | { 168 | "recommendations_up": 54, 169 | "date": 1571097600, 170 | "recommendations_down": 9 171 | }, 172 | { 173 | "recommendations_up": 39, 174 | "date": 1571184000, 175 | "recommendations_down": 5 176 | }, 177 | { 
178 | "recommendations_up": 44, 179 | "date": 1571270400, 180 | "recommendations_down": 7 181 | }, 182 | { 183 | "recommendations_up": 32, 184 | "date": 1571356800, 185 | "recommendations_down": 8 186 | }, 187 | { 188 | "recommendations_up": 43, 189 | "date": 1571443200, 190 | "recommendations_down": 11 191 | }, 192 | { 193 | "recommendations_up": 41, 194 | "date": 1571529600, 195 | "recommendations_down": 7 196 | }, 197 | { 198 | "recommendations_up": 7, 199 | "date": 1571616000, 200 | "recommendations_down": 1 201 | } 202 | ] 203 | } 204 | }, 205 | "price": "¥ 268", 206 | "original_price": "¥ 268" 207 | } 208 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/gameDetail.json: -------------------------------------------------------------------------------- 1 | { 2 | "reviewsChart": { 3 | "end_date": 1571616000, 4 | "rollups": [ 5 | { 6 | "recommendations_down": 34, 7 | "date": 1571270400, 8 | "recommendations_up": 167 9 | } 10 | ], 11 | "recent": [ 12 | { 13 | "recommendations_down": 1, 14 | "date": 1571616000, 15 | "recommendations_up": 7 16 | } 17 | ], 18 | "rollup_type": "week", 19 | "weeks": [], 20 | "start_date": 1569456000 21 | }, 22 | "support_tags": [ 23 | "单人", 24 | "在线合作", 25 | "Steam 成就" 26 | ], 27 | "user_tags": [ 28 | "动漫", 29 | "砍杀", 30 | "冒险", 31 | "好评原声音轨" 32 | ], 33 | "user_reviews": { 34 | "发行日期:": "2019年9月26日", 35 | "开发商:": "BANDAI NAMCO Studios", 36 | "发行商:": "BANDAI NAMCO Entertainment" 37 | } 38 | } -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Set the default spark-shell log level to WARN. When running the spark-shell, the 9 | # log level for this class is used to overwrite the root logger's log level, so that 10 | # the user can have different defaults for the shell and regular Spark apps. 
11 | log4j.logger.org.apache.spark.repl.Main=WARN 12 | 13 | # Settings to quiet third party logs that are too verbose 14 | log4j.logger.org.spark_project.jetty=WARN 15 | log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR 16 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 17 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 18 | log4j.logger.org.apache.parquet=ERROR 19 | log4j.logger.parquet=ERROR 20 | 21 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 22 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 23 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/test.txt: -------------------------------------------------------------------------------- 1 | 1,使命召唤1,2019-10-13,好玩1 2 | 4,使命召唤2,2019-10-14,好玩2 3 | 1,使命召唤3,2019-10-15,好玩3 4 | 2,使命召唤4,2019-10-16,好玩4 5 | 1,使命召唤5,2019-10-13,好玩5 6 | 1,使命召唤1,2019-10-15,好玩6 7 | 9999,使命召唤1,2019-10-15,好玩6 8 | https://media.st.dl.bscstorage.net/steam/apps/678960/capsule_sm_120.jpg?t=1570113292 {"reviewsChart": {"end_date": 1571616000, "rollups": [{"recommendations_down": 34, "date": 1571270400, "recommendations_up": 167}, {"recommendations_down": 34, "date": 1571270400, "recommendations_up": 167}], "recent": [{"recommendations_down": 1, "date": 1571616000, "recommendations_up": 7}], "rollup_type": "month", "weeks": [], "start_date": 1569456000}, "support_tags": ["单人", "在线合作", "Steam 成就"], "user_tags": ["动漫", "砍杀", "冒险", "好评原声音轨"], "user_reviews": {"发行日期:": "2019年9月26日", "开发商:": "BANDAI NAMCO Studios", "发行商:": "BANDAI NAMCO Entertainment"}} ¥ 268 ¥ 268 特别好评
4,945 篇用户的游戏评测中有 84% 为好评。 2019年9月26日 CODE VEIN -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/update.sql: -------------------------------------------------------------------------------- 1 | 1483228800 2 | 1485907200 3 | 1488326400 4 | 1491004800 5 | 1493596800 6 | 1496275200 7 | 1498867200 8 | 1501545600 9 | 1504224000 10 | 1506816000 11 | 1509494400 12 | 1512086400 13 | 1514764800 14 | 1517443200 15 | 1519862400 16 | 1522540800 17 | 1525132800 18 | 1527811200 19 | 1530403200 20 | 1533081600 21 | 1535760000 22 | 1538352000 23 | 1541030400 24 | 1543622400 25 | 1546300800 26 | 1548979200 27 | 1551398400 28 | 1554076800 29 | 1556668800 30 | 1559347200 31 | 1561939200 32 | 1564617600 33 | 1567296000 34 | 1569888000 -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/BatchProcess.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util.Properties 5 | 6 | import cn.edu.nju.utils.DateUtils 7 | import org.apache.spark.broadcast.Broadcast 8 | import org.apache.spark.{SparkConf, SparkContext} 9 | import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, SparkSession} 10 | 11 | import scala.collection.mutable.ListBuffer 12 | 13 | 14 | /** 15 | * Created by thpffcj on 2019/10/19. 16 | */ 17 | object BatchProcess { 18 | 19 | def main(args: Array[String]): Unit = { 20 | saveTop10ToCsv() 21 | } 22 | 23 | /** 24 | * 从MySQL中读取top10写入csv文件 25 | */ 26 | def saveTop10ToCsv(): Unit = { 27 | 28 | val sparkConf = new SparkConf().setMaster("local").setAppName("BatchProcess") 29 | val sc = SparkSession.builder().config(sparkConf).getOrCreate() 30 | 31 | val csvSavePath = "src/main/resources/RollupCSV" 32 | 33 | val tableName = "(select name, recommendations_up, time from top10_new order by time) as top10" 34 | val data: DataFrame = readMysqlTable(sc, tableName) 35 | 36 | import sc.implicits._ 37 | data.map(row => { 38 | 39 | val name = row.getAs("name").toString 40 | val types = "game" 41 | val recommendations_up = row.getAs("recommendations_up").toString 42 | val date = DateUtils.tranTimestampToString(row.getAs("time")) 43 | 44 | println((name, types, recommendations_up, date)) 45 | 46 | (name, types, recommendations_up, date) 47 | }).toDF("name", "type", "value", "date").write.mode(SaveMode.Overwrite).csv(csvSavePath) 48 | 49 | sc.stop() 50 | } 51 | 52 | /** 53 | * 按月份统计top10存入MySQL 54 | */ 55 | def saveRollUpToMysql() = { 56 | 57 | val sparkConf = new SparkConf().setMaster("local[1]").setAppName("BatchProcess") 58 | val sc = SparkSession.builder().config(sparkConf).getOrCreate() 59 | 60 | val dates = DateUtils.getSteamDates() 61 | 62 | for (date <- dates) { 63 | val time = (new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(date).getTime / 1000).toInt 64 | println(time) 65 | val tableName = "(select * from roll_up where time = " + time + " order by recommendations_up desc limit 10) as roll_up" 66 | val data: DataFrame = readMysqlTable(sc, tableName) 67 | 68 | val properties = new Properties() 69 | properties.setProperty("user", "root") 70 | properties.setProperty("password", "root") 71 | data.write.mode(SaveMode.Append).jdbc("jdbc:mysql://172.19.240.128:3306/steam", "top10_new", properties) 72 | } 73 | 74 | sc.stop() 75 | } 76 | 77 | // TODO Spark不支持Update操作 78 | def addRollUpByMonth()= { 79 | 80 | val sparkConf = new 
SparkConf().setMaster("local[2]").setAppName("BatchProcess") 81 | val sc = SparkSession.builder().config(sparkConf).getOrCreate() 82 | val dates = DateUtils.getSteamDates() 83 | 84 | var tableName = "(select name from roll_up) as roll_up" 85 | val data: DataFrame = readMysqlTable(sc, tableName) 86 | 87 | // 广播变量 88 | val gameName = new ListBuffer[String] 89 | val broadcast: Broadcast[ListBuffer[String]] = sc.sparkContext.broadcast(gameName) 90 | data.foreach(row => { 91 | broadcast.value.append(row.getAs("name").toString) 92 | }) 93 | 94 | for (game <- gameName) { 95 | tableName = "(select recommendations_up from roll_up where name = '" + game + "') as roll_up" 96 | val data: DataFrame = readMysqlTable(sc, tableName) 97 | data.show() 98 | } 99 | 100 | sc.stop() 101 | } 102 | 103 | def readMysqlTable(sparkSession: SparkSession, tableName: String) = { 104 | 105 | sparkSession 106 | .read 107 | .format("jdbc") 108 | .option("url", "jdbc:mysql://172.19.240.128:3306/steam") 109 | .option("user", "root") 110 | .option("password", "root") 111 | .option("dbtable", tableName) 112 | .load() 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/HDFSProcess.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import java.sql.DriverManager 4 | 5 | import cn.edu.nju.domain.CommentLog 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.streaming.{Seconds, StreamingContext} 8 | 9 | /** 10 | * Created by thpffcj on 2019/10/2. 11 | */ 12 | object HDFSProcess { 13 | 14 | def main(args: Array[String]): Unit = { 15 | 16 | val sparkConf = new SparkConf().setMaster("local[2]").setAppName("HDFSProcess") 17 | // val sparkConf = new SparkConf().setMaster("spark://thpffcj:7077").setAppName("HDFSProcess") 18 | 19 | // 创建StreamingContext需要两个参数:SparkConf和batch interval 20 | val ssc = new StreamingContext(sparkConf, Seconds(5)) 21 | 22 | // 如果使用了stateful的算子,必须要设置checkpoint 23 | // 在生产环境中,建议把checkpoint设置到HDFS的某个文件夹中 24 | // . 
代表当前目录 25 | ssc.checkpoint("/Users/thpffcj/Public/file/cloud_checkpoint/hdfs_process") 26 | 27 | // val data = ssc.textFileStream("hdfs://thpffcj:9000/cloud-computing/") 28 | // nc -lk 9999 29 | val data = ssc.socketTextStream("localhost", 9999) 30 | 31 | // 构建黑名单 32 | val blacks = List("9999") 33 | val blacksRDD = ssc.sparkContext.parallelize(blacks).map(x => (x, true)) 34 | 35 | // 过滤黑名单 36 | val cleanData = data.map(line => (line.split(",")(0), line)) 37 | .transform(rdd => { 38 | rdd.leftOuterJoin(blacksRDD) 39 | .filter(x => x._2._2.getOrElse(false) != true) 40 | .map(x => x._2._1) 41 | }) 42 | 43 | val logs = cleanData.map(line => { 44 | val infos = line.split(",") 45 | CommentLog(infos(0), infos(1), infos(2), infos(3)) 46 | }).filter(commentLog => commentLog.gameName != "") 47 | 48 | // 按游戏名统计评论数 49 | val gameNumber = logs.map(log => { 50 | (log.gameName, 1) 51 | }).updateStateByKey[Int](updateFunction _) 52 | 53 | gameNumber.print() 54 | 55 | ssc.start() 56 | ssc.awaitTermination() 57 | } 58 | 59 | /** 60 | * 把当前的数据去更新已有的或者是旧的数据 61 | * @param currentValues 当前数据 62 | * @param preValues 旧数据 63 | * @return 64 | */ 65 | def updateFunction(currentValues: Seq[Int], preValues: Option[Int]): Option[Int] = { 66 | val current = currentValues.sum 67 | val pre = preValues.getOrElse(0) 68 | Some(current + pre) 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/KafkaProcess.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import cn.edu.nju.domain.UserData 4 | import org.apache.kafka.clients.consumer.ConsumerConfig 5 | import org.apache.kafka.common.serialization.StringDeserializer 6 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 7 | import org.apache.spark.streaming.{Seconds, StreamingContext} 8 | import org.apache.spark.{SparkConf, SparkContext} 9 | 10 | /** 11 | * Created by thpffcj on 2019/10/19. 
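 *
 * Consumes user behaviour records (tab-separated userId, gameName, behavior,
 * duration) from the Kafka topic "steam", then keeps a running purchase count
 * per game with updateStateByKey, sums play time and play counts per game for
 * the average-duration view, and extracts per-user play time for Dota 2.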
12 | */ 13 | object KafkaProcess { 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | val sparkConf = new SparkConf().setMaster("local[2]").setAppName("StreamProcess") 18 | val ssc = new StreamingContext(sparkConf, Seconds(5)) 19 | 20 | ssc.checkpoint("/Users/thpffcj/Public/file/cloud_checkpoint/stream_process") 21 | 22 | val bootstrapServers = "thpffcj1:9092" 23 | val groupId = "test" 24 | val topicName = "steam" 25 | val maxPoll = 20000 26 | 27 | val kafkaParams = Map( 28 | ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> bootstrapServers, 29 | ConsumerConfig.GROUP_ID_CONFIG -> groupId, 30 | ConsumerConfig.MAX_POLL_RECORDS_CONFIG -> maxPoll.toString, 31 | ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer], 32 | ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer] 33 | ) 34 | 35 | val messages = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent, 36 | ConsumerStrategies.Subscribe[String, String](Set(topicName), kafkaParams)) 37 | 38 | val rawData = messages.map(_.value()) 39 | 40 | val data = rawData.map(line => { 41 | val record = line.split("\t") 42 | UserData(record(0), record(1), record(2), record(3).toDouble) 43 | }) 44 | 45 | // 游戏销量 46 | val gameSale = data.filter(userData => userData.behavior == "purchase") 47 | .map(userData => { 48 | (userData.gameName, 1) 49 | }).updateStateByKey[Int](updateFunction _) 50 | 51 | gameSale.print() 52 | 53 | // 游戏游玩平均时长 54 | val gamePopularity = data.filter(userData => userData.behavior == "play").map( 55 | userData => { 56 | (userData.gameName, (userData.duration, 1)) 57 | } 58 | ).reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2)) 59 | 60 | gamePopularity.print() 61 | 62 | // Dota 2游玩时长 63 | val gameDuration = data.filter( 64 | userData => userData.gameName == "Dota 2" & userData.behavior == "play").map( 65 | userData => { 66 | (userData.userId, userData.duration) 67 | }) 68 | 69 | gameDuration.print() 70 | 71 | ssc.start() 72 | ssc.awaitTermination() 73 | } 74 | 75 | /** 76 | * 把当前的数据去更新已有的或者是旧的数据 77 | * @param currentValues 当前数据 78 | * @param preValues 旧数据 79 | * @return 80 | */ 81 | def updateFunction(currentValues: Seq[Int], preValues: Option[Int]): Option[Int] = { 82 | val current = currentValues.sum 83 | val pre = preValues.getOrElse(0) 84 | Some(current + pre) 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/MongoDBProcess.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import com.mongodb.spark.MongoSpark 4 | import org.apache.log4j.{Level, Logger} 5 | import org.apache.spark.sql.DataFrame 6 | 7 | /** 8 | * Created by thpffcj on 2019/9/24. 
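 *
 * Loads the China.games collection through the MongoDB Spark connector
 * (spark.mongodb.input.uri), registers it as a temporary view named "games"
 * and lists the game names with Spark SQL. The masked host in the URI has to
 * be filled in before the job will run.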
9 | */ 10 | object MongoDBProcess { 11 | 12 | Logger.getLogger("org").setLevel(Level.ERROR) 13 | 14 | def main(args: Array[String]): Unit = { 15 | 16 | import org.apache.spark.sql.SparkSession 17 | 18 | val spark = SparkSession.builder() 19 | .master("local[2]") 20 | .appName("MongoDBProcess") 21 | .config("spark.mongodb.input.uri", "mongodb://steam:steam@***.***.***.***:27017/steam_db.China.games") 22 | .getOrCreate() 23 | 24 | val frame: DataFrame = MongoSpark.load(spark) 25 | frame.createTempView("games") 26 | 27 | val res: DataFrame = spark.sql("SELECT name from games") 28 | res.show() 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/MySQLProcess.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util 5 | 6 | import cn.edu.nju.api.{ApiReturnObject, TagReturnObject} 7 | import cn.edu.nju.domain.{GameObject, TagObject} 8 | import org.apache.spark.sql.{DataFrame, SQLContext} 9 | import org.apache.spark.{SparkConf, SparkContext} 10 | 11 | import scala.collection.mutable.ListBuffer 12 | import scala.util.Random 13 | 14 | /** 15 | * Created by thpffcj on 2019/10/24. 16 | */ 17 | class MySQLProcess { 18 | 19 | /** 20 | * 返回动态图所需数据 21 | * @param dates 22 | * @return 23 | */ 24 | def getTimeFieldData(dates: ListBuffer[String]): ApiReturnObject = { 25 | 26 | val sparkConf = new SparkConf().setMaster("local[1]").setAppName("MySQLProcess") 27 | val sc = new SparkContext(sparkConf) 28 | 29 | val sqlContext = new SQLContext(sc) 30 | 31 | val timeFieldObjects = new util.ArrayList[TimeFieldObject] 32 | 33 | for (date <- dates){ 34 | val time = (new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(date).getTime / 1000).toInt 35 | val tableName = "(select * from top10 where time = " + time + " order by recommendations_up desc limit 10) as top10" 36 | val data: DataFrame = readMysqlTable(sqlContext, tableName) 37 | 38 | val gameObjects = new util.ArrayList[GameObject] 39 | val broadcast = sc.broadcast(gameObjects) 40 | var id = 1 41 | data.foreach(row => { 42 | 43 | val name = row.getAs("name").toString 44 | var color = "" 45 | if (MySQLProcess.map.containsKey(name)) { 46 | color = MySQLProcess.map.get(name).toString 47 | } else { 48 | // rgb(218, 198, 76) 49 | color = "rgb(" + Random.nextInt(255) + ", " + Random.nextInt(255) + ", " + Random.nextInt(255) + ")" 50 | MySQLProcess.map.put(name, color) 51 | } 52 | 53 | val gameObject = new GameObject(id.toString, name, row.getAs("recommendations_up"), color) 54 | broadcast.value.add(gameObject) 55 | id = id + 1 56 | }) 57 | 58 | val name = "截止" + date.substring(0, 4) + "年" + date.substring(5, 7) + "月" + "好评累计总数" 59 | val timeFieldObject = new TimeFieldObject(name, broadcast.value) 60 | timeFieldObjects.add(timeFieldObject) 61 | } 62 | 63 | val apiReturnObject = new ApiReturnObject(timeFieldObjects) 64 | 65 | sc.stop() 66 | 67 | apiReturnObject 68 | } 69 | 70 | /** 71 | * 返回词云需要的数据 72 | * @return 73 | */ 74 | def getTagData(round: Int): TagReturnObject = { 75 | 76 | val sparkConf = new SparkConf().setMaster("local[1]").setAppName("MySQLProcess") 77 | val sc = new SparkContext(sparkConf) 78 | 79 | val sqlContext = new SQLContext(sc) 80 | 81 | val tableName = "(select * from tag limit " + 0 + "," + round * 50 + ") as top10" 82 | println(tableName) 83 | val data: DataFrame = readMysqlTable(sqlContext, tableName) 84 | 85 | val tagObjects =new 
util.ArrayList[TagObject] 86 | val broadcast = sc.broadcast(tagObjects) 87 | data.foreach(row => { 88 | val tagObject = new TagObject(row.getAs("game_name"), row.getAs("number")) 89 | broadcast.value.add(tagObject) 90 | }) 91 | 92 | val tagReturnObject = new TagReturnObject(tagObjects) 93 | 94 | sc.stop() 95 | 96 | tagReturnObject 97 | } 98 | 99 | def readMysqlTable(sqlContext: SQLContext, tableName: String) = { 100 | sqlContext 101 | .read 102 | .format("jdbc") 103 | .option("driver", "com.mysql.jdbc.Driver") 104 | .option("url", "jdbc:mysql://172.19.240.128:3306/steam") 105 | .option("user", "root") 106 | .option("password", "root") 107 | .option("dbtable", tableName) 108 | .load() 109 | } 110 | } 111 | 112 | object MySQLProcess { 113 | 114 | val map = new util.HashMap[String, String]() 115 | } 116 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/SteamProcess.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import cn.edu.nju.dao.{RollUpDAO, TagDAO} 4 | import cn.edu.nju.domain.{GameDetail, ReviewsChart, RollUp, SteamLog, Tag} 5 | import com.google.gson.Gson 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.streaming.dstream.DStream 8 | import org.apache.spark.streaming.{Seconds, StreamingContext} 9 | 10 | import scala.collection.mutable.{ListBuffer, Set} 11 | 12 | /** 13 | * Created by thpffcj on 2019/10/21. 14 | */ 15 | object SteamProcess { 16 | 17 | def main(args: Array[String]): Unit = { 18 | 19 | // val sparkConf = new SparkConf().setMaster("local[2]").setAppName("HDFSProcess") 20 | val sparkConf = new SparkConf().setMaster("spark://thpffcj:7077").setAppName("SteamProcess") 21 | 22 | val ssc = new StreamingContext(sparkConf, Seconds(5)) 23 | 24 | // 如果使用了stateful的算子,必须要设置checkpoint 25 | // 在生产环境中,建议把checkpoint设置到HDFS的某个文件夹中 26 | // . 
代表当前目录 27 | ssc.checkpoint("/Users/thpffcj/Public/file/cloud_checkpoint/hdfs_process") 28 | 29 | val rawData = ssc.textFileStream("hdfs://thpffcj:9000/cloud-computing/") 30 | 31 | val gameNameSet: Set[String] = Set() 32 | 33 | /** 34 | * 过滤空行 35 | * 过滤日期为空的数据 36 | * 过滤重复的数据,使用游戏名称过滤 37 | * 过滤game_detail为空的数据 38 | * 为了game_detail为bundle的数据 39 | */ 40 | val data = rawData.filter(rdd => !rdd.isEmpty).map(line => { 41 | val log = line.split("\t") 42 | if (log.length < 7) { 43 | SteamLog("", "", "", "", "", "", "") 44 | } else { 45 | SteamLog(log(0), log(1), log(2), log(3), log(4), log(5), log(6)) 46 | } 47 | }).filter(steamLog => !steamLog.date.isEmpty) 48 | .filter(steamLog => !gameNameSet.contains(steamLog.name)) 49 | .filter(steamLog => !steamLog.game_detail.isEmpty) 50 | .filter(steamLog => !steamLog.game_detail.equals("bundle")) 51 | .map(steamLog => { 52 | gameNameSet.add(steamLog.name) 53 | steamLog 54 | }) 55 | 56 | // 取出用户标签 57 | val userTags = data.map(steamLog => { 58 | val gameDetail = jsonToGameDetail(steamLog.game_detail) 59 | if (gameDetail != null) { 60 | gameDetail.user_tags.toString.replace(" ", "") 61 | } else { 62 | null 63 | } 64 | }).filter(userTags => userTags != null) 65 | 66 | // 标签统计 67 | val tagsNumber = userTags.flatMap(line => line.substring(1, line.length - 1).split(",")) 68 | .map(tag => (tag, 1)).updateStateByKey[Int](updateFunction _) 69 | 70 | // writeTagToMysql(tagsNumber) 71 | tagsNumber.print() 72 | 73 | /** 74 | * (steamLog.name,jsonToReviewsChart(gameDetail.reviewsChart.toString)) 75 | * (CODE VEIN,{recommendations_down=34.0,date=1.5712704E9,recommendations_up=167.0},{recommendations_down=34.0,date=1.5712704E9,recommendations_up=167.0) 76 | */ 77 | val rollups = data.map(steamLog => { 78 | val gameDetail = jsonToGameDetail(steamLog.game_detail) 79 | 80 | // 过滤 reviewsChart["start_date"] 和 reviewsChart["end_date"] 为空的数据 81 | if ((gameDetail != null) && (gameDetail.reviewsChart.get("start_date") != "None") 82 | && (gameDetail.reviewsChart.get("end_date") != "None")) { 83 | (steamLog.name, jsonToReviewsChart(gameDetail.reviewsChart.toString)) 84 | } else { 85 | null 86 | } 87 | }).filter(rollups => rollups != null) 88 | // 目前只考虑以月为时间单位的数据 89 | .filter(reviewsChart => reviewsChart._2.rollup_type == "month") 90 | .map(reviewsChart => { 91 | val line = reviewsChart._2.rollups.toString 92 | (reviewsChart._1, line.substring(1, line.length - 2).replace(" ", "")) 93 | }) 94 | 95 | // 将每个游戏好评数写入到MySQL 96 | rollups.foreachRDD(rdd => { 97 | rdd.foreachPartition(partitionOfRecords => { 98 | val list = new ListBuffer[(String, Int, Int, Int)] 99 | 100 | partitionOfRecords.foreach(record => { 101 | record._2.split("},").foreach(data => { 102 | val rollUp = jsonToRollUp(data + "}") 103 | list.append((record._1, rollUp.date, rollUp.recommendations_up, rollUp.recommendations_down)) 104 | }) 105 | }) 106 | 107 | RollUpDAO.insertRollUp(list) 108 | }) 109 | }) 110 | 111 | // 单条插入 112 | // rollups.foreachRDD(rdd => { 113 | // rdd.foreachPartition(partitionOfRecords => { 114 | // val connection = createConnection() 115 | // partitionOfRecords.foreach(record => { 116 | // record._2.split("},").foreach(data => { 117 | // val rollUp = jsonToRollUp(data + "}") 118 | // val sql = "insert into roll_up(name, time, recommendations_up, recommendations_down) values('" + record._1.replace("'", "") + "'," + rollUp.date + "," + rollUp.recommendations_up + "," + rollUp.recommendations_down + ")" 119 | // connection.createStatement().execute(sql) 120 | // }) 121 | // }) 122 | // 
connection.close() 123 | // }) 124 | // }) 125 | 126 | rollups.print() 127 | 128 | ssc.start() 129 | ssc.awaitTermination() 130 | } 131 | 132 | // def createConnection() = { 133 | // Class.forName("com.mysql.jdbc.Driver") 134 | // DriverManager.getConnection("jdbc:mysql://localhost:3306/steam?useUnicode=true&characterEncoding=utf-8", "root", "000000") 135 | // } 136 | 137 | def jsonToGameDetail(jsonStr: String): GameDetail = { 138 | try { 139 | val gson = new Gson() 140 | gson.fromJson(jsonStr, classOf[GameDetail]) 141 | } catch { 142 | case e: Exception => { 143 | // e.printStackTrace() 144 | null 145 | } 146 | } 147 | } 148 | 149 | def jsonToReviewsChart(jsonStr: String): ReviewsChart = { 150 | try { 151 | val gson = new Gson() 152 | gson.fromJson(jsonStr, classOf[ReviewsChart]) 153 | } catch { 154 | case e: Exception => { 155 | // e.printStackTrace() 156 | null 157 | } 158 | } 159 | } 160 | 161 | def jsonToRollUp(jsonStr: String): RollUp = { 162 | try { 163 | val gson = new Gson() 164 | gson.fromJson(jsonStr, classOf[RollUp]) 165 | } catch { 166 | case e: Exception => { 167 | // e.printStackTrace() 168 | null 169 | } 170 | } 171 | } 172 | 173 | /** 174 | * 把当前的数据去更新已有的或者是旧的数据 175 | * 176 | * @param currentValues 当前数据 177 | * @param preValues 旧数据 178 | * @return 179 | */ 180 | def updateFunction(currentValues: Seq[Int], preValues: Option[Int]): Option[Int] = { 181 | val current = currentValues.sum 182 | val pre = preValues.getOrElse(0) 183 | Some(current + pre) 184 | } 185 | 186 | /** 187 | * 标签数据写入MySQL 188 | * @param tagsNumber 189 | */ 190 | def writeTagToMysql(tagsNumber: DStream[(String, Int)]): Unit = { 191 | 192 | tagsNumber.foreachRDD(rdd => { 193 | rdd.foreachPartition(partitionOfRecords => { 194 | val list = new ListBuffer[Tag] 195 | partitionOfRecords.foreach(record => { 196 | list.append(Tag(record._1, record._2)) 197 | }) 198 | TagDAO.insertTag(list) 199 | }) 200 | }) 201 | } 202 | } 203 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/dao/CourseClickCountDAO.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.dao 2 | 3 | import cn.edu.nju.domain.CourseClickCount 4 | import cn.edu.nju.utils.HBaseUtils 5 | import org.apache.hadoop.hbase.client.Get 6 | import org.apache.hadoop.hbase.util.Bytes 7 | 8 | import scala.collection.mutable.ListBuffer/** 9 | * Created by thpffcj on 2019/10/17. 
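 *
 * HBase DAO for per-day, per-course click totals in the
 * imooc_course_clickcount table. Rows are keyed as "yyyyMMdd_courseId"
 * (e.g. "20171111_88"); save() uses incrementColumnValue so repeated batches
 * accumulate, and count() reads the current value back with a Get.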
10 | */ 11 | object CourseClickCountDAO { 12 | 13 | val tableName = "imooc_course_clickcount" 14 | val cf = "info" 15 | val qualifer = "click_count" 16 | 17 | /** 18 | * 保存数据到HBase 19 | * @param list CourseClickCount集合 20 | */ 21 | def save(list: ListBuffer[CourseClickCount]): Unit = { 22 | 23 | val table = HBaseUtils.getInstance().getTable(tableName) 24 | 25 | for(ele <- list) { 26 | table.incrementColumnValue(Bytes.toBytes(ele.day_course), 27 | Bytes.toBytes(cf), 28 | Bytes.toBytes(qualifer), 29 | ele.click_count) 30 | } 31 | } 32 | 33 | /** 34 | * 根据rowkey查询值 35 | */ 36 | def count(day_course: String): Long = { 37 | val table = HBaseUtils.getInstance().getTable(tableName) 38 | 39 | val get = new Get(Bytes.toBytes(day_course)) 40 | val value = table.get(get).getValue(cf.getBytes, qualifer.getBytes) 41 | 42 | if(value == null) { 43 | 0L 44 | }else{ 45 | Bytes.toLong(value) 46 | } 47 | } 48 | 49 | def main(args: Array[String]): Unit = { 50 | 51 | val list = new ListBuffer[CourseClickCount] 52 | list.append(CourseClickCount("20171111_8",8)) 53 | list.append(CourseClickCount("20171111_9",9)) 54 | list.append(CourseClickCount("20171111_1",100)) 55 | 56 | save(list) 57 | 58 | println(count("20171111_8") + " : " + count("20171111_9")+ " : " + count("20171111_1")) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/dao/CourseSearchClickCountDAO.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.dao 2 | 3 | import cn.edu.nju.domain.CourseSearchClickCount 4 | import cn.edu.nju.utils.HBaseUtils 5 | import org.apache.hadoop.hbase.client.Get 6 | import org.apache.hadoop.hbase.util.Bytes 7 | 8 | import scala.collection.mutable.ListBuffer 9 | 10 | /** 11 | * Created by thpffcj on 2019/10/17. 
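 *
 * Same pattern as CourseClickCountDAO, but keyed by day, referer host and
 * course id (e.g. "20171111_www.baidu.com_8") so clicks arriving from search
 * engines can be counted separately.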
12 | */ 13 | object CourseSearchClickCountDAO { 14 | 15 | val tableName = "imooc_course_search_clickcount" 16 | val cf = "info" 17 | val qualifer = "click_count" 18 | 19 | /** 20 | * 保存数据到HBase 21 | * 22 | * @param list CourseSearchClickCount集合 23 | */ 24 | def save(list: ListBuffer[CourseSearchClickCount]): Unit = { 25 | 26 | val table = HBaseUtils.getInstance().getTable(tableName) 27 | 28 | for(ele <- list) { 29 | table.incrementColumnValue(Bytes.toBytes(ele.day_search_course), 30 | Bytes.toBytes(cf), 31 | Bytes.toBytes(qualifer), 32 | ele.click_count) 33 | } 34 | } 35 | 36 | /** 37 | * 根据rowkey查询值 38 | */ 39 | def count(day_search_course: String):Long = { 40 | val table = HBaseUtils.getInstance().getTable(tableName) 41 | 42 | val get = new Get(Bytes.toBytes(day_search_course)) 43 | val value = table.get(get).getValue(cf.getBytes, qualifer.getBytes) 44 | 45 | if(value == null) { 46 | 0L 47 | }else{ 48 | Bytes.toLong(value) 49 | } 50 | } 51 | 52 | def main(args: Array[String]): Unit = { 53 | 54 | val list = new ListBuffer[CourseSearchClickCount] 55 | list.append(CourseSearchClickCount("20171111_www.baidu.com_8",8)) 56 | list.append(CourseSearchClickCount("20171111_cn.bing.com_9",9)) 57 | 58 | save(list) 59 | 60 | println(count("20171111_www.baidu.com_8") + " : " + count("20171111_cn.bing.com_9")) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/dao/RollUpDAO.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.dao 2 | 3 | import cn.edu.nju.domain.RollUp 4 | import java.sql.{Connection, PreparedStatement} 5 | 6 | import cn.edu.nju.utils.MySQLUtils 7 | 8 | import scala.collection.mutable.ListBuffer 9 | 10 | /** 11 | * Created by thpffcj on 2019/10/25. 12 | */ 13 | object RollUpDAO { 14 | 15 | /** 16 | * 批量保存RollUp到数据库 17 | */ 18 | def insertRollUp(list: ListBuffer[(String, Int, Int, Int)]): Unit = { 19 | 20 | var connection: Connection = null 21 | var pstmt: PreparedStatement = null 22 | 23 | try { 24 | connection = MySQLUtils.getConnection() 25 | 26 | connection.setAutoCommit(false) //设置手动提交 27 | 28 | val sql = "insert into roll_up(name, time, recommendations_up, recommendations_down) values (?,?,?,?) " 29 | pstmt = connection.prepareStatement(sql) 30 | 31 | for (element <- list) { 32 | pstmt.setString(1, element._1) 33 | pstmt.setInt(2, element._2) 34 | pstmt.setInt(3, element._3) 35 | pstmt.setInt(4, element._4) 36 | 37 | pstmt.addBatch() 38 | } 39 | 40 | pstmt.executeBatch() // 执行批量处理 41 | connection.commit() // 手工提交 42 | } catch { 43 | case e: Exception => e.printStackTrace() 44 | } finally { 45 | MySQLUtils.release(connection, pstmt) 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/dao/TagDAO.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.dao 2 | 3 | import cn.edu.nju.domain.Tag 4 | import java.sql.{Connection, PreparedStatement} 5 | import cn.edu.nju.utils.MySQLUtils 6 | import scala.collection.mutable.ListBuffer 7 | 8 | /** 9 | * Created by thpffcj on 2019/10/25. 
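 *
 * Batch-inserts tag counts into the MySQL table tag(game_name, number) using a
 * PreparedStatement batch with a manual commit. Illustrative call (the tag
 * values here are made up):
 *   TagDAO.insertTag(ListBuffer(Tag("indie", 42), Tag("action", 17)))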
10 | */ 11 | object TagDAO { 12 | 13 | /** 14 | * 批量保存Tag到数据库 15 | */ 16 | def insertTag(list: ListBuffer[Tag]): Unit = { 17 | 18 | var connection: Connection = null 19 | var pstmt: PreparedStatement = null 20 | 21 | try { 22 | connection = MySQLUtils.getConnection() 23 | 24 | connection.setAutoCommit(false) //设置手动提交 25 | 26 | val sql = "insert into tag(game_name, number) values (?,?)" 27 | pstmt = connection.prepareStatement(sql) 28 | 29 | for (element <- list) { 30 | pstmt.setString(1, element.tagName) 31 | pstmt.setInt(2, element.number) 32 | 33 | pstmt.addBatch() 34 | } 35 | 36 | pstmt.executeBatch() // 执行批量处理 37 | connection.commit() // 手工提交 38 | } catch { 39 | case e: Exception => e.printStackTrace() 40 | } finally { 41 | MySQLUtils.release(connection, pstmt) 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/ClickLog.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | /** 4 | * Created by thpffcj on 2019/10/17. 5 | */ 6 | case class ClickLog(ip:String, time:String, courseId:Int, statusCode:Int, referrer:String) 7 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/CommentLog.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | /** 4 | * Created by thpffcj on 2019/10/17. 5 | */ 6 | case class CommentLog(userId:String, gameName:String, commentTime:String, comment:String) 7 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/CourseClickCount.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | /** 4 | * Created by thpffcj on 2019/10/17. 5 | */ 6 | case class CourseClickCount(day_course:String, click_count:Long) 7 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/CourseSearchClickCount.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | /** 4 | * Created by thpffcj on 2019/10/17. 5 | */ 6 | case class CourseSearchClickCount(day_search_course:String, click_count:Long) 7 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/DouBanLog.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | /** 4 | * Created by thpffcj on 2019/10/18. 5 | */ 6 | case class DouBanLog(star:Double, bd:String, quote:String, title:String) 7 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/GameDetail.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | import com.alibaba.fastjson.JSONObject 4 | 5 | /** 6 | * Created by thpffcj on 2019/10/21. 
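 *
 * Gson target for the game_detail JSON scraped from each Steam store page:
 * the supported-feature tags, the "user reviews" info block, the user-defined
 * tags and the review-count chart (see the sample record in test.txt).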
7 | * @param support_tags 8 | * @param user_reviews 9 | * @param user_tags 10 | * @param reviewsChart 11 | */ 12 | case class GameDetail(support_tags: Object, user_reviews: JSONObject, user_tags: Object, 13 | reviewsChart: JSONObject) 14 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/ReviewsChart.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | /** 4 | * Created by thpffcj on 2019/10/21. 5 | * 6 | * @param weeks 7 | * @param rollup_type 8 | * @param end_date 9 | * @param recent 10 | * @param rollups 11 | * @param start_date 12 | */ 13 | case class ReviewsChart(weeks: Object, rollup_type: String, end_date: Float, recent: Object, 14 | rollups: Object, start_date: Float) 15 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/RollUp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | /** 4 | * Created by thpffcj on 2019/10/23. 5 | */ 6 | case class RollUp(recommendations_up: Int, date: Int, recommendations_down: Int) 7 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/SteamLog.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | import com.alibaba.fastjson.JSONObject 4 | 5 | /** 6 | * Created by thpffcj on 2019/10/21. 7 | * 8 | * @param img_src 9 | * @param game_detail 10 | * @param original_price 11 | * @param price 12 | * @param review_summary 13 | * @param date 14 | * @param name 15 | */ 16 | case class SteamLog(img_src: String, game_detail: String, original_price: String, 17 | price: String, review_summary: String, date: String, name: String) 18 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/Tag.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | /** 4 | * Created by thpffcj on 2019/10/25. 5 | */ 6 | case class Tag(tagName: String, number: Int) 7 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/UserData.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | /** 4 | * Created by thpffcj on 2019/10/19. 5 | * @param userId 游戏玩家ID号 6 | * @param gameName 游戏名称 7 | * @param behavior 玩家购买游戏的行为(购买/玩) 8 | * @param duration 游戏时长,1代表该买了该游戏 9 | */ 10 | case class UserData(userId:String, gameName:String, behavior:String, duration:Double) 11 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/test/DateTest.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.test 2 | 3 | import java.text.SimpleDateFormat 4 | 5 | import cn.edu.nju.utils.DateUtils 6 | 7 | /** 8 | * Created by thpffcj on 2019/10/25. 
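 *
 * Prints the Unix timestamps for the monthly cut-off dates returned by
 * DateUtils.getSteamDates(); the output matches the values listed in
 * update.sql.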
9 | */ 10 | object DateTest { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | val startDate = "2017-03-01 08:00:00" 15 | val startTime: Int = (new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(startDate).getTime / 1000).toInt 16 | 17 | val endDate = "2019-10-01 08:00:00" 18 | val endTime : Int = (new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(endDate).getTime / 1000).toInt 19 | 20 | for (date <- DateUtils.getSteamDates()) { 21 | println((new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(date).getTime / 1000).toInt) 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/test/HDFSProcessTest.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.test 2 | 3 | import cn.edu.nju.domain.{CommentLog, DouBanLog} 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.{Seconds, StreamingContext} 6 | 7 | /** 8 | * Created by thpffcj on 2019/10/2. 9 | */ 10 | object HDFSProcessTest { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | val sparkConf = new SparkConf().setMaster("local[2]").setAppName("HDFSProcess") 15 | 16 | // 创建StreamingContext需要两个参数:SparkConf和batch interval 17 | val ssc = new StreamingContext(sparkConf, Seconds(5)) 18 | 19 | val data = ssc.textFileStream("hdfs://thpffcj:9000/cloud-computing/") 20 | 21 | val log = data.map(line => { 22 | 23 | val infos = line.split("\t") 24 | 25 | DouBanLog(infos(0).toDouble, infos(1), infos(2), infos(3)) 26 | }) 27 | 28 | log.print() 29 | 30 | ssc.start() 31 | ssc.awaitTermination() 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/test/JsonTest.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.test 2 | 3 | import cn.edu.nju.domain.{GameDetail, ReviewsChart, UserData} 4 | import com.google.gson.Gson 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | 7 | import scala.util.parsing.json.JSONObject 8 | 9 | /** 10 | * Created by thpffcj on 2019/10/21. 
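 *
 * Ad-hoc check that Gson can map the scraped game_detail JSON onto GameDetail
 * and the nested reviewsChart JSON onto ReviewsChart.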
11 | */ 12 | object JsonTest { 13 | 14 | def main(args: Array[String]): Unit = { 15 | 16 | // val result1 = jsonToGameDetail("{\"reviewsChart\": {\"end_date\": 1571616000, \"rollups\": [{\"recommendations_down\": 34, \"date\": 1571270400, \"recommendations_up\": 167}], \"recent\": [{\"recommendations_down\": 1, \"date\": 1571616000, \"recommendations_up\": 7}], \"rollup_type\": \"week\", \"weeks\": [], \"start_date\": 1569456000}, \"support_tags\": [\"单人\", \"在线合作\", \"Steam 成就\"], \"user_tags\": [\"动漫\", \"砍杀\", \"冒险\", \"好评原声音轨\"], \"user_reviews\": {\"发行日期:\": \"2019年9月26日\", \"开发商:\": \"BANDAI NAMCO Studios\", \"发行商:\": \"BANDAI NAMCO Entertainment\"}}") 17 | // print(result1) 18 | 19 | val result2 = jsonToReviewsChart("{\"end_date\":1.571616E9,\"weeks\":[],\"rollup_type\":\"week\",\"recent\":[{\"recommendations_down\":1.0,\"date\":1.571616E9,\"recommendations_up\":7.0}],\"rollups\":[{\"recommendations_down\":34.0,\"date\":1.5712704E9,\"recommendations_up\":167.0}],\"start_date\":1.569456E9}") 20 | print(result2) 21 | 22 | } 23 | 24 | def jsonToGameDetail(jsonStr: String): GameDetail = { 25 | val gson = new Gson() 26 | gson.fromJson(jsonStr, classOf[GameDetail]) 27 | } 28 | 29 | def jsonToReviewsChart(jsonStr: String): ReviewsChart = { 30 | val gson = new Gson() 31 | gson.fromJson(jsonStr, classOf[ReviewsChart]) 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/test/StatStreaming.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.test 2 | 3 | import cn.edu.nju.dao.{CourseClickCountDAO, CourseSearchClickCountDAO} 4 | import cn.edu.nju.domain.{ClickLog, CourseClickCount, CourseSearchClickCount} 5 | import cn.edu.nju.utils.DateUtils 6 | import org.apache.kafka.clients.consumer.ConsumerConfig 7 | import org.apache.kafka.common.serialization.StringDeserializer 8 | import org.apache.spark.SparkConf 9 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 10 | import org.apache.spark.streaming.{Seconds, StreamingContext} 11 | 12 | import scala.collection.mutable.ListBuffer 13 | 14 | /** 15 | * Created by thpffcj on 2019/10/17. 
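 *
 * Course-log example kept under the test package: consumes web access logs
 * from Kafka, extracts the course id and referer host, and writes per-day
 * click counts and per-search-engine click counts to HBase through
 * CourseClickCountDAO and CourseSearchClickCountDAO.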
16 | */ 17 | object StatStreaming { 18 | 19 | def main(args: Array[String]): Unit = { 20 | 21 | val sparkConf = new SparkConf().setAppName("StatStreaming") //.setMaster("local[5]") 22 | val ssc = new StreamingContext(sparkConf, Seconds(60)) 23 | 24 | val bootstrapServers = "thpffcj1:9092" 25 | val groupId = "test" 26 | val topicName = "test" 27 | val maxPoll = 20000 28 | 29 | val kafkaParams = Map( 30 | ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> bootstrapServers, 31 | ConsumerConfig.GROUP_ID_CONFIG -> groupId, 32 | ConsumerConfig.MAX_POLL_RECORDS_CONFIG -> maxPoll.toString, 33 | ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer], 34 | ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer] 35 | ) 36 | 37 | val messages = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent, 38 | ConsumerStrategies.Subscribe[String, String](Set(topicName), kafkaParams)) 39 | 40 | // 测试步骤一:测试数据接收 41 | // messages.map(_._2).count().print 42 | 43 | // 测试步骤二:数据清洗 44 | val logs = messages.map(_.value()) 45 | val cleanData = logs.map(line => { 46 | val infos = line.split("\t") 47 | 48 | // infos(2) = "GET /class/130.html HTTP/1.1" 49 | // url = /class/130.html 50 | val url = infos(2).split(" ")(1) 51 | var courseId = 0 52 | 53 | // 把实战课程的课程编号拿到了 54 | if (url.startsWith("/class")) { 55 | val courseIdHTML = url.split("/")(2) 56 | courseId = courseIdHTML.substring(0, courseIdHTML.lastIndexOf(".")).toInt 57 | } 58 | 59 | ClickLog(infos(0), DateUtils.parseToMinute(infos(1)), courseId, infos(3).toInt, infos(4)) 60 | }).filter(clicklog => clicklog.courseId != 0) 61 | 62 | // cleanData.print() 63 | 64 | // 测试步骤三:统计今天到现在为止实战课程的访问量 65 | 66 | cleanData.map(x => { 67 | 68 | // HBase rowkey设计: 20171111_88 69 | (x.time.substring(0, 8) + "_" + x.courseId, 1) 70 | }).reduceByKey(_ + _).foreachRDD(rdd => { 71 | rdd.foreachPartition(partitionRecords => { 72 | val list = new ListBuffer[CourseClickCount] 73 | 74 | partitionRecords.foreach(pair => { 75 | list.append(CourseClickCount(pair._1, pair._2)) 76 | }) 77 | 78 | CourseClickCountDAO.save(list) 79 | }) 80 | }) 81 | 82 | // 测试步骤四:统计从搜索引擎过来的今天到现在为止实战课程的访问量 83 | 84 | cleanData.map(x => { 85 | 86 | /** 87 | * https://www.sogou.com/web?query=Spark SQL实战 88 | */ 89 | val referrer = x.referrer.replaceAll("//", "/") 90 | val splits = referrer.split("/") 91 | var host = "" 92 | if(splits.length > 2) { 93 | host = splits(1) 94 | } 95 | 96 | (host, x.courseId, x.time) 97 | }).filter(_._1 != "").map(x => { 98 | (x._3.substring(0,8) + "_" + x._1 + "_" + x._2 , 1) 99 | }).reduceByKey(_ + _).foreachRDD(rdd => { 100 | rdd.foreachPartition(partitionRecords => { 101 | val list = new ListBuffer[CourseSearchClickCount] 102 | 103 | partitionRecords.foreach(pair => { 104 | list.append(CourseSearchClickCount(pair._1, pair._2)) 105 | }) 106 | 107 | CourseSearchClickCountDAO.save(list) 108 | }) 109 | }) 110 | 111 | ssc.start() 112 | ssc.awaitTermination() 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/test/TransformTest.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.test 2 | 3 | import cn.edu.nju.domain.GameDetail 4 | import com.google.gson.Gson 5 | 6 | /** 7 | * Created by thpffcj on 2019/10/25. 
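 *
 * Ad-hoc check of jsonToGameDetail's error handling for an empty input string.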
8 | */ 9 | object TransformTest { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | jsonToGameDetail("") 14 | } 15 | 16 | def jsonToGameDetail(jsonStr: String): GameDetail = { 17 | try { 18 | val gson = new Gson() 19 | gson.fromJson(jsonStr, classOf[GameDetail]) 20 | } catch { 21 | case e: Exception => { 22 | e.printStackTrace() 23 | null 24 | } 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/utils/DateUtils.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.utils 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util.Date 5 | 6 | import org.apache.commons.lang3.time.FastDateFormat 7 | 8 | import scala.collection.mutable.ListBuffer 9 | 10 | /** 11 | * Created by thpffcj on 2019/10/17. 12 | */ 13 | object DateUtils { 14 | 15 | val YYYYMMDDHHMMSS_FORMAT = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss") 16 | val TARGE_FORMAT = FastDateFormat.getInstance("yyyyMMddHHmmss") 17 | 18 | 19 | def getTime(time: String) = { 20 | YYYYMMDDHHMMSS_FORMAT.parse(time).getTime 21 | } 22 | 23 | def parseToMinute(time :String) = { 24 | TARGE_FORMAT.format(new Date(getTime(time))) 25 | } 26 | 27 | def getSteamDates(): ListBuffer[String] = { 28 | val dates = new ListBuffer[String] 29 | dates.append("2017-01-01 08:00:00") 30 | dates.append("2017-02-01 08:00:00") 31 | dates.append("2017-03-01 08:00:00") 32 | dates.append("2017-04-01 08:00:00") 33 | dates.append("2017-05-01 08:00:00") 34 | dates.append("2017-06-01 08:00:00") 35 | dates.append("2017-07-01 08:00:00") 36 | dates.append("2017-08-01 08:00:00") 37 | dates.append("2017-09-01 08:00:00") 38 | dates.append("2017-10-01 08:00:00") 39 | dates.append("2017-11-01 08:00:00") 40 | dates.append("2017-12-01 08:00:00") 41 | dates.append("2018-01-01 08:00:00") 42 | dates.append("2018-02-01 08:00:00") 43 | dates.append("2018-03-01 08:00:00") 44 | dates.append("2018-04-01 08:00:00") 45 | dates.append("2018-05-01 08:00:00") 46 | dates.append("2018-06-01 08:00:00") 47 | dates.append("2018-07-01 08:00:00") 48 | dates.append("2018-08-01 08:00:00") 49 | dates.append("2018-09-01 08:00:00") 50 | dates.append("2018-10-01 08:00:00") 51 | dates.append("2018-11-01 08:00:00") 52 | dates.append("2018-12-01 08:00:00") 53 | dates.append("2019-01-01 08:00:00") 54 | dates.append("2019-02-01 08:00:00") 55 | dates.append("2019-03-01 08:00:00") 56 | dates.append("2019-04-01 08:00:00") 57 | dates.append("2019-05-01 08:00:00") 58 | dates.append("2019-06-01 08:00:00") 59 | dates.append("2019-07-01 08:00:00") 60 | dates.append("2019-08-01 08:00:00") 61 | dates.append("2019-09-01 08:00:00") 62 | dates.append("2019-10-01 08:00:00") 63 | 64 | dates 65 | } 66 | 67 | def tranTimestampToString(tm: Int): String={ 68 | val fm = new SimpleDateFormat("yyyy/MM") 69 | val tim = fm.format(new Date(tm.toLong * 1000)) 70 | tim 71 | } 72 | 73 | def main(args: Array[String]): Unit = { 74 | 75 | println(tranTimestampToString(1569888000)) 76 | } 77 | } -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/utils/MySQLUtils.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.utils 2 | 3 | import java.sql.{Connection, DriverManager, PreparedStatement} 4 | 5 | /** 6 | * Created by thpffcj on 2019/10/25. 
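 *
 * JDBC helper used by the DAOs: opens connections to the local steam database
 * (credentials are hard-coded here and differ from the 172.19.240.128 instance
 * used by the Spark SQL readers) and releases statement/connection pairs.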
7 | */ 8 | object MySQLUtils { 9 | 10 | /** 11 | * 获取MySQL的连接 12 | */ 13 | def getConnection() = { 14 | Class.forName("com.mysql.jdbc.Driver") 15 | DriverManager.getConnection("jdbc:mysql://localhost:3306/steam?useUnicode=true&characterEncoding=utf-8", "root", "000000") 16 | } 17 | 18 | /** 19 | * 释放数据库连接等资源 20 | * @param connection 21 | * @param pstmt 22 | */ 23 | def release(connection: Connection, pstmt: PreparedStatement): Unit = { 24 | try { 25 | if (pstmt != null) { 26 | pstmt.close() 27 | } 28 | } catch { 29 | case e: Exception => e.printStackTrace() 30 | } finally { 31 | if (connection != null) { 32 | connection.close() 33 | } 34 | } 35 | } 36 | 37 | def main(args: Array[String]) { 38 | println(getConnection()) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /spider/spark-graphx/steam-reviews-official.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import pymongo 4 | import re 5 | import threading 6 | from multiprocessing import JoinableQueue 7 | import time 8 | 9 | #爬取steam所有评论,存入mongodb中 10 | def write(item): 11 | try: 12 | if isinstance(item,list): 13 | collection.insert_many(item) 14 | else: 15 | collection.insert_one(item) 16 | except Exception as e: 17 | print(e) 18 | return True 19 | 20 | def getAllApps(): 21 | try: 22 | apps = [] 23 | for g in regions.game_id.find().skip(0).limit(1400): 24 | apps.append({"id":g["id"],"name":g["name"]}) 25 | return apps 26 | except: 27 | print(e) 28 | 29 | def fetchReview(url,params,headers,app): 30 | try: 31 | res = session.get(url,params=params,headers=headers,timeout=30,verify=False) 32 | if res.status_code != requests.codes.ok:#请求被拒绝打印出状态码,此页爬取失败 33 | if res.status_code != requests.codes.forbidden and res.status_code != requests.codes.bad_gateway: #403、502不打印 34 | print(res.status_code,":",url) 35 | return None 36 | except Exception as e: #网络有问题访问失败 37 | print(e) 38 | return None 39 | 40 | result = res.json() 41 | # print(res.url) 42 | # print(result) 43 | reviews = result["reviews"] 44 | if not reviews:#该游戏没有更多评论了 45 | print(result) 46 | return None 47 | cursor = result["cursor"] 48 | if not cursor: 49 | print(result) 50 | for review in reviews: 51 | review["game"] = app 52 | write(reviews) 53 | # print(url) 54 | # print(reviews) 55 | # print() 56 | return cursor 57 | 58 | def fetch(apps): 59 | for app in apps: 60 | #建立会话,Cookie设置语言为简体中文,出生日期为1987.1.1(允许访问成人内容) 61 | headers = { 62 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36', 63 | 'Cookie':'Steam_Language=schinese; birthtime=533750401; timezoneOffset=28800,0;', 64 | } 65 | cursor = "*" 66 | reviewsCount = 0 67 | while cursor: 68 | # print("cursor:",cursor) 69 | # url = "https://store.steampowered.com/appreviews/"+app["id"]+"?json=1&filter=recent&language=schinese&day_range=360"+ \ 70 | # "&cursor="+cursor+"&review_type=all&purchase_type=all&num_per_page=100" 71 | url = "https://store.steampowered.com/appreviews/"+app["id"] 72 | params = { 73 | "json":1, 74 | "filter":"recent", #all,recent,updated 75 | "language":"schinese", #all,schinese,zh-CN 76 | "day_range":"360", 77 | "cursor":cursor, 78 | "review_type":"all", 79 | "purchase_type":"all", 80 | "num_per_page":100, 81 | } 82 | cursor = fetchReview(url,params,headers,app) 83 | reviewsCount = reviewsCount+100 84 | if reviewsCount>=10000: 85 | break 86 | print(url,reviewsCount) 87 | 88 | #mongodb连接 89 | 
client = pymongo.MongoClient('mongodb://steam:steam@***.***.***.***:27017/steam_db') 90 | db = client.steam_db 91 | regions = db.China 92 | collection = regions.reviews_official 93 | 94 | requests.packages.urllib3.disable_warnings() 95 | session = requests.session() 96 | 97 | appInfos = getAllApps() 98 | # print(appInfos) 99 | numOfThreads = 1 100 | badPages = fetch(appInfos) 101 | print("all finished") 102 | 103 | # https://store.steampowered.com/appreviews/243470?json=1&filter=all&language=all&day_range=360&cursor=*&review_type=all&purchase_type=all&num_per_page=10 -------------------------------------------------------------------------------- /spider/spark-graphx/steam-reviews.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import pymongo 4 | import re 5 | import threading 6 | from multiprocessing import JoinableQueue 7 | import time 8 | 9 | # 3 1 1400 678960,1122050,1100620,730,1041320 10 | #爬取中国区steam所有评论,存入mongodb中 11 | def write(item): 12 | try: 13 | if isinstance(item,list): 14 | # firstUser = regions.first_review_user.find({"game":item[0]["game"]["id"]})[0] 15 | # if firstUser["user"]==item[0]["user"]["name"]: #重复评论 16 | # return False 17 | collection.insert_many(item) 18 | else: 19 | collection.insert_one(item) 20 | except Exception as e: 21 | print(e) 22 | return True 23 | 24 | def getAllApps(): 25 | try: 26 | apps = [] 27 | for g in regions.game_id.find().skip(0).limit(1400): 28 | apps.append({"id":g["id"],"name":g["name"],"firstUser":None}) 29 | return apps 30 | except: 31 | print(e) 32 | 33 | def fetchReview(url,headers,app): 34 | try: 35 | res = session.get(url,headers=headers,timeout=30,verify=False) 36 | if res.status_code != requests.codes.ok:#请求被拒绝打印出状态码,此页爬取失败 37 | if res.status_code != requests.codes.forbidden and res.status_code != requests.codes.bad_gateway: #403、502不打印 38 | print(res.status_code,":",url) 39 | return None 40 | except Exception as e: #网络有问题访问失败 41 | print(e) 42 | return None 43 | 44 | if not res.text:#该游戏没有更多评论了 45 | return None 46 | 47 | try: 48 | soup = BeautifulSoup(res.text,'lxml') 49 | 50 | reviewGroup = [] 51 | for card in soup.find_all(class_="apphub_Card modalContentLink interactable"): 52 | userCard = card.find(class_="apphub_friend_block") 53 | if not userCard:# 没有用户的评论扔掉 54 | continue 55 | if not userCard.find(class_="apphub_CardContentAuthorName"):# 没有用户的评论扔掉 56 | continue 57 | if(len(userCard.find(class_="apphub_CardContentAuthorName").find_all("a"))!=1): 58 | print(userCard.find(class_="apphub_CardContentAuthorName")) 59 | name = userCard.find(class_="apphub_CardContentAuthorName").find("a").string 60 | name = name.strip() if name else "" 61 | product_owns = userCard.find(class_="apphub_CardContentMoreLink").string 62 | product_owns = product_owns.strip() if product_owns else "" 63 | user = { 64 | "name":name,# 可能为"" 65 | "product_owns": product_owns,# 可能为"" 66 | } 67 | comment_count = card.find(class_="apphub_CardCommentCount").string.strip() 68 | found_helpful = card.find(class_="found_helpful").contents 69 | helpful_num = found_helpful[0].strip() 70 | funny_num = found_helpful[-1].strip() if len(found_helpful)>1 else "" 71 | title = card.find(class_="reviewInfo").find(class_="title").string.strip() 72 | hours = card.find(class_="reviewInfo").find(class_="hours") 73 | hours = hours.string.strip() if hours else "" 74 | 75 | cardTextContent = card.find(class_="apphub_CardTextContent") 76 | date_posted = 
cardTextContent.find(class_="date_posted").string.strip() 77 | content = cardTextContent.contents[5:] if cardTextContent.find(class_="received_compensation") else cardTextContent.contents[2:] 78 | content = "".join(item.string if item.string else "
" for item in content).strip() 79 | 80 | review = { 81 | "game":app, 82 | "user":user, 83 | "comment_count":comment_count,#该评论回复数 84 | "helpful_num":helpful_num,#几人觉得这篇评测有价值 有的是一句话,有的是数字 85 | "funny_num":funny_num,#几人觉得这篇评测欢乐 86 | "title":title,#推荐/不推荐 87 | "hours":hours,#总时数 可能为"" 88 | "date_posted":date_posted,#发布于 89 | "content":content,#评论内容 90 | } 91 | reviewGroup.append(review) 92 | form = soup.find("form") 93 | nextUrl = form.attrs["action"]+"?" 94 | for arg in form.find_all("input"): 95 | nextUrl = nextUrl+arg.attrs["name"]+"="+arg.attrs["value"]+"&" 96 | nextUrl = nextUrl[:-1] 97 | # if app["firstUser"]==reviewGroup[0]["user"]["name"]: 98 | # return None 99 | # write(reviewGroup) 100 | print(url) 101 | print(reviewGroup) 102 | print() 103 | # print("nextUrl",nextUrl) 104 | return nextUrl 105 | 106 | except Exception as e: #steam服务器响应不正确 107 | print("bad url:",url,e) 108 | return None 109 | 110 | class fetchThread(threading.Thread): 111 | def __init__(self, tQueue, app, threadNum): 112 | threading.Thread.__init__(self) 113 | self.tQueue = tQueue 114 | self.app = app 115 | self.threadNum = threadNum 116 | def run(self): 117 | id = self.app["id"] 118 | p = str(self.threadNum+1) 119 | userreviewsoffset = str((int(p)-1)*10) 120 | numperpage = "10" 121 | # url = "https://steamcommunity.com/app/"+id+"/homecontent/?userreviewsoffset="+userreviewsoffset+"&p="+p+ \ 122 | # "&workshopitemspage="+p+"&readytouseitemspage="+p+"&mtxitemspage="+p+"&itemspage="+p+"&screenshotspage="+p+ \ 123 | # "&videospage="+p+"&artpage="+p+"&allguidepage="+p+"&webguidepage="+p+"&integratedguidepage="+p+ \ 124 | # "&discussionspage="+p+"&numperpage="+numperpage+"&browsefilter=trendyear&browsefilter=trendyear&l=schinese"+ \ 125 | # "&appHubSubSection="+numperpage+"&filterLanguage=default&searchText=&forceanon=1" 126 | url = "https://steamcommunity.com/app/"+id+"/reviews/?p=1&browsefilter=trendyear" 127 | #建立会话,Cookie设置语言为简体中文,出生日期为1987.1.1(允许访问成人内容) 128 | headers = { 129 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36', 130 | 'accept':'text/javascript, text/html, application/xml, text/xml, */*', 131 | 'accept-encoding': 'gzip, deflate, br', 132 | 'accept-language': 'zh,zh-CN;q=0.9,zh-TW;q=0.8,en;q=0.7,en-GB;q=0.6,en-US;q=0.5', 133 | 'cache-control': 'no-cache', 134 | 'pragma': 'no-cache', 135 | 'Cookie':'Steam_Language=schinese; birthtime=533750401; timezoneOffset=28800,0; sessionid=04a0dcb8f1f8f31bed482819; recentlyVisitedAppHubs=816340%2C678960%2C242920%2C1122050; steamCountry=CN%7C72e4ed8aa9f1f07b0eeba82d9349680e; app_impressions=1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_', 136 | 'sec-fetch-mode': 'cors', 137 | 'sec-fetch-site': 'same-origin', 138 | 'x-prototype-version': '1.7', 139 | 'x-requested-with': 'XMLHttpRequest', 140 | 'referer': url 141 | } 142 | # print(url) 143 | reviewCount = 0 144 | while url: 145 | # print("threadNum:"+str(self.threadNum)+" offset:"+userreviewsoffset) 146 | nextUrl = fetchReview(url,headers,self.app) 147 | if not nextUrl: 148 | nextUrl = fetchReview(url,headers,self.app) 149 | if not nextUrl: #2次失败认为这个游戏评论已爬完 150 | break 151 | url = nextUrl 152 | # if not self.app["firstUser"]: 153 | # self.app["firstUser"]=reviewGroup[0]["user"]["name"] 154 | reviewCount = reviewCount+10 155 | if 
int(reviewCount)>5000:#超过5K条评论后面的就不爬了(以魂3评论量为基准) 156 | break 157 | # p = str(int(p)+self.tQueue.numOfThreads*1) 158 | # userreviewsoffset = str((int(p)-1)*10) 159 | # url = "https://steamcommunity.com/app/"+id+"/homecontent/?userreviewsoffset="+userreviewsoffset+"&p="+p+ \ 160 | # "&workshopitemspage="+p+"&readytouseitemspage="+p+"&mtxitemspage="+p+"&itemspage="+p+"&screenshotspage="+p+ \ 161 | # "&videospage="+p+"&artpage="+p+"&allguidepage="+p+"&webguidepage="+p+"&integratedguidepage="+p+ \ 162 | # "&discussionspage="+p+"&numperpage="+numperpage+"&browsefilter=toprated&browsefilter=toprated&l=schinese"+ \ 163 | # "&appHubSubSection="+numperpage+"&filterLanguage=default&searchText=&forceanon=1" 164 | # trendyear toprated trendweek trendday mostrecent 165 | # print("nextUrl",url) 166 | # time.sleep(2) 167 | # break 168 | 169 | class threadQueue: 170 | def __init__(self, numOfThreads, app): 171 | self.numOfThreads = numOfThreads 172 | self.app = app 173 | self.threads = [] 174 | self.badItems = [] 175 | 176 | for i in range(0,numOfThreads): 177 | # 创建线程爬取详情页面 178 | thread = fetchThread(self,app,i) 179 | thread.start() 180 | self.threads.append(thread) 181 | # def addBadItem(self,info): 182 | # self.badItems.append(info) 183 | def waitForStop(self): 184 | #等待当前页的线程爬取完后再开始爬下一页 185 | for t in self.threads: 186 | t.join() 187 | if self.badItems: 188 | print("badItems ",self.badItems) 189 | 190 | def fetch(apps): 191 | for app in apps: 192 | queue = threadQueue(numOfThreads,app) 193 | queue.waitForStop() 194 | print(app["id"],"finished") 195 | badItems = queue.badItems 196 | 197 | #错页重爬 198 | for app in badItems: 199 | queue = threadQueue(numOfThreads,app) 200 | queue.waitForStop() 201 | return queue.badItems 202 | 203 | #mongodb连接 204 | client = pymongo.MongoClient('mongodb://steam:steam@***.***.***.***:27017/steam_db') 205 | db = client.steam_db 206 | regions = db.China 207 | collection = regions.reviews 208 | 209 | requests.packages.urllib3.disable_warnings() 210 | session = requests.session() 211 | 212 | appInfos = getAllApps() 213 | # print(appInfos) 214 | numOfThreads = 1 215 | badPages = fetch(appInfos) 216 | print("all finished") 217 | 218 | # http://store.steampowered.com/appreviews/243470?json=1&filter=all&language=all&day_range=360&cursor=*&review_type=all&purchase_type=all&num_per_page=10 -------------------------------------------------------------------------------- /spider/spark-streaming/steam-games-multithread-queue.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import pymongo 4 | import re 5 | import threading 6 | from multiprocessing import JoinableQueue 7 | 8 | #爬取中国区steam所有产品,存入mongodb中 9 | def write(item): 10 | try: 11 | collection.insert_one(item) 12 | except: 13 | print(e) 14 | 15 | def fetchReviewsChart(appID): 16 | url = "https://store.steampowered.com/appreviewhistogram/"+appID+"?l=schinese&review_score_preference=0" 17 | try: 18 | res = session.get(url,headers=headers,timeout=30) 19 | if res.status_code != requests.codes.ok:#请求被拒绝打印出状态码,此页爬取失败 20 | print(res.status_code,":",url) 21 | return None 22 | chart = res.json() 23 | #chart.results.rollup_type: 取值有"week"、"month",指的是chart.results.rollups中每个date的时间跨度 24 | #chart.results.recent 中每个date的时间跨度是1天 25 | #date: 时间 26 | #recommendations_down: 差评数 27 | #recommendations_up: 好评数 28 | return chart["results"] if chart["success"]==1 else None 29 | except: #网络有问题访问失败 30 | print(url) 31 | return None 32 | 33 | def fetchGameInfo(url): 
34 | try: 35 | res = session.get(url,headers=headers,timeout=30) 36 | if res.status_code != requests.codes.ok:#请求被拒绝打印出状态码,此页爬取失败 37 | print(res.status_code,":",url) 38 | return None 39 | except: #网络有问题访问失败 40 | print(url) 41 | return None 42 | 43 | try: 44 | soup = BeautifulSoup(res.text,'lxml') 45 | 46 | #社区的URL 47 | communityUrl = soup.find(class_="apphub_OtherSiteInfo").find("a").attrs["href"] 48 | appID = communityUrl.split("/")[-1] 49 | 50 | #右上角的概览 51 | user_reviews = soup.find(class_="user_reviews") 52 | user_reviews_json = {} 53 | for item in user_reviews.find_all("div",class_="subtitle column"): 54 | user_reviews_json[item.string.strip()] = re.sub('\r|\n|\t', '', item.parent.find_all("div")[1].get_text().strip()) 55 | 56 | #用户自定义标签 57 | user_tags = soup.find(class_="glance_tags popular_tags") 58 | if user_tags: 59 | user_tags = [item.string.strip() for item in user_tags.find_all("a")] 60 | else: 61 | user_tags = [] 62 | 63 | #该游戏支持的活动 64 | support_tags = soup.find_all(class_="game_area_details_specs") 65 | support_tags = [item.find(class_="name").get_text().strip() for item in support_tags] 66 | 67 | #爬取评论量图表 68 | reviewsChart = fetchReviewsChart(appID) 69 | if not reviewsChart: #失败重爬一次 70 | reviewsChart = fetchReviewsChart(appID) 71 | reviewsChart = reviewsChart if reviewsChart else "" 72 | 73 | #该页面的所有信息 74 | game_detail = { 75 | "user_reviews":user_reviews_json, 76 | "user_tags":user_tags, 77 | "support_tags":support_tags, 78 | "reviewsChart":reviewsChart, 79 | } 80 | # print(game_detail) 81 | return game_detail 82 | 83 | except: #steam服务器响应不正确 84 | print("bad url:",url) 85 | return None 86 | 87 | class fetchThread(threading.Thread): 88 | def __init__(self, tqueue): 89 | threading.Thread.__init__(self) 90 | self.tqueue = tqueue 91 | def run(self): 92 | while True: 93 | info = self.tqueue.tasks.get() 94 | href = info["href"] 95 | if href.startswith("https://store.steampowered.com/bundle/") or href.startswith("https://store.steampowered.com/sub/"): 96 | game_detail = "bundle" #捆绑包不爬详情页 97 | else: 98 | game_detail = fetchGameInfo(href) 99 | if not game_detail: #失败重爬一次 100 | game_detail = fetchGameInfo(href) 101 | game_detail = game_detail if game_detail else "" 102 | info["game_detail"] = game_detail 103 | 104 | # print(info) 105 | write(info) 106 | if game_detail=="": 107 | self.tqueue.addBadItem(info) 108 | self.tqueue.finishOne() 109 | self.tqueue.tasks.task_done() #已经处理完从队列中拿走的一个项目 110 | 111 | class threadQueue: 112 | def __init__(self, numOfThreads): 113 | self.numOfThreads = numOfThreads 114 | self.threads = [] 115 | self.tasks = JoinableQueue()#实例一个队列 116 | self.tasksNum = 0 117 | self.badItems = [] 118 | 119 | for i in range(1,numOfThreads): 120 | # 创建线程爬取详情页面 121 | thread = fetchThread(self) 122 | thread.start() 123 | self.threads.append(thread) 124 | def add(self,info): 125 | self.tasks.put(info) 126 | def finishOne(self): 127 | threadLock = threading.Lock() 128 | threadLock.acquire() 129 | self.tasksNum=self.tasksNum+1 130 | if self.tasksNum%25==0: 131 | print(self.tasksNum,"/",(totalPage-1)*25,"finished") 132 | threadLock.release() 133 | def addBadItem(self,info): 134 | self.badItems.append(info) 135 | def waitForStop(self): 136 | self.tasks.join()#等,直到消费者把自己放入队列中的所有项目都取走处理完后调用task_done()之后 137 | if self.badItems: 138 | print("badItems ",self.badItems) 139 | 140 | def fetch(pageRange): 141 | badPages = [] 142 | page =1 #每页一个request 143 | for page in pageRange: 144 | try: #网络有问题访问失败,保存失败的请求然后跳过 145 | url = "https://store.steampowered.com/search/?page=" + str(page) 146 
147 |         except:
148 |             badPages.append(page)
149 |             continue
150 | 
151 |         if res.status_code != requests.codes.ok:  # request rejected: print the status code and skip the page
152 |             print("page",page,":",res.status_code)
153 |             badPages.append(page)
154 |             continue
155 | 
156 |         try:  # an exception showed up here once and never reappeared; probably a Steam server issue
157 |             soup = BeautifulSoup(res.text,'lxml')
158 |             contents = soup.find(id="search_resultsRows").find_all('a')
159 |         except:
160 |             print("bad page:",page)
161 |             badPages.append(page)
162 |             continue
163 | 
164 |         for content in contents:
165 |             try:
166 |                 name = content.find(class_="title").get_text().strip()
167 |                 date = content.find("div",class_="col search_released responsive_secondrow").string
168 |                 date = date.strip() if date else ""  # unreleased games have no release date
169 |                 priceDiv = content.find("div",class_="col search_price discounted responsive_secondrow")
170 |                 if priceDiv:  # discounted game
171 |                     original_price = priceDiv.find("strike").string.strip()
172 |                     price = priceDiv.contents[-1].strip()
173 |                 else:  # full-price game
174 |                     original_price = content.find("div",class_="col search_price responsive_secondrow").string.strip()
175 |                     price = original_price
176 |                 img_src = content.find("div",class_="col search_capsule").find('img').get("src")
177 |                 href = content.get("href")
178 |                 review_summary = content.find("span",class_="search_review_summary")
179 |                 review_summary = review_summary.attrs['data-tooltip-html'].strip() if review_summary else ""  # unreleased games have no review summary
180 |                 result = {
181 |                     "page":page,
182 |                     "name":name,
183 |                     "href":href,
184 |                     "date":date,
185 |                     "original_price":original_price,
186 |                     "price":price,
187 |                     "img_src":img_src,
188 |                     "review_summary":review_summary,
189 |                 }
190 |                 queue.add(result)
191 |             except:
192 |                 print(content)
193 |     queue.waitForStop()
194 |     return badPages
195 | 
196 | # MongoDB connection
197 | client = pymongo.MongoClient('mongodb://steam:steam@***.***.***.***:27017/steam_db')
198 | db = client.steam_db
199 | regions = db.China
200 | collection = regions.games
201 | 
202 | # create a session; the Cookie sets the language to Simplified Chinese and a birth date of 1987-01-01 (grants access to age-restricted content)
203 | headers = {
204 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
205 |     'Cookie':'Steam_Language=schinese; birthtime=533750401'
206 | }
207 | session = requests.session()
208 | 
209 | queue = threadQueue(100)
210 | 
211 | totalPage = 2  # there are about 2608 pages in total at the moment
212 | badPages = fetch(range(1, totalPage))
213 | if badPages:  # re-crawl failed pages
214 |     badPages = fetch(badPages)
215 | print("all finished")
216 | if badPages:
217 |     print("badPages:",badPages)
218 | 
-------------------------------------------------------------------------------- /spider/spark-streaming/steam-hotN.py: --------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import pymongo
4 | 
5 | # crawl Steam's current top-sellers chart (China region) and store it in MongoDB
6 | def write(item):
7 |     try:
8 |         collection.insert_one(item)
9 |     except Exception as e:
10 |         print(e)
11 | 
12 | def fetch(pageRange):
13 |     badPages = []
14 |     badItems = []
15 |     page = 1  # one request per page
16 |     for page in pageRange:
17 |         try:  # on network failure, record the failed page and skip it
18 |             url = "https://store.steampowered.com/search/?filter=globaltopsellers&page=" + str(page) + "&os=win"
19 |             # the Cookie sets the language to Simplified Chinese
20 |             headers = {
21 |                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
22 |                 'Cookie':'Steam_Language=schinese'
23 |             }
24 |             s = requests.session()
25 |             res = s.get(url,headers=headers)
26 |         except:
27 |             badPages.append(page)
28 |             continue
29 | 
30 |         if res.status_code != requests.codes.ok:  # request rejected: print the status code and skip the page
31 |             print("page",page,":",res.status_code)
32 |             badPages.append(page)
33 |             continue
34 | 
35 |         try:  # an exception showed up here once and never reappeared; probably a Steam server issue
36 |             soup = BeautifulSoup(res.text,'lxml')
37 |             contents = soup.find(id="search_resultsRows").find_all('a')
38 |         except:
39 |             print(soup)
40 |             badPages.append(page)
41 |             continue  # skip this page so stale results are not reused
42 |         for content in contents:
43 |             try:
44 |                 name = content.find(class_="title").get_text().strip()
45 |                 date = content.find("div",class_="col search_released responsive_secondrow").string
46 |                 date = date.strip() if date else ""  # unreleased games have no release date
47 |                 priceDiv = content.find("div",class_="col search_price discounted responsive_secondrow")
48 |                 if priceDiv:  # discounted game
49 |                     original_price = priceDiv.find("strike").string.strip()
50 |                     price = priceDiv.contents[-1].strip()
51 |                 else:  # full-price game
52 |                     original_price = content.find("div",class_="col search_price responsive_secondrow").string.strip()
53 |                     price = original_price
54 |                 img_src = content.find("div",class_="col search_capsule").find('img').get("src")
55 |                 href = content.get("href")
56 |                 review_summary = content.find("span",class_="search_review_summary")
57 |                 review_summary = review_summary.attrs['data-tooltip-html'].strip() if review_summary else ""  # unreleased games have no review summary
58 |                 result = {
59 |                     "page":page,
60 |                     "name":name,
61 |                     "href":href,
62 |                     "date":date,
63 |                     "original_price":original_price,
64 |                     "price":price,
65 |                     "img_src":img_src,
66 |                     "review_summary":review_summary,
67 |                 }
68 |                 # print(result)
69 |                 write(result)
70 |             except:
71 |                 print(content)
72 |                 badItems.append(content)
73 |         if page%10==0:
74 |             print(page,"/",totalPage,"finished")  # print progress every 10 pages
75 |     print("badItems:",badItems)
76 |     return badPages
77 | 
78 | client = pymongo.MongoClient('mongodb://steam:steam@***.***.***.***:27017/steam_db')
79 | db = client.steam_db
80 | regions = db.China
81 | collection = regions.hot
82 | 
83 | totalPage = 593  # there are 593 pages at the moment
84 | badPages = fetch(range(1, totalPage))
85 | if badPages:  # re-crawl failed pages
86 |     badPages = fetch(badPages)
87 | print("all finished")
88 | if badPages:
89 |     print("badPages:",badPages)
90 | 
--------------------------------------------------------------------------------