├── .gitignore
├── README.md
├── data
├── China.games.json
└── China.reviews_official.json
├── pic
├── 好评数量.png
├── 数据格式.png
├── 数据流.png
├── 游戏动力对比.png
├── 游戏用户关系图.png
├── 游戏用户关系图2.png
├── 游玩时长分析.png
├── 用户游戏推荐.png
├── 用户社群聚合图.png
├── 用户社群聚合图2.png
├── 用户社群聚合图3.png
├── 评论情感分析.png
├── 评论能力对比.png
└── 词云图.png
├── ppt
├── GraphX_报告.pdf
├── MLlib_报告.pdf
└── Streaming_报告.pdf
├── scripts
├── generate_kaggle_log.py
├── generate_log.py
├── read_mongodb.py
├── steam-recommend.py
└── write_log.sh
├── spark-graphx
├── pom.xml
└── src
│ └── main
│ ├── resources
│ ├── follows.txt
│ ├── log4j.properties
│ ├── output
│ │ ├── graph.gexf
│ │ ├── graphWeapon.gexf
│ │ ├── isolate.txt
│ │ └── minDegrees.gexf
│ ├── steam
│ │ ├── hours_3_10W.gexf
│ │ ├── hours_5_20W.gexf
│ │ ├── hours_6_30W.gexf
│ │ ├── hours_7_30W.gexf
│ │ ├── steam_3_10W.gexf
│ │ ├── steam_5_20W.gexf
│ │ ├── steam_6_30W.gexf
│ │ └── steam_7_30W.gexf
│ └── user.txt
│ └── scala
│ └── cn
│ └── edu
│ └── nju
│ ├── GraphExample.scala
│ ├── GraphExample2.scala
│ ├── GraphExample3.scala
│ ├── GraphProcess.scala
│ ├── GraphProcessTest.scala
│ └── MongoDBProcess.scala
├── spark-mllib
├── pom.xml
└── src
│ └── main
│ ├── resources
│ ├── game_content.txt
│ ├── neg.txt
│ ├── pos.txt
│ └── recommend_validate
│ └── scala
│ └── cn
│ └── edu
│ └── nju
│ ├── DataProcessing.scala
│ ├── EmotionAnalysis.scala
│ └── SteamGameRecommendation.scala
├── spark-streaming
├── pom.xml
└── src
│ └── main
│ ├── java
│ └── cn
│ │ └── edu
│ │ └── nju
│ │ ├── ApiReturnUtil.java
│ │ ├── DemoMessageController.java
│ │ ├── SteamserverdemoApplication.java
│ │ ├── Test.java
│ │ ├── TimeFieldObject.java
│ │ ├── WebSocketConfig.java
│ │ ├── WebSocketServer.java
│ │ ├── api
│ │ ├── ApiReturnObject.java
│ │ └── TagReturnObject.java
│ │ ├── domain
│ │ ├── GameObject.java
│ │ └── TagObject.java
│ │ ├── encoder
│ │ ├── ApiObjectEncoder.java
│ │ └── TagObjectEncoder.java
│ │ └── utils
│ │ ├── DbPool.java
│ │ ├── HBaseUtils.java
│ │ └── Test.java
│ ├── resources
│ ├── RollupCSV
│ │ ├── ._SUCCESS.crc
│ │ ├── .part-00000-4a85a460-3bdb-48ad-bc81-9b68057082a8-c000.csv.crc
│ │ └── _SUCCESS
│ ├── game.json
│ ├── gameAll.json
│ ├── gameDetail.json
│ ├── log4j.properties
│ ├── test.txt
│ └── update.sql
│ └── scala
│ └── cn
│ └── edu
│ └── nju
│ ├── BatchProcess.scala
│ ├── HDFSProcess.scala
│ ├── KafkaProcess.scala
│ ├── MongoDBProcess.scala
│ ├── MySQLProcess.scala
│ ├── SteamProcess.scala
│ ├── dao
│ ├── CourseClickCountDAO.scala
│ ├── CourseSearchClickCountDAO.scala
│ ├── RollUpDAO.scala
│ └── TagDAO.scala
│ ├── domain
│ ├── ClickLog.scala
│ ├── CommentLog.scala
│ ├── CourseClickCount.scala
│ ├── CourseSearchClickCount.scala
│ ├── DouBanLog.scala
│ ├── GameDetail.scala
│ ├── ReviewsChart.scala
│ ├── RollUp.scala
│ ├── SteamLog.scala
│ ├── Tag.scala
│ └── UserData.scala
│ ├── test
│ ├── DateTest.scala
│ ├── HDFSProcessTest.scala
│ ├── JsonTest.scala
│ ├── StatStreaming.scala
│ └── TransformTest.scala
│ └── utils
│ ├── DateUtils.scala
│ └── MySQLUtils.scala
└── spider
├── spark-graphx
├── steam-reviews-official.py
└── steam-reviews.py
└── spark-streaming
├── steam-games-multithread-queue.py
└── steam-hotN.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled class file
2 | *.class
3 |
4 | # Log file
5 | *.log
6 |
7 | # BlueJ files
8 | *.ctxt
9 |
10 | # Mobile Tools for Java (J2ME)
11 | .mtj.tmp/
12 |
13 | # Package Files #
14 | *.jar
15 | *.war
16 | *.nar
17 | *.ear
18 | *.zip
19 | *.tar.gz
20 | *.rar
21 |
22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
23 | hs_err_pid*
24 |
25 | *.iml
26 | *.idea
27 | target
28 | *.csv
29 | model
30 |
31 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Cloud Computing Course Project
2 | 
3 | ## 1. Project overview
4 | 
5 | - **I was only responsible for the logic-level processing, so most of the code stops once the processed data has been persisted; the front-end visualization is mostly done with ECharts**
6 | - **For the detailed workflow, see the three report decks in the ppt directory**
7 | - **The whole pipeline can be run on a single machine; for cluster setup, see: [Cluster setup](https://github.com/Thpffcj/BigData-Getting-Started/blob/master/%E9%9B%86%E7%BE%A4%E6%90%AD%E5%BB%BA.md)**
8 | - The assignment has three parts: a simulated Spark Streaming computation, a Spark GraphX computation, and a computation based on Spark MLlib
9 |   - Spark Streaming
10 |     - The computation on the DStream must use at least 5 Transformation operations, possibly across
11 | several business questions; it must maintain at least 1 globally aggregated statistic; results must be shown with at least 2 kinds of charts. The Streaming program must monitor an HDFS directory. The raw data is stored in MongoDB; to simulate the stream, data is read from MongoDB and written into HDFS (a minimal sketch follows this list)
12 |   - Spark GraphX
13 |     - The graph must be built from vertex and edge RDDs; the graph used for the business computation must contain at least 10,000 vertices and 10,000 edges; the graph computation must use at least 6 GraphX API calls, possibly solving several business questions; at least 1 aggregation or join operation must be used; results must be shown with at least 2 kinds of charts. Graph data is read from MongoDB and the results are written back to MongoDB
14 |   - Spark MLlib
15 |     - The presentation must include not only the experimental results but also analysis of the data
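
Below is a minimal, illustrative sketch of the Streaming part (not the actual `SteamProcess.scala`), assuming the tab-separated log produced by `scripts/generate_kaggle_log.py` (`userId\tgameName\tbehavior\tduration`) and the HDFS directory `/cloud-computing` used by `scripts/write_log.sh`; paths and class names are made up for the example:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Monitors an HDFS directory, uses several DStream transformations and keeps one
// global (stateful) statistic, as the requirements above describe.
object StreamingSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("StreamingSketch")
    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint("/cloud-computing/checkpoint") // required by updateStateByKey

    // Listen on the HDFS directory that write_log.sh moves log files into
    val lines = ssc.textFileStream("hdfs://localhost:9000/cloud-computing")
    val records = lines.map(_.split("\t")).filter(_.length >= 4)

    // Windowed playtime per game from "play" records (map / filter / reduceByKeyAndWindow)
    val playTime = records.filter(_(2) == "play")
      .map(fields => (fields(1), fields(3).toDouble))
      .reduceByKeyAndWindow(_ + _, Seconds(60), Seconds(10))

    // Global statistic: total purchases per game since the job started (updateStateByKey)
    val totalPurchases = records.filter(_(2) == "purchase")
      .map(fields => (fields(1), 1L))
      .updateStateByKey[Long]((values: Seq[Long], state: Option[Long]) =>
        Some(state.getOrElse(0L) + values.sum))

    playTime.print()
    totalPurchases.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
```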
16 |
17 | ***
18 |
19 | ## 2. spark-streaming: crawling Steam data and simulating the stream
20 | 
21 | ### 1. Research questions
22 | 
23 | - Which games are the most popular so far
24 | - How players' playtime is distributed
25 | - Which genres of games are the most popular
26 | 
27 | ### 2. Data
28 | 
29 | - /data/China.games.json
30 | 
31 | ![数据格式](pic/数据格式.png)
32 | 
33 | - /data/steam.csv
34 |
35 | ```
36 | userId,gameName,behavior,duration,none
37 | 151603712,"The Elder Scrolls V Skyrim",purchase,1.0,0
38 | 151603712,"The Elder Scrolls V Skyrim",play,273.0,0
39 | 151603712,"Fallout 4",purchase,1.0,0
40 | 151603712,"Fallout 4",play,87.0,0
41 | 151603712,"Spore",purchase,1.0,0
42 | 151603712,"Spore",play,14.9,0
43 | 151603712,"Fallout New Vegas",purchase,1.0,0
44 | 151603712,"Fallout New Vegas",play,12.1,0
45 | 151603712,"Left 4 Dead 2",purchase,1.0,0
46 | 151603712,"Left 4 Dead 2",play,8.9,0
47 | ```
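
A distribution like the playtime chart shown in section 2.4 below can be computed straight from this file with Spark SQL; a small sketch, assuming the header row shown above (the input path is illustrative):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

// Bucket the "play" records of steam.csv by hours played.
object PlaytimeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("PlaytimeSketch").getOrCreate()

    val df = spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv("hdfs://localhost:9000/data/steam.csv")

    // How many play records fall into each 10-hour bucket
    df.filter(col("behavior") === "play")
      .withColumn("bucket", floor(col("duration") / 10) * 10)
      .groupBy("bucket")
      .count()
      .orderBy("bucket")
      .show()

    spark.stop()
  }
}
```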
48 |
49 | ### 3. Data flow
50 | 
51 | ![数据流](pic/数据流.png)
52 | 
53 | ### 4. Visualizations
54 | 
55 | - Dynamic ranking of game sales
56 | 
57 | 
58 | 
59 | - Dynamic word cloud
60 | 
61 | ![词云图](pic/词云图.png)
62 | 
63 | - Playtime distribution
64 | 
65 | ![游玩时长分析](pic/游玩时长分析.png)
66 |
67 | ***
68 |
69 | ## 3. spark-graphx
70 | 
71 | ### 1. Research questions
72 | 
73 | - Reputation and popularity of games
74 | - User communities
75 | - A game's market share and user stickiness
76 | - Competition between games
77 | 
78 | **Related metrics** (a small GraphX sketch follows this list)
79 | 
80 | - Game reviews
81 | - Number of games a player has reviewed
82 | - Number of reviews a game has received
83 | - Players' playtime
84 |
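To make the mapping from these metrics to a graph concrete, here is a toy sketch that hand-builds a tiny user–game review graph instead of loading the real data from MongoDB the way `GraphProcess.scala` does; the ids, names and hours are made up:

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.rdd.RDD

// Vertices are users and games; an edge means "this user reviewed this game",
// and the edge attribute is the playtime in hours.
object GraphSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("GraphSketch"))

    val vertices: RDD[(Long, (String, String))] = sc.parallelize(Seq(
      (1L, ("user", "user_76561198380840992")),
      (2L, ("user", "user_76561198012345678")),
      (3L, ("game", "CODE VEIN")),
      (4L, ("game", "Left 4 Dead 2"))))

    val edges: RDD[Edge[Double]] = sc.parallelize(Seq(
      Edge(1L, 3L, 35.5),   // user 1 reviewed CODE VEIN after 35.5 hours
      Edge(2L, 3L, 12.0),
      Edge(2L, 4L, 120.0)))

    val graph = Graph(vertices, edges)

    // Reviews received per game / reviews written per user = vertex degrees
    graph.degrees
      .join(graph.vertices)
      .foreach { case (id, (degree, (kind, name))) => println(s"$kind $name: $degree reviews") }

    // User communities: connected components of the user-game graph
    graph.connectedComponents().vertices
      .map(_.swap).groupByKey()
      .foreach { case (component, members) => println(s"component $component -> $members") }

    sc.stop()
  }
}
```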
85 | ### 2. Visualizations
86 | 
87 | - User-game relationship graph
88 | 
89 | ![游戏用户关系图](pic/游戏用户关系图.png)
90 | 
91 | ![游戏用户关系图2](pic/游戏用户关系图2.png)
92 | 
93 | - User community clusters
94 | 
95 | ![用户社群聚合图](pic/用户社群聚合图.png)
96 | 
97 | ![用户社群聚合图2](pic/用户社群聚合图2.png)
98 | 
99 | ![用户社群聚合图3](pic/用户社群聚合图3.png)
100 |
101 | ***
102 |
103 | ## 4. spark-mllib
104 | 
105 | ### 1. Research questions
106 | 
107 | - What communities exist among players?
108 | - What characterizes each community?
109 | - Which games might a player be interested in?
110 | 
111 | ### 2. Game recommendation
112 | 
113 | - We want to recommend games to a player based on the preferences of a group of users with similar tastes and shared experience
114 | - Collaborative filtering aims to fill in the missing entries of the user-item association matrix
115 | - We have no explicit user ratings for games, so a user's playtime stands in for the rating; to remove the effect of how long a game inherently takes to play, each game's playtime is rescaled to the range 0-10 and used as that user's rating (a small sketch follows)
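
A small sketch of this idea, assuming a `(userId, gameId, hours)` table derived from the "play" records; this is illustrative and not the actual `SteamGameRecommendation.scala`:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.ml.recommendation.ALS

// Playtime -> 0-10 pseudo-rating per game -> ALS collaborative filtering.
object RecommendSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("RecommendSketch").getOrCreate()
    import spark.implicits._

    // Illustrative input; the real project derives this from steam.csv "play" records
    val plays = Seq((151603712, 1, 273.0), (151603712, 2, 87.0), (187131847, 1, 12.5))
      .toDF("userId", "gameId", "hours")

    // Min-max scale hours per game into a 0-10 pseudo-rating
    val bounds = plays.groupBy("gameId")
      .agg(min("hours").as("minH"), max("hours").as("maxH"))
    val ratings = plays.join(bounds, "gameId")
      .withColumn("rating",
        when($"maxH" === $"minH", lit(5.0))
          .otherwise(($"hours" - $"minH") / ($"maxH" - $"minH") * 10.0))

    val als = new ALS()
      .setUserCol("userId").setItemCol("gameId").setRatingCol("rating")
      .setRank(10).setMaxIter(10).setRegParam(0.1)
    val model = als.fit(ratings)

    // Top 5 game recommendations per user
    model.recommendForAllUsers(5).show(false)

    spark.stop()
  }
}
```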
116 |
117 | ![用户游戏推荐](pic/用户游戏推荐.png)
118 |
119 | ### 3. Review sentiment analysis
120 | 
121 | - A user's review of a game usually carries a label: recommended / not recommended
122 | - Through sentiment analysis of the review text we want to predict whether a review recommends the game or not (a small sketch follows)
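
A minimal text-classification sketch of this step, assuming a handful of made-up labelled reviews (the real `EmotionAnalysis.scala` trains on the pos.txt / neg.txt resources); the pipeline stages here are one common choice, not necessarily the project's exact model:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{Tokenizer, HashingTF}
import org.apache.spark.ml.classification.LogisticRegression

// Review text -> token counts -> logistic regression classifier.
object SentimentSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("SentimentSketch").getOrCreate()
    import spark.implicits._

    // label 1.0 = recommended, 0.0 = not recommended (toy examples)
    val train = Seq(
      ("best hunting game ever created", 1.0),
      ("the story is so good", 1.0),
      ("crashes constantly and the servers are dead", 0.0),
      ("boring grind with pay to win mechanics", 0.0)
    ).toDF("review", "label")

    val tokenizer = new Tokenizer().setInputCol("review").setOutputCol("words")
    val hashingTF = new HashingTF().setInputCol("words").setOutputCol("features").setNumFeatures(10000)
    val lr = new LogisticRegression().setMaxIter(20)

    val model = new Pipeline().setStages(Array(tokenizer, hashingTF, lr)).fit(train)

    val test = Seq(("really fun game with plenty to do", 1.0)).toDF("review", "label")
    model.transform(test).select("review", "probability", "prediction").show(false)

    spark.stop()
  }
}
```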
123 |
124 | ![评论情感分析](pic/评论情感分析.png)
125 |
126 | ### 4. Community clustering
127 | 
128 | - Clustering features (a small K-means sketch follows this list)
129 |   - Number of games the player owns
130 |   - Total number of reviews the player has written
131 |   - Playtime
132 |   - Whether the review recommends the game
133 |   - Number of "helpful" votes the review received
134 |   - Number of "funny" votes the review received
135 |   - Number of replies the review received
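
A small sketch of clustering on these seven features; the feature values below are made up, and `scripts/steam-recommend.py` does the equivalent in Python with K = 5 on the real review data:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.linalg.Vectors

// K-means over the reviewer features listed above.
object ClusterSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("ClusterSketch").getOrCreate()
    import spark.implicits._

    // (games owned, reviews written, playtime, voted_up, votes_up, votes_funny, comment_count)
    val reviewers = Seq(
      Vectors.dense(12.0, 3.0, 150.0, 1.0, 4.0, 0.0, 1.0),
      Vectors.dense(230.0, 85.0, 4200.0, 0.0, 120.0, 30.0, 12.0),
      Vectors.dense(45.0, 10.0, 800.0, 1.0, 15.0, 2.0, 3.0),
      Vectors.dense(300.0, 120.0, 6000.0, 1.0, 200.0, 50.0, 20.0)
    ).map(Tuple1.apply).toDF("features")

    val model = new KMeans().setK(2).setSeed(1L).fit(reviewers)

    // Cluster assignment for every reviewer, plus the cluster centers
    model.transform(reviewers).show(false)
    model.clusterCenters.foreach(println)

    spark.stop()
  }
}
```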
136 |
137 | 
138 |
139 | 
140 |
141 | ***
142 |
143 | ## 5. Notes
144 | 
145 | - Since I was only responsible for the Spark computation, the code for each part may not be a complete end-to-end business workflow; it mainly documents my process of learning Spark
146 |
147 |
148 |
149 |
150 |
151 |
152 |
--------------------------------------------------------------------------------
/pic/好评数量.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/好评数量.png
--------------------------------------------------------------------------------
/pic/数据格式.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/数据格式.png
--------------------------------------------------------------------------------
/pic/数据流.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/数据流.png
--------------------------------------------------------------------------------
/pic/游戏动力对比.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/游戏动力对比.png
--------------------------------------------------------------------------------
/pic/游戏用户关系图.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/游戏用户关系图.png
--------------------------------------------------------------------------------
/pic/游戏用户关系图2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/游戏用户关系图2.png
--------------------------------------------------------------------------------
/pic/游玩时长分析.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/游玩时长分析.png
--------------------------------------------------------------------------------
/pic/用户游戏推荐.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/用户游戏推荐.png
--------------------------------------------------------------------------------
/pic/用户社群聚合图.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/用户社群聚合图.png
--------------------------------------------------------------------------------
/pic/用户社群聚合图2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/用户社群聚合图2.png
--------------------------------------------------------------------------------
/pic/用户社群聚合图3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/用户社群聚合图3.png
--------------------------------------------------------------------------------
/pic/评论情感分析.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/评论情感分析.png
--------------------------------------------------------------------------------
/pic/评论能力对比.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/评论能力对比.png
--------------------------------------------------------------------------------
/pic/词云图.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/词云图.png
--------------------------------------------------------------------------------
/ppt/GraphX_报告.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/ppt/GraphX_报告.pdf
--------------------------------------------------------------------------------
/ppt/MLlib_报告.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/ppt/MLlib_报告.pdf
--------------------------------------------------------------------------------
/ppt/Streaming_报告.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/ppt/Streaming_报告.pdf
--------------------------------------------------------------------------------
/scripts/generate_kaggle_log.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | # Created by thpffcj on 2019/10/19.
3 |
4 | import time
5 | import pandas as pd
6 |
7 | pd.set_option('display.max_columns', 40)
8 | pd.set_option('display.width', 1000)
9 |
10 |
11 | def generate_log(count=200000):
12 |
13 | data = pd.read_csv("/Users/thpffcj/Public/file/steam.csv")
14 | f = open("/Users/thpffcj/Public/file/user_data.log", "a")
15 |
16 | flag = 0
17 | position = 0
18 | while count >= 1:
19 | log = data.loc[position:position]
20 | query_log = "{user_id}\t{game_name}\t{behavior}\t{duration}".format(
21 | user_id=log["userId"].values.max(), game_name=log["gameName"].values.max(),
22 | behavior=log["behavior"].values.max(), duration=log["duration"].values.max())
23 |
24 | f.write(query_log + "\n")
25 | print(query_log)
26 |
27 |         # pause every 500 records to simulate a stream
28 |         if position > 0 and position % 500 == 0:
29 |             time.sleep(5)
30 | count = count - 1
31 | position = position + 1
32 |
33 |
34 | if __name__ == '__main__':
35 | generate_log()
36 |
--------------------------------------------------------------------------------
/scripts/generate_log.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | # Created by thpffcj on 2019/10/2.
3 |
4 | import pymongo
5 | import time
6 | import os
7 |
8 |
9 | # Connect to the database
10 | client = pymongo.MongoClient("***.***.***.***", 27017)
11 |
12 | db = client['steam_db']
13 | db.authenticate("steam", "steam")
14 |
15 | table = db['China.games']
16 |
17 | data = table.find().limit(1000)
18 | print("数据加载完成...")
19 | # 65175
20 | print(data.count())
21 |
22 |
23 | def generate_log(count=200):
24 | print("进入方法...")
25 | flag = 0
26 | steam_log = ""
27 | for game_data in data:
28 | query_log = "{img_src}\t{game_detail}\t{original_price}\t{price}\t{review_summary}\t{date}\t{name}".format(
29 | img_src=game_data["img_src"],
30 | game_detail=str(game_data["game_detail"]),
31 | original_price=game_data["original_price"],
32 | price=game_data["price"],
33 | review_summary=game_data["review_summary"],
34 | date=game_data["date"],
35 | name=game_data["name"])
36 |
37 | steam_log = steam_log + query_log + "\n"
38 | flag = flag + 1
39 | if flag % 200 == 0:
40 | print("flag:" + str(flag))
41 |
42 | if flag == count:
43 | print("写日志...")
44 | f = open("/Users/thpffcj/Public/local-repository/Python-Learning/cloud-computing/utils/test.log", "w")
45 | f.write(steam_log)
46 | time.sleep(2)
47 |
48 |             # upload to HDFS
49 | print("上传日志...")
50 | os.system("./write_log.sh")
51 |
52 | flag = 0
53 | steam_log = ""
54 | f.close()
55 | time.sleep(3)
56 |
57 | print("结束...")
58 |
59 |
60 | def write_log():
61 | print("进入方法...")
62 | flag = 0
63 | f = open("/Users/thpffcj/Public/local-repository/Python-Learning/cloud-computing/utils/test.log", "a")
64 | for game_data in data:
65 | query_log = "{img_src}\t{game_detail}\t{original_price}\t{price}\t{review_summary}\t{date}\t{name}".format(
66 | img_src=game_data["img_src"],
67 | game_detail=str(game_data["game_detail"]),
68 | original_price=game_data["original_price"],
69 | price=game_data["price"],
70 | review_summary=game_data["review_summary"],
71 | date=game_data["date"],
72 | name=game_data["name"])
73 |
74 | flag = flag + 1
75 | if flag % 200 == 0:
76 | print("flag:" + str(flag))
77 |
78 | f.write(query_log + "\n")
79 |
80 | f.close()
81 | print("结束...")
82 |
83 |
84 | def clean():
85 | f = open("/Users/thpffcj/Public/local-repository/Python-Learning/cloud-computing/utils/test.log", "w")
86 | f.write("")
87 | f.close()
88 |
89 |
90 | if __name__ == '__main__':
91 | generate_log()
92 |
--------------------------------------------------------------------------------
/scripts/read_mongodb.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | # Created by thpffcj on 2019/10/30.
3 |
4 | import pymongo
5 |
6 |
7 | # Connect to the source database
8 | client1 = pymongo.MongoClient("***.***.***.***", 27017)
9 |
10 | db1 = client1['steam_db']
11 | db1.authenticate("steam", "steam")
12 |
13 | table = db1['China.reviews_official']
14 |
15 | data = table.find().limit(300000)
16 | print("数据加载完成...")
17 | # 65175
18 | # for d in data:
19 | # print(d["game"])
20 |
21 |
22 | # Write the data into a local MongoDB with Python
23 | client2 = pymongo.MongoClient("127.0.0.1", 27017)
24 | # database name
25 | db2 = client2['test']
26 | # collection name
27 | collection = db2['China.reviews_official_30W']
28 | 
29 | # insert the documents read above (each document is a dict)
30 | collection.insert(data)
--------------------------------------------------------------------------------
/scripts/steam-recommend.py:
--------------------------------------------------------------------------------
1 | from pyspark.ml.clustering import KMeans
2 | from pyspark.ml.evaluation import ClusteringEvaluator
3 | from pyspark.sql import SparkSession
4 | from pyspark.ml.linalg import Vectors
5 | import pymongo
6 |
7 | # db.addUser("steam",{roles:[ {role:"root",db:"steam_db"} ]})
8 | # connect to MongoDB
9 | client = pymongo.MongoClient('mongodb://steam:steam@***.***.***.***:27017/steam_db')
10 | db = client.steam_db
11 | regions = db.China
12 | test_collection = regions.test_collection
13 | train_collection = regions.train_collection
14 | print(train_collection.find()[0])
15 |
16 | # kmeans_path = "./kmeans"
17 | model_path = "./kmeans_model"
18 |
19 | def getData(collection):
20 | return map(lambda r: (Vectors.dense([r["author"]["num_games_owned"],
21 | r["author"]["num_reviews"],
22 | r["author"]["playtime_forever"],
23 | # r["review"],
24 | r["voted_up"],
25 | r["votes_up"],
26 | r["votes_funny"],
27 | r["comment_count"]]),), collection.find())
28 |
29 | spark = SparkSession\
30 | .builder\
31 | .appName("KMeansExample")\
32 | .getOrCreate()
33 | train_data = getData(train_collection)
34 | test_data = getData(test_collection)
35 |
36 | # Loads data.
37 | # dataset = spark.read.format("libsvm").load("sample_kmeans_data.txt")
38 | # data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
39 | # (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
40 | # data = [(Vectors.dense([0.0, 0.0, 0.0]),), (Vectors.dense([0.1, 0.1, 0.1]),), (Vectors.dense([0.2, 0.2, 0.2]),),
41 | # (Vectors.dense([9.0, 9.0, 9.0]),), (Vectors.dense([9.1, 9.1, 9.1]),), (Vectors.dense([9.2, 9.2, 9.2]),)]
42 | train_dataset = spark.createDataFrame(train_data, ["features"])
43 | test_dataset = spark.createDataFrame(test_data, ["features"])
44 |
45 | # Trains a k-means model.
46 | kmeans = KMeans().setK(5).setSeed(1)
47 | # kmeans = KMeans.load(kmeans_path)
48 | model = kmeans.fit(train_dataset)
49 | # model = KMeansModel.load(model_path)
50 | clusterSizes = model.summary.clusterSizes
51 | print(clusterSizes)
52 |
53 | # print("cluster of the point: " + model.predict((Vectors.dense([1, 1, 1, 1, 1, 1, 1]),)))
54 |
55 | # Make predictions
56 | predictions = model.transform(test_dataset)
57 | # print(predictions.collect())
58 |
59 | # Evaluate clustering by computing Silhouette score
60 | evaluator = ClusteringEvaluator()
61 |
62 | silhouette = evaluator.evaluate(predictions)  # silhouette coefficient, e.g. 0.8758693672037696
63 | print("Silhouette with squared euclidean distance = " + str(silhouette))
64 |
65 | # Shows the result.
66 | centers = model.clusterCenters()
67 | print("Cluster Centers: ")
68 | for center in centers:
69 | print(center)
70 |
71 | # kmeans.save(kmeans_path)
72 | # model.save(model_path)
73 |
74 | kmeans_centers = regions.kmeans_centers
75 | kmeans_centers.drop()
76 | i = 0
77 | for center in centers:
78 | json = {
79 | "num_games_owned":center[0],
80 | "num_reviews":center[1],
81 | "playtime_forever":center[2],
82 | "voted_up":center[3],
83 | "votes_up":center[4],
84 | "votes_funny":center[5],
85 | "comment_count":center[6],
86 | "num_of_reviews":clusterSizes[i],
87 | }
88 | i+=1
89 | kmeans_centers.insert_one(json)
90 |
91 | spark.stop()
--------------------------------------------------------------------------------
/scripts/write_log.sh:
--------------------------------------------------------------------------------
1 | # HDFS command
2 | HDFS="hadoop fs"
3 | 
4 | # Directory watched by the Streaming job; must match the path used in the Streaming program
5 | streaming_dir="/cloud-computing"
6 | 
7 | # Clear old data
8 | $HDFS -rm "${streaming_dir}"'/tmp/*'>/dev/null 2>&1
9 | $HDFS -rm "${streaming_dir}"'/*'>/dev/null 2>&1
10 | $HDFS -mkdir ${streaming_dir}/tmp
11 | 
12 | # Generate the log
13 | 
14 | # Add a timestamp to avoid name collisions
15 | templog="access.`date +'%s'`.log"
16 | # Put the log into a temporary directory first, then move it into the watched directory, so the move is atomic
17 | $HDFS -put test.log ${streaming_dir}/tmp/$templog
18 | $HDFS -mv ${streaming_dir}/tmp/$templog ${streaming_dir}/
--------------------------------------------------------------------------------
/spark-graphx/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.edu.nju</groupId>
    <artifactId>mf1932063</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <scala.version>2.11.8</scala.version>
        <spark.version>2.2.0</spark.version>
        <hadoop.version>2.6.0-cdh5.16.2</hadoop.version>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>

    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
        </repository>
        <repository>
            <id>central</id>
            <name>aliyun maven</name>
            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
            <layout>default</layout>
        </repository>
    </repositories>

    <dependencies>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.16.18</version>
        </dependency>

        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>

        <dependency>
            <groupId>org.mongodb.spark</groupId>
            <artifactId>mongo-spark-connector_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-graphx_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.5</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>com.fasterxml.jackson.module</groupId>
            <artifactId>jackson-module-scala_2.11</artifactId>
            <version>2.6.5</version>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.47</version>
        </dependency>

        <dependency>
            <groupId>net.jpountz.lz4</groupId>
            <artifactId>lz4</artifactId>
            <version>1.3.0</version>
        </dependency>

        <dependency>
            <groupId>org.codehaus.janino</groupId>
            <artifactId>janino</artifactId>
            <version>3.0.8</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.38</version>
        </dependency>

        <dependency>
            <groupId>io.netty</groupId>
            <artifactId>netty-all</artifactId>
            <version>4.0.42.Final</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                    <args>
                        <arg>-target:jvm-1.8</arg>
                    </args>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-eclipse-plugin</artifactId>
                <configuration>
                    <downloadSources>true</downloadSources>
                    <buildcommands>
                        <buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
                    </buildcommands>
                    <additionalProjectnatures>
                        <projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
                    </additionalProjectnatures>
                    <classpathContainers>
                        <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
                        <classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
                    </classpathContainers>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <reporting>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                </configuration>
            </plugin>
        </plugins>
    </reporting>
</project>
--------------------------------------------------------------------------------
/spark-graphx/src/main/resources/follows.txt:
--------------------------------------------------------------------------------
1 | 1 2
2 | 2 3
3 | 3 5
4 | 4 6
5 | 7 6
6 | 6 7
--------------------------------------------------------------------------------
/spark-graphx/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Set everything to be logged to the console
2 | log4j.rootCategory=INFO, console
3 | log4j.appender.console=org.apache.log4j.ConsoleAppender
4 | log4j.appender.console.target=System.err
5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
7 |
8 | # Set the default spark-shell log level to WARN. When running the spark-shell, the
9 | # log level for this class is used to overwrite the root logger's log level, so that
10 | # the user can have different defaults for the shell and regular Spark apps.
11 | log4j.logger.org.apache.spark.repl.Main=WARN
12 |
13 | # Settings to quiet third party logs that are too verbose
14 | log4j.logger.org.spark_project.jetty=WARN
15 | log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
16 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
17 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
18 | log4j.logger.org.apache.parquet=ERROR
19 | log4j.logger.parquet=ERROR
20 |
21 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
22 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
23 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
--------------------------------------------------------------------------------
/spark-graphx/src/main/resources/output/graphWeapon.gexf:
--------------------------------------------------------------------------------
(GEXF XML content not preserved in this text dump)
--------------------------------------------------------------------------------
/spark-graphx/src/main/resources/output/isolate.txt:
--------------------------------------------------------------------------------
1 | CompactBuffer(15, 103)
2 | CompactBuffer(39, 189, 96, 81, 153, 156, 66, 138, 171, 36, 111, 57, 75, 45, 132, 63, 72, 90, 18, 12, 9, 183, 144, 159, 21, 120, 0, 42, 102, 186, 69, 123, 174, 147, 19, 34, 52, 151, 4, 16, 82, 130, 28, 79, 127, 64, 175, 37, 133, 154, 1, 70, 109, 10, 145, 100, 115, 160, 187, 178, 76, 112, 43, 169, 25, 166, 46, 73, 172, 124, 40, 163, 7, 58, 88, 119, 155, 71, 80, 107, 98, 125, 65, 170, 14, 50, 35, 110, 161, 104, 146, 188, 17, 173, 20, 167, 122, 41, 47, 77, 95, 59, 128, 182, 62, 113, 86, 176, 26, 68)
3 | CompactBuffer(177, 168, 150, 180, 54, 105, 30, 24, 51, 108, 78, 99, 162, 84, 48, 117, 27, 93, 33, 126, 141, 6, 3, 135, 165, 60, 114, 87, 129, 13, 55, 121, 157, 106, 49, 94, 148, 61, 139, 184, 97, 22, 142, 181, 118, 67, 85, 136, 91, 31, 101, 137, 134, 158, 29, 11, 92, 152, 149, 140, 185, 74, 83, 89, 179, 38, 56, 53, 116, 131, 32, 23, 164, 143, 8, 44, 5, 2)
--------------------------------------------------------------------------------
/spark-graphx/src/main/resources/output/minDegrees.gexf:
--------------------------------------------------------------------------------
(GEXF XML content not preserved in this text dump)
--------------------------------------------------------------------------------
/spark-graphx/src/main/resources/user.txt:
--------------------------------------------------------------------------------
1 | 1,Thpffcj1
2 | 2,Thpffcj2
3 | 3,Thpffcj3
4 | 4,Thpffcj4
5 | 5,Thpffcj5
6 | 6,Thpffcj6
7 | 7,Thpffcj7
--------------------------------------------------------------------------------
/spark-graphx/src/main/scala/cn/edu/nju/GraphExample.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju
2 |
3 | import org.apache.spark.graphx.{Edge, Graph}
4 | import org.apache.spark.rdd.RDD
5 | import org.apache.spark.{SparkConf, SparkContext}
6 |
7 | /**
8 | * Created by thpffcj on 2019/10/3.
9 | */
10 | object GraphExample {
11 |
12 | def main(args: Array[String]): Unit = {
13 |
14 | val conf = new SparkConf().setAppName("GraphTest").setMaster("local")
15 | val sc = new SparkContext(conf)
16 |
17 |     // Build the vertices; the Long key here is really a VertexId (they are the same type)
18 | val users: RDD[(Long, (String, String))] =
19 | sc.parallelize(
20 | Array((3L, ("rxin", "student")),
21 | (7L, ("jgonzal", "postdoc")),
22 | (5L, ("franklin", "prof")),
23 | (2L, ("istoica", "prof"))))
24 |
25 |     // Build the edges (Edge is a dedicated class; in a sense an edge represents a relationship)
26 | val relationships: RDD[Edge[String]] =
27 | sc.parallelize(
28 | Array(Edge(3L, 7L, "collab"),
29 | Edge(5L, 3L, "advisor"),
30 | Edge(2L, 5L, "colleague"),
31 | Edge(5L, 7L, "pi")))
32 |
33 |     // With the vertices and edges we have built our graph
34 | val graph = Graph(users, relationships)
35 |
36 |     // .vertices returns all the vertices of this graph
37 | val count = graph.vertices.filter {
38 | case (id, (name, pos)) => {
39 |         // count how many postdocs there are in this graph
40 | pos == "postdoc"
41 | }
42 | }.count()
43 |
44 | // 1
45 | println(count)
46 |
47 |     // .edges returns all the edges of this graph; count those whose srcId < dstId
48 | val count1 = graph.edges.filter(e => e.srcId < e.dstId).count()
49 |
50 | // 3
51 | println(count1)
52 |
53 | sc.stop()
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/spark-graphx/src/main/scala/cn/edu/nju/GraphExample2.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju
2 |
3 | import org.apache.spark.graphx.{Edge, Graph, VertexId, VertexRDD}
4 | import org.apache.spark.rdd.RDD
5 | import org.apache.spark.{SparkConf, SparkContext}
6 |
7 | /**
8 | * Created by thpffcj on 2019/10/31.
9 | */
10 | object GraphExample2 {
11 |
12 | def main(args: Array[String]): Unit = {
13 |
14 |     // Set up the runtime environment
15 | val conf = new SparkConf().setAppName("GraphExample2").setMaster("local")
16 | val sc = new SparkContext(conf)
17 |
18 |     // Define the vertices and edges; note that both are Arrays of tuples
19 |     // The vertex data type is VD: (String, Int)
20 | val vertexArray = Array(
21 | (1L, ("Alice", 28)),
22 | (2L, ("Bob", 27)),
23 | (3L, ("Charlie", 65)),
24 | (4L, ("David", 42)),
25 | (5L, ("Ed", 55)),
26 | (6L, ("Fran", 50))
27 | )
28 |
29 |     // The edge data type is ED: Int
30 | val edgeArray = Array(
31 | Edge(2L, 1L, 7),
32 | Edge(2L, 4L, 2),
33 | Edge(3L, 2L, 4),
34 | Edge(3L, 6L, 3),
35 | Edge(4L, 1L, 1),
36 | Edge(5L, 2L, 2),
37 | Edge(5L, 3L, 8),
38 | Edge(5L, 6L, 3)
39 | )
40 |
41 | // 构造vertexRDD和edgeRDD
42 | val vertexRDD: RDD[(Long, (String, Int))] = sc.parallelize(vertexArray)
43 | val edgeRDD: RDD[Edge[Int]] = sc.parallelize(edgeArray)
44 |
45 | // 构造图Graph[VD,ED]
46 | val graph: Graph[(String, Int), Int] = Graph(vertexRDD, edgeRDD)
47 |
48 |     // Find the vertices whose age is greater than 30
49 | graph.vertices.filter { case (id, (name, age)) => age > 30 }.collect.foreach {
50 | case (id, (name, age)) => println(s"$name is $age")
51 | }
52 |
53 |     // Edge operation: find the edges whose attribute is greater than 5
54 | graph.edges.filter(e => e.attr > 5)
55 | .collect.foreach(e => println(s"${e.srcId} to ${e.dstId} att ${e.attr}"))
56 |
57 |     // The triplet view: ((srcId, srcAttr), (dstId, dstAttr), attr)
58 |     // List the triplets whose edge attribute is > 5
59 | for (triplet <- graph.triplets.filter(t => t.attr > 5).collect) {
60 | println(s"${triplet.srcAttr._1} likes ${triplet.dstAttr._1}")
61 | }
62 |
63 |     // Degree operations
64 |     // Find the largest out-degree, in-degree and total degree in the graph
65 | def max(a: (VertexId, Int), b: (VertexId, Int)): (VertexId, Int) = {
66 | if (a._2 > b._2) a else b
67 | }
68 |
69 | println("max of outDegrees:" + graph.outDegrees.reduce(max) + " max of inDegrees:" + graph.inDegrees.reduce(max) + " max of Degrees:" + graph.degrees.reduce(max))
70 |
71 |     // Transformation operations
72 |     // Vertex transformation: add 10 to every vertex's age
73 |     graph.mapVertices { case (id, (name, age)) => (name, age + 10) }
74 | .vertices.collect.foreach(v => println(s"${v._2._1} is ${v._2._2}"))
75 |
76 |     // Edge transformation: multiply every edge attribute by 2
77 | graph.mapEdges(e => e.attr * 2)
78 | .edges.collect.foreach(e => println(s"${e.srcId} to ${e.dstId} att ${e.attr}"))
79 |
80 |     // Subgraph of vertices whose age is at least 30
81 | val subGraph = graph.subgraph(vpred = (id, vd) => vd._2 >= 30)
82 | subGraph.vertices.collect.foreach(v => println(s"${v._2._1} is ${v._2._2}"))
83 |
84 |     // All edges of the subgraph
85 | subGraph.edges.collect.foreach(e => println(s"${e.srcId} to ${e.dstId} att ${e.attr}"))
86 |
87 |     // Join operations
88 | val inDegrees: VertexRDD[Int] = graph.inDegrees
89 | case class User(name: String, age: Int, inDeg: Int, outDeg: Int)
90 |
91 |     // Create a new graph whose vertex type VD is User, converted from graph
92 | val initialUserGraph: Graph[User, Int] = graph.mapVertices {
93 | case (id, (name, age)) => User(name, age, 0, 0)}
94 |
95 |     // Join initialUserGraph with the inDegrees and outDegrees RDDs, filling in the inDeg and outDeg values
96 | val userGraph = initialUserGraph.outerJoinVertices(initialUserGraph.inDegrees) {
97 | case (id, u, inDegOpt) => User(u.name, u.age, inDegOpt.getOrElse(0), u.outDeg)
98 | }.outerJoinVertices(initialUserGraph.outDegrees) {
99 | case (id, u, outDegOpt) => User(u.name, u.age, u.inDeg,outDegOpt.getOrElse(0))
100 | }
101 |
102 |     // Print the attributes of the joined graph
103 | userGraph.vertices.collect.foreach(v => println(s"${v._2.name} inDeg: ${v._2.inDeg} outDeg: ${v._2.outDeg}"))
104 |
105 |     // Users whose out-degree equals their in-degree
106 | userGraph.vertices.filter {
107 | case (id, u) => u.inDeg == u.outDeg
108 | }.collect.foreach {
109 | case (id, property) => println(property.name)
110 | }
111 |
112 |     // Aggregation
113 |     // Find the oldest follower of each vertex
114 | // val oldestFollower: VertexRDD[(String, Int)] = userGraph.mapReduceTriplets[(String, Int)](
115 | //
116 |     //     // Send the source vertex's attributes to the destination vertex (map phase)
117 | // edge => Iterator((edge.dstId, (edge.srcAttr.name, edge.srcAttr.age))),
118 | //
119 |     //     // Keep the oldest follower (reduce phase)
120 | // (a, b) => if (a._2 > b._2) a else b
121 | // )
122 |
123 | // userGraph.vertices.leftJoin(oldestFollower) { (id, user, optOldestFollower) =>
124 | // optOldestFollower match {
125 | // case None => s"${user.name} does not have any followers."
126 | // case Some((name, age)) => s"${name} is the oldest follower of ${user.name}."
127 | // }
128 | // }.collect.foreach { case (id, str) => println(str)}
129 |
130 |
131 | }
132 |
133 |
134 | }
135 |
--------------------------------------------------------------------------------
/spark-graphx/src/main/scala/cn/edu/nju/GraphExample3.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju
2 |
3 | import org.apache.spark.graphx.GraphLoader
4 | import org.apache.spark.{SparkConf, SparkContext}
5 |
6 | /**
7 | * Created by thpffcj on 2019/10/3.
8 | *
9 |   * A graph-computation example adapted from the official documentation.
10 |   * It tackles a problem from the project: recognizing records that belong to the same user and merging them.
11 | */
12 | object GraphExample3 {
13 |
14 | def main(args: Array[String]): Unit = {
15 |
16 |     // GraphX is built on top of RDDs
17 | val conf = new SparkConf().setMaster("local").setAppName("ConnectedComponentsExample")
18 | val sc = new SparkContext(conf)
19 |
20 |     // There are several ways to build a graph
21 | val graph = GraphLoader.edgeListFile(sc, "src/main/resources/follows.txt")
22 | /**
23 |      * Every vertex id becomes a key, and each value is set to 1
24 | * (4,1)
25 | * (1,1)
26 | * (6,1)
27 | * (3,1)
28 | * (7,1)
29 | * (5,1)
30 | * (2,1)
31 | */
32 | graph.vertices.foreach(println(_))
33 |
34 | /**
35 |      * .connectedComponents() computes the connected-component membership of each vertex and returns a graph
36 |      * whose vertex value is the lowest vertex id in that vertex's connected component.
37 | */
38 | val cc = graph.connectedComponents().vertices
39 | /**
40 | * (4,4)
41 | * (1,1)
42 | * (6,4)
43 | * (3,1)
44 | * (7,4)
45 | * (5,1)
46 | * (2,1)
47 | */
48 | cc.foreach(println(_))
49 |
50 | val users = sc.textFile("src/main/resources/user.txt").map(line => {
51 |       // convert to key-value form because we need to join
52 | val fields = line.split(",")
53 | (fields(0).toLong, fields(1))
54 | })
55 |
56 | // (1,Thpffcj1) join (1,1)
57 |     // (1,(Thpffcj1,1)): the second element is the component id that identifies the same group
58 | users.join(cc).map {
59 | case (id, (username, cclastid)) => (cclastid, username)
60 | }.reduceByKey((x: String, y: String) => x + "," + y)
61 | .foreach(tuple => {
62 | /**
63 | * Thpffcj4,Thpffcj6,Thpffcj7
64 | * Thpffcj1,Thpffcj3,Thpffcj5,Thpffcj2
65 | */
66 | println(tuple._2)
67 | })
68 |
69 | sc.stop()
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/spark-graphx/src/main/scala/cn/edu/nju/GraphProcess.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju
2 |
3 | import java.io.PrintWriter
4 | import java.util
5 | import java.util.concurrent.ConcurrentHashMap
6 |
7 | import com.alibaba.fastjson.JSON
8 | import com.mongodb.spark.MongoSpark
9 | import org.apache.spark.{SparkConf, SparkContext}
10 | import org.apache.spark.graphx.{Edge, Graph, VertexId, VertexRDD}
11 | import org.apache.spark.rdd.RDD
12 | import org.apache.spark.sql.{DataFrame, SparkSession}
13 | import org.bson.Document
14 |
15 | import scala.util.Random
16 |
17 | /**
18 | * Created by thpffcj on 2019/11/2.
19 | * hours_3_10W 53 160
20 | * steam_3_10W 53 160
21 | * hours_5_20W 98 464
22 | * steam_5_20W 96 452
23 | * hours_7_30W 103 535
24 | * steam_7_30W 103 535
25 | * hours_6_30W 178 1060
26 | * steam_6_30W 175 1039
27 | *
28 | */
29 | object GraphProcess {
30 |
31 |   // Vertex map: look up a vertex id by user id or game name, e.g.
32 |   // user_76561198380840992 -> 1L
33 |   // game_CODE VEIN -> 2L
34 |   val pointMap = new ConcurrentHashMap[String, Long]()
35 | 
36 |   // Review edges
37 |   val edgeMap1 = new ConcurrentHashMap[(Long, Long), String]()
38 | 
39 |   // Playtime edges
40 |   val edgeMap2 = new ConcurrentHashMap[(Long, Long), String]()
41 | 
42 |   // Vertex weight map: look up a weight by the vertex id in the graph, e.g.
43 |   // 1L -> 10
44 |   val weightMap = new ConcurrentHashMap[Long, Int]()
45 |
46 | def main(args: Array[String]): Unit = {
47 |
48 | val conf = new SparkConf().setMaster("local[4]").setAppName("GraphProcess")
49 | conf.set("spark.mongodb.input.uri", "mongodb://localhost:27017/test.China.reviews_official_30W")
50 | conf.set("spark.mongodb.input.partitioner", "MongoPaginateBySizePartitioner")
51 | conf.set("spark.mongodb.output.uri", "mongodb://localhost:27017/test.steam.graph_vertice")
52 |
53 | val spark = SparkSession.builder().config(conf).getOrCreate()
54 |
55 | val frame: DataFrame = MongoSpark.load(spark)
56 |
57 | var key = 0L
58 |
59 |     // collect to a single node so that key can be incremented sequentially
60 | frame.collect().foreach(row => {
61 |
62 | val gameArray = row.getAs("game").toString.split(",")
63 | var game = gameArray(0)
64 | game = game.substring(1, game.length)
65 |       // strip characters the front end cannot render, such as emoji
66 | val gameNamePatterns = "[^\\u4e00-\\u9fa5a-zA-Z0-9 ]".r
67 | // 游戏名称
68 | game = gameNamePatterns.replaceAllIn(game, "")
69 |
70 | val jsonAuthor = JSON.parse(row.getAs("author").toString)
71 | val authorArray = jsonAuthor.toString.split(",")
72 | // 用户id
73 | val userId = authorArray(4)
74 | // 游玩时长
75 | val hours = authorArray(5)
76 |
77 | // 评论
78 | var review = row.getAs("review").toString
79 | val reviewPatterns = "[^\\u4e00-\\u9fa5a-zA-Z0-9 ]".r
80 | review = reviewPatterns.replaceAllIn(review, "")
81 |
82 | // 玩家顶点
83 | val playerKey = "user_" + userId
84 | var playerPoint = 0L
85 | if (pointMap.containsKey(playerKey)) {
86 | playerPoint = pointMap.get(playerKey)
87 | // 权重+1
88 | weightMap.put(playerPoint, weightMap.get(playerPoint) + 1)
89 | } else {
90 | // 点id递增
91 | this.synchronized {
92 | key = key + 1
93 | playerPoint = key
94 | }
95 | pointMap.put(playerKey, playerPoint)
96 | // 权重赋予1
97 | weightMap.put(playerPoint, 1)
98 | }
99 |
100 | // 游戏顶点
101 | val gameKey = "game_" + game
102 | var gamePoint = 0L
103 | if (pointMap.containsKey(gameKey)) {
104 | gamePoint = pointMap.get(gameKey)
105 | // 权重+1
106 | weightMap.put(gamePoint, weightMap.get(gamePoint) + 1)
107 | } else {
108 | this.synchronized {
109 | key = key + 1
110 | gamePoint = key
111 | }
112 | pointMap.put(gameKey, gamePoint)
113 | // 权重赋予1
114 | weightMap.put(gamePoint, 1)
115 | }
116 |
117 | edgeMap1.put((playerPoint, gamePoint), review)
118 | edgeMap2.put((playerPoint, gamePoint), hours)
119 | })
120 |
121 | println("foreach 结束")
122 |
123 | // 点集
124 | var vertexArray = Seq((0L, ("test", "test")))
125 | // 评论边
126 | var edgeArray1 = Seq(Edge(0L, 0L, ""))
127 | // 时长边
128 | var edgeArray2 = Seq(Edge(0L, 0L, ""))
129 |
130 | // 添加点
131 | val pointSet = pointMap.keySet()
132 |     // TODO iterating over this map is very slow at this stage; how could it be improved?
133 | val point_iter = pointSet.iterator
134 | while (point_iter.hasNext) {
135 | val key = point_iter.next
136 | val name = key.split("_")
137 | vertexArray = vertexArray :+ (pointMap.get(key), (name(0), name(1)))
138 | }
139 |
140 | println("遍历点集结束")
141 |
142 | // 添加评论边
143 | val edgeSet1 = edgeMap1.keySet()
144 | // 遍历迭代map
145 | val edge_iter1 = edgeSet1.iterator
146 | while (edge_iter1.hasNext) {
147 | val key = edge_iter1.next
148 | edgeArray1 = edgeArray1 :+ Edge(key._1, key._2, edgeMap1.get(key))
149 | }
150 |
151 | println("遍历评论边结束")
152 |
153 | // 添加时长边
154 | val edgeSet2 = edgeMap2.keySet()
155 | // 遍历迭代map
156 | val edge_iter2 = edgeSet2.iterator
157 | while (edge_iter2.hasNext) {
158 | val key = edge_iter2.next
159 | edgeArray2 = edgeArray2 :+ Edge(key._1, key._2, edgeMap2.get(key))
160 | }
161 |
162 | println("遍历结束")
163 |
164 | // 构造vertexRDD和edgeRDD
165 | val vertexRDD: RDD[(Long, (String, String))] = spark.sparkContext.parallelize(vertexArray)
166 | val edgeRDD1: RDD[Edge[String]] = spark.sparkContext.parallelize(edgeArray1)
167 | val edgeRDD2: RDD[Edge[String]] = spark.sparkContext.parallelize(edgeArray2)
168 |
169 | // 构造图Graph[VD,ED]
170 | var contentGraph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD1)
171 |
172 | println("构造contentGraph结束")
173 |
174 |     // Subgraph: drop edges whose review text is empty
175 |     contentGraph = contentGraph.subgraph(epred = e => !e.attr.equals(""))
176 |     // Subgraph: keep game vertices whose weight is greater than 10 (plus all user vertices)
177 | contentGraph = contentGraph.subgraph(vpred = (id, vd) => {
178 | ((vd._1.equals("game") & weightMap.get(id) > 10) | (vd._1.equals("user")))
179 | })
180 |
181 | val degreeThreshold = 6
182 | // 度数>degreeThreshold的点集
183 | val contentDegreeArray = contentGraph.degrees.filter(_._2 > degreeThreshold).map(_._1).collect()
184 |
185 |     // keep only the vertices whose degree meets the threshold
186 | contentGraph = contentGraph.subgraph(vpred = (id, vd) => {
187 | contentDegreeArray.contains(id)
188 | })
189 |
190 |     // Edge transformation: strip characters the front end cannot render, such as emoji in reviews
191 |     val reviewPatterns = "[^\\u4e00-\\u9fa5a-zA-Z0-9 ]".r
192 |     contentGraph = contentGraph.mapEdges(e => reviewPatterns.replaceAllIn(e.attr, ""))
193 |
194 | println("处理contentGraph结束")
195 |
196 | // 时长图
197 | var hourGraph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD2)
198 |
199 | println("构造hourGraph结束")
200 |
201 |     // TODO vertex transformation: look up the user name from the user id
202 | hourGraph = hourGraph.mapVertices {
203 | case (id, (types, name)) => (types, name)
204 | }
205 |
206 | hourGraph = hourGraph.subgraph(vpred = (id, vd) => {
207 | ((vd._1.equals("game") & weightMap.get(id) > 10) | (vd._1.equals("user")))
208 | })
209 |
210 | // 度数>0的点集
211 | val hourDegreeArray = hourGraph.degrees.filter(_._2 > degreeThreshold).map(_._1).collect()
212 |
213 |     // remove isolated vertices
214 | hourGraph = hourGraph.subgraph(vpred = (id, vd) => {
215 | hourDegreeArray.contains(id)
216 | })
217 |
218 | println("处理hourGraph结束")
219 |
220 |     // detect isolated communities (connected components)
221 | hourGraph.connectedComponents
222 | .vertices
223 | .map(_.swap)
224 | .groupByKey()
225 | .map(_._2)
226 | .foreach(println)
227 |
228 |     /**
229 |      * Write the vertex data to MongoDB.
230 |      * Spark operators run on the executors, and the data also lives there. The executors and the driver are not in the same JVM (except for local[*]),
231 |      * so an operator cannot access the SparkSession object that lives on the driver.
232 |      * If you really must "use the SparkSession inside an operator", the only option is to collect the data back to the driver and process it with Scala collection operations. That only
233 |      * works when the data is small (how small depends on the memory allocated to the driver).
234 |      */
235 | hourGraph.vertices.collect.foreach(v => {
236 |
237 | val id = v._1.toString
238 | val name = v._2.toString
239 |
240 | writeVerticesToMongodb(spark, id, name)
241 | })
242 |
243 |
244 | // 输出到文件
245 | val outputPath = "src/main/resources/"
246 | val pw1 = new PrintWriter(outputPath + "steam/hours_6_30W.gexf")
247 | pw1.write(hoursToGexf(hourGraph))
248 | pw1.close()
249 |
250 | val pw2 = new PrintWriter(outputPath + "steam/steam_6_30W.gexf")
251 | pw2.write(gameToGexf(contentGraph))
252 | pw2.close()
253 |
254 | spark.close()
255 | }
256 |
257 | /**
258 |    * Write vertex data to MongoDB
259 | */
260 | def writeVerticesToMongodb(spark: SparkSession, id: String, name: String) = {
261 |
262 | val document = new Document()
263 | document.append("verticeId", id).append("name", name)
264 |
265 | val seq = Seq(document)
266 | val df = spark.sparkContext.parallelize(seq)
267 |
268 | // 将数据写入mongo
269 | MongoSpark.save(df)
270 | }
271 |
272 | /**
273 |    * Write edge data to MongoDB
274 | */
275 | def writeEdgesToMongodb(spark: SparkSession, srcId: String, dstId: String, attr: String) = {
276 |
277 | val document = new Document()
278 | document.append("srcId", srcId).append("dstId", dstId).append("attr", attr)
279 |
280 | val seq = Seq(document)
281 | val df = spark.sparkContext.parallelize(seq)
282 |
283 | // 将数据写入mongo
284 | MongoSpark.save(df)
285 | }
286 |
287 | /**
288 |    * User-game graph; unlike the graph below, the x and y coordinates must be specified explicitly
289 | *
290 | * @param graph
291 | * @tparam VD
292 | * @tparam ED
293 | * @return
294 | */
295 | def gameToGexf[VD, ED](graph: Graph[VD, ED]) = {
296 |
297 | "\n" +
298 | "\n" +
299 | "\n" +
300 | "\n" +
301 | "\n" +
302 | "\n" +
303 | "\n " +
304 | graph.vertices.map(v => {
305 | // 根据类别填充颜色和attvalue
306 | val types = v._2.toString.split(",")(0).replace("(", "")
307 | val name = v._2.toString.split(",")(1).replace(")", "")
308 | var color = ""
309 | var attvalue = 0
310 | if (types.equals("user")) {
311 | color = "r=\"236\" g=\"81\" b=\"72\""
312 | attvalue = 1
313 | } else {
314 | color = "r=\"236\" g=\"181\" b=\"72\""
315 | attvalue = 0
316 | }
317 | "\n" +
318 | "\n" +
319 | "\n" +
320 | "\n" +
321 | "\n" +
322 | // (x, y) 坐标
323 | "\n" +
324 | "\n" +
325 | "\n"
326 | }).collect().mkString +
327 | "\n " +
328 | "\n" +
329 | graph.edges.map(e => {
330 | "\n"
331 | }).collect().mkString +
332 | "\n" +
333 | "\n" +
334 | ""
335 | }
336 |
337 | /**
338 |    * Export the playtime graph in the specified GEXF format
339 | *
340 |    * @param graph the graph
341 | * @tparam VD
342 | * @tparam ED
343 | * @return
344 | */
345 | def hoursToGexf[VD, ED](graph: Graph[VD, ED]) = {
346 |
347 | "\n" +
348 | "\n" +
349 | "\n" +
350 | "\n" +
351 | "\n" +
352 | "\n" +
353 | "\n " +
354 | graph.vertices.map(v => {
355 | // 根据类别填充颜色和attvalue
356 | val types = v._2.toString.split(",")(0).replace("(", "")
357 | val name = v._2.toString.split(",")(1).replace(")", "")
358 | var color = ""
359 | var attvalue = 0
360 | if (types.equals("user")) {
361 | color = "r=\"236\" g=\"81\" b=\"72\""
362 | attvalue = 1
363 | } else {
364 | color = "r=\"236\" g=\"181\" b=\"72\""
365 | attvalue = 0
366 | }
367 | "\n" +
368 | "\n" +
369 | "\n" +
370 | "\n" +
371 | "\n" +
372 | "\n" +
373 | "\n"
374 | }).collect().mkString +
375 | "\n " +
376 | "\n" +
377 | graph.edges.map(e => {
378 | "\n"
379 | }).collect().mkString +
380 | "\n" +
381 | "\n" +
382 | ""
383 | }
384 | }
385 |
--------------------------------------------------------------------------------
/spark-graphx/src/main/scala/cn/edu/nju/GraphProcessTest.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju
2 |
3 | import java.io.PrintWriter
4 | import java.util
5 |
6 | import com.mongodb.spark.MongoSpark
7 | import org.apache.spark.SparkConf
8 | import org.apache.spark.graphx.{Edge, Graph}
9 | import org.apache.spark.rdd.RDD
10 | import org.apache.spark.sql.{DataFrame, SparkSession}
11 | import org.bson.Document
12 |
13 | import scala.util.Random
14 |
15 | /**
16 | * Created by thpffcj on 2019/11/2.
17 | */
18 | object GraphProcessTest {
19 |
20 | val pointMap = new util.HashMap[String, Long]()
21 |   // Review edges
22 |   val edgeMap1 = new util.HashMap[(Long, Long), String]()
23 |   // Playtime edges
24 |   val edgeMap2 = new util.HashMap[(Long, Long), String]()
25 |   // Vertex weight map: look up a weight by vertex id
26 |   val weightMap = new util.HashMap[Long, Int]()
27 |   // Set of the most-reviewed game vertices
28 |   val topGameSet = new util.HashSet[Long]()
29 |
30 | def main(args: Array[String]): Unit = {
31 |
32 | val conf = new SparkConf().setMaster("local[4]").setAppName("GraphProcess")
33 | conf.set("spark.mongodb.input.uri", "mongodb://localhost:27017/test.China.reviews")
34 | conf.set("spark.mongodb.input.partitioner", "MongoPaginateBySizePartitioner")
35 |
36 | val spark = SparkSession.builder().config(conf).getOrCreate()
37 |
38 | val frame: DataFrame = MongoSpark.load(spark)
39 |
40 | var key = 0L
41 |
42 | frame.foreach(row => {
43 |
44 | /**
45 |        * User information
46 |        * Filter out illegal input characters
47 | */
48 | val jsonPlayer = row.getAs("user").toString.split(",")
49 | var player = ""
50 | if (jsonPlayer.length > 2) {
51 | player = jsonPlayer(jsonPlayer.length - 1)
52 | player = player.substring(0, player.length - 1)
53 | } else if (jsonPlayer(0).contains("帐户内")) {
54 | player = jsonPlayer(1)
55 | player = player.substring(0, player.length - 1)
56 | } else {
57 | player = jsonPlayer(0)
58 | player = player.substring(1, player.length)
59 | }
60 |
61 |       // strip characters the front end cannot render, such as emoji
62 | val namePatterns1 = "[`~!@#$%^&*()+=|{}':;',\\[\\]<>/?~!@#� \uE009\uF8F5¥%……& amp;*()——+|{}【】‘;:”“’。,、?]".r
63 | val namePatterns2 = "[^\\u4e00-\\u9fa5a-zA-Z0-9]".r
64 | player = namePatterns1.replaceAllIn(player, "")
65 | player = namePatterns2.replaceAllIn(player, "")
66 | if (player.length == 0) {
67 | player = "anonymous"
68 | }
69 |
70 |       // skip records whose user name is empty after filtering
71 |       if (!player.equals("anonymous")) {
72 |         // game information
73 | val jsonGame = row.getAs("game").toString.split(",")
74 | var game = jsonGame(0).substring(1)
75 | game = namePatterns1.replaceAllIn(game, "")
76 |
77 |         // review text
78 |         var content = row.getAs("content").toString.replace("\n", "")
79 | val contentPatterns = "[^\\u4e00-\\u9fa5a-zA-Z0-9 ]".r
80 | content = contentPatterns.replaceAllIn(content, "")
81 |
82 |         // playtime
83 |         val patterns = "[\\u4e00-\\u9fa5]".r  // matches Chinese characters
84 | val hours = patterns.replaceAllIn(row.getAs("hours").toString, "")
85 |
86 | // 玩家顶点
87 | val playerKey = "user_" + player
88 | var playerPoint = 0L
89 | if (pointMap.containsKey(playerKey)) {
90 | playerPoint = pointMap.get(playerKey)
91 | // 权重+1
92 | weightMap.put(playerPoint, weightMap.get(playerPoint) + 1)
93 | } else {
94 | key = key + 1
95 | playerPoint = key
96 | pointMap.put(playerKey, playerPoint)
97 | // 权重赋予1
98 | weightMap.put(playerPoint, 1)
99 | }
100 |
101 | // 游戏顶点
102 | val gameKey = "game_" + game
103 | var gamePoint = 0L
104 | if (pointMap.containsKey(gameKey)) {
105 | gamePoint = pointMap.get(gameKey)
106 | // 权重+1
107 | weightMap.put(gamePoint, weightMap.get(gamePoint) + 1)
108 | } else {
109 | key = key + 1
110 | gamePoint = key
111 | pointMap.put(gameKey, gamePoint)
112 | // 权重赋予1
113 | weightMap.put(gamePoint, 1)
114 | }
115 |
116 | edgeMap1.put((playerPoint, gamePoint), content)
117 | edgeMap2.put((playerPoint, gamePoint), hours)
118 | }
119 |
120 | // KurokaneSS CODE VEIN 带妹子也就图一乐,打架还得靠云哥
121 | // println(player + " " + game + " " + content)
122 | })
123 |
124 | // 点集
125 | var vertexArray = Seq((0L, ("test", "test")))
126 | // 评论边
127 | var edgeArray1 = Seq(Edge(0L, 0L, ""))
128 | // 时长边
129 | var edgeArray2 = Seq(Edge(0L, 0L, ""))
130 |
131 | // 添加点
132 | val pointSet = pointMap.keySet()
133 | // 遍历迭代map
134 | val point_iter = pointSet.iterator
135 | while (point_iter.hasNext) {
136 | val key = point_iter.next
137 | // println(key)
138 | vertexArray = vertexArray :+ (pointMap.get(key), (key.split("_")(0), key.split("_")(1)))
139 | }
140 |
141 | // 添加边
142 | val edgeSet1 = edgeMap1.keySet()
143 | // 遍历迭代map
144 | val edge_iter1 = edgeSet1.iterator
145 | while (edge_iter1.hasNext) {
146 | val key = edge_iter1.next
147 | edgeArray1 = edgeArray1 :+ Edge(key._1, key._2, edgeMap1.get(key))
148 | }
149 |
150 | // 添加边
151 | val edgeSet2 = edgeMap2.keySet()
152 | // 遍历迭代map
153 | val edge_iter2 = edgeSet2.iterator
154 | while (edge_iter2.hasNext) {
155 | val key = edge_iter2.next
156 | edgeArray2 = edgeArray2 :+ Edge(key._1, key._2, edgeMap2.get(key))
157 | }
158 |
159 | // 构造vertexRDD和edgeRDD
160 | val vertexRDD: RDD[(Long, (String, String))] = spark.sparkContext.parallelize(vertexArray)
161 | val edgeRDD1: RDD[Edge[String]] = spark.sparkContext.parallelize(edgeArray1)
162 | val edgeRDD2: RDD[Edge[String]] = spark.sparkContext.parallelize(edgeArray2)
163 |
164 | // 构造图Graph[VD,ED]
165 | var contentGraph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD1)
166 | // 构建子图,过滤评论为空的边
167 | contentGraph = contentGraph.subgraph(epred = e => !e.attr.equals(""))
168 | // 构建子图,过滤游戏权重大于15的
169 | contentGraph = contentGraph.subgraph(vpred = (id, vd) => {
170 | ((vd._1.equals("game") & weightMap.get(id) > 15) | (vd._1.equals("user")))
171 | })
172 |
173 | contentGraph.vertices.foreach(v => {
174 | if (v._2._1.equals("game")) {
175 | topGameSet.add(v._1)
176 | }
177 | })
178 |
179 |
180 |
181 |     // After filtering, some vertices have no edges left, so a leftOuterJoin is used to drop them
182 | // val vertices = contentGraph.vertices.leftOuterJoin(vertex).map(x => (x._1, x._2._2.getOrElse("")))
183 | // val newGraph: Graph[(String, String), String] = Graph(vertices, edge)
184 |
185 |
186 | val hourGraph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD2)
187 |
188 | contentGraph.vertices.foreach(println(_))
189 | // println(hourGraph.toString)
190 |
191 | // 输出到文件
192 | val outputPath = "src/main/resources/"
193 | // val pw1 = new PrintWriter(outputPath + "hours.xml")
194 | // pw1.write(hoursToGexf(hourGraph))
195 | // pw1.close()
196 |
197 | val pw2 = new PrintWriter(outputPath + "steam.gexf")
198 | pw2.write(gameToGexf(contentGraph))
199 | pw2.close()
200 |
201 | spark.close()
202 | }
203 |
204 | /**
205 |    * Write data to MongoDB
206 | */
207 | def writeToMongodb() = {
208 |
209 | val spark = SparkSession.builder()
210 | .master("local")
211 | .appName("MongoDBProcess")
212 | .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/test.China.graph")
213 | .getOrCreate()
214 |
215 | // 设置log级别
216 | spark.sparkContext.setLogLevel("WARN")
217 |
218 | val document1 = new Document()
219 | document1.append("name", "sunshangxiang").append("age", 18).append("sex", "female")
220 |
221 | val seq = Seq(document1)
222 | val df = spark.sparkContext.parallelize(seq)
223 |
224 | // 将数据写入mongo
225 | MongoSpark.save(df)
226 |
227 | spark.stop()
228 | }
229 |
230 | /**
231 |    * User-game graph; unlike the graph below, the x and y coordinates must be specified explicitly
232 | * @param graph
233 | * @tparam VD
234 | * @tparam ED
235 | * @return
236 | */
237 | def gameToGexf[VD, ED](graph: Graph[VD, ED]) = {
238 |
239 | "\n" +
240 | "\n" +
241 | "\n" +
242 | "\n" +
243 | "\n" +
244 | "\n" +
245 | "\n " +
246 | graph.vertices.map(v => {
247 | // 根据类别填充颜色和attvalue
248 | val types = v._2.toString.split(",")(0).replace("(", "")
249 | val name = v._2.toString.split(",")(1).replace(")", "")
250 | var color = ""
251 | var attvalue = 0
252 | if (types.equals("user")) {
253 | color = "r=\"236\" g=\"81\" b=\"72\""
254 | attvalue = 1
255 | } else {
256 | color = "r=\"236\" g=\"181\" b=\"72\""
257 | attvalue = 0
258 | }
259 | "\n" +
260 | "\n" +
261 | "\n" +
262 | "\n" +
263 | "\n" +
264 | // (x, y) 坐标
265 | "\n" +
266 | "\n" +
267 | "\n"
268 | }).collect().mkString +
269 | "\n " +
270 | "\n" +
271 | graph.edges.map(e => {
272 | "\n"
273 | }).collect().mkString +
274 | "\n" +
275 | "\n" +
276 | ""
277 | }
278 |
279 | /**
280 |    * Export the graph in the specified GEXF format
281 | *
282 | * @param graph :图
283 | * @tparam VD
284 | * @tparam ED
285 | * @return
286 | */
287 | def hoursToGexf[VD, ED](graph: Graph[VD, ED]) = {
288 |
289 | "\n" +
290 | "\n" +
291 | "\n" +
292 | "\n" +
293 | "\n" +
294 | "\n" +
295 | "\n " +
296 | graph.vertices.map(v => {
297 | // 根据类别填充颜色和attvalue
298 | val types = v._2.toString.split(",")(0).replace("(", "")
299 | val name = v._2.toString.split(",")(1).replace(")", "")
300 | var color = ""
301 | var attvalue = 0
302 | if (types.equals("user")) {
303 | color = "r=\"236\" g=\"81\" b=\"72\""
304 | attvalue = 1
305 | } else {
306 | color = "r=\"236\" g=\"181\" b=\"72\""
307 | attvalue = 0
308 | }
309 | "\n" +
310 | "\n" +
311 | "\n" +
312 | "\n" +
313 | "\n" +
314 | "\n" +
315 | "\n"
316 | }).collect().mkString +
317 | "\n " +
318 | "\n" +
319 | graph.edges.map(e => {
320 | "\n"
321 | }).collect().mkString +
322 | "\n" +
323 | "\n" +
324 | ""
325 | }
326 | }
327 |
--------------------------------------------------------------------------------
/spark-graphx/src/main/scala/cn/edu/nju/MongoDBProcess.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju
2 |
3 | import com.mongodb.spark.config.{ReadConfig, WriteConfig}
4 | import com.mongodb.spark.sql._
5 | import org.apache.spark.sql.SparkSession
6 |
7 | /**
8 | * Created by thpffcj on 2019/10/31.
9 | */
10 | object MongoDBProcess {
11 |
12 | def main(args: Array[String]): Unit = {
13 |
14 | val MongoUri1 = args(0).toString
15 | val MongoUri2 = args(1).toString
16 | val SparkMasterUri = args(2).toString
17 |
18 | def makeMongoURI(uri: String, database: String, collection: String) = (s"${uri}/${database}.${collection}")
19 |
20 | val mongoURI1 = s"mongodb://${MongoUri1}:27017"
21 | val mongoURI2 = s"mongodb://${MongoUri2}:27017"
22 |
23 | val CONFdb1 = makeMongoURI(s"${mongoURI1}", "MyColletion1", "df")
24 | val CONFdb2 = makeMongoURI(s"${mongoURI2}", "MyColletion2", "df")
25 |
26 | val WRITEdb1: WriteConfig = WriteConfig(scala.collection.immutable.Map("uri" -> CONFdb1))
27 | val READdb1: ReadConfig = ReadConfig(Map("uri" -> CONFdb1))
28 |
29 | val WRITEdb2: WriteConfig = WriteConfig(scala.collection.immutable.Map("uri" -> CONFdb2))
30 | val READdb2: ReadConfig = ReadConfig(Map("uri" -> CONFdb2))
31 |
32 | val spark = SparkSession
33 | .builder
34 | .appName("AppMongo")
35 | .config("spark.worker.cleanup.enabled", "true")
36 | .config("spark.scheduler.mode", "FAIR")
37 | .getOrCreate()
38 |
39 | val df1 = spark.read.mongo(READdb1)
40 | val df2 = spark.read.mongo(READdb2)
41 | df1.write.mode("overwrite").mongo(WRITEdb1)
42 | df2.write.mode("overwrite").mongo(WRITEdb2)
43 | }
44 | }
45 |
46 |
--------------------------------------------------------------------------------
/spark-mllib/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 | 4.0.0
5 |
6 | cn.edu.nju
7 | mf1932063
8 | 1.0-SNAPSHOT
9 |
10 |
11 | 2.11.8
12 | 2.4.0
13 | 1.8
14 | 1.8
15 |
16 |
17 |
18 |
19 |
20 | cloudera
21 | https://repository.cloudera.com/artifactory/cloudera-repos
22 |
23 |
24 |
25 |
26 |
27 |
28 | org.scala-lang
29 | scala-library
30 | ${scala.version}
31 |
32 |
33 |
34 | org.apache.spark
35 | spark-mllib_2.11
36 | ${spark.version}
37 |
38 |
39 |
40 | com.fasterxml.jackson.module
41 | jackson-module-scala_2.11
42 | 2.6.5
43 |
44 |
45 |
46 | io.netty
47 | netty-all
48 | 4.1.42.Final
49 |
50 |
51 |
52 |
53 |
54 |
58 |
59 |
60 | org.scala-tools
61 | maven-scala-plugin
62 |
63 |
64 |
65 | compile
66 | testCompile
67 |
68 |
69 |
70 |
71 | ${scala.version}
72 |
73 | -target:jvm-1.8
74 |
75 |
76 |
77 |
78 | org.apache.maven.plugins
79 | maven-eclipse-plugin
80 |
81 | true
82 |
83 | ch.epfl.lamp.sdt.core.scalabuilder
84 |
85 |
86 | ch.epfl.lamp.sdt.core.scalanature
87 |
88 |
89 | org.eclipse.jdt.launching.JRE_CONTAINER
90 | ch.epfl.lamp.sdt.launching.SCALA_CONTAINER
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 | org.scala-tools
100 | maven-scala-plugin
101 |
102 | ${scala.version}
103 |
104 |
105 |
106 |
107 |
108 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/resources/game_content.txt:
--------------------------------------------------------------------------------
1 | Best hunting game ever created.
2 | The best game in Assassin's creed franchise in my opinion. The story is so good!
3 | very shooty much nice
4 | It's a good game. However Horde mode slightly lets me down. But the shooting is still very nice !
5 | Really fun game with plenty to do about, it would be even better if there would be more modding for the game.
6 | It is quite a good game. Add mods, and it becomes better.
7 | best game made me want to break things but in a good way
8 | Play this with your dates and get your knob wet! Or play this with your friends and get yelled at
9 | Best Tom Clancy game I've ever played. I never get tired of playing this game.
10 | Classic style and very challenging, but tons of fun. I can't even beat the entire game. I would still highly recommend this game to anyone.
11 | Not worth it
12 | Bad data handling policies at Paradox prevent me from recommending any of their games. CK2 in particular is an especially egregious example. The original purchase agreement was altered from opt out data collection to forced data collection with no compensation given. Changing the base game to free did not help at all since those who bought it already paid for the privilege of having their data stolen all the time and local data sometimes deleted despite settings preventing updates.
13 | I really wanted to like this game. Unfortunately, I found it frenetic, and the opening levels didn't give the sense of achievement that helps you pursue success at more difficult levels. Having to go around counters which are in your way right from the beginning, is very frustrating. No chef in their right mind would set up such a crappy working layout, and it drove me batty.
14 | Don't play this game.
15 | I only recommend for hardcore AS fans just for story purposes not like the story is very good to begin with. Although it is set in one of my favourite historic time periods, The French Revolution, it doesn't make me feel very invested because the story just wasn't interesting to me. Game does and sill has bad bugs even after it's disastrous release back when it first released.
16 | I downloaded the Trial version a few months back as it was free. Prior to downloading, I checked that my computer and GPU was adequate enough to run it (an 8gb i7-4770 with a 2gb GTX 750ti was above it's minimum specs) and spent an hour downloading it. Upon playing it however, it was choppy and slow throughout and regardless of whatever low-medium range settings I used to try to remedy these issues (I certainly wasn't expecting high settings at 1080p given my setup anyway) it was still unplayable and was uninstalled around 7 minutes later. So unless the performance issues are patched, I'd best steer clear until these are rectified.
17 | I really wanted to like this game. Unfortunately, I found it frenetic, and the opening levels didn't give the sense of achievement that helps you pursue success at more difficult levels. Having to go around counters which are in your way right from the beginning, is very frustrating. No chef in their right mind would set up such a crappy working layout, and it drove me batty. :(
18 | Terrible boring game, saving the game doesn't work, you always spawn at a checkpoint. Same enemies over and over again. Your basically stuck in a cave all the damn time. Weak story.
19 | horrible ai dont lots of bugs still eq dosent have enough power frame rate suck also waste yur money like i did like on 1/2 steam games i have
20 | The mechanics make no sense. Strategy like this should be turn-based or else its just a race of who clicks faster
21 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/resources/recommend_validate:
--------------------------------------------------------------------------------
1 | userId,gameId,gameName,rating,random
2 | 1,22,Dota 2,8.0,0.1
3 | 1,40,Counter-Strike Global Offensive,3.0,0.2
4 | 1,5,Left 4 Dead 2,5.0,0.3
5 | 1,10,Team Fortress 2,6.0,0.4
6 | 1,29,Sid Meier's Civilization V,7.0,0.5
7 | 1,8,Poly Bridge,9.0,0.6
8 | 1,875,Assassin's Creed IV,9.0,0.7
9 | 1,412,Cities Skylines,8.0,0.8
10 | 1,2,Fallout 4,9.0,0.9
11 | 1,6,HuniePop,3.0,1.0
--------------------------------------------------------------------------------
/spark-mllib/src/main/scala/cn/edu/nju/DataProcessing.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju
2 |
3 | import java.util
4 |
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.sql.{SaveMode, SparkSession}
7 |
8 | import scala.util.Random
9 |
10 | /**
11 | * Created by thpffcj on 2019/11/18.
12 | */
13 | object DataProcessing {
14 |
15 | def main(args: Array[String]): Unit = {
16 |
17 | getStreamRating()
18 | }
19 |
20 | def getStreamRating() = {
21 |
22 | val gameMap = new util.HashMap[String, Int]()
23 | // Number of times each game appears
24 | val gameNumber = new util.HashMap[String, Int]()
25 | val maxTimeMap = new util.HashMap[String, Double]
26 |
27 | val conf = new SparkConf().setMaster("local").setAppName("DataProcessing")
28 | val spark = SparkSession.builder().config(conf).getOrCreate()
29 |
30 | var data = spark.read.format("csv")
31 | .option("header", "true")
32 | .option("inferSchema", "true")
33 | .load("src/main/resources/steam.csv")
34 | .select("userId", "gameName", "behavior", "duration", "gameId")
35 |
36 | data = data.filter(row => row.getAs("behavior").equals("play"))
37 |
38 | var key = 1
39 | data.collect().foreach(row => {
40 |
41 | val gameName = row.getAs("gameName").toString
42 | val duration = row.getAs("duration").toString.toDouble
43 |
44 | if (!gameMap.containsKey(gameName)) {
45 | gameMap.put(gameName, key)
46 | key = key + 1
47 | }
48 |
49 | if (gameNumber.containsKey(gameName)) {
50 | gameNumber.put(gameName, gameNumber.get(gameName) + 1)
51 | } else {
52 | gameNumber.put(gameName, 1)
53 | }
54 |
55 | if (maxTimeMap.containsKey(gameName)) {
56 | if (duration > maxTimeMap.get(gameName)) {
57 | maxTimeMap.put(gameName, duration)
58 | }
59 | } else {
60 | maxTimeMap.put(gameName, duration)
61 | }
62 |
63 | })
64 |
65 | import spark.implicits._
66 | val rand = new Random()
67 | val cleanData = data.filter(row => {
68 | gameNumber.get(row.getAs("gameName").toString) > 2
69 | }).map(row => {
70 |
71 | val userId = row.getAs("userId").toString
72 | val gameName = row.getAs("gameName").toString
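// Normalize playtime per game to an implicit 1.0-10.0 rating: duration / max playtime for that game * 10, floored at 1.0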
73 | var duration = (row.getAs("duration").toString.toDouble / maxTimeMap.get(gameName) * 10).formatted("%.2f")
74 | if (duration.toDouble < 1.0) {
75 | duration = "1.0"
76 | }
77 | val gameId = gameMap.get(gameName)
78 | val random = rand.nextDouble()
79 |
80 | (userId, gameId, gameName, duration, random)
81 | })
82 |
83 | cleanData.repartition(1).write.format("com.databricks.spark.csv")
84 | .option("header", "false")
85 | .option("delimiter", ",")
86 | .mode(SaveMode.Overwrite)
87 | .save("src/main/resources/steam_rating.csv")
88 |
89 | spark.stop()
90 | }
91 | }
92 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/scala/cn/edu/nju/EmotionAnalysis.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.ml.classification.{NaiveBayes, NaiveBayesModel}
5 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
6 | import org.apache.spark.ml.feature.{HashingTF, IDF, IDFModel}
7 | import org.apache.spark.sql.SparkSession
8 |
9 | import scala.util.Random
10 |
11 | /**
12 | * Created by thpffcj on 2019/11/20.
13 | */
14 |
15 | object EmotionAnalysis {
16 |
17 | def main(args: Array[String]): Unit = {
18 | test()
19 | }
20 |
21 | /**
22 | * (31806,32974,64780)
23 | * accuracy is 0.6932404540763674
24 | */
25 | def train() = {
26 |
27 | val conf = new SparkConf().setMaster("local").setAppName("EmotionAnalysis")
28 | val spark = SparkSession.builder().config(conf).getOrCreate()
29 | // Log level
30 | spark.sparkContext.setLogLevel("WARN")
31 |
32 | val rand = new Random()
33 |
34 | import spark.implicits._
35 | // Data preprocessing
36 | val neg = spark.read.textFile("src/main/resources/neg.txt").map(line => {
37 | // Tokenize
38 | (line.split(" ").filter(!_.equals(" ")), 0, rand.nextDouble())
39 | }).toDF("words", "value", "random")
40 |
41 | val pos = spark.read.textFile("src/main/resources/pos.txt").map(line => {
42 | (line.split(" ").filter(!_.equals(" ")), 1, rand.nextDouble())
43 | }).toDF("words", "value", "random") // Thought: how could the inner mapping function be extracted and reused here?
44 |
45 | // Merge and shuffle
46 | val data = neg.union(pos).sort("random")
47 | println(neg.count(), pos.count(), data.count()) // merged counts
48 |
49 | // Text feature extraction (TF-IDF)
50 | val hashingTf = new HashingTF()
51 | .setInputCol("words")
52 | .setOutputCol("hashing")
53 | .transform(data)
54 |
55 | val idfModel = new IDF()
56 | .setInputCol("hashing")
57 | .setOutputCol("tfidf")
58 | .fit(hashingTf)
59 |
60 | val transformedData = idfModel.transform(hashingTf)
61 | val Array(training, test) = transformedData
62 | .randomSplit(Array(0.7, 0.3))
63 |
64 | // Classify using the extracted text features; this is a binary classification problem
65 | // The classifier itself is interchangeable
66 | val bayes = new NaiveBayes()
67 | .setFeaturesCol("tfidf") // X
68 | .setLabelCol("value") // y: 0 = negative, 1 = positive
69 | .fit(training)
70 |
71 | // Evaluate on the held-out test split
72 | val result = bayes.transform(test)
73 | // result.show(false)
74 |
75 | // Evaluate the model's accuracy
76 | val evaluator = new MulticlassClassificationEvaluator()
77 | .setLabelCol("value")
78 | .setPredictionCol("prediction")
79 | .setMetricName("accuracy")
80 |
81 | val accuracy = evaluator.evaluate(result)
82 | println(s"""accuracy is $accuracy""")
83 |
84 | // idfModel.save("src/main/resources/model/IDFModel.model")
85 | // bayes.save("src/main/resources/model/content_emotion.model")
86 |
87 | // Refactoring ideas:
88 | // - try rebuilding this code with an ML Pipeline (a commented sketch follows below)
89 | // - try using the model to predict the sentiment of an arbitrary sentence, e.g.:
90 | //   You are a bad girl, I hate you. ^_^
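// A minimal, illustrative sketch of that Pipeline refactor (not part of the original code;
// it assumes the same "words"/"value" columns and that the raw data is split before feature extraction):
// import org.apache.spark.ml.Pipeline
// val Array(rawTrain, rawTest) = data.randomSplit(Array(0.7, 0.3))
// val pipeline = new Pipeline().setStages(Array(
//   new HashingTF().setInputCol("words").setOutputCol("hashing"),
//   new IDF().setInputCol("hashing").setOutputCol("tfidf"),
//   new NaiveBayes().setFeaturesCol("tfidf").setLabelCol("value")
// ))
// val pipelineModel = pipeline.fit(rawTrain)
// pipelineModel.transform(rawTest).show()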
91 |
92 | spark.stop()
93 | }
94 |
95 | def test() = {
96 |
97 | val conf = new SparkConf().setMaster("local").setAppName("EmotionAnalysis")
98 | val spark = SparkSession.builder().config(conf).getOrCreate()
99 |
100 | import spark.implicits._
101 | val content = spark.read.textFile("src/main/resources/game_content.txt").map(line => {
102 | (line.split(" ").filter(!_.equals(" ")))
103 | }).toDF("words")
104 |
105 | // Text feature extraction (TF-IDF)
106 | val hashingTf = new HashingTF()
107 | .setInputCol("words")
108 | .setOutputCol("hashing")
109 | .transform(content)
110 |
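// Assumes the IDF and Naive Bayes models saved by train() exist; the corresponding save(...) calls there are currently commented out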
111 | val idfModel = IDFModel.load("src/main/resources/model/IDFModel.model")
112 |
113 | val transformedData = idfModel.transform(hashingTf)
114 |
115 | val bayes = NaiveBayesModel.load("src/main/resources/model/content_emotion.model")
116 |
117 | val result = bayes.transform(transformedData)
118 | result.show()
119 |
120 | spark.stop()
121 | }
122 | }
123 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/scala/cn/edu/nju/SteamGameRecommendation.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.ml.evaluation.RegressionEvaluator
5 | import org.apache.spark.ml.recommendation.{ALS, ALSModel}
6 | import org.apache.spark.sql.SparkSession
7 |
8 | /**
9 | * Created by thpffcj on 2019/11/16.
10 | */
11 | object SteamGameRecommendation {
12 |
13 | def main(args: Array[String]): Unit = {
14 |
15 | test()
16 | }
17 |
18 | def train() = {
19 |
20 | val conf = new SparkConf().setMaster("local[4]").setAppName("SteamGameRecommendation")
21 | val spark = SparkSession.builder().config(conf).getOrCreate()
22 | spark.sparkContext.setLogLevel("WARN")
23 |
24 | val data = spark.read.format("csv")
25 | .option("header", "true")
26 | .option("inferSchema", "true")
27 | .load("src/main/resources/steam_rating.csv")
28 | .select("userId", "gameId", "gameName", "rating", "random")
29 | .sort("random")
30 | .select("userId", "gameId", "rating")
31 |
32 | val Array(train, test) = data.randomSplit(Array(0.7, 0.3))
33 |
34 | val als = new ALS()
35 | .setMaxIter(20)
36 | .setUserCol("userId")
37 | .setItemCol("gameId")
38 | .setRatingCol("rating")
39 | // Regularization parameter
40 | .setRegParam(0.01)
41 |
42 | val model = als.fit(train)
43 |
44 | // Cold-start strategy
45 | model.setColdStartStrategy("drop")
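// "drop" discards prediction rows for users/games unseen during training, so NaN predictions do not distort the RMSE below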
46 |
47 | val predictions = model.transform(test)
48 | // Predict the rating for each (userId, gameId) pair
49 | predictions.show(false)
50 |
51 | // Model evaluation
52 | val evaluator = new RegressionEvaluator()
53 | .setMetricName("rmse")
54 | .setLabelCol("rating")
55 | .setPredictionCol("prediction")
56 |
57 | val rmse = evaluator.evaluate(predictions)
58 | println(s"Root-mean-square error is $rmse \n")
59 |
60 | // Persisting a Spark ML model
61 | // Save the model
62 | model.save("src/main/resources/model/game_recommendation.model")
63 |
64 | spark.stop()
65 | }
66 |
67 | def test() = {
68 |
69 | val conf = new SparkConf().setMaster("local").setAppName("SteamGameRecommendation")
70 | val spark = SparkSession.builder().config(conf).getOrCreate()
71 | spark.sparkContext.setLogLevel("WARN")
72 |
73 | // Load the model
74 | val model = ALSModel.load("src/main/resources/model/game_recommendation.model")
75 |
76 | import spark.implicits._
77 | val users = spark.createDataset(Array(1)).toDF("userId")
78 | users.show(false)
79 |
80 | model.recommendForUserSubset(users, 20).show(false)
81 |
82 | spark.stop()
83 | }
84 | }
85 |
--------------------------------------------------------------------------------
/spark-streaming/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 | 4.0.0
6 | cn.edu.nju
7 | spark-streaming
8 | 1.0
9 |
10 | org.springframework.boot
11 | spring-boot-starter-parent
12 | 2.2.0.RELEASE
13 |
14 |
15 |
16 | 2.11.8
17 | 2.2.0
18 | 2.4.0
19 | 2.6.0-cdh5.16.2
20 | 1.2.0-cdh5.16.2
21 | 1.8
22 | 1.8
23 |
24 |
25 |
26 |
27 |
28 | cloudera
29 | https://repository.cloudera.com/artifactory/cloudera-repos
30 |
31 |
32 | central
33 | aliyun maven
34 | http://maven.aliyun.com/nexus/content/groups/public/
35 | default
36 |
37 |
38 |
39 |
40 |
41 | org.springframework.boot
42 | spring-boot-starter-web
43 |
44 |
45 | ch.qos.logback
46 | logback-classic
47 |
48 |
49 |
50 |
51 | org.springframework.boot
52 | spring-boot-starter-websocket
53 |
54 |
55 | org.springframework.session
56 | spring-session-core
57 |
58 |
59 | cn.hutool
60 | hutool-log
61 | 4.1.1
62 |
63 |
64 |
65 | org.springframework.boot
66 | spring-boot-starter-test
67 | test
68 |
69 |
70 | org.junit.vintage
71 | junit-vintage-engine
72 |
73 |
74 |
75 |
76 |
77 | org.projectlombok
78 | lombok
79 | 1.16.18
80 |
81 |
82 |
83 | org.scala-lang
84 | scala-library
85 | ${scala.version}
86 |
87 |
88 |
89 | org.mongodb.spark
90 | mongo-spark-connector_2.11
91 | ${spark.version}
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 | org.apache.hadoop
104 | hadoop-client
105 | ${hadoop.version}
106 |
107 |
108 |
109 |
110 | org.apache.hbase
111 | hbase-client
112 | ${hbase.version}
113 |
114 |
115 |
116 | org.apache.hbase
117 | hbase-server
118 | ${hbase.version}
119 |
120 |
121 |
122 |
123 | org.apache.spark
124 | spark-streaming_2.11
125 | ${spark.version}
126 |
127 |
128 |
129 | org.apache.spark
130 | spark-graphx_2.11
131 | ${spark.version}
132 |
133 |
134 |
135 | org.apache.spark
136 | spark-streaming-kafka-0-10_2.11
137 | ${spark.version}
138 |
139 |
140 |
141 | org.apache.commons
142 | commons-lang3
143 | 3.5
144 |
145 |
146 |
147 |
148 | org.apache.spark
149 | spark-sql_2.11
150 | ${spark.version}
151 |
152 |
153 |
154 | com.fasterxml.jackson.module
155 | jackson-module-scala_2.11
156 | 2.6.5
157 |
158 |
159 |
160 | org.elasticsearch
161 | elasticsearch-spark-20_2.11
162 | 6.5.4
163 |
164 |
165 |
166 | com.alibaba
167 | fastjson
168 | 1.2.47
169 |
170 |
171 |
172 | com.fasterxml.jackson.core
173 | jackson-databind
174 | 2.9.10.1
175 |
176 |
177 |
178 | net.jpountz.lz4
179 | lz4
180 | 1.3.0
181 |
182 |
183 |
184 | org.codehaus.janino
185 | janino
186 | 3.0.8
187 |
188 |
189 |
190 | mysql
191 | mysql-connector-java
192 | 5.1.38
193 |
194 |
195 |
196 | io.netty
197 | netty-all
198 | 4.1.42.Final
199 |
200 |
201 |
202 | com.mchange
203 | c3p0
204 | 0.9.5.2
205 |
206 |
207 |
208 |
209 |
210 |
214 |
215 |
216 | org.scala-tools
217 | maven-scala-plugin
218 |
219 |
220 |
221 | compile
222 | testCompile
223 |
224 |
225 |
226 |
227 | ${scala.version}
228 |
229 | -target:jvm-1.8
230 |
231 |
232 |
233 |
234 | org.apache.maven.plugins
235 | maven-eclipse-plugin
236 |
237 | true
238 |
239 | ch.epfl.lamp.sdt.core.scalabuilder
240 |
241 |
242 | ch.epfl.lamp.sdt.core.scalanature
243 |
244 |
245 | org.eclipse.jdt.launching.JRE_CONTAINER
246 | ch.epfl.lamp.sdt.launching.SCALA_CONTAINER
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 | org.scala-tools
256 | maven-scala-plugin
257 |
258 | ${scala.version}
259 |
260 |
261 |
262 |
263 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/java/cn/edu/nju/ApiReturnUtil.java:
--------------------------------------------------------------------------------
1 | package cn.edu.nju;
2 |
3 | import cn.edu.nju.api.ApiReturnObject;
4 | import cn.hutool.log.Log;
5 | import cn.hutool.log.LogFactory;
6 |
7 | public class ApiReturnUtil {
8 |
9 | static Log log = LogFactory.get(WebSocketServer.class);
10 |
11 | public static ApiReturnObject error(String s) {
12 | log.error(s);
13 | return new ApiReturnObject(null);
14 | }
15 |
16 | public static ApiReturnObject success(String cid) {
17 | log.info("success:" + cid);
18 | return new ApiReturnObject(null);
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/java/cn/edu/nju/DemoMessageController.java:
--------------------------------------------------------------------------------
1 | package cn.edu.nju;
2 |
3 | import cn.edu.nju.api.ApiReturnObject;
4 | import org.springframework.stereotype.Controller;
5 | import org.springframework.web.bind.annotation.GetMapping;
6 | import org.springframework.web.bind.annotation.PathVariable;
7 | import org.springframework.web.bind.annotation.RequestMapping;
8 | import org.springframework.web.bind.annotation.ResponseBody;
9 |
10 | import java.io.IOException;
11 |
12 | @Controller
13 | @RequestMapping("/websocket")
14 | public class DemoMessageController {
15 |
16 | // Page request
17 | @GetMapping("/")
18 | public String index() {
19 | return "index";
20 | }
21 | // Endpoint for pushing data
22 | @ResponseBody
23 | @RequestMapping("/socket/push/{cid}")
24 | public ApiReturnObject pushToWeb(@PathVariable String cid, String message) {
25 | try {
26 | WebSocketServer.sendInfo(message,cid);
27 | } catch (IOException e) {
28 | e.printStackTrace();
29 | return ApiReturnUtil.error(cid+"#"+e.getMessage());
30 | }
31 | return ApiReturnUtil.success(cid);
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/java/cn/edu/nju/SteamserverdemoApplication.java:
--------------------------------------------------------------------------------
1 | package cn.edu.nju;
2 |
3 | import org.springframework.boot.SpringApplication;
4 | import org.springframework.boot.autoconfigure.SpringBootApplication;
5 |
6 | @SpringBootApplication
7 | public class SteamserverdemoApplication {
8 |
9 | public static void main(String[] args) {
10 | SpringApplication.run(SteamserverdemoApplication.class, args);
11 | }
12 |
13 | }
14 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/java/cn/edu/nju/Test.java:
--------------------------------------------------------------------------------
1 | package cn.edu.nju;
2 |
3 | import cn.edu.nju.api.ApiReturnObject;
4 | import cn.edu.nju.api.TagReturnObject;
5 | import cn.edu.nju.utils.DateUtils;
6 |
7 | /**
8 | * Created by thpffcj on 2019/10/24.
9 | */
10 | public class Test {
11 |
12 | public static void main(String[] args) {
13 |
14 | MySQLProcess mySQLProcess = new MySQLProcess();
15 |
16 | ApiReturnObject apiReturnObject = mySQLProcess.getTimeFieldData(DateUtils.getSteamDates());
17 |
18 | TagReturnObject tagReturnObject = mySQLProcess.getTagData(2);
19 |
20 | System.out.println("hello");
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/java/cn/edu/nju/TimeFieldObject.java:
--------------------------------------------------------------------------------
1 | package cn.edu.nju;
2 |
3 | import cn.edu.nju.domain.GameObject;
4 | import org.springframework.beans.factory.annotation.Autowired;
5 |
6 | import java.io.Serializable;
7 | import java.util.ArrayList;
8 |
9 | public class TimeFieldObject implements Serializable{
10 |
11 | @Autowired
12 | private String name;
13 |
14 | @Autowired
15 | private ArrayList<GameObject> values;
16 |
17 | public TimeFieldObject(String name, ArrayList<GameObject> values) {
18 | this.name = name;
19 | this.values = values;
20 | }
21 |
22 | public String getName() {
23 | return name;
24 | }
25 |
26 | public void setName(String name) {
27 | this.name = name;
28 | }
29 |
30 | public ArrayList<GameObject> getValues() {
31 | return values;
32 | }
33 |
34 | public void setValues(ArrayList<GameObject> values) {
35 | this.values = values;
36 | }
37 |
38 | }
39 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/java/cn/edu/nju/WebSocketConfig.java:
--------------------------------------------------------------------------------
1 | package cn.edu.nju;
2 |
3 | import org.springframework.context.annotation.Bean;
4 | import org.springframework.context.annotation.Configuration;
5 | import org.springframework.web.socket.server.standard.ServerEndpointExporter;
6 |
7 | @Configuration
8 | public class WebSocketConfig {
9 |
10 | @Bean
11 | public ServerEndpointExporter serverEndpointExporter(){
12 | return new ServerEndpointExporter();
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/java/cn/edu/nju/WebSocketServer.java:
--------------------------------------------------------------------------------
1 | package cn.edu.nju;
2 |
3 | import java.io.IOException;
4 | import java.util.ArrayList;
5 | import java.util.concurrent.CopyOnWriteArraySet;
6 |
7 | import javax.websocket.*;
8 | import javax.websocket.server.PathParam;
9 | import javax.websocket.server.ServerEndpoint;
10 |
11 | import cn.edu.nju.api.ApiReturnObject;
12 | import cn.edu.nju.api.TagReturnObject;
13 | import cn.edu.nju.domain.GameObject;
14 | import cn.edu.nju.encoder.ApiObjectEncoder;
15 | import cn.edu.nju.utils.DateUtils;
16 | import org.springframework.stereotype.Component;
17 | import cn.hutool.log.Log;
18 | import cn.hutool.log.LogFactory;
19 |
20 | @ServerEndpoint(value = "/websocket/{sid}", encoders = {ApiObjectEncoder.class})
21 | @Component
22 | public class WebSocketServer {
23 |
24 | static Log log = LogFactory.get(WebSocketServer.class);
25 |
26 | // Static variable recording the current number of online connections. It should be made thread-safe.
27 | private static int onlineCount = 0;
28 |
29 | // Thread-safe Set from the concurrent package, used to store the MyWebSocket object for each client.
30 | private static CopyOnWriteArraySet<WebSocketServer> webSocketSet = new CopyOnWriteArraySet<>();
31 |
32 | // The connection session with a specific client; data is sent to the client through it
33 | private Session session;
34 |
35 | // The received sid
36 | private String sid = "";
37 |
38 | /**
39 | * Method called when a connection is successfully established
40 | */
41 | @OnOpen
42 | public void onOpen(Session session, @PathParam("sid") String sid) {
43 | this.session = session;
44 | webSocketSet.add(this); // add this connection to the set
45 | addOnlineCount(); // increment the online count
46 | log.info("有新窗口开始监听:" + sid + ",当前在线人数为" + getOnlineCount());
47 | this.sid = sid;
48 | // GameObject gameObject1 = new GameObject("edge", "just so so", 2200, "blue");
49 | // GameObject gameObject2 = new GameObject("fire fox", "good", 900, "green");
50 | // GameObject gameObject3 = new GameObject("chrome", "excellent", 3800, "red");
51 | // GameObject gameObject4 = new GameObject("edge", "just so so", 1500, "blue");
52 | // GameObject gameObject5 = new GameObject("fire fox", "good", 1900, "green");
53 | // GameObject gameObject6 = new GameObject("chrome", "excellent", 2800, "red");
54 | // GameObject gameObject7 = new GameObject("edge", "just so so", 2600, "blue");
55 | // GameObject gameObject8 = new GameObject("fire fox", "good", 2200, "green");
56 | // GameObject gameObject9 = new GameObject("chrome", "excellent", 1800, "red");
57 | // ArrayList gameObjects1 = new ArrayList<>();
58 | // ArrayList gameObjects2 = new ArrayList<>();
59 | // ArrayList gameObjects3 = new ArrayList<>();
60 | // gameObjects1.add(gameObject1);
61 | // gameObjects1.add(gameObject2);
62 | // gameObjects1.add(gameObject3);
63 | // gameObjects2.add(gameObject4);
64 | // gameObjects2.add(gameObject5);
65 | // gameObjects2.add(gameObject6);
66 | // gameObjects3.add(gameObject7);
67 | // gameObjects3.add(gameObject8);
68 | // gameObjects3.add(gameObject9);
69 | // TimeFieldObject timeFieldObject1 = new TimeFieldObject("2017", gameObjects1);
70 | // TimeFieldObject timeFieldObject2 = new TimeFieldObject("2018", gameObjects2);
71 | // TimeFieldObject timeFieldObject3 = new TimeFieldObject("2019", gameObjects3);
72 | // ArrayList timeFieldObjects = new ArrayList<>();
73 | // timeFieldObjects.add(timeFieldObject1);
74 | // timeFieldObjects.add(timeFieldObject2);
75 | // timeFieldObjects.add(timeFieldObject3);
76 | MySQLProcess mySQLProcess = new MySQLProcess();
77 | ApiReturnObject apiReturnObject = mySQLProcess.getTimeFieldData(DateUtils.getSteamDates());
78 | try {
79 | sendData(apiReturnObject);
80 | for (int i = 1; i <= 7; i++) {
81 | TagReturnObject tagReturnObject = mySQLProcess.getTagData(i);
82 | sendTagData(tagReturnObject);
83 | Thread.sleep(5000);
84 | }
85 | } catch (IOException | EncodeException | InterruptedException e) {
86 | log.error("websocket IO异常"+e.getMessage());
87 | }
88 | }
89 |
90 | /**
91 | * Method called when a connection is closed
92 | */
93 | @OnClose
94 | public void onClose() {
95 | webSocketSet.remove(this); // remove from the set
96 | subOnlineCount(); // decrement the online count
97 | log.info("有一连接关闭!当前在线人数为" + getOnlineCount());
98 | }
99 |
100 | /**
101 | * Method called after a message is received from a client
102 | *
103 | * @param message the message sent by the client
104 | */
105 | @OnMessage
106 | public void onMessage(String message, Session session) {
107 | log.info("收到来自窗口" + sid + "的信息:" + message);
108 | // Broadcast the message to all connected clients
109 | for (WebSocketServer item : webSocketSet) {
110 | try {
111 | item.sendMessage(message);
112 | } catch (IOException e) {
113 | e.printStackTrace();
114 | }
115 | }
116 | }
117 |
118 | /**
119 | * @param session
120 | * @param error
121 | */
122 | @OnError
123 | public void onError(Session session, Throwable error) {
124 | log.error("发生错误");
125 | error.printStackTrace();
126 | }
127 |
128 | /**
129 | * Implements server-initiated push
130 | */
131 | public void sendMessage(String message) throws IOException {
132 | this.session.getBasicRemote().sendText(message);
133 | }
134 |
135 | /**
136 | * Implements server-initiated push
137 | */
138 | public void sendData(ApiReturnObject data) throws IOException, EncodeException {
139 | this.session.getBasicRemote().sendObject(data);
140 | }
141 |
142 | public void sendTagData(TagReturnObject data) throws IOException, EncodeException {
143 | this.session.getBasicRemote().sendObject(data);
144 | }
145 |
146 | /**
147 | * Broadcast a custom message to clients
148 | */
149 | public static void sendInfo(String message, @PathParam("sid") String sid) throws IOException {
150 | log.info("推送消息到窗口" + sid + ",推送内容:" + message);
151 | for (WebSocketServer item : webSocketSet) {
152 | try {
153 | // Can be restricted to push only to this sid; if sid is null, push to all clients
154 | if (sid == null) {
155 | item.sendMessage(message);
156 | } else if (item.sid.equals(sid)) {
157 | item.sendMessage(message);
158 | }
159 | } catch (IOException e) {
160 | continue;
161 | }
162 | }
163 | }
164 |
165 | public static synchronized int getOnlineCount() {
166 | return onlineCount;
167 | }
168 |
169 | public static synchronized void addOnlineCount() {
170 | WebSocketServer.onlineCount++;
171 | }
172 |
173 | public static synchronized void subOnlineCount() {
174 | WebSocketServer.onlineCount--;
175 | }
176 | }
177 |
178 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/java/cn/edu/nju/api/ApiReturnObject.java:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.api;
2 |
3 | import cn.edu.nju.TimeFieldObject;
4 |
5 | import java.io.Serializable;
6 | import java.util.ArrayList;
7 |
8 | public class ApiReturnObject implements Serializable {
9 |
10 | private ArrayList<TimeFieldObject> timeFieldObjects;
11 |
12 | public ApiReturnObject(ArrayList<TimeFieldObject> timeFieldObjects) {
13 | this.timeFieldObjects = timeFieldObjects;
14 | }
15 |
16 | public ArrayList<TimeFieldObject> getTimeFieldObjects() {
17 | return timeFieldObjects;
18 | }
19 |
20 | public void setTimeFieldObjects(ArrayList<TimeFieldObject> timeFieldObjects) {
21 | this.timeFieldObjects = timeFieldObjects;
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/java/cn/edu/nju/api/TagReturnObject.java:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.api;
2 |
3 | import cn.edu.nju.domain.TagObject;
4 | import lombok.AllArgsConstructor;
5 | import lombok.Data;
6 | import lombok.NoArgsConstructor;
7 |
8 | import java.io.Serializable;
9 | import java.util.ArrayList;
10 |
11 | /**
12 | * Created by thpffcj on 2019/10/25.
13 | */
14 | public class TagReturnObject implements Serializable {
15 |
16 | private ArrayList<TagObject> tagObjects;
17 |
18 | public TagReturnObject(ArrayList<TagObject> tagObjects) {
19 | this.tagObjects = tagObjects;
20 | }
21 |
22 | public ArrayList<TagObject> getTagObjects() {
23 | return tagObjects;
24 | }
25 |
26 | public void setTagObjects(ArrayList<TagObject> tagObjects) {
27 | this.tagObjects = tagObjects;
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/java/cn/edu/nju/domain/GameObject.java:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.domain;
2 |
3 | import org.springframework.beans.factory.annotation.Autowired;
4 |
5 | import java.io.Serializable;
6 |
7 | public class GameObject implements Serializable {
8 |
9 | @Autowired
10 | private String id;
11 |
12 | @Autowired
13 | private String label;
14 |
15 | @Autowired
16 | private int value;
17 |
18 | @Autowired
19 | private String color;
20 |
21 | public GameObject(String id, String label, int value, String color) {
22 | this.id = id;
23 | this.label = label;
24 | this.value = value;
25 | this.color = color;
26 | }
27 |
28 | public String getId() {
29 | return id;
30 | }
31 |
32 | public void setId(String id) {
33 | this.id = id;
34 | }
35 |
36 | public String getLabel() {
37 | return label;
38 | }
39 |
40 | public void setLabel(String label) {
41 | this.label = label;
42 | }
43 |
44 | public int getValue() {
45 | return value;
46 | }
47 |
48 | public void setValue(int value) {
49 | this.value = value;
50 | }
51 |
52 | public String getColor() {
53 | return color;
54 | }
55 |
56 | public void setColor(String color) {
57 | this.color = color;
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/java/cn/edu/nju/domain/TagObject.java:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.domain;
2 |
3 | import lombok.AllArgsConstructor;
4 | import lombok.Data;
5 | import lombok.NoArgsConstructor;
6 | import org.springframework.beans.factory.annotation.Autowired;
7 |
8 | import java.io.Serializable;
9 |
10 | /**
11 | * Created by thpffcj on 2019/10/25.
12 | */
13 | public class TagObject implements Serializable {
14 |
15 | @Autowired
16 | private String label;
17 |
18 | @Autowired
19 | private int value;
20 |
21 | public TagObject() {
22 | }
23 |
24 | public TagObject(String label, int value) {
25 | this.label = label;
26 | this.value = value;
27 | }
28 |
29 | public String getLabel() {
30 | return label;
31 | }
32 |
33 | public void setLabel(String label) {
34 | this.label = label;
35 | }
36 |
37 | public int getValue() {
38 | return value;
39 | }
40 |
41 | public void setValue(int value) {
42 | this.value = value;
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/java/cn/edu/nju/encoder/ApiObjectEncoder.java:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.encoder;
2 |
3 | import javax.websocket.EncodeException;
4 | import javax.websocket.Encoder;
5 | import javax.websocket.EndpointConfig;
6 |
7 | import cn.edu.nju.api.ApiReturnObject;
8 | import com.alibaba.fastjson.JSON;
9 | import com.alibaba.fastjson.serializer.SerializerFeature;
10 | import com.alibaba.fastjson.serializer.SimplePropertyPreFilter;
11 |
12 | public class ApiObjectEncoder implements Encoder.Text<ApiReturnObject> {
13 |
14 | @Override
15 | public String encode(ApiReturnObject apiReturnObject) throws EncodeException {
16 | SimplePropertyPreFilter filter = new SimplePropertyPreFilter(
17 | ApiReturnObject.class, "timeFieldObjects");
18 | return JSON.toJSONString(apiReturnObject,filter,SerializerFeature.DisableCircularReferenceDetect);
19 | }
20 |
21 | @Override
22 | public void init(EndpointConfig endpointConfig) {
23 |
24 | }
25 |
26 | @Override
27 | public void destroy() {
28 |
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/java/cn/edu/nju/encoder/TagObjectEncoder.java:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.encoder;
2 |
3 | import cn.edu.nju.api.TagReturnObject;
4 | import com.alibaba.fastjson.JSON;
5 | import com.alibaba.fastjson.serializer.SerializerFeature;
6 | import com.alibaba.fastjson.serializer.SimplePropertyPreFilter;
7 |
8 | import javax.websocket.EncodeException;
9 | import javax.websocket.Encoder;
10 | import javax.websocket.EndpointConfig;
11 |
12 | /**
13 | * Created by thpffcj on 2019/10/26.
14 | */
15 | public class TagObjectEncoder implements Encoder.Text<TagReturnObject> {
16 |
17 | @Override
18 | public String encode(TagReturnObject tagReturnObject) throws EncodeException {
19 | SimplePropertyPreFilter filter = new SimplePropertyPreFilter(
20 | TagReturnObject.class, "tagObjects");
21 | return JSON.toJSONString(tagReturnObject,filter, SerializerFeature.DisableCircularReferenceDetect);
22 | }
23 |
24 | @Override
25 | public void init(EndpointConfig endpointConfig) {
26 |
27 | }
28 |
29 | @Override
30 | public void destroy() {
31 |
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/java/cn/edu/nju/utils/DbPool.java:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.utils;
2 |
3 | import cn.edu.nju.domain.TagObject;
4 | import com.mchange.v2.c3p0.ComboPooledDataSource;
5 |
6 | import java.sql.Connection;
7 | import java.sql.SQLException;
8 | import java.util.ArrayList;
9 | import java.sql.PreparedStatement;
10 | import java.sql.ResultSet;
11 | import java.util.List;
12 |
13 | /**
14 | * Created by thpffcj on 2020/2/26.
15 | */
16 | public class DbPool {
17 |
18 | private static DbPool instance;
19 |
20 | private ComboPooledDataSource ds;
21 |
22 | private DbPool() throws Exception {
23 | ds = new ComboPooledDataSource();
24 | ds.setDriverClass("oracle.jdbc.driver.OracleDriver"); // driver
25 | ds.setJdbcUrl("jdbc:oracle:thin:@localhost:1521:orcl"); // JDBC URL
26 | ds.setUser("test0816"); // database user
27 | ds.setPassword("934617699"); // database password
28 |
29 | // Number of connections acquired at initialization; should be between minPoolSize and maxPoolSize. Default: 5 (initialPoolSize)
30 | ds.setInitialPoolSize(5);
31 | // Maximum number of connections kept in the pool. Default: 20 (maxPoolSize)
32 | ds.setMaxPoolSize(20);
33 | // Minimum number of connections kept in the pool.
34 | ds.setMinPoolSize(1);
35 | // Number of connections c3p0 acquires at a time when the pool is exhausted. Default: 5 (acquireIncrement)
36 | ds.setAcquireIncrement(10);
37 | }
38 |
39 | // Returns the singleton instance
40 | public static final DbPool getInstance() {
41 |
42 | if (instance == null) {
43 | try {
44 | instance = new DbPool();
45 | } catch (Exception e) {
46 | e.printStackTrace();
47 | }
48 | }
49 | return instance;
50 | }
51 |
52 | // Returns a connection from the pool
53 | public synchronized final Connection getConnection() {
54 | try {
55 | return ds.getConnection();
56 | } catch (SQLException e) {
57 | e.printStackTrace();
58 | }
59 | return null;
60 | }
61 |
62 | public static void main(String[] args) {
63 | DbPool dbPool = DbPool.getInstance() ;
64 |
65 | List<TagObject> list = new ArrayList<>();
66 |
67 | Connection connection = dbPool.getConnection();
68 | String sql = "select * from person " ;
69 |
70 | try {
71 | PreparedStatement pt = connection.prepareStatement(sql) ;
72 | ResultSet rt = pt.executeQuery() ;
73 |
74 | while(rt.next()) {
75 | TagObject tag = new TagObject();
76 | tag.setLabel(rt.getString("label"));
77 | tag.setValue(rt.getInt("value"));
78 | list.add(tag) ;
79 | }
80 |
81 | for(TagObject tag : list) {
82 | System.out.println(tag);
83 | }
84 | } catch (SQLException e) {
85 | e.printStackTrace();
86 | }
87 | }
88 | }
89 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/java/cn/edu/nju/utils/HBaseUtils.java:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.utils;
2 |
3 | import org.apache.hadoop.conf.Configuration;
4 | import org.apache.hadoop.hbase.client.HBaseAdmin;
5 | import org.apache.hadoop.hbase.client.HTable;
6 | import org.apache.hadoop.hbase.client.Put;
7 | import org.apache.hadoop.hbase.util.Bytes;
8 |
9 | import java.io.IOException;
10 |
11 | /**
12 | * Created by thpffcj on 2019/10/17.
13 | */
14 | public class HBaseUtils {
15 |
16 | HBaseAdmin admin = null;
17 | Configuration configuration = null;
18 |
19 | /**
20 | * Private constructor
21 | */
22 | private HBaseUtils() {
23 | configuration = new Configuration();
24 | configuration.set("hbase.zookeeper.quorum", "192.168.92.130:2181");
25 | configuration.set("hbase.rootdir", "hdfs://192.168.92.130:8020/hbase");
26 |
27 | try {
28 | admin = new HBaseAdmin(configuration);
29 | } catch (IOException e) {
30 | e.printStackTrace();
31 | }
32 | }
33 |
34 | private static HBaseUtils instance = null;
35 |
36 | public static synchronized HBaseUtils getInstance() {
37 | if (null == instance) {
38 | instance = new HBaseUtils();
39 | }
40 | return instance;
41 | }
42 |
43 |
44 | /**
45 | * Get an HTable instance by table name
46 | */
47 | public HTable getTable(String tableName) {
48 |
49 | HTable table = null;
50 |
51 | try {
52 | table = new HTable(configuration, tableName);
53 | } catch (IOException e) {
54 | e.printStackTrace();
55 | }
56 |
57 | return table;
58 | }
59 |
60 | /**
61 | * Add a record to an HBase table
62 | *
63 | * @param tableName HBase table name
64 | * @param rowkey rowkey of the HBase row
65 | * @param cf HBase column family
66 | * @param column HBase column
67 | * @param value value to write to the HBase table
68 | */
69 | public void put(String tableName, String rowkey, String cf, String column, String value) {
70 | HTable table = getTable(tableName);
71 |
72 | Put put = new Put(Bytes.toBytes(rowkey));
73 | put.add(Bytes.toBytes(cf), Bytes.toBytes(column), Bytes.toBytes(value));
74 |
75 | try {
76 | table.put(put);
77 | } catch (IOException e) {
78 | e.printStackTrace();
79 | }
80 | }
81 |
82 | public static void main(String[] args) {
83 |
84 | // HTable table = HBaseUtils.getInstance().getTable("imooc_course_clickcount");
85 | // System.out.println(table.getName().getNameAsString());
86 |
87 | String tableName = "imooc_course_clickcount";
88 | String rowkey = "20171111_88";
89 | String cf = "info";
90 | String column = "click_count";
91 | String value = "2";
92 | HBaseUtils.getInstance().put(tableName, rowkey, cf, column, value);
93 | }
94 | }
95 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/java/cn/edu/nju/utils/Test.java:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.utils;
2 |
3 | import java.sql.Connection;
4 | import java.sql.DriverManager;
5 | import java.sql.ResultSet;
6 | import java.sql.Statement;
7 | import java.util.ArrayList;
8 | import java.util.List;
9 |
10 | /**
11 | * Created by thpffcj on 2019/10/28.
12 | */
13 | public class Test {
14 |
15 | public static void main(String[] args) {
16 |
17 | try {
18 | int[] time = new int[]{1483228800
19 | ,1485907200
20 | ,1488326400
21 | ,1491004800
22 | ,1493596800
23 | ,1496275200
24 | ,1498867200
25 | ,1501545600
26 | ,1504224000
27 | ,1506816000
28 | ,1509494400
29 | ,1512086400
30 | ,1514764800
31 | ,1517443200
32 | ,1519862400
33 | ,1522540800
34 | ,1525132800
35 | ,1527811200
36 | ,1530403200
37 | ,1533081600
38 | ,1535760000
39 | ,1538352000
40 | ,1541030400
41 | ,1543622400
42 | ,1546300800
43 | ,1548979200
44 | ,1551398400
45 | ,1554076800
46 | ,1556668800
47 | ,1559347200
48 | ,1561939200
49 | ,1564617600
50 | ,1567296000
51 | ,1569888000};
52 | // Load the driver class via Class.forName()
53 | Class.forName("com.mysql.jdbc.Driver");
54 | System.out.println("成功加载MySQL驱动!");
55 |
56 | String url = "jdbc:mysql://172.19.240.128:3306/steam"; // JDBC URL
57 | Connection conn;
58 |
59 | conn = DriverManager.getConnection(url, "root", "root");
60 |
61 | Statement stmt = conn.createStatement();
62 | System.out.println("成功连接到数据库!");
63 |
64 | String sql = "select distinct name from roll_up";
65 | ResultSet rs = stmt.executeQuery(sql);
66 | List<String> gameName = new ArrayList<>();
67 | while (rs.next()) {
68 | gameName.add(rs.getString(1));
69 | }
70 |
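// Roll the monthly recommendations_up values into a running total: each month is rewritten as itself plus the already-accumulated value of the previous month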
71 | for (int i = 1; i < time.length; i++) {
72 | for (int j = 0; j < gameName.size(); j++) {
73 |
74 | sql = "select recommendations_up from roll_up where name = '" + gameName.get(j) + "' and time = " + time[i];
75 | rs = stmt.executeQuery(sql);
76 | int up1 = 0;
77 | while (rs.next()) {
78 | up1 = rs.getInt(1);
79 | }
80 |
81 | sql = "select recommendations_up from roll_up where name = '" + gameName.get(j) + "' and time = " + time[i - 1];
82 | rs = stmt.executeQuery(sql);
83 | int up2 = 0;
84 | while (rs.next()) {
85 | up2 = rs.getInt(1);
86 | }
87 |
88 | System.out.println(up1 + " " + up2);
89 | int up = up1 + up2;
90 | sql = "update roll_up set recommendations_up = " + up + " where name = '" + gameName.get(j) + "' and time = " + time[i];
91 | System.out.println(sql);
92 | stmt.executeUpdate(sql);
93 | }
94 | }
95 |
96 | rs.close();
97 | stmt.close();
98 | conn.close();
99 | } catch (Exception e) {
100 | e.printStackTrace();
101 | }
102 | }
103 | }
104 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/resources/RollupCSV/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc
--------------------------------------------------------------------------------
/spark-streaming/src/main/resources/RollupCSV/.part-00000-4a85a460-3bdb-48ad-bc81-9b68057082a8-c000.csv.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/spark-streaming/src/main/resources/RollupCSV/.part-00000-4a85a460-3bdb-48ad-bc81-9b68057082a8-c000.csv.crc
--------------------------------------------------------------------------------
/spark-streaming/src/main/resources/RollupCSV/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/spark-streaming/src/main/resources/RollupCSV/_SUCCESS
--------------------------------------------------------------------------------
/spark-streaming/src/main/resources/game.json:
--------------------------------------------------------------------------------
1 | {
2 | "img_src": "https://media.st.dl.bscstorage.net/steam/apps/1085660/capsule_sm_120.jpg?t=1570039639",
3 | "game_detail": {
4 | "support_tags": [
5 | "单人",
6 | "在线多人"
7 | ],
8 | "user_reviews": {
9 | "发行商:": "Bungie",
10 | "发行日期:": "2019年10月1日",
11 | "开发商:": "Bungie"
12 | },
13 | "user_tags": [
14 | "第一人称射击",
15 | "多人"
16 | ],
17 | "reviewsChart": {
18 | "weeks": [],
19 | "rollup_type": "week",
20 | "end_date": 1571529600,
21 | "recent": [
22 | {
23 | "date": 1569888000,
24 | "recommendations_up": 5205,
25 | "recommendations_down": 1467
26 | },
27 | {
28 | "date": 1569974400,
29 | "recommendations_up": 3881,
30 | "recommendations_down": 1616
31 | }
32 | ],
33 | "rollups": [
34 | {
35 | "date": 1569888000,
36 | "recommendations_up": 16003,
37 | "recommendations_down": 6234
38 | }
39 | ],
40 | "start_date": 1569888000
41 | }
42 | },
43 | "original_price": "免费开玩",
44 | "review_summary": "多半好评
30,477 篇用户的游戏评测中有 72% 为好评。",
45 | "price": "免费开玩",
46 | "date": "2019年10月1日",
47 | "name": "Destiny 2",
48 | "page": 1,
49 | "href": "https://store.steampowered.com/app/1085660/Destiny_2/?snr=1_7_7_230_150_1"
50 | }
--------------------------------------------------------------------------------
/spark-streaming/src/main/resources/gameAll.json:
--------------------------------------------------------------------------------
1 | {
2 | "page": 1,
3 | "date": "2019年9月26日",
4 | "href": "https://store.steampowered.com/app/678960/CODE_VEIN/?snr=1_7_7_230_150_1",
5 | "review_summary": "特别好评
4,945 篇用户的游戏评测中有 84% 为好评。",
6 | "img_src": "https://media.st.dl.bscstorage.net/steam/apps/678960/capsule_sm_120.jpg?t=1570113292",
7 | "name": "CODE VEIN",
8 | "game_detail": {
9 | "user_reviews": {
10 | "开发商:": "BANDAI NAMCO Studios",
11 | "发行商:": "BANDAI NAMCO Entertainment",
12 | "发行日期:": "2019年9月26日"
13 | },
14 | "support_tags": [
15 | "单人",
16 | "在线合作",
17 | "Steam 成就",
18 | "Steam 集换式卡牌",
19 | "部分支持控制器",
20 | "Steam 云"
21 | ],
22 | "user_tags": [
23 | "动漫",
24 | "角色定制",
25 | "类魂系列",
26 | "角色扮演",
27 | "动作",
28 | "合作",
29 | "日系角色扮演",
30 | "第三人称视角",
31 | "吸血鬼",
32 | "暴力",
33 | "多人",
34 | "黑暗奇幻",
35 | "困难",
36 | "血腥",
37 | "动作角色扮演",
38 | "单人",
39 | "末日",
40 | "砍杀",
41 | "冒险",
42 | "好评原声音轨"
43 | ],
44 | "reviewsChart": {
45 | "rollups": [
46 | {
47 | "recommendations_up": 2680,
48 | "date": 1569456000,
49 | "recommendations_down": 549
50 | },
51 | {
52 | "recommendations_up": 907,
53 | "date": 1570060800,
54 | "recommendations_down": 164
55 | },
56 | {
57 | "recommendations_up": 437,
58 | "date": 1570665600,
59 | "recommendations_down": 74
60 | },
61 | {
62 | "recommendations_up": 167,
63 | "date": 1571270400,
64 | "recommendations_down": 34
65 | }
66 | ],
67 | "weeks": [],
68 | "start_date": 1569456000,
69 | "rollup_type": "week",
70 | "end_date": 1571616000,
71 | "recent": [
72 | {
73 | "recommendations_up": 29,
74 | "date": 1569456000,
75 | "recommendations_down": 9
76 | },
77 | {
78 | "recommendations_up": 918,
79 | "date": 1569542400,
80 | "recommendations_down": 160
81 | },
82 | {
83 | "recommendations_up": 448,
84 | "date": 1569628800,
85 | "recommendations_down": 131
86 | },
87 | {
88 | "recommendations_up": 397,
89 | "date": 1569715200,
90 | "recommendations_down": 88
91 | },
92 | {
93 | "recommendations_up": 374,
94 | "date": 1569801600,
95 | "recommendations_down": 77
96 | },
97 | {
98 | "recommendations_up": 344,
99 | "date": 1569888000,
100 | "recommendations_down": 48
101 | },
102 | {
103 | "recommendations_up": 170,
104 | "date": 1569974400,
105 | "recommendations_down": 36
106 | },
107 | {
108 | "recommendations_up": 197,
109 | "date": 1570060800,
110 | "recommendations_down": 35
111 | },
112 | {
113 | "recommendations_up": 136,
114 | "date": 1570147200,
115 | "recommendations_down": 36
116 | },
117 | {
118 | "recommendations_up": 151,
119 | "date": 1570233600,
120 | "recommendations_down": 37
121 | },
122 | {
123 | "recommendations_up": 131,
124 | "date": 1570320000,
125 | "recommendations_down": 23
126 | },
127 | {
128 | "recommendations_up": 121,
129 | "date": 1570406400,
130 | "recommendations_down": 17
131 | },
132 | {
133 | "recommendations_up": 94,
134 | "date": 1570492800,
135 | "recommendations_down": 8
136 | },
137 | {
138 | "recommendations_up": 77,
139 | "date": 1570579200,
140 | "recommendations_down": 8
141 | },
142 | {
143 | "recommendations_up": 68,
144 | "date": 1570665600,
145 | "recommendations_down": 11
146 | },
147 | {
148 | "recommendations_up": 62,
149 | "date": 1570752000,
150 | "recommendations_down": 21
151 | },
152 | {
153 | "recommendations_up": 68,
154 | "date": 1570838400,
155 | "recommendations_down": 11
156 | },
157 | {
158 | "recommendations_up": 79,
159 | "date": 1570924800,
160 | "recommendations_down": 12
161 | },
162 | {
163 | "recommendations_up": 67,
164 | "date": 1571011200,
165 | "recommendations_down": 5
166 | },
167 | {
168 | "recommendations_up": 54,
169 | "date": 1571097600,
170 | "recommendations_down": 9
171 | },
172 | {
173 | "recommendations_up": 39,
174 | "date": 1571184000,
175 | "recommendations_down": 5
176 | },
177 | {
178 | "recommendations_up": 44,
179 | "date": 1571270400,
180 | "recommendations_down": 7
181 | },
182 | {
183 | "recommendations_up": 32,
184 | "date": 1571356800,
185 | "recommendations_down": 8
186 | },
187 | {
188 | "recommendations_up": 43,
189 | "date": 1571443200,
190 | "recommendations_down": 11
191 | },
192 | {
193 | "recommendations_up": 41,
194 | "date": 1571529600,
195 | "recommendations_down": 7
196 | },
197 | {
198 | "recommendations_up": 7,
199 | "date": 1571616000,
200 | "recommendations_down": 1
201 | }
202 | ]
203 | }
204 | },
205 | "price": "¥ 268",
206 | "original_price": "¥ 268"
207 | }
208 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/resources/gameDetail.json:
--------------------------------------------------------------------------------
1 | {
2 | "reviewsChart": {
3 | "end_date": 1571616000,
4 | "rollups": [
5 | {
6 | "recommendations_down": 34,
7 | "date": 1571270400,
8 | "recommendations_up": 167
9 | }
10 | ],
11 | "recent": [
12 | {
13 | "recommendations_down": 1,
14 | "date": 1571616000,
15 | "recommendations_up": 7
16 | }
17 | ],
18 | "rollup_type": "week",
19 | "weeks": [],
20 | "start_date": 1569456000
21 | },
22 | "support_tags": [
23 | "单人",
24 | "在线合作",
25 | "Steam 成就"
26 | ],
27 | "user_tags": [
28 | "动漫",
29 | "砍杀",
30 | "冒险",
31 | "好评原声音轨"
32 | ],
33 | "user_reviews": {
34 | "发行日期:": "2019年9月26日",
35 | "开发商:": "BANDAI NAMCO Studios",
36 | "发行商:": "BANDAI NAMCO Entertainment"
37 | }
38 | }
--------------------------------------------------------------------------------
/spark-streaming/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Set everything to be logged to the console
2 | log4j.rootCategory=ERROR, console
3 | log4j.appender.console=org.apache.log4j.ConsoleAppender
4 | log4j.appender.console.target=System.err
5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
7 |
8 | # Set the default spark-shell log level to WARN. When running the spark-shell, the
9 | # log level for this class is used to overwrite the root logger's log level, so that
10 | # the user can have different defaults for the shell and regular Spark apps.
11 | log4j.logger.org.apache.spark.repl.Main=WARN
12 |
13 | # Settings to quiet third party logs that are too verbose
14 | log4j.logger.org.spark_project.jetty=WARN
15 | log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR
16 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
17 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
18 | log4j.logger.org.apache.parquet=ERROR
19 | log4j.logger.parquet=ERROR
20 |
21 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
22 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL
23 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR
--------------------------------------------------------------------------------
/spark-streaming/src/main/resources/test.txt:
--------------------------------------------------------------------------------
1 | 1,使命召唤1,2019-10-13,好玩1
2 | 4,使命召唤2,2019-10-14,好玩2
3 | 1,使命召唤3,2019-10-15,好玩3
4 | 2,使命召唤4,2019-10-16,好玩4
5 | 1,使命召唤5,2019-10-13,好玩5
6 | 1,使命召唤1,2019-10-15,好玩6
7 | 9999,使命召唤1,2019-10-15,好玩6
8 | https://media.st.dl.bscstorage.net/steam/apps/678960/capsule_sm_120.jpg?t=1570113292 {"reviewsChart": {"end_date": 1571616000, "rollups": [{"recommendations_down": 34, "date": 1571270400, "recommendations_up": 167}, {"recommendations_down": 34, "date": 1571270400, "recommendations_up": 167}], "recent": [{"recommendations_down": 1, "date": 1571616000, "recommendations_up": 7}], "rollup_type": "month", "weeks": [], "start_date": 1569456000}, "support_tags": ["单人", "在线合作", "Steam 成就"], "user_tags": ["动漫", "砍杀", "冒险", "好评原声音轨"], "user_reviews": {"发行日期:": "2019年9月26日", "开发商:": "BANDAI NAMCO Studios", "发行商:": "BANDAI NAMCO Entertainment"}} ¥ 268 ¥ 268 特别好评
4,945 篇用户的游戏评测中有 84% 为好评。 2019年9月26日 CODE VEIN
--------------------------------------------------------------------------------
/spark-streaming/src/main/resources/update.sql:
--------------------------------------------------------------------------------
1 | 1483228800
2 | 1485907200
3 | 1488326400
4 | 1491004800
5 | 1493596800
6 | 1496275200
7 | 1498867200
8 | 1501545600
9 | 1504224000
10 | 1506816000
11 | 1509494400
12 | 1512086400
13 | 1514764800
14 | 1517443200
15 | 1519862400
16 | 1522540800
17 | 1525132800
18 | 1527811200
19 | 1530403200
20 | 1533081600
21 | 1535760000
22 | 1538352000
23 | 1541030400
24 | 1543622400
25 | 1546300800
26 | 1548979200
27 | 1551398400
28 | 1554076800
29 | 1556668800
30 | 1559347200
31 | 1561939200
32 | 1564617600
33 | 1567296000
34 | 1569888000
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/BatchProcess.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju
2 |
3 | import java.text.SimpleDateFormat
4 | import java.util.Properties
5 |
6 | import cn.edu.nju.utils.DateUtils
7 | import org.apache.spark.broadcast.Broadcast
8 | import org.apache.spark.{SparkConf, SparkContext}
9 | import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, SparkSession}
10 |
11 | import scala.collection.mutable.ListBuffer
12 |
13 |
14 | /**
15 | * Created by thpffcj on 2019/10/19.
16 | */
17 | object BatchProcess {
18 |
19 | def main(args: Array[String]): Unit = {
20 | saveTop10ToCsv()
21 | }
22 |
23 | /**
24 |   * Read the top10 rows from MySQL and write them to a CSV file
25 | */
26 | def saveTop10ToCsv(): Unit = {
27 |
28 | val sparkConf = new SparkConf().setMaster("local").setAppName("BatchProcess")
29 | val sc = SparkSession.builder().config(sparkConf).getOrCreate()
30 |
31 | val csvSavePath = "src/main/resources/RollupCSV"
32 |
33 | val tableName = "(select name, recommendations_up, time from top10_new order by time) as top10"
34 | val data: DataFrame = readMysqlTable(sc, tableName)
35 |
36 | import sc.implicits._
37 | data.map(row => {
38 |
39 | val name = row.getAs("name").toString
40 | val types = "game"
41 | val recommendations_up = row.getAs("recommendations_up").toString
42 | val date = DateUtils.tranTimestampToString(row.getAs("time"))
43 |
44 | println((name, types, recommendations_up, date))
45 |
46 | (name, types, recommendations_up, date)
47 | }).toDF("name", "type", "value", "date").write.mode(SaveMode.Overwrite).csv(csvSavePath)
48 |
49 | sc.stop()
50 | }
51 |
52 | /**
53 |   * Compute the monthly top10 and store it in MySQL
54 | */
55 | def saveRollUpToMysql() = {
56 |
57 | val sparkConf = new SparkConf().setMaster("local[1]").setAppName("BatchProcess")
58 | val sc = SparkSession.builder().config(sparkConf).getOrCreate()
59 |
60 | val dates = DateUtils.getSteamDates()
61 |
62 | for (date <- dates) {
63 | val time = (new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(date).getTime / 1000).toInt
64 | println(time)
65 | val tableName = "(select * from roll_up where time = " + time + " order by recommendations_up desc limit 10) as roll_up"
66 | val data: DataFrame = readMysqlTable(sc, tableName)
67 |
68 | val properties = new Properties()
69 | properties.setProperty("user", "root")
70 | properties.setProperty("password", "root")
71 | data.write.mode(SaveMode.Append).jdbc("jdbc:mysql://172.19.240.128:3306/steam", "top10_new", properties)
72 | }
73 |
74 | sc.stop()
75 | }
76 |
77 |   // TODO: Spark does not support UPDATE operations
78 |   def addRollUpByMonth() = {
79 |
80 | val sparkConf = new SparkConf().setMaster("local[2]").setAppName("BatchProcess")
81 | val sc = SparkSession.builder().config(sparkConf).getOrCreate()
82 | val dates = DateUtils.getSteamDates()
83 |
84 | var tableName = "(select name from roll_up) as roll_up"
85 | val data: DataFrame = readMysqlTable(sc, tableName)
86 |
87 |     // Broadcast variable
88 | val gameName = new ListBuffer[String]
89 | val broadcast: Broadcast[ListBuffer[String]] = sc.sparkContext.broadcast(gameName)
90 | data.foreach(row => {
91 | broadcast.value.append(row.getAs("name").toString)
92 | })
93 |
94 | for (game <- gameName) {
95 | tableName = "(select recommendations_up from roll_up where name = '" + game + "') as roll_up"
96 | val data: DataFrame = readMysqlTable(sc, tableName)
97 | data.show()
98 | }
99 |
100 | sc.stop()
101 | }
102 |
103 | def readMysqlTable(sparkSession: SparkSession, tableName: String) = {
104 |
105 | sparkSession
106 | .read
107 | .format("jdbc")
108 | .option("url", "jdbc:mysql://172.19.240.128:3306/steam")
109 | .option("user", "root")
110 | .option("password", "root")
111 | .option("dbtable", tableName)
112 | .load()
113 | }
114 | }
115 |
--------------------------------------------------------------------------------
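
A quick way to sanity-check the output of saveTop10ToCsv above is to read the headerless CSV back with the same column order. A minimal sketch, assuming the same working directory as BatchProcess (the object name RollupCsvCheck is made up for illustration):

    import org.apache.spark.sql.SparkSession

    object RollupCsvCheck {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local").appName("RollupCsvCheck").getOrCreate()
        // headerless CSV, columns in the order produced by saveTop10ToCsv
        val top10 = spark.read.csv("src/main/resources/RollupCSV")
          .toDF("name", "type", "value", "date")
        top10.show(10, truncate = false)
        spark.stop()
      }
    }
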
/spark-streaming/src/main/scala/cn/edu/nju/HDFSProcess.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju
2 |
3 | import java.sql.DriverManager
4 |
5 | import cn.edu.nju.domain.CommentLog
6 | import org.apache.spark.SparkConf
7 | import org.apache.spark.streaming.{Seconds, StreamingContext}
8 |
9 | /**
10 | * Created by thpffcj on 2019/10/2.
11 | */
12 | object HDFSProcess {
13 |
14 | def main(args: Array[String]): Unit = {
15 |
16 | val sparkConf = new SparkConf().setMaster("local[2]").setAppName("HDFSProcess")
17 | // val sparkConf = new SparkConf().setMaster("spark://thpffcj:7077").setAppName("HDFSProcess")
18 |
19 |     // Creating a StreamingContext takes two parameters: a SparkConf and the batch interval
20 | val ssc = new StreamingContext(sparkConf, Seconds(5))
21 |
22 |     // Stateful operators require a checkpoint directory to be set
23 |     // In production it is recommended to point the checkpoint at a directory on HDFS
24 |     // "." would mean the current directory
25 | ssc.checkpoint("/Users/thpffcj/Public/file/cloud_checkpoint/hdfs_process")
26 |
27 | // val data = ssc.textFileStream("hdfs://thpffcj:9000/cloud-computing/")
28 | // nc -lk 9999
29 | val data = ssc.socketTextStream("localhost", 9999)
30 |
31 |     // Build the blacklist
32 | val blacks = List("9999")
33 | val blacksRDD = ssc.sparkContext.parallelize(blacks).map(x => (x, true))
34 |
35 |     // Filter out blacklisted users
36 | val cleanData = data.map(line => (line.split(",")(0), line))
37 | .transform(rdd => {
38 | rdd.leftOuterJoin(blacksRDD)
39 | .filter(x => x._2._2.getOrElse(false) != true)
40 | .map(x => x._2._1)
41 | })
42 |
43 | val logs = cleanData.map(line => {
44 | val infos = line.split(",")
45 | CommentLog(infos(0), infos(1), infos(2), infos(3))
46 | }).filter(commentLog => commentLog.gameName != "")
47 |
48 |     // Count comments per game
49 | val gameNumber = logs.map(log => {
50 | (log.gameName, 1)
51 | }).updateStateByKey[Int](updateFunction _)
52 |
53 | gameNumber.print()
54 |
55 | ssc.start()
56 | ssc.awaitTermination()
57 | }
58 |
59 | /**
60 |    * Merge the current batch's values into the existing (previous) state
61 |    * @param currentValues values from the current batch
62 |    * @param preValues previously accumulated state
63 | * @return
64 | */
65 | def updateFunction(currentValues: Seq[Int], preValues: Option[Int]): Option[Int] = {
66 | val current = currentValues.sum
67 | val pre = preValues.getOrElse(0)
68 | Some(current + pre)
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
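
The blacklist filter in HDFSProcess can be tried on a plain RDD before wiring it into the stream. A minimal spark-shell sketch, reusing the "9999" blacklist entry and two sample lines from test.txt (purely illustrative):

    val blacksRDD = sc.parallelize(List("9999")).map(id => (id, true))
    val lines = sc.parallelize(List(
      "1,使命召唤1,2019-10-13,好玩1",
      "9999,使命召唤1,2019-10-15,好玩6"))

    val clean = lines.map(line => (line.split(",")(0), line))
      .leftOuterJoin(blacksRDD)                      // (userId, (line, Option[Boolean]))
      .filter(x => x._2._2.getOrElse(false) != true) // keep rows with no blacklist match
      .map(x => x._2._1)

    clean.collect().foreach(println)                 // only the user-1 line survives

updateFunction then simply adds each batch's partial counts onto the running total kept by updateStateByKey.
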
/spark-streaming/src/main/scala/cn/edu/nju/KafkaProcess.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju
2 |
3 | import cn.edu.nju.domain.UserData
4 | import org.apache.kafka.clients.consumer.ConsumerConfig
5 | import org.apache.kafka.common.serialization.StringDeserializer
6 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
7 | import org.apache.spark.streaming.{Seconds, StreamingContext}
8 | import org.apache.spark.{SparkConf, SparkContext}
9 |
10 | /**
11 | * Created by thpffcj on 2019/10/19.
12 | */
13 | object KafkaProcess {
14 |
15 | def main(args: Array[String]): Unit = {
16 |
17 | val sparkConf = new SparkConf().setMaster("local[2]").setAppName("StreamProcess")
18 | val ssc = new StreamingContext(sparkConf, Seconds(5))
19 |
20 | ssc.checkpoint("/Users/thpffcj/Public/file/cloud_checkpoint/stream_process")
21 |
22 | val bootstrapServers = "thpffcj1:9092"
23 | val groupId = "test"
24 | val topicName = "steam"
25 | val maxPoll = 20000
26 |
27 | val kafkaParams = Map(
28 | ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> bootstrapServers,
29 | ConsumerConfig.GROUP_ID_CONFIG -> groupId,
30 | ConsumerConfig.MAX_POLL_RECORDS_CONFIG -> maxPoll.toString,
31 | ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
32 | ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer]
33 | )
34 |
35 | val messages = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent,
36 | ConsumerStrategies.Subscribe[String, String](Set(topicName), kafkaParams))
37 |
38 | val rawData = messages.map(_.value())
39 |
40 | val data = rawData.map(line => {
41 | val record = line.split("\t")
42 | UserData(record(0), record(1), record(2), record(3).toDouble)
43 | })
44 |
45 |     // Game sales
46 | val gameSale = data.filter(userData => userData.behavior == "purchase")
47 | .map(userData => {
48 | (userData.gameName, 1)
49 | }).updateStateByKey[Int](updateFunction _)
50 |
51 | gameSale.print()
52 |
53 |     // Average play time per game
54 | val gamePopularity = data.filter(userData => userData.behavior == "play").map(
55 | userData => {
56 | (userData.gameName, (userData.duration, 1))
57 | }
58 | ).reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2))
59 |
60 | gamePopularity.print()
61 |
62 |     // Dota 2 play time
63 | val gameDuration = data.filter(
64 |       userData => userData.gameName == "Dota 2" && userData.behavior == "play").map(
65 | userData => {
66 | (userData.userId, userData.duration)
67 | })
68 |
69 | gameDuration.print()
70 |
71 | ssc.start()
72 | ssc.awaitTermination()
73 | }
74 |
75 | /**
76 |    * Merge the current batch's values into the existing (previous) state
77 |    * @param currentValues values from the current batch
78 |    * @param preValues previously accumulated state
79 | * @return
80 | */
81 | def updateFunction(currentValues: Seq[Int], preValues: Option[Int]): Option[Int] = {
82 | val current = currentValues.sum
83 | val pre = preValues.getOrElse(0)
84 | Some(current + pre)
85 | }
86 | }
87 |
--------------------------------------------------------------------------------
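
Note that gamePopularity above carries (total hours, play count) per game but never divides, so what gets printed is the raw pair. If an actual average is wanted, a mapValues step would do it; a minimal sketch, assumed to replace the gamePopularity.print() call and nothing else:

    // (gameName, (totalHours, playCount)) -> (gameName, averageHours)
    val avgPlayTime = gamePopularity.mapValues { case (total, count) => total / count }
    avgPlayTime.print()

Since reduceByKey here is not stateful, this is a per-batch average rather than an all-time one.
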
/spark-streaming/src/main/scala/cn/edu/nju/MongoDBProcess.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju
2 |
3 | import com.mongodb.spark.MongoSpark
4 | import org.apache.log4j.{Level, Logger}
5 | import org.apache.spark.sql.DataFrame
6 |
7 | /**
8 | * Created by thpffcj on 2019/9/24.
9 | */
10 | object MongoDBProcess {
11 |
12 | Logger.getLogger("org").setLevel(Level.ERROR)
13 |
14 | def main(args: Array[String]): Unit = {
15 |
16 | import org.apache.spark.sql.SparkSession
17 |
18 | val spark = SparkSession.builder()
19 | .master("local[2]")
20 | .appName("MongoDBProcess")
21 | .config("spark.mongodb.input.uri", "mongodb://steam:steam@***.***.***.***:27017/steam_db.China.games")
22 | .getOrCreate()
23 |
24 | val frame: DataFrame = MongoSpark.load(spark)
25 | frame.createTempView("games")
26 |
27 | val res: DataFrame = spark.sql("SELECT name from games")
28 | res.show()
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/MySQLProcess.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju
2 |
3 | import java.text.SimpleDateFormat
4 | import java.util
5 |
6 | import cn.edu.nju.api.{ApiReturnObject, TagReturnObject}
7 | import cn.edu.nju.domain.{GameObject, TagObject}
8 | import org.apache.spark.sql.{DataFrame, SQLContext}
9 | import org.apache.spark.{SparkConf, SparkContext}
10 |
11 | import scala.collection.mutable.ListBuffer
12 | import scala.util.Random
13 |
14 | /**
15 | * Created by thpffcj on 2019/10/24.
16 | */
17 | class MySQLProcess {
18 |
19 | /**
20 |    * Return the data needed for the animated (time-field) chart
21 | * @param dates
22 | * @return
23 | */
24 | def getTimeFieldData(dates: ListBuffer[String]): ApiReturnObject = {
25 |
26 | val sparkConf = new SparkConf().setMaster("local[1]").setAppName("MySQLProcess")
27 | val sc = new SparkContext(sparkConf)
28 |
29 | val sqlContext = new SQLContext(sc)
30 |
31 | val timeFieldObjects = new util.ArrayList[TimeFieldObject]
32 |
33 | for (date <- dates){
34 | val time = (new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(date).getTime / 1000).toInt
35 | val tableName = "(select * from top10 where time = " + time + " order by recommendations_up desc limit 10) as top10"
36 | val data: DataFrame = readMysqlTable(sqlContext, tableName)
37 |
38 | val gameObjects = new util.ArrayList[GameObject]
39 | val broadcast = sc.broadcast(gameObjects)
40 | var id = 1
41 | data.foreach(row => {
42 |
43 | val name = row.getAs("name").toString
44 | var color = ""
45 | if (MySQLProcess.map.containsKey(name)) {
46 | color = MySQLProcess.map.get(name).toString
47 | } else {
48 | // rgb(218, 198, 76)
49 | color = "rgb(" + Random.nextInt(255) + ", " + Random.nextInt(255) + ", " + Random.nextInt(255) + ")"
50 | MySQLProcess.map.put(name, color)
51 | }
52 |
53 | val gameObject = new GameObject(id.toString, name, row.getAs("recommendations_up"), color)
54 | broadcast.value.add(gameObject)
55 | id = id + 1
56 | })
57 |
58 | val name = "截止" + date.substring(0, 4) + "年" + date.substring(5, 7) + "月" + "好评累计总数"
59 | val timeFieldObject = new TimeFieldObject(name, broadcast.value)
60 | timeFieldObjects.add(timeFieldObject)
61 | }
62 |
63 | val apiReturnObject = new ApiReturnObject(timeFieldObjects)
64 |
65 | sc.stop()
66 |
67 | apiReturnObject
68 | }
69 |
70 | /**
71 |    * Return the data needed for the word cloud
72 | * @return
73 | */
74 | def getTagData(round: Int): TagReturnObject = {
75 |
76 | val sparkConf = new SparkConf().setMaster("local[1]").setAppName("MySQLProcess")
77 | val sc = new SparkContext(sparkConf)
78 |
79 | val sqlContext = new SQLContext(sc)
80 |
81 | val tableName = "(select * from tag limit " + 0 + "," + round * 50 + ") as top10"
82 | println(tableName)
83 | val data: DataFrame = readMysqlTable(sqlContext, tableName)
84 |
85 |     val tagObjects = new util.ArrayList[TagObject]
86 | val broadcast = sc.broadcast(tagObjects)
87 | data.foreach(row => {
88 | val tagObject = new TagObject(row.getAs("game_name"), row.getAs("number"))
89 | broadcast.value.add(tagObject)
90 | })
91 |
92 | val tagReturnObject = new TagReturnObject(tagObjects)
93 |
94 | sc.stop()
95 |
96 | tagReturnObject
97 | }
98 |
99 | def readMysqlTable(sqlContext: SQLContext, tableName: String) = {
100 | sqlContext
101 | .read
102 | .format("jdbc")
103 | .option("driver", "com.mysql.jdbc.Driver")
104 | .option("url", "jdbc:mysql://172.19.240.128:3306/steam")
105 | .option("user", "root")
106 | .option("password", "root")
107 | .option("dbtable", tableName)
108 | .load()
109 | }
110 | }
111 |
112 | object MySQLProcess {
113 |
114 | val map = new util.HashMap[String, String]()
115 | }
116 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/SteamProcess.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju
2 |
3 | import cn.edu.nju.dao.{RollUpDAO, TagDAO}
4 | import cn.edu.nju.domain.{GameDetail, ReviewsChart, RollUp, SteamLog, Tag}
5 | import com.google.gson.Gson
6 | import org.apache.spark.SparkConf
7 | import org.apache.spark.streaming.dstream.DStream
8 | import org.apache.spark.streaming.{Seconds, StreamingContext}
9 |
10 | import scala.collection.mutable.{ListBuffer, Set}
11 |
12 | /**
13 | * Created by thpffcj on 2019/10/21.
14 | */
15 | object SteamProcess {
16 |
17 | def main(args: Array[String]): Unit = {
18 |
19 | // val sparkConf = new SparkConf().setMaster("local[2]").setAppName("HDFSProcess")
20 | val sparkConf = new SparkConf().setMaster("spark://thpffcj:7077").setAppName("SteamProcess")
21 |
22 | val ssc = new StreamingContext(sparkConf, Seconds(5))
23 |
24 |     // Stateful operators require a checkpoint directory to be set
25 |     // In production it is recommended to point the checkpoint at a directory on HDFS
26 |     // "." would mean the current directory
27 | ssc.checkpoint("/Users/thpffcj/Public/file/cloud_checkpoint/hdfs_process")
28 |
29 | val rawData = ssc.textFileStream("hdfs://thpffcj:9000/cloud-computing/")
30 |
31 | val gameNameSet: Set[String] = Set()
32 |
33 | /**
34 |      * Filter out empty lines
35 |      * Filter out records whose date is empty
36 |      * Filter out duplicate records, keyed by game name
37 |      * Filter out records whose game_detail is empty
38 |      * Filter out records whose game_detail is "bundle"
39 | */
40 | val data = rawData.filter(rdd => !rdd.isEmpty).map(line => {
41 | val log = line.split("\t")
42 | if (log.length < 7) {
43 | SteamLog("", "", "", "", "", "", "")
44 | } else {
45 | SteamLog(log(0), log(1), log(2), log(3), log(4), log(5), log(6))
46 | }
47 | }).filter(steamLog => !steamLog.date.isEmpty)
48 | .filter(steamLog => !gameNameSet.contains(steamLog.name))
49 | .filter(steamLog => !steamLog.game_detail.isEmpty)
50 | .filter(steamLog => !steamLog.game_detail.equals("bundle"))
51 | .map(steamLog => {
52 | gameNameSet.add(steamLog.name)
53 | steamLog
54 | })
55 |
56 |     // Extract the user tags
57 | val userTags = data.map(steamLog => {
58 | val gameDetail = jsonToGameDetail(steamLog.game_detail)
59 | if (gameDetail != null) {
60 | gameDetail.user_tags.toString.replace(" ", "")
61 | } else {
62 | null
63 | }
64 | }).filter(userTags => userTags != null)
65 |
66 |     // Count tags
67 | val tagsNumber = userTags.flatMap(line => line.substring(1, line.length - 1).split(","))
68 | .map(tag => (tag, 1)).updateStateByKey[Int](updateFunction _)
69 |
70 | // writeTagToMysql(tagsNumber)
71 | tagsNumber.print()
72 |
73 | /**
74 | * (steamLog.name,jsonToReviewsChart(gameDetail.reviewsChart.toString))
75 | * (CODE VEIN,{recommendations_down=34.0,date=1.5712704E9,recommendations_up=167.0},{recommendations_down=34.0,date=1.5712704E9,recommendations_up=167.0)
76 | */
77 | val rollups = data.map(steamLog => {
78 | val gameDetail = jsonToGameDetail(steamLog.game_detail)
79 |
80 |       // Filter out records where reviewsChart["start_date"] or reviewsChart["end_date"] is empty
81 | if ((gameDetail != null) && (gameDetail.reviewsChart.get("start_date") != "None")
82 | && (gameDetail.reviewsChart.get("end_date") != "None")) {
83 | (steamLog.name, jsonToReviewsChart(gameDetail.reviewsChart.toString))
84 | } else {
85 | null
86 | }
87 | }).filter(rollups => rollups != null)
88 |       // For now only keep data rolled up by month
89 | .filter(reviewsChart => reviewsChart._2.rollup_type == "month")
90 | .map(reviewsChart => {
91 | val line = reviewsChart._2.rollups.toString
92 | (reviewsChart._1, line.substring(1, line.length - 2).replace(" ", ""))
93 | })
94 |
95 |     // Write each game's positive-review counts to MySQL
96 | rollups.foreachRDD(rdd => {
97 | rdd.foreachPartition(partitionOfRecords => {
98 | val list = new ListBuffer[(String, Int, Int, Int)]
99 |
100 | partitionOfRecords.foreach(record => {
101 | record._2.split("},").foreach(data => {
102 | val rollUp = jsonToRollUp(data + "}")
103 | list.append((record._1, rollUp.date, rollUp.recommendations_up, rollUp.recommendations_down))
104 | })
105 | })
106 |
107 | RollUpDAO.insertRollUp(list)
108 | })
109 | })
110 |
111 |     // Single-row insert (kept for reference)
112 | // rollups.foreachRDD(rdd => {
113 | // rdd.foreachPartition(partitionOfRecords => {
114 | // val connection = createConnection()
115 | // partitionOfRecords.foreach(record => {
116 | // record._2.split("},").foreach(data => {
117 | // val rollUp = jsonToRollUp(data + "}")
118 | // val sql = "insert into roll_up(name, time, recommendations_up, recommendations_down) values('" + record._1.replace("'", "") + "'," + rollUp.date + "," + rollUp.recommendations_up + "," + rollUp.recommendations_down + ")"
119 | // connection.createStatement().execute(sql)
120 | // })
121 | // })
122 | // connection.close()
123 | // })
124 | // })
125 |
126 | rollups.print()
127 |
128 | ssc.start()
129 | ssc.awaitTermination()
130 | }
131 |
132 | // def createConnection() = {
133 | // Class.forName("com.mysql.jdbc.Driver")
134 | // DriverManager.getConnection("jdbc:mysql://localhost:3306/steam?useUnicode=true&characterEncoding=utf-8", "root", "000000")
135 | // }
136 |
137 | def jsonToGameDetail(jsonStr: String): GameDetail = {
138 | try {
139 | val gson = new Gson()
140 | gson.fromJson(jsonStr, classOf[GameDetail])
141 | } catch {
142 | case e: Exception => {
143 | // e.printStackTrace()
144 | null
145 | }
146 | }
147 | }
148 |
149 | def jsonToReviewsChart(jsonStr: String): ReviewsChart = {
150 | try {
151 | val gson = new Gson()
152 | gson.fromJson(jsonStr, classOf[ReviewsChart])
153 | } catch {
154 | case e: Exception => {
155 | // e.printStackTrace()
156 | null
157 | }
158 | }
159 | }
160 |
161 | def jsonToRollUp(jsonStr: String): RollUp = {
162 | try {
163 | val gson = new Gson()
164 | gson.fromJson(jsonStr, classOf[RollUp])
165 | } catch {
166 | case e: Exception => {
167 | // e.printStackTrace()
168 | null
169 | }
170 | }
171 | }
172 |
173 | /**
174 |    * Merge the current batch's values into the existing (previous) state
175 |    *
176 |    * @param currentValues values from the current batch
177 |    * @param preValues previously accumulated state
178 | * @return
179 | */
180 | def updateFunction(currentValues: Seq[Int], preValues: Option[Int]): Option[Int] = {
181 | val current = currentValues.sum
182 | val pre = preValues.getOrElse(0)
183 | Some(current + pre)
184 | }
185 |
186 | /**
187 |    * Write the tag counts to MySQL
188 | * @param tagsNumber
189 | */
190 | def writeTagToMysql(tagsNumber: DStream[(String, Int)]): Unit = {
191 |
192 | tagsNumber.foreachRDD(rdd => {
193 | rdd.foreachPartition(partitionOfRecords => {
194 | val list = new ListBuffer[Tag]
195 | partitionOfRecords.foreach(record => {
196 | list.append(Tag(record._1, record._2))
197 | })
198 | TagDAO.insertTag(list)
199 | })
200 | })
201 | }
202 | }
203 |
--------------------------------------------------------------------------------
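
The rollups handling above leans on Gson's lenient parser: gameDetail.reviewsChart.toString yields map-style text with "=" separators, the substring(1, length - 2) strips the surrounding "[" and trailing "}]", and split("},") plus jsonToRollUp(data + "}") turns each piece back into a RollUp. A minimal sketch of that last step on the sample shown in the comment (values illustrative, cn.edu.nju.domain.RollUp assumed on the classpath):

    import com.google.gson.Gson
    import cn.edu.nju.domain.RollUp

    val gson = new Gson()
    // the final "}" has already been stripped by the substring call above
    val line = "{recommendations_down=34.0,date=1.5712704E9,recommendations_up=167.0}," +
      "{recommendations_down=1.0,date=1.571616E9,recommendations_up=7.0"

    line.split("},").foreach { part =>
      val rollUp = gson.fromJson(part + "}", classOf[RollUp])
      println((rollUp.date, rollUp.recommendations_up, rollUp.recommendations_down))
      // prints (1571270400,167,34) and (1571616000,7,1)
    }
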
/spark-streaming/src/main/scala/cn/edu/nju/dao/CourseClickCountDAO.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.dao
2 |
3 | import cn.edu.nju.domain.CourseClickCount
4 | import cn.edu.nju.utils.HBaseUtils
5 | import org.apache.hadoop.hbase.client.Get
6 | import org.apache.hadoop.hbase.util.Bytes
7 |
8 | import scala.collection.mutable.ListBuffer
  |
  | /**
9 | * Created by thpffcj on 2019/10/17.
10 | */
11 | object CourseClickCountDAO {
12 |
13 | val tableName = "imooc_course_clickcount"
14 | val cf = "info"
15 | val qualifer = "click_count"
16 |
17 | /**
18 |    * Save the data to HBase
19 |    * @param list collection of CourseClickCount records
20 | */
21 | def save(list: ListBuffer[CourseClickCount]): Unit = {
22 |
23 | val table = HBaseUtils.getInstance().getTable(tableName)
24 |
25 | for(ele <- list) {
26 | table.incrementColumnValue(Bytes.toBytes(ele.day_course),
27 | Bytes.toBytes(cf),
28 | Bytes.toBytes(qualifer),
29 | ele.click_count)
30 | }
31 | }
32 |
33 | /**
34 |    * Look up the value for a given rowkey
35 | */
36 | def count(day_course: String): Long = {
37 | val table = HBaseUtils.getInstance().getTable(tableName)
38 |
39 | val get = new Get(Bytes.toBytes(day_course))
40 | val value = table.get(get).getValue(cf.getBytes, qualifer.getBytes)
41 |
42 | if(value == null) {
43 | 0L
44 | }else{
45 | Bytes.toLong(value)
46 | }
47 | }
48 |
49 | def main(args: Array[String]): Unit = {
50 |
51 | val list = new ListBuffer[CourseClickCount]
52 | list.append(CourseClickCount("20171111_8",8))
53 | list.append(CourseClickCount("20171111_9",9))
54 | list.append(CourseClickCount("20171111_1",100))
55 |
56 | save(list)
57 |
58 | println(count("20171111_8") + " : " + count("20171111_9")+ " : " + count("20171111_1"))
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/dao/CourseSearchClickCountDAO.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.dao
2 |
3 | import cn.edu.nju.domain.CourseSearchClickCount
4 | import cn.edu.nju.utils.HBaseUtils
5 | import org.apache.hadoop.hbase.client.Get
6 | import org.apache.hadoop.hbase.util.Bytes
7 |
8 | import scala.collection.mutable.ListBuffer
9 |
10 | /**
11 | * Created by thpffcj on 2019/10/17.
12 | */
13 | object CourseSearchClickCountDAO {
14 |
15 | val tableName = "imooc_course_search_clickcount"
16 | val cf = "info"
17 | val qualifer = "click_count"
18 |
19 | /**
20 |    * Save the data to HBase
21 |    *
22 |    * @param list collection of CourseSearchClickCount records
23 | */
24 | def save(list: ListBuffer[CourseSearchClickCount]): Unit = {
25 |
26 | val table = HBaseUtils.getInstance().getTable(tableName)
27 |
28 | for(ele <- list) {
29 | table.incrementColumnValue(Bytes.toBytes(ele.day_search_course),
30 | Bytes.toBytes(cf),
31 | Bytes.toBytes(qualifer),
32 | ele.click_count)
33 | }
34 | }
35 |
36 | /**
37 |    * Look up the value for a given rowkey
38 | */
39 | def count(day_search_course: String):Long = {
40 | val table = HBaseUtils.getInstance().getTable(tableName)
41 |
42 | val get = new Get(Bytes.toBytes(day_search_course))
43 | val value = table.get(get).getValue(cf.getBytes, qualifer.getBytes)
44 |
45 | if(value == null) {
46 | 0L
47 | }else{
48 | Bytes.toLong(value)
49 | }
50 | }
51 |
52 | def main(args: Array[String]): Unit = {
53 |
54 | val list = new ListBuffer[CourseSearchClickCount]
55 | list.append(CourseSearchClickCount("20171111_www.baidu.com_8",8))
56 | list.append(CourseSearchClickCount("20171111_cn.bing.com_9",9))
57 |
58 | save(list)
59 |
60 | println(count("20171111_www.baidu.com_8") + " : " + count("20171111_cn.bing.com_9"))
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/dao/RollUpDAO.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.dao
2 |
3 | import cn.edu.nju.domain.RollUp
4 | import java.sql.{Connection, PreparedStatement}
5 |
6 | import cn.edu.nju.utils.MySQLUtils
7 |
8 | import scala.collection.mutable.ListBuffer
9 |
10 | /**
11 | * Created by thpffcj on 2019/10/25.
12 | */
13 | object RollUpDAO {
14 |
15 | /**
16 |    * Batch-save RollUp records to the database
17 | */
18 | def insertRollUp(list: ListBuffer[(String, Int, Int, Int)]): Unit = {
19 |
20 | var connection: Connection = null
21 | var pstmt: PreparedStatement = null
22 |
23 | try {
24 | connection = MySQLUtils.getConnection()
25 |
26 |       connection.setAutoCommit(false) // switch to manual commit
27 |
28 | val sql = "insert into roll_up(name, time, recommendations_up, recommendations_down) values (?,?,?,?) "
29 | pstmt = connection.prepareStatement(sql)
30 |
31 | for (element <- list) {
32 | pstmt.setString(1, element._1)
33 | pstmt.setInt(2, element._2)
34 | pstmt.setInt(3, element._3)
35 | pstmt.setInt(4, element._4)
36 |
37 | pstmt.addBatch()
38 | }
39 |
40 |       pstmt.executeBatch() // execute the batch
41 |       connection.commit() // commit manually
42 | } catch {
43 | case e: Exception => e.printStackTrace()
44 | } finally {
45 | MySQLUtils.release(connection, pstmt)
46 | }
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
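
A minimal usage sketch (assuming the MySQL instance configured in MySQLUtils is reachable and the roll_up table exists), reusing the CODE VEIN rollup figures from the sample data above:

    import scala.collection.mutable.ListBuffer
    import cn.edu.nju.dao.RollUpDAO

    val sample = new ListBuffer[(String, Int, Int, Int)]
    sample.append(("CODE VEIN", 1571270400, 167, 34)) // (name, time, recommendations_up, recommendations_down)
    RollUpDAO.insertRollUp(sample)
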
/spark-streaming/src/main/scala/cn/edu/nju/dao/TagDAO.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.dao
2 |
3 | import cn.edu.nju.domain.Tag
4 | import java.sql.{Connection, PreparedStatement}
5 | import cn.edu.nju.utils.MySQLUtils
6 | import scala.collection.mutable.ListBuffer
7 |
8 | /**
9 | * Created by thpffcj on 2019/10/25.
10 | */
11 | object TagDAO {
12 |
13 | /**
14 |    * Batch-save Tag records to the database
15 | */
16 | def insertTag(list: ListBuffer[Tag]): Unit = {
17 |
18 | var connection: Connection = null
19 | var pstmt: PreparedStatement = null
20 |
21 | try {
22 | connection = MySQLUtils.getConnection()
23 |
24 |       connection.setAutoCommit(false) // switch to manual commit
25 |
26 | val sql = "insert into tag(game_name, number) values (?,?)"
27 | pstmt = connection.prepareStatement(sql)
28 |
29 | for (element <- list) {
30 | pstmt.setString(1, element.tagName)
31 | pstmt.setInt(2, element.number)
32 |
33 | pstmt.addBatch()
34 | }
35 |
36 |       pstmt.executeBatch() // execute the batch
37 |       connection.commit() // commit manually
38 | } catch {
39 | case e: Exception => e.printStackTrace()
40 | } finally {
41 | MySQLUtils.release(connection, pstmt)
42 | }
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/domain/ClickLog.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.domain
2 |
3 | /**
4 | * Created by thpffcj on 2019/10/17.
5 | */
6 | case class ClickLog(ip:String, time:String, courseId:Int, statusCode:Int, referrer:String)
7 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/domain/CommentLog.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.domain
2 |
3 | /**
4 | * Created by thpffcj on 2019/10/17.
5 | */
6 | case class CommentLog(userId:String, gameName:String, commentTime:String, comment:String)
7 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/domain/CourseClickCount.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.domain
2 |
3 | /**
4 | * Created by thpffcj on 2019/10/17.
5 | */
6 | case class CourseClickCount(day_course:String, click_count:Long)
7 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/domain/CourseSearchClickCount.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.domain
2 |
3 | /**
4 | * Created by thpffcj on 2019/10/17.
5 | */
6 | case class CourseSearchClickCount(day_search_course:String, click_count:Long)
7 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/domain/DouBanLog.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.domain
2 |
3 | /**
4 | * Created by thpffcj on 2019/10/18.
5 | */
6 | case class DouBanLog(star:Double, bd:String, quote:String, title:String)
7 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/domain/GameDetail.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.domain
2 |
3 | import com.alibaba.fastjson.JSONObject
4 |
5 | /**
6 | * Created by thpffcj on 2019/10/21.
7 | * @param support_tags
8 | * @param user_reviews
9 | * @param user_tags
10 | * @param reviewsChart
11 | */
12 | case class GameDetail(support_tags: Object, user_reviews: JSONObject, user_tags: Object,
13 | reviewsChart: JSONObject)
14 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/domain/ReviewsChart.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.domain
2 |
3 | /**
4 | * Created by thpffcj on 2019/10/21.
5 | *
6 | * @param weeks
7 | * @param rollup_type
8 | * @param end_date
9 | * @param recent
10 | * @param rollups
11 | * @param start_date
12 | */
13 | case class ReviewsChart(weeks: Object, rollup_type: String, end_date: Float, recent: Object,
14 | rollups: Object, start_date: Float)
15 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/domain/RollUp.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.domain
2 |
3 | /**
4 | * Created by thpffcj on 2019/10/23.
5 | */
6 | case class RollUp(recommendations_up: Int, date: Int, recommendations_down: Int)
7 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/domain/SteamLog.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.domain
2 |
3 | import com.alibaba.fastjson.JSONObject
4 |
5 | /**
6 | * Created by thpffcj on 2019/10/21.
7 | *
8 | * @param img_src
9 | * @param game_detail
10 | * @param original_price
11 | * @param price
12 | * @param review_summary
13 | * @param date
14 | * @param name
15 | */
16 | case class SteamLog(img_src: String, game_detail: String, original_price: String,
17 | price: String, review_summary: String, date: String, name: String)
18 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/domain/Tag.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.domain
2 |
3 | /**
4 | * Created by thpffcj on 2019/10/25.
5 | */
6 | case class Tag(tagName: String, number: Int)
7 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/domain/UserData.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.domain
2 |
3 | /**
4 | * Created by thpffcj on 2019/10/19.
5 |  * @param userId player ID
6 |  * @param gameName game name
7 |  * @param behavior the player's action on the game (purchase / play)
8 |  * @param duration play time; a value of 1 marks a purchase record
9 | */
10 | case class UserData(userId:String, gameName:String, behavior:String, duration:Double)
11 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/test/DateTest.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.test
2 |
3 | import java.text.SimpleDateFormat
4 |
5 | import cn.edu.nju.utils.DateUtils
6 |
7 | /**
8 | * Created by thpffcj on 2019/10/25.
9 | */
10 | object DateTest {
11 |
12 | def main(args: Array[String]): Unit = {
13 |
14 | val startDate = "2017-03-01 08:00:00"
15 | val startTime: Int = (new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(startDate).getTime / 1000).toInt
16 |
17 | val endDate = "2019-10-01 08:00:00"
18 | val endTime : Int = (new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(endDate).getTime / 1000).toInt
19 |
20 | for (date <- DateUtils.getSteamDates()) {
21 | println((new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(date).getTime / 1000).toInt)
22 | }
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/test/HDFSProcessTest.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.test
2 |
3 | import cn.edu.nju.domain.{CommentLog, DouBanLog}
4 | import org.apache.spark.SparkConf
5 | import org.apache.spark.streaming.{Seconds, StreamingContext}
6 |
7 | /**
8 | * Created by thpffcj on 2019/10/2.
9 | */
10 | object HDFSProcessTest {
11 |
12 | def main(args: Array[String]): Unit = {
13 |
14 | val sparkConf = new SparkConf().setMaster("local[2]").setAppName("HDFSProcess")
15 |
16 | // 创建StreamingContext需要两个参数:SparkConf和batch interval
17 | val ssc = new StreamingContext(sparkConf, Seconds(5))
18 |
19 | val data = ssc.textFileStream("hdfs://thpffcj:9000/cloud-computing/")
20 |
21 | val log = data.map(line => {
22 |
23 | val infos = line.split("\t")
24 |
25 | DouBanLog(infos(0).toDouble, infos(1), infos(2), infos(3))
26 | })
27 |
28 | log.print()
29 |
30 | ssc.start()
31 | ssc.awaitTermination()
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/test/JsonTest.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.test
2 |
3 | import cn.edu.nju.domain.{GameDetail, ReviewsChart, UserData}
4 | import com.google.gson.Gson
5 | import org.apache.spark.{SparkConf, SparkContext}
6 |
7 | import scala.util.parsing.json.JSONObject
8 |
9 | /**
10 | * Created by thpffcj on 2019/10/21.
11 | */
12 | object JsonTest {
13 |
14 | def main(args: Array[String]): Unit = {
15 |
16 | // val result1 = jsonToGameDetail("{\"reviewsChart\": {\"end_date\": 1571616000, \"rollups\": [{\"recommendations_down\": 34, \"date\": 1571270400, \"recommendations_up\": 167}], \"recent\": [{\"recommendations_down\": 1, \"date\": 1571616000, \"recommendations_up\": 7}], \"rollup_type\": \"week\", \"weeks\": [], \"start_date\": 1569456000}, \"support_tags\": [\"单人\", \"在线合作\", \"Steam 成就\"], \"user_tags\": [\"动漫\", \"砍杀\", \"冒险\", \"好评原声音轨\"], \"user_reviews\": {\"发行日期:\": \"2019年9月26日\", \"开发商:\": \"BANDAI NAMCO Studios\", \"发行商:\": \"BANDAI NAMCO Entertainment\"}}")
17 | // print(result1)
18 |
19 | val result2 = jsonToReviewsChart("{\"end_date\":1.571616E9,\"weeks\":[],\"rollup_type\":\"week\",\"recent\":[{\"recommendations_down\":1.0,\"date\":1.571616E9,\"recommendations_up\":7.0}],\"rollups\":[{\"recommendations_down\":34.0,\"date\":1.5712704E9,\"recommendations_up\":167.0}],\"start_date\":1.569456E9}")
20 | print(result2)
21 |
22 | }
23 |
24 | def jsonToGameDetail(jsonStr: String): GameDetail = {
25 | val gson = new Gson()
26 | gson.fromJson(jsonStr, classOf[GameDetail])
27 | }
28 |
29 | def jsonToReviewsChart(jsonStr: String): ReviewsChart = {
30 | val gson = new Gson()
31 | gson.fromJson(jsonStr, classOf[ReviewsChart])
32 | }
33 |
34 | }
35 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/test/StatStreaming.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.test
2 |
3 | import cn.edu.nju.dao.{CourseClickCountDAO, CourseSearchClickCountDAO}
4 | import cn.edu.nju.domain.{ClickLog, CourseClickCount, CourseSearchClickCount}
5 | import cn.edu.nju.utils.DateUtils
6 | import org.apache.kafka.clients.consumer.ConsumerConfig
7 | import org.apache.kafka.common.serialization.StringDeserializer
8 | import org.apache.spark.SparkConf
9 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
10 | import org.apache.spark.streaming.{Seconds, StreamingContext}
11 |
12 | import scala.collection.mutable.ListBuffer
13 |
14 | /**
15 | * Created by thpffcj on 2019/10/17.
16 | */
17 | object StatStreaming {
18 |
19 | def main(args: Array[String]): Unit = {
20 |
21 | val sparkConf = new SparkConf().setAppName("StatStreaming") //.setMaster("local[5]")
22 | val ssc = new StreamingContext(sparkConf, Seconds(60))
23 |
24 | val bootstrapServers = "thpffcj1:9092"
25 | val groupId = "test"
26 | val topicName = "test"
27 | val maxPoll = 20000
28 |
29 | val kafkaParams = Map(
30 | ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> bootstrapServers,
31 | ConsumerConfig.GROUP_ID_CONFIG -> groupId,
32 | ConsumerConfig.MAX_POLL_RECORDS_CONFIG -> maxPoll.toString,
33 | ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
34 | ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer]
35 | )
36 |
37 | val messages = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent,
38 | ConsumerStrategies.Subscribe[String, String](Set(topicName), kafkaParams))
39 |
40 |     // Test step 1: verify data is being received
41 | // messages.map(_._2).count().print
42 |
43 |     // Test step 2: data cleaning
44 | val logs = messages.map(_.value())
45 | val cleanData = logs.map(line => {
46 | val infos = line.split("\t")
47 |
48 | // infos(2) = "GET /class/130.html HTTP/1.1"
49 | // url = /class/130.html
50 | val url = infos(2).split(" ")(1)
51 | var courseId = 0
52 |
53 |       // extract the course id of the hands-on course
54 | if (url.startsWith("/class")) {
55 | val courseIdHTML = url.split("/")(2)
56 | courseId = courseIdHTML.substring(0, courseIdHTML.lastIndexOf(".")).toInt
57 | }
58 |
59 | ClickLog(infos(0), DateUtils.parseToMinute(infos(1)), courseId, infos(3).toInt, infos(4))
60 | }).filter(clicklog => clicklog.courseId != 0)
61 |
62 | // cleanData.print()
63 |
64 |     // Test step 3: count today's visits so far to hands-on courses
65 |
66 | cleanData.map(x => {
67 |
68 |       // HBase rowkey design: 20171111_88
69 | (x.time.substring(0, 8) + "_" + x.courseId, 1)
70 | }).reduceByKey(_ + _).foreachRDD(rdd => {
71 | rdd.foreachPartition(partitionRecords => {
72 | val list = new ListBuffer[CourseClickCount]
73 |
74 | partitionRecords.foreach(pair => {
75 | list.append(CourseClickCount(pair._1, pair._2))
76 | })
77 |
78 | CourseClickCountDAO.save(list)
79 | })
80 | })
81 |
82 |     // Test step 4: count today's visits so far to hands-on courses referred by search engines
83 |
84 | cleanData.map(x => {
85 |
86 | /**
87 | * https://www.sogou.com/web?query=Spark SQL实战
88 | */
89 | val referrer = x.referrer.replaceAll("//", "/")
90 | val splits = referrer.split("/")
91 | var host = ""
92 | if(splits.length > 2) {
93 | host = splits(1)
94 | }
95 |
96 | (host, x.courseId, x.time)
97 | }).filter(_._1 != "").map(x => {
98 | (x._3.substring(0,8) + "_" + x._1 + "_" + x._2 , 1)
99 | }).reduceByKey(_ + _).foreachRDD(rdd => {
100 | rdd.foreachPartition(partitionRecords => {
101 | val list = new ListBuffer[CourseSearchClickCount]
102 |
103 | partitionRecords.foreach(pair => {
104 | list.append(CourseSearchClickCount(pair._1, pair._2))
105 | })
106 |
107 | CourseSearchClickCountDAO.save(list)
108 | })
109 | })
110 |
111 | ssc.start()
112 | ssc.awaitTermination()
113 | }
114 | }
115 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/test/TransformTest.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.test
2 |
3 | import cn.edu.nju.domain.GameDetail
4 | import com.google.gson.Gson
5 |
6 | /**
7 | * Created by thpffcj on 2019/10/25.
8 | */
9 | object TransformTest {
10 |
11 | def main(args: Array[String]): Unit = {
12 |
13 | jsonToGameDetail("")
14 | }
15 |
16 | def jsonToGameDetail(jsonStr: String): GameDetail = {
17 | try {
18 | val gson = new Gson()
19 | gson.fromJson(jsonStr, classOf[GameDetail])
20 | } catch {
21 | case e: Exception => {
22 | e.printStackTrace()
23 | null
24 | }
25 | }
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/spark-streaming/src/main/scala/cn/edu/nju/utils/DateUtils.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.utils
2 |
3 | import java.text.SimpleDateFormat
4 | import java.util.Date
5 |
6 | import org.apache.commons.lang3.time.FastDateFormat
7 |
8 | import scala.collection.mutable.ListBuffer
9 |
10 | /**
11 | * Created by thpffcj on 2019/10/17.
12 | */
13 | object DateUtils {
14 |
15 | val YYYYMMDDHHMMSS_FORMAT = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss")
16 | val TARGE_FORMAT = FastDateFormat.getInstance("yyyyMMddHHmmss")
17 |
18 |
19 | def getTime(time: String) = {
20 | YYYYMMDDHHMMSS_FORMAT.parse(time).getTime
21 | }
22 |
23 | def parseToMinute(time :String) = {
24 | TARGE_FORMAT.format(new Date(getTime(time)))
25 | }
26 |
27 | def getSteamDates(): ListBuffer[String] = {
28 | val dates = new ListBuffer[String]
29 | dates.append("2017-01-01 08:00:00")
30 | dates.append("2017-02-01 08:00:00")
31 | dates.append("2017-03-01 08:00:00")
32 | dates.append("2017-04-01 08:00:00")
33 | dates.append("2017-05-01 08:00:00")
34 | dates.append("2017-06-01 08:00:00")
35 | dates.append("2017-07-01 08:00:00")
36 | dates.append("2017-08-01 08:00:00")
37 | dates.append("2017-09-01 08:00:00")
38 | dates.append("2017-10-01 08:00:00")
39 | dates.append("2017-11-01 08:00:00")
40 | dates.append("2017-12-01 08:00:00")
41 | dates.append("2018-01-01 08:00:00")
42 | dates.append("2018-02-01 08:00:00")
43 | dates.append("2018-03-01 08:00:00")
44 | dates.append("2018-04-01 08:00:00")
45 | dates.append("2018-05-01 08:00:00")
46 | dates.append("2018-06-01 08:00:00")
47 | dates.append("2018-07-01 08:00:00")
48 | dates.append("2018-08-01 08:00:00")
49 | dates.append("2018-09-01 08:00:00")
50 | dates.append("2018-10-01 08:00:00")
51 | dates.append("2018-11-01 08:00:00")
52 | dates.append("2018-12-01 08:00:00")
53 | dates.append("2019-01-01 08:00:00")
54 | dates.append("2019-02-01 08:00:00")
55 | dates.append("2019-03-01 08:00:00")
56 | dates.append("2019-04-01 08:00:00")
57 | dates.append("2019-05-01 08:00:00")
58 | dates.append("2019-06-01 08:00:00")
59 | dates.append("2019-07-01 08:00:00")
60 | dates.append("2019-08-01 08:00:00")
61 | dates.append("2019-09-01 08:00:00")
62 | dates.append("2019-10-01 08:00:00")
63 |
64 | dates
65 | }
66 |
67 | def tranTimestampToString(tm: Int): String={
68 | val fm = new SimpleDateFormat("yyyy/MM")
69 | val tim = fm.format(new Date(tm.toLong * 1000))
70 | tim
71 | }
72 |
73 | def main(args: Array[String]): Unit = {
74 |
75 | println(tranTimestampToString(1569888000))
76 | }
77 | }
--------------------------------------------------------------------------------
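
The epoch seconds listed in update.sql line up with getSteamDates(): each "yyyy-MM-01 08:00:00" entry parses to the corresponding monthly timestamp when the JVM default timezone is UTC+8 (an assumption, suggested by the 08:00 wall-clock offset). A minimal check:

    import java.text.SimpleDateFormat
    import cn.edu.nju.utils.DateUtils

    val fmt = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    DateUtils.getSteamDates()
      .map(date => (fmt.parse(date).getTime / 1000).toInt)
      .foreach(println) // 1483228800, 1485907200, ... as listed in update.sql
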
/spark-streaming/src/main/scala/cn/edu/nju/utils/MySQLUtils.scala:
--------------------------------------------------------------------------------
1 | package cn.edu.nju.utils
2 |
3 | import java.sql.{Connection, DriverManager, PreparedStatement}
4 |
5 | /**
6 | * Created by thpffcj on 2019/10/25.
7 | */
8 | object MySQLUtils {
9 |
10 | /**
11 |    * Get a MySQL connection
12 | */
13 | def getConnection() = {
14 | Class.forName("com.mysql.jdbc.Driver")
15 | DriverManager.getConnection("jdbc:mysql://localhost:3306/steam?useUnicode=true&characterEncoding=utf-8", "root", "000000")
16 | }
17 |
18 | /**
19 |    * Release the database connection and related resources
20 | * @param connection
21 | * @param pstmt
22 | */
23 | def release(connection: Connection, pstmt: PreparedStatement): Unit = {
24 | try {
25 | if (pstmt != null) {
26 | pstmt.close()
27 | }
28 | } catch {
29 | case e: Exception => e.printStackTrace()
30 | } finally {
31 | if (connection != null) {
32 | connection.close()
33 | }
34 | }
35 | }
36 |
37 | def main(args: Array[String]) {
38 | println(getConnection())
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/spider/spark-graphx/steam-reviews-official.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import pymongo
4 | import re
5 | import threading
6 | from multiprocessing import JoinableQueue
7 | import time
8 |
9 | # Crawl all Steam reviews and store them in MongoDB
10 | def write(item):
11 | try:
12 | if isinstance(item,list):
13 | collection.insert_many(item)
14 | else:
15 | collection.insert_one(item)
16 | except Exception as e:
17 | print(e)
18 | return True
19 |
20 | def getAllApps():
21 | try:
22 | apps = []
23 | for g in regions.game_id.find().skip(0).limit(1400):
24 | apps.append({"id":g["id"],"name":g["name"]})
25 | return apps
26 |     except Exception as e:
27 | print(e)
28 |
29 | def fetchReview(url,params,headers,app):
30 | try:
31 | res = session.get(url,params=params,headers=headers,timeout=30,verify=False)
32 |         if res.status_code != requests.codes.ok:  # request rejected: print the status code, this page failed
33 |             if res.status_code != requests.codes.forbidden and res.status_code != requests.codes.bad_gateway:  # don't print 403 / 502
34 | print(res.status_code,":",url)
35 | return None
36 |     except Exception as e:  # network problem, request failed
37 | print(e)
38 | return None
39 |
40 | result = res.json()
41 | # print(res.url)
42 | # print(result)
43 | reviews = result["reviews"]
44 |     if not reviews:  # no more reviews for this game
45 | print(result)
46 | return None
47 | cursor = result["cursor"]
48 | if not cursor:
49 | print(result)
50 | for review in reviews:
51 | review["game"] = app
52 | write(reviews)
53 | # print(url)
54 | # print(reviews)
55 | # print()
56 | return cursor
57 |
58 | def fetch(apps):
59 | for app in apps:
60 |         # Build the session; the cookie sets the language to Simplified Chinese and the birth date to 1987-01-01 (to allow adult content)
61 | headers = {
62 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36',
63 | 'Cookie':'Steam_Language=schinese; birthtime=533750401; timezoneOffset=28800,0;',
64 | }
65 | cursor = "*"
66 | reviewsCount = 0
67 | while cursor:
68 | # print("cursor:",cursor)
69 | # url = "https://store.steampowered.com/appreviews/"+app["id"]+"?json=1&filter=recent&language=schinese&day_range=360"+ \
70 | # "&cursor="+cursor+"&review_type=all&purchase_type=all&num_per_page=100"
71 | url = "https://store.steampowered.com/appreviews/"+app["id"]
72 | params = {
73 | "json":1,
74 | "filter":"recent", #all,recent,updated
75 | "language":"schinese", #all,schinese,zh-CN
76 | "day_range":"360",
77 | "cursor":cursor,
78 | "review_type":"all",
79 | "purchase_type":"all",
80 | "num_per_page":100,
81 | }
82 | cursor = fetchReview(url,params,headers,app)
83 | reviewsCount = reviewsCount+100
84 | if reviewsCount>=10000:
85 | break
86 | print(url,reviewsCount)
87 |
88 | # MongoDB connection
89 | client = pymongo.MongoClient('mongodb://steam:steam@***.***.***.***:27017/steam_db')
90 | db = client.steam_db
91 | regions = db.China
92 | collection = regions.reviews_official
93 |
94 | requests.packages.urllib3.disable_warnings()
95 | session = requests.session()
96 |
97 | appInfos = getAllApps()
98 | # print(appInfos)
99 | numOfThreads = 1
100 | badPages = fetch(appInfos)
101 | print("all finished")
102 |
103 | # https://store.steampowered.com/appreviews/243470?json=1&filter=all&language=all&day_range=360&cursor=*&review_type=all&purchase_type=all&num_per_page=10
--------------------------------------------------------------------------------
/spider/spark-graphx/steam-reviews.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import pymongo
4 | import re
5 | import threading
6 | from multiprocessing import JoinableQueue
7 | import time
8 |
9 | # 3 1 1400 678960,1122050,1100620,730,1041320
10 | # Crawl all reviews in the China region of Steam and store them in MongoDB
11 | def write(item):
12 | try:
13 | if isinstance(item,list):
14 | # firstUser = regions.first_review_user.find({"game":item[0]["game"]["id"]})[0]
15 |             # if firstUser["user"]==item[0]["user"]["name"]:  # duplicate review
16 | # return False
17 | collection.insert_many(item)
18 | else:
19 | collection.insert_one(item)
20 | except Exception as e:
21 | print(e)
22 | return True
23 |
24 | def getAllApps():
25 | try:
26 | apps = []
27 | for g in regions.game_id.find().skip(0).limit(1400):
28 | apps.append({"id":g["id"],"name":g["name"],"firstUser":None})
29 | return apps
30 |     except Exception as e:
31 | print(e)
32 |
33 | def fetchReview(url,headers,app):
34 | try:
35 | res = session.get(url,headers=headers,timeout=30,verify=False)
36 |         if res.status_code != requests.codes.ok:  # request rejected: print the status code, this page failed
37 |             if res.status_code != requests.codes.forbidden and res.status_code != requests.codes.bad_gateway:  # don't print 403 / 502
38 | print(res.status_code,":",url)
39 | return None
40 |     except Exception as e:  # network problem, request failed
41 | print(e)
42 | return None
43 |
44 |     if not res.text:  # no more reviews for this game
45 | return None
46 |
47 | try:
48 | soup = BeautifulSoup(res.text,'lxml')
49 |
50 | reviewGroup = []
51 | for card in soup.find_all(class_="apphub_Card modalContentLink interactable"):
52 | userCard = card.find(class_="apphub_friend_block")
53 |             if not userCard:  # drop reviews without a user
54 | continue
55 |             if not userCard.find(class_="apphub_CardContentAuthorName"):  # drop reviews without a user name
56 | continue
57 | if(len(userCard.find(class_="apphub_CardContentAuthorName").find_all("a"))!=1):
58 | print(userCard.find(class_="apphub_CardContentAuthorName"))
59 | name = userCard.find(class_="apphub_CardContentAuthorName").find("a").string
60 | name = name.strip() if name else ""
61 | product_owns = userCard.find(class_="apphub_CardContentMoreLink").string
62 | product_owns = product_owns.strip() if product_owns else ""
63 | user = {
64 |                 "name":name,  # may be ""
65 |                 "product_owns": product_owns,  # may be ""
66 | }
67 | comment_count = card.find(class_="apphub_CardCommentCount").string.strip()
68 | found_helpful = card.find(class_="found_helpful").contents
69 | helpful_num = found_helpful[0].strip()
70 | funny_num = found_helpful[-1].strip() if len(found_helpful)>1 else ""
71 | title = card.find(class_="reviewInfo").find(class_="title").string.strip()
72 | hours = card.find(class_="reviewInfo").find(class_="hours")
73 | hours = hours.string.strip() if hours else ""
74 |
75 | cardTextContent = card.find(class_="apphub_CardTextContent")
76 | date_posted = cardTextContent.find(class_="date_posted").string.strip()
77 | content = cardTextContent.contents[5:] if cardTextContent.find(class_="received_compensation") else cardTextContent.contents[2:]
78 |             content = "".join(item.string if item.string else "\n" for item in content).strip()
79 |
80 | review = {
81 | "game":app,
82 | "user":user,
83 |                 "comment_count":comment_count,  # number of replies to this review
84 |                 "helpful_num":helpful_num,  # how many found this review helpful (sometimes a sentence, sometimes a number)
85 |                 "funny_num":funny_num,  # how many found this review funny
86 |                 "title":title,  # Recommended / Not Recommended
87 |                 "hours":hours,  # total hours on record, may be ""
88 |                 "date_posted":date_posted,  # posted on date
89 |                 "content":content,  # review text
90 | }
91 | reviewGroup.append(review)
92 | form = soup.find("form")
93 | nextUrl = form.attrs["action"]+"?"
94 | for arg in form.find_all("input"):
95 | nextUrl = nextUrl+arg.attrs["name"]+"="+arg.attrs["value"]+"&"
96 | nextUrl = nextUrl[:-1]
97 | # if app["firstUser"]==reviewGroup[0]["user"]["name"]:
98 | # return None
99 | # write(reviewGroup)
100 | print(url)
101 | print(reviewGroup)
102 | print()
103 | # print("nextUrl",nextUrl)
104 | return nextUrl
105 |
106 |     except Exception as e:  # Steam server returned an unexpected response
107 | print("bad url:",url,e)
108 | return None
109 |
110 | class fetchThread(threading.Thread):
111 | def __init__(self, tQueue, app, threadNum):
112 | threading.Thread.__init__(self)
113 | self.tQueue = tQueue
114 | self.app = app
115 | self.threadNum = threadNum
116 | def run(self):
117 | id = self.app["id"]
118 | p = str(self.threadNum+1)
119 | userreviewsoffset = str((int(p)-1)*10)
120 | numperpage = "10"
121 | # url = "https://steamcommunity.com/app/"+id+"/homecontent/?userreviewsoffset="+userreviewsoffset+"&p="+p+ \
122 | # "&workshopitemspage="+p+"&readytouseitemspage="+p+"&mtxitemspage="+p+"&itemspage="+p+"&screenshotspage="+p+ \
123 | # "&videospage="+p+"&artpage="+p+"&allguidepage="+p+"&webguidepage="+p+"&integratedguidepage="+p+ \
124 | # "&discussionspage="+p+"&numperpage="+numperpage+"&browsefilter=trendyear&browsefilter=trendyear&l=schinese"+ \
125 | # "&appHubSubSection="+numperpage+"&filterLanguage=default&searchText=&forceanon=1"
126 | url = "https://steamcommunity.com/app/"+id+"/reviews/?p=1&browsefilter=trendyear"
127 |         # Build the session; the cookie sets the language to Simplified Chinese and the birth date to 1987-01-01 (to allow adult content)
128 | headers = {
129 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36',
130 | 'accept':'text/javascript, text/html, application/xml, text/xml, */*',
131 | 'accept-encoding': 'gzip, deflate, br',
132 | 'accept-language': 'zh,zh-CN;q=0.9,zh-TW;q=0.8,en;q=0.7,en-GB;q=0.6,en-US;q=0.5',
133 | 'cache-control': 'no-cache',
134 | 'pragma': 'no-cache',
135 | 'Cookie':'Steam_Language=schinese; birthtime=533750401; timezoneOffset=28800,0; sessionid=04a0dcb8f1f8f31bed482819; recentlyVisitedAppHubs=816340%2C678960%2C242920%2C1122050; steamCountry=CN%7C72e4ed8aa9f1f07b0eeba82d9349680e; app_impressions=1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_',
136 | 'sec-fetch-mode': 'cors',
137 | 'sec-fetch-site': 'same-origin',
138 | 'x-prototype-version': '1.7',
139 | 'x-requested-with': 'XMLHttpRequest',
140 | 'referer': url
141 | }
142 | # print(url)
143 | reviewCount = 0
144 | while url:
145 | # print("threadNum:"+str(self.threadNum)+" offset:"+userreviewsoffset)
146 | nextUrl = fetchReview(url,headers,self.app)
147 | if not nextUrl:
148 | nextUrl = fetchReview(url,headers,self.app)
149 |                 if not nextUrl:  # after two failures, assume this game's reviews are fully crawled
150 | break
151 | url = nextUrl
152 | # if not self.app["firstUser"]:
153 | # self.app["firstUser"]=reviewGroup[0]["user"]["name"]
154 | reviewCount = reviewCount+10
155 |             if int(reviewCount)>5000:  # stop after 5K reviews (using Dark Souls III's review count as the baseline)
156 | break
157 | # p = str(int(p)+self.tQueue.numOfThreads*1)
158 | # userreviewsoffset = str((int(p)-1)*10)
159 | # url = "https://steamcommunity.com/app/"+id+"/homecontent/?userreviewsoffset="+userreviewsoffset+"&p="+p+ \
160 | # "&workshopitemspage="+p+"&readytouseitemspage="+p+"&mtxitemspage="+p+"&itemspage="+p+"&screenshotspage="+p+ \
161 | # "&videospage="+p+"&artpage="+p+"&allguidepage="+p+"&webguidepage="+p+"&integratedguidepage="+p+ \
162 | # "&discussionspage="+p+"&numperpage="+numperpage+"&browsefilter=toprated&browsefilter=toprated&l=schinese"+ \
163 | # "&appHubSubSection="+numperpage+"&filterLanguage=default&searchText=&forceanon=1"
164 | # trendyear toprated trendweek trendday mostrecent
165 | # print("nextUrl",url)
166 | # time.sleep(2)
167 | # break
168 |
169 | class threadQueue:
170 | def __init__(self, numOfThreads, app):
171 | self.numOfThreads = numOfThreads
172 | self.app = app
173 | self.threads = []
174 | self.badItems = []
175 |
176 | for i in range(0,numOfThreads):
177 |             # create threads to crawl the detail pages
178 | thread = fetchThread(self,app,i)
179 | thread.start()
180 | self.threads.append(thread)
181 | # def addBadItem(self,info):
182 | # self.badItems.append(info)
183 | def waitForStop(self):
184 |         # wait for the current page's threads to finish before crawling the next page
185 | for t in self.threads:
186 | t.join()
187 | if self.badItems:
188 | print("badItems ",self.badItems)
189 |
190 | def fetch(apps):
191 | for app in apps:
192 | queue = threadQueue(numOfThreads,app)
193 | queue.waitForStop()
194 | print(app["id"],"finished")
195 | badItems = queue.badItems
196 |
197 |     # re-crawl failed pages
198 | for app in badItems:
199 | queue = threadQueue(numOfThreads,app)
200 | queue.waitForStop()
201 | return queue.badItems
202 |
203 | # MongoDB connection
204 | client = pymongo.MongoClient('mongodb://steam:steam@***.***.***.***:27017/steam_db')
205 | db = client.steam_db
206 | regions = db.China
207 | collection = regions.reviews
208 |
209 | requests.packages.urllib3.disable_warnings()
210 | session = requests.session()
211 |
212 | appInfos = getAllApps()
213 | # print(appInfos)
214 | numOfThreads = 1
215 | badPages = fetch(appInfos)
216 | print("all finished")
217 |
218 | # http://store.steampowered.com/appreviews/243470?json=1&filter=all&language=all&day_range=360&cursor=*&review_type=all&purchase_type=all&num_per_page=10
--------------------------------------------------------------------------------
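Note: the comment at the end of steam-reviews.py points to Steam's appreviews JSON endpoint, which returns reviews as JSON instead of the HTML fragments scraped above. The sketch below shows one plausible way to page that endpoint with the cursor parameter; the query parameters mirror the URL in that comment, and the response fields (success, cursor, reviews) are assumptions to verify against the live API rather than something taken from this repository.

import urllib.parse
import requests

def fetch_reviews_json(appid, max_pages=3):
    # Hedged sketch: page the appreviews endpoint referenced in the trailing comment of steam-reviews.py
    session = requests.session()
    cursor = "*"  # assumed starting cursor; each response is assumed to carry the cursor for the next page
    reviews = []
    for _ in range(max_pages):
        url = ("https://store.steampowered.com/appreviews/" + str(appid)
               + "?json=1&filter=all&language=schinese&num_per_page=100"
               + "&cursor=" + urllib.parse.quote(cursor))
        res = session.get(url, timeout=30)
        if res.status_code != requests.codes.ok:
            break
        data = res.json()
        if data.get("success") != 1 or not data.get("reviews"):
            break
        reviews.extend(data["reviews"])
        cursor = data["cursor"]  # must be URL-encoded when sent back
    return reviews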
/spider/spark-streaming/steam-games-multithread-queue.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import pymongo
4 | import re
5 | import threading
6 | from multiprocessing import JoinableQueue
7 |
8 | # Crawl every product in the China-region Steam store and write it to MongoDB
9 | def write(item):
10 |     try:
11 |         collection.insert_one(item)
12 |     except Exception as e: # bind the exception so it can actually be printed
13 |         print(e)
14 |
15 | def fetchReviewsChart(appID):
16 | url = "https://store.steampowered.com/appreviewhistogram/"+appID+"?l=schinese&review_score_preference=0"
17 | try:
18 | res = session.get(url,headers=headers,timeout=30)
19 |         if res.status_code != requests.codes.ok:# request rejected: print the status code, this page failed
20 | print(res.status_code,":",url)
21 | return None
22 | chart = res.json()
23 |         # chart.results.rollup_type: either "week" or "month"; the time span covered by each date bucket in chart.results.rollups
24 |         # chart.results.recent: each date bucket covers a one-day span
25 |         # date: timestamp of the bucket
26 |         # recommendations_down: number of negative reviews
27 |         # recommendations_up: number of positive reviews
28 | return chart["results"] if chart["success"]==1 else None
29 |     except: # network problem, the request failed
30 | print(url)
31 | return None
32 |
33 | def fetchGameInfo(url):
34 | try:
35 | res = session.get(url,headers=headers,timeout=30)
36 |         if res.status_code != requests.codes.ok:# request rejected: print the status code, this page failed
37 | print(res.status_code,":",url)
38 | return None
39 |     except: # network problem, the request failed
40 | print(url)
41 | return None
42 |
43 | try:
44 | soup = BeautifulSoup(res.text,'lxml')
45 |
46 |         # URL of the game's community hub
47 | communityUrl = soup.find(class_="apphub_OtherSiteInfo").find("a").attrs["href"]
48 | appID = communityUrl.split("/")[-1]
49 |
50 |         # review overview box in the top-right corner of the store page
51 | user_reviews = soup.find(class_="user_reviews")
52 | user_reviews_json = {}
53 | for item in user_reviews.find_all("div",class_="subtitle column"):
54 | user_reviews_json[item.string.strip()] = re.sub('\r|\n|\t', '', item.parent.find_all("div")[1].get_text().strip())
55 |
56 |         # user-defined tags
57 | user_tags = soup.find(class_="glance_tags popular_tags")
58 | if user_tags:
59 | user_tags = [item.string.strip() for item in user_tags.find_all("a")]
60 | else:
61 | user_tags = []
62 |
63 |         # features supported by this game
64 | support_tags = soup.find_all(class_="game_area_details_specs")
65 | support_tags = [item.find(class_="name").get_text().strip() for item in support_tags]
66 |
67 |         # crawl the review-volume chart
68 |         reviewsChart = fetchReviewsChart(appID)
69 |         if not reviewsChart: # retry once on failure
70 | reviewsChart = fetchReviewsChart(appID)
71 | reviewsChart = reviewsChart if reviewsChart else ""
72 |
73 |         # all information collected from this page
74 | game_detail = {
75 | "user_reviews":user_reviews_json,
76 | "user_tags":user_tags,
77 | "support_tags":support_tags,
78 | "reviewsChart":reviewsChart,
79 | }
80 | # print(game_detail)
81 | return game_detail
82 |
83 |     except: # the Steam server returned an unexpected response
84 | print("bad url:",url)
85 | return None
86 |
87 | class fetchThread(threading.Thread):
88 | def __init__(self, tqueue):
89 |         threading.Thread.__init__(self, daemon=True) # daemon threads let the process exit once all pages have been crawled and joined
90 | self.tqueue = tqueue
91 | def run(self):
92 | while True:
93 | info = self.tqueue.tasks.get()
94 | href = info["href"]
95 | if href.startswith("https://store.steampowered.com/bundle/") or href.startswith("https://store.steampowered.com/sub/"):
96 |                 game_detail = "bundle" # bundles and subs have no detail page to crawl
97 | else:
98 | game_detail = fetchGameInfo(href)
99 |                 if not game_detail: # retry once on failure
100 | game_detail = fetchGameInfo(href)
101 | game_detail = game_detail if game_detail else ""
102 | info["game_detail"] = game_detail
103 |
104 | # print(info)
105 | write(info)
106 | if game_detail=="":
107 | self.tqueue.addBadItem(info)
108 | self.tqueue.finishOne()
109 |             self.tqueue.tasks.task_done() # tell the queue that one dequeued item has been fully processed
110 |
111 | class threadQueue:
112 | def __init__(self, numOfThreads):
113 | self.numOfThreads = numOfThreads
114 | self.threads = []
115 |         self.tasks = JoinableQueue()# task queue shared by all worker threads
116 | self.tasksNum = 0
117 | self.badItems = []
118 |         self.lock = threading.Lock() # shared lock protecting the progress counter in finishOne
119 |         for i in range(numOfThreads): # was range(1, numOfThreads), which started one thread fewer than requested
120 |             # create worker threads that crawl the detail pages
121 | thread = fetchThread(self)
122 | thread.start()
123 | self.threads.append(thread)
124 | def add(self,info):
125 | self.tasks.put(info)
126 |     def finishOne(self):
127 |         # use the shared lock so concurrent threads update the counter atomically
128 |         self.lock.acquire()
129 |         self.tasksNum = self.tasksNum+1
130 |         if self.tasksNum%25==0:
131 |             print(self.tasksNum,"/",(totalPage-1)*25,"finished") # each search page yields 25 items
132 |         self.lock.release()
133 | def addBadItem(self,info):
134 | self.badItems.append(info)
135 | def waitForStop(self):
136 |         self.tasks.join()# block until every item put on the queue has been taken and marked done via task_done()
137 | if self.badItems:
138 | print("badItems ",self.badItems)
139 |
140 | def fetch(pageRange):
141 | badPages = []
142 |     page = 1 # one request per search-results page
143 | for page in pageRange:
144 |         try: # on network failure, record the bad page and skip it
145 | url = "https://store.steampowered.com/search/?page=" + str(page)
146 | res = session.get(url,headers=headers)
147 | except:
148 | badPages.append(page)
149 | continue
150 |
151 |         if res.status_code != requests.codes.ok:# request rejected: print the status code and skip this page
152 | print("page",page,":",res.status_code)
153 | badPages.append(page)
154 | continue
155 |
156 |         try: # an exception was seen here once; it never recurred and was probably caused by Steam's servers
157 | soup = BeautifulSoup(res.text,'lxml')
158 | contents = soup.find(id="search_resultsRows").find_all('a')
159 | except:
160 | print("bad page:",page)
161 | badPages.append(page)
162 | continue
163 |
164 | for content in contents:
165 | try:
166 | name = content.find(class_="title").get_text().strip()
167 | date = content.find("div",class_="col search_released responsive_secondrow").string
168 |                 date = date.strip() if date else ""# unreleased games have no release date
169 | priceDiv=content.find("div",class_="col search_price discounted responsive_secondrow")
170 |                 if priceDiv:# discounted game
171 | original_price=priceDiv.find("strike").string.strip()
172 | price=priceDiv.contents[-1].strip()
173 |                 else:# full-price game
174 | original_price= content.find("div",class_="col search_price responsive_secondrow").string.strip()
175 | price=original_price
176 | img_src = content.find("div",class_="col search_capsule").find('img').get("src")
177 | href = content.get("href")
178 | review_summary = content.find("span",class_="search_review_summary")
179 |                 review_summary = review_summary.attrs['data-tooltip-html'].strip() if review_summary else ""# unreleased games have no overall review summary
180 | result={
181 | "page":page,
182 | "name":name,
183 | "href":href,
184 | "date":date,
185 | "original_price":original_price,
186 | "price":price,
187 | "img_src":img_src,
188 | "review_summary":review_summary,
189 | }
190 | queue.add(result)
191 | except:
192 | print(content)
193 | queue.waitForStop()
194 | return badPages
195 |
196 | # MongoDB connection
197 | client = pymongo.MongoClient('mongodb://steam:steam@***.***.***.***:27017/steam_db')
198 | db = client.steam_db
199 | regions = db.China
200 | collection = regions.games
201 |
202 | # Session headers: the Cookie sets the language to Simplified Chinese and the birth date to 1987-01-01 (which unlocks age-restricted content)
203 | headers = {
204 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
205 | 'Cookie':'Steam_Language=schinese; birthtime=533750401'
206 | }
207 | session = requests.session()
208 |
209 | queue = threadQueue(100)
210 |
211 | totalPage = 2 # the store had 2,608 pages at crawl time (value left at 2 here)
212 | badPages = fetch(range(1, totalPage))
213 | if badPages: # re-crawl the failed pages
214 | badPages = fetch(badPages)
215 | print("all finished")
216 | if badPages:
217 | print("badPages:",badPages)
218 |
--------------------------------------------------------------------------------
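Note: the comments inside fetchReviewsChart in steam-games-multithread-queue.py describe the shape of the review histogram (rollup_type, rollups, recent, and per-bucket recommendations_up / recommendations_down). A minimal sketch of collapsing that structure into overall counts is shown below; the field names come from those comments, and a real results payload may differ, so treat it as an illustration rather than the project's processing code.

def summarize_reviews_chart(results):
    # Hedged sketch: total the up/down votes in the coarse "rollups" buckets described in fetchReviewsChart
    rollups = results.get("rollups", [])
    up = sum(b.get("recommendations_up", 0) for b in rollups)
    down = sum(b.get("recommendations_down", 0) for b in rollups)
    # results.get("recent", []) is assumed to hold the same counters in one-day buckets for the latest period
    return {"rollup_type": results.get("rollup_type"), "up": up, "down": down}

# Example with a hypothetical one-bucket payload:
# summarize_reviews_chart({"rollup_type": "week",
#                          "rollups": [{"date": 1572739200, "recommendations_up": 120, "recommendations_down": 7}]})
# -> {"rollup_type": "week", "up": 120, "down": 7}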
/spider/spark-streaming/steam-hotN.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import pymongo
4 |
5 | # Crawl the current China-region Steam top-sellers list and write it to MongoDB
6 | def write(item):
7 |     try:
8 |         collection.insert_one(item)
9 |     except Exception as e: # bind the exception so it can actually be printed
10 |         print(e)
11 |
12 | def fetch(pageRange):
13 | badPages = []
14 | badItems = []
15 |     page = 1 # one request per search-results page
16 | for page in pageRange:
17 |         try: # on network failure, record the bad page and skip it
18 | url = "https://store.steampowered.com/search/?filter=globaltopsellers&page=" + str(page) + "&os=win"
19 |             # the Cookie sets the language to Simplified Chinese
20 | headers = {
21 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
22 | 'Cookie':'Steam_Language=schinese'
23 | }
24 | s = requests.session()
25 | res = s.get(url,headers=headers)
26 | except:
27 | badPages.append(page)
28 | continue
29 |
30 |         if res.status_code != requests.codes.ok:# request rejected: print the status code and skip this page
31 | print("page",page,":",res.status_code)
32 | badPages.append(page)
33 | continue
34 |
35 |         try: # an exception was seen here once; it never recurred and was probably caused by Steam's servers
36 | soup = BeautifulSoup(res.text,'lxml')
37 | contents = soup.find(id="search_resultsRows").find_all('a')
38 | except:
39 | print(soup)
40 | badPages.append(page)
41 |             continue # skip this page; without this the loop below reuses contents from the previous page (or fails on the first one)
42 | for content in contents:
43 | try:
44 | name = content.find(class_="title").get_text().strip()
45 | date = content.find("div",class_="col search_released responsive_secondrow").string
46 |                 date = date.strip() if date else ""# unreleased games have no release date
47 | priceDiv=content.find("div",class_="col search_price discounted responsive_secondrow")
48 |                 if priceDiv:# discounted game
49 | original_price=priceDiv.find("strike").string.strip()
50 | price=priceDiv.contents[-1].strip()
51 |                 else:# full-price game
52 | original_price= content.find("div",class_="col search_price responsive_secondrow").string.strip()
53 | price=original_price
54 | img_src = content.find("div",class_="col search_capsule").find('img').get("src")
55 | href = content.get("href")
56 | review_summary = content.find("span",class_="search_review_summary")
57 |                 review_summary = review_summary.attrs['data-tooltip-html'].strip() if review_summary else ""# unreleased games have no overall review summary
58 | result={
59 | "page":page,
60 | "name":name,
61 | "href":href,
62 | "date":date,
63 | "original_price":original_price,
64 | "price":price,
65 | "img_src":img_src,
66 | "review_summary":review_summary,
67 | }
68 | # print(result)
69 | write(result)
70 | except:
71 | print(content)
72 | badItems.append(content)
73 | if page%10==0:
74 |             print(page,"/",totalPage,"finished")# print progress every 10 pages
75 | print("badItems:",badItems)
76 | return badPages
77 |
78 | client = pymongo.MongoClient('mongodb://steam:steam@***.***.***.***:27017/steam_db')
79 | db = client.steam_db
80 | regions = db.China
81 | collection = regions.hot
82 |
83 | totalPage = 593 # there were 593 pages at crawl time
84 | badPages = fetch(range(1, totalPage))
85 | if badPages: # re-crawl the failed pages
86 | badPages = fetch(badPages)
87 | print("all finished")
88 | if badPages:
89 | print("badPages:",badPages)
90 |
--------------------------------------------------------------------------------
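Note: all three spiders write into sub-collections of China inside the steam_db database (reviews, games, hot). A minimal pymongo read-back sketch for spot-checking what was stored is shown below; the connection string mirrors the masked placeholder used in the scripts and must be replaced with a real host before it will connect.

import pymongo

# Placeholder URI, matching the masked credentials used by the spiders above
client = pymongo.MongoClient('mongodb://steam:steam@***.***.***.***:27017/steam_db')
hot = client.steam_db.China.hot  # same dotted collection the top-sellers spider writes to

print("documents stored:", hot.count_documents({}))
for doc in hot.find({}, {"name": 1, "price": 1, "review_summary": 1, "_id": 0}).limit(5):
    print(doc)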