├── .gitignore ├── README.md ├── data ├── China.games.json └── China.reviews_official.json ├── pic ├── 好评数量.png ├── 数据格式.png ├── 数据流.png ├── 游戏动力对比.png ├── 游戏用户关系图.png ├── 游戏用户关系图2.png ├── 游玩时长分析.png ├── 用户游戏推荐.png ├── 用户社群聚合图.png ├── 用户社群聚合图2.png ├── 用户社群聚合图3.png ├── 评论情感分析.png ├── 评论能力对比.png └── 词云图.png ├── ppt ├── GraphX_报告.pdf ├── MLlib_报告.pdf └── Streaming_报告.pdf ├── scripts ├── generate_kaggle_log.py ├── generate_log.py ├── read_mongodb.py ├── steam-recommend.py └── write_log.sh ├── spark-graphx ├── pom.xml └── src │ └── main │ ├── resources │ ├── follows.txt │ ├── log4j.properties │ ├── output │ │ ├── graph.gexf │ │ ├── graphWeapon.gexf │ │ ├── isolate.txt │ │ └── minDegrees.gexf │ ├── steam │ │ ├── hours_3_10W.gexf │ │ ├── hours_5_20W.gexf │ │ ├── hours_6_30W.gexf │ │ ├── hours_7_30W.gexf │ │ ├── steam_3_10W.gexf │ │ ├── steam_5_20W.gexf │ │ ├── steam_6_30W.gexf │ │ └── steam_7_30W.gexf │ └── user.txt │ └── scala │ └── cn │ └── edu │ └── nju │ ├── GraphExample.scala │ ├── GraphExample2.scala │ ├── GraphExample3.scala │ ├── GraphProcess.scala │ ├── GraphProcessTest.scala │ └── MongoDBProcess.scala ├── spark-mllib ├── pom.xml └── src │ └── main │ ├── resources │ ├── game_content.txt │ ├── neg.txt │ ├── pos.txt │ └── recommend_validate │ └── scala │ └── cn │ └── edu │ └── nju │ ├── DataProcessing.scala │ ├── EmotionAnalysis.scala │ └── SteamGameRecommendation.scala ├── spark-streaming ├── pom.xml └── src │ └── main │ ├── java │ └── cn │ │ └── edu │ │ └── nju │ │ ├── ApiReturnUtil.java │ │ ├── DemoMessageController.java │ │ ├── SteamserverdemoApplication.java │ │ ├── Test.java │ │ ├── TimeFieldObject.java │ │ ├── WebSocketConfig.java │ │ ├── WebSocketServer.java │ │ ├── api │ │ ├── ApiReturnObject.java │ │ └── TagReturnObject.java │ │ ├── domain │ │ ├── GameObject.java │ │ └── TagObject.java │ │ ├── encoder │ │ ├── ApiObjectEncoder.java │ │ └── TagObjectEncoder.java │ │ └── utils │ │ ├── DbPool.java │ │ ├── HBaseUtils.java │ │ └── Test.java │ ├── resources │ ├── RollupCSV │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000-4a85a460-3bdb-48ad-bc81-9b68057082a8-c000.csv.crc │ │ └── _SUCCESS │ ├── game.json │ ├── gameAll.json │ ├── gameDetail.json │ ├── log4j.properties │ ├── test.txt │ └── update.sql │ └── scala │ └── cn │ └── edu │ └── nju │ ├── BatchProcess.scala │ ├── HDFSProcess.scala │ ├── KafkaProcess.scala │ ├── MongoDBProcess.scala │ ├── MySQLProcess.scala │ ├── SteamProcess.scala │ ├── dao │ ├── CourseClickCountDAO.scala │ ├── CourseSearchClickCountDAO.scala │ ├── RollUpDAO.scala │ └── TagDAO.scala │ ├── domain │ ├── ClickLog.scala │ ├── CommentLog.scala │ ├── CourseClickCount.scala │ ├── CourseSearchClickCount.scala │ ├── DouBanLog.scala │ ├── GameDetail.scala │ ├── ReviewsChart.scala │ ├── RollUp.scala │ ├── SteamLog.scala │ ├── Tag.scala │ └── UserData.scala │ ├── test │ ├── DateTest.scala │ ├── HDFSProcessTest.scala │ ├── JsonTest.scala │ ├── StatStreaming.scala │ └── TransformTest.scala │ └── utils │ ├── DateUtils.scala │ └── MySQLUtils.scala └── spider ├── spark-graphx ├── steam-reviews-official.py └── steam-reviews.py └── spark-streaming ├── steam-games-multithread-queue.py └── steam-hotN.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, 
see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | 25 | *.iml 26 | *.idea 27 | target 28 | *.csv 29 | model 30 | 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 云计算大作业 2 | 3 | ## 1. 云计算作业介绍 4 | 5 | - **本人只负责逻辑层面的业务处理,所以代码大部分只负责到处理数据后落地,前端展示基本使用ECharts** 6 | - **详细流程参考PPT文件下三次汇报PPT** 7 | - **单机业务流程就可以跑通,集群搭建步骤请参考:[集群搭建](https://github.com/Thpffcj/BigData-Getting-Started/blob/master/%E9%9B%86%E7%BE%A4%E6%90%AD%E5%BB%BA.md)** 8 | - 实践作业分为三个部分:Spark Streaming 计算模拟、Spark GraphX 计算和基于 Spark MLlib 的计算 9 | - Spark Streaming 10 | - 要求针对DStream数据开展的计算中至少使用到5个Transformation操作,可以 11 | 是多个业务问题;必须使用到至少1个全局统计的量;结果展示不少于2类图示。Streaming 程序监听的必须是HDFS文件夹。原始数据存储在MongoDB中,模拟数据流时,从MongoDB 中读取数据,写入HDFS中 12 | - Spark GraphX 13 | - 要求必须使用边和点的RDD构造图;用于业务计算的图中不少于1 万个点和1万条边;对于图的计算使用不少于6个GraphX的API调用,可以是解决多个业务问题; 至少使用1次聚合操作或者关联操作;结果展示不少于2类图示。从MongoDB中读取图数据,结果存回 MongoDB中 14 | - Spark MLlib 15 | - 展示不仅包括实验结果,还需包括数据的相关分析 16 | 17 | *** 18 | 19 | ## 2. spark-streaming:Steam数据爬取和流模拟 20 | 21 | ### 1. 研究问题 22 | 23 | - 截至目前那些游戏最火爆 24 | - 玩家游戏时长的分布 25 | - 哪些类型的游戏最受欢迎 26 | 27 | ### 2. 数据 28 | 29 | - /data/China.games.json 30 | 31 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E6%95%B0%E6%8D%AE%E6%A0%BC%E5%BC%8F.png) 32 | 33 | - /data/steam.csv 34 | 35 | 36 | userId,gameName,behavior,duration,none 37 | 151603712,"The Elder Scrolls V Skyrim",purchase,1.0,0 38 | 151603712,"The Elder Scrolls V Skyrim",play,273.0,0 39 | 151603712,"Fallout 4",purchase,1.0,0 40 | 151603712,"Fallout 4",play,87.0,0 41 | 151603712,"Spore",purchase,1.0,0 42 | 151603712,"Spore",play,14.9,0 43 | 151603712,"Fallout New Vegas",purchase,1.0,0 44 | 151603712,"Fallout New Vegas",play,12.1,0 45 | 151603712,"Left 4 Dead 2",purchase,1.0,0 46 | 151603712,"Left 4 Dead 2",play,8.9,0 47 | 48 | 49 | ### 3. 数据流 50 | 51 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E6%95%B0%E6%8D%AE%E6%B5%81.png) 52 | 53 | ### 4. 展示效果 54 | 55 | - 游戏销量动态排名图 56 | 57 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E5%A5%BD%E8%AF%84%E6%95%B0%E9%87%8F.png) 58 | 59 | - 动态词云图 60 | 61 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E8%AF%8D%E4%BA%91%E5%9B%BE.png) 62 | 63 | - 游玩时长分布图 64 | 65 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E6%B8%B8%E7%8E%A9%E6%97%B6%E9%95%BF%E5%88%86%E6%9E%90.png) 66 | 67 | *** 68 | 69 | ## 3. spark-graphx 70 | 71 | ### 1. 研究问题 72 | 73 | - 游戏的口碑和热度 74 | - 用户社群 75 | - 游戏对市场的占有力和用户粘性 76 | - 游戏间的竞争关系 77 | 78 | **相关指标** 79 | 80 | - 游戏评论 81 | - 玩家评论游戏数 82 | - 游戏所受评论数 83 | - 玩家游戏时长 84 | 85 | ### 2. 
展示效果 86 | 87 | - 游戏用户关系图 88 | 89 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E6%B8%B8%E6%88%8F%E7%94%A8%E6%88%B7%E5%85%B3%E7%B3%BB%E5%9B%BE.png) 90 | 91 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E6%B8%B8%E6%88%8F%E7%94%A8%E6%88%B7%E5%85%B3%E7%B3%BB%E5%9B%BE2.png) 92 | 93 | - 用户社群聚合图 94 | 95 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E7%94%A8%E6%88%B7%E7%A4%BE%E7%BE%A4%E8%81%9A%E5%90%88%E5%9B%BE.png) 96 | 97 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E7%94%A8%E6%88%B7%E7%A4%BE%E7%BE%A4%E8%81%9A%E5%90%88%E5%9B%BE2.png) 98 | 99 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E7%94%A8%E6%88%B7%E7%A4%BE%E7%BE%A4%E8%81%9A%E5%90%88%E5%9B%BE3.png) 100 | 101 | *** 102 | 103 | ## 4. spark-mllib 104 | 105 | ### 1. 研究问题 106 | 107 | - 玩家间有哪些社群? 108 | - 各社群特点? 109 | - 可能感兴趣的游戏? 110 | 111 | ### 2. 用户游戏推荐 112 | 113 | - 我们想利用某个兴趣相投、拥有共同经验群体的喜好来推荐感兴趣的游戏给玩家 114 | - 协同过滤技术旨在补充用户 - 商品关联矩阵中所缺失的部分 115 | - 我们并没有直观的用户对游戏的评分,于是用户的游玩时长代替用户对游戏的评价,为了消除游戏本身游玩时长的影响,我们将每款游戏的游玩时长映射到0 – 10之间代替用户对该游戏的评分 116 | 117 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E7%94%A8%E6%88%B7%E6%B8%B8%E6%88%8F%E6%8E%A8%E8%8D%90.png) 118 | 119 | ### 3. 评论情感分析 120 | 121 | - 用户对游戏的评论通常有一个标签:推荐/不推荐 122 | - 我们想通过对评论的情感分析,判断一条评论是推荐这个游戏还是不推荐 123 | 124 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E8%AF%84%E8%AE%BA%E6%83%85%E6%84%9F%E5%88%86%E6%9E%90.png) 125 | 126 | ### 4. 社群聚类分析 127 | 128 | - 聚类特征 129 | - 玩家拥有游戏数 130 | - 玩家总评论数 131 | - 游玩时长 132 | - 是否推荐游戏 133 | - 被认为评论有用 134 | - 被认为评论欢乐数 135 | - 被回复数 136 | 137 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E6%B8%B8%E6%88%8F%E5%8A%A8%E5%8A%9B%E5%AF%B9%E6%AF%94.png) 138 | 139 | ![](https://raw.githubusercontent.com/Thpffcj/cloud-computing/master/pic/%E8%AF%84%E8%AE%BA%E8%83%BD%E5%8A%9B%E5%AF%B9%E6%AF%94.png) 140 | 141 | *** 142 | 143 | ## 5. 
云计算作业介绍 144 | 145 | - 由于本人只负责Spark计算的过程,所以每部分代码可能都不是完整的业务流程,主要记录学习Spark过程 146 | 147 | 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /pic/好评数量.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/好评数量.png -------------------------------------------------------------------------------- /pic/数据格式.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/数据格式.png -------------------------------------------------------------------------------- /pic/数据流.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/数据流.png -------------------------------------------------------------------------------- /pic/游戏动力对比.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/游戏动力对比.png -------------------------------------------------------------------------------- /pic/游戏用户关系图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/游戏用户关系图.png -------------------------------------------------------------------------------- /pic/游戏用户关系图2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/游戏用户关系图2.png -------------------------------------------------------------------------------- /pic/游玩时长分析.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/游玩时长分析.png -------------------------------------------------------------------------------- /pic/用户游戏推荐.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/用户游戏推荐.png -------------------------------------------------------------------------------- /pic/用户社群聚合图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/用户社群聚合图.png -------------------------------------------------------------------------------- /pic/用户社群聚合图2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/用户社群聚合图2.png -------------------------------------------------------------------------------- /pic/用户社群聚合图3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/用户社群聚合图3.png -------------------------------------------------------------------------------- /pic/评论情感分析.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/评论情感分析.png -------------------------------------------------------------------------------- /pic/评论能力对比.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/评论能力对比.png -------------------------------------------------------------------------------- /pic/词云图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/pic/词云图.png -------------------------------------------------------------------------------- /ppt/GraphX_报告.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/ppt/GraphX_报告.pdf -------------------------------------------------------------------------------- /ppt/MLlib_报告.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/ppt/MLlib_报告.pdf -------------------------------------------------------------------------------- /ppt/Streaming_报告.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/ppt/Streaming_报告.pdf -------------------------------------------------------------------------------- /scripts/generate_kaggle_log.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # Created by thpffcj on 2019/10/19. 3 | 4 | import time 5 | import pandas as pd 6 | 7 | pd.set_option('display.max_columns', 40) 8 | pd.set_option('display.width', 1000) 9 | 10 | 11 | def generate_log(count=200000): 12 | 13 | data = pd.read_csv("/Users/thpffcj/Public/file/steam.csv") 14 | f = open("/Users/thpffcj/Public/file/user_data.log", "a") 15 | 16 | flag = 0 17 | position = 0 18 | while count >= 1: 19 | log = data.loc[position:position] 20 | query_log = "{user_id}\t{game_name}\t{behavior}\t{duration}".format( 21 | user_id=log["userId"].values.max(), game_name=log["gameName"].values.max(), 22 | behavior=log["behavior"].values.max(), duration=log["duration"].values.max()) 23 | 24 | f.write(query_log + "\n") 25 | print(query_log) 26 | 27 | if flag % 500 == 0: 28 | time.sleep(5) 29 | 30 | count = count - 1 31 | position = position + 1 32 | 33 | 34 | if __name__ == '__main__': 35 | generate_log() 36 | -------------------------------------------------------------------------------- /scripts/generate_log.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # Created by thpffcj on 2019/10/2. 
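#
# Simulates the data source for the Spark Streaming job described in the README:
# reads game documents from the remote MongoDB collection China.games, formats
# each one into a tab-separated log line, accumulates batches of `count` lines
# in a local test.log file, and then runs write_log.sh to push each batch into
# the HDFS directory the Streaming program is watching.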
3 | 4 | import pymongo 5 | import time 6 | import os 7 | 8 | 9 | # 连接数据库 10 | client = pymongo.MongoClient("***.***.***.***", 27017) 11 | 12 | db = client['steam_db'] 13 | db.authenticate("steam", "steam") 14 | 15 | table = db['China.games'] 16 | 17 | data = table.find().limit(1000) 18 | print("数据加载完成...") 19 | # 65175 20 | print(data.count()) 21 | 22 | 23 | def generate_log(count=200): 24 | print("进入方法...") 25 | flag = 0 26 | steam_log = "" 27 | for game_data in data: 28 | query_log = "{img_src}\t{game_detail}\t{original_price}\t{price}\t{review_summary}\t{date}\t{name}".format( 29 | img_src=game_data["img_src"], 30 | game_detail=str(game_data["game_detail"]), 31 | original_price=game_data["original_price"], 32 | price=game_data["price"], 33 | review_summary=game_data["review_summary"], 34 | date=game_data["date"], 35 | name=game_data["name"]) 36 | 37 | steam_log = steam_log + query_log + "\n" 38 | flag = flag + 1 39 | if flag % 200 == 0: 40 | print("flag:" + str(flag)) 41 | 42 | if flag == count: 43 | print("写日志...") 44 | f = open("/Users/thpffcj/Public/local-repository/Python-Learning/cloud-computing/utils/test.log", "w") 45 | f.write(steam_log) 46 | time.sleep(2) 47 | 48 | # 上传 49 | print("上传日志...") 50 | os.system("./write_log.sh") 51 | 52 | flag = 0 53 | steam_log = "" 54 | f.close() 55 | time.sleep(3) 56 | 57 | print("结束...") 58 | 59 | 60 | def write_log(): 61 | print("进入方法...") 62 | flag = 0 63 | f = open("/Users/thpffcj/Public/local-repository/Python-Learning/cloud-computing/utils/test.log", "a") 64 | for game_data in data: 65 | query_log = "{img_src}\t{game_detail}\t{original_price}\t{price}\t{review_summary}\t{date}\t{name}".format( 66 | img_src=game_data["img_src"], 67 | game_detail=str(game_data["game_detail"]), 68 | original_price=game_data["original_price"], 69 | price=game_data["price"], 70 | review_summary=game_data["review_summary"], 71 | date=game_data["date"], 72 | name=game_data["name"]) 73 | 74 | flag = flag + 1 75 | if flag % 200 == 0: 76 | print("flag:" + str(flag)) 77 | 78 | f.write(query_log + "\n") 79 | 80 | f.close() 81 | print("结束...") 82 | 83 | 84 | def clean(): 85 | f = open("/Users/thpffcj/Public/local-repository/Python-Learning/cloud-computing/utils/test.log", "w") 86 | f.write("") 87 | f.close() 88 | 89 | 90 | if __name__ == '__main__': 91 | generate_log() 92 | -------------------------------------------------------------------------------- /scripts/read_mongodb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # Created by thpffcj on 2019/10/30. 
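#
# Copies up to 300,000 review documents from the remote MongoDB collection
# China.reviews_official into a local collection China.reviews_official_30W,
# which is the input that the GraphX job (GraphProcess.scala) reads from.
# Note: collection.insert() is deprecated in PyMongo 3.x and removed in 4.x;
# collection.insert_many() is the usual replacement.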
3 | 4 | import pymongo 5 | 6 | 7 | # 连接数据库 8 | client1 = pymongo.MongoClient("***.***.***.***", 27017) 9 | 10 | db1 = client1['steam_db'] 11 | db1.authenticate("steam", "steam") 12 | 13 | table = db1['China.reviews_official'] 14 | 15 | data = table.find().limit(300000) 16 | print("数据加载完成...") 17 | # 65175 18 | # for d in data: 19 | # print(d["game"]) 20 | 21 | 22 | # Python写MongoDB 23 | client2 = pymongo.MongoClient("127.0.0.1", 27017) 24 | # 库名inventory 25 | db2 = client2['test'] 26 | # 集合名items 27 | collection = db2['China.reviews_official_30W'] 28 | 29 | # 插入一个文档,item是一个字典{} 30 | collection.insert(data) -------------------------------------------------------------------------------- /scripts/steam-recommend.py: -------------------------------------------------------------------------------- 1 | from pyspark.ml.clustering import KMeans 2 | from pyspark.ml.evaluation import ClusteringEvaluator 3 | from pyspark.sql import SparkSession 4 | from pyspark.ml.linalg import Vectors 5 | import pymongo 6 | 7 | # db.addUser("steam",{roles:[ {role:"root",db:"steam_db"} ]}) 8 | #mongodb连接 9 | client = pymongo.MongoClient('mongodb://steam:steam@***.***.***.***:27017/steam_db') 10 | db = client.steam_db 11 | regions = db.China 12 | test_collection = regions.test_collection 13 | train_collection = regions.train_collection 14 | print(train_collection.find()[0]) 15 | 16 | # kmeans_path = "./kmeans" 17 | model_path = "./kmeans_model" 18 | 19 | def getData(collection): 20 | return map(lambda r: (Vectors.dense([r["author"]["num_games_owned"], 21 | r["author"]["num_reviews"], 22 | r["author"]["playtime_forever"], 23 | # r["review"], 24 | r["voted_up"], 25 | r["votes_up"], 26 | r["votes_funny"], 27 | r["comment_count"]]),), collection.find()) 28 | 29 | spark = SparkSession\ 30 | .builder\ 31 | .appName("KMeansExample")\ 32 | .getOrCreate() 33 | train_data = getData(train_collection) 34 | test_data = getData(test_collection) 35 | 36 | # Loads data. 37 | # dataset = spark.read.format("libsvm").load("sample_kmeans_data.txt") 38 | # data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),), 39 | # (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)] 40 | # data = [(Vectors.dense([0.0, 0.0, 0.0]),), (Vectors.dense([0.1, 0.1, 0.1]),), (Vectors.dense([0.2, 0.2, 0.2]),), 41 | # (Vectors.dense([9.0, 9.0, 9.0]),), (Vectors.dense([9.1, 9.1, 9.1]),), (Vectors.dense([9.2, 9.2, 9.2]),)] 42 | train_dataset = spark.createDataFrame(train_data, ["features"]) 43 | test_dataset = spark.createDataFrame(test_data, ["features"]) 44 | 45 | # Trains a k-means model. 46 | kmeans = KMeans().setK(5).setSeed(1) 47 | # kmeans = KMeans.load(kmeans_path) 48 | model = kmeans.fit(train_dataset) 49 | # model = KMeansModel.load(model_path) 50 | clusterSizes = model.summary.clusterSizes 51 | print(clusterSizes) 52 | 53 | # print("点(-3 -3)所属族:" + model.predict((Vectors.dense([1, 1, 1, 1, 1, 1, 1]),))) 54 | 55 | # Make predictions 56 | predictions = model.transform(test_dataset) 57 | # print(predictions.collect()) 58 | 59 | # Evaluate clustering by computing Silhouette score 60 | evaluator = ClusteringEvaluator() 61 | 62 | silhouette = evaluator.evaluate(predictions) #轮廓系数 silhouette coefficient 0.8758693672037696 63 | print("Silhouette with squared euclidean distance = " + str(silhouette)) 64 | 65 | # Shows the result. 
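# Each cluster center is a vector in the original (unscaled) feature order used
# above: num_games_owned, num_reviews, playtime_forever, voted_up, votes_up,
# votes_funny, comment_count. Below, every center is written to the
# kmeans_centers collection together with the size of its cluster.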
66 | centers = model.clusterCenters() 67 | print("Cluster Centers: ") 68 | for center in centers: 69 | print(center) 70 | 71 | # kmeans.save(kmeans_path) 72 | # model.save(model_path) 73 | 74 | kmeans_centers = regions.kmeans_centers 75 | kmeans_centers.drop() 76 | i = 0 77 | for center in centers: 78 | json = { 79 | "num_games_owned":center[0], 80 | "num_reviews":center[1], 81 | "playtime_forever":center[2], 82 | "voted_up":center[3], 83 | "votes_up":center[4], 84 | "votes_funny":center[5], 85 | "comment_count":center[6], 86 | "num_of_reviews":clusterSizes[i], 87 | } 88 | i+=1 89 | kmeans_centers.insert_one(json) 90 | 91 | spark.stop() -------------------------------------------------------------------------------- /scripts/write_log.sh: -------------------------------------------------------------------------------- 1 | # HDFS命令 2 | HDFS="hadoop fs" 3 | 4 | # Streaming监听的文件目录,要与Streaming程序中保持一致 5 | streaming_dir="/cloud-computing" 6 | 7 | # 清空旧数据 8 | $HDFS -rm "${streaming_dir}"'/tmp/*'>/dev/null 2>&1 9 | $HDFS -rm "${streaming_dir}"'/*'>/dev/null 2>&1 10 | $HDFS -mkdir ${streaming_dir}/tmp 11 | 12 | # 生成日志 13 | 14 | # 加上时间戳,防止重名 15 | templog="access.`date +'%s'`.log" 16 | # 先将日志放到临时目录,再移动到Streaming监听目录,确保原子性 17 | $HDFS -put test.log ${streaming_dir}/tmp/$templog 18 | $HDFS -mv ${streaming_dir}/tmp/$templog ${streaming_dir}/ -------------------------------------------------------------------------------- /spark-graphx/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 4.0.0 6 | 7 | cn.edu.nju 8 | mf1932063 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 2.11.8 13 | 2.2.0 14 | 2.6.0-cdh5.16.2 15 | 1.8 16 | 1.8 17 | 18 | 19 | 20 | 21 | 22 | cloudera 23 | https://repository.cloudera.com/artifactory/cloudera-repos 24 | 25 | 26 | central 27 | aliyun maven 28 | http://maven.aliyun.com/nexus/content/groups/public/ 29 | default 30 | 31 | 32 | 33 | 34 | 35 | org.projectlombok 36 | lombok 37 | 1.16.18 38 | 39 | 40 | 41 | org.scala-lang 42 | scala-library 43 | ${scala.version} 44 | 45 | 46 | 47 | org.mongodb.spark 48 | mongo-spark-connector_2.11 49 | ${spark.version} 50 | 51 | 52 | 53 | org.apache.spark 54 | spark-graphx_2.11 55 | ${spark.version} 56 | 57 | 58 | 59 | org.apache.commons 60 | commons-lang3 61 | 3.5 62 | 63 | 64 | 65 | 66 | org.apache.spark 67 | spark-sql_2.11 68 | ${spark.version} 69 | 70 | 71 | 72 | com.fasterxml.jackson.module 73 | jackson-module-scala_2.11 74 | 2.6.5 75 | 76 | 77 | 78 | com.alibaba 79 | fastjson 80 | 1.2.47 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | net.jpountz.lz4 91 | lz4 92 | 1.3.0 93 | 94 | 95 | 96 | org.codehaus.janino 97 | janino 98 | 3.0.8 99 | 100 | 101 | 102 | mysql 103 | mysql-connector-java 104 | 5.1.38 105 | 106 | 107 | 108 | io.netty 109 | netty-all 110 | 4.0.42.Final 111 | 112 | 113 | 114 | 115 | 116 | 120 | 121 | 122 | org.scala-tools 123 | maven-scala-plugin 124 | 125 | 126 | 127 | compile 128 | testCompile 129 | 130 | 131 | 132 | 133 | ${scala.version} 134 | 135 | -target:jvm-1.8 136 | 137 | 138 | 139 | 140 | org.apache.maven.plugins 141 | maven-eclipse-plugin 142 | 143 | true 144 | 145 | ch.epfl.lamp.sdt.core.scalabuilder 146 | 147 | 148 | ch.epfl.lamp.sdt.core.scalanature 149 | 150 | 151 | org.eclipse.jdt.launching.JRE_CONTAINER 152 | ch.epfl.lamp.sdt.launching.SCALA_CONTAINER 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | org.scala-tools 162 | maven-scala-plugin 163 | 164 | ${scala.version} 165 | 166 | 167 | 168 | 169 | 170 | 
-------------------------------------------------------------------------------- /spark-graphx/src/main/resources/follows.txt: -------------------------------------------------------------------------------- 1 | 1 2 2 | 2 3 3 | 3 5 4 | 4 6 5 | 7 6 6 | 6 7 -------------------------------------------------------------------------------- /spark-graphx/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=INFO, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Set the default spark-shell log level to WARN. When running the spark-shell, the 9 | # log level for this class is used to overwrite the root logger's log level, so that 10 | # the user can have different defaults for the shell and regular Spark apps. 11 | log4j.logger.org.apache.spark.repl.Main=WARN 12 | 13 | # Settings to quiet third party logs that are too verbose 14 | log4j.logger.org.spark_project.jetty=WARN 15 | log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR 16 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 17 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 18 | log4j.logger.org.apache.parquet=ERROR 19 | log4j.logger.parquet=ERROR 20 | 21 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 22 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 23 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR -------------------------------------------------------------------------------- /spark-graphx/src/main/resources/output/graphWeapon.gexf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | -------------------------------------------------------------------------------- /spark-graphx/src/main/resources/output/isolate.txt: -------------------------------------------------------------------------------- 1 | CompactBuffer(15, 103) 2 | CompactBuffer(39, 189, 96, 81, 153, 156, 66, 138, 171, 36, 111, 57, 75, 45, 132, 63, 72, 90, 18, 12, 9, 183, 144, 159, 21, 120, 0, 42, 102, 186, 69, 123, 174, 147, 19, 34, 52, 151, 4, 16, 82, 130, 28, 79, 127, 64, 175, 37, 133, 154, 1, 70, 109, 10, 145, 100, 115, 160, 187, 178, 76, 112, 43, 169, 25, 166, 46, 73, 172, 124, 40, 163, 7, 58, 88, 119, 155, 71, 80, 107, 98, 125, 
65, 170, 14, 50, 35, 110, 161, 104, 146, 188, 17, 173, 20, 167, 122, 41, 47, 77, 95, 59, 128, 182, 62, 113, 86, 176, 26, 68) 3 | CompactBuffer(177, 168, 150, 180, 54, 105, 30, 24, 51, 108, 78, 99, 162, 84, 48, 117, 27, 93, 33, 126, 141, 6, 3, 135, 165, 60, 114, 87, 129, 13, 55, 121, 157, 106, 49, 94, 148, 61, 139, 184, 97, 22, 142, 181, 118, 67, 85, 136, 91, 31, 101, 137, 134, 158, 29, 11, 92, 152, 149, 140, 185, 74, 83, 89, 179, 38, 56, 53, 116, 131, 32, 23, 164, 143, 8, 44, 5, 2) -------------------------------------------------------------------------------- /spark-graphx/src/main/resources/output/minDegrees.gexf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | -------------------------------------------------------------------------------- /spark-graphx/src/main/resources/user.txt: -------------------------------------------------------------------------------- 1 | 1,Thpffcj1 2 | 2,Thpffcj2 3 | 3,Thpffcj3 4 | 4,Thpffcj4 5 | 5,Thpffcj5 6 | 6,Thpffcj6 7 | 7,Thpffcj7 -------------------------------------------------------------------------------- /spark-graphx/src/main/scala/cn/edu/nju/GraphExample.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import org.apache.spark.graphx.{Edge, Graph} 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | 7 | /** 8 | * Created by thpffcj on 2019/10/3. 
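 * A minimal GraphX example: builds a property graph from a vertex RDD and an
 * edge RDD, then counts the "postdoc" vertices and the edges with srcId < dstId.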
9 | */ 10 | object GraphExample { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | val conf = new SparkConf().setAppName("GraphTest").setMaster("local") 15 | val sc = new SparkContext(conf) 16 | 17 | // 构建顶点 返回的这个Long其实是VertexId类型,都是一样的 18 | val users: RDD[(Long, (String, String))] = 19 | sc.parallelize( 20 | Array((3L, ("rxin", "student")), 21 | (7L, ("jgonzal", "postdoc")), 22 | (5L, ("franklin", "prof")), 23 | (2L, ("istoica", "prof")))) 24 | 25 | // 构建边 (边有个独特的类Edge,某种程度讲代表的就是一些关系) 26 | val relationships: RDD[Edge[String]] = 27 | sc.parallelize( 28 | Array(Edge(3L, 7L, "collab"), 29 | Edge(5L, 3L, "advisor"), 30 | Edge(2L, 5L, "colleague"), 31 | Edge(5L, 7L, "pi"))) 32 | 33 | // 顶点和边,这样就构建了我们的图 34 | val graph = Graph(users, relationships) 35 | 36 | // .vertices获取到这个图中所有的顶点 37 | val count = graph.vertices.filter { 38 | case (id, (name, pos)) => { 39 | // 计算我们这个图中有多少个postdoc博士后 40 | pos == "postdoc" 41 | } 42 | }.count() 43 | 44 | // 1 45 | println(count) 46 | 47 | //.edges获取到这个图中所有的边,过滤出 源ID<目标ID 的数量 48 | val count1 = graph.edges.filter(e => e.srcId < e.dstId).count() 49 | 50 | // 3 51 | println(count1) 52 | 53 | sc.stop() 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /spark-graphx/src/main/scala/cn/edu/nju/GraphExample2.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import org.apache.spark.graphx.{Edge, Graph, VertexId, VertexRDD} 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | 7 | /** 8 | * Created by thpffcj on 2019/10/31. 9 | */ 10 | object GraphExample2 { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | // 设置运行环境 15 | val conf = new SparkConf().setAppName("GraphExample2").setMaster("local") 16 | val sc = new SparkContext(conf) 17 | 18 | // 设置顶点和边,注意顶点和边都是用元组定义的Array 19 | // 顶点的数据类型是VD:(String, Int) 20 | val vertexArray = Array( 21 | (1L, ("Alice", 28)), 22 | (2L, ("Bob", 27)), 23 | (3L, ("Charlie", 65)), 24 | (4L, ("David", 42)), 25 | (5L, ("Ed", 55)), 26 | (6L, ("Fran", 50)) 27 | ) 28 | 29 | // 边的数据类型ED:Int 30 | val edgeArray = Array( 31 | Edge(2L, 1L, 7), 32 | Edge(2L, 4L, 2), 33 | Edge(3L, 2L, 4), 34 | Edge(3L, 6L, 3), 35 | Edge(4L, 1L, 1), 36 | Edge(5L, 2L, 2), 37 | Edge(5L, 3L, 8), 38 | Edge(5L, 6L, 3) 39 | ) 40 | 41 | // 构造vertexRDD和edgeRDD 42 | val vertexRDD: RDD[(Long, (String, Int))] = sc.parallelize(vertexArray) 43 | val edgeRDD: RDD[Edge[Int]] = sc.parallelize(edgeArray) 44 | 45 | // 构造图Graph[VD,ED] 46 | val graph: Graph[(String, Int), Int] = Graph(vertexRDD, edgeRDD) 47 | 48 | // 找出图中年龄大于30的顶点 49 | graph.vertices.filter { case (id, (name, age)) => age > 30 }.collect.foreach { 50 | case (id, (name, age)) => println(s"$name is $age") 51 | } 52 | 53 | // 边操作:找出图中属性大于5的边 54 | graph.edges.filter(e => e.attr > 5) 55 | .collect.foreach(e => println(s"${e.srcId} to ${e.dstId} att ${e.attr}")) 56 | 57 | // triplets操作,((srcId, srcAttr), (dstId, dstAttr), attr) 58 | // 列出边属性 >5 的tripltes 59 | for (triplet <- graph.triplets.filter(t => t.attr > 5).collect) { 60 | println(s"${triplet.srcAttr._1} likes ${triplet.dstAttr._1}") 61 | } 62 | 63 | // Degrees操作 64 | // 找出图中最大的出度、入度、度数 65 | def max(a: (VertexId, Int), b: (VertexId, Int)): (VertexId, Int) = { 66 | if (a._2 > b._2) a else b 67 | } 68 | 69 | println("max of outDegrees:" + graph.outDegrees.reduce(max) + " max of inDegrees:" + graph.inDegrees.reduce(max) + " max of Degrees:" + graph.degrees.reduce(max)) 70 | 71 | // 转换操作 72 | // 
顶点的转换操作,顶点age + 10 73 | graph.mapVertices { case (id, (name, age)) => (id, (name, age + 10)) } 74 | .vertices.collect.foreach(v => println(s"${v._2._1} is ${v._2._2}")) 75 | 76 | // 边的转换操作,边的属性*2 77 | graph.mapEdges(e => e.attr * 2) 78 | .edges.collect.foreach(e => println(s"${e.srcId} to ${e.dstId} att ${e.attr}")) 79 | 80 | // 顶点年纪>30的子图 81 | val subGraph = graph.subgraph(vpred = (id, vd) => vd._2 >= 30) 82 | subGraph.vertices.collect.foreach(v => println(s"${v._2._1} is ${v._2._2}")) 83 | 84 | // 子图所有边 85 | subGraph.edges.collect.foreach(e => println(s"${e.srcId} to ${e.dstId} att ${e.attr}")) 86 | 87 | // 连接操作 88 | val inDegrees: VertexRDD[Int] = graph.inDegrees 89 | case class User(name: String, age: Int, inDeg: Int, outDeg: Int) 90 | 91 | // 创建一个新图,顶点VD的数据类型为User,并从graph做类型转换 92 | val initialUserGraph: Graph[User, Int] = graph.mapVertices { 93 | case (id, (name, age)) => User(name, age, 0, 0)} 94 | 95 | // initialUserGraph与inDegrees、outDegrees(RDD)进行连接,并修改initialUserGraph中inDeg值、outDeg值 96 | val userGraph = initialUserGraph.outerJoinVertices(initialUserGraph.inDegrees) { 97 | case (id, u, inDegOpt) => User(u.name, u.age, inDegOpt.getOrElse(0), u.outDeg) 98 | }.outerJoinVertices(initialUserGraph.outDegrees) { 99 | case (id, u, outDegOpt) => User(u.name, u.age, u.inDeg,outDegOpt.getOrElse(0)) 100 | } 101 | 102 | // 连接图的属性 103 | userGraph.vertices.collect.foreach(v => println(s"${v._2.name} inDeg: ${v._2.inDeg} outDeg: ${v._2.outDeg}")) 104 | 105 | // 出度和入读相同的人员 106 | userGraph.vertices.filter { 107 | case (id, u) => u.inDeg == u.outDeg 108 | }.collect.foreach { 109 | case (id, property) => println(property.name) 110 | } 111 | 112 | // 聚合操作 113 | // 找出年纪最大的追求者 114 | // val oldestFollower: VertexRDD[(String, Int)] = userGraph.mapReduceTriplets[(String, Int)]( 115 | // 116 | // // 将源顶点的属性发送给目标顶点,map过程 117 | // edge => Iterator((edge.dstId, (edge.srcAttr.name, edge.srcAttr.age))), 118 | // 119 | // // 得到最大追求者,reduce过程 120 | // (a, b) => if (a._2 > b._2) a else b 121 | // ) 122 | 123 | // userGraph.vertices.leftJoin(oldestFollower) { (id, user, optOldestFollower) => 124 | // optOldestFollower match { 125 | // case None => s"${user.name} does not have any followers." 126 | // case Some((name, age)) => s"${name} is the oldest follower of ${user.name}." 127 | // } 128 | // }.collect.foreach { case (id, str) => println(str)} 129 | 130 | 131 | } 132 | 133 | 134 | } 135 | -------------------------------------------------------------------------------- /spark-graphx/src/main/scala/cn/edu/nju/GraphExample3.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import org.apache.spark.graphx.GraphLoader 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | 6 | /** 7 | * Created by thpffcj on 2019/10/3. 
8 | * 9 | * 图计算官网案例示范 10 | * 主要解决项目中遇到的 把同一个用户识别出来,如果是同一个用户就合并到一起 11 | */ 12 | object GraphExample3 { 13 | 14 | def main(args: Array[String]): Unit = { 15 | 16 | // graphx 基于RDD 17 | val conf = new SparkConf().setMaster("local").setAppName("ConnectedComponentsExample") 18 | val sc = new SparkContext(conf) 19 | 20 | // 构建出来图有多种方式 21 | val graph = GraphLoader.edgeListFile(sc, "src/main/resources/follows.txt") 22 | /** 23 | * 就是把所有的数字作为key,value都写为1 24 | * (4,1) 25 | * (1,1) 26 | * (6,1) 27 | * (3,1) 28 | * (7,1) 29 | * (5,1) 30 | * (2,1) 31 | */ 32 | graph.vertices.foreach(println(_)) 33 | 34 | /** 35 | * .connectedComponents()计算每个顶点的连接组件成员,并返回带有顶点的图形 36 | * 包含该顶点的连通组件中包含最低顶点id的值。 37 | */ 38 | val cc = graph.connectedComponents().vertices 39 | /** 40 | * (4,4) 41 | * (1,1) 42 | * (6,4) 43 | * (3,1) 44 | * (7,4) 45 | * (5,1) 46 | * (2,1) 47 | */ 48 | cc.foreach(println(_)) 49 | 50 | val users = sc.textFile("src/main/resources/user.txt").map(line => { 51 | // 因为要join,所以要变成kv形式 52 | val fields = line.split(",") 53 | (fields(0).toLong, fields(1)) 54 | }) 55 | 56 | // (1,Thpffcj1) join (1,1) 57 | // (1,(Thpffcj1,1)) 代表的是同一个好友的那个id 58 | users.join(cc).map { 59 | case (id, (username, cclastid)) => (cclastid, username) 60 | }.reduceByKey((x: String, y: String) => x + "," + y) 61 | .foreach(tuple => { 62 | /** 63 | * Thpffcj4,Thpffcj6,Thpffcj7 64 | * Thpffcj1,Thpffcj3,Thpffcj5,Thpffcj2 65 | */ 66 | println(tuple._2) 67 | }) 68 | 69 | sc.stop() 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /spark-graphx/src/main/scala/cn/edu/nju/GraphProcess.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import java.io.PrintWriter 4 | import java.util 5 | import java.util.concurrent.ConcurrentHashMap 6 | 7 | import com.alibaba.fastjson.JSON 8 | import com.mongodb.spark.MongoSpark 9 | import org.apache.spark.{SparkConf, SparkContext} 10 | import org.apache.spark.graphx.{Edge, Graph, VertexId, VertexRDD} 11 | import org.apache.spark.rdd.RDD 12 | import org.apache.spark.sql.{DataFrame, SparkSession} 13 | import org.bson.Document 14 | 15 | import scala.util.Random 16 | 17 | /** 18 | * Created by thpffcj on 2019/11/2. 
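 * Builds a bipartite user–game graph from the Steam review documents stored in
 * MongoDB, derives a review-edge graph and a playtime-edge graph, exports both
 * as .gexf files and writes the surviving vertices back to MongoDB. The figures
 * listed below appear to be the sizes of the exported graphs.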
19 | * hours_3_10W 53 160 20 | * steam_3_10W 53 160 21 | * hours_5_20W 98 464 22 | * steam_5_20W 96 452 23 | * hours_7_30W 103 535 24 | * steam_7_30W 103 535 25 | * hours_6_30W 178 1060 26 | * steam_6_30W 175 1039 27 | * 28 | */ 29 | object GraphProcess { 30 | 31 | // 点集,根据用户id或游戏名找点id 32 | // user_76561198380840992 1L 33 | // game_CODE VEIN 2L 34 | val pointMap = new ConcurrentHashMap[String, Long]() 35 | 36 | // 评论边 37 | val edgeMap1 = new ConcurrentHashMap[(Long, Long), String]() 38 | 39 | // 时长边 40 | val edgeMap2 = new ConcurrentHashMap[(Long, Long), String]() 41 | 42 | // 点权重map,根据图中点id得到权重 43 | // 1L 10 44 | val weightMap = new ConcurrentHashMap[Long, Int]() 45 | 46 | def main(args: Array[String]): Unit = { 47 | 48 | val conf = new SparkConf().setMaster("local[4]").setAppName("GraphProcess") 49 | conf.set("spark.mongodb.input.uri", "mongodb://localhost:27017/test.China.reviews_official_30W") 50 | conf.set("spark.mongodb.input.partitioner", "MongoPaginateBySizePartitioner") 51 | conf.set("spark.mongodb.output.uri", "mongodb://localhost:27017/test.steam.graph_vertice") 52 | 53 | val spark = SparkSession.builder().config(conf).getOrCreate() 54 | 55 | val frame: DataFrame = MongoSpark.load(spark) 56 | 57 | var key = 0L 58 | 59 | // 需要collect到一个节点上,key递增 60 | frame.collect().foreach(row => { 61 | 62 | val gameArray = row.getAs("game").toString.split(",") 63 | var game = gameArray(0) 64 | game = game.substring(1, game.length) 65 | // 过滤前端无法识别非法字符,比如表情等 66 | val gameNamePatterns = "[^\\u4e00-\\u9fa5a-zA-Z0-9 ]".r 67 | // 游戏名称 68 | game = gameNamePatterns.replaceAllIn(game, "") 69 | 70 | val jsonAuthor = JSON.parse(row.getAs("author").toString) 71 | val authorArray = jsonAuthor.toString.split(",") 72 | // 用户id 73 | val userId = authorArray(4) 74 | // 游玩时长 75 | val hours = authorArray(5) 76 | 77 | // 评论 78 | var review = row.getAs("review").toString 79 | val reviewPatterns = "[^\\u4e00-\\u9fa5a-zA-Z0-9 ]".r 80 | review = reviewPatterns.replaceAllIn(review, "") 81 | 82 | // 玩家顶点 83 | val playerKey = "user_" + userId 84 | var playerPoint = 0L 85 | if (pointMap.containsKey(playerKey)) { 86 | playerPoint = pointMap.get(playerKey) 87 | // 权重+1 88 | weightMap.put(playerPoint, weightMap.get(playerPoint) + 1) 89 | } else { 90 | // 点id递增 91 | this.synchronized { 92 | key = key + 1 93 | playerPoint = key 94 | } 95 | pointMap.put(playerKey, playerPoint) 96 | // 权重赋予1 97 | weightMap.put(playerPoint, 1) 98 | } 99 | 100 | // 游戏顶点 101 | val gameKey = "game_" + game 102 | var gamePoint = 0L 103 | if (pointMap.containsKey(gameKey)) { 104 | gamePoint = pointMap.get(gameKey) 105 | // 权重+1 106 | weightMap.put(gamePoint, weightMap.get(gamePoint) + 1) 107 | } else { 108 | this.synchronized { 109 | key = key + 1 110 | gamePoint = key 111 | } 112 | pointMap.put(gameKey, gamePoint) 113 | // 权重赋予1 114 | weightMap.put(gamePoint, 1) 115 | } 116 | 117 | edgeMap1.put((playerPoint, gamePoint), review) 118 | edgeMap2.put((playerPoint, gamePoint), hours) 119 | }) 120 | 121 | println("foreach 结束") 122 | 123 | // 点集 124 | var vertexArray = Seq((0L, ("test", "test"))) 125 | // 评论边 126 | var edgeArray1 = Seq(Edge(0L, 0L, "")) 127 | // 时长边 128 | var edgeArray2 = Seq(Edge(0L, 0L, "")) 129 | 130 | // 添加点 131 | val pointSet = pointMap.keySet() 132 | // TODO 遍历迭代map,这个阶段非常耗时,如何改进? 
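    // One possible answer to the TODO above (a sketch only, assuming
    // `import scala.collection.JavaConverters._` is added): build the whole
    // sequence in a single pass instead of appending to an immutable Seq with
    // :+, which copies the sequence on every append and makes this loop O(n^2).
    //
    //   val vertexSeq: Seq[(Long, (String, String))] =
    //     pointMap.asScala.toSeq.map { case (key, id) =>
    //       val Array(types, name) = key.split("_", 2)  // split once, keep "_" inside game names
    //       (id, (types, name))
    //     }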
133 | val point_iter = pointSet.iterator 134 | while (point_iter.hasNext) { 135 | val key = point_iter.next 136 | val name = key.split("_") 137 | vertexArray = vertexArray :+ (pointMap.get(key), (name(0), name(1))) 138 | } 139 | 140 | println("遍历点集结束") 141 | 142 | // 添加评论边 143 | val edgeSet1 = edgeMap1.keySet() 144 | // 遍历迭代map 145 | val edge_iter1 = edgeSet1.iterator 146 | while (edge_iter1.hasNext) { 147 | val key = edge_iter1.next 148 | edgeArray1 = edgeArray1 :+ Edge(key._1, key._2, edgeMap1.get(key)) 149 | } 150 | 151 | println("遍历评论边结束") 152 | 153 | // 添加时长边 154 | val edgeSet2 = edgeMap2.keySet() 155 | // 遍历迭代map 156 | val edge_iter2 = edgeSet2.iterator 157 | while (edge_iter2.hasNext) { 158 | val key = edge_iter2.next 159 | edgeArray2 = edgeArray2 :+ Edge(key._1, key._2, edgeMap2.get(key)) 160 | } 161 | 162 | println("遍历结束") 163 | 164 | // 构造vertexRDD和edgeRDD 165 | val vertexRDD: RDD[(Long, (String, String))] = spark.sparkContext.parallelize(vertexArray) 166 | val edgeRDD1: RDD[Edge[String]] = spark.sparkContext.parallelize(edgeArray1) 167 | val edgeRDD2: RDD[Edge[String]] = spark.sparkContext.parallelize(edgeArray2) 168 | 169 | // 构造图Graph[VD,ED] 170 | var contentGraph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD1) 171 | 172 | println("构造contentGraph结束") 173 | 174 | // 构建子图,过滤评论为空的边 175 | contentGraph = contentGraph.subgraph(epred = e => !e.attr.equals("")) 176 | // 构建子图,过滤游戏权重大于10的 177 | contentGraph = contentGraph.subgraph(vpred = (id, vd) => { 178 | ((vd._1.equals("game") & weightMap.get(id) > 10) | (vd._1.equals("user"))) 179 | }) 180 | 181 | val degreeThreshold = 6 182 | // 度数>degreeThreshold的点集 183 | val contentDegreeArray = contentGraph.degrees.filter(_._2 > degreeThreshold).map(_._1).collect() 184 | 185 | // 保留度数符合规定的点 186 | contentGraph = contentGraph.subgraph(vpred = (id, vd) => { 187 | contentDegreeArray.contains(id) 188 | }) 189 | 190 | // 边的转换操作,去除前端无法识别的字符,如评论表情等 191 | val reviewPatterns = "[^\\u4e00-\\u9fa5a-zA-Z0-9 ]".r 192 | contentGraph.mapEdges(e => e.attr = reviewPatterns.replaceAllIn(e.attr, "")) 193 | 194 | println("处理contentGraph结束") 195 | 196 | // 时长图 197 | var hourGraph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD2) 198 | 199 | println("构造hourGraph结束") 200 | 201 | // TODO 顶点的转换操作,根据用户id寻找用户名称 202 | hourGraph = hourGraph.mapVertices { 203 | case (id, (types, name)) => (types, name) 204 | } 205 | 206 | hourGraph = hourGraph.subgraph(vpred = (id, vd) => { 207 | ((vd._1.equals("game") & weightMap.get(id) > 10) | (vd._1.equals("user"))) 208 | }) 209 | 210 | // 度数>0的点集 211 | val hourDegreeArray = hourGraph.degrees.filter(_._2 > degreeThreshold).map(_._1).collect() 212 | 213 | // 去除孤立的点 214 | hourGraph = hourGraph.subgraph(vpred = (id, vd) => { 215 | hourDegreeArray.contains(id) 216 | }) 217 | 218 | println("处理hourGraph结束") 219 | 220 | // 独立群体检测 221 | hourGraph.connectedComponents 222 | .vertices 223 | .map(_.swap) 224 | .groupByKey() 225 | .map(_._2) 226 | .foreach(println) 227 | 228 | /** 229 | * 将点数据写入MongoDB 230 | * Spark的算子是在executor上执行的,数据也是放在executor上。executor和driver并不在同一个jvm(local[*]除外), 231 | * 所以算子是不能访问在driver上的SparkSession对象 232 | * 如果一定要“在算子里访问SparkSession”,那你只能把数据collect回Driver,然后用Scala 集合的算子去做。这种情况下只能适 233 | * 用于数据量不大(多大取决于你分配给Driver的内存) 234 | */ 235 | hourGraph.vertices.collect.foreach(v => { 236 | 237 | val id = v._1.toString 238 | val name = v._2.toString 239 | 240 | writeVerticesToMongodb(spark, id, name) 241 | }) 242 | 243 | 244 | // 输出到文件 245 | val outputPath = "src/main/resources/" 246 | val pw1 = new 
PrintWriter(outputPath + "steam/hours_6_30W.gexf") 247 | pw1.write(hoursToGexf(hourGraph)) 248 | pw1.close() 249 | 250 | val pw2 = new PrintWriter(outputPath + "steam/steam_6_30W.gexf") 251 | pw2.write(gameToGexf(contentGraph)) 252 | pw2.close() 253 | 254 | spark.close() 255 | } 256 | 257 | /** 258 | * 点数据写入MongoDB 259 | */ 260 | def writeVerticesToMongodb(spark: SparkSession, id: String, name: String) = { 261 | 262 | val document = new Document() 263 | document.append("verticeId", id).append("name", name) 264 | 265 | val seq = Seq(document) 266 | val df = spark.sparkContext.parallelize(seq) 267 | 268 | // 将数据写入mongo 269 | MongoSpark.save(df) 270 | } 271 | 272 | /** 273 | * 边据写入MongoDB 274 | */ 275 | def writeEdgesToMongodb(spark: SparkSession, srcId: String, dstId: String, attr: String) = { 276 | 277 | val document = new Document() 278 | document.append("srcId", srcId).append("dstId", dstId).append("attr", attr) 279 | 280 | val seq = Seq(document) 281 | val df = spark.sparkContext.parallelize(seq) 282 | 283 | // 将数据写入mongo 284 | MongoSpark.save(df) 285 | } 286 | 287 | /** 288 | * 用户-游戏图,相比底下的图需要指定x,y的坐标 289 | * 290 | * @param graph 291 | * @tparam VD 292 | * @tparam ED 293 | * @return 294 | */ 295 | def gameToGexf[VD, ED](graph: Graph[VD, ED]) = { 296 | 297 | "\n" + 298 | "\n" + 299 | "\n" + 300 | "\n" + 301 | "\n" + 302 | "\n" + 303 | "\n " + 304 | graph.vertices.map(v => { 305 | // 根据类别填充颜色和attvalue 306 | val types = v._2.toString.split(",")(0).replace("(", "") 307 | val name = v._2.toString.split(",")(1).replace(")", "") 308 | var color = "" 309 | var attvalue = 0 310 | if (types.equals("user")) { 311 | color = "r=\"236\" g=\"81\" b=\"72\"" 312 | attvalue = 1 313 | } else { 314 | color = "r=\"236\" g=\"181\" b=\"72\"" 315 | attvalue = 0 316 | } 317 | "\n" + 318 | "\n" + 319 | "\n" + 320 | "\n" + 321 | "\n" + 322 | // (x, y) 坐标 323 | "\n" + 324 | "\n" + 325 | "\n" 326 | }).collect().mkString + 327 | "\n " + 328 | "\n" + 329 | graph.edges.map(e => { 330 | "\n" 331 | }).collect().mkString + 332 | "\n" + 333 | "\n" + 334 | "" 335 | } 336 | 337 | /** 338 | * 时间输出为指定gexf格式 339 | * 340 | * @param graph :图 341 | * @tparam VD 342 | * @tparam ED 343 | * @return 344 | */ 345 | def hoursToGexf[VD, ED](graph: Graph[VD, ED]) = { 346 | 347 | "\n" + 348 | "\n" + 349 | "\n" + 350 | "\n" + 351 | "\n" + 352 | "\n" + 353 | "\n " + 354 | graph.vertices.map(v => { 355 | // 根据类别填充颜色和attvalue 356 | val types = v._2.toString.split(",")(0).replace("(", "") 357 | val name = v._2.toString.split(",")(1).replace(")", "") 358 | var color = "" 359 | var attvalue = 0 360 | if (types.equals("user")) { 361 | color = "r=\"236\" g=\"81\" b=\"72\"" 362 | attvalue = 1 363 | } else { 364 | color = "r=\"236\" g=\"181\" b=\"72\"" 365 | attvalue = 0 366 | } 367 | "\n" + 368 | "\n" + 369 | "\n" + 370 | "\n" + 371 | "\n" + 372 | "\n" + 373 | "\n" 374 | }).collect().mkString + 375 | "\n " + 376 | "\n" + 377 | graph.edges.map(e => { 378 | "\n" 379 | }).collect().mkString + 380 | "\n" + 381 | "\n" + 382 | "" 383 | } 384 | } 385 | -------------------------------------------------------------------------------- /spark-graphx/src/main/scala/cn/edu/nju/GraphProcessTest.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import java.io.PrintWriter 4 | import java.util 5 | 6 | import com.mongodb.spark.MongoSpark 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.graphx.{Edge, Graph} 9 | import org.apache.spark.rdd.RDD 10 | import 
org.apache.spark.sql.{DataFrame, SparkSession} 11 | import org.bson.Document 12 | 13 | import scala.util.Random 14 | 15 | /** 16 | * Created by thpffcj on 2019/11/2. 17 | */ 18 | object GraphProcessTest { 19 | 20 | val pointMap = new util.HashMap[String, Long]() 21 | // 评论边 22 | val edgeMap1 = new util.HashMap[(Long, Long), String]() 23 | // 时长边 24 | val edgeMap2 = new util.HashMap[(Long, Long), String]() 25 | // 点权重map,根据id得到权重 26 | val weightMap = new util.HashMap[Long, Int]() 27 | // 点权重map,根据id得到权重 28 | val topGameSet = new util.HashSet[Long]() 29 | 30 | def main(args: Array[String]): Unit = { 31 | 32 | val conf = new SparkConf().setMaster("local[4]").setAppName("GraphProcess") 33 | conf.set("spark.mongodb.input.uri", "mongodb://localhost:27017/test.China.reviews") 34 | conf.set("spark.mongodb.input.partitioner", "MongoPaginateBySizePartitioner") 35 | 36 | val spark = SparkSession.builder().config(conf).getOrCreate() 37 | 38 | val frame: DataFrame = MongoSpark.load(spark) 39 | 40 | var key = 0L 41 | 42 | frame.foreach(row => { 43 | 44 | /** 45 | * 用户信息 46 | * 过滤非法输入符号 47 | */ 48 | val jsonPlayer = row.getAs("user").toString.split(",") 49 | var player = "" 50 | if (jsonPlayer.length > 2) { 51 | player = jsonPlayer(jsonPlayer.length - 1) 52 | player = player.substring(0, player.length - 1) 53 | } else if (jsonPlayer(0).contains("帐户内")) { 54 | player = jsonPlayer(1) 55 | player = player.substring(0, player.length - 1) 56 | } else { 57 | player = jsonPlayer(0) 58 | player = player.substring(1, player.length) 59 | } 60 | 61 | // 过滤前端无法识别非法字符,比如表情等 62 | val namePatterns1 = "[`~!@#$%^&*()+=|{}':;',\\[\\]<>/?~!@#� \uE009\uF8F5¥%……& amp;*()——+|{}【】‘;:”“’。,、?]".r 63 | val namePatterns2 = "[^\\u4e00-\\u9fa5a-zA-Z0-9]".r 64 | player = namePatterns1.replaceAllIn(player, "") 65 | player = namePatterns2.replaceAllIn(player, "") 66 | if (player.length == 0) { 67 | player = "anonymous" 68 | } 69 | 70 | // 过滤用户名过滤后为空的数据 71 | if (!player.equals("anonymous")) { 72 | // 游戏信息 73 | val jsonGame = row.getAs("game").toString.split(",") 74 | var game = jsonGame(0).substring(1) 75 | game = namePatterns1.replaceAllIn(game, "") 76 | 77 | // 评论 78 | var content = row.getAs("content").toString.replace("
", "") 79 | val contentPatterns = "[^\\u4e00-\\u9fa5a-zA-Z0-9 ]".r 80 | content = contentPatterns.replaceAllIn(content, "") 81 | 82 | // 游玩时长 83 | val patterns = "[\\u4e00-\\u9fa5]".r // 匹配汉字 84 | val hours = patterns.replaceAllIn(row.getAs("hours").toString, "") 85 | 86 | // 玩家顶点 87 | val playerKey = "user_" + player 88 | var playerPoint = 0L 89 | if (pointMap.containsKey(playerKey)) { 90 | playerPoint = pointMap.get(playerKey) 91 | // 权重+1 92 | weightMap.put(playerPoint, weightMap.get(playerPoint) + 1) 93 | } else { 94 | key = key + 1 95 | playerPoint = key 96 | pointMap.put(playerKey, playerPoint) 97 | // 权重赋予1 98 | weightMap.put(playerPoint, 1) 99 | } 100 | 101 | // 游戏顶点 102 | val gameKey = "game_" + game 103 | var gamePoint = 0L 104 | if (pointMap.containsKey(gameKey)) { 105 | gamePoint = pointMap.get(gameKey) 106 | // 权重+1 107 | weightMap.put(gamePoint, weightMap.get(gamePoint) + 1) 108 | } else { 109 | key = key + 1 110 | gamePoint = key 111 | pointMap.put(gameKey, gamePoint) 112 | // 权重赋予1 113 | weightMap.put(gamePoint, 1) 114 | } 115 | 116 | edgeMap1.put((playerPoint, gamePoint), content) 117 | edgeMap2.put((playerPoint, gamePoint), hours) 118 | } 119 | 120 | // KurokaneSS CODE VEIN 带妹子也就图一乐,打架还得靠云哥 121 | // println(player + " " + game + " " + content) 122 | }) 123 | 124 | // 点集 125 | var vertexArray = Seq((0L, ("test", "test"))) 126 | // 评论边 127 | var edgeArray1 = Seq(Edge(0L, 0L, "")) 128 | // 时长边 129 | var edgeArray2 = Seq(Edge(0L, 0L, "")) 130 | 131 | // 添加点 132 | val pointSet = pointMap.keySet() 133 | // 遍历迭代map 134 | val point_iter = pointSet.iterator 135 | while (point_iter.hasNext) { 136 | val key = point_iter.next 137 | // println(key) 138 | vertexArray = vertexArray :+ (pointMap.get(key), (key.split("_")(0), key.split("_")(1))) 139 | } 140 | 141 | // 添加边 142 | val edgeSet1 = edgeMap1.keySet() 143 | // 遍历迭代map 144 | val edge_iter1 = edgeSet1.iterator 145 | while (edge_iter1.hasNext) { 146 | val key = edge_iter1.next 147 | edgeArray1 = edgeArray1 :+ Edge(key._1, key._2, edgeMap1.get(key)) 148 | } 149 | 150 | // 添加边 151 | val edgeSet2 = edgeMap2.keySet() 152 | // 遍历迭代map 153 | val edge_iter2 = edgeSet2.iterator 154 | while (edge_iter2.hasNext) { 155 | val key = edge_iter2.next 156 | edgeArray2 = edgeArray2 :+ Edge(key._1, key._2, edgeMap2.get(key)) 157 | } 158 | 159 | // 构造vertexRDD和edgeRDD 160 | val vertexRDD: RDD[(Long, (String, String))] = spark.sparkContext.parallelize(vertexArray) 161 | val edgeRDD1: RDD[Edge[String]] = spark.sparkContext.parallelize(edgeArray1) 162 | val edgeRDD2: RDD[Edge[String]] = spark.sparkContext.parallelize(edgeArray2) 163 | 164 | // 构造图Graph[VD,ED] 165 | var contentGraph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD1) 166 | // 构建子图,过滤评论为空的边 167 | contentGraph = contentGraph.subgraph(epred = e => !e.attr.equals("")) 168 | // 构建子图,过滤游戏权重大于15的 169 | contentGraph = contentGraph.subgraph(vpred = (id, vd) => { 170 | ((vd._1.equals("game") & weightMap.get(id) > 15) | (vd._1.equals("user"))) 171 | }) 172 | 173 | contentGraph.vertices.foreach(v => { 174 | if (v._2._1.equals("game")) { 175 | topGameSet.add(v._1) 176 | } 177 | }) 178 | 179 | 180 | 181 | // 经过过滤后有些顶点是没有边,所以采用leftOuterJoin将这部分顶点去除 182 | // val vertices = contentGraph.vertices.leftOuterJoin(vertex).map(x => (x._1, x._2._2.getOrElse(""))) 183 | // val newGraph: Graph[(String, String), String] = Graph(vertices, edge) 184 | 185 | 186 | val hourGraph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD2) 187 | 188 | contentGraph.vertices.foreach(println(_)) 189 | // 
println(hourGraph.toString) 190 | 191 | // 输出到文件 192 | val outputPath = "src/main/resources/" 193 | // val pw1 = new PrintWriter(outputPath + "hours.xml") 194 | // pw1.write(hoursToGexf(hourGraph)) 195 | // pw1.close() 196 | 197 | val pw2 = new PrintWriter(outputPath + "steam.gexf") 198 | pw2.write(gameToGexf(contentGraph)) 199 | pw2.close() 200 | 201 | spark.close() 202 | } 203 | 204 | /** 205 | * 数据写入MongoDB 206 | */ 207 | def writeToMongodb() = { 208 | 209 | val spark = SparkSession.builder() 210 | .master("local") 211 | .appName("MongoDBProcess") 212 | .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/test.China.graph") 213 | .getOrCreate() 214 | 215 | // 设置log级别 216 | spark.sparkContext.setLogLevel("WARN") 217 | 218 | val document1 = new Document() 219 | document1.append("name", "sunshangxiang").append("age", 18).append("sex", "female") 220 | 221 | val seq = Seq(document1) 222 | val df = spark.sparkContext.parallelize(seq) 223 | 224 | // 将数据写入mongo 225 | MongoSpark.save(df) 226 | 227 | spark.stop() 228 | } 229 | 230 | /** 231 | * 用户-游戏图,相比底下的图需要指定x,y的坐标 232 | * @param graph 233 | * @tparam VD 234 | * @tparam ED 235 | * @return 236 | */ 237 | def gameToGexf[VD, ED](graph: Graph[VD, ED]) = { 238 | 239 | "\n" + 240 | "\n" + 241 | "\n" + 242 | "\n" + 243 | "\n" + 244 | "\n" + 245 | "\n " + 246 | graph.vertices.map(v => { 247 | // 根据类别填充颜色和attvalue 248 | val types = v._2.toString.split(",")(0).replace("(", "") 249 | val name = v._2.toString.split(",")(1).replace(")", "") 250 | var color = "" 251 | var attvalue = 0 252 | if (types.equals("user")) { 253 | color = "r=\"236\" g=\"81\" b=\"72\"" 254 | attvalue = 1 255 | } else { 256 | color = "r=\"236\" g=\"181\" b=\"72\"" 257 | attvalue = 0 258 | } 259 | "\n" + 260 | "\n" + 261 | "\n" + 262 | "\n" + 263 | "\n" + 264 | // (x, y) 坐标 265 | "\n" + 266 | "\n" + 267 | "\n" 268 | }).collect().mkString + 269 | "\n " + 270 | "\n" + 271 | graph.edges.map(e => { 272 | "\n" 273 | }).collect().mkString + 274 | "\n" + 275 | "\n" + 276 | "" 277 | } 278 | 279 | /** 280 | * 输出为指定gexf格式 281 | * 282 | * @param graph :图 283 | * @tparam VD 284 | * @tparam ED 285 | * @return 286 | */ 287 | def hoursToGexf[VD, ED](graph: Graph[VD, ED]) = { 288 | 289 | "\n" + 290 | "\n" + 291 | "\n" + 292 | "\n" + 293 | "\n" + 294 | "\n" + 295 | "\n " + 296 | graph.vertices.map(v => { 297 | // 根据类别填充颜色和attvalue 298 | val types = v._2.toString.split(",")(0).replace("(", "") 299 | val name = v._2.toString.split(",")(1).replace(")", "") 300 | var color = "" 301 | var attvalue = 0 302 | if (types.equals("user")) { 303 | color = "r=\"236\" g=\"81\" b=\"72\"" 304 | attvalue = 1 305 | } else { 306 | color = "r=\"236\" g=\"181\" b=\"72\"" 307 | attvalue = 0 308 | } 309 | "\n" + 310 | "\n" + 311 | "\n" + 312 | "\n" + 313 | "\n" + 314 | "\n" + 315 | "\n" 316 | }).collect().mkString + 317 | "\n " + 318 | "\n" + 319 | graph.edges.map(e => { 320 | "\n" 321 | }).collect().mkString + 322 | "\n" + 323 | "\n" + 324 | "" 325 | } 326 | } 327 | -------------------------------------------------------------------------------- /spark-graphx/src/main/scala/cn/edu/nju/MongoDBProcess.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import com.mongodb.spark.config.{ReadConfig, WriteConfig} 4 | import com.mongodb.spark.sql._ 5 | import org.apache.spark.sql.SparkSession 6 | 7 | /** 8 | * Created by thpffcj on 2019/10/31. 
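 * Reads the df collection from two MongoDB instances (URIs taken from the
 * command-line arguments) and writes each DataFrame back to its source
 * database in overwrite mode.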
9 | */ 10 | object MongoDBProcess { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | val MongoUri1 = args(0).toString 15 | val MongoUri2 = args(1).toString 16 | val SparkMasterUri = args(2).toString 17 | 18 | def makeMongoURI(uri: String, database: String, collection: String) = (s"${uri}/${database}.${collection}") 19 | 20 | val mongoURI1 = s"mongodb://${MongoUri1}:27017" 21 | val mongoURI2 = s"mongodb://${MongoUri2}:27017" 22 | 23 | val CONFdb1 = makeMongoURI(s"${mongoURI1}", "MyColletion1", "df") 24 | val CONFdb2 = makeMongoURI(s"${mongoURI2}", "MyColletion2", "df") 25 | 26 | val WRITEdb1: WriteConfig = WriteConfig(scala.collection.immutable.Map("uri" -> CONFdb1)) 27 | val READdb1: ReadConfig = ReadConfig(Map("uri" -> CONFdb1)) 28 | 29 | val WRITEdb2: WriteConfig = WriteConfig(scala.collection.immutable.Map("uri" -> CONFdb2)) 30 | val READdb2: ReadConfig = ReadConfig(Map("uri" -> CONFdb2)) 31 | 32 | val spark = SparkSession 33 | .builder 34 | .appName("AppMongo") 35 | .config("spark.worker.cleanup.enabled", "true") 36 | .config("spark.scheduler.mode", "FAIR") 37 | .getOrCreate() 38 | 39 | val df1 = spark.read.mongo(READdb1) 40 | val df2 = spark.read.mongo(READdb2) 41 | df1.write.mode("overwrite").mongo(WRITEdb1) 42 | df2.write.mode("overwrite").mongo(WRITEdb2) 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /spark-mllib/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | cn.edu.nju 7 | mf1932063 8 | 1.0-SNAPSHOT 9 | 10 | 11 | 2.11.8 12 | 2.4.0 13 | 1.8 14 | 1.8 15 | 16 | 17 | 18 | 19 | 20 | cloudera 21 | https://repository.cloudera.com/artifactory/cloudera-repos 22 | 23 | 24 | 25 | 26 | 27 | 28 | org.scala-lang 29 | scala-library 30 | ${scala.version} 31 | 32 | 33 | 34 | org.apache.spark 35 | spark-mllib_2.11 36 | ${spark.version} 37 | 38 | 39 | 40 | com.fasterxml.jackson.module 41 | jackson-module-scala_2.11 42 | 2.6.5 43 | 44 | 45 | 46 | io.netty 47 | netty-all 48 | 4.1.42.Final 49 | 50 | 51 | 52 | 53 | 54 | 58 | 59 | 60 | org.scala-tools 61 | maven-scala-plugin 62 | 63 | 64 | 65 | compile 66 | testCompile 67 | 68 | 69 | 70 | 71 | ${scala.version} 72 | 73 | -target:jvm-1.8 74 | 75 | 76 | 77 | 78 | org.apache.maven.plugins 79 | maven-eclipse-plugin 80 | 81 | true 82 | 83 | ch.epfl.lamp.sdt.core.scalabuilder 84 | 85 | 86 | ch.epfl.lamp.sdt.core.scalanature 87 | 88 | 89 | org.eclipse.jdt.launching.JRE_CONTAINER 90 | ch.epfl.lamp.sdt.launching.SCALA_CONTAINER 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | org.scala-tools 100 | maven-scala-plugin 101 | 102 | ${scala.version} 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /spark-mllib/src/main/resources/game_content.txt: -------------------------------------------------------------------------------- 1 | Best hunting game ever created. 2 | The best game in Assassin's creed franchise in my opinion. The story is so good! 3 | very shooty much nice 4 | It's a good game. However Horde mode slightly lets me down. But the shooting is still very nice ! 5 | Really fun game with plenty to do about, it would be even better if there would be more modding for the game. 6 | It is quite a good game. Add mods, and it becomes better. 7 | best game made me want to break things but in a good way 8 | Play this with your dates and get your knob wet! Or play this with your friends and get yelled at 9 | Best Tom Clancy game I've ever played. 
I never get tired of playing this game. 10 | Classic style and very challenging, but tons of fun. I can't even beat the entire game. I would still highly recommend this game to anyone. 11 | Not worth it 12 | Bad data handling policies at Paradox prevent me from recommending any of their games. CK2 in particular is an especially egregious example. The original purchase agreement was altered from opt out data collection to forced data collection with no compensation given. Changing the base game to free did not help at all since those who bought it already paid for the privilege of having their data stolen all the time and local data sometimes deleted despite settings preventing updates. 13 | I really wanted to like this game. Unfortunately, I found it frenetic, and the opening levels didn't give the sense of achievement that helps you pursue success at more difficult levels. Having to go around counters which are in your way right from the beginning, is very frustrating. No chef in their right mind would set up such a crappy working layout, and it drove me batty. 14 | Don't play this game. 15 | I only recommend for hardcore AS fans just for story purposes not like the story is very good to begin with. Although it is set in one of my favourite historic time periods, The French Revolution, it doesn't make me feel very invested because the story just wasn't interesting to me. Game does and sill has bad bugs even after it's disastrous release back when it first released. 16 | I downloaded the Trial version a few months back as it was free. Prior to downloading, I checked that my computer and GPU was adequate enough to run it (an 8gb i7-4770 with a 2gb GTX 750ti was above it's minimum specs) and spent an hour downloading it. Upon playing it however, it was choppy and slow throughout and regardless of whatever low-medium range settings I used to try to remedy these issues (I certainly wasn't expecting high settings at 1080p given my setup anyway) it was still unplayable and was uninstalled around 7 minutes later. So unless the performance issues are patched, I'd best steer clear until these are rectified. 17 | I really wanted to like this game. Unfortunately, I found it frenetic, and the opening levels didn't give the sense of achievement that helps you pursue success at more difficult levels. Having to go around counters which are in your way right from the beginning, is very frustrating. No chef in their right mind would set up such a crappy working layout, and it drove me batty. :( 18 | Terrible boring game, saving the game doesn't work, you always spawn at a checkpoint. Same enemies over and over again. Your basically stuck in a cave all the damn time. Weak story. 19 | horrible ai dont lots of bugs still eq dosent have enough power frame rate suck also waste yur money like i did like on 1/2 steam games i have 20 | The mechanics make no sense. 
Strategy like this should be turn-based or else its just a race of who clicks faster 21 | -------------------------------------------------------------------------------- /spark-mllib/src/main/resources/recommend_validate: -------------------------------------------------------------------------------- 1 | userId,gameId,gameName,rating,random 2 | 1,22,Dota 2,8.0,0.1 3 | 1,40,Counter-Strike Global Offensive,3.0,0.2 4 | 1,5,Left 4 Dead 2,5.0,0.3 5 | 1,10,Team Fortress 2,6.0,0.4 6 | 1,29,Sid Meier's Civilization V,7.0,0.5 7 | 1,8,Poly Bridge,9.0,0.6 8 | 1,875,Assassin's Creed IV,9.0,0.7 9 | 1,412,Cities Skylines,8.0,0.8 10 | 1,2,Fallout 4,9.0,0.9 11 | 1,6,HuniePop,3.0,1.0 -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/cn/edu/nju/DataProcessing.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import java.util 4 | 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.sql.{SaveMode, SparkSession} 7 | 8 | import scala.util.Random 9 | 10 | /** 11 | * Created by thpffcj on 2019/11/18. 12 | */ 13 | object DataProcessing { 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | getStreamRating() 18 | } 19 | 20 | def getStreamRating() = { 21 | 22 | val gameMap = new util.HashMap[String, Int]() 23 | // 游戏出现次数 24 | val gameNumber = new util.HashMap[String, Int]() 25 | val maxTimeMap = new util.HashMap[String, Double] 26 | 27 | val conf = new SparkConf().setMaster("local").setAppName("DataProcessing") 28 | val spark = SparkSession.builder().config(conf).getOrCreate() 29 | 30 | var data = spark.read.format("csv") 31 | .option("header", "true") 32 | .option("inferSchema", "true") 33 | .load("src/main/resources/steam.csv") 34 | .select("userId", "gameName", "behavior", "duration", "gameId") 35 | 36 | data = data.filter(row => row.getAs("behavior").equals("play")) 37 | 38 | var key = 1 39 | data.collect().foreach(row => { 40 | 41 | val gameName = row.getAs("gameName").toString 42 | val duration = row.getAs("duration").toString.toDouble 43 | 44 | if (!gameMap.containsKey(gameName)) { 45 | gameMap.put(gameName, key) 46 | key = key + 1 47 | } 48 | 49 | if (gameNumber.containsKey(gameName)) { 50 | gameNumber.put(gameName, gameNumber.get(gameName) + 1) 51 | } else { 52 | gameNumber.put(gameName, 1) 53 | } 54 | 55 | if (maxTimeMap.containsKey(gameName)) { 56 | if (duration > maxTimeMap.get(gameName)) { 57 | maxTimeMap.put(gameName, duration) 58 | } 59 | } else { 60 | maxTimeMap.put(gameName, duration) 61 | } 62 | 63 | }) 64 | 65 | import spark.implicits._ 66 | val rand = new Random() 67 | val cleanData = data.filter(row => { 68 | gameNumber.get(row.getAs("gameName").toString) > 2 69 | }).map(row => { 70 | 71 | val userId = row.getAs("userId").toString 72 | val gameName = row.getAs("gameName").toString 73 | var duration = (row.getAs("duration").toString.toDouble / maxTimeMap.get(gameName) * 10).formatted("%.2f") 74 | if (duration.toDouble < 1.0) { 75 | duration = "1.0" 76 | } 77 | val gameId = gameMap.get(gameName) 78 | val random = rand.nextDouble() 79 | 80 | (userId, gameId, gameName, duration, random) 81 | }) 82 | 83 | cleanData.repartition(1).write.format("com.databricks.spark.csv") 84 | .option("header", "false") 85 | .option("delimiter", ",") 86 | .mode(SaveMode.Overwrite) 87 | .save("src/main/resources/steam_rating.csv") 88 | 89 | spark.stop() 90 | } 91 | } 92 | -------------------------------------------------------------------------------- 
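getStreamRating above turns raw play time into a 1.0–10.0 rating by scaling each record against the longest session recorded for that game and flooring the result at 1.0. A minimal standalone sketch of that normalization, kept separate from the project code; the object and method names are illustrative only:

object RatingUtil {

  // Scale a play duration to a 1.0–10.0 rating against the game's maximum observed duration,
  // rounded to two decimals, mirroring the formatted("%.2f") / floor-at-1.0 logic above.
  def toRating(duration: Double, maxDuration: Double): Double = {
    val scaled = duration / maxDuration * 10
    val rounded = math.round(scaled * 100) / 100.0
    math.max(rounded, 1.0)
  }
}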
/spark-mllib/src/main/scala/cn/edu/nju/EmotionAnalysis.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.ml.classification.{NaiveBayes, NaiveBayesModel} 5 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 6 | import org.apache.spark.ml.feature.{HashingTF, IDF, IDFModel} 7 | import org.apache.spark.sql.SparkSession 8 | 9 | import scala.util.Random 10 | 11 | /** 12 | * Created by thpffcj on 2019/11/20. 13 | */ 14 | 15 | object EmotionAnalysis { 16 | 17 | def main(args: Array[String]): Unit = { 18 | test() 19 | } 20 | 21 | /** 22 | * (31806,32974,64780) 23 | * accuracy is 0.6932404540763674 24 | */ 25 | def train() = { 26 | 27 | val conf = new SparkConf().setMaster("local").setAppName("EmotionAnalysis") 28 | val spark = SparkSession.builder().config(conf).getOrCreate() 29 | // 日志级别 30 | spark.sparkContext.setLogLevel("WARN") 31 | 32 | val rand = new Random() 33 | 34 | import spark.implicits._ 35 | // 数据预处理 36 | val neg = spark.read.textFile("src/main/resources/neg.txt").map(line => { 37 | // 分词 38 | (line.split(" ").filter(!_.equals(" ")), 0, rand.nextDouble()) 39 | }).toDF("words", "value", "random") 40 | 41 | val pos = spark.read.textFile("src/main/resources/pos.txt").map(line => { 42 | (line.split(" ").filter(!_.equals(" ")), 1, rand.nextDouble()) 43 | }).toDF("words", "value", "random") // 思考:这里把inner function提出重用来如何操作 44 | 45 | // 合并乱序 46 | val data = neg.union(pos).sort("random") 47 | println(neg.count(), pos.count(), data.count()) // 合并 48 | 49 | // 文本特征抽取(TF-IDF) 50 | val hashingTf = new HashingTF() 51 | .setInputCol("words") 52 | .setOutputCol("hashing") 53 | .transform(data) 54 | 55 | val idfModel = new IDF() 56 | .setInputCol("hashing") 57 | .setOutputCol("tfidf") 58 | .fit(hashingTf) 59 | 60 | val transformedData = idfModel.transform(hashingTf) 61 | val Array(training, test) = transformedData 62 | .randomSplit(Array(0.7, 0.3)) 63 | 64 | // 根据抽取到的文本特征,使用分类器进行分类,这是一个二分类问题 65 | // 分类器是可替换的 66 | val bayes = new NaiveBayes() 67 | .setFeaturesCol("tfidf") // X 68 | .setLabelCol("value") // y 0:消极,1:积极 69 | .fit(training) 70 | 71 | // 交叉验证 72 | val result = bayes.transform(test) 73 | // result.show(false) 74 | 75 | // 评估模型的准确率 76 | val evaluator = new MulticlassClassificationEvaluator() 77 | .setLabelCol("value") 78 | .setPredictionCol("prediction") 79 | .setMetricName("accuracy") 80 | 81 | val accuracy = evaluator.evaluate(result) 82 | println(s"""accuracy is $accuracy""") 83 | 84 | // idfModel.save("src/main/resources/model/IDFModel.model") 85 | // bayes.save("src/main/resources/model/content_emotion.model") 86 | 87 | // 重构思考: 88 | // 尝试用pipeline重构代码 89 | // 尝试用模型预测随便属于一句话的情感,例如: 90 | // You are a bad girl,I hate you. 
^_^ 91 | 92 | spark.stop() 93 | } 94 | 95 | def test() = { 96 | 97 | val conf = new SparkConf().setMaster("local").setAppName("EmotionAnalysis") 98 | val spark = SparkSession.builder().config(conf).getOrCreate() 99 | 100 | import spark.implicits._ 101 | val content = spark.read.textFile("src/main/resources/game_content.txt").map(line => { 102 | (line.split(" ").filter(!_.equals(" "))) 103 | }).toDF("words") 104 | 105 | // 文本特征抽取(TF-IDF) 106 | val hashingTf = new HashingTF() 107 | .setInputCol("words") 108 | .setOutputCol("hashing") 109 | .transform(content) 110 | 111 | val idfModel = IDFModel.load("src/main/resources/model/IDFModel.model") 112 | 113 | val transformedData = idfModel.transform(hashingTf) 114 | 115 | val bayes = NaiveBayesModel.load("src/main/resources/model/content_emotion.model") 116 | 117 | val result = bayes.transform(transformedData) 118 | result.show() 119 | 120 | spark.stop() 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/cn/edu/nju/SteamGameRecommendation.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.ml.evaluation.RegressionEvaluator 5 | import org.apache.spark.ml.recommendation.{ALS, ALSModel} 6 | import org.apache.spark.sql.SparkSession 7 | 8 | /** 9 | * Created by thpffcj on 2019/11/16. 10 | */ 11 | object SteamGameRecommendation { 12 | 13 | def main(args: Array[String]): Unit = { 14 | 15 | test() 16 | } 17 | 18 | def train() = { 19 | 20 | val conf = new SparkConf().setMaster("local[4]").setAppName("SteamGameRecommendation") 21 | val spark = SparkSession.builder().config(conf).getOrCreate() 22 | spark.sparkContext.setLogLevel("WARN") 23 | 24 | val data = spark.read.format("csv") 25 | .option("header", "true") 26 | .option("inferSchema", "true") 27 | .load("src/main/resources/steam_rating.csv") 28 | .select("userId", "gameId", "gameName", "rating", "random") 29 | .sort("random") 30 | .select("userId", "gameId", "rating") 31 | 32 | val Array(train, test) = data.randomSplit(Array(0.7, 0.3)) 33 | 34 | val als = new ALS() 35 | .setMaxIter(20) 36 | .setUserCol("userId") 37 | .setItemCol("gameId") 38 | .setRatingCol("rating") 39 | // 正则化参数 40 | .setRegParam(0.01) 41 | 42 | val model = als.fit(train) 43 | 44 | // 冷启动策略 45 | model.setColdStartStrategy("drop") 46 | 47 | val predictions = model.transform(test) 48 | // 根据(userId, gameId)预测rating 49 | predictions.show(false) 50 | 51 | // 模型评估 52 | val evaluator = new RegressionEvaluator() 53 | .setMetricName("rmse") 54 | .setLabelCol("rating") 55 | .setPredictionCol("prediction") 56 | 57 | val rmse = evaluator.evaluate(predictions) 58 | println(s"Root-mean-square error is $rmse \n") 59 | 60 | // Spark机器学习模型的持久化 61 | // 模型保存 62 | model.save("src/main/resources/model/game_recommendation.model") 63 | 64 | spark.stop() 65 | } 66 | 67 | def test() = { 68 | 69 | val conf = new SparkConf().setMaster("local").setAppName("SteamGameRecommendation") 70 | val spark = SparkSession.builder().config(conf).getOrCreate() 71 | spark.sparkContext.setLogLevel("WARN") 72 | 73 | // 模型加载 74 | val model = ALSModel.load("src/main/resources/model/game_recommendation.model") 75 | 76 | import spark.implicits._ 77 | val users = spark.createDataset(Array(1)).toDF("userId") 78 | users.show(false) 79 | 80 | model.recommendForUserSubset(users, 20).show(false) 81 | 82 | spark.stop() 83 | } 84 | } 85 | 
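EmotionAnalysis.scala above ends with a note suggesting the HashingTF / IDF / NaiveBayes steps be rebuilt with a spark.ml Pipeline. A sketch of that refactor, assuming the training DataFrame keeps the same words and value columns as in train(); the function name and save path are illustrative, not the project's actual code:

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.feature.{HashingTF, IDF}
import org.apache.spark.sql.DataFrame

// data: columns "words" (tokenized review) and "value" (0 = negative, 1 = positive)
def trainWithPipeline(data: DataFrame) = {

  val hashingTf = new HashingTF().setInputCol("words").setOutputCol("hashing")
  val idf = new IDF().setInputCol("hashing").setOutputCol("tfidf")
  val bayes = new NaiveBayes().setFeaturesCol("tfidf").setLabelCol("value")

  // A single Pipeline replaces the three hand-wired stages
  val pipeline = new Pipeline().setStages(Array(hashingTf, idf, bayes))

  val Array(training, test) = data.randomSplit(Array(0.7, 0.3))
  val model = pipeline.fit(training)   // fits IDF and NaiveBayes in one call
  val result = model.transform(test)   // adds the prediction column used by the evaluator

  // model.write.overwrite().save("src/main/resources/model/emotion_pipeline.model")
  (model, result)
}

Loading it back with PipelineModel.load would then replace the separate IDFModel.load / NaiveBayesModel.load calls in test().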
-------------------------------------------------------------------------------- /spark-streaming/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 4.0.0 6 | cn.edu.nju 7 | spark-streaming 8 | 1.0 9 | 10 | org.springframework.boot 11 | spring-boot-starter-parent 12 | 2.2.0.RELEASE 13 | 14 | 15 | 16 | 2.11.8 17 | 2.2.0 18 | 2.4.0 19 | 2.6.0-cdh5.16.2 20 | 1.2.0-cdh5.16.2 21 | 1.8 22 | 1.8 23 | 24 | 25 | 26 | 27 | 28 | cloudera 29 | https://repository.cloudera.com/artifactory/cloudera-repos 30 | 31 | 32 | central 33 | aliyun maven 34 | http://maven.aliyun.com/nexus/content/groups/public/ 35 | default 36 | 37 | 38 | 39 | 40 | 41 | org.springframework.boot 42 | spring-boot-starter-web 43 | 44 | 45 | ch.qos.logback 46 | logback-classic 47 | 48 | 49 | 50 | 51 | org.springframework.boot 52 | spring-boot-starter-websocket 53 | 54 | 55 | org.springframework.session 56 | spring-session-core 57 | 58 | 59 | cn.hutool 60 | hutool-log 61 | 4.1.1 62 | 63 | 64 | 65 | org.springframework.boot 66 | spring-boot-starter-test 67 | test 68 | 69 | 70 | org.junit.vintage 71 | junit-vintage-engine 72 | 73 | 74 | 75 | 76 | 77 | org.projectlombok 78 | lombok 79 | 1.16.18 80 | 81 | 82 | 83 | org.scala-lang 84 | scala-library 85 | ${scala.version} 86 | 87 | 88 | 89 | org.mongodb.spark 90 | mongo-spark-connector_2.11 91 | ${spark.version} 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | org.apache.hadoop 104 | hadoop-client 105 | ${hadoop.version} 106 | 107 | 108 | 109 | 110 | org.apache.hbase 111 | hbase-client 112 | ${hbase.version} 113 | 114 | 115 | 116 | org.apache.hbase 117 | hbase-server 118 | ${hbase.version} 119 | 120 | 121 | 122 | 123 | org.apache.spark 124 | spark-streaming_2.11 125 | ${spark.version} 126 | 127 | 128 | 129 | org.apache.spark 130 | spark-graphx_2.11 131 | ${spark.version} 132 | 133 | 134 | 135 | org.apache.spark 136 | spark-streaming-kafka-0-10_2.11 137 | ${spark.version} 138 | 139 | 140 | 141 | org.apache.commons 142 | commons-lang3 143 | 3.5 144 | 145 | 146 | 147 | 148 | org.apache.spark 149 | spark-sql_2.11 150 | ${spark.version} 151 | 152 | 153 | 154 | com.fasterxml.jackson.module 155 | jackson-module-scala_2.11 156 | 2.6.5 157 | 158 | 159 | 160 | org.elasticsearch 161 | elasticsearch-spark-20_2.11 162 | 6.5.4 163 | 164 | 165 | 166 | com.alibaba 167 | fastjson 168 | 1.2.47 169 | 170 | 171 | 172 | com.fasterxml.jackson.core 173 | jackson-databind 174 | 2.9.10.1 175 | 176 | 177 | 178 | net.jpountz.lz4 179 | lz4 180 | 1.3.0 181 | 182 | 183 | 184 | org.codehaus.janino 185 | janino 186 | 3.0.8 187 | 188 | 189 | 190 | mysql 191 | mysql-connector-java 192 | 5.1.38 193 | 194 | 195 | 196 | io.netty 197 | netty-all 198 | 4.1.42.Final 199 | 200 | 201 | 202 | com.mchange 203 | c3p0 204 | 0.9.5.2 205 | 206 | 207 | 208 | 209 | 210 | 214 | 215 | 216 | org.scala-tools 217 | maven-scala-plugin 218 | 219 | 220 | 221 | compile 222 | testCompile 223 | 224 | 225 | 226 | 227 | ${scala.version} 228 | 229 | -target:jvm-1.8 230 | 231 | 232 | 233 | 234 | org.apache.maven.plugins 235 | maven-eclipse-plugin 236 | 237 | true 238 | 239 | ch.epfl.lamp.sdt.core.scalabuilder 240 | 241 | 242 | ch.epfl.lamp.sdt.core.scalanature 243 | 244 | 245 | org.eclipse.jdt.launching.JRE_CONTAINER 246 | ch.epfl.lamp.sdt.launching.SCALA_CONTAINER 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | org.scala-tools 256 | maven-scala-plugin 257 | 258 | ${scala.version} 259 | 260 | 261 | 262 | 263 | 
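The Spring Boot classes that follow expose the push side of the dashboard: DemoMessageController maps /websocket/socket/push/{cid} and hands the message parameter to WebSocketServer.sendInfo. A minimal sketch of how an external job could trigger such a push over HTTP; the host, port and sid are assumptions for illustration, not values taken from the project:

import java.net.{HttpURLConnection, URL, URLEncoder}
import scala.io.Source

// Hypothetical caller: pushes a message to the sockets registered under sid "dashboard"
// on a server assumed to run at localhost:8080.
def pushToWebSocket(message: String): String = {
  val encoded = URLEncoder.encode(message, "UTF-8")
  val url = new URL(s"http://localhost:8080/websocket/socket/push/dashboard?message=$encoded")
  val conn = url.openConnection().asInstanceOf[HttpURLConnection]
  conn.setRequestMethod("GET")
  val body = Source.fromInputStream(conn.getInputStream).mkString   // JSON built by ApiReturnUtil
  conn.disconnect()
  body
}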
-------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/ApiReturnUtil.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import cn.edu.nju.api.ApiReturnObject; 4 | import cn.hutool.log.Log; 5 | import cn.hutool.log.LogFactory; 6 | 7 | public class ApiReturnUtil { 8 | 9 | static Log log = LogFactory.get(WebSocketServer.class); 10 | 11 | public static ApiReturnObject error(String s) { 12 | log.error(s); 13 | return new ApiReturnObject(null); 14 | } 15 | 16 | public static ApiReturnObject success(String cid) { 17 | log.info("success:" + cid); 18 | return new ApiReturnObject(null); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/DemoMessageController.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import cn.edu.nju.api.ApiReturnObject; 4 | import org.springframework.stereotype.Controller; 5 | import org.springframework.web.bind.annotation.GetMapping; 6 | import org.springframework.web.bind.annotation.PathVariable; 7 | import org.springframework.web.bind.annotation.RequestMapping; 8 | import org.springframework.web.bind.annotation.ResponseBody; 9 | 10 | import java.io.IOException; 11 | 12 | @Controller 13 | @RequestMapping("/websocket") 14 | public class DemoMessageController { 15 | 16 | //页面请求 17 | @GetMapping("/") 18 | public String index() { 19 | return "index"; 20 | } 21 | //推送数据接口 22 | @ResponseBody 23 | @RequestMapping("/socket/push/{cid}") 24 | public ApiReturnObject pushToWeb(@PathVariable String cid, String message) { 25 | try { 26 | WebSocketServer.sendInfo(message,cid); 27 | } catch (IOException e) { 28 | e.printStackTrace(); 29 | return ApiReturnUtil.error(cid+"#"+e.getMessage()); 30 | } 31 | return ApiReturnUtil.success(cid); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/SteamserverdemoApplication.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import org.springframework.boot.SpringApplication; 4 | import org.springframework.boot.autoconfigure.SpringBootApplication; 5 | 6 | @SpringBootApplication 7 | public class SteamserverdemoApplication { 8 | 9 | public static void main(String[] args) { 10 | SpringApplication.run(SteamserverdemoApplication.class, args); 11 | } 12 | 13 | } 14 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/Test.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import cn.edu.nju.api.ApiReturnObject; 4 | import cn.edu.nju.api.TagReturnObject; 5 | import cn.edu.nju.utils.DateUtils; 6 | 7 | /** 8 | * Created by thpffcj on 2019/10/24. 
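 * Manual check of MySQLProcess: fetches the time-series data (getTimeFieldData) and the tag data (getTagData)
 * that WebSocketServer pushes to the front end.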
9 | */ 10 | public class Test { 11 | 12 | public static void main(String[] args) { 13 | 14 | MySQLProcess mySQLProcess = new MySQLProcess(); 15 | 16 | ApiReturnObject apiReturnObject = mySQLProcess.getTimeFieldData(DateUtils.getSteamDates()); 17 | 18 | TagReturnObject tagReturnObject = mySQLProcess.getTagData(2); 19 | 20 | System.out.println("hello"); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/TimeFieldObject.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import cn.edu.nju.domain.GameObject; 4 | import org.springframework.beans.factory.annotation.Autowired; 5 | 6 | import java.io.Serializable; 7 | import java.util.ArrayList; 8 | 9 | public class TimeFieldObject implements Serializable{ 10 | 11 | @Autowired 12 | private String name; 13 | 14 | @Autowired 15 | private ArrayList values; 16 | 17 | public TimeFieldObject(String name, ArrayList values) { 18 | this.name = name; 19 | this.values = values; 20 | } 21 | 22 | public String getName() { 23 | return name; 24 | } 25 | 26 | public void setName(String name) { 27 | this.name = name; 28 | } 29 | 30 | public ArrayList getValues() { 31 | return values; 32 | } 33 | 34 | public void setValues(ArrayList values) { 35 | this.values = values; 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/WebSocketConfig.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import org.springframework.context.annotation.Bean; 4 | import org.springframework.context.annotation.Configuration; 5 | import org.springframework.web.socket.server.standard.ServerEndpointExporter; 6 | 7 | @Configuration 8 | public class WebSocketConfig { 9 | 10 | @Bean 11 | public ServerEndpointExporter serverEndpointExporter(){ 12 | return new ServerEndpointExporter(); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/WebSocketServer.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.concurrent.CopyOnWriteArraySet; 6 | 7 | import javax.websocket.*; 8 | import javax.websocket.server.PathParam; 9 | import javax.websocket.server.ServerEndpoint; 10 | 11 | import cn.edu.nju.api.ApiReturnObject; 12 | import cn.edu.nju.api.TagReturnObject; 13 | import cn.edu.nju.domain.GameObject; 14 | import cn.edu.nju.encoder.ApiObjectEncoder; 15 | import cn.edu.nju.utils.DateUtils; 16 | import org.springframework.stereotype.Component; 17 | import cn.hutool.log.Log; 18 | import cn.hutool.log.LogFactory; 19 | 20 | @ServerEndpoint(value = "/websocket/{sid}", encoders = {ApiObjectEncoder.class}) 21 | @Component 22 | public class WebSocketServer { 23 | 24 | static Log log = LogFactory.get(WebSocketServer.class); 25 | 26 | // 静态变量,用来记录当前在线连接数。应该把它设计成线程安全的。 27 | private static int onlineCount = 0; 28 | 29 | // concurrent包的线程安全Set,用来存放每个客户端对应的MyWebSocket对象。 30 | private static CopyOnWriteArraySet webSocketSet = new CopyOnWriteArraySet(); 31 | 32 | //与某个客户端的连接会话,需要通过它来给客户端发送数据 33 | private Session session; 34 | 35 | //接收sid 36 | private String sid = ""; 37 | 38 | /** 39 | * 连接建立成功调用的方法 40 | */ 41 | @OnOpen 42 | public void onOpen(Session session, 
@PathParam("sid") String sid) { 43 | this.session = session; 44 | webSocketSet.add(this); //加入set中 45 | addOnlineCount(); //在线数加1 46 | log.info("有新窗口开始监听:" + sid + ",当前在线人数为" + getOnlineCount()); 47 | this.sid = sid; 48 | // GameObject gameObject1 = new GameObject("edge", "just so so", 2200, "blue"); 49 | // GameObject gameObject2 = new GameObject("fire fox", "good", 900, "green"); 50 | // GameObject gameObject3 = new GameObject("chrome", "excellent", 3800, "red"); 51 | // GameObject gameObject4 = new GameObject("edge", "just so so", 1500, "blue"); 52 | // GameObject gameObject5 = new GameObject("fire fox", "good", 1900, "green"); 53 | // GameObject gameObject6 = new GameObject("chrome", "excellent", 2800, "red"); 54 | // GameObject gameObject7 = new GameObject("edge", "just so so", 2600, "blue"); 55 | // GameObject gameObject8 = new GameObject("fire fox", "good", 2200, "green"); 56 | // GameObject gameObject9 = new GameObject("chrome", "excellent", 1800, "red"); 57 | // ArrayList gameObjects1 = new ArrayList<>(); 58 | // ArrayList gameObjects2 = new ArrayList<>(); 59 | // ArrayList gameObjects3 = new ArrayList<>(); 60 | // gameObjects1.add(gameObject1); 61 | // gameObjects1.add(gameObject2); 62 | // gameObjects1.add(gameObject3); 63 | // gameObjects2.add(gameObject4); 64 | // gameObjects2.add(gameObject5); 65 | // gameObjects2.add(gameObject6); 66 | // gameObjects3.add(gameObject7); 67 | // gameObjects3.add(gameObject8); 68 | // gameObjects3.add(gameObject9); 69 | // TimeFieldObject timeFieldObject1 = new TimeFieldObject("2017", gameObjects1); 70 | // TimeFieldObject timeFieldObject2 = new TimeFieldObject("2018", gameObjects2); 71 | // TimeFieldObject timeFieldObject3 = new TimeFieldObject("2019", gameObjects3); 72 | // ArrayList timeFieldObjects = new ArrayList<>(); 73 | // timeFieldObjects.add(timeFieldObject1); 74 | // timeFieldObjects.add(timeFieldObject2); 75 | // timeFieldObjects.add(timeFieldObject3); 76 | MySQLProcess mySQLProcess = new MySQLProcess(); 77 | ApiReturnObject apiReturnObject = mySQLProcess.getTimeFieldData(DateUtils.getSteamDates()); 78 | try { 79 | sendData(apiReturnObject); 80 | for (int i = 1; i <= 7; i++) { 81 | TagReturnObject tagReturnObject = mySQLProcess.getTagData(i); 82 | sendTagData(tagReturnObject); 83 | Thread.sleep(5000); 84 | } 85 | } catch (IOException | EncodeException | InterruptedException e) { 86 | log.error("websocket IO异常"+e.getMessage()); 87 | } 88 | } 89 | 90 | /** 91 | * 连接关闭调用的方法 92 | */ 93 | @OnClose 94 | public void onClose() { 95 | webSocketSet.remove(this); //从set中删除 96 | subOnlineCount(); //在线数减1 97 | log.info("有一连接关闭!当前在线人数为" + getOnlineCount()); 98 | } 99 | 100 | /** 101 | * 收到客户端消息后调用的方法 102 | * 103 | * @param message 客户端发送过来的消息 104 | */ 105 | @OnMessage 106 | public void onMessage(String message, Session session) { 107 | log.info("收到来自窗口" + sid + "的信息:" + message); 108 | //群发消息 109 | for (WebSocketServer item : webSocketSet) { 110 | try { 111 | item.sendMessage(message); 112 | } catch (IOException e) { 113 | e.printStackTrace(); 114 | } 115 | } 116 | } 117 | 118 | /** 119 | * @param session 120 | * @param error 121 | */ 122 | @OnError 123 | public void onError(Session session, Throwable error) { 124 | log.error("发生错误"); 125 | error.printStackTrace(); 126 | } 127 | 128 | /** 129 | * 实现服务器主动推送 130 | */ 131 | public void sendMessage(String message) throws IOException { 132 | this.session.getBasicRemote().sendText(message); 133 | } 134 | 135 | /** 136 | * 实现服务器主动推送 137 | */ 138 | public void sendData(ApiReturnObject data) throws 
IOException, EncodeException { 139 | this.session.getBasicRemote().sendObject(data); 140 | } 141 | 142 | public void sendTagData(TagReturnObject data) throws IOException, EncodeException { 143 | this.session.getBasicRemote().sendObject(data); 144 | } 145 | 146 | /** 147 | * 群发自定义消息 148 | */ 149 | public static void sendInfo(String message, @PathParam("sid") String sid) throws IOException { 150 | log.info("推送消息到窗口" + sid + ",推送内容:" + message); 151 | for (WebSocketServer item : webSocketSet) { 152 | try { 153 | //这里可以设定只推送给这个sid的,为null则全部推送 154 | if (sid == null) { 155 | item.sendMessage(message); 156 | } else if (item.sid.equals(sid)) { 157 | item.sendMessage(message); 158 | } 159 | } catch (IOException e) { 160 | continue; 161 | } 162 | } 163 | } 164 | 165 | public static synchronized int getOnlineCount() { 166 | return onlineCount; 167 | } 168 | 169 | public static synchronized void addOnlineCount() { 170 | WebSocketServer.onlineCount++; 171 | } 172 | 173 | public static synchronized void subOnlineCount() { 174 | WebSocketServer.onlineCount--; 175 | } 176 | } 177 | 178 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/api/ApiReturnObject.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.api; 2 | 3 | import cn.edu.nju.TimeFieldObject; 4 | 5 | import java.io.Serializable; 6 | import java.util.ArrayList; 7 | 8 | public class ApiReturnObject implements Serializable { 9 | 10 | private ArrayList timeFieldObjects; 11 | 12 | public ApiReturnObject(ArrayList timeFieldObjects) { 13 | this.timeFieldObjects = timeFieldObjects; 14 | } 15 | 16 | public ArrayList getTimeFieldObjects() { 17 | return timeFieldObjects; 18 | } 19 | 20 | public void setTimeFieldObjects(ArrayList timeFieldObjects) { 21 | this.timeFieldObjects = timeFieldObjects; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/api/TagReturnObject.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.api; 2 | 3 | import cn.edu.nju.domain.TagObject; 4 | import lombok.AllArgsConstructor; 5 | import lombok.Data; 6 | import lombok.NoArgsConstructor; 7 | 8 | import java.io.Serializable; 9 | import java.util.ArrayList; 10 | 11 | /** 12 | * Created by thpffcj on 2019/10/25. 
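 * WebSocket payload wrapping the list of TagObject entries that WebSocketServer.sendTagData pushes to the client.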
13 | */ 14 | public class TagReturnObject implements Serializable { 15 | 16 | private ArrayList tagObjects; 17 | 18 | public TagReturnObject(ArrayList tagObjects) { 19 | this.tagObjects = tagObjects; 20 | } 21 | 22 | public ArrayList getTagObjects() { 23 | return tagObjects; 24 | } 25 | 26 | public void setTagObjects(ArrayList tagObjects) { 27 | this.tagObjects = tagObjects; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/domain/GameObject.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain; 2 | 3 | import org.springframework.beans.factory.annotation.Autowired; 4 | 5 | import java.io.Serializable; 6 | 7 | public class GameObject implements Serializable { 8 | 9 | @Autowired 10 | private String id; 11 | 12 | @Autowired 13 | private String label; 14 | 15 | @Autowired 16 | private int value; 17 | 18 | @Autowired 19 | private String color; 20 | 21 | public GameObject(String id, String label, int value, String color) { 22 | this.id = id; 23 | this.label = label; 24 | this.value = value; 25 | this.color = color; 26 | } 27 | 28 | public String getId() { 29 | return id; 30 | } 31 | 32 | public void setId(String id) { 33 | this.id = id; 34 | } 35 | 36 | public String getLabel() { 37 | return label; 38 | } 39 | 40 | public void setLabel(String label) { 41 | this.label = label; 42 | } 43 | 44 | public int getValue() { 45 | return value; 46 | } 47 | 48 | public void setValue(int value) { 49 | this.value = value; 50 | } 51 | 52 | public String getColor() { 53 | return color; 54 | } 55 | 56 | public void setColor(String color) { 57 | this.color = color; 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/domain/TagObject.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain; 2 | 3 | import lombok.AllArgsConstructor; 4 | import lombok.Data; 5 | import lombok.NoArgsConstructor; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | 8 | import java.io.Serializable; 9 | 10 | /** 11 | * Created by thpffcj on 2019/10/25. 
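 * A single tag entry (label plus integer value), serialized to JSON as part of a TagReturnObject by TagObjectEncoder.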
12 | */ 13 | public class TagObject implements Serializable { 14 | 15 | @Autowired 16 | private String label; 17 | 18 | @Autowired 19 | private int value; 20 | 21 | public TagObject() { 22 | } 23 | 24 | public TagObject(String label, int value) { 25 | this.label = label; 26 | this.value = value; 27 | } 28 | 29 | public String getLabel() { 30 | return label; 31 | } 32 | 33 | public void setLabel(String label) { 34 | this.label = label; 35 | } 36 | 37 | public int getValue() { 38 | return value; 39 | } 40 | 41 | public void setValue(int value) { 42 | this.value = value; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/encoder/ApiObjectEncoder.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.encoder; 2 | 3 | import javax.websocket.EncodeException; 4 | import javax.websocket.Encoder; 5 | import javax.websocket.EndpointConfig; 6 | 7 | import cn.edu.nju.api.ApiReturnObject; 8 | import com.alibaba.fastjson.JSON; 9 | import com.alibaba.fastjson.serializer.SerializerFeature; 10 | import com.alibaba.fastjson.serializer.SimplePropertyPreFilter; 11 | 12 | public class ApiObjectEncoder implements Encoder.Text { 13 | 14 | @Override 15 | public String encode(ApiReturnObject apiReturnObject) throws EncodeException { 16 | SimplePropertyPreFilter filter = new SimplePropertyPreFilter( 17 | ApiReturnObject.class, "timeFieldObjects"); 18 | return JSON.toJSONString(apiReturnObject,filter,SerializerFeature.DisableCircularReferenceDetect); 19 | } 20 | 21 | @Override 22 | public void init(EndpointConfig endpointConfig) { 23 | 24 | } 25 | 26 | @Override 27 | public void destroy() { 28 | 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/encoder/TagObjectEncoder.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.encoder; 2 | 3 | import cn.edu.nju.api.TagReturnObject; 4 | import com.alibaba.fastjson.JSON; 5 | import com.alibaba.fastjson.serializer.SerializerFeature; 6 | import com.alibaba.fastjson.serializer.SimplePropertyPreFilter; 7 | 8 | import javax.websocket.EncodeException; 9 | import javax.websocket.Encoder; 10 | import javax.websocket.EndpointConfig; 11 | 12 | /** 13 | * Created by thpffcj on 2019/10/26. 
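 * WebSocket Encoder that serializes a TagReturnObject with fastjson, keeping only the tagObjects property
 * and disabling circular-reference detection.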
14 | */ 15 | public class TagObjectEncoder implements Encoder.Text { 16 | 17 | @Override 18 | public String encode(TagReturnObject tagReturnObject) throws EncodeException { 19 | SimplePropertyPreFilter filter = new SimplePropertyPreFilter( 20 | TagReturnObject.class, "tagObjects"); 21 | return JSON.toJSONString(tagReturnObject,filter, SerializerFeature.DisableCircularReferenceDetect); 22 | } 23 | 24 | @Override 25 | public void init(EndpointConfig endpointConfig) { 26 | 27 | } 28 | 29 | @Override 30 | public void destroy() { 31 | 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/utils/DbPool.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.utils; 2 | 3 | import cn.edu.nju.domain.TagObject; 4 | import com.mchange.v2.c3p0.ComboPooledDataSource; 5 | 6 | import java.sql.Connection; 7 | import java.sql.SQLException; 8 | import java.util.ArrayList; 9 | import java.sql.PreparedStatement; 10 | import java.sql.ResultSet; 11 | import java.util.List; 12 | 13 | /** 14 | * Created by thpffcj on 2020/2/26. 15 | */ 16 | public class DbPool { 17 | 18 | private static DbPool instance; 19 | 20 | private ComboPooledDataSource ds; 21 | 22 | private DbPool() throws Exception { 23 | ds = new ComboPooledDataSource(); 24 | ds.setDriverClass("oracle.jdbc.driver.OracleDriver"); //驱动 25 | ds.setJdbcUrl("jdbc:oracle:thin:@localhost:1521:orcl"); //地址 26 | ds.setUser("test0816"); //数据库用户名 27 | ds.setPassword("934617699"); //数据库用户密码 28 | 29 | // 初始化时获取三个连接,取值应在minPoolSize与maxPoolSize之间。Default: 5 initialPoolSize 30 | ds.setInitialPoolSize(5); 31 | // 连接池中保留的最大连接数。Default: 20 maxPoolSize 32 | ds.setMaxPoolSize(20); 33 | // 连接池中保留的最小连接数。 34 | ds.setMinPoolSize(1); 35 | // 当连接池中的连接耗尽的时候c3p0一次同时获取的连接数。Default: 5 acquireIncrement 36 | ds.setAcquireIncrement(10); 37 | } 38 | 39 | // 用来返回该对象 40 | public static final DbPool getInstance() { 41 | 42 | if (instance == null) { 43 | try { 44 | instance = new DbPool(); 45 | } catch (Exception e) { 46 | e.printStackTrace(); 47 | } 48 | } 49 | return instance; 50 | } 51 | 52 | // 返回一个连接 53 | public synchronized final Connection getConnection() { 54 | try { 55 | return ds.getConnection(); 56 | } catch (SQLException e) { 57 | e.printStackTrace(); 58 | } 59 | return null; 60 | } 61 | 62 | public static void main(String[] args) { 63 | DbPool dbPool = DbPool.getInstance() ; 64 | 65 | List list = new ArrayList<>(); 66 | 67 | Connection connection = dbPool.getConnection(); 68 | String sql = "select * from person " ; 69 | 70 | try { 71 | PreparedStatement pt = connection.prepareStatement(sql) ; 72 | ResultSet rt = pt.executeQuery() ; 73 | 74 | while(rt.next()) { 75 | TagObject tag = new TagObject(); 76 | tag.setLabel(rt.getString("label")); 77 | tag.setValue(rt.getInt("value")); 78 | list.add(tag) ; 79 | } 80 | 81 | for(TagObject tag : list) { 82 | System.out.println(tag); 83 | } 84 | } catch (SQLException e) { 85 | e.printStackTrace(); 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/utils/HBaseUtils.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.utils; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.hbase.client.HBaseAdmin; 5 | import org.apache.hadoop.hbase.client.HTable; 6 | import org.apache.hadoop.hbase.client.Put; 7 | import 
org.apache.hadoop.hbase.util.Bytes; 8 | 9 | import java.io.IOException; 10 | 11 | /** 12 | * Created by thpffcj on 2019/10/17. 13 | */ 14 | public class HBaseUtils { 15 | 16 | HBaseAdmin admin = null; 17 | Configuration configuration = null; 18 | 19 | /** 20 | * 私有改造方法 21 | */ 22 | private HBaseUtils() { 23 | configuration = new Configuration(); 24 | configuration.set("hbase.zookeeper.quorum", "192.168.92.130:2181"); 25 | configuration.set("hbase.rootdir", "hdfs://192.168.92.130:8020/hbase"); 26 | 27 | try { 28 | admin = new HBaseAdmin(configuration); 29 | } catch (IOException e) { 30 | e.printStackTrace(); 31 | } 32 | } 33 | 34 | private static HBaseUtils instance = null; 35 | 36 | public static synchronized HBaseUtils getInstance() { 37 | if (null == instance) { 38 | instance = new HBaseUtils(); 39 | } 40 | return instance; 41 | } 42 | 43 | 44 | /** 45 | * 根据表名获取到HTable实例 46 | */ 47 | public HTable getTable(String tableName) { 48 | 49 | HTable table = null; 50 | 51 | try { 52 | table = new HTable(configuration, tableName); 53 | } catch (IOException e) { 54 | e.printStackTrace(); 55 | } 56 | 57 | return table; 58 | } 59 | 60 | /** 61 | * 添加一条记录到HBase表 62 | * 63 | * @param tableName HBase表名 64 | * @param rowkey HBase表的rowkey 65 | * @param cf HBase表的columnfamily 66 | * @param column HBase表的列 67 | * @param value 写入HBase表的值 68 | */ 69 | public void put(String tableName, String rowkey, String cf, String column, String value) { 70 | HTable table = getTable(tableName); 71 | 72 | Put put = new Put(Bytes.toBytes(rowkey)); 73 | put.add(Bytes.toBytes(cf), Bytes.toBytes(column), Bytes.toBytes(value)); 74 | 75 | try { 76 | table.put(put); 77 | } catch (IOException e) { 78 | e.printStackTrace(); 79 | } 80 | } 81 | 82 | public static void main(String[] args) { 83 | 84 | // HTable table = HBaseUtils.getInstance().getTable("imooc_course_clickcount"); 85 | // System.out.println(table.getName().getNameAsString()); 86 | 87 | String tableName = "imooc_course_clickcount"; 88 | String rowkey = "20171111_88"; 89 | String cf = "info"; 90 | String column = "click_count"; 91 | String value = "2"; 92 | HBaseUtils.getInstance().put(tableName, rowkey, cf, column, value); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /spark-streaming/src/main/java/cn/edu/nju/utils/Test.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.utils; 2 | 3 | import java.sql.Connection; 4 | import java.sql.DriverManager; 5 | import java.sql.ResultSet; 6 | import java.sql.Statement; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | /** 11 | * Created by thpffcj on 2019/10/28. 
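 * One-off script that rewrites roll_up so recommendations_up becomes a running total: for each game and month
 * it adds the previous (already accumulated) month's value and updates the row, so rerunning it would inflate the totals.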
12 | */ 13 | public class Test { 14 | 15 | public static void main(String[] args) { 16 | 17 | try { 18 | int[] time = new int[]{1483228800 19 | ,1485907200 20 | ,1488326400 21 | ,1491004800 22 | ,1493596800 23 | ,1496275200 24 | ,1498867200 25 | ,1501545600 26 | ,1504224000 27 | ,1506816000 28 | ,1509494400 29 | ,1512086400 30 | ,1514764800 31 | ,1517443200 32 | ,1519862400 33 | ,1522540800 34 | ,1525132800 35 | ,1527811200 36 | ,1530403200 37 | ,1533081600 38 | ,1535760000 39 | ,1538352000 40 | ,1541030400 41 | ,1543622400 42 | ,1546300800 43 | ,1548979200 44 | ,1551398400 45 | ,1554076800 46 | ,1556668800 47 | ,1559347200 48 | ,1561939200 49 | ,1564617600 50 | ,1567296000 51 | ,1569888000}; 52 | //调用Class.forName()方法加载驱动程序 53 | Class.forName("com.mysql.jdbc.Driver"); 54 | System.out.println("成功加载MySQL驱动!"); 55 | 56 | String url = "jdbc:mysql://172.19.240.128:3306/steam"; //JDBC的URL 57 | Connection conn; 58 | 59 | conn = DriverManager.getConnection(url, "root", "root"); 60 | 61 | Statement stmt = conn.createStatement(); 62 | System.out.println("成功连接到数据库!"); 63 | 64 | String sql = "select distinct name from roll_up"; 65 | ResultSet rs = stmt.executeQuery(sql); 66 | List gameName = new ArrayList<>(); 67 | while (rs.next()) { 68 | gameName.add(rs.getString(1)); 69 | } 70 | 71 | for (int i = 1; i < time.length; i++) { 72 | for (int j = 0; j < gameName.size(); j++) { 73 | 74 | sql = "select recommendations_up from roll_up where name = '" + gameName.get(j) + "' and time = " + time[i]; 75 | rs = stmt.executeQuery(sql); 76 | int up1 = 0; 77 | while (rs.next()) { 78 | up1 = rs.getInt(1); 79 | } 80 | 81 | sql = "select recommendations_up from roll_up where name = '" + gameName.get(j) + "' and time = " + time[i - 1]; 82 | rs = stmt.executeQuery(sql); 83 | int up2 = 0; 84 | while (rs.next()) { 85 | up2 = rs.getInt(1); 86 | } 87 | 88 | System.out.println(up1 + " " + up2); 89 | int up = up1 + up2; 90 | sql = "update roll_up set recommendations_up = " + up + " where name = '" + gameName.get(j) + "' and time = " + time[i]; 91 | System.out.println(sql); 92 | stmt.executeUpdate(sql); 93 | } 94 | } 95 | 96 | rs.close(); 97 | stmt.close(); 98 | conn.close(); 99 | } catch (Exception e) { 100 | e.printStackTrace(); 101 | } 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/RollupCSV/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/RollupCSV/.part-00000-4a85a460-3bdb-48ad-bc81-9b68057082a8-c000.csv.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/spark-streaming/src/main/resources/RollupCSV/.part-00000-4a85a460-3bdb-48ad-bc81-9b68057082a8-c000.csv.crc -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/RollupCSV/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/cloud-computing/64711533b0fe0728c49703aa98833953a35e9530/spark-streaming/src/main/resources/RollupCSV/_SUCCESS -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/game.json: -------------------------------------------------------------------------------- 1 
| { 2 | "img_src": "https://media.st.dl.bscstorage.net/steam/apps/1085660/capsule_sm_120.jpg?t=1570039639", 3 | "game_detail": { 4 | "support_tags": [ 5 | "单人", 6 | "在线多人" 7 | ], 8 | "user_reviews": { 9 | "发行商:": "Bungie", 10 | "发行日期:": "2019年10月1日", 11 | "开发商:": "Bungie" 12 | }, 13 | "user_tags": [ 14 | "第一人称射击", 15 | "多人" 16 | ], 17 | "reviewsChart": { 18 | "weeks": [], 19 | "rollup_type": "week", 20 | "end_date": 1571529600, 21 | "recent": [ 22 | { 23 | "date": 1569888000, 24 | "recommendations_up": 5205, 25 | "recommendations_down": 1467 26 | }, 27 | { 28 | "date": 1569974400, 29 | "recommendations_up": 3881, 30 | "recommendations_down": 1616 31 | } 32 | ], 33 | "rollups": [ 34 | { 35 | "date": 1569888000, 36 | "recommendations_up": 16003, 37 | "recommendations_down": 6234 38 | } 39 | ], 40 | "start_date": 1569888000 41 | } 42 | }, 43 | "original_price": "免费开玩", 44 | "review_summary": "多半好评
30,477 篇用户的游戏评测中有 72% 为好评。", 45 | "price": "免费开玩", 46 | "date": "2019年10月1日", 47 | "name": "Destiny 2", 48 | "page": 1, 49 | "href": "https://store.steampowered.com/app/1085660/Destiny_2/?snr=1_7_7_230_150_1" 50 | } -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/gameAll.json: -------------------------------------------------------------------------------- 1 | { 2 | "page": 1, 3 | "date": "2019年9月26日", 4 | "href": "https://store.steampowered.com/app/678960/CODE_VEIN/?snr=1_7_7_230_150_1", 5 | "review_summary": "特别好评
4,945 篇用户的游戏评测中有 84% 为好评。", 6 | "img_src": "https://media.st.dl.bscstorage.net/steam/apps/678960/capsule_sm_120.jpg?t=1570113292", 7 | "name": "CODE VEIN", 8 | "game_detail": { 9 | "user_reviews": { 10 | "开发商:": "BANDAI NAMCO Studios", 11 | "发行商:": "BANDAI NAMCO Entertainment", 12 | "发行日期:": "2019年9月26日" 13 | }, 14 | "support_tags": [ 15 | "单人", 16 | "在线合作", 17 | "Steam 成就", 18 | "Steam 集换式卡牌", 19 | "部分支持控制器", 20 | "Steam 云" 21 | ], 22 | "user_tags": [ 23 | "动漫", 24 | "角色定制", 25 | "类魂系列", 26 | "角色扮演", 27 | "动作", 28 | "合作", 29 | "日系角色扮演", 30 | "第三人称视角", 31 | "吸血鬼", 32 | "暴力", 33 | "多人", 34 | "黑暗奇幻", 35 | "困难", 36 | "血腥", 37 | "动作角色扮演", 38 | "单人", 39 | "末日", 40 | "砍杀", 41 | "冒险", 42 | "好评原声音轨" 43 | ], 44 | "reviewsChart": { 45 | "rollups": [ 46 | { 47 | "recommendations_up": 2680, 48 | "date": 1569456000, 49 | "recommendations_down": 549 50 | }, 51 | { 52 | "recommendations_up": 907, 53 | "date": 1570060800, 54 | "recommendations_down": 164 55 | }, 56 | { 57 | "recommendations_up": 437, 58 | "date": 1570665600, 59 | "recommendations_down": 74 60 | }, 61 | { 62 | "recommendations_up": 167, 63 | "date": 1571270400, 64 | "recommendations_down": 34 65 | } 66 | ], 67 | "weeks": [], 68 | "start_date": 1569456000, 69 | "rollup_type": "week", 70 | "end_date": 1571616000, 71 | "recent": [ 72 | { 73 | "recommendations_up": 29, 74 | "date": 1569456000, 75 | "recommendations_down": 9 76 | }, 77 | { 78 | "recommendations_up": 918, 79 | "date": 1569542400, 80 | "recommendations_down": 160 81 | }, 82 | { 83 | "recommendations_up": 448, 84 | "date": 1569628800, 85 | "recommendations_down": 131 86 | }, 87 | { 88 | "recommendations_up": 397, 89 | "date": 1569715200, 90 | "recommendations_down": 88 91 | }, 92 | { 93 | "recommendations_up": 374, 94 | "date": 1569801600, 95 | "recommendations_down": 77 96 | }, 97 | { 98 | "recommendations_up": 344, 99 | "date": 1569888000, 100 | "recommendations_down": 48 101 | }, 102 | { 103 | "recommendations_up": 170, 104 | "date": 1569974400, 105 | "recommendations_down": 36 106 | }, 107 | { 108 | "recommendations_up": 197, 109 | "date": 1570060800, 110 | "recommendations_down": 35 111 | }, 112 | { 113 | "recommendations_up": 136, 114 | "date": 1570147200, 115 | "recommendations_down": 36 116 | }, 117 | { 118 | "recommendations_up": 151, 119 | "date": 1570233600, 120 | "recommendations_down": 37 121 | }, 122 | { 123 | "recommendations_up": 131, 124 | "date": 1570320000, 125 | "recommendations_down": 23 126 | }, 127 | { 128 | "recommendations_up": 121, 129 | "date": 1570406400, 130 | "recommendations_down": 17 131 | }, 132 | { 133 | "recommendations_up": 94, 134 | "date": 1570492800, 135 | "recommendations_down": 8 136 | }, 137 | { 138 | "recommendations_up": 77, 139 | "date": 1570579200, 140 | "recommendations_down": 8 141 | }, 142 | { 143 | "recommendations_up": 68, 144 | "date": 1570665600, 145 | "recommendations_down": 11 146 | }, 147 | { 148 | "recommendations_up": 62, 149 | "date": 1570752000, 150 | "recommendations_down": 21 151 | }, 152 | { 153 | "recommendations_up": 68, 154 | "date": 1570838400, 155 | "recommendations_down": 11 156 | }, 157 | { 158 | "recommendations_up": 79, 159 | "date": 1570924800, 160 | "recommendations_down": 12 161 | }, 162 | { 163 | "recommendations_up": 67, 164 | "date": 1571011200, 165 | "recommendations_down": 5 166 | }, 167 | { 168 | "recommendations_up": 54, 169 | "date": 1571097600, 170 | "recommendations_down": 9 171 | }, 172 | { 173 | "recommendations_up": 39, 174 | "date": 1571184000, 175 | "recommendations_down": 5 176 | }, 177 | { 
178 | "recommendations_up": 44, 179 | "date": 1571270400, 180 | "recommendations_down": 7 181 | }, 182 | { 183 | "recommendations_up": 32, 184 | "date": 1571356800, 185 | "recommendations_down": 8 186 | }, 187 | { 188 | "recommendations_up": 43, 189 | "date": 1571443200, 190 | "recommendations_down": 11 191 | }, 192 | { 193 | "recommendations_up": 41, 194 | "date": 1571529600, 195 | "recommendations_down": 7 196 | }, 197 | { 198 | "recommendations_up": 7, 199 | "date": 1571616000, 200 | "recommendations_down": 1 201 | } 202 | ] 203 | } 204 | }, 205 | "price": "¥ 268", 206 | "original_price": "¥ 268" 207 | } 208 | -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/gameDetail.json: -------------------------------------------------------------------------------- 1 | { 2 | "reviewsChart": { 3 | "end_date": 1571616000, 4 | "rollups": [ 5 | { 6 | "recommendations_down": 34, 7 | "date": 1571270400, 8 | "recommendations_up": 167 9 | } 10 | ], 11 | "recent": [ 12 | { 13 | "recommendations_down": 1, 14 | "date": 1571616000, 15 | "recommendations_up": 7 16 | } 17 | ], 18 | "rollup_type": "week", 19 | "weeks": [], 20 | "start_date": 1569456000 21 | }, 22 | "support_tags": [ 23 | "单人", 24 | "在线合作", 25 | "Steam 成就" 26 | ], 27 | "user_tags": [ 28 | "动漫", 29 | "砍杀", 30 | "冒险", 31 | "好评原声音轨" 32 | ], 33 | "user_reviews": { 34 | "发行日期:": "2019年9月26日", 35 | "开发商:": "BANDAI NAMCO Studios", 36 | "发行商:": "BANDAI NAMCO Entertainment" 37 | } 38 | } -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Set the default spark-shell log level to WARN. When running the spark-shell, the 9 | # log level for this class is used to overwrite the root logger's log level, so that 10 | # the user can have different defaults for the shell and regular Spark apps. 
11 | log4j.logger.org.apache.spark.repl.Main=WARN 12 | 13 | # Settings to quiet third party logs that are too verbose 14 | log4j.logger.org.spark_project.jetty=WARN 15 | log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR 16 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 17 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 18 | log4j.logger.org.apache.parquet=ERROR 19 | log4j.logger.parquet=ERROR 20 | 21 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 22 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 23 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/test.txt: -------------------------------------------------------------------------------- 1 | 1,使命召唤1,2019-10-13,好玩1 2 | 4,使命召唤2,2019-10-14,好玩2 3 | 1,使命召唤3,2019-10-15,好玩3 4 | 2,使命召唤4,2019-10-16,好玩4 5 | 1,使命召唤5,2019-10-13,好玩5 6 | 1,使命召唤1,2019-10-15,好玩6 7 | 9999,使命召唤1,2019-10-15,好玩6 8 | https://media.st.dl.bscstorage.net/steam/apps/678960/capsule_sm_120.jpg?t=1570113292 {"reviewsChart": {"end_date": 1571616000, "rollups": [{"recommendations_down": 34, "date": 1571270400, "recommendations_up": 167}, {"recommendations_down": 34, "date": 1571270400, "recommendations_up": 167}], "recent": [{"recommendations_down": 1, "date": 1571616000, "recommendations_up": 7}], "rollup_type": "month", "weeks": [], "start_date": 1569456000}, "support_tags": ["单人", "在线合作", "Steam 成就"], "user_tags": ["动漫", "砍杀", "冒险", "好评原声音轨"], "user_reviews": {"发行日期:": "2019年9月26日", "开发商:": "BANDAI NAMCO Studios", "发行商:": "BANDAI NAMCO Entertainment"}} ¥ 268 ¥ 268 特别好评
4,945 篇用户的游戏评测中有 84% 为好评。 2019年9月26日 CODE VEIN -------------------------------------------------------------------------------- /spark-streaming/src/main/resources/update.sql: -------------------------------------------------------------------------------- 1 | 1483228800 2 | 1485907200 3 | 1488326400 4 | 1491004800 5 | 1493596800 6 | 1496275200 7 | 1498867200 8 | 1501545600 9 | 1504224000 10 | 1506816000 11 | 1509494400 12 | 1512086400 13 | 1514764800 14 | 1517443200 15 | 1519862400 16 | 1522540800 17 | 1525132800 18 | 1527811200 19 | 1530403200 20 | 1533081600 21 | 1535760000 22 | 1538352000 23 | 1541030400 24 | 1543622400 25 | 1546300800 26 | 1548979200 27 | 1551398400 28 | 1554076800 29 | 1556668800 30 | 1559347200 31 | 1561939200 32 | 1564617600 33 | 1567296000 34 | 1569888000 -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/BatchProcess.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util.Properties 5 | 6 | import cn.edu.nju.utils.DateUtils 7 | import org.apache.spark.broadcast.Broadcast 8 | import org.apache.spark.{SparkConf, SparkContext} 9 | import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, SparkSession} 10 | 11 | import scala.collection.mutable.ListBuffer 12 | 13 | 14 | /** 15 | * Created by thpffcj on 2019/10/19. 16 | */ 17 | object BatchProcess { 18 | 19 | def main(args: Array[String]): Unit = { 20 | saveTop10ToCsv() 21 | } 22 | 23 | /** 24 | * 从MySQL中读取top10写入csv文件 25 | */ 26 | def saveTop10ToCsv(): Unit = { 27 | 28 | val sparkConf = new SparkConf().setMaster("local").setAppName("BatchProcess") 29 | val sc = SparkSession.builder().config(sparkConf).getOrCreate() 30 | 31 | val csvSavePath = "src/main/resources/RollupCSV" 32 | 33 | val tableName = "(select name, recommendations_up, time from top10_new order by time) as top10" 34 | val data: DataFrame = readMysqlTable(sc, tableName) 35 | 36 | import sc.implicits._ 37 | data.map(row => { 38 | 39 | val name = row.getAs("name").toString 40 | val types = "game" 41 | val recommendations_up = row.getAs("recommendations_up").toString 42 | val date = DateUtils.tranTimestampToString(row.getAs("time")) 43 | 44 | println((name, types, recommendations_up, date)) 45 | 46 | (name, types, recommendations_up, date) 47 | }).toDF("name", "type", "value", "date").write.mode(SaveMode.Overwrite).csv(csvSavePath) 48 | 49 | sc.stop() 50 | } 51 | 52 | /** 53 | * 按月份统计top10存入MySQL 54 | */ 55 | def saveRollUpToMysql() = { 56 | 57 | val sparkConf = new SparkConf().setMaster("local[1]").setAppName("BatchProcess") 58 | val sc = SparkSession.builder().config(sparkConf).getOrCreate() 59 | 60 | val dates = DateUtils.getSteamDates() 61 | 62 | for (date <- dates) { 63 | val time = (new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(date).getTime / 1000).toInt 64 | println(time) 65 | val tableName = "(select * from roll_up where time = " + time + " order by recommendations_up desc limit 10) as roll_up" 66 | val data: DataFrame = readMysqlTable(sc, tableName) 67 | 68 | val properties = new Properties() 69 | properties.setProperty("user", "root") 70 | properties.setProperty("password", "root") 71 | data.write.mode(SaveMode.Append).jdbc("jdbc:mysql://172.19.240.128:3306/steam", "top10_new", properties) 72 | } 73 | 74 | sc.stop() 75 | } 76 | 77 | // TODO Spark不支持Update操作 78 | def addRollUpByMonth()= { 79 | 80 | val sparkConf = new 
SparkConf().setMaster("local[2]").setAppName("BatchProcess") 81 | val sc = SparkSession.builder().config(sparkConf).getOrCreate() 82 | val dates = DateUtils.getSteamDates() 83 | 84 | var tableName = "(select name from roll_up) as roll_up" 85 | val data: DataFrame = readMysqlTable(sc, tableName) 86 | 87 | // 广播变量 88 | val gameName = new ListBuffer[String] 89 | val broadcast: Broadcast[ListBuffer[String]] = sc.sparkContext.broadcast(gameName) 90 | data.foreach(row => { 91 | broadcast.value.append(row.getAs("name").toString) 92 | }) 93 | 94 | for (game <- gameName) { 95 | tableName = "(select recommendations_up from roll_up where name = '" + game + "') as roll_up" 96 | val data: DataFrame = readMysqlTable(sc, tableName) 97 | data.show() 98 | } 99 | 100 | sc.stop() 101 | } 102 | 103 | def readMysqlTable(sparkSession: SparkSession, tableName: String) = { 104 | 105 | sparkSession 106 | .read 107 | .format("jdbc") 108 | .option("url", "jdbc:mysql://172.19.240.128:3306/steam") 109 | .option("user", "root") 110 | .option("password", "root") 111 | .option("dbtable", tableName) 112 | .load() 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/HDFSProcess.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import java.sql.DriverManager 4 | 5 | import cn.edu.nju.domain.CommentLog 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.streaming.{Seconds, StreamingContext} 8 | 9 | /** 10 | * Created by thpffcj on 2019/10/2. 11 | */ 12 | object HDFSProcess { 13 | 14 | def main(args: Array[String]): Unit = { 15 | 16 | val sparkConf = new SparkConf().setMaster("local[2]").setAppName("HDFSProcess") 17 | // val sparkConf = new SparkConf().setMaster("spark://thpffcj:7077").setAppName("HDFSProcess") 18 | 19 | // 创建StreamingContext需要两个参数:SparkConf和batch interval 20 | val ssc = new StreamingContext(sparkConf, Seconds(5)) 21 | 22 | // 如果使用了stateful的算子,必须要设置checkpoint 23 | // 在生产环境中,建议把checkpoint设置到HDFS的某个文件夹中 24 | // . 
代表当前目录 25 | ssc.checkpoint("/Users/thpffcj/Public/file/cloud_checkpoint/hdfs_process") 26 | 27 | // val data = ssc.textFileStream("hdfs://thpffcj:9000/cloud-computing/") 28 | // nc -lk 9999 29 | val data = ssc.socketTextStream("localhost", 9999) 30 | 31 | // 构建黑名单 32 | val blacks = List("9999") 33 | val blacksRDD = ssc.sparkContext.parallelize(blacks).map(x => (x, true)) 34 | 35 | // 过滤黑名单 36 | val cleanData = data.map(line => (line.split(",")(0), line)) 37 | .transform(rdd => { 38 | rdd.leftOuterJoin(blacksRDD) 39 | .filter(x => x._2._2.getOrElse(false) != true) 40 | .map(x => x._2._1) 41 | }) 42 | 43 | val logs = cleanData.map(line => { 44 | val infos = line.split(",") 45 | CommentLog(infos(0), infos(1), infos(2), infos(3)) 46 | }).filter(commentLog => commentLog.gameName != "") 47 | 48 | // 按游戏名统计评论数 49 | val gameNumber = logs.map(log => { 50 | (log.gameName, 1) 51 | }).updateStateByKey[Int](updateFunction _) 52 | 53 | gameNumber.print() 54 | 55 | ssc.start() 56 | ssc.awaitTermination() 57 | } 58 | 59 | /** 60 | * 把当前的数据去更新已有的或者是旧的数据 61 | * @param currentValues 当前数据 62 | * @param preValues 旧数据 63 | * @return 64 | */ 65 | def updateFunction(currentValues: Seq[Int], preValues: Option[Int]): Option[Int] = { 66 | val current = currentValues.sum 67 | val pre = preValues.getOrElse(0) 68 | Some(current + pre) 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/KafkaProcess.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import cn.edu.nju.domain.UserData 4 | import org.apache.kafka.clients.consumer.ConsumerConfig 5 | import org.apache.kafka.common.serialization.StringDeserializer 6 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 7 | import org.apache.spark.streaming.{Seconds, StreamingContext} 8 | import org.apache.spark.{SparkConf, SparkContext} 9 | 10 | /** 11 | * Created by thpffcj on 2019/10/19. 
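 *
 * Consumes user behaviour records (tab-separated userId, gameName, behavior,
 * duration) from the Kafka topic "steam", then keeps a running purchase count
 * per game with updateStateByKey, sums play time and play counts per game for
 * the average-duration view, and extracts per-user play time for Dota 2.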
12 | */ 13 | object KafkaProcess { 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | val sparkConf = new SparkConf().setMaster("local[2]").setAppName("StreamProcess") 18 | val ssc = new StreamingContext(sparkConf, Seconds(5)) 19 | 20 | ssc.checkpoint("/Users/thpffcj/Public/file/cloud_checkpoint/stream_process") 21 | 22 | val bootstrapServers = "thpffcj1:9092" 23 | val groupId = "test" 24 | val topicName = "steam" 25 | val maxPoll = 20000 26 | 27 | val kafkaParams = Map( 28 | ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> bootstrapServers, 29 | ConsumerConfig.GROUP_ID_CONFIG -> groupId, 30 | ConsumerConfig.MAX_POLL_RECORDS_CONFIG -> maxPoll.toString, 31 | ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer], 32 | ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer] 33 | ) 34 | 35 | val messages = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent, 36 | ConsumerStrategies.Subscribe[String, String](Set(topicName), kafkaParams)) 37 | 38 | val rawData = messages.map(_.value()) 39 | 40 | val data = rawData.map(line => { 41 | val record = line.split("\t") 42 | UserData(record(0), record(1), record(2), record(3).toDouble) 43 | }) 44 | 45 | // 游戏销量 46 | val gameSale = data.filter(userData => userData.behavior == "purchase") 47 | .map(userData => { 48 | (userData.gameName, 1) 49 | }).updateStateByKey[Int](updateFunction _) 50 | 51 | gameSale.print() 52 | 53 | // 游戏游玩平均时长 54 | val gamePopularity = data.filter(userData => userData.behavior == "play").map( 55 | userData => { 56 | (userData.gameName, (userData.duration, 1)) 57 | } 58 | ).reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2)) 59 | 60 | gamePopularity.print() 61 | 62 | // Dota 2游玩时长 63 | val gameDuration = data.filter( 64 | userData => userData.gameName == "Dota 2" & userData.behavior == "play").map( 65 | userData => { 66 | (userData.userId, userData.duration) 67 | }) 68 | 69 | gameDuration.print() 70 | 71 | ssc.start() 72 | ssc.awaitTermination() 73 | } 74 | 75 | /** 76 | * 把当前的数据去更新已有的或者是旧的数据 77 | * @param currentValues 当前数据 78 | * @param preValues 旧数据 79 | * @return 80 | */ 81 | def updateFunction(currentValues: Seq[Int], preValues: Option[Int]): Option[Int] = { 82 | val current = currentValues.sum 83 | val pre = preValues.getOrElse(0) 84 | Some(current + pre) 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/MongoDBProcess.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import com.mongodb.spark.MongoSpark 4 | import org.apache.log4j.{Level, Logger} 5 | import org.apache.spark.sql.DataFrame 6 | 7 | /** 8 | * Created by thpffcj on 2019/9/24. 
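 *
 * Loads the China.games collection through the MongoDB Spark connector
 * (spark.mongodb.input.uri), registers it as a temporary view named "games"
 * and lists the game names with Spark SQL. The masked host in the URI has to
 * be filled in before the job will run.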
9 | */ 10 | object MongoDBProcess { 11 | 12 | Logger.getLogger("org").setLevel(Level.ERROR) 13 | 14 | def main(args: Array[String]): Unit = { 15 | 16 | import org.apache.spark.sql.SparkSession 17 | 18 | val spark = SparkSession.builder() 19 | .master("local[2]") 20 | .appName("MongoDBProcess") 21 | .config("spark.mongodb.input.uri", "mongodb://steam:steam@***.***.***.***:27017/steam_db.China.games") 22 | .getOrCreate() 23 | 24 | val frame: DataFrame = MongoSpark.load(spark) 25 | frame.createTempView("games") 26 | 27 | val res: DataFrame = spark.sql("SELECT name from games") 28 | res.show() 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/MySQLProcess.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util 5 | 6 | import cn.edu.nju.api.{ApiReturnObject, TagReturnObject} 7 | import cn.edu.nju.domain.{GameObject, TagObject} 8 | import org.apache.spark.sql.{DataFrame, SQLContext} 9 | import org.apache.spark.{SparkConf, SparkContext} 10 | 11 | import scala.collection.mutable.ListBuffer 12 | import scala.util.Random 13 | 14 | /** 15 | * Created by thpffcj on 2019/10/24. 16 | */ 17 | class MySQLProcess { 18 | 19 | /** 20 | * 返回动态图所需数据 21 | * @param dates 22 | * @return 23 | */ 24 | def getTimeFieldData(dates: ListBuffer[String]): ApiReturnObject = { 25 | 26 | val sparkConf = new SparkConf().setMaster("local[1]").setAppName("MySQLProcess") 27 | val sc = new SparkContext(sparkConf) 28 | 29 | val sqlContext = new SQLContext(sc) 30 | 31 | val timeFieldObjects = new util.ArrayList[TimeFieldObject] 32 | 33 | for (date <- dates){ 34 | val time = (new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(date).getTime / 1000).toInt 35 | val tableName = "(select * from top10 where time = " + time + " order by recommendations_up desc limit 10) as top10" 36 | val data: DataFrame = readMysqlTable(sqlContext, tableName) 37 | 38 | val gameObjects = new util.ArrayList[GameObject] 39 | val broadcast = sc.broadcast(gameObjects) 40 | var id = 1 41 | data.foreach(row => { 42 | 43 | val name = row.getAs("name").toString 44 | var color = "" 45 | if (MySQLProcess.map.containsKey(name)) { 46 | color = MySQLProcess.map.get(name).toString 47 | } else { 48 | // rgb(218, 198, 76) 49 | color = "rgb(" + Random.nextInt(255) + ", " + Random.nextInt(255) + ", " + Random.nextInt(255) + ")" 50 | MySQLProcess.map.put(name, color) 51 | } 52 | 53 | val gameObject = new GameObject(id.toString, name, row.getAs("recommendations_up"), color) 54 | broadcast.value.add(gameObject) 55 | id = id + 1 56 | }) 57 | 58 | val name = "截止" + date.substring(0, 4) + "年" + date.substring(5, 7) + "月" + "好评累计总数" 59 | val timeFieldObject = new TimeFieldObject(name, broadcast.value) 60 | timeFieldObjects.add(timeFieldObject) 61 | } 62 | 63 | val apiReturnObject = new ApiReturnObject(timeFieldObjects) 64 | 65 | sc.stop() 66 | 67 | apiReturnObject 68 | } 69 | 70 | /** 71 | * 返回词云需要的数据 72 | * @return 73 | */ 74 | def getTagData(round: Int): TagReturnObject = { 75 | 76 | val sparkConf = new SparkConf().setMaster("local[1]").setAppName("MySQLProcess") 77 | val sc = new SparkContext(sparkConf) 78 | 79 | val sqlContext = new SQLContext(sc) 80 | 81 | val tableName = "(select * from tag limit " + 0 + "," + round * 50 + ") as top10" 82 | println(tableName) 83 | val data: DataFrame = readMysqlTable(sqlContext, tableName) 84 | 85 | val tagObjects =new 
util.ArrayList[TagObject] 86 | val broadcast = sc.broadcast(tagObjects) 87 | data.foreach(row => { 88 | val tagObject = new TagObject(row.getAs("game_name"), row.getAs("number")) 89 | broadcast.value.add(tagObject) 90 | }) 91 | 92 | val tagReturnObject = new TagReturnObject(tagObjects) 93 | 94 | sc.stop() 95 | 96 | tagReturnObject 97 | } 98 | 99 | def readMysqlTable(sqlContext: SQLContext, tableName: String) = { 100 | sqlContext 101 | .read 102 | .format("jdbc") 103 | .option("driver", "com.mysql.jdbc.Driver") 104 | .option("url", "jdbc:mysql://172.19.240.128:3306/steam") 105 | .option("user", "root") 106 | .option("password", "root") 107 | .option("dbtable", tableName) 108 | .load() 109 | } 110 | } 111 | 112 | object MySQLProcess { 113 | 114 | val map = new util.HashMap[String, String]() 115 | } 116 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/SteamProcess.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import cn.edu.nju.dao.{RollUpDAO, TagDAO} 4 | import cn.edu.nju.domain.{GameDetail, ReviewsChart, RollUp, SteamLog, Tag} 5 | import com.google.gson.Gson 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.streaming.dstream.DStream 8 | import org.apache.spark.streaming.{Seconds, StreamingContext} 9 | 10 | import scala.collection.mutable.{ListBuffer, Set} 11 | 12 | /** 13 | * Created by thpffcj on 2019/10/21. 14 | */ 15 | object SteamProcess { 16 | 17 | def main(args: Array[String]): Unit = { 18 | 19 | // val sparkConf = new SparkConf().setMaster("local[2]").setAppName("HDFSProcess") 20 | val sparkConf = new SparkConf().setMaster("spark://thpffcj:7077").setAppName("SteamProcess") 21 | 22 | val ssc = new StreamingContext(sparkConf, Seconds(5)) 23 | 24 | // 如果使用了stateful的算子,必须要设置checkpoint 25 | // 在生产环境中,建议把checkpoint设置到HDFS的某个文件夹中 26 | // . 
代表当前目录 27 | ssc.checkpoint("/Users/thpffcj/Public/file/cloud_checkpoint/hdfs_process") 28 | 29 | val rawData = ssc.textFileStream("hdfs://thpffcj:9000/cloud-computing/") 30 | 31 | val gameNameSet: Set[String] = Set() 32 | 33 | /** 34 | * 过滤空行 35 | * 过滤日期为空的数据 36 | * 过滤重复的数据,使用游戏名称过滤 37 | * 过滤game_detail为空的数据 38 | * 为了game_detail为bundle的数据 39 | */ 40 | val data = rawData.filter(rdd => !rdd.isEmpty).map(line => { 41 | val log = line.split("\t") 42 | if (log.length < 7) { 43 | SteamLog("", "", "", "", "", "", "") 44 | } else { 45 | SteamLog(log(0), log(1), log(2), log(3), log(4), log(5), log(6)) 46 | } 47 | }).filter(steamLog => !steamLog.date.isEmpty) 48 | .filter(steamLog => !gameNameSet.contains(steamLog.name)) 49 | .filter(steamLog => !steamLog.game_detail.isEmpty) 50 | .filter(steamLog => !steamLog.game_detail.equals("bundle")) 51 | .map(steamLog => { 52 | gameNameSet.add(steamLog.name) 53 | steamLog 54 | }) 55 | 56 | // 取出用户标签 57 | val userTags = data.map(steamLog => { 58 | val gameDetail = jsonToGameDetail(steamLog.game_detail) 59 | if (gameDetail != null) { 60 | gameDetail.user_tags.toString.replace(" ", "") 61 | } else { 62 | null 63 | } 64 | }).filter(userTags => userTags != null) 65 | 66 | // 标签统计 67 | val tagsNumber = userTags.flatMap(line => line.substring(1, line.length - 1).split(",")) 68 | .map(tag => (tag, 1)).updateStateByKey[Int](updateFunction _) 69 | 70 | // writeTagToMysql(tagsNumber) 71 | tagsNumber.print() 72 | 73 | /** 74 | * (steamLog.name,jsonToReviewsChart(gameDetail.reviewsChart.toString)) 75 | * (CODE VEIN,{recommendations_down=34.0,date=1.5712704E9,recommendations_up=167.0},{recommendations_down=34.0,date=1.5712704E9,recommendations_up=167.0) 76 | */ 77 | val rollups = data.map(steamLog => { 78 | val gameDetail = jsonToGameDetail(steamLog.game_detail) 79 | 80 | // 过滤 reviewsChart["start_date"] 和 reviewsChart["end_date"] 为空的数据 81 | if ((gameDetail != null) && (gameDetail.reviewsChart.get("start_date") != "None") 82 | && (gameDetail.reviewsChart.get("end_date") != "None")) { 83 | (steamLog.name, jsonToReviewsChart(gameDetail.reviewsChart.toString)) 84 | } else { 85 | null 86 | } 87 | }).filter(rollups => rollups != null) 88 | // 目前只考虑以月为时间单位的数据 89 | .filter(reviewsChart => reviewsChart._2.rollup_type == "month") 90 | .map(reviewsChart => { 91 | val line = reviewsChart._2.rollups.toString 92 | (reviewsChart._1, line.substring(1, line.length - 2).replace(" ", "")) 93 | }) 94 | 95 | // 将每个游戏好评数写入到MySQL 96 | rollups.foreachRDD(rdd => { 97 | rdd.foreachPartition(partitionOfRecords => { 98 | val list = new ListBuffer[(String, Int, Int, Int)] 99 | 100 | partitionOfRecords.foreach(record => { 101 | record._2.split("},").foreach(data => { 102 | val rollUp = jsonToRollUp(data + "}") 103 | list.append((record._1, rollUp.date, rollUp.recommendations_up, rollUp.recommendations_down)) 104 | }) 105 | }) 106 | 107 | RollUpDAO.insertRollUp(list) 108 | }) 109 | }) 110 | 111 | // 单条插入 112 | // rollups.foreachRDD(rdd => { 113 | // rdd.foreachPartition(partitionOfRecords => { 114 | // val connection = createConnection() 115 | // partitionOfRecords.foreach(record => { 116 | // record._2.split("},").foreach(data => { 117 | // val rollUp = jsonToRollUp(data + "}") 118 | // val sql = "insert into roll_up(name, time, recommendations_up, recommendations_down) values('" + record._1.replace("'", "") + "'," + rollUp.date + "," + rollUp.recommendations_up + "," + rollUp.recommendations_down + ")" 119 | // connection.createStatement().execute(sql) 120 | // }) 121 | // }) 122 | // 
connection.close() 123 | // }) 124 | // }) 125 | 126 | rollups.print() 127 | 128 | ssc.start() 129 | ssc.awaitTermination() 130 | } 131 | 132 | // def createConnection() = { 133 | // Class.forName("com.mysql.jdbc.Driver") 134 | // DriverManager.getConnection("jdbc:mysql://localhost:3306/steam?useUnicode=true&characterEncoding=utf-8", "root", "000000") 135 | // } 136 | 137 | def jsonToGameDetail(jsonStr: String): GameDetail = { 138 | try { 139 | val gson = new Gson() 140 | gson.fromJson(jsonStr, classOf[GameDetail]) 141 | } catch { 142 | case e: Exception => { 143 | // e.printStackTrace() 144 | null 145 | } 146 | } 147 | } 148 | 149 | def jsonToReviewsChart(jsonStr: String): ReviewsChart = { 150 | try { 151 | val gson = new Gson() 152 | gson.fromJson(jsonStr, classOf[ReviewsChart]) 153 | } catch { 154 | case e: Exception => { 155 | // e.printStackTrace() 156 | null 157 | } 158 | } 159 | } 160 | 161 | def jsonToRollUp(jsonStr: String): RollUp = { 162 | try { 163 | val gson = new Gson() 164 | gson.fromJson(jsonStr, classOf[RollUp]) 165 | } catch { 166 | case e: Exception => { 167 | // e.printStackTrace() 168 | null 169 | } 170 | } 171 | } 172 | 173 | /** 174 | * 把当前的数据去更新已有的或者是旧的数据 175 | * 176 | * @param currentValues 当前数据 177 | * @param preValues 旧数据 178 | * @return 179 | */ 180 | def updateFunction(currentValues: Seq[Int], preValues: Option[Int]): Option[Int] = { 181 | val current = currentValues.sum 182 | val pre = preValues.getOrElse(0) 183 | Some(current + pre) 184 | } 185 | 186 | /** 187 | * 标签数据写入MySQL 188 | * @param tagsNumber 189 | */ 190 | def writeTagToMysql(tagsNumber: DStream[(String, Int)]): Unit = { 191 | 192 | tagsNumber.foreachRDD(rdd => { 193 | rdd.foreachPartition(partitionOfRecords => { 194 | val list = new ListBuffer[Tag] 195 | partitionOfRecords.foreach(record => { 196 | list.append(Tag(record._1, record._2)) 197 | }) 198 | TagDAO.insertTag(list) 199 | }) 200 | }) 201 | } 202 | } 203 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/dao/CourseClickCountDAO.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.dao 2 | 3 | import cn.edu.nju.domain.CourseClickCount 4 | import cn.edu.nju.utils.HBaseUtils 5 | import org.apache.hadoop.hbase.client.Get 6 | import org.apache.hadoop.hbase.util.Bytes 7 | 8 | import scala.collection.mutable.ListBuffer/** 9 | * Created by thpffcj on 2019/10/17. 
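 *
 * HBase DAO for per-day, per-course click totals in the
 * imooc_course_clickcount table. Rows are keyed as "yyyyMMdd_courseId"
 * (e.g. "20171111_88"); save() uses incrementColumnValue so repeated batches
 * accumulate, and count() reads the current value back with a Get.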
10 | */ 11 | object CourseClickCountDAO { 12 | 13 | val tableName = "imooc_course_clickcount" 14 | val cf = "info" 15 | val qualifer = "click_count" 16 | 17 | /** 18 | * 保存数据到HBase 19 | * @param list CourseClickCount集合 20 | */ 21 | def save(list: ListBuffer[CourseClickCount]): Unit = { 22 | 23 | val table = HBaseUtils.getInstance().getTable(tableName) 24 | 25 | for(ele <- list) { 26 | table.incrementColumnValue(Bytes.toBytes(ele.day_course), 27 | Bytes.toBytes(cf), 28 | Bytes.toBytes(qualifer), 29 | ele.click_count) 30 | } 31 | } 32 | 33 | /** 34 | * 根据rowkey查询值 35 | */ 36 | def count(day_course: String): Long = { 37 | val table = HBaseUtils.getInstance().getTable(tableName) 38 | 39 | val get = new Get(Bytes.toBytes(day_course)) 40 | val value = table.get(get).getValue(cf.getBytes, qualifer.getBytes) 41 | 42 | if(value == null) { 43 | 0L 44 | }else{ 45 | Bytes.toLong(value) 46 | } 47 | } 48 | 49 | def main(args: Array[String]): Unit = { 50 | 51 | val list = new ListBuffer[CourseClickCount] 52 | list.append(CourseClickCount("20171111_8",8)) 53 | list.append(CourseClickCount("20171111_9",9)) 54 | list.append(CourseClickCount("20171111_1",100)) 55 | 56 | save(list) 57 | 58 | println(count("20171111_8") + " : " + count("20171111_9")+ " : " + count("20171111_1")) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/dao/CourseSearchClickCountDAO.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.dao 2 | 3 | import cn.edu.nju.domain.CourseSearchClickCount 4 | import cn.edu.nju.utils.HBaseUtils 5 | import org.apache.hadoop.hbase.client.Get 6 | import org.apache.hadoop.hbase.util.Bytes 7 | 8 | import scala.collection.mutable.ListBuffer 9 | 10 | /** 11 | * Created by thpffcj on 2019/10/17. 
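 *
 * Same pattern as CourseClickCountDAO, but keyed by day, referer host and
 * course id (e.g. "20171111_www.baidu.com_8") so clicks arriving from search
 * engines can be counted separately.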
12 | */ 13 | object CourseSearchClickCountDAO { 14 | 15 | val tableName = "imooc_course_search_clickcount" 16 | val cf = "info" 17 | val qualifer = "click_count" 18 | 19 | /** 20 | * 保存数据到HBase 21 | * 22 | * @param list CourseSearchClickCount集合 23 | */ 24 | def save(list: ListBuffer[CourseSearchClickCount]): Unit = { 25 | 26 | val table = HBaseUtils.getInstance().getTable(tableName) 27 | 28 | for(ele <- list) { 29 | table.incrementColumnValue(Bytes.toBytes(ele.day_search_course), 30 | Bytes.toBytes(cf), 31 | Bytes.toBytes(qualifer), 32 | ele.click_count) 33 | } 34 | } 35 | 36 | /** 37 | * 根据rowkey查询值 38 | */ 39 | def count(day_search_course: String):Long = { 40 | val table = HBaseUtils.getInstance().getTable(tableName) 41 | 42 | val get = new Get(Bytes.toBytes(day_search_course)) 43 | val value = table.get(get).getValue(cf.getBytes, qualifer.getBytes) 44 | 45 | if(value == null) { 46 | 0L 47 | }else{ 48 | Bytes.toLong(value) 49 | } 50 | } 51 | 52 | def main(args: Array[String]): Unit = { 53 | 54 | val list = new ListBuffer[CourseSearchClickCount] 55 | list.append(CourseSearchClickCount("20171111_www.baidu.com_8",8)) 56 | list.append(CourseSearchClickCount("20171111_cn.bing.com_9",9)) 57 | 58 | save(list) 59 | 60 | println(count("20171111_www.baidu.com_8") + " : " + count("20171111_cn.bing.com_9")) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/dao/RollUpDAO.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.dao 2 | 3 | import cn.edu.nju.domain.RollUp 4 | import java.sql.{Connection, PreparedStatement} 5 | 6 | import cn.edu.nju.utils.MySQLUtils 7 | 8 | import scala.collection.mutable.ListBuffer 9 | 10 | /** 11 | * Created by thpffcj on 2019/10/25. 12 | */ 13 | object RollUpDAO { 14 | 15 | /** 16 | * 批量保存RollUp到数据库 17 | */ 18 | def insertRollUp(list: ListBuffer[(String, Int, Int, Int)]): Unit = { 19 | 20 | var connection: Connection = null 21 | var pstmt: PreparedStatement = null 22 | 23 | try { 24 | connection = MySQLUtils.getConnection() 25 | 26 | connection.setAutoCommit(false) //设置手动提交 27 | 28 | val sql = "insert into roll_up(name, time, recommendations_up, recommendations_down) values (?,?,?,?) " 29 | pstmt = connection.prepareStatement(sql) 30 | 31 | for (element <- list) { 32 | pstmt.setString(1, element._1) 33 | pstmt.setInt(2, element._2) 34 | pstmt.setInt(3, element._3) 35 | pstmt.setInt(4, element._4) 36 | 37 | pstmt.addBatch() 38 | } 39 | 40 | pstmt.executeBatch() // 执行批量处理 41 | connection.commit() // 手工提交 42 | } catch { 43 | case e: Exception => e.printStackTrace() 44 | } finally { 45 | MySQLUtils.release(connection, pstmt) 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/dao/TagDAO.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.dao 2 | 3 | import cn.edu.nju.domain.Tag 4 | import java.sql.{Connection, PreparedStatement} 5 | import cn.edu.nju.utils.MySQLUtils 6 | import scala.collection.mutable.ListBuffer 7 | 8 | /** 9 | * Created by thpffcj on 2019/10/25. 
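 *
 * Batch-inserts tag counts into the MySQL table tag(game_name, number) using a
 * PreparedStatement batch with a manual commit. Illustrative call (the tag
 * values here are made up):
 *   TagDAO.insertTag(ListBuffer(Tag("indie", 42), Tag("action", 17)))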
10 | */ 11 | object TagDAO { 12 | 13 | /** 14 | * 批量保存Tag到数据库 15 | */ 16 | def insertTag(list: ListBuffer[Tag]): Unit = { 17 | 18 | var connection: Connection = null 19 | var pstmt: PreparedStatement = null 20 | 21 | try { 22 | connection = MySQLUtils.getConnection() 23 | 24 | connection.setAutoCommit(false) //设置手动提交 25 | 26 | val sql = "insert into tag(game_name, number) values (?,?)" 27 | pstmt = connection.prepareStatement(sql) 28 | 29 | for (element <- list) { 30 | pstmt.setString(1, element.tagName) 31 | pstmt.setInt(2, element.number) 32 | 33 | pstmt.addBatch() 34 | } 35 | 36 | pstmt.executeBatch() // 执行批量处理 37 | connection.commit() // 手工提交 38 | } catch { 39 | case e: Exception => e.printStackTrace() 40 | } finally { 41 | MySQLUtils.release(connection, pstmt) 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/ClickLog.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | /** 4 | * Created by thpffcj on 2019/10/17. 5 | */ 6 | case class ClickLog(ip:String, time:String, courseId:Int, statusCode:Int, referrer:String) 7 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/CommentLog.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | /** 4 | * Created by thpffcj on 2019/10/17. 5 | */ 6 | case class CommentLog(userId:String, gameName:String, commentTime:String, comment:String) 7 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/CourseClickCount.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | /** 4 | * Created by thpffcj on 2019/10/17. 5 | */ 6 | case class CourseClickCount(day_course:String, click_count:Long) 7 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/CourseSearchClickCount.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | /** 4 | * Created by thpffcj on 2019/10/17. 5 | */ 6 | case class CourseSearchClickCount(day_search_course:String, click_count:Long) 7 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/DouBanLog.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | /** 4 | * Created by thpffcj on 2019/10/18. 5 | */ 6 | case class DouBanLog(star:Double, bd:String, quote:String, title:String) 7 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/GameDetail.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | import com.alibaba.fastjson.JSONObject 4 | 5 | /** 6 | * Created by thpffcj on 2019/10/21. 
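 *
 * Gson target for the game_detail JSON scraped from each Steam store page:
 * the supported-feature tags, the "user reviews" info block, the user-defined
 * tags and the review-count chart (see the sample record in test.txt).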
7 | * @param support_tags 8 | * @param user_reviews 9 | * @param user_tags 10 | * @param reviewsChart 11 | */ 12 | case class GameDetail(support_tags: Object, user_reviews: JSONObject, user_tags: Object, 13 | reviewsChart: JSONObject) 14 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/ReviewsChart.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | /** 4 | * Created by thpffcj on 2019/10/21. 5 | * 6 | * @param weeks 7 | * @param rollup_type 8 | * @param end_date 9 | * @param recent 10 | * @param rollups 11 | * @param start_date 12 | */ 13 | case class ReviewsChart(weeks: Object, rollup_type: String, end_date: Float, recent: Object, 14 | rollups: Object, start_date: Float) 15 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/RollUp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | /** 4 | * Created by thpffcj on 2019/10/23. 5 | */ 6 | case class RollUp(recommendations_up: Int, date: Int, recommendations_down: Int) 7 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/SteamLog.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | import com.alibaba.fastjson.JSONObject 4 | 5 | /** 6 | * Created by thpffcj on 2019/10/21. 7 | * 8 | * @param img_src 9 | * @param game_detail 10 | * @param original_price 11 | * @param price 12 | * @param review_summary 13 | * @param date 14 | * @param name 15 | */ 16 | case class SteamLog(img_src: String, game_detail: String, original_price: String, 17 | price: String, review_summary: String, date: String, name: String) 18 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/Tag.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | /** 4 | * Created by thpffcj on 2019/10/25. 5 | */ 6 | case class Tag(tagName: String, number: Int) 7 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/domain/UserData.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain 2 | 3 | /** 4 | * Created by thpffcj on 2019/10/19. 5 | * @param userId 游戏玩家ID号 6 | * @param gameName 游戏名称 7 | * @param behavior 玩家购买游戏的行为(购买/玩) 8 | * @param duration 游戏时长,1代表该买了该游戏 9 | */ 10 | case class UserData(userId:String, gameName:String, behavior:String, duration:Double) 11 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/test/DateTest.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.test 2 | 3 | import java.text.SimpleDateFormat 4 | 5 | import cn.edu.nju.utils.DateUtils 6 | 7 | /** 8 | * Created by thpffcj on 2019/10/25. 
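 *
 * Prints the Unix timestamps for the monthly cut-off dates returned by
 * DateUtils.getSteamDates(); the output matches the values listed in
 * update.sql.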
9 | */ 10 | object DateTest { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | val startDate = "2017-03-01 08:00:00" 15 | val startTime: Int = (new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(startDate).getTime / 1000).toInt 16 | 17 | val endDate = "2019-10-01 08:00:00" 18 | val endTime : Int = (new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(endDate).getTime / 1000).toInt 19 | 20 | for (date <- DateUtils.getSteamDates()) { 21 | println((new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(date).getTime / 1000).toInt) 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/test/HDFSProcessTest.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.test 2 | 3 | import cn.edu.nju.domain.{CommentLog, DouBanLog} 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.{Seconds, StreamingContext} 6 | 7 | /** 8 | * Created by thpffcj on 2019/10/2. 9 | */ 10 | object HDFSProcessTest { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | val sparkConf = new SparkConf().setMaster("local[2]").setAppName("HDFSProcess") 15 | 16 | // 创建StreamingContext需要两个参数:SparkConf和batch interval 17 | val ssc = new StreamingContext(sparkConf, Seconds(5)) 18 | 19 | val data = ssc.textFileStream("hdfs://thpffcj:9000/cloud-computing/") 20 | 21 | val log = data.map(line => { 22 | 23 | val infos = line.split("\t") 24 | 25 | DouBanLog(infos(0).toDouble, infos(1), infos(2), infos(3)) 26 | }) 27 | 28 | log.print() 29 | 30 | ssc.start() 31 | ssc.awaitTermination() 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/test/JsonTest.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.test 2 | 3 | import cn.edu.nju.domain.{GameDetail, ReviewsChart, UserData} 4 | import com.google.gson.Gson 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | 7 | import scala.util.parsing.json.JSONObject 8 | 9 | /** 10 | * Created by thpffcj on 2019/10/21. 
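 *
 * Ad-hoc check that Gson can map the scraped game_detail JSON onto GameDetail
 * and the nested reviewsChart JSON onto ReviewsChart.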
11 | */ 12 | object JsonTest { 13 | 14 | def main(args: Array[String]): Unit = { 15 | 16 | // val result1 = jsonToGameDetail("{\"reviewsChart\": {\"end_date\": 1571616000, \"rollups\": [{\"recommendations_down\": 34, \"date\": 1571270400, \"recommendations_up\": 167}], \"recent\": [{\"recommendations_down\": 1, \"date\": 1571616000, \"recommendations_up\": 7}], \"rollup_type\": \"week\", \"weeks\": [], \"start_date\": 1569456000}, \"support_tags\": [\"单人\", \"在线合作\", \"Steam 成就\"], \"user_tags\": [\"动漫\", \"砍杀\", \"冒险\", \"好评原声音轨\"], \"user_reviews\": {\"发行日期:\": \"2019年9月26日\", \"开发商:\": \"BANDAI NAMCO Studios\", \"发行商:\": \"BANDAI NAMCO Entertainment\"}}") 17 | // print(result1) 18 | 19 | val result2 = jsonToReviewsChart("{\"end_date\":1.571616E9,\"weeks\":[],\"rollup_type\":\"week\",\"recent\":[{\"recommendations_down\":1.0,\"date\":1.571616E9,\"recommendations_up\":7.0}],\"rollups\":[{\"recommendations_down\":34.0,\"date\":1.5712704E9,\"recommendations_up\":167.0}],\"start_date\":1.569456E9}") 20 | print(result2) 21 | 22 | } 23 | 24 | def jsonToGameDetail(jsonStr: String): GameDetail = { 25 | val gson = new Gson() 26 | gson.fromJson(jsonStr, classOf[GameDetail]) 27 | } 28 | 29 | def jsonToReviewsChart(jsonStr: String): ReviewsChart = { 30 | val gson = new Gson() 31 | gson.fromJson(jsonStr, classOf[ReviewsChart]) 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/test/StatStreaming.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.test 2 | 3 | import cn.edu.nju.dao.{CourseClickCountDAO, CourseSearchClickCountDAO} 4 | import cn.edu.nju.domain.{ClickLog, CourseClickCount, CourseSearchClickCount} 5 | import cn.edu.nju.utils.DateUtils 6 | import org.apache.kafka.clients.consumer.ConsumerConfig 7 | import org.apache.kafka.common.serialization.StringDeserializer 8 | import org.apache.spark.SparkConf 9 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 10 | import org.apache.spark.streaming.{Seconds, StreamingContext} 11 | 12 | import scala.collection.mutable.ListBuffer 13 | 14 | /** 15 | * Created by thpffcj on 2019/10/17. 
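 *
 * Course-log example kept under the test package: consumes web access logs
 * from Kafka, extracts the course id and referer host, and writes per-day
 * click counts and per-search-engine click counts to HBase through
 * CourseClickCountDAO and CourseSearchClickCountDAO.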
16 | */ 17 | object StatStreaming { 18 | 19 | def main(args: Array[String]): Unit = { 20 | 21 | val sparkConf = new SparkConf().setAppName("StatStreaming") //.setMaster("local[5]") 22 | val ssc = new StreamingContext(sparkConf, Seconds(60)) 23 | 24 | val bootstrapServers = "thpffcj1:9092" 25 | val groupId = "test" 26 | val topicName = "test" 27 | val maxPoll = 20000 28 | 29 | val kafkaParams = Map( 30 | ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> bootstrapServers, 31 | ConsumerConfig.GROUP_ID_CONFIG -> groupId, 32 | ConsumerConfig.MAX_POLL_RECORDS_CONFIG -> maxPoll.toString, 33 | ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer], 34 | ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer] 35 | ) 36 | 37 | val messages = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent, 38 | ConsumerStrategies.Subscribe[String, String](Set(topicName), kafkaParams)) 39 | 40 | // 测试步骤一:测试数据接收 41 | // messages.map(_._2).count().print 42 | 43 | // 测试步骤二:数据清洗 44 | val logs = messages.map(_.value()) 45 | val cleanData = logs.map(line => { 46 | val infos = line.split("\t") 47 | 48 | // infos(2) = "GET /class/130.html HTTP/1.1" 49 | // url = /class/130.html 50 | val url = infos(2).split(" ")(1) 51 | var courseId = 0 52 | 53 | // 把实战课程的课程编号拿到了 54 | if (url.startsWith("/class")) { 55 | val courseIdHTML = url.split("/")(2) 56 | courseId = courseIdHTML.substring(0, courseIdHTML.lastIndexOf(".")).toInt 57 | } 58 | 59 | ClickLog(infos(0), DateUtils.parseToMinute(infos(1)), courseId, infos(3).toInt, infos(4)) 60 | }).filter(clicklog => clicklog.courseId != 0) 61 | 62 | // cleanData.print() 63 | 64 | // 测试步骤三:统计今天到现在为止实战课程的访问量 65 | 66 | cleanData.map(x => { 67 | 68 | // HBase rowkey设计: 20171111_88 69 | (x.time.substring(0, 8) + "_" + x.courseId, 1) 70 | }).reduceByKey(_ + _).foreachRDD(rdd => { 71 | rdd.foreachPartition(partitionRecords => { 72 | val list = new ListBuffer[CourseClickCount] 73 | 74 | partitionRecords.foreach(pair => { 75 | list.append(CourseClickCount(pair._1, pair._2)) 76 | }) 77 | 78 | CourseClickCountDAO.save(list) 79 | }) 80 | }) 81 | 82 | // 测试步骤四:统计从搜索引擎过来的今天到现在为止实战课程的访问量 83 | 84 | cleanData.map(x => { 85 | 86 | /** 87 | * https://www.sogou.com/web?query=Spark SQL实战 88 | */ 89 | val referrer = x.referrer.replaceAll("//", "/") 90 | val splits = referrer.split("/") 91 | var host = "" 92 | if(splits.length > 2) { 93 | host = splits(1) 94 | } 95 | 96 | (host, x.courseId, x.time) 97 | }).filter(_._1 != "").map(x => { 98 | (x._3.substring(0,8) + "_" + x._1 + "_" + x._2 , 1) 99 | }).reduceByKey(_ + _).foreachRDD(rdd => { 100 | rdd.foreachPartition(partitionRecords => { 101 | val list = new ListBuffer[CourseSearchClickCount] 102 | 103 | partitionRecords.foreach(pair => { 104 | list.append(CourseSearchClickCount(pair._1, pair._2)) 105 | }) 106 | 107 | CourseSearchClickCountDAO.save(list) 108 | }) 109 | }) 110 | 111 | ssc.start() 112 | ssc.awaitTermination() 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/test/TransformTest.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.test 2 | 3 | import cn.edu.nju.domain.GameDetail 4 | import com.google.gson.Gson 5 | 6 | /** 7 | * Created by thpffcj on 2019/10/25. 
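 *
 * Ad-hoc check of jsonToGameDetail's error handling for an empty input string.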
8 | */ 9 | object TransformTest { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | jsonToGameDetail("") 14 | } 15 | 16 | def jsonToGameDetail(jsonStr: String): GameDetail = { 17 | try { 18 | val gson = new Gson() 19 | gson.fromJson(jsonStr, classOf[GameDetail]) 20 | } catch { 21 | case e: Exception => { 22 | e.printStackTrace() 23 | null 24 | } 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/utils/DateUtils.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.utils 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util.Date 5 | 6 | import org.apache.commons.lang3.time.FastDateFormat 7 | 8 | import scala.collection.mutable.ListBuffer 9 | 10 | /** 11 | * Created by thpffcj on 2019/10/17. 12 | */ 13 | object DateUtils { 14 | 15 | val YYYYMMDDHHMMSS_FORMAT = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss") 16 | val TARGE_FORMAT = FastDateFormat.getInstance("yyyyMMddHHmmss") 17 | 18 | 19 | def getTime(time: String) = { 20 | YYYYMMDDHHMMSS_FORMAT.parse(time).getTime 21 | } 22 | 23 | def parseToMinute(time :String) = { 24 | TARGE_FORMAT.format(new Date(getTime(time))) 25 | } 26 | 27 | def getSteamDates(): ListBuffer[String] = { 28 | val dates = new ListBuffer[String] 29 | dates.append("2017-01-01 08:00:00") 30 | dates.append("2017-02-01 08:00:00") 31 | dates.append("2017-03-01 08:00:00") 32 | dates.append("2017-04-01 08:00:00") 33 | dates.append("2017-05-01 08:00:00") 34 | dates.append("2017-06-01 08:00:00") 35 | dates.append("2017-07-01 08:00:00") 36 | dates.append("2017-08-01 08:00:00") 37 | dates.append("2017-09-01 08:00:00") 38 | dates.append("2017-10-01 08:00:00") 39 | dates.append("2017-11-01 08:00:00") 40 | dates.append("2017-12-01 08:00:00") 41 | dates.append("2018-01-01 08:00:00") 42 | dates.append("2018-02-01 08:00:00") 43 | dates.append("2018-03-01 08:00:00") 44 | dates.append("2018-04-01 08:00:00") 45 | dates.append("2018-05-01 08:00:00") 46 | dates.append("2018-06-01 08:00:00") 47 | dates.append("2018-07-01 08:00:00") 48 | dates.append("2018-08-01 08:00:00") 49 | dates.append("2018-09-01 08:00:00") 50 | dates.append("2018-10-01 08:00:00") 51 | dates.append("2018-11-01 08:00:00") 52 | dates.append("2018-12-01 08:00:00") 53 | dates.append("2019-01-01 08:00:00") 54 | dates.append("2019-02-01 08:00:00") 55 | dates.append("2019-03-01 08:00:00") 56 | dates.append("2019-04-01 08:00:00") 57 | dates.append("2019-05-01 08:00:00") 58 | dates.append("2019-06-01 08:00:00") 59 | dates.append("2019-07-01 08:00:00") 60 | dates.append("2019-08-01 08:00:00") 61 | dates.append("2019-09-01 08:00:00") 62 | dates.append("2019-10-01 08:00:00") 63 | 64 | dates 65 | } 66 | 67 | def tranTimestampToString(tm: Int): String={ 68 | val fm = new SimpleDateFormat("yyyy/MM") 69 | val tim = fm.format(new Date(tm.toLong * 1000)) 70 | tim 71 | } 72 | 73 | def main(args: Array[String]): Unit = { 74 | 75 | println(tranTimestampToString(1569888000)) 76 | } 77 | } -------------------------------------------------------------------------------- /spark-streaming/src/main/scala/cn/edu/nju/utils/MySQLUtils.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.utils 2 | 3 | import java.sql.{Connection, DriverManager, PreparedStatement} 4 | 5 | /** 6 | * Created by thpffcj on 2019/10/25. 
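 *
 * JDBC helper used by the DAOs: opens connections to the local steam database
 * (credentials are hard-coded here and differ from the 172.19.240.128 instance
 * used by the Spark SQL readers) and releases statement/connection pairs.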
7 | */ 8 | object MySQLUtils { 9 | 10 | /** 11 | * 获取MySQL的连接 12 | */ 13 | def getConnection() = { 14 | Class.forName("com.mysql.jdbc.Driver") 15 | DriverManager.getConnection("jdbc:mysql://localhost:3306/steam?useUnicode=true&characterEncoding=utf-8", "root", "000000") 16 | } 17 | 18 | /** 19 | * 释放数据库连接等资源 20 | * @param connection 21 | * @param pstmt 22 | */ 23 | def release(connection: Connection, pstmt: PreparedStatement): Unit = { 24 | try { 25 | if (pstmt != null) { 26 | pstmt.close() 27 | } 28 | } catch { 29 | case e: Exception => e.printStackTrace() 30 | } finally { 31 | if (connection != null) { 32 | connection.close() 33 | } 34 | } 35 | } 36 | 37 | def main(args: Array[String]) { 38 | println(getConnection()) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /spider/spark-graphx/steam-reviews-official.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import pymongo 4 | import re 5 | import threading 6 | from multiprocessing import JoinableQueue 7 | import time 8 | 9 | #爬取steam所有评论,存入mongodb中 10 | def write(item): 11 | try: 12 | if isinstance(item,list): 13 | collection.insert_many(item) 14 | else: 15 | collection.insert_one(item) 16 | except Exception as e: 17 | print(e) 18 | return True 19 | 20 | def getAllApps(): 21 | try: 22 | apps = [] 23 | for g in regions.game_id.find().skip(0).limit(1400): 24 | apps.append({"id":g["id"],"name":g["name"]}) 25 | return apps 26 | except: 27 | print(e) 28 | 29 | def fetchReview(url,params,headers,app): 30 | try: 31 | res = session.get(url,params=params,headers=headers,timeout=30,verify=False) 32 | if res.status_code != requests.codes.ok:#请求被拒绝打印出状态码,此页爬取失败 33 | if res.status_code != requests.codes.forbidden and res.status_code != requests.codes.bad_gateway: #403、502不打印 34 | print(res.status_code,":",url) 35 | return None 36 | except Exception as e: #网络有问题访问失败 37 | print(e) 38 | return None 39 | 40 | result = res.json() 41 | # print(res.url) 42 | # print(result) 43 | reviews = result["reviews"] 44 | if not reviews:#该游戏没有更多评论了 45 | print(result) 46 | return None 47 | cursor = result["cursor"] 48 | if not cursor: 49 | print(result) 50 | for review in reviews: 51 | review["game"] = app 52 | write(reviews) 53 | # print(url) 54 | # print(reviews) 55 | # print() 56 | return cursor 57 | 58 | def fetch(apps): 59 | for app in apps: 60 | #建立会话,Cookie设置语言为简体中文,出生日期为1987.1.1(允许访问成人内容) 61 | headers = { 62 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36', 63 | 'Cookie':'Steam_Language=schinese; birthtime=533750401; timezoneOffset=28800,0;', 64 | } 65 | cursor = "*" 66 | reviewsCount = 0 67 | while cursor: 68 | # print("cursor:",cursor) 69 | # url = "https://store.steampowered.com/appreviews/"+app["id"]+"?json=1&filter=recent&language=schinese&day_range=360"+ \ 70 | # "&cursor="+cursor+"&review_type=all&purchase_type=all&num_per_page=100" 71 | url = "https://store.steampowered.com/appreviews/"+app["id"] 72 | params = { 73 | "json":1, 74 | "filter":"recent", #all,recent,updated 75 | "language":"schinese", #all,schinese,zh-CN 76 | "day_range":"360", 77 | "cursor":cursor, 78 | "review_type":"all", 79 | "purchase_type":"all", 80 | "num_per_page":100, 81 | } 82 | cursor = fetchReview(url,params,headers,app) 83 | reviewsCount = reviewsCount+100 84 | if reviewsCount>=10000: 85 | break 86 | print(url,reviewsCount) 87 | 88 | #mongodb连接 89 | 
client = pymongo.MongoClient('mongodb://steam:steam@***.***.***.***:27017/steam_db') 90 | db = client.steam_db 91 | regions = db.China 92 | collection = regions.reviews_official 93 | 94 | requests.packages.urllib3.disable_warnings() 95 | session = requests.session() 96 | 97 | appInfos = getAllApps() 98 | # print(appInfos) 99 | numOfThreads = 1 100 | badPages = fetch(appInfos) 101 | print("all finished") 102 | 103 | # https://store.steampowered.com/appreviews/243470?json=1&filter=all&language=all&day_range=360&cursor=*&review_type=all&purchase_type=all&num_per_page=10 -------------------------------------------------------------------------------- /spider/spark-graphx/steam-reviews.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import pymongo 4 | import re 5 | import threading 6 | from multiprocessing import JoinableQueue 7 | import time 8 | 9 | # 3 1 1400 678960,1122050,1100620,730,1041320 10 | #爬取中国区steam所有评论,存入mongodb中 11 | def write(item): 12 | try: 13 | if isinstance(item,list): 14 | # firstUser = regions.first_review_user.find({"game":item[0]["game"]["id"]})[0] 15 | # if firstUser["user"]==item[0]["user"]["name"]: #重复评论 16 | # return False 17 | collection.insert_many(item) 18 | else: 19 | collection.insert_one(item) 20 | except Exception as e: 21 | print(e) 22 | return True 23 | 24 | def getAllApps(): 25 | try: 26 | apps = [] 27 | for g in regions.game_id.find().skip(0).limit(1400): 28 | apps.append({"id":g["id"],"name":g["name"],"firstUser":None}) 29 | return apps 30 | except: 31 | print(e) 32 | 33 | def fetchReview(url,headers,app): 34 | try: 35 | res = session.get(url,headers=headers,timeout=30,verify=False) 36 | if res.status_code != requests.codes.ok:#请求被拒绝打印出状态码,此页爬取失败 37 | if res.status_code != requests.codes.forbidden and res.status_code != requests.codes.bad_gateway: #403、502不打印 38 | print(res.status_code,":",url) 39 | return None 40 | except Exception as e: #网络有问题访问失败 41 | print(e) 42 | return None 43 | 44 | if not res.text:#该游戏没有更多评论了 45 | return None 46 | 47 | try: 48 | soup = BeautifulSoup(res.text,'lxml') 49 | 50 | reviewGroup = [] 51 | for card in soup.find_all(class_="apphub_Card modalContentLink interactable"): 52 | userCard = card.find(class_="apphub_friend_block") 53 | if not userCard:# 没有用户的评论扔掉 54 | continue 55 | if not userCard.find(class_="apphub_CardContentAuthorName"):# 没有用户的评论扔掉 56 | continue 57 | if(len(userCard.find(class_="apphub_CardContentAuthorName").find_all("a"))!=1): 58 | print(userCard.find(class_="apphub_CardContentAuthorName")) 59 | name = userCard.find(class_="apphub_CardContentAuthorName").find("a").string 60 | name = name.strip() if name else "" 61 | product_owns = userCard.find(class_="apphub_CardContentMoreLink").string 62 | product_owns = product_owns.strip() if product_owns else "" 63 | user = { 64 | "name":name,# 可能为"" 65 | "product_owns": product_owns,# 可能为"" 66 | } 67 | comment_count = card.find(class_="apphub_CardCommentCount").string.strip() 68 | found_helpful = card.find(class_="found_helpful").contents 69 | helpful_num = found_helpful[0].strip() 70 | funny_num = found_helpful[-1].strip() if len(found_helpful)>1 else "" 71 | title = card.find(class_="reviewInfo").find(class_="title").string.strip() 72 | hours = card.find(class_="reviewInfo").find(class_="hours") 73 | hours = hours.string.strip() if hours else "" 74 | 75 | cardTextContent = card.find(class_="apphub_CardTextContent") 76 | date_posted = 
cardTextContent.find(class_="date_posted").string.strip() 77 | content = cardTextContent.contents[5:] if cardTextContent.find(class_="received_compensation") else cardTextContent.contents[2:] 78 | content = "".join(item.string if item.string else "
" for item in content).strip() 79 | 80 | review = { 81 | "game":app, 82 | "user":user, 83 | "comment_count":comment_count,#该评论回复数 84 | "helpful_num":helpful_num,#几人觉得这篇评测有价值 有的是一句话,有的是数字 85 | "funny_num":funny_num,#几人觉得这篇评测欢乐 86 | "title":title,#推荐/不推荐 87 | "hours":hours,#总时数 可能为"" 88 | "date_posted":date_posted,#发布于 89 | "content":content,#评论内容 90 | } 91 | reviewGroup.append(review) 92 | form = soup.find("form") 93 | nextUrl = form.attrs["action"]+"?" 94 | for arg in form.find_all("input"): 95 | nextUrl = nextUrl+arg.attrs["name"]+"="+arg.attrs["value"]+"&" 96 | nextUrl = nextUrl[:-1] 97 | # if app["firstUser"]==reviewGroup[0]["user"]["name"]: 98 | # return None 99 | # write(reviewGroup) 100 | print(url) 101 | print(reviewGroup) 102 | print() 103 | # print("nextUrl",nextUrl) 104 | return nextUrl 105 | 106 | except Exception as e: #steam服务器响应不正确 107 | print("bad url:",url,e) 108 | return None 109 | 110 | class fetchThread(threading.Thread): 111 | def __init__(self, tQueue, app, threadNum): 112 | threading.Thread.__init__(self) 113 | self.tQueue = tQueue 114 | self.app = app 115 | self.threadNum = threadNum 116 | def run(self): 117 | id = self.app["id"] 118 | p = str(self.threadNum+1) 119 | userreviewsoffset = str((int(p)-1)*10) 120 | numperpage = "10" 121 | # url = "https://steamcommunity.com/app/"+id+"/homecontent/?userreviewsoffset="+userreviewsoffset+"&p="+p+ \ 122 | # "&workshopitemspage="+p+"&readytouseitemspage="+p+"&mtxitemspage="+p+"&itemspage="+p+"&screenshotspage="+p+ \ 123 | # "&videospage="+p+"&artpage="+p+"&allguidepage="+p+"&webguidepage="+p+"&integratedguidepage="+p+ \ 124 | # "&discussionspage="+p+"&numperpage="+numperpage+"&browsefilter=trendyear&browsefilter=trendyear&l=schinese"+ \ 125 | # "&appHubSubSection="+numperpage+"&filterLanguage=default&searchText=&forceanon=1" 126 | url = "https://steamcommunity.com/app/"+id+"/reviews/?p=1&browsefilter=trendyear" 127 | #建立会话,Cookie设置语言为简体中文,出生日期为1987.1.1(允许访问成人内容) 128 | headers = { 129 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36', 130 | 'accept':'text/javascript, text/html, application/xml, text/xml, */*', 131 | 'accept-encoding': 'gzip, deflate, br', 132 | 'accept-language': 'zh,zh-CN;q=0.9,zh-TW;q=0.8,en;q=0.7,en-GB;q=0.6,en-US;q=0.5', 133 | 'cache-control': 'no-cache', 134 | 'pragma': 'no-cache', 135 | 'Cookie':'Steam_Language=schinese; birthtime=533750401; timezoneOffset=28800,0; sessionid=04a0dcb8f1f8f31bed482819; recentlyVisitedAppHubs=816340%2C678960%2C242920%2C1122050; steamCountry=CN%7C72e4ed8aa9f1f07b0eeba82d9349680e; app_impressions=1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_|1122050@2_9_100010_', 136 | 'sec-fetch-mode': 'cors', 137 | 'sec-fetch-site': 'same-origin', 138 | 'x-prototype-version': '1.7', 139 | 'x-requested-with': 'XMLHttpRequest', 140 | 'referer': url 141 | } 142 | # print(url) 143 | reviewCount = 0 144 | while url: 145 | # print("threadNum:"+str(self.threadNum)+" offset:"+userreviewsoffset) 146 | nextUrl = fetchReview(url,headers,self.app) 147 | if not nextUrl: 148 | nextUrl = fetchReview(url,headers,self.app) 149 | if not nextUrl: #2次失败认为这个游戏评论已爬完 150 | break 151 | url = nextUrl 152 | # if not self.app["firstUser"]: 153 | # self.app["firstUser"]=reviewGroup[0]["user"]["name"] 154 | reviewCount = reviewCount+10 155 | if 
int(reviewCount)>5000:#超过5K条评论后面的就不爬了(以魂3评论量为基准) 156 | break 157 | # p = str(int(p)+self.tQueue.numOfThreads*1) 158 | # userreviewsoffset = str((int(p)-1)*10) 159 | # url = "https://steamcommunity.com/app/"+id+"/homecontent/?userreviewsoffset="+userreviewsoffset+"&p="+p+ \ 160 | # "&workshopitemspage="+p+"&readytouseitemspage="+p+"&mtxitemspage="+p+"&itemspage="+p+"&screenshotspage="+p+ \ 161 | # "&videospage="+p+"&artpage="+p+"&allguidepage="+p+"&webguidepage="+p+"&integratedguidepage="+p+ \ 162 | # "&discussionspage="+p+"&numperpage="+numperpage+"&browsefilter=toprated&browsefilter=toprated&l=schinese"+ \ 163 | # "&appHubSubSection="+numperpage+"&filterLanguage=default&searchText=&forceanon=1" 164 | # trendyear toprated trendweek trendday mostrecent 165 | # print("nextUrl",url) 166 | # time.sleep(2) 167 | # break 168 | 169 | class threadQueue: 170 | def __init__(self, numOfThreads, app): 171 | self.numOfThreads = numOfThreads 172 | self.app = app 173 | self.threads = [] 174 | self.badItems = [] 175 | 176 | for i in range(0,numOfThreads): 177 | # 创建线程爬取详情页面 178 | thread = fetchThread(self,app,i) 179 | thread.start() 180 | self.threads.append(thread) 181 | # def addBadItem(self,info): 182 | # self.badItems.append(info) 183 | def waitForStop(self): 184 | #等待当前页的线程爬取完后再开始爬下一页 185 | for t in self.threads: 186 | t.join() 187 | if self.badItems: 188 | print("badItems ",self.badItems) 189 | 190 | def fetch(apps): 191 | for app in apps: 192 | queue = threadQueue(numOfThreads,app) 193 | queue.waitForStop() 194 | print(app["id"],"finished") 195 | badItems = queue.badItems 196 | 197 | #错页重爬 198 | for app in badItems: 199 | queue = threadQueue(numOfThreads,app) 200 | queue.waitForStop() 201 | return queue.badItems 202 | 203 | #mongodb连接 204 | client = pymongo.MongoClient('mongodb://steam:steam@***.***.***.***:27017/steam_db') 205 | db = client.steam_db 206 | regions = db.China 207 | collection = regions.reviews 208 | 209 | requests.packages.urllib3.disable_warnings() 210 | session = requests.session() 211 | 212 | appInfos = getAllApps() 213 | # print(appInfos) 214 | numOfThreads = 1 215 | badPages = fetch(appInfos) 216 | print("all finished") 217 | 218 | # http://store.steampowered.com/appreviews/243470?json=1&filter=all&language=all&day_range=360&cursor=*&review_type=all&purchase_type=all&num_per_page=10 -------------------------------------------------------------------------------- /spider/spark-streaming/steam-games-multithread-queue.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import pymongo 4 | import re 5 | import threading 6 | from multiprocessing import JoinableQueue 7 | 8 | #爬取中国区steam所有产品,存入mongodb中 9 | def write(item): 10 | try: 11 | collection.insert_one(item) 12 | except: 13 | print(e) 14 | 15 | def fetchReviewsChart(appID): 16 | url = "https://store.steampowered.com/appreviewhistogram/"+appID+"?l=schinese&review_score_preference=0" 17 | try: 18 | res = session.get(url,headers=headers,timeout=30) 19 | if res.status_code != requests.codes.ok:#请求被拒绝打印出状态码,此页爬取失败 20 | print(res.status_code,":",url) 21 | return None 22 | chart = res.json() 23 | #chart.results.rollup_type: 取值有"week"、"month",指的是chart.results.rollups中每个date的时间跨度 24 | #chart.results.recent 中每个date的时间跨度是1天 25 | #date: 时间 26 | #recommendations_down: 差评数 27 | #recommendations_up: 好评数 28 | return chart["results"] if chart["success"]==1 else None 29 | except: #网络有问题访问失败 30 | print(url) 31 | return None 32 | 33 | def fetchGameInfo(url): 
34 | try: 35 | res = session.get(url,headers=headers,timeout=30) 36 | if res.status_code != requests.codes.ok:#请求被拒绝打印出状态码,此页爬取失败 37 | print(res.status_code,":",url) 38 | return None 39 | except: #网络有问题访问失败 40 | print(url) 41 | return None 42 | 43 | try: 44 | soup = BeautifulSoup(res.text,'lxml') 45 | 46 | #社区的URL 47 | communityUrl = soup.find(class_="apphub_OtherSiteInfo").find("a").attrs["href"] 48 | appID = communityUrl.split("/")[-1] 49 | 50 | #右上角的概览 51 | user_reviews = soup.find(class_="user_reviews") 52 | user_reviews_json = {} 53 | for item in user_reviews.find_all("div",class_="subtitle column"): 54 | user_reviews_json[item.string.strip()] = re.sub('\r|\n|\t', '', item.parent.find_all("div")[1].get_text().strip()) 55 | 56 | #用户自定义标签 57 | user_tags = soup.find(class_="glance_tags popular_tags") 58 | if user_tags: 59 | user_tags = [item.string.strip() for item in user_tags.find_all("a")] 60 | else: 61 | user_tags = [] 62 | 63 | #该游戏支持的活动 64 | support_tags = soup.find_all(class_="game_area_details_specs") 65 | support_tags = [item.find(class_="name").get_text().strip() for item in support_tags] 66 | 67 | #爬取评论量图表 68 | reviewsChart = fetchReviewsChart(appID) 69 | if not reviewsChart: #失败重爬一次 70 | reviewsChart = fetchReviewsChart(appID) 71 | reviewsChart = reviewsChart if reviewsChart else "" 72 | 73 | #该页面的所有信息 74 | game_detail = { 75 | "user_reviews":user_reviews_json, 76 | "user_tags":user_tags, 77 | "support_tags":support_tags, 78 | "reviewsChart":reviewsChart, 79 | } 80 | # print(game_detail) 81 | return game_detail 82 | 83 | except: #steam服务器响应不正确 84 | print("bad url:",url) 85 | return None 86 | 87 | class fetchThread(threading.Thread): 88 | def __init__(self, tqueue): 89 | threading.Thread.__init__(self) 90 | self.tqueue = tqueue 91 | def run(self): 92 | while True: 93 | info = self.tqueue.tasks.get() 94 | href = info["href"] 95 | if href.startswith("https://store.steampowered.com/bundle/") or href.startswith("https://store.steampowered.com/sub/"): 96 | game_detail = "bundle" #捆绑包不爬详情页 97 | else: 98 | game_detail = fetchGameInfo(href) 99 | if not game_detail: #失败重爬一次 100 | game_detail = fetchGameInfo(href) 101 | game_detail = game_detail if game_detail else "" 102 | info["game_detail"] = game_detail 103 | 104 | # print(info) 105 | write(info) 106 | if game_detail=="": 107 | self.tqueue.addBadItem(info) 108 | self.tqueue.finishOne() 109 | self.tqueue.tasks.task_done() #已经处理完从队列中拿走的一个项目 110 | 111 | class threadQueue: 112 | def __init__(self, numOfThreads): 113 | self.numOfThreads = numOfThreads 114 | self.threads = [] 115 | self.tasks = JoinableQueue()#实例一个队列 116 | self.tasksNum = 0 117 | self.badItems = [] 118 | 119 | for i in range(1,numOfThreads): 120 | # 创建线程爬取详情页面 121 | thread = fetchThread(self) 122 | thread.start() 123 | self.threads.append(thread) 124 | def add(self,info): 125 | self.tasks.put(info) 126 | def finishOne(self): 127 | threadLock = threading.Lock() 128 | threadLock.acquire() 129 | self.tasksNum=self.tasksNum+1 130 | if self.tasksNum%25==0: 131 | print(self.tasksNum,"/",(totalPage-1)*25,"finished") 132 | threadLock.release() 133 | def addBadItem(self,info): 134 | self.badItems.append(info) 135 | def waitForStop(self): 136 | self.tasks.join()#等,直到消费者把自己放入队列中的所有项目都取走处理完后调用task_done()之后 137 | if self.badItems: 138 | print("badItems ",self.badItems) 139 | 140 | def fetch(pageRange): 141 | badPages = [] 142 | page =1 #每页一个request 143 | for page in pageRange: 144 | try: #网络有问题访问失败,保存失败的请求然后跳过 145 | url = "https://store.steampowered.com/search/?page=" + str(page) 146 
147 |         except:
148 |             badPages.append(page)
149 |             continue
150 | 
151 |         if res.status_code != requests.codes.ok:  # request rejected: print the status code and skip the page
152 |             print("page",page,":",res.status_code)
153 |             badPages.append(page)
154 |             continue
155 | 
156 |         try:  # an exception showed up here once and never reappeared; probably a Steam server issue
157 |             soup = BeautifulSoup(res.text,'lxml')
158 |             contents = soup.find(id="search_resultsRows").find_all('a')
159 |         except:
160 |             print("bad page:",page)
161 |             badPages.append(page)
162 |             continue
163 | 
164 |         for content in contents:
165 |             try:
166 |                 name = content.find(class_="title").get_text().strip()
167 |                 date = content.find("div",class_="col search_released responsive_secondrow").string
168 |                 date = date.strip() if date else ""  # unreleased games have no release date
169 |                 priceDiv = content.find("div",class_="col search_price discounted responsive_secondrow")
170 |                 if priceDiv:  # discounted game
171 |                     original_price = priceDiv.find("strike").string.strip()
172 |                     price = priceDiv.contents[-1].strip()
173 |                 else:  # full-price game
174 |                     original_price = content.find("div",class_="col search_price responsive_secondrow").string.strip()
175 |                     price = original_price
176 |                 img_src = content.find("div",class_="col search_capsule").find('img').get("src")
177 |                 href = content.get("href")
178 |                 review_summary = content.find("span",class_="search_review_summary")
179 |                 review_summary = review_summary.attrs['data-tooltip-html'].strip() if review_summary else ""  # unreleased games have no review summary
180 |                 result = {
181 |                     "page":page,
182 |                     "name":name,
183 |                     "href":href,
184 |                     "date":date,
185 |                     "original_price":original_price,
186 |                     "price":price,
187 |                     "img_src":img_src,
188 |                     "review_summary":review_summary,
189 |                 }
190 |                 queue.add(result)
191 |             except:
192 |                 print(content)
193 |     queue.waitForStop()
194 |     return badPages
195 | 
196 | # MongoDB connection
197 | client = pymongo.MongoClient('mongodb://steam:steam@***.***.***.***:27017/steam_db')
198 | db = client.steam_db
199 | regions = db.China
200 | collection = regions.games
201 | 
202 | # create a session; the Cookie sets the language to Simplified Chinese and a birth date of 1987-01-01 (grants access to age-restricted content)
203 | headers = {
204 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
205 |     'Cookie':'Steam_Language=schinese; birthtime=533750401'
206 | }
207 | session = requests.session()
208 | 
209 | queue = threadQueue(100)
210 | 
211 | totalPage = 2  # there are about 2608 pages in total at the moment
212 | badPages = fetch(range(1, totalPage))
213 | if badPages:  # re-crawl failed pages
214 |     badPages = fetch(badPages)
215 | print("all finished")
216 | if badPages:
217 |     print("badPages:",badPages)
218 | 
-------------------------------------------------------------------------------- /spider/spark-streaming/steam-hotN.py: --------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import pymongo
4 | 
5 | # crawl Steam's current top-sellers chart (China region) and store it in MongoDB
6 | def write(item):
7 |     try:
8 |         collection.insert_one(item)
9 |     except Exception as e:
10 |         print(e)
11 | 
12 | def fetch(pageRange):
13 |     badPages = []
14 |     badItems = []
15 |     page = 1  # one request per page
16 |     for page in pageRange:
17 |         try:  # on network failure, record the failed page and skip it
18 |             url = "https://store.steampowered.com/search/?filter=globaltopsellers&page=" + str(page) + "&os=win"
19 |             # the Cookie sets the language to Simplified Chinese
20 |             headers = {
21 |                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
22 |                 'Cookie':'Steam_Language=schinese'
23 |             }
24 |             s = requests.session()
25 |             res = s.get(url,headers=headers)
26 |         except:
27 |             badPages.append(page)
28 |             continue
29 | 
30 |         if res.status_code != requests.codes.ok:  # request rejected: print the status code and skip the page
31 |             print("page",page,":",res.status_code)
32 |             badPages.append(page)
33 |             continue
34 | 
35 |         try:  # an exception showed up here once and never reappeared; probably a Steam server issue
36 |             soup = BeautifulSoup(res.text,'lxml')
37 |             contents = soup.find(id="search_resultsRows").find_all('a')
38 |         except:
39 |             print(soup)
40 |             badPages.append(page)
41 |             continue  # skip this page so stale results are not reused
42 |         for content in contents:
43 |             try:
44 |                 name = content.find(class_="title").get_text().strip()
45 |                 date = content.find("div",class_="col search_released responsive_secondrow").string
46 |                 date = date.strip() if date else ""  # unreleased games have no release date
47 |                 priceDiv = content.find("div",class_="col search_price discounted responsive_secondrow")
48 |                 if priceDiv:  # discounted game
49 |                     original_price = priceDiv.find("strike").string.strip()
50 |                     price = priceDiv.contents[-1].strip()
51 |                 else:  # full-price game
52 |                     original_price = content.find("div",class_="col search_price responsive_secondrow").string.strip()
53 |                     price = original_price
54 |                 img_src = content.find("div",class_="col search_capsule").find('img').get("src")
55 |                 href = content.get("href")
56 |                 review_summary = content.find("span",class_="search_review_summary")
57 |                 review_summary = review_summary.attrs['data-tooltip-html'].strip() if review_summary else ""  # unreleased games have no review summary
58 |                 result = {
59 |                     "page":page,
60 |                     "name":name,
61 |                     "href":href,
62 |                     "date":date,
63 |                     "original_price":original_price,
64 |                     "price":price,
65 |                     "img_src":img_src,
66 |                     "review_summary":review_summary,
67 |                 }
68 |                 # print(result)
69 |                 write(result)
70 |             except:
71 |                 print(content)
72 |                 badItems.append(content)
73 |         if page%10==0:
74 |             print(page,"/",totalPage,"finished")  # print progress every 10 pages
75 |     print("badItems:",badItems)
76 |     return badPages
77 | 
78 | client = pymongo.MongoClient('mongodb://steam:steam@***.***.***.***:27017/steam_db')
79 | db = client.steam_db
80 | regions = db.China
81 | collection = regions.hot
82 | 
83 | totalPage = 593  # there are 593 pages at the moment
84 | badPages = fetch(range(1, totalPage))
85 | if badPages:  # re-crawl failed pages
86 |     badPages = fetch(badPages)
87 | print("all finished")
88 | if badPages:
89 |     print("badPages:",badPages)
90 | 
--------------------------------------------------------------------------------