├── .settings
│   └── org.eclipse.m2e.core.prefs
├── 01.png
├── 02.png
├── 03.png
├── 04.png
├── 05.png
├── 06.png
├── 07.png
├── 08.png
├── 09.png
├── README.md
├── Recommendation
│   ├── .settings
│   │   └── org.eclipse.m2e.core.prefs
│   ├── ContentRecommendation
│   │   ├── .settings
│   │   │   ├── org.eclipse.jdt.core.prefs
│   │   │   └── org.eclipse.m2e.core.prefs
│   │   ├── pom.xml
│   │   └── src
│   │       └── main
│   │           ├── resources
│   │           │   └── log4j.properties
│   │           └── scala
│   │               └── com
│   │                   └── z
│   │                       └── content
│   │                           └── ContentRecommender.scala
│   ├── ItemCFRecommendation
│   │   ├── pom.xml
│   │   └── src
│   │       └── main
│   │           ├── resources
│   │           │   └── log4j.properties
│   │           └── scala
│   │               └── com
│   │                   └── z
│   │                       └── itemcf
│   │                           └── ItemCFRecommender.scala
│   ├── KafkaStream
│   │   ├── .settings
│   │   │   ├── org.eclipse.core.resources.prefs
│   │   │   ├── org.eclipse.jdt.core.prefs
│   │   │   └── org.eclipse.m2e.core.prefs
│   │   ├── pom.xml
│   │   └── src
│   │       └── main
│   │           ├── java
│   │           │   └── com
│   │           │       └── z
│   │           │           └── kafkastream
│   │           │               ├── Application.java
│   │           │               ├── LogProcessor.java
│   │           │               └── MyEventTimeExtractor.java
│   │           └── resources
│   │               └── log4j.properties
│   ├── OfflineRecommendation
│   │   ├── .settings
│   │   │   ├── org.eclipse.jdt.core.prefs
│   │   │   └── org.eclipse.m2e.core.prefs
│   │   ├── pom.xml
│   │   └── src
│   │       └── main
│   │           ├── resources
│   │           │   └── log4j.properties
│   │           └── scala
│   │               └── com
│   │                   └── z
│   │                       └── offline
│   │                           ├── ALSTrainer.scala
│   │                           └── OfflineRecommender.scala
│   ├── StatisticsRecommendation
│   │   ├── .settings
│   │   │   ├── org.eclipse.jdt.core.prefs
│   │   │   └── org.eclipse.m2e.core.prefs
│   │   ├── pom.xml
│   │   └── src
│   │       └── main
│   │           ├── resources
│   │           │   └── log4j.properties
│   │           └── scala
│   │               └── com
│   │                   └── z
│   │                       └── statistics
│   │                           └── StatisticsRecommender.scala
│   ├── StreamingRecommendation
│   │   ├── .settings
│   │   │   ├── org.eclipse.jdt.core.prefs
│   │   │   └── org.eclipse.m2e.core.prefs
│   │   ├── pom.xml
│   │   └── src
│   │       └── main
│   │           ├── resources
│   │           │   └── log4j.properties
│   │           └── scala
│   │               └── com
│   │                   └── z
│   │                       └── streaming
│   │                           └── StreamingRecommender.scala
│   └── pom.xml
├── pom.xml
├── readme.docx
└── ~$readme.docx
/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
activeProfiles=
eclipse.preferences.version=1
resolveWorkspaceProjects=true
version=1
--------------------------------------------------------------------------------
/01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wolf-song-ml/RecommendationSystem/6a459d6472131d697d6cf450a249caa3d6a465e4/01.png
--------------------------------------------------------------------------------
/02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wolf-song-ml/RecommendationSystem/6a459d6472131d697d6cf450a249caa3d6a465e4/02.png
--------------------------------------------------------------------------------
/03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wolf-song-ml/RecommendationSystem/6a459d6472131d697d6cf450a249caa3d6a465e4/03.png
--------------------------------------------------------------------------------
/04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wolf-song-ml/RecommendationSystem/6a459d6472131d697d6cf450a249caa3d6a465e4/04.png
--------------------------------------------------------------------------------
/05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wolf-song-ml/RecommendationSystem/6a459d6472131d697d6cf450a249caa3d6a465e4/05.png
--------------------------------------------------------------------------------
/06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wolf-song-ml/RecommendationSystem/6a459d6472131d697d6cf450a249caa3d6a465e4/06.png
--------------------------------------------------------------------------------
/07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wolf-song-ml/RecommendationSystem/6a459d6472131d697d6cf450a249caa3d6a465e4/07.png
--------------------------------------------------------------------------------
/08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wolf-song-ml/RecommendationSystem/6a459d6472131d697d6cf450a249caa3d6a465e4/08.png
--------------------------------------------------------------------------------
/09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wolf-song-ml/RecommendationSystem/6a459d6472131d697d6cf450a249caa3d6a465e4/09.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
***Source code: https://github.com/wolf-song-ml/RecommendationSystem***
## Practice

## 1 Project Technical Architecture


## 2 Key Technologies in the Project

- Redis: stores each user's most recent ratings as a queue
- MongoDB: storage behind the BI visualization queries
- Elasticsearch: fuzzy keyword search over text, exact category matching, and the More Like This API for content-based recommendation
- Flume: real-time collection of rating data
- Kafka: message channel for the collected data; Kafka Streams: intermediate pipeline for forwarding messages
- Spark: Spark SQL, Spark Streaming, and Spark ML for statistics, loading data sources, and machine-learning models
- ScalaNLP (jblas): Java matrix computation

## Theory

## 1 Why Recommender Systems: Solving Information Overload

- The search-engine era

  Directory navigation: Yahoo
  Search: Google, Baidu

- The personalization era (higher user stickiness, more revenue)

  The system automatically recommends relevant content: Toutiao, Douban, e-commerce sites

## 2 Types of Recommender Systems

- Demographic-based recommendation
- Content-based recommendation

- Collaborative-filtering recommendation

## 3 Demographic-Based Recommendation

Demographic-based recommendation is the easiest mechanism to implement: it simply measures how related users are from their basic profile data, then recommends items that similar users liked to the current user.


## 4 Content-Based Recommendation

## 4.1 Definition

Content-based recommendation was the most widely used mechanism when recommendation engines first appeared. Its core idea is to use the metadata of items or content to discover how items are related, and then, based on the user's past preferences, recommend similar items to the user.

## 4.2 Algorithm Workflow

- Feature extraction for items: tags
- Feature extraction for text: keywords
- Build the tokenized feature-vector matrix
- Compute similarity, usually cosine similarity (formula below)


## 4.3 Core Code

## 4.3.1 Spark TF-IDF

```scala
// Core part: use TF-IDF to extract movie feature vectors from the content information
// Create a tokenizer; by default it splits on whitespace
val tokenizer = new Tokenizer().setInputCol("genres").setOutputCol("words")

// Transform the raw data with the tokenizer, producing a new column "words"
val wordsData = tokenizer.transform(movieTagsDF)

// HashingTF maps a sequence of words to its term frequencies
val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(50)
val featurizedData = hashingTF.transform(wordsData)

// IDF fits an idf model
val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
// Train the idf model to get each word's inverse document frequency
val idfModel = idf.fit(featurizedData)
// Apply the model to get each word's tf-idf as the new feature vector
val rescaledData = idfModel.transform(featurizedData)

val movieRecs = movieFeatures.cartesian(movieFeatures)
  .filter {
    // Filter out pairs of a movie with itself
    case (a, b) => a._1 != b._1
  }
  .map {
    case (a, b) =>
      val simScore = this.consinSim(a._2, b._2)
      (a._1, (b._1, simScore))
  }
  .filter(_._2._2 > 0.6) // keep only pairs with similarity above 0.6
  .groupByKey()
  .map {
    case (mid, items) => MovieRecs(mid, items.toList.sortWith(_._2 > _._2).map(x => Recommendation(x._1, x._2)))
  }
  .toDF()
```
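A note on the design choice: `setNumFeatures(50)` hashes the genre words into a 50-dimensional vector space, which is plenty for the small genre vocabulary here; for free-text fields, a much larger dimension would be needed to keep hash collisions rare.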

## 4.3.2 Elasticsearch More Like This

```java
MoreLikeThisQueryBuilder query = QueryBuilders.moreLikeThisQuery(
        /*new String[]{"name", "descri", "genres", "actors", "directors", "tags"},*/
        new MoreLikeThisQueryBuilder.Item[]{new MoreLikeThisQueryBuilder.Item(Constant.ES_INDEX,
                Constant.ES_MOVIE_TYPE, String.valueOf(mid))});
```
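The builder above only constructs the query. As a hedged sketch of executing it (the `client` handle and the result size are assumptions not shown in this excerpt), with the Elasticsearch transport client:

```scala
// Assumption: `client` is an org.elasticsearch.client.transport.TransportClient
val response = client.prepareSearch(Constant.ES_INDEX)
  .setQuery(query) // the MoreLikeThisQueryBuilder built above
  .setSize(10)     // how many similar movies to return (assumed)
  .get()

// Ids of the movies Elasticsearch judges most similar
val similarMids = response.getHits.getHits.map(_.getId.toInt)
```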

## 5 Collaborative-Filtering Recommendation

## 5.1 User-Based Collaborative Filtering (UserCF)

*Compute the similarity between users and recommend what similar users like.*



## 5.2 Item-Based Collaborative Filtering (ItemCF, the focus here)

*Compute the similarity between items and recommend the most similar ones (unlike content-based recommendation, the similarity here comes from user behavior rather than item metadata).*



## 5.2.1 Core Algorithm: Co-occurrence Similarity


## 5.2.2 Core Algorithm Example

```scala
// ( mid1, (mid2, score) )
val simDF = cooccurrenceDF.map {
  row =>
    val coocSim = cooccurrenceSim(row.getAs[Long]("cocount"), row.getAs[Long]("count1"),
      row.getAs[Long]("count2"))
    (row.getInt(0), (row.getInt(1), coocSim))
}
  .rdd
  .groupByKey()
  .map {
    case (mid, recs) =>
      MoviesRecs(mid, recs.toList.sortWith(_._2 > _._2).take(MAX_RECOMMENDATION)
        .map(x => Recommendation(x._1, x._2)))
  }
  .toDF()
```

## 5.3 Latent-Factor-Model Recommendation

## 5.3.1 Idea
*Find latent factors that can link users and items.*

## 5.3.2 Algorithm Formulas



## 5.3.3 Core Algorithm Example

```scala
// Train the latent factor model: Rating(user: Int, product: Int, rating: Double)
val trainData = ratingRDD.map(x => Rating(x._1, x._2, x._3))
// Multiple assignment in one statement
val (rank, iterations, lambda) = (200, 5, 0.1)
val model = ALS.train(trainData, rank, iterations, lambda)

// Extract all uids and mids from the rating data and deduplicate
val userRDD = ratingRDD.map(_._1).distinct()
val movieRDD = ratingRDD.map(_._2).distinct()
val userMovies = userRDD.cartesian(movieRDD)

// Predict ratings with the model's predict method
val preRatings = model.predict(userMovies)

val userRecs = preRatings
  .filter(_.rating > 0)
  .map(rating => (rating.user, (rating.product, rating.rating))) // Rating -> (uid, (mid, score))
  .groupByKey()
  .map {
    case (uid, recs) => UserRecs(uid, recs.toList.sortWith(_._2 > _._2).take(USER_MAX_RECOMMENDATION).map(x => Recommendation(x._1, x._2)))
  }
  .toDF()
```

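The hyperparameters `(rank, iterations, lambda)` above are fixed by hand; the repository also contains an `ALSTrainer.scala` (see the project tree) whose job is model selection. As a hedged sketch only, with the `trainData`/`testData` split and the candidate grids as illustrative assumptions, selection by hold-out RMSE could look like:

```scala
import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
import org.apache.spark.rdd.RDD

// Root-mean-square error of a model's predictions against held-out ratings
def rmse(model: MatrixFactorizationModel, data: RDD[Rating]): Double = {
  val predictions = model.predict(data.map(r => (r.user, r.product)))
    .map(r => ((r.user, r.product), r.rating))
  val observed = data.map(r => ((r.user, r.product), r.rating))
  math.sqrt(
    observed.join(predictions)
      .map { case (_, (obs, pred)) => (obs - pred) * (obs - pred) }
      .mean()
  )
}

// Try a small grid of (rank, lambda) pairs and keep the best
val candidates = for (rank <- Array(50, 100, 200); lambda <- Array(0.01, 0.1, 1.0))
  yield (rank, lambda, rmse(ALS.train(trainData, rank, 5, lambda), testData))
val best = candidates.minBy(_._3)
```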
--------------------------------------------------------------------------------
/Recommendation/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
activeProfiles=
eclipse.preferences.version=1
resolveWorkspaceProjects=true
version=1
--------------------------------------------------------------------------------
/Recommendation/ContentRecommendation/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
org.eclipse.jdt.core.compiler.compliance=1.8
org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore
org.eclipse.jdt.core.compiler.release=disabled
org.eclipse.jdt.core.compiler.source=1.8
--------------------------------------------------------------------------------
/Recommendation/ContentRecommendation/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
activeProfiles=
eclipse.preferences.version=1
resolveWorkspaceProjects=true
version=1
--------------------------------------------------------------------------------
/Recommendation/ContentRecommendation/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>Recommendation</artifactId>
        <groupId>com.z</groupId>
        <version>1.0-SNAPSHOT</version>
    </parent>

    <modelVersion>4.0.0</modelVersion>
    <artifactId>ContentRecommendation</artifactId>

    <dependencies>
        <dependency>
            <groupId>org.scalanlp</groupId>
            <artifactId>jblas</artifactId>
            <version>${jblas.version}</version>
        </dependency>

        <!-- Spark -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.11</artifactId>
        </dependency>

        <!-- MongoDB -->
        <dependency>
            <groupId>org.mongodb</groupId>
            <artifactId>casbah-core_2.11</artifactId>
            <version>${casbah.version}</version>
        </dependency>
        <dependency>
            <groupId>org.mongodb.spark</groupId>
            <artifactId>mongo-spark-connector_2.11</artifactId>
            <version>${mongodb-spark.version}</version>
        </dependency>
    </dependencies>

    <build>
        <finalName>ContentRecommendation</finalName>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.z.content.ContentRecommender</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
--------------------------------------------------------------------------------
/Recommendation/ContentRecommendation/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
log4j.rootLogger=info, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %5p --- [%50t] %-80c(line:%5L) : %m%n
--------------------------------------------------------------------------------
/Recommendation/ContentRecommendation/src/main/scala/com/z/content/ContentRecommender.scala:
--------------------------------------------------------------------------------
package com.z.content

import org.apache.spark.SparkConf
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.sql.SparkSession
import org.jblas.DoubleMatrix

// The required data source is the movie content information
case class Movie(mid: Int, name: String, descri: String, timelong: String, issue: String, shoot: String, language: String,
                 genres: String, actors: String, directors: String)

case class MongoConfig(uri: String, db: String)

// A basic recommendation object
case class Recommendation(mid: Int, score: Double)

// Per-movie similarity list derived from the feature vectors extracted from movie content
case class MovieRecs(mid: Int, recs: Seq[Recommendation])

object ContentRecommender {

  // Collection names and constants
  val MONGODB_MOVIE_COLLECTION = "Movie"

  val CONTENT_MOVIE_RECS = "ContentMovieRecs"

  def main(args: Array[String]): Unit = {
    val config = Map(
      "mongo.uri" -> "mongodb://192.168.0.241:27017/recommender",
      "mongo.db" -> "recommender"
    )

    // Create a SparkConf
    val warehouseLocation: String = "hdfs://node1:9000/user/hive/warehouse"
    val sparkConf = new SparkConf()
      .setAppName("RecommendationSystem - ContentRecommendation")
      .setMaster("spark://node1:7077,node3:7077")
      .setJars(List("G:\\JavaEE\\Hadoop-Spark\\RecommendationSystem\\Recommendation\\ContentRecommendation\\target\\" +
        "ContentRecommendation-jar-with-dependencies.jar"))
      .setIfMissing("spark.driver.host", "192.168.0.28")
      .set("spark.num.executors", "3")
      .set("spark.executor.cores", "2")
      .set("spark.executor.memory", "1800m")
      .set("spark.sql.warehouse.dir", warehouseLocation)

    // Create a SparkSession
    val spark = SparkSession.builder().config(sparkConf).getOrCreate()

    import spark.implicits._
    implicit val mongoConfig = MongoConfig(config("mongo.uri"), config("mongo.db"))

    // Load the data and preprocess it
    val movieTagsDF = spark.read
      .option("uri", mongoConfig.uri)
      .option("collection", MONGODB_MOVIE_COLLECTION)
      .format("com.mongodb.spark.sql")
      .load()
      .as[Movie]
      .map(
        // Take mid, name and genres as the raw content features; the tokenizer splits on spaces by default
        x => (x.mid, x.name, x.genres.map(c => if (c == '|') ' ' else c))
      )
      .toDF("mid", "name", "genres")
      .cache()

    // Core part: use TF-IDF to extract movie feature vectors from the content information
    // Create a tokenizer; by default it splits on whitespace
    val tokenizer = new Tokenizer().setInputCol("genres").setOutputCol("words")

    // Transform the raw data with the tokenizer, producing a new column "words"
    val wordsData = tokenizer.transform(movieTagsDF)

    // HashingTF maps a sequence of words to its term frequencies
    val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(50)
    val featurizedData = hashingTF.transform(wordsData)

    // IDF fits an idf model
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    // Train the idf model to get each word's inverse document frequency
    val idfModel = idf.fit(featurizedData)
    // Apply the model to get each word's tf-idf as the new feature vector
    val rescaledData = idfModel.transform(featurizedData)

    val movieFeatures = rescaledData.map {
      row => (row.getAs[Int]("mid"), row.getAs[SparseVector]("features").toArray)
    }.rdd
      .map {
        x => (x._1, new DoubleMatrix(x._2))
      }

    // Compute pairwise similarities between all movies, starting with a Cartesian product
    val movieRecs = movieFeatures.cartesian(movieFeatures)
      .filter {
        // Filter out pairs of a movie with itself
        case (a, b) => a._1 != b._1
      }
      .map {
        case (a, b) =>
          val simScore = this.consinSim(a._2, b._2)
          (a._1, (b._1, simScore))
      }
      .filter(_._2._2 > 0.6) // keep only pairs with similarity above 0.6
      .groupByKey()
      .map {
        case (mid, items) => MovieRecs(mid, items.toList.sortWith(_._2 > _._2).map(x => Recommendation(x._1, x._2)))
      }
      .toDF()

    movieRecs.write
      .option("uri", mongoConfig.uri)
      .option("collection", CONTENT_MOVIE_RECS)
      .mode("overwrite")
      .format("com.mongodb.spark.sql")
      .save()

    spark.stop()
  }

  // Cosine similarity between two vectors
  def consinSim(movie1: DoubleMatrix, movie2: DoubleMatrix): Double = {
    movie1.dot(movie2) / (movie1.norm2() * movie2.norm2())
  }
}
--------------------------------------------------------------------------------
/Recommendation/ItemCFRecommendation/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>Recommendation</artifactId>
        <groupId>com.z</groupId>
        <version>1.0-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

    <artifactId>ItemCFRecommendation</artifactId>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
        </dependency>

        <dependency>
            <groupId>org.mongodb</groupId>
            <artifactId>casbah-core_2.11</artifactId>
            <version>${casbah.version}</version>
        </dependency>

        <dependency>
            <groupId>org.mongodb.spark</groupId>
            <artifactId>mongo-spark-connector_2.11</artifactId>
            <version>${mongodb-spark.version}</version>
        </dependency>
    </dependencies>

    <build>
        <finalName>ItemCFRecommendation</finalName>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.z.itemcf.ItemCFRecommender</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
--------------------------------------------------------------------------------
/Recommendation/ItemCFRecommendation/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
log4j.rootLogger=warn, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %5p --- [%50t] %-80c(line:%5L) : %m%n
--------------------------------------------------------------------------------
/Recommendation/ItemCFRecommendation/src/main/scala/com/z/itemcf/ItemCFRecommender.scala:
--------------------------------------------------------------------------------
package com.z.itemcf

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

case class MongoConfig(uri: String, db: String)

case class MovieRating(uid: Int, mid: Int, score: Double, timestamp: Int)

case class Recommendation(mid: Int, score: Double)

case class MoviesRecs(mid: Int, recs: Seq[Recommendation])

object ItemCFRecommender {
  // Constants and collection names
  val MONGODB_RATING_COLLECTION = "Rating"
  val ITEM_CF_MOVIE_RECS = "ItemCFMoviesRecs"
  val MAX_RECOMMENDATION = 10

  def main(args: Array[String]): Unit = {
    val config = Map(
      "mongo.uri" -> "mongodb://192.168.0.241:27017/recommender",
      "mongo.db" -> "recommender"
    )
    // Create a SparkConf
    val warehouseLocation: String = "hdfs://node1:9000/user/hive/warehouse"
    val sparkConf = new SparkConf()
      .setAppName("RecommendationSystem - ItemCF")
      .setMaster("spark://node1:7077,node3:7077")
      .setJars(List("G:\\JavaEE\\Hadoop-Spark\\RecommendationSystem\\Recommendation\\ItemCFRecommendation\\target\\" +
        "ItemCFRecommendation-jar-with-dependencies.jar"))
      .setIfMissing("spark.driver.host", "192.168.0.28")
      .set("spark.num.executors", "3")
      .set("spark.executor.cores", "2")
      .set("spark.executor.memory", "1800m")
      .set("spark.sql.warehouse.dir", warehouseLocation)

    // Create a SparkSession
    val spark = SparkSession.builder().config(sparkConf).getOrCreate()

    import spark.implicits._
    implicit val mongoConfig = MongoConfig(config("mongo.uri"), config("mongo.db"))

    // Load the data and convert it to a DataFrame for processing
    val ratingDF = spark.read
      .option("uri", mongoConfig.uri)
      .option("collection", MONGODB_RATING_COLLECTION)
      .format("com.mongodb.spark.sql")
      .load()
      .as[MovieRating]
      .map(
        x => (x.uid, x.mid, x.score)
      )
      .toDF("uid", "mid", "score")
      .cache()

    val productRatingCountDF = ratingDF.groupBy("mid").count() // the new column is named "count" by default
    val ratingWithCountDF = ratingDF.join(productRatingCountDF, "mid")

    // Core algorithm:
    val joinedDF = ratingWithCountDF.join(ratingWithCountDF, "uid") // .where($"mid" != $"mid")
      .toDF("uid", "mid1", "score1", "count1", "mid2", "score2", "count2")
      .select("uid", "mid1", "count1", "mid2", "count2").where($"mid1" =!= $"mid2")

    joinedDF.createOrReplaceTempView("joined")
    // A handy use of scala's """ with | and stripMargin; note that string.split("""|""") behaves differently
    val cooccurrenceDF = spark.sql(
      """
        |select mid1
        |, mid2
        |, count(uid) as cocount
        |, first(count1) as count1
        |, first(count2) as count2
        |from joined
        |group by mid1, mid2
      """.stripMargin
    ).cache()

    // ( mid1, (mid2, score) )
    val simDF = cooccurrenceDF.map {
      row =>
        val coocSim = cooccurrenceSim(row.getAs[Long]("cocount"), row.getAs[Long]("count1"),
          row.getAs[Long]("count2"))
        (row.getInt(0), (row.getInt(1), coocSim))
    }
      .rdd
      .groupByKey()
      .map {
        case (mid, recs) =>
          MoviesRecs(mid, recs.toList.sortWith(_._2 > _._2).take(MAX_RECOMMENDATION)
            .map(x => Recommendation(x._1, x._2)))
      }
      .toDF()

    simDF.write
      .option("uri", mongoConfig.uri)
      .option("collection", ITEM_CF_MOVIE_RECS)
      .mode("overwrite")
      .format("com.mongodb.spark.sql")
      .save()

    spark.stop()
  }

  // Co-occurrence similarity formula
  def cooccurrenceSim(coCount: Long, count1: Long, count2: Long): Double = {
    coCount / math.sqrt(count1 * count2)
  }

}
--------------------------------------------------------------------------------
/Recommendation/KafkaStream/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
eclipse.preferences.version=1
encoding//src/main/java/com/z/kafkastream/Application.java=UTF-8
encoding//src/main/java/com/z/kafkastream/LogProcessor.java=UTF-8
encoding//src/main/java/com/z/kafkastream/MyEventTimeExtractor.java=UTF-8
--------------------------------------------------------------------------------
/Recommendation/KafkaStream/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
org.eclipse.jdt.core.compiler.compliance=1.8
org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore
org.eclipse.jdt.core.compiler.release=disabled
org.eclipse.jdt.core.compiler.source=1.8
--------------------------------------------------------------------------------
/Recommendation/KafkaStream/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
activeProfiles=
eclipse.preferences.version=1
resolveWorkspaceProjects=true
version=1
--------------------------------------------------------------------------------
/Recommendation/KafkaStream/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>Recommendation</artifactId>
        <groupId>com.z</groupId>
        <version>1.0-SNAPSHOT</version>
    </parent>

    <modelVersion>4.0.0</modelVersion>
    <artifactId>KafkaStream</artifactId>

    <dependencies>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-streams</artifactId>
            <version>${kafka.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>${kafka.version}</version>
        </dependency>
    </dependencies>

    <build>
        <finalName>kafkastream</finalName>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.z.kafkastream.Application</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
--------------------------------------------------------------------------------
/Recommendation/KafkaStream/src/main/java/com/z/kafkastream/Application.java:
--------------------------------------------------------------------------------
package com.z.kafkastream;

import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.processor.TopologyBuilder;

import java.util.Properties;

/**
 * Kafka Streams real-time stream
 */
public class Application {
    public static void main(String[] args) {
        String brokers = "node1:9092,node2:9092,node3:9092";
        // String zookeepers = "node1:2181,node2:2181,node3:2181";

        // topics
        String from = "log";
        String to = "recommender";

        // Kafka consumer configuration
        Properties settings = new Properties();
        settings.put(StreamsConfig.APPLICATION_ID_CONFIG, "logFilter");
        settings.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
        // The Kafka on the Flume side is version 0.8.11, which sends records without timestamps,
        // so a custom timestamp extractor is needed for compatibility
        settings.put(StreamsConfig.TIMESTAMP_EXTRACTOR_CLASS_CONFIG, MyEventTimeExtractor.class.getName());
        // settings.put(StreamsConfig.ZOOKEEPER_CONNECT_CONFIG, zookeepers);

        // Create the Kafka Streams configuration object
        StreamsConfig config = new StreamsConfig(settings);

        // Create a topology builder
        TopologyBuilder builder = new TopologyBuilder();

        // Define the stream-processing topology
        builder.addSource("SOURCE", from)
                .addProcessor("PROCESSOR", () -> new LogProcessor(), "SOURCE")
                .addSink("SINK", to, "PROCESSOR");

        KafkaStreams streams = new KafkaStreams(builder, config);

        streams.start();

        System.out.println("Kafka stream started!>>>>>>>>>>>");

    }
}
--------------------------------------------------------------------------------
/Recommendation/KafkaStream/src/main/java/com/z/kafkastream/LogProcessor.java:
--------------------------------------------------------------------------------
package com.z.kafkastream;

import org.apache.kafka.streams.processor.Processor;
import org.apache.kafka.streams.processor.ProcessorContext;

public class LogProcessor implements Processor<byte[], byte[]> {

    private ProcessorContext context;

    @Override
    public void init(ProcessorContext processorContext) {
        this.context = processorContext;
    }

    @Override
    public void process(byte[] dummy, byte[] line) {
        // Represent the collected log line as a string
        String input = new String(line);

        // Flume has already done the regex matching: extract the rating data after the prefix MOVIE_RATING_PREFIX:
        if (input.contains("MOVIE_RATING_PREFIX:")) {
            input = input.split("MOVIE_RATING_PREFIX:")[1].trim();
            System.out.println("Rating data: " + input);

            context.forward("logProcessor".getBytes(), input.getBytes());
        }
    }

    @Override
    public void punctuate(long l) {

    }

    @Override
    public void close() {

    }
}
--------------------------------------------------------------------------------
/Recommendation/KafkaStream/src/main/java/com/z/kafkastream/MyEventTimeExtractor.java:
--------------------------------------------------------------------------------
package com.z.kafkastream;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.streams.processor.TimestampExtractor;

/**
 * Kafka versions below 0.10 produce messages without timestamps, and the Flume side runs an older Kafka version
 */
public class MyEventTimeExtractor implements TimestampExtractor {

    @Override
    public long extract(ConsumerRecord