├── .settings
│   └── org.eclipse.m2e.core.prefs
├── 01.png
├── 02.png
├── 03.png
├── 04.png
├── 05.png
├── 06.png
├── 07.png
├── 08.png
├── 09.png
├── README.md
├── Recommendation
│   ├── .settings
│   │   └── org.eclipse.m2e.core.prefs
│   ├── ContentRecommendation
│   │   ├── .settings
│   │   │   ├── org.eclipse.jdt.core.prefs
│   │   │   └── org.eclipse.m2e.core.prefs
│   │   ├── pom.xml
│   │   └── src
│   │       └── main
│   │           ├── resources
│   │           │   └── log4j.properties
│   │           └── scala
│   │               └── com
│   │                   └── z
│   │                       └── content
│   │                           └── ContentRecommender.scala
│   ├── ItemCFRecommendation
│   │   ├── pom.xml
│   │   └── src
│   │       └── main
│   │           ├── resources
│   │           │   └── log4j.properties
│   │           └── scala
│   │               └── com
│   │                   └── z
│   │                       └── itemcf
│   │                           └── ItemCFRecommender.scala
│   ├── KafkaStream
│   │   ├── .settings
│   │   │   ├── org.eclipse.core.resources.prefs
│   │   │   ├── org.eclipse.jdt.core.prefs
│   │   │   └── org.eclipse.m2e.core.prefs
│   │   ├── pom.xml
│   │   └── src
│   │       └── main
│   │           ├── java
│   │           │   └── com
│   │           │       └── z
│   │           │           └── kafkastream
│   │           │               ├── Application.java
│   │           │               ├── LogProcessor.java
│   │           │               └── MyEventTimeExtractor.java
│   │           └── resources
│   │               └── log4j.properties
│   ├── OfflineRecommendation
│   │   ├── .settings
│   │   │   ├── org.eclipse.jdt.core.prefs
│   │   │   └── org.eclipse.m2e.core.prefs
│   │   ├── pom.xml
│   │   └── src
│   │       └── main
│   │           ├── resources
│   │           │   └── log4j.properties
│   │           └── scala
│   │               └── com
│   │                   └── z
│   │                       └── offline
│   │                           ├── ALSTrainer.scala
│   │                           └── OfflineRecommender.scala
│   ├── StatisticsRecommendation
│   │   ├── .settings
│   │   │   ├── org.eclipse.jdt.core.prefs
│   │   │   └── org.eclipse.m2e.core.prefs
│   │   ├── pom.xml
│   │   └── src
│   │       └── main
│   │           ├── resources
│   │           │   └── log4j.properties
│   │           └── scala
│   │               └── com
│   │                   └── z
│   │                       └── statistics
│   │                           └── StatisticsRecommender.scala
│   ├── StreamingRecommendation
│   │   ├── .settings
│   │   │   ├── org.eclipse.jdt.core.prefs
│   │   │   └── org.eclipse.m2e.core.prefs
│   │   ├── pom.xml
│   │   └── src
│   │       └── main
│   │           ├── resources
│   │           │   └── log4j.properties
│   │           └── scala
│   │               └── com
│   │                   └── z
│   │                       └── streaming
│   │                           └── StreamingRecommender.scala
│   └── pom.xml
├── pom.xml
└── readme.docx
/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 |
--------------------------------------------------------------------------------
/01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wolf-song-ml/RecommendationSystem/6a459d6472131d697d6cf450a249caa3d6a465e4/01.png
--------------------------------------------------------------------------------
/02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wolf-song-ml/RecommendationSystem/6a459d6472131d697d6cf450a249caa3d6a465e4/02.png
--------------------------------------------------------------------------------
/03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wolf-song-ml/RecommendationSystem/6a459d6472131d697d6cf450a249caa3d6a465e4/03.png
--------------------------------------------------------------------------------
/04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wolf-song-ml/RecommendationSystem/6a459d6472131d697d6cf450a249caa3d6a465e4/04.png
--------------------------------------------------------------------------------
/05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wolf-song-ml/RecommendationSystem/6a459d6472131d697d6cf450a249caa3d6a465e4/05.png
--------------------------------------------------------------------------------
/06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wolf-song-ml/RecommendationSystem/6a459d6472131d697d6cf450a249caa3d6a465e4/06.png
--------------------------------------------------------------------------------
/07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wolf-song-ml/RecommendationSystem/6a459d6472131d697d6cf450a249caa3d6a465e4/07.png
--------------------------------------------------------------------------------
/08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wolf-song-ml/RecommendationSystem/6a459d6472131d697d6cf450a249caa3d6a465e4/08.png
--------------------------------------------------------------------------------
/09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wolf-song-ml/RecommendationSystem/6a459d6472131d697d6cf450a249caa3d6a465e4/09.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
***Source code: https://github.com/wolf-song-ml/RecommendationSystem***

## Practice

## 1 Project architecture
![Project architecture](https://github.com/wolf-song-ml/RecommendationSystem/blob/master/01.png)

## 2 Key technologies in this project

- Redis: stores each user's queue of most recent ratings
- MongoDB: data store, queried for BI visualization
- Elasticsearch: fuzzy keyword search over text fields, exact-match search by category, and the More Like This API for content-based recommendation
- Flume: real-time collection of rating events
- Kafka: message channel for the collected data; Kafka Streams: intermediate pipe that filters and forwards messages
- Spark: Spark SQL, Spark Streaming and Spark MLlib, used for statistics, as the data-loading engine, and for the machine-learning models
- ScalaNLP (jblas): Java matrix computation

## Theory

## 1 Why recommender systems: solving information overload

- The search-engine era

  Category directories: Yahoo
  Search: Google, Baidu

- The personalization era (raise user stickiness, grow revenue)

  The system recommends relevant items automatically: Toutiao, Douban, e-commerce sites

## 2 Types of recommender systems

- Demographic-based recommendation
- Content-based recommendation
- Collaborative-filtering recommendation

## 3 Demographic-based recommendation

Demographic-based recommendation is the easiest mechanism to implement: it measures how related users are using only their basic profile data, then recommends to the current user the items that similar users liked.
![Demographic-based recommendation](https://upload-images.jianshu.io/upload_images/21415382-be2ff3b26716dbe1?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)

## 4 Content-based recommendation

## 4.1 Definition

Content-based recommendation was the most widely used mechanism when recommendation engines first appeared. Its core idea: use the metadata of items or content to discover how items relate to one another, then recommend items similar to what the user has liked before.

## 4.2 Algorithm flow

- Feature extraction for items: tags
- Feature extraction for text: keywords
- Build the tokenized feature-vector matrix
- Compute similarity; cosine similarity is the usual choice, shown below
![Cosine similarity formula](https://upload-images.jianshu.io/upload_images/21415382-d47a3e4a0aa0dc2f?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
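Spelled out (the image shows the same formula), the cosine similarity of two feature vectors $A$ and $B$ is:

```math
\mathrm{sim}(A,B)=\cos\theta=\frac{A\cdot B}{\lVert A\rVert\,\lVert B\rVert}=\frac{\sum_{i=1}^{n}A_iB_i}{\sqrt{\sum_{i=1}^{n}A_i^2}\,\sqrt{\sum_{i=1}^{n}B_i^2}}
```

This is exactly what the `consinSim` helper in `ContentRecommender.scala` computes with jblas: `movie1.dot(movie2) / (movie1.norm2() * movie2.norm2())`.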
## 4.3 Core code

## 4.3.1 Spark TF-IDF

```scala
// Core step: extract movie feature vectors from the content metadata with TF-IDF
// Create a tokenizer; it splits on whitespace by default
val tokenizer = new Tokenizer().setInputCol("genres").setOutputCol("words")

// Transform the raw data with the tokenizer, producing a new column "words"
val wordsData = tokenizer.transform(movieTagsDF)

// HashingTF turns a word sequence into term frequencies
val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(50)
val featurizedData = hashingTF.transform(wordsData)

// IDF: fit the model to get each term's inverse document frequency
val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
val idfModel = idf.fit(featurizedData)
// Apply the model: each document now carries TF-IDF weights as its feature vector
val rescaledData = idfModel.transform(featurizedData)

// (from the full source) convert the sparse TF-IDF vectors into jblas matrices
val movieFeatures = rescaledData.map {
  row => (row.getAs[Int]("mid"), row.getAs[SparseVector]("features").toArray)
}.rdd
  .map { x => (x._1, new DoubleMatrix(x._2)) }

// Pairwise similarity over all movies via a cartesian product
val movieRecs = movieFeatures.cartesian(movieFeatures)
  .filter {
    // drop self-pairs
    case (a, b) => a._1 != b._1
  }
  .map {
    case (a, b) =>
      val simScore = this.consinSim(a._2, b._2)
      (a._1, (b._1, simScore))
  }
  .filter(_._2._2 > 0.6) // keep only pairs with similarity above 0.6
  .groupByKey()
  .map {
    case (mid, items) => MovieRecs(mid, items.toList.sortWith(_._2 > _._2).map(x => Recommendation(x._1, x._2)))
  }
  .toDF()
```

## 4.3.2 Elasticsearch More Like This

```java
MoreLikeThisQueryBuilder query = QueryBuilders.moreLikeThisQuery(
        /*new String[]{"name", "descri", "genres", "actors", "directors", "tags"},*/
        new MoreLikeThisQueryBuilder.Item[]{new MoreLikeThisQueryBuilder.Item(Constant.ES_INDEX,
                Constant.ES_MOVIE_TYPE, String.valueOf(mid))});
```

## 5 Collaborative-filtering recommendation

## 5.1 User-based collaborative filtering (UserCF)

*Compute user-user similarity, then recommend what similar users liked*
![](https://upload-images.jianshu.io/upload_images/21415382-4bf944aa309aa852?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)

## 5.2 Item-based collaborative filtering (ItemCF, the focus here)

*Compute item-item similarity, then recommend the most similar items (unlike content-based recommendation, the similarity comes from user behavior rather than item metadata)*
![](https://upload-images.jianshu.io/upload_images/21415382-c7cec43faa7254db?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)

## 5.2.1 Core algorithm: co-occurrence similarity
![Co-occurrence similarity formula](https://upload-images.jianshu.io/upload_images/21415382-34cc04c19e2361da?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
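In symbols, with $N(i)$ denoting the set of users who rated item $i$, the co-occurrence similarity in the image is:

```math
w_{ij}=\frac{\lvert N(i)\cap N(j)\rvert}{\sqrt{\lvert N(i)\rvert\,\lvert N(j)\rvert}}
```

The numerator counts users who rated both items; the square root in the denominator damps the advantage of globally popular items. This is what `cooccurrenceSim` computes in `ItemCFRecommender.scala`: `coCount / math.sqrt(count1 * count2)`.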
## 5.2.2 Core algorithm example

```scala
// ( mid1, (mid2, score) )
val simDF = cooccurrenceDF.map{
  row =>
    val coocSim = cooccurrenceSim( row.getAs[Long]("cocount"), row.getAs[Long]("count1"),
      row.getAs[Long]("count2") )
    ( row.getInt(0), ( row.getInt(1), coocSim ) )
}
  .rdd
  .groupByKey()
  .map{
    case (mid, recs) =>
      MoviesRecs( mid, recs.toList.sortWith(_._2>_._2).take(MAX_RECOMMENDATION)
        .map(x=>Recommendation(x._1,x._2)) )
  }
  .toDF()
```

## 5.3 Latent factor model (LFM) recommendation

## 5.3.1 Idea
*Find hidden factors that link users and items*
![](https://upload-images.jianshu.io/upload_images/21415382-6640b7dbb9a26eca?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
## 5.3.2 Model formulas
![Implicit factorization](https://upload-images.jianshu.io/upload_images/21415382-5a091130396d255a?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![Minimizing the loss function](https://upload-images.jianshu.io/upload_images/21415382-a87afceaf9624020.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
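In case the images fail to load: in the standard formulation (notation mine; the figures' exact symbols may differ), each user $u$ gets a latent vector $p_u$ and each movie $i$ a latent vector $q_i$, a predicted rating is their inner product, and ALS minimizes the regularized squared error over the set $K$ of observed ratings:

```math
\hat{r}_{ui}=p_u^{\top}q_i,\qquad \min_{P,Q}\sum_{(u,i)\in K}\left(r_{ui}-p_u^{\top}q_i\right)^2+\lambda\left(\lVert p_u\rVert^2+\lVert q_i\rVert^2\right)
```

Here $\lambda$ is the regularization weight, the `lambda` argument passed to `ALS.train` in the example below, and the latent dimension is `rank`.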
## 5.3.3 Core algorithm example

```scala
// Train the latent factor model: Rating(user: Int, product: Int, rating: Double)
val trainData = ratingRDD.map(x => Rating(x._1, x._2, x._3))
// assign several values in one statement
val (rank, iterations, lambda) = (200, 5, 0.1)
val model = ALS.train(trainData, rank, iterations, lambda)

// Collect all distinct uids and mids from the rating data
val userRDD = ratingRDD.map(_._1).distinct()
val movieRDD = ratingRDD.map(_._2).distinct()
val userMovies = userRDD.cartesian(movieRDD)

// Predict a score for every (user, movie) pair
val preRatings = model.predict(userMovies)

val userRecs = preRatings
  .filter(_.rating > 0)
  .map(rating => (rating.user, (rating.product, rating.rating))) // Rating -> (uid, (mid, score))
  .groupByKey()
  .map {
    case (uid, recs) => UserRecs(uid, recs.toList.sortWith(_._2 > _._2).take(USER_MAX_RECOMMENDATION).map(x => Recommendation(x._1, x._2)))
  }
  .toDF()
```
--------------------------------------------------------------------------------
/Recommendation/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 |
--------------------------------------------------------------------------------
/Recommendation/ContentRecommendation/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 3 | org.eclipse.jdt.core.compiler.compliance=1.8 4 | org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled 5 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 6 | org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore 7 | org.eclipse.jdt.core.compiler.release=disabled 8 | org.eclipse.jdt.core.compiler.source=1.8 9 |
--------------------------------------------------------------------------------
/Recommendation/ContentRecommendation/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 |
--------------------------------------------------------------------------------
/Recommendation/ContentRecommendation/pom.xml:
--------------------------------------------------------------------------------
1 | 2 | 5 | 6 | Recommendation 7 | com.z 8 | 1.0-SNAPSHOT 9 | 10 | 11 | 4.0.0 12 | ContentRecommendation 13 | 14 | 15 | 16 | org.scalanlp 17 | jblas 18 | ${jblas.version} 19 | 20 | 21 | 22 | 23 | org.apache.spark 24 | spark-core_2.11 25 | 26 | 27 | org.apache.spark 28 | spark-sql_2.11 29 | 30 | 31 | org.apache.spark 32 | spark-mllib_2.11 33 | 34 | 35 | 36 | 40 | 41 | 42 | org.mongodb 43 | casbah-core_2.11 44 | ${casbah.version} 45 | 46 | 47 | org.mongodb.spark 48 | mongo-spark-connector_2.11 49 | ${mongodb-spark.version} 50 | 51 | 52 | 53 | 54 | ContentRecommendation 55 | 56 | 57 | org.apache.maven.plugins 58 | maven-assembly-plugin 59 | 60 | 61 | 62 | com.z.content.ContentRecommender 63 | 64 | 65 | 66 | jar-with-dependencies 67 | 68 | 69 | 70 | 71 | 72 | 73 |
--------------------------------------------------------------------------------
/Recommendation/ContentRecommendation/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | log4j.rootLogger=info, stdout 2 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 3 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 4 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %5p --- [%50t] %-80c(line:%5L) : %m%n
--------------------------------------------------------------------------------
/Recommendation/ContentRecommendation/src/main/scala/com/z/content/ContentRecommender.scala:
--------------------------------------------------------------------------------
1 | package com.z.content 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer} 5 | import org.apache.spark.ml.linalg.SparseVector 6 | import org.apache.spark.sql.SparkSession 7 | import org.jblas.DoubleMatrix 8 | 9 | // The data source needed here is the movie content metadata 10 | case class Movie(mid: Int, name: String, descri: String, timelong: String, issue: String, shoot: String, language: String, 11 | genres: String, actors: String, directors: String) 12 | 13 | case class MongoConfig(uri:String, db:String) 14 | 15 | // A basic recommendation object 16 | case class Recommendation( mid: Int, score: Double ) 17 | 18 | // Movie similarity list built from feature vectors extracted from movie content 19 | case class MovieRecs( mid: Int, recs: Seq[Recommendation] ) 20 | 21 | object ContentRecommender { 22 | 23 | // Table names and constants 24 | val MONGODB_MOVIE_COLLECTION = "Movie" 25 | 26 | val CONTENT_MOVIE_RECS = "ContentMovieRecs" 27 | 28 | def main(args: Array[String]): Unit = { 29 | val config = Map( 30 | "mongo.uri" -> "mongodb://192.168.0.241:27017/recommender", 31 | "mongo.db" -> "recommender" 32 | ) 33 | 34 | // Create a SparkConf 35 | val warehouseLocation : String = "hdfs://node1:9000/user/hive/warehouse" 36 | val sparkConf = new SparkConf() 37 | .setAppName("推荐系统 - 内容推荐") 38 | .setMaster("spark://node1:7077,node3:7077") 39 | .setJars(List("G:\\JavaEE\\Hadoop-Spark\\RecommendationSystem\\Recommendation\\ContentRecommendation\\target\\" + 40 | "ContentRecommendation-jar-with-dependencies.jar")) 41 | .setIfMissing("spark.driver.host", "192.168.0.28") 42 | .set("spark.num.executors", "3") 43 | .set("spark.executor.cores", "2") 44 | .set("spark.executor.memory", "1800m") 45 | .set("spark.sql.warehouse.dir", warehouseLocation) 46 | 47 | // Create a SparkSession 48 | val spark = SparkSession.builder().config(sparkConf).getOrCreate() 49 | 50 | import spark.implicits._ 51 | implicit val mongoConfig = MongoConfig(config("mongo.uri"), config("mongo.db")) 52 | 53 | // Load the data and preprocess it 54 | val movieTagsDF = spark.read 55 | .option("uri", mongoConfig.uri) 56 | .option("collection", MONGODB_MOVIE_COLLECTION) 57 | .format("com.mongodb.spark.sql") 58 | .load() 59 | .as[Movie] 60 | .map( 61 | // Take mid, name and genres as the raw content features; the tokenizer splits on whitespace by default 62 | x => ( x.mid, x.name, x.genres.map(c=> if(c=='|') ' ' else c) ) 63 | ) 64 | .toDF("mid", "name", "genres") 65 | .cache() 66 | 67 | // Core step: extract movie feature vectors from the content metadata with TF-IDF 68 | // Create a tokenizer; it splits on whitespace by default 69 | val tokenizer = new Tokenizer().setInputCol("genres").setOutputCol("words") 70 | 71 | // Transform the raw data with the tokenizer, producing a new column "words" 72 | val wordsData = tokenizer.transform(movieTagsDF) 73 | 74 | // HashingTF turns a word sequence into term frequencies 75 | val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(50) 76 | val featurizedData = hashingTF.transform(wordsData) 77 | 78 | // IDF tool: used to fit an IDF model 79 | val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features") 80 | // Fit the IDF model to get each term's inverse document frequency 81 | val idfModel = idf.fit(featurizedData) 82 | // Apply the model: each document gets its TF-IDF weights as a new feature vector 83 | val rescaledData = idfModel.transform(featurizedData) 84 | 85 | val movieFeatures = rescaledData.map { 86 | row => (row.getAs[Int]("mid"), row.getAs[SparseVector]("features").toArray) 87 | }.rdd 88 | .map { 89 | x => (x._1, new DoubleMatrix(x._2)) 90 | } 91 | 92 | // Compute pairwise similarities between all movies, starting from a cartesian product 93 | val movieRecs = movieFeatures.cartesian(movieFeatures) 94 | .filter{ 95 | // drop self-pairs 96 | case (a, b) => a._1 != b._1 97 | } 98 | .map{ 99 | case (a, b) => { 100 | val simScore = this.consinSim(a._2, b._2) 101 | ( a._1, ( b._1, simScore ) ) 102 | } 103 | } 104 | .filter(_._2._2 > 0.6) // keep only pairs with similarity above 0.6 105 | .groupByKey() 106 | .map{ 107 | case (mid, items) => MovieRecs( mid, items.toList.sortWith(_._2 > _._2).map(x => Recommendation(x._1, x._2)) ) 108 | } 109 | .toDF() 110 | 111 | movieRecs.write 112 | .option("uri", mongoConfig.uri) 113 | .option("collection", CONTENT_MOVIE_RECS) 114 | 
.mode("overwrite") 115 | .format("com.mongodb.spark.sql") 116 | .save() 117 | 118 | spark.stop() 119 | } 120 | 121 | // 求向量余弦相似度 122 | def consinSim(movie1: DoubleMatrix, movie2: DoubleMatrix):Double ={ 123 | movie1.dot(movie2) / ( movie1.norm2() * movie2.norm2() ) 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /Recommendation/ItemCFRecommendation/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | Recommendation 7 | com.z 8 | 1.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | ItemCFRecommendation 13 | 14 | 15 | 16 | org.apache.spark 17 | spark-core_2.11 18 | 19 | 20 | org.apache.spark 21 | spark-sql_2.11 22 | 23 | 28 | 29 | 30 | 31 | org.mongodb 32 | casbah-core_2.11 33 | ${casbah.version} 34 | 35 | 36 | 37 | org.mongodb.spark 38 | mongo-spark-connector_2.11 39 | ${mongodb-spark.version} 40 | 41 | 42 | 43 | 44 | ItemCFRecommendation 45 | 46 | 47 | org.apache.maven.plugins 48 | maven-assembly-plugin 49 | 50 | 51 | 52 | com.z.itemcf.ItemCFRecommender 53 | 54 | 55 | 56 | jar-with-dependencies 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /Recommendation/ItemCFRecommendation/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=warn, stdout 2 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 3 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 4 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %5p --- [%50t] %-80c(line:%5L) : %m%n 5 | -------------------------------------------------------------------------------- /Recommendation/ItemCFRecommendation/src/main/scala/com/z/itemcf/ItemCFRecommender.scala: -------------------------------------------------------------------------------- 1 | package com.z.itemcf 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | 6 | case class MongoConfig( uri: String, db: String ) 7 | 8 | case class MovieRating( uid: Int, mid: Int, score: Double, timestamp: Int ) 9 | 10 | case class Recommendation( mid: Int, score: Double ) 11 | 12 | case class MoviesRecs( mid: Int, recs: Seq[Recommendation] ) 13 | 14 | object ItemCFRecommender { 15 | // 定义常量和表名 16 | val MONGODB_RATING_COLLECTION = "Rating" 17 | val ITEM_CF_MOVIE_RECS = "ItemCFMoviesRecs" 18 | val MAX_RECOMMENDATION = 10 19 | 20 | def main(args: Array[String]): Unit = { 21 | val config = Map( 22 | "mongo.uri" -> "mongodb://192.168.0.241:27017/recommender", 23 | "mongo.db" -> "recommender" 24 | ) 25 | // 创建一个sparkConf 26 | val warehouseLocation : String = "hdfs://node1:9000/user/hive/warehouse" 27 | val sparkConf = new SparkConf() 28 | .setAppName("推荐系统 - itemCF") 29 | .setMaster("spark://node1:7077,node3:7077") 30 | .setJars(List("G:\\JavaEE\\Hadoop-Spark\\RecommendationSystem\\Recommendation\\ItemCFRecommendation\\target\\" + 31 | "ItemCFRecommendation-jar-with-dependencies.jar")) 32 | .setIfMissing("spark.driver.host", "192.168.0.28") 33 | .set("spark.num.executors", "3") 34 | .set("spark.executor.cores", "2") 35 | .set("spark.executor.memory", "1800m") 36 | .set("spark.sql.warehouse.dir", warehouseLocation) 37 | 38 | // 创建spark session 39 | val spark = SparkSession.builder().config(sparkConf).getOrCreate() 40 | 41 | import spark.implicits._ 42 | implicit val mongoConfig = MongoConfig( config("mongo.uri"), config("mongo.db") ) 43 | 44 | // 加载数据,转换成DF进行处理 45 | val ratingDF = spark.read 
46 | .option("uri", mongoConfig.uri) 47 | .option("collection", MONGODB_RATING_COLLECTION) 48 | .format("com.mongodb.spark.sql") 49 | .load() 50 | .as[MovieRating] 51 | .map( 52 | x => ( x.uid, x.mid, x.score ) 53 | ) 54 | .toDF("uid", "mid", "score") 55 | .cache() 56 | 57 | val productRatingCountDF = ratingDF.groupBy("mid").count() //默认clos as:count 58 | val ratingWithCountDF = ratingDF.join(productRatingCountDF, "mid") 59 | 60 | // 核心算法: 61 | val joinedDF = ratingWithCountDF.join(ratingWithCountDF, "uid") // .where($"mid" != $"mid") 62 | .toDF("uid","mid1","score1","count1","mid2","score2","count2") 63 | .select("uid","mid1","count1","mid2","count2").where($"mid1" =!= $"mid2") 64 | 65 | joinedDF.createOrReplaceTempView("joined") 66 | // scala """ | stripMargin妙用.注意string.spilit("""|""") 67 | val cooccurrenceDF = spark.sql( 68 | """ 69 | |select mid1 70 | |, mid2 71 | |, count(uid) as cocount 72 | |, first(count1) as count1 73 | |, first(count2) as count2 74 | |from joined 75 | |group by mid1, mid2 76 | """.stripMargin 77 | ).cache() 78 | 79 | // ( mid1, (mid2, score) ) 80 | val simDF = cooccurrenceDF.map{ 81 | row => 82 | val coocSim = cooccurrenceSim( row.getAs[Long]("cocount"), row.getAs[Long]("count1"), 83 | row.getAs[Long]("count2") ) 84 | ( row.getInt(0), ( row.getInt(1), coocSim ) ) 85 | } 86 | .rdd 87 | .groupByKey() 88 | .map{ 89 | case (mid, recs) => 90 | MoviesRecs( mid, recs.toList.sortWith(_._2>_._2).take(MAX_RECOMMENDATION) 91 | .map(x=>Recommendation(x._1,x._2)) ) 92 | } 93 | .toDF() 94 | 95 | simDF.write 96 | .option("uri", mongoConfig.uri) 97 | .option("collection", ITEM_CF_MOVIE_RECS) 98 | .mode("overwrite") 99 | .format("com.mongodb.spark.sql") 100 | .save() 101 | 102 | spark.stop() 103 | } 104 | 105 | // 同现相似度计算公式 106 | def cooccurrenceSim(coCount: Long, count1: Long, count2: Long): Double ={ 107 | coCount / math.sqrt( count1 * count2 ) 108 | } 109 | 110 | } 111 | -------------------------------------------------------------------------------- /Recommendation/KafkaStream/.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//src/main/java/com/z/kafkastream/Application.java=UTF-8 3 | encoding//src/main/java/com/z/kafkastream/LogProcessor.java=UTF-8 4 | encoding//src/main/java/com/z/kafkastream/MyEventTimeExtractor.java=UTF-8 5 | -------------------------------------------------------------------------------- /Recommendation/KafkaStream/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 3 | org.eclipse.jdt.core.compiler.compliance=1.8 4 | org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled 5 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 6 | org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore 7 | org.eclipse.jdt.core.compiler.release=disabled 8 | org.eclipse.jdt.core.compiler.source=1.8 9 | -------------------------------------------------------------------------------- /Recommendation/KafkaStream/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /Recommendation/KafkaStream/pom.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | Recommendation 7 | com.z 8 | 1.0-SNAPSHOT 9 | 10 | 11 | 4.0.0 12 | KafkaStream 13 | 14 | 15 | 16 | org.apache.kafka 17 | kafka-streams 18 | ${kafka.version} 19 | 20 | 21 | 22 | org.apache.kafka 23 | kafka-clients 24 | ${kafka.version} 25 | 26 | 27 | 28 | 29 | kafkastream 30 | 31 | 32 | org.apache.maven.plugins 33 | maven-assembly-plugin 34 | 35 | 36 | 37 | com.z.kafkastream.Application 38 | 39 | 40 | 41 | jar-with-dependencies 42 | 43 | 44 | 45 | 46 | make-assembly 47 | package 48 | 49 | single 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /Recommendation/KafkaStream/src/main/java/com/z/kafkastream/Application.java: -------------------------------------------------------------------------------- 1 | package com.z.kafkastream; 2 | 3 | import org.apache.kafka.streams.KafkaStreams; 4 | import org.apache.kafka.streams.StreamsConfig; 5 | import org.apache.kafka.streams.processor.TopologyBuilder; 6 | 7 | import java.util.Properties; 8 | 9 | /** 10 | * kafka stream实时流 11 | */ 12 | public class Application { 13 | public static void main(String[] args) { 14 | String brokers = "node1:9092,node2:9092,node3:9092"; 15 | // String zookeepers = "node1:2181,node2:2181,node3:2181"; 16 | 17 | // topic 18 | String from = "log"; 19 | String to = "recommender"; 20 | 21 | // kafka消费者配置 22 | Properties settings = new Properties(); 23 | settings.put(StreamsConfig.APPLICATION_ID_CONFIG, "logFilter"); 24 | settings.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, brokers); 25 | // flume端的kafka是0.8.11版本,兼容发送没有发送时间戳 26 | settings.put(StreamsConfig.TIMESTAMP_EXTRACTOR_CLASS_CONFIG, MyEventTimeExtractor.class.getName()); 27 | // settings.put(StreamsConfig.ZOOKEEPER_CONNECT_CONFIG, zookeepers); 28 | 29 | // 创建 kafka stream 配置对象 30 | StreamsConfig config = new StreamsConfig(settings); 31 | 32 | // 创建一个拓扑建构器 33 | TopologyBuilder builder = new TopologyBuilder(); 34 | 35 | // 定义流处理的拓扑结构 36 | builder.addSource("SOURCE", from) 37 | .addProcessor("PROCESSOR", ()-> new LogProcessor(), "SOURCE") 38 | .addSink("SINK", to, "PROCESSOR"); 39 | 40 | KafkaStreams streams = new KafkaStreams( builder, config ); 41 | 42 | streams.start(); 43 | 44 | System.out.println("Kafka stream started!>>>>>>>>>>>"); 45 | 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /Recommendation/KafkaStream/src/main/java/com/z/kafkastream/LogProcessor.java: -------------------------------------------------------------------------------- 1 | package com.z.kafkastream; 2 | 3 | import org.apache.kafka.streams.processor.Processor; 4 | import org.apache.kafka.streams.processor.ProcessorContext; 5 | 6 | public class LogProcessor implements Processor { 7 | 8 | private ProcessorContext context; 9 | 10 | @Override 11 | public void init(ProcessorContext processorContext) { 12 | this.context = processorContext; 13 | } 14 | 15 | @Override 16 | public void process(byte[] dummy, byte[] line) { 17 | // 把收集到的日志信息用string表示 18 | String input = new String(line); 19 | 20 | // flume已经正则匹配:根据前缀MOVIE_RATING_PREFIX:从日志信息中提取评分数据 21 | if (input.contains("MOVIE_RATING_PREFIX:")) { 22 | input = input.split("MOVIE_RATING_PREFIX:")[1].trim(); 23 | System.out.println("评分数据:" + input); 24 | 25 | context.forward("logProcessor".getBytes(), input.getBytes()); 26 | } 27 | } 28 | 29 | @Override 30 | public void punctuate(long l) { 31 | 32 | } 33 | 34 | @Override 35 | public void 
close() { 36 | 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /Recommendation/KafkaStream/src/main/java/com/z/kafkastream/MyEventTimeExtractor.java: -------------------------------------------------------------------------------- 1 | package com.z.kafkastream; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerRecord; 4 | import org.apache.kafka.streams.processor.TimestampExtractor; 5 | 6 | /** 7 | * kafka 0.10以下生产消息没有时间戳,flume使用kafka版本较低 8 | */ 9 | public class MyEventTimeExtractor implements TimestampExtractor{ 10 | 11 | @Override 12 | public long extract(ConsumerRecord record, long previousTimestamp) { 13 | return System.currentTimeMillis(); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /Recommendation/KafkaStream/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=warn, stdout 2 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 3 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 4 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %5p --- [%50t] %-80c(line:%5L) : %m%n -------------------------------------------------------------------------------- /Recommendation/OfflineRecommendation/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 3 | org.eclipse.jdt.core.compiler.compliance=1.8 4 | org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled 5 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 6 | org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore 7 | org.eclipse.jdt.core.compiler.release=disabled 8 | org.eclipse.jdt.core.compiler.source=1.8 9 | -------------------------------------------------------------------------------- /Recommendation/OfflineRecommendation/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /Recommendation/OfflineRecommendation/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | Recommendation 7 | com.z 8 | 1.0-SNAPSHOT 9 | 10 | 11 | 4.0.0 12 | OfflineRecommender 13 | 14 | 15 | 16 | org.scalanlp 17 | jblas 18 | ${jblas.version} 19 | 20 | 21 | 22 | 23 | org.apache.spark 24 | spark-core_2.11 25 | 26 | 27 | org.apache.spark 28 | spark-sql_2.11 29 | 30 | 31 | org.apache.spark 32 | spark-mllib_2.11 33 | 34 | 35 | 36 | 40 | 41 | 42 | 43 | org.mongodb 44 | casbah-core_2.11 45 | ${casbah.version} 46 | 47 | 48 | org.mongodb.spark 49 | mongo-spark-connector_2.11 50 | ${mongodb-spark.version} 51 | 52 | 53 | 54 | 55 | OfflineRecommendation 56 | 57 | 58 | org.apache.maven.plugins 59 | maven-assembly-plugin 60 | 61 | 62 | 63 | com.z.offline.OfflineRecommender 64 | 65 | 66 | 67 | jar-with-dependencies 68 | 69 | 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /Recommendation/OfflineRecommendation/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=info, stdout 2 | 
log4j.appender.stdout=org.apache.log4j.ConsoleAppender 3 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 4 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %5p --- [%50t] %-80c(line:%5L) : %m%n -------------------------------------------------------------------------------- /Recommendation/OfflineRecommendation/src/main/scala/com/z/offline/ALSTrainer.scala: -------------------------------------------------------------------------------- 1 | package com.z.offline 2 | 3 | import breeze.numerics.sqrt 4 | import com.z.offline.OfflineRecommender.MONGODB_RATING_COLLECTION 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating} 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.SparkSession 9 | //import org.apache.spark.ml.recommendation.{ALS, MatrixFactorizationModel, Rating} 10 | 11 | /** 12 | * 隐语模型超参数调整优化 13 | */ 14 | object ALSTrainer { 15 | def main(args: Array[String]): Unit = { 16 | val config = Map( 17 | "mongo.uri" -> "mongodb://192.168.0.241:27017/recommender", 18 | "mongo.db" -> "recommender" 19 | ) 20 | // 创建一个sparkConf 21 | val warehouseLocation = "hdfs://node1:9000/user/hive/warehouse"; 22 | val sparkConf = new SparkConf() 23 | .setAppName("推荐系统 - LFM调参cache") 24 | .setMaster("spark://node1:7077,node3:7077") 25 | .setJars(List("G:\\JavaEE\\Hadoop-Spark\\MovieRecommendSystem\\recommender\\OfflineRecommender\\target\\" + 26 | "OfflineRecommender-jar-with-dependencies.jar")) 27 | .setIfMissing("spark.driver.host", "192.168.0.28") 28 | .set("spark.sql.warehouse.dir", warehouseLocation) 29 | .set("spark.num.executors", "3") 30 | .set("spark.executor.cores", "1") 31 | .set("spark.executor.memory", "1024m") 32 | 33 | // 创建一个SparkSession 34 | val spark = SparkSession.builder().config(sparkConf).getOrCreate() 35 | 36 | import spark.implicits._ 37 | implicit val mongoConfig = MongoConfig(config("mongo.uri"), config("mongo.db")) 38 | 39 | // 加载评分数据 40 | val ratingRDD = spark.read 41 | .option("uri", mongoConfig.uri) 42 | .option("collection", MONGODB_RATING_COLLECTION) 43 | .format("com.mongodb.spark.sql") 44 | .load() 45 | .as[MovieRating] 46 | .rdd 47 | .map(rating => Rating(rating.uid, rating.mid, rating.score)) 48 | // .cache() 49 | 50 | // 随机切分数据集,生成训练集和测试集 51 | val splits = ratingRDD.randomSplit(Array(0.8, 0.2)) 52 | val trainingRDD = splits(0) 53 | val testRDD = splits(1) 54 | 55 | // 模型参数选择,输出最优参数 56 | adjustALSParam(trainingRDD, testRDD) 57 | 58 | spark.close() 59 | } 60 | 61 | /** 62 | * LFM迭代调参 63 | * @param trainData 64 | * @param testData 65 | */ 66 | def adjustALSParam(trainData: RDD[Rating], testData: RDD[Rating]): Unit = { 67 | val result = for (rank <- Array(50, 100, 200, 300); lambda <- Array(0.01, 0.1, 1)) 68 | yield { 69 | val model = ALS.train(trainData, rank, 5, lambda) 70 | val rmse = getRMSE(model, testData) 71 | (rank, lambda, rmse) 72 | } 73 | 74 | // 控制台打印输出最优参数 75 | println(result.minBy(_._3)) 76 | } 77 | 78 | /** 79 | * 均方误差的根 80 | * @param model 81 | * @param data 82 | * @return 83 | */ 84 | def getRMSE(model: MatrixFactorizationModel, data: RDD[Rating]): Double = { 85 | // 计算预测评分 86 | val userProducts = data.map(item => (item.user, item.product)) 87 | val predictRating = model.predict(userProducts) 88 | 89 | // 以uid,mid作为外键,inner join实际观测值和预测值 90 | val actual = data.map(item => ((item.user, item.product), item.rating)) 91 | val predict = predictRating.map(item => ((item.user, item.product), item.rating)) 92 | 93 | // 内连接得到(uid, 
mid),(actual, predict) 94 | sqrt( 95 | actual.join(predict).map { 96 | case ((uid, mid), (actual, pre)) => val err = actual - pre; err * err 97 | }.mean() 98 | ) 99 | } 100 | 101 | } 102 | -------------------------------------------------------------------------------- /Recommendation/OfflineRecommendation/src/main/scala/com/z/offline/OfflineRecommender.scala: -------------------------------------------------------------------------------- 1 | package com.z.offline 2 | 3 | /** 4 | * 离线推荐算法: 5 | * 根据用户推荐电影列表 6 | * 电影相似度矩阵列表 7 | */ 8 | 9 | import org.apache.spark.SparkConf 10 | import org.apache.spark.mllib.recommendation.{ALS, Rating} 11 | import org.apache.spark.sql.{DataFrame, SparkSession} 12 | import org.jblas.DoubleMatrix 13 | 14 | // Rate表:与ALS算法中的Rating取别开 15 | case class MovieRating(uid: Int, mid: Int, score: Double, timestamp: Int) 16 | 17 | // 定义一个基准推荐对象 18 | case class Recommendation(mid: Int, score: Double) 19 | 20 | // 定义基于预测评分的用户推荐列表 21 | case class UserRecs(uid: Int, recs: Seq[Recommendation]) 22 | 23 | // 定义基于LFM电影特征向量的电影相似度列表 24 | case class MovieRecs(mid: Int, recs: Seq[Recommendation]) 25 | 26 | case class MongoConfig(uri: String, db: String) 27 | 28 | object OfflineRecommender { 29 | 30 | // 定义表名和常量 31 | val MONGODB_RATING_COLLECTION = "Rating" 32 | val USER_RECS = "UserRecs" 33 | val MOVIE_RECS = "MovieRecs" 34 | val USER_MAX_RECOMMENDATION = 20 35 | 36 | def main(args: Array[String]): Unit = { 37 | val config = Map( 38 | "mongo.uri" -> "mongodb://192.168.0.241:27017/recommender", 39 | "mongo.db" -> "recommender" 40 | ) 41 | 42 | // 创建一个sparkConf 43 | val warehouseLocation = "hdfs://node1:9000/user/hive/warehouse"; 44 | val sparkConf = new SparkConf() 45 | .setAppName("推荐系统 - 离线推荐") 46 | .setMaster("spark://node1:7077,node3:7077") 47 | .setJars(List("G:\\JavaEE\\Hadoop-Spark\\RecommendationSystem\\Recommendation\\OfflineRecommendation\\target\\" + 48 | "OfflineRecommendation-jar-with-dependencies.jar")) 49 | .setIfMissing("spark.driver.host", "192.168.0.28") 50 | .set("spark.sql.warehouse.dir", warehouseLocation) 51 | .set("spark.num.executors", "3") 52 | .set("spark.executor.cores", "1") 53 | .set("spark.executor.memory", "1800m") 54 | 55 | // 创建一个SparkSession 56 | val spark = SparkSession.builder().config(sparkConf).getOrCreate() 57 | 58 | import spark.implicits._ 59 | implicit val mongoConfig = MongoConfig(config("mongo.uri"), config("mongo.db")) 60 | 61 | // 加载评分数据 62 | val ratingRDD = spark.read 63 | .option("uri", mongoConfig.uri) 64 | .option("collection", MONGODB_RATING_COLLECTION) 65 | .format("com.mongodb.spark.sql") 66 | .load() 67 | .as[MovieRating] 68 | .rdd 69 | .map(rating => (rating.uid, rating.mid, rating.score)) 70 | .cache() // 多次计算缓存到内存中 71 | 72 | // 训练隐语义模型:Rating(user:Int, product:Int, rating:Double) 73 | val trainData = ratingRDD.map(x => Rating(x._1, x._2, x._3)) 74 | // 多个变量赋值 75 | val (rank, iterations, lambda) = (200, 5, 0.1) 76 | val model = ALS.train(trainData, rank, iterations, lambda) 77 | 78 | // 从rating数据中提取所有的uid和mid,并去重 79 | val userRDD = ratingRDD.map(_._1).distinct() 80 | val movieRDD = ratingRDD.map(_._2).distinct() 81 | val userMovies = userRDD.cartesian(movieRDD) 82 | 83 | // 调用model的predict方法预测评分 84 | val preRatings = model.predict(userMovies) 85 | 86 | val userRecs = preRatings 87 | .filter(_.rating > 0) 88 | .map(rating => (rating.user, (rating.product, rating.rating))) // Rating->(uid, (mid, score)) 89 | .groupByKey() 90 | .map { 91 | case (uid, recs) => UserRecs(uid, recs.toList.sortWith(_._2 > 
_._2).take(USER_MAX_RECOMMENDATION).map(x => Recommendation(x._1, x._2))) 92 | } 93 | .toDF() 94 | 95 | storeDFInMongoDB(userRecs, USER_RECS) 96 | 97 | /** 98 | * vector: 99 | * local vector是一种索引是0开始的整数、内容为double类型,存储在单机上的向量。MLlib支持两种矩阵,dense密集型和sparse稀疏型。 100 | * 一个dense类型的向量背后其实就是一个数组,而sparse向量背后则是两个并行数组——索引数组和值数组。比如向量(1.0, 0.0, 3.0) 101 | * 既可以用密集型向量表示为[1.0, 0.0, 3.0],也可以用稀疏型向量表示为(3, [0,2],[1.0,3.0]),其中3是数组的大小。 102 | * 103 | * dense vector与sparse vector: 104 | * new DenseVector(this.toArray) 105 | * 创建dense vector 106 | * val dv: Vector = Vectors.dense(1.0, 0.0, 3.0) 107 | * 创建sparse vector 108 | * val sv1: Vector = Vectors.sparse(3, Array(0,2), Array(1.0,3.0)) 109 | * val sv2: Vector = Vectors.sparse(3, Seq((0, 1.0), (2,3.0))) 110 | * 111 | * vecor norm范数和sqdist距离: 112 | * val norm1Vec = Vectors.dense(1.0,-1.0,2.0) 113 | * // 第一范数,就是绝对值相加 114 | * println(Vectors.norm(norm1Vec,1)) // 4.0 115 | * // 第二番薯,就是平方和开根号 116 | * println(Vectors.norm(norm1Vec,2)) // 2.449489742783178 117 | * // 无限范数 118 | * println(Vectors.norm(norm1Vec,1000)) //2.0 119 | * 120 | * val sq1 = Vectors.dense(1.0, 2.0, 3.0) 121 | * val sq2 = Vectors.dense(2.0, 4.0, 6.0) 122 | * println(Vectors.sqdist(sq1, sq2)) // (2-1)^2 + (4-2)^2 + (6-3)^2 = 14 123 | * 124 | * labeled point: 125 | * 这种labeled point其实内部也是一个vector,可能是dense也可能是sparse,不过多了一个标签列。在ML里面,labeled point 126 | * 通常用于有监督算法。这个label是double类型的,这样既可以用于回归算法,也可以用于分类。在二分类中,Label不是0就是1; 127 | * 在多分类中label可能从0开始,1,2,3,4.... 128 | * val pos = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)) 129 | * val neg = LabeledPoint(0.0, Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))) 130 | * label index1:value1 index2:value2 ... 131 | * val examples: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") 132 | **/ 133 | val movieFeatures = model.productFeatures.map { 134 | 135 | case (mid, features) => (mid, new DoubleMatrix(features)) 136 | } 137 | 138 | val movieRecs = movieFeatures.cartesian(movieFeatures) 139 | .filter { 140 | case (a, b) => a._1 != b._1 141 | } 142 | .map{ 143 | case (a, b) => val simScore = this.consinSim(a._2, b._2);(a._1, (b._1, simScore)) 144 | } 145 | .filter(_._2._2 > 0.6) // 过滤出相似度大于0.6的 146 | .groupByKey() 147 | .map {case (mid, items) => MovieRecs(mid, items.toList.sortWith(_._2 > _._2).map(x => Recommendation(x._1, x._2)))} 148 | .toDF() 149 | 150 | storeDFInMongoDB(movieRecs, MOVIE_RECS) 151 | 152 | spark.stop() 153 | } 154 | 155 | /** 156 | * 求向量余弦相似度:矩阵内积/第二范数乘积 157 | * 皮尔逊相关系数:先对向量每一分量减去分量均值,再求余弦相似度(叫取中心化) 158 | * @param movie1 159 | * @param movie2 160 | * @return 161 | */ 162 | def consinSim(movie1: DoubleMatrix, movie2: DoubleMatrix): Double = { 163 | movie1.dot(movie2) / (movie1.norm2() * movie2.norm2()) 164 | } 165 | 166 | def storeDFInMongoDB(df: DataFrame, collection_name: String)(implicit mongoConfig: MongoConfig): Unit ={ 167 | df.write 168 | .option("uri", mongoConfig.uri) 169 | .option("collection", collection_name) 170 | .mode("overwrite") 171 | .format("com.mongodb.spark.sql") 172 | .save() 173 | } 174 | 175 | } 176 | -------------------------------------------------------------------------------- /Recommendation/StatisticsRecommendation/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 3 | org.eclipse.jdt.core.compiler.compliance=1.8 4 | org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled 5 | 
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 6 | org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore 7 | org.eclipse.jdt.core.compiler.release=disabled 8 | org.eclipse.jdt.core.compiler.source=1.8 9 | -------------------------------------------------------------------------------- /Recommendation/StatisticsRecommendation/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /Recommendation/StatisticsRecommendation/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | Recommendation 7 | com.z 8 | 1.0-SNAPSHOT 9 | 10 | 11 | 4.0.0 12 | StatisticsRecommender 13 | 14 | 15 | 16 | 17 | org.apache.spark 18 | spark-core_2.11 19 | 20 | 21 | org.apache.spark 22 | spark-sql_2.11 23 | 24 | 29 | 30 | 31 | 32 | org.mongodb 33 | casbah-core_2.11 34 | ${casbah.version} 35 | 36 | 37 | org.mongodb.spark 38 | mongo-spark-connector_2.11 39 | ${mongodb-spark.version} 40 | 41 | 42 | 43 | 44 | StatisticsRecommendation 45 | 46 | 47 | 48 | org.apache.maven.plugins 49 | maven-assembly-plugin 50 | 51 | 52 | 53 | com.z.statistics.StatisticsRecommender 54 | 55 | 56 | 57 | jar-with-dependencies 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /Recommendation/StatisticsRecommendation/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=info, stdout 2 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 3 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 4 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %5p --- [%50t] %-80c(line:%5L) : %m%n -------------------------------------------------------------------------------- /Recommendation/StatisticsRecommendation/src/main/scala/com/z/statistics/StatisticsRecommender.scala: -------------------------------------------------------------------------------- 1 | package com.z.statistics 2 | /** 3 | * 离线统计。统计的功能: 4 | * 电影的评分次数统计:mid,count 5 | * 按月维度评分排行榜:电影每月评分次数并做时间倒序、评分次数倒序 6 | * 统计电影的平均评分:mid,avg 7 | * 各类别电影评分Top10统计 8 | */ 9 | import java.text.SimpleDateFormat 10 | import java.util.Date 11 | import org.apache.spark.SparkConf 12 | import org.apache.spark.sql.{DataFrame, SparkSession} 13 | /** 14 | * Movie 数据集 15 | * 260 电影ID,mid 16 | * Star Wars: Episode IV - A New Hope (1977) 电影名称,name 17 | * Princess Leia is captured and held hostage 详情描述,descri 18 | * 121 minutes 时长,timelong 19 | * September 21, 2004 发行时间,issue 20 | * 1977 拍摄时间,shoot 21 | * English 语言,language 22 | * Action|Adventure|Sci-Fi 类型,genres 23 | * Mark Hamill|Harrison Ford|Carrie Fisher 演员表,actors 24 | * George Lucas 导演,directors 25 | */ 26 | case class Movie(mid: Int, name: String, descri: String, timelong: String, issue: String, shoot: String, language: String, 27 | genres: String, actors: String, directors: String) 28 | 29 | /** 30 | * Rate 电影评分数据集 31 | * @param uid 用户id 32 | * @param mid 电影id 33 | * @param score 评分 34 | * @param timestamp 评分时间戳 35 | */ 36 | case class Rating(uid: Int, mid: Int, score: Double, timestamp: Int ) 37 | 38 | // 定义一个基准推荐对象 39 | case class Recommendation( mid: Int, score: Double ) 40 | 41 | // 定义电影类别top10推荐对象 42 | case class GenresRecommendation(genres: 
String, recs: Seq[Recommendation]) 43 | 44 | case class MongoConfig(uri:String, db:String) 45 | 46 | object StatisticsRecommender { 47 | 48 | // 定义表名 49 | val MONGODB_MOVIE_COLLECTION = "Movie" 50 | val MONGODB_RATING_COLLECTION = "Rating" 51 | 52 | //统计的表的名称 53 | val RATE_MORE_MOVIES = "RateMoreMovies" 54 | val RATE_MORE_RECENTLY_MOVIES = "RateMoreRecentlyMovies" 55 | val AVERAGE_MOVIES = "AverageMovies" 56 | val GENRES_TOP_MOVIES = "GenresTopMovies" 57 | 58 | def main(args: Array[String]): Unit = { 59 | val config = Map( 60 | "mongo.uri" -> "mongodb://192.168.0.241:27017/recommender", 61 | "mongo.db" -> "recommender" 62 | ) 63 | 64 | // 创建一个sparkConf 65 | val warehouseLocation = "hdfs://node1:9000/user/hive/warehouse"; 66 | val sparkConf = new SparkConf() 67 | .setAppName("推荐系统 - 离线统计") 68 | .setMaster("spark://node1:7077,node3:7077") 69 | .setJars(List("G:\\JavaEE\\Hadoop-Spark\\RecommendationSystem\\Recommendation\\StatisticsRecommendation\\target\\" + 70 | "StatisticsRecommendation-jar-with-dependencies.jar")) 71 | .setIfMissing("spark.driver.host", "192.168.0.28") 72 | .set("spark.sql.warehouse.dir", warehouseLocation) 73 | .set("spark.num.executors", "3") 74 | .set("spark.executor.cores", "2") 75 | .set("spark.executor.memory", "1024m") 76 | 77 | // 创建一个SparkSession 78 | val spark = SparkSession.builder().config(sparkConf).getOrCreate() 79 | 80 | import spark.implicits._ 81 | implicit val mongoConfig = MongoConfig(config("mongo.uri"), config("mongo.db")) 82 | 83 | // 从mongodb加载数据 84 | val ratingDF = spark.read 85 | .option("uri", mongoConfig.uri) 86 | .option("collection", MONGODB_RATING_COLLECTION) 87 | .format("com.mongodb.spark.sql") 88 | .load() 89 | .as[Rating] // 装换成dataset强类型 90 | .toDF() 91 | 92 | val movieDF = spark.read 93 | .option("uri", mongoConfig.uri) 94 | .option("collection", MONGODB_MOVIE_COLLECTION) 95 | .format("com.mongodb.spark.sql") 96 | .load() 97 | .as[Movie] // 装换成dataset强类型 98 | .toDF() 99 | 100 | // ratings评分表copy到内存中 101 | ratingDF.createOrReplaceTempView("ratings") 102 | 103 | // 电影的评分次数统计:mid,count 104 | val rateMoreMoviesDF = spark.sql("select mid, count(mid) as count from ratings group by mid") 105 | storeDFInMongoDB(rateMoreMoviesDF, RATE_MORE_MOVIES ) 106 | 107 | // 按月维度评分排行榜:电影每月评分次数并做时间倒序、评分次数倒序 108 | val simpleDateFormat = new SimpleDateFormat("yyyyMM") 109 | spark.udf.register("changeDate", (x: Int) => simpleDateFormat.format(new Date(x * 1000L)).toInt) 110 | val ratingOfYearMonth = spark.sql("select mid, score, changeDate(timestamp) as yearmonth from ratings") 111 | ratingOfYearMonth.createOrReplaceTempView("ratingOfMonth") 112 | val rateMoreRecentlyMoviesDF = spark.sql("select mid, count(mid) as count, yearmonth from ratingOfMonth" + 113 | " group by yearmonth, mid order by yearmonth desc, count desc") 114 | storeDFInMongoDB(rateMoreRecentlyMoviesDF, RATE_MORE_RECENTLY_MOVIES) 115 | 116 | // 统计电影的平均评分:mid,avg 117 | val averageMoviesDF = spark.sql("select mid, avg(score) as avg from ratings group by mid") 118 | storeDFInMongoDB(averageMoviesDF, AVERAGE_MOVIES) 119 | 120 | // 类别下热门电影榜:对比hive与rdd实现 121 | /** 122 | * select * from ( 123 | * select mid, score, genres_name from movie_with_score lateral view explode(genres) table_tmp as genres_name 124 | * ) t row_number over(partition by genres_name order by score desc) rank 125 | * where rank <=10 126 | */ 127 | val genres = List("Action","Adventure","Animation","Comedy","Crime","Documentary","Drama","Family","Fantasy","Foreign", 128 | "History","Horror","Music","Mystery" 
,"Romance","Science","Tv","Thriller","War","Western") 129 | val movieWithScore = movieDF.join(averageMoviesDF, "mid") 130 | // movieWithScore.agg($"avg".as("score")) 131 | val genresRDD = spark.sparkContext.makeRDD(genres) 132 | 133 | // DataFrame->RDD, 内容是Row 134 | val genresTopMoviesDF = genresRDD.cartesian(movieWithScore.rdd) 135 | .filter{case (genre, movieRow) => 136 | movieRow.getAs[String]("genres").toLowerCase.contains(genre.toLowerCase) 137 | } 138 | .map{ 139 | case (genre, movieRow) => (genre, (movieRow.getAs[Int]("mid"), movieRow.getAs[Double]("avg"))) 140 | } 141 | .groupByKey() 142 | .map{case (genre, items) => 143 | GenresRecommendation(genre, items.toList.sortWith(_._2>_._2).take(10).map(item=> Recommendation(item._1, item._2))) 144 | } 145 | .toDF() 146 | 147 | storeDFInMongoDB(genresTopMoviesDF, GENRES_TOP_MOVIES) 148 | 149 | spark.stop() 150 | } 151 | 152 | def storeDFInMongoDB(df: DataFrame, collection_name: String)(implicit mongoConfig: MongoConfig): Unit ={ 153 | df.write 154 | .option("uri", mongoConfig.uri) 155 | .option("collection", collection_name) 156 | .mode("overwrite") 157 | .format("com.mongodb.spark.sql") 158 | .save() 159 | } 160 | 161 | } 162 | -------------------------------------------------------------------------------- /Recommendation/StreamingRecommendation/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 3 | org.eclipse.jdt.core.compiler.compliance=1.8 4 | org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled 5 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 6 | org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore 7 | org.eclipse.jdt.core.compiler.release=disabled 8 | org.eclipse.jdt.core.compiler.source=1.8 9 | -------------------------------------------------------------------------------- /Recommendation/StreamingRecommendation/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /Recommendation/StreamingRecommendation/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | Recommendation 7 | com.z 8 | 1.0-SNAPSHOT 9 | 10 | 11 | 4.0.0 12 | StreamingRecommendation 13 | 14 | 15 | 16 | 17 | org.apache.spark 18 | spark-core_2.11 19 | 20 | 21 | org.apache.spark 22 | spark-sql_2.11 23 | 24 | 25 | org.apache.spark 26 | spark-streaming_2.11 27 | 28 | 29 | 30 | 34 | 35 | 36 | 37 | 38 | org.mongodb 39 | casbah-core_2.11 40 | ${casbah.version} 41 | 42 | 43 | 44 | org.mongodb.spark 45 | mongo-spark-connector_2.11 46 | ${mongodb-spark.version} 47 | 48 | 49 | 50 | 51 | redis.clients 52 | jedis 53 | ${redis.version} 54 | 55 | 56 | 57 | 58 | org.apache.kafka 59 | kafka-clients 60 | ${kafka.version} 61 | 62 | 63 | org.apache.spark 64 | spark-streaming-kafka-0-10_2.11 65 | ${spark.version} 66 | 67 | 68 | 69 | 70 | StreamingRecommendation 71 | 72 | 73 | org.apache.maven.plugins 74 | maven-assembly-plugin 75 | 76 | 77 | 78 | com.z.streaming.StreamingRecommender 79 | 80 | 81 | 82 | jar-with-dependencies 83 | 84 | 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- 
/Recommendation/StreamingRecommendation/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=warn, stdout 2 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 3 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 4 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %5p --- [%50t] %-80c(line:%5L) : %m%n -------------------------------------------------------------------------------- /Recommendation/StreamingRecommendation/src/main/scala/com/z/streaming/StreamingRecommender.scala: -------------------------------------------------------------------------------- 1 | package com.z.streaming 2 | 3 | import com.mongodb.casbah.commons.MongoDBObject 4 | import com.mongodb.casbah.{MongoClient, MongoClientURI} 5 | import org.apache.kafka.common.serialization.StringDeserializer 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.sql.SparkSession 8 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 9 | import org.apache.spark.streaming.{Seconds, StreamingContext} 10 | import redis.clients.jedis.Jedis 11 | 12 | // 定义连接助手对象,序列化 13 | object ConnHelper extends Serializable { 14 | lazy val jedis = new Jedis("192.168.0.241") 15 | lazy val mongoClient = MongoClient(MongoClientURI("mongodb://192.168.0.241:27017/recommender")) 16 | } 17 | 18 | case class MongoConfig(uri: String, db: String) 19 | 20 | // 定义一个基准推荐对象 21 | case class Recommendation(mid: Int, score: Double) 22 | 23 | // 定义基于预测评分的用户推荐列表 24 | case class UserRecs(uid: Int, recs: Seq[Recommendation]) 25 | 26 | // 定义基于LFM电影特征向量的电影相似度列表 27 | case class MovieRecs(mid: Int, recs: Seq[Recommendation]) 28 | 29 | object StreamingRecommender { 30 | 31 | val MAX_USER_RATINGS_NUM = 20 32 | val MAX_SIM_MOVIES_NUM = 20 33 | val MONGODB_STREAM_RECS_COLLECTION = "StreamRecs" 34 | val MONGODB_RATING_COLLECTION = "Rating" 35 | val MONGODB_MOVIE_RECS_COLLECTION = "MovieRecs" 36 | 37 | def main(args: Array[String]): Unit = { 38 | val config = Map( 39 | "mongo.uri" -> "mongodb://192.168.0.241:27017/recommender", 40 | "mongo.db" -> "recommender", 41 | "kafka.topic" -> "recommender" 42 | ) 43 | 44 | // 创建一个sparkConf 45 | val warehouseLocation : String = "hdfs://node1:9000/user/hive/warehouse" 46 | val sparkConf = new SparkConf() 47 | .setAppName("推荐系统 - 实时推荐") 48 | .setMaster("spark://node1:7077,node3:7077") 49 | .setJars(List("G:\\JavaEE\\Hadoop-Spark\\RecommendationSystem\\Recommendation\\StreamingRecommendation\\target\\" + 50 | "StreamingRecommendation-jar-with-dependencies.jar")) 51 | .setIfMissing("spark.driver.host", "192.168.0.28") 52 | .set("spark.num.executors", "3") 53 | .set("spark.executor.cores", "2") 54 | .set("spark.executor.memory", "1800m") 55 | .set("spark.sql.warehouse.dir", warehouseLocation) 56 | 57 | // 创建一个SparkSession 58 | val spark = SparkSession.builder().config(sparkConf).getOrCreate() 59 | 60 | // 拿到streaming context 61 | val sc = spark.sparkContext 62 | val ssc = new StreamingContext(sc, Seconds(2)) // batch duration 63 | 64 | import spark.implicits._ 65 | implicit val mongoConfig = MongoConfig(config("mongo.uri"), config("mongo.db")) 66 | 67 | // 加载电影相似度矩阵数据,把它广播出去 68 | val simMovieMatrix = spark.read 69 | .option("uri", mongoConfig.uri) 70 | .option("collection", MONGODB_MOVIE_RECS_COLLECTION) 71 | .format("com.mongodb.spark.sql") 72 | .load() 73 | .as[MovieRecs] 74 | .rdd 75 | .map{ movieRecs => // 为了查询相似度方便,转换成map 76 | (movieRecs.mid, movieRecs.recs.map( 
x=> (x.mid, x.score) ).toMap ) 77 | }.collectAsMap() 78 | 79 | val simMovieMatrixBroadCast = sc.broadcast(simMovieMatrix) 80 | 81 | // 定义kafka连接参数: 82 | val kafkaParam = Map( 83 | "bootstrap.servers" -> "node1:9092,node2:9092,node3:9092", 84 | "key.deserializer" -> classOf[StringDeserializer], 85 | "value.deserializer" -> classOf[StringDeserializer], 86 | "auto.offset.reset" -> "latest", 87 | "group.id" -> "recommender" 88 | // "enable.auto.commit" -> false, 89 | // "receive.buffer.bytes" -> 65536 90 | ) 91 | // 通过kafka创建一个DStream 92 | val kafkaStream = KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent, 93 | ConsumerStrategies.Subscribe[String, String](Array(config("kafka.topic")), kafkaParam) 94 | ) 95 | 96 | // 把原始数据UID|MID|SCORE|TIMESTAMP 转换成评分流 97 | val ratingStream = kafkaStream.map { 98 | msg => 99 | // """|""":scala中"""三个引号内可以直接敲回车替代\n,stripMargin取空格链接字符默认是| 100 | val attr = msg.value().split("\\|") 101 | (attr(0).toInt, attr(1).toInt, attr(2).toDouble, attr(3).toInt) 102 | } 103 | 104 | ratingStream.print() 105 | 106 | ratingStream.foreachRDD { 107 | rdd => rdd.foreach { 108 | case (uid, mid, score, timestamp) => { 109 | // 从redis里获取用户最近的K次评分:Array[(mid, score)] 110 | val userRecentlyRatings = getUserRecentlyRating(MAX_USER_RATINGS_NUM, uid, ConnHelper.jedis) 111 | 112 | // 从相似度矩阵中获取备选列表,Array[mid] 113 | val candidateMovies = getTopSimMovies(MAX_SIM_MOVIES_NUM, mid, uid, simMovieMatrixBroadCast.value) 114 | 115 | // 计算备选元素与用户最近评分物品相似度+加强减弱因子,Array[(mid, score)] 116 | val streamRecs = computeMovieScores(candidateMovies, userRecentlyRatings, simMovieMatrixBroadCast.value) 117 | 118 | // 数据保存到mongodb 119 | saveDataToMongoDB(uid, streamRecs) 120 | } 121 | } 122 | } 123 | // 开始接收和处理数据 124 | ssc.start() 125 | 126 | println(">>>>>>>>>>>>>>> streaming started!") 127 | 128 | ssc.awaitTermination() 129 | 130 | } 131 | 132 | /** 133 | * 从redis获取用户最近k次评分 134 | * @param num 数量 135 | * @param uid 用户id 136 | * @param jedis 137 | * @return 138 | */ 139 | def getUserRecentlyRating(num: Int, uid: Int, jedis: Jedis): Array[(Int, Double)] = { 140 | // java list to scala.BufferList 141 | import scala.collection.JavaConversions._ 142 | // key{uid:UID}, value{MID:SCORE} 143 | jedis.lrange("uid:" + uid, 0, num - 1) 144 | .map { 145 | item => 146 | val attr = item.split("\\:") 147 | (attr(0).trim.toInt, attr(1).trim.toDouble) 148 | } 149 | .toArray 150 | } 151 | 152 | /** 153 | * 相似从相似度矩阵中获取备选列表:过滤已评分的 154 | * @param num 相似电影的数量 155 | * @param mid 当前电影ID 156 | * @param uid 当前评分用户ID 157 | * @param simMovies 相似度矩阵 158 | * @return 159 | */ 160 | def getTopSimMovies(num: Int, mid: Int, uid: Int, simMovies: scala.collection.Map[Int, scala.collection.immutable.Map[Int, Double]]) 161 | (implicit mongoConfig: MongoConfig): Array[Int] = { 162 | val allSimMovies = simMovies(mid).toArray 163 | 164 | val ratingExist = ConnHelper.mongoClient(mongoConfig.db)(MONGODB_RATING_COLLECTION) 165 | .find(MongoDBObject("uid" -> uid)) 166 | .map { 167 | item => item.get("mid").toString.toInt 168 | }.toArray 169 | 170 | allSimMovies.filter(x => !ratingExist.contains(x._1)) 171 | .sortWith(_._2 > _._2) 172 | .take(num) 173 | .map(x => x._1) 174 | } 175 | 176 | /** 177 | * 计算备选元素与用户最近评分物品相似度+加强减弱因子:核心算法 178 | * @param candidateMovies 179 | * @param userRecentlyRatings 180 | * @param simMovies 181 | * @return 182 | */ 183 | def computeMovieScores(candidateMovies: Array[Int], userRecentlyRatings: Array[(Int, Double)], 184 | simMovies: scala.collection.Map[Int, scala.collection.immutable.Map[Int, 

  /**
   * Get candidate movies from the similarity matrix, filtering out movies the user has already rated
   * @param num       number of similar movies to keep
   * @param mid       current movie ID
   * @param uid       current user ID
   * @param simMovies the similarity matrix
   * @return
   */
  def getTopSimMovies(num: Int, mid: Int, uid: Int, simMovies: scala.collection.Map[Int, scala.collection.immutable.Map[Int, Double]])
                     (implicit mongoConfig: MongoConfig): Array[Int] = {
    val allSimMovies = simMovies(mid).toArray

    val ratingExist = ConnHelper.mongoClient(mongoConfig.db)(MONGODB_RATING_COLLECTION)
      .find(MongoDBObject("uid" -> uid))
      .map {
        item => item.get("mid").toString.toInt
      }.toArray

    allSimMovies.filter(x => !ratingExist.contains(x._1))
      .sortWith(_._2 > _._2)
      .take(num)
      .map(x => x._1)
  }

  /**
   * Score each candidate against the user's recent ratings, with boost/penalty factors: the core algorithm
   * @param candidateMovies
   * @param userRecentlyRatings
   * @param simMovies
   * @return
   */
  def computeMovieScores(candidateMovies: Array[Int], userRecentlyRatings: Array[(Int, Double)],
                         simMovies: scala.collection.Map[Int, scala.collection.immutable.Map[Int, Double]]): Array[(Int, Double)] = {
    val scores = scala.collection.mutable.ArrayBuffer[(Int, Double)]()
    // boost / penalty counters
    val increMap = scala.collection.mutable.HashMap[Int, Int]()
    val decreMap = scala.collection.mutable.HashMap[Int, Int]()

    for (candidateMovie <- candidateMovies; userRecentlyRating <- userRecentlyRatings) {
      val simScore = getMoviesSimScore(candidateMovie, userRecentlyRating._1, simMovies)

      if (simScore > 0.7) {
        scores += ((candidateMovie, simScore * userRecentlyRating._2))
        if (userRecentlyRating._2 > 3) {
          increMap(candidateMovie) = increMap.getOrElse(candidateMovie, 0) + 1
        } else {
          decreMap(candidateMovie) = decreMap.getOrElse(candidateMovie, 0) + 1
        }
      }
    }

    scores.groupBy(_._1).map {
      case (mid, scoreList) =>
        (mid, scoreList.map(_._2).sum / scoreList.length + log(increMap.getOrElse(mid, 1)) - log(decreMap.getOrElse(mid, 1)))
    }.toArray.sortWith(_._2 > _._2)
  }
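  // In formula form, the aggregation above computes, for each candidate movie q:
  //
  //   E(u,q) = (1 / |RK|) * sum over r in RK of sim(q,r) * R(r)
  //            + lg(incount(q)) - lg(recount(q))
  //
  // where RK is the set of the user's recent ratings with sim(q,r) > 0.7, R(r) is the
  // rating value, incount/recount are how many of those recent ratings lie above/below
  // 3, and lg is the base-10 logarithm implemented by log() below. A high-rated similar
  // movie therefore boosts the candidate, a low-rated one weakens it.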

  /**
   * Look up the similarity between two movies in the precomputed similarity matrix
   * (the broadcast Map-of-Maps structure makes this lookup cheap)
   * @param mid1
   * @param mid2
   * @param simMovies
   * @return
   */
  def getMoviesSimScore(mid1: Int, mid2: Int, simMovies: scala.collection.Map[Int,
    scala.collection.immutable.Map[Int, Double]]): Double = {

    simMovies.get(mid1) match {
      case Some(sims) => sims.get(mid2) match {
        case Some(score) => score
        case None => 0.0
      }
      case None => 0.0
    }
  }

  /**
   * Base-10 logarithm
   * @param m
   * @return
   */
  def log(m: Int): Double = {
    val N = 10
    math.log(m) / math.log(N)
  }

  def saveDataToMongoDB(uid: Int, streamRecs: Array[(Int, Double)])(implicit mongoConfig: MongoConfig): Unit = {
    // Handle to the StreamRecs collection
    val streamRecsCollection = ConnHelper.mongoClient(mongoConfig.db)(MONGODB_STREAM_RECS_COLLECTION)

    // If the collection already holds data for this uid, remove it
    streamRecsCollection.findAndRemove(MongoDBObject("uid" -> uid))
    // Insert the streamRecs data
    streamRecsCollection.insert(MongoDBObject("uid" -> uid,
      "recs" -> streamRecs.map(x => MongoDBObject("mid" -> x._1, "score" -> x._2))))
  }

}
--------------------------------------------------------------------------------
/Recommendation/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<!-- The XML tags of this file were stripped in the dump; the element nesting below is
     reconstructed from the surviving values and standard Maven conventions. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>RecommendationSystem</artifactId>
        <groupId>com.z</groupId>
        <version>1.0-SNAPSHOT</version>
    </parent>

    <modelVersion>4.0.0</modelVersion>
    <artifactId>Recommendation</artifactId>
    <packaging>pom</packaging>

    <modules>
        <module>StatisticsRecommendation</module>
        <module>OfflineRecommendation</module>
        <module>StreamingRecommendation</module>
        <module>ContentRecommendation</module>
        <module>KafkaStream</module>
        <module>ItemCFRecommendation</module>
    </modules>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-graphx_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- version and executions are inherited from the parent's pluginManagement -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<!-- As above, the XML tags were stripped in the dump and are reconstructed here. Only
     log4j.version, slf4j.version, log4j-slf4j.version and spark.version can be confirmed
     from their usage elsewhere in the build; the other property names are plausible
     guesses matched to the surviving version values. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.z</groupId>
    <artifactId>RecommendationSystem</artifactId>
    <packaging>pom</packaging>
    <version>1.0-SNAPSHOT</version>

    <modules>
        <module>Recommendation</module>
    </modules>

    <properties>
        <log4j.version>1.2.17</log4j.version>
        <slf4j.version>1.7.22</slf4j.version>
        <log4j-slf4j.version>2.8.2</log4j-slf4j.version>
        <!-- property names below are reconstructed guesses; the values are from the source -->
        <mongodb-spark.version>2.0.0</mongodb-spark.version>
        <casbah.version>3.1.1</casbah.version>
        <elasticsearch-spark.version>6.3.1</elasticsearch-spark.version>
        <elasticsearch.version>6.3.1</elasticsearch.version>
        <redis.version>2.9.0</redis.version>
        <kafka.version>0.11.0.2</kafka.version>
        <spark.version>2.1.1</spark.version>
        <scala.version>2.11.8</scala.version>
        <jblas.version>1.2.1</jblas.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>jcl-over-slf4j</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>${log4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-to-slf4j</artifactId>
            <version>${log4j-slf4j.version}</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.6.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>utf8</encoding>
                </configuration>
            </plugin>
        </plugins>

        <pluginManagement>
            <plugins>
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-assembly-plugin</artifactId>
                    <version>3.0.0</version>
                    <executions>
                        <execution>
                            <id>make-assembly</id>
                            <phase>package</phase>
                            <goals>
                                <goal>single</goal>
                            </goals>
                        </execution>
                    </executions>
                </plugin>
                <plugin>
                    <groupId>net.alchim31.maven</groupId>
                    <artifactId>scala-maven-plugin</artifactId>
                    <version>3.2.2</version>
                    <executions>
                        <execution>
                            <goals>
                                <goal>compile</goal>
                                <goal>testCompile</goal>
                            </goals>
                        </execution>
                    </executions>
                </plugin>
            </plugins>
        </pluginManagement>
    </build>
</project>
--------------------------------------------------------------------------------
/readme.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wolf-song-ml/RecommendationSystem/6a459d6472131d697d6cf450a249caa3d6a465e4/readme.docx
--------------------------------------------------------------------------------
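A note on building (illustrative, based on the assembly configuration above and the
jar path referenced by setJars in StreamingRecommender): packaging the project should
produce the fat jars that each driver ships to the cluster, assuming the submodules
opt in to the managed maven-assembly-plugin with the jar-with-dependencies descriptor
(the submodule poms are not shown in this section).

```bash
# From the repository root: compile all modules and build the assembly jars.
# The assembly plugin's "single" goal is bound to the package phase, so each
# module that activates it emits <module>-jar-with-dependencies.jar under target/.
mvn clean package -DskipTests
```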