├── result
├── userFea
│ ├── _SUCCESS
│ ├── ._SUCCESS.crc
│ ├── .part-00000.crc
│ ├── .part-00001.crc
│ ├── .part-00002.crc
│ ├── .part-00003.crc
│ ├── part-00003
│ ├── part-00002
│ ├── part-00000
│ └── part-00001
├── productFea
│ ├── _SUCCESS
│ ├── ._SUCCESS.crc
│ ├── .part-00000.crc
│ ├── .part-00001.crc
│ ├── .part-00002.crc
│ ├── .part-00003.crc
│ ├── part-00003
│ ├── part-00000
│ ├── part-00001
│ └── part-00002
└── ratesAndPreds
│ ├── _SUCCESS
│ ├── ._SUCCESS.crc
│ ├── .part-00000.crc
│ └── part-00000
├── src
└── main
│ └── java
│ ├── log4j.properties
│ └── com
│ └── hyr
│ └── sparkml
│ └── als
│ ├── JavaALSExampleByMl.java
│ ├── JavaALSExampleByMlLib.java
│ └── SparkALSByStreaming.java
├── LICENSE
├── pom.xml
├── README.md
└── data
├── streaming_sample_movielens_ratings.txt
└── sample_movielens_ratings.txt
/result/userFea/_SUCCESS:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/result/productFea/_SUCCESS:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/result/ratesAndPreds/_SUCCESS:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/result/userFea/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc
--------------------------------------------------------------------------------
/result/productFea/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc
--------------------------------------------------------------------------------
/result/ratesAndPreds/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc
--------------------------------------------------------------------------------
/result/userFea/.part-00000.crc:
--------------------------------------------------------------------------------
1 | crc 2qAU
--------------------------------------------------------------------------------
/result/userFea/.part-00001.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huangyueranbbc/Spark_ALS/HEAD/result/userFea/.part-00001.crc
--------------------------------------------------------------------------------
/result/userFea/.part-00002.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huangyueranbbc/Spark_ALS/HEAD/result/userFea/.part-00002.crc
--------------------------------------------------------------------------------
/result/userFea/.part-00003.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huangyueranbbc/Spark_ALS/HEAD/result/userFea/.part-00003.crc
--------------------------------------------------------------------------------
/result/productFea/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huangyueranbbc/Spark_ALS/HEAD/result/productFea/.part-00000.crc
--------------------------------------------------------------------------------
/result/productFea/.part-00001.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huangyueranbbc/Spark_ALS/HEAD/result/productFea/.part-00001.crc
--------------------------------------------------------------------------------
/result/productFea/.part-00002.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huangyueranbbc/Spark_ALS/HEAD/result/productFea/.part-00002.crc
--------------------------------------------------------------------------------
/result/productFea/.part-00003.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huangyueranbbc/Spark_ALS/HEAD/result/productFea/.part-00003.crc
--------------------------------------------------------------------------------
/result/ratesAndPreds/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huangyueranbbc/Spark_ALS/HEAD/result/ratesAndPreds/.part-00000.crc
--------------------------------------------------------------------------------
/result/userFea/part-00003:
--------------------------------------------------------------------------------
1 | (3,[D@5ebb4643)
2 | (7,[D@782d0c98)
3 | (11,[D@eb25d6f)
4 | (15,[D@2901bea7)
5 | (19,[D@15bf197)
6 | (23,[D@52f2283f)
7 | (27,[D@2a4df8d5)
8 |
--------------------------------------------------------------------------------
/result/userFea/part-00002:
--------------------------------------------------------------------------------
1 | (2,[D@19beca29)
2 | (6,[D@27fc180c)
3 | (10,[D@318dbbba)
4 | (14,[D@7ac9edd1)
5 | (18,[D@6f7f59a5)
6 | (22,[D@3373398e)
7 | (26,[D@8fb9e68)
8 |
--------------------------------------------------------------------------------
/result/userFea/part-00000:
--------------------------------------------------------------------------------
1 | (0,[D@7123fae9)
2 | (4,[D@715dc3a4)
3 | (8,[D@461a28d1)
4 | (12,[D@1c712469)
5 | (16,[D@2cc6b8f3)
6 | (20,[D@20de61ff)
7 | (24,[D@4a074445)
8 | (28,[D@1997fc83)
9 |
--------------------------------------------------------------------------------
/result/userFea/part-00001:
--------------------------------------------------------------------------------
1 | (1,[D@5b8ddb29)
2 | (5,[D@29ffe34c)
3 | (9,[D@407a1087)
4 | (13,[D@7ae5a6bd)
5 | (17,[D@480830a5)
6 | (21,[D@66c4a672)
7 | (25,[D@6e726688)
8 | (29,[D@1e0c60b3)
9 |
--------------------------------------------------------------------------------
/src/main/java/log4j.properties:
--------------------------------------------------------------------------------
1 | log4j.rootCategory=info, stdout
2 | log4j.rootLogger=info, stdout
3 |
4 | ### stdout ###
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=[%t] [%p] [%d{yyyy-MM-dd HH:mm:ss}] %m %n
--------------------------------------------------------------------------------
/result/productFea/part-00003:
--------------------------------------------------------------------------------
1 | (3,[D@71373d6c)
2 | (7,[D@29dcc70e)
3 | (11,[D@79efd4bb)
4 | (15,[D@6d5c457)
5 | (19,[D@258b8f26)
6 | (23,[D@7a79044c)
7 | (27,[D@f8e370a)
8 | (31,[D@4c9c6244)
9 | (35,[D@b382b29)
10 | (39,[D@250cb76b)
11 | (43,[D@637c37bc)
12 | (47,[D@383e70bf)
13 | (51,[D@50cde365)
14 | (55,[D@78c773d)
15 | (59,[D@2e813871)
16 | (63,[D@624327a)
17 | (67,[D@1c45b4cc)
18 | (71,[D@3d0495f)
19 | (75,[D@260630e8)
20 | (79,[D@6196a338)
21 | (83,[D@4c627b63)
22 | (87,[D@418ee0f1)
23 | (91,[D@5ddce9da)
24 | (95,[D@20e59db8)
25 | (99,[D@1c999db)
26 |
--------------------------------------------------------------------------------
/result/productFea/part-00000:
--------------------------------------------------------------------------------
1 | (0,[D@3e983005)
2 | (4,[D@1b48df48)
3 | (8,[D@59a41528)
4 | (12,[D@528af6e3)
5 | (16,[D@63d7e588)
6 | (20,[D@2a8d95f5)
7 | (24,[D@7bb2619b)
8 | (28,[D@675f5fca)
9 | (32,[D@11c81f94)
10 | (36,[D@40ccfb7c)
11 | (40,[D@6a0b14e8)
12 | (44,[D@4d3856e9)
13 | (48,[D@50fd0e9a)
14 | (52,[D@5a019c81)
15 | (56,[D@65f66398)
16 | (60,[D@6a9d8d02)
17 | (64,[D@366be64)
18 | (68,[D@2dabae40)
19 | (72,[D@29e6c51e)
20 | (76,[D@272afb7e)
21 | (80,[D@5f3549f4)
22 | (84,[D@4a756139)
23 | (88,[D@653d5eee)
24 | (92,[D@5ac10b9f)
25 | (96,[D@1df51022)
26 |
--------------------------------------------------------------------------------
/result/productFea/part-00001:
--------------------------------------------------------------------------------
1 | (1,[D@3f4cb394)
2 | (5,[D@483081e7)
3 | (9,[D@71a94fe5)
4 | (13,[D@34bef33f)
5 | (17,[D@781fb412)
6 | (21,[D@65e34723)
7 | (25,[D@3e76d64c)
8 | (29,[D@4a783f0e)
9 | (33,[D@568b872b)
10 | (37,[D@5d985c0b)
11 | (41,[D@63ba7621)
12 | (45,[D@46da5ce3)
13 | (49,[D@231ec843)
14 | (53,[D@51b08028)
15 | (57,[D@5d4f3a25)
16 | (61,[D@4c9792b)
17 | (65,[D@3ff391a1)
18 | (69,[D@cebf53c)
19 | (73,[D@3e5503b2)
20 | (77,[D@698dfb02)
21 | (81,[D@21337be0)
22 | (85,[D@510718ec)
23 | (89,[D@7180b853)
24 | (93,[D@1ccb6be2)
25 | (97,[D@1971e387)
26 |
--------------------------------------------------------------------------------
/result/productFea/part-00002:
--------------------------------------------------------------------------------
1 | (2,[D@281e497c)
2 | (6,[D@b83900d)
3 | (10,[D@5fb5d86f)
4 | (14,[D@6fa17b04)
5 | (18,[D@42dffb4)
6 | (22,[D@6627f213)
7 | (26,[D@78fb2cc8)
8 | (30,[D@965bfe5)
9 | (34,[D@3f09e343)
10 | (38,[D@49aebe61)
11 | (42,[D@289f63fc)
12 | (46,[D@77a4fb27)
13 | (50,[D@6bb446be)
14 | (54,[D@5cde2a10)
15 | (58,[D@168f4881)
16 | (62,[D@6673c6d9)
17 | (66,[D@72bf1fb2)
18 | (70,[D@211ad14c)
19 | (74,[D@63a56861)
20 | (78,[D@149e7995)
21 | (82,[D@1f37d142)
22 | (86,[D@24061445)
23 | (90,[D@23b2440e)
24 | (94,[D@21c756a3)
25 | (98,[D@6eb86bae)
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 huangyueranbbc
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.hyr.spark.svm</groupId>
    <artifactId>Spark_SVM_DEMO</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.10</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.10</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.10</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.10</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka_2.10</artifactId>
            <version>1.6.3</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.12</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.12</version>
        </dependency>
    </dependencies>
</project>
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spark-ALS [](https://github.com/huangyueranbbc/Spark_ALS) [](http://spark.apache.org/docs/latest/api.html) [](http://spark.apache.org/) [](https://github.com/huangyueranbbc/SparkDemo)
2 | 简介
3 |
  4 | ALS是alternating least squares的缩写 , 意为交替最小二乘法;而ALS-WR是alternating-least-squares with weighted-λ -regularization的缩写,意为加权正则化交替最小二乘法。该方法常用于基于矩阵分解的推荐系统中。例如:将用户(user)对商品(item)的评分矩阵分解为两个矩阵:一个是用户对商品隐含特征的偏好矩阵,另一个是商品所包含的隐含特征的矩阵。在这个矩阵分解的过程中,评分缺失项得到了填充,也就是说我们可以基于这个填充的评分来给用户做商品推荐了。
  5 | ALS is short for alternating least squares, and ALS-WR is short for alternating least squares with weighted-λ-regularization. This method is commonly used in recommender systems based on matrix factorization. For example, the user-item rating matrix is factored into two matrices: one describing each user's preference for a set of latent item features, and one describing how strongly each item exhibits those latent features. During this factorization the missing entries of the rating matrix are filled in, so the filled-in scores can then be used to recommend items to users.
6 |
7 | ALS-WR算法,简单地说就是:
8 | (数据格式为:userId, itemId, rating, timestamp )
9 | 1 对每个userId随机初始化N(10)个factor值,由这些值影响userId的权重。
10 | 2 对每个itemId也随机初始化N(10)个factor值。
11 | 3 固定userId,从userFactors矩阵和rating矩阵中分解出itemFactors矩阵。即[Item Factors Matrix] = [User Factors Matrix]^-1 * [Rating Matrix].
12 | 4 固定itemId,从itemFactors矩阵和rating矩阵中分解出userFactors矩阵。即[User Factors Matrix] = [Item Factors Matrix]^-1 * [Rating Matrix].
13 | 5 重复迭代第3,第4步,最后可以收敛到稳定的userFactors和itemFactors。
14 | 6 对itemId进行推断就为userFactors * itemId = rating value;对userId进行推断就为itemFactors * userId = rating value。
15 |
 16 | # SparkALSByStreaming.java
17 | 基于Hadoop、Flume、Kafka、spark-streaming、logback、商城系统的实时推荐系统DEMO
18 | Real time recommendation system DEMO based on Hadoop, Flume, Kafka, spark-streaming, logback and mall system
19 | 商城系统采集的数据集格式 Data Format:
20 | 用户ID,商品ID,用户行为评分,时间戳
21 | UserID,ItemId,Rating,TimeStamp
22 | 53,1286513,9,1508221762
23 | 53,1172348420,9,1508221762
24 | 53,1179495514,12,1508221762
25 | 53,1184890730,3,1508221762
26 | 53,1210793742,159,1508221762
27 | 53,1215837445,9,1508221762
28 |
29 | Kafka Command:
30 |
31 | hadoop dfs -mkdir /spark-als/model
32 |
33 | hadoop dfs -mkdir /flume/logs
34 |
35 | kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic RECOMMEND_TOPIC
36 |
37 | kafka-console-producer.sh --broker-list 192.168.0.193:9092 --topic RECOMMEND_TOPIC < /data/streaming_sample_movielens_ratings.txt
38 |
--------------------------------------------------------------------------------
/src/main/java/com/hyr/sparkml/als/JavaALSExampleByMl.java:
--------------------------------------------------------------------------------
1 | package com.hyr.sparkml.als;
2 |
3 | import org.apache.spark.SparkConf;
4 | import org.apache.spark.api.java.JavaRDD;
5 | import org.apache.spark.api.java.JavaSparkContext;
6 | import org.apache.spark.api.java.function.Function;
7 | import org.apache.spark.ml.evaluation.RegressionEvaluator;
8 | import org.apache.spark.ml.recommendation.ALS;
9 | import org.apache.spark.ml.recommendation.ALSModel;
10 | import org.apache.spark.sql.Dataset;
11 | import org.apache.spark.sql.Row;
12 | import org.apache.spark.sql.SQLContext;
13 | import org.apache.spark.sql.types.DataTypes;
14 | import org.slf4j.Logger;
15 | import org.slf4j.LoggerFactory;
16 |
17 | import java.io.Serializable;
18 |
19 | /**
20 | * @author huangyueran
21 | * @category ALS-WR
22 | */
23 | public class JavaALSExampleByMl {
24 |
25 | private static final Logger log = LoggerFactory.getLogger(JavaALSExampleByMl.class);
26 |
27 | public static class Rating implements Serializable {
28 | // 0::2::3::1424380312
29 | private int userId; // 0
30 | private int movieId; // 2
31 | private float rating; // 3
32 | private long timestamp; // 1424380312
33 |
34 | public Rating() {
35 | }
36 |
37 | public Rating(int userId, int movieId, float rating, long timestamp) {
38 | this.userId = userId;
39 | this.movieId = movieId;
40 | this.rating = rating;
41 | this.timestamp = timestamp;
42 | }
43 |
44 | public int getUserId() {
45 | return userId;
46 | }
47 |
48 | public int getMovieId() {
49 | return movieId;
50 | }
51 |
52 | public float getRating() {
53 | return rating;
54 | }
55 |
56 | public long getTimestamp() {
57 | return timestamp;
58 | }
59 |
60 | public static Rating parseRating(String str) {
61 | String[] fields = str.split("::");
62 | if (fields.length != 4) {
63 | throw new IllegalArgumentException("Each line must contain 4 fields");
64 | }
65 | int userId = Integer.parseInt(fields[0]);
66 | int movieId = Integer.parseInt(fields[1]);
67 | float rating = Float.parseFloat(fields[2]);
68 | long timestamp = Long.parseLong(fields[3]);
69 | return new Rating(userId, movieId, rating, timestamp);
70 | }
71 | }
72 |
73 | public static void main(String[] args) {
74 | SparkConf conf = new SparkConf().setAppName("JavaALSExample").setMaster("local");
75 | JavaSparkContext jsc = new JavaSparkContext(conf);
76 | SQLContext sqlContext = new SQLContext(jsc);
77 |
78 | JavaRDD ratingsRDD = jsc.textFile("data/sample_movielens_ratings.txt")
79 | .map(new Function() {
80 | public Rating call(String str) {
81 | return Rating.parseRating(str);
82 | }
83 | });
84 | Dataset ratings = sqlContext.createDataFrame(ratingsRDD, Rating.class);
85 | Dataset[] splits = ratings.randomSplit(new double[]{0.8, 0.2}); // //对数据进行分割,80%为训练样例,剩下的为测试样例。
86 | Dataset training = splits[0];
87 | Dataset test = splits[1];
88 |
89 | // Build the recommendation model using ALS on the training data
90 | ALS als = new ALS().setMaxIter(5) // 设置迭代次数
91 | .setRegParam(0.01) // //正则化参数,使每次迭代平滑一些,此数据集取0.1好像错误率低一些。
92 | .setUserCol("userId").setItemCol("movieId")
93 | .setRatingCol("rating");
94 | ALSModel model = als.fit(training); // //调用算法开始训练
95 |
96 |
97 | Dataset itemFactors = model.itemFactors();
98 | itemFactors.show(1500);
99 | Dataset userFactors = model.userFactors();
100 | userFactors.show();
101 |
102 | // Evaluate the model by computing the RMSE on the test data
103 | Dataset rawPredictions = model.transform(test); //对测试数据进行预测
104 | Dataset predictions = rawPredictions
105 | .withColumn("rating", rawPredictions.col("rating").cast(DataTypes.DoubleType))
106 | .withColumn("prediction", rawPredictions.col("prediction").cast(DataTypes.DoubleType));
107 |
108 | RegressionEvaluator evaluator = new RegressionEvaluator().setMetricName("rmse").setLabelCol("rating")
109 | .setPredictionCol("prediction");
110 | Double rmse = evaluator.evaluate(predictions);
111 | log.info("Root-mean-square error = {} ", rmse);
112 |
113 | jsc.stop();
114 | }
115 | }
116 |
--------------------------------------------------------------------------------
/src/main/java/com/hyr/sparkml/als/JavaALSExampleByMlLib.java:
--------------------------------------------------------------------------------
1 | package com.hyr.sparkml.als;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.util.Arrays;
6 |
7 | import org.apache.commons.io.FileUtils;
8 | import org.apache.commons.lang3.StringUtils;
9 | import org.apache.spark.SparkConf;
10 | import org.apache.spark.api.java.JavaDoubleRDD;
11 | import org.apache.spark.api.java.JavaPairRDD;
12 | import org.apache.spark.api.java.JavaRDD;
13 | import org.apache.spark.api.java.JavaSparkContext;
14 | import org.apache.spark.api.java.function.Function;
15 | import org.apache.spark.mllib.recommendation.ALS;
16 | import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
17 | import org.apache.spark.mllib.recommendation.Rating;
18 |
19 | import org.slf4j.Logger;
20 | import org.slf4j.LoggerFactory;
21 | import scala.Tuple2;
22 |
23 | /**
24 | * @category ALS
25 | * @author huangyueran
26 | *
27 | */
28 | public class JavaALSExampleByMlLib {
29 |
30 | private static final Logger log = LoggerFactory.getLogger(JavaALSExampleByMlLib.class);
31 |
32 | public static void main(String[] args) {
33 | SparkConf conf = new SparkConf().setAppName("JavaALSExample").setMaster("local[4]");
34 | JavaSparkContext jsc = new JavaSparkContext(conf);
35 |
36 | JavaRDD data = jsc.textFile("data/sample_movielens_ratings.txt");
37 |
38 | JavaRDD ratings = data.map(new Function() {
39 | public Rating call(String s) {
40 | String[] sarray = StringUtils.split(StringUtils.trim(s), "::");
41 | return new Rating(Integer.parseInt(sarray[0]), Integer.parseInt(sarray[1]),
42 | Double.parseDouble(sarray[2]));
43 | }
44 | });
45 |
46 | // Build the recommendation model using ALS
47 | int rank = 10;
48 | int numIterations = 6;
49 | MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), rank, numIterations, 0.01);
50 |
51 | // Evaluate the model on rating data
52 | JavaRDD> userProducts = ratings.map(new Function>() {
53 | public Tuple2