├── .gitignore
├── LICENSE
├── README.md
├── doc
│   ├── doc.md
│   ├── kafka
│   │   └── generator.sh
│   ├── 功能.png
│   ├── 系统整体架构.png
│   └── 设计思路.png
├── pom.xml
└── src
    ├── main
    │   ├── java
    │   │   └── com
    │   │       └── ams
    │   │           └── recommend
    │   │               ├── client
    │   │               │   ├── HBaseClient.java
    │   │               │   ├── MySQLClient.java
    │   │               │   └── RedisClient.java
    │   │               ├── common
    │   │               │   └── pojo
    │   │               │       ├── ArticlePortrait.java
    │   │               │       ├── HotArticle.java
    │   │               │       ├── Log.java
    │   │               │       ├── SpiderArticle.java
    │   │               │       ├── User.java
    │   │               │       └── WindowedArticle.java
    │   │               ├── nearline
    │   │               │   └── task
    │   │               │       ├── HistoryTask.java
    │   │               │       ├── HotArticleTask.java
    │   │               │       ├── LogTask.java
    │   │               │       ├── PortraitTask.java
    │   │               │       ├── UserInterestTask.java
    │   │               │       └── tfidf
    │   │               │           ├── SpiderTask.java
    │   │               │           ├── TFIDFMapFunction.java
    │   │               │           └── TFIDFSink.java
    │   │               ├── offline
    │   │               │   ├── ArticleCoeff.java
    │   │               │   ├── ItemCfCoeff.java
    │   │               │   └── SchedulerJob.java
    │   │               └── util
    │   │                   ├── Constants.java
    │   │                   ├── LogUtil.java
    │   │                   ├── Property.java
    │   │                   └── WordTokenizerUtil.java
    │   └── resources
    │       ├── config.properties
    │       └── log4j.properties
    └── test
        └── com
            └── ams
                └── recommend
                    ├── client
                    │   ├── HBaseClientTest.java
                    │   └── RedisClientTest.java
                    └── util
                        ├── LogUtilTest.java
                        ├── NotionalTokenizer.java
                        └── TFTest.java
/.gitignore:
--------------------------------------------------------------------------------
1 | .gradle
2 | /build/
3 | !gradle/wrapper/gradle-wrapper.jar
4 |
5 | ### STS ###
6 | .apt_generated
7 | .classpath
8 | .factorypath
9 | .project
10 | .settings
11 | .springBeans
12 |
13 | ### IntelliJ IDEA ###
14 | .idea
15 | *.iws
16 | *.iml
17 | *.ipr
18 |
19 | ### NetBeans ###
20 | nbproject/private/
21 | build/
22 | nbbuild/
23 | dist/
24 | nbdist/
25 | .nb-gradle/
26 | /bin/
27 |
28 | /logs/
29 | /target/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AMS-recommendation-system
2 | AMS real-time recommendation system
3 |
4 | ## AMS Design Approach
5 |
6 | ![AMS design approach](doc/设计思路.png)
7 |
8 | ## System Architecture
9 |
10 | ![Overall system architecture](doc/系统整体架构.png)
11 |
12 | The system's data source is the web tier, which supplies user behavior logs, unprocessed article content, and other information. Apache Kafka, a high-throughput message queue, delivers this data to the downstream recommendation engine, which decouples the business system from the recommendation system and smooths out traffic peaks, improving resilience under load. The real-time engine is built on Apache Flink, a distributed compute engine whose high throughput and low latency support jobs such as portrait construction, log ETL, and real-time hot-article computation. The offline engine currently uses multiple concurrent threads to compute similar articles, similar users, and so on. Finally, the engine's results are written to HBase, whose high-throughput real-time reads and writes match the system's latency requirements.
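Each record on the Kafka `log` topic is a comma-separated line, `userId,articleId,timestamp,action`, produced by `doc/kafka/generator.sh`. Below is a minimal sketch of the ingestion path; the topic name, broker address, and record format come from this repository, while the consumer group id and the field projection are purely illustrative:

```java
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;

import java.util.Properties;

public class IngestSketch {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "master:9092"); // broker used by generator.sh
        props.setProperty("group.id", "readme-sketch");        // illustrative consumer group

        // Consume raw "userId,articleId,timestamp,action" lines from the "log" topic.
        FlinkKafkaConsumer<String> consumer =
                new FlinkKafkaConsumer<>("log", new SimpleStringSchema(), props);

        env.addSource(consumer)
                .map(line -> line.split(",")[1]) // project out the articleId field
                .print();                        // the real tasks write to HBase/Redis instead

        env.execute("README ingestion sketch");
    }
}
```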
13 |
14 |
15 |
16 | ## System Features
17 |
18 | ![System features](doc/功能.png)
19 |
20 |
--------------------------------------------------------------------------------
/doc/doc.md:
--------------------------------------------------------------------------------
1 | # Ideas for Improving the AMS Recommendation System
2 |
3 | - User behavior logs [online layer]
4 |
5 |   - User clicks, views, likes, favorites, comments, and similar events update the user portrait in real time and are handed to the recommendation engine for processing
6 |
7 | - A/B bucketing [online layer]
8 |   - A/B split testing
9 |
10 | - User interest -> context-based recommendation logic [nearline layer]
11 |   - A user who is interested in an article will likely stay on the page for a while; if the difference between the exit time and the start-of-reading time exceeds 3 minutes, the user is deemed interested in the article
12 |
13 | - Real-time recommendation from user behavior (u2i2i) [nearline layer]
14 |   - Recommend articles similar to the current one based on likes, expressed interest, and similar signals
15 |
16 | - Similar-article recommendation (i2i) -> content-based recommendation [offline layer]
17 |   - Extract each article's keywords with TF-IDF, embed the top 10 keywords using Word2Vec from Alink, and finally train an ALS model (a TF-IDF sketch follows this list)
18 |
19 | - User portrait [offline layer]
20 |   - Tag users in real time from three article attributes (author, article category, keywords)
21 |
22 | - Article portrait [offline layer]
23 |   - Tag articles in real time from user attributes (age, sex, article categories of interest)
24 |
25 |
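For reference, here is a minimal, self-contained sketch of the TF-IDF computation behind the i2i item above, assuming tokens have already been produced by a segmenter such as HanLP; it illustrates the idea only and is not the project's `TFIDFMapFunction`:

```java
import java.util.HashMap;
import java.util.Map;

public class TfIdfSketch {

    /** Term frequency for one tokenized article: count(term) / totalTokens. */
    public static Map<String, Double> termFrequency(String[] tokens) {
        Map<String, Double> tf = new HashMap<>();
        for (String token : tokens) {
            tf.merge(token, 1.0, Double::sum); // raw counts first
        }
        double total = tokens.length;
        tf.replaceAll((term, count) -> count / total); // normalize to frequencies
        return tf;
    }

    /** IDF for one term: log(corpusSize / (1 + docsContainingTerm)). */
    public static double idf(long corpusSize, long docsContainingTerm) {
        return Math.log((double) corpusSize / (1 + docsContainingTerm));
    }

    public static void main(String[] args) {
        String[] tokens = {"flink", "kafka", "flink", "hbase"};
        Map<String, Double> tf = termFrequency(tokens);
        // TF-IDF of "flink" in a hypothetical corpus of 1000 docs, 100 of which contain it
        System.out.println(tf.get("flink") * idf(1000, 100));
    }
}
```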
--------------------------------------------------------------------------------
/doc/kafka/generator.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Automatically generate log records and push them to Kafka
3 | function create_kafka_topic {
4 | $KAFKA_HOME/bin/kafka-topics.sh --create --zookeeper master:2181 --replication-factor 1 --partitions 1 --topic $1
5 | }
6 |
7 | function send_messages_to_kafka {
8 | msg=$(generator_message)
9 | echo -e $msg | $KAFKA_HOME/bin/kafka-console-producer.sh --broker-list master:9092 --topic $TOPIC
10 | }
11 |
12 | function rand {
13 | min=$1
14 | max=$(($2-$min+1))
15 | num=$(date +%s%N)
16 | echo $(($num%$max+$min))
17 | }
18 |
19 | function generator_message {
20 | userId=$(rand 1 100);
21 | articleId=$(rand 1 10);
22 | timestamp=`date '+%s'`;
23 | action=1;
24 | msg=$userId","$articleId","$timestamp","$action;
25 | echo $msg
26 | }
27 |
28 | TOPIC="log"
29 | create_kafka_topic $TOPIC
30 | while true
31 | do
32 | send_messages_to_kafka
33 | sleep 0.1
34 | done
35 |
36 |
--------------------------------------------------------------------------------
/doc/功能.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xxubai/ams-recommendation-system/ec22e41e9bb8120d6cc6a105eaf693274baca2f7/doc/功能.png
--------------------------------------------------------------------------------
/doc/系统整体架构.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xxubai/ams-recommendation-system/ec22e41e9bb8120d6cc6a105eaf693274baca2f7/doc/系统整体架构.png
--------------------------------------------------------------------------------
/doc/设计思路.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xxubai/ams-recommendation-system/ec22e41e9bb8120d6cc6a105eaf693274baca2f7/doc/设计思路.png
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |   <modelVersion>4.0.0</modelVersion>
6 |
7 |   <groupId>ams-runtime-recommend-system</groupId>
8 |   <artifactId>ams-runtime-recommend-system</artifactId>
9 |   <version>0.1</version>
10 |   <packaging>jar</packaging>
11 |
12 |   <name>AMS Runtime Recommend System</name>
13 |   <url>http://www.myorganization.org</url>
14 |
15 |   <properties>
16 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
17 |     <flink.version>1.10.0</flink.version>
18 |     <hbase.version>2.1.8</hbase.version>
19 |     <java.version>1.8</java.version>
20 |     <scala.binary.version>2.11</scala.binary.version>
21 |     <maven.compiler.source>${java.version}</maven.compiler.source>
22 |     <maven.compiler.target>${java.version}</maven.compiler.target>
23 |   </properties>
24 |
25 |   <repositories>
26 |     <repository>
27 |       <id>apache.snapshots</id>
28 |       <name>Apache Development Snapshot Repository</name>
29 |       <url>https://repository.apache.org/content/repositories/snapshots/</url>
30 |       <releases>
31 |         <enabled>false</enabled>
32 |       </releases>
33 |       <snapshots>
34 |         <enabled>true</enabled>
35 |       </snapshots>
36 |     </repository>
37 |   </repositories>
38 |
39 |   <dependencies>
40 |     <!-- Apache Flink dependencies -->
41 |     <dependency>
42 |       <groupId>org.apache.flink</groupId>
43 |       <artifactId>flink-java</artifactId>
44 |       <version>${flink.version}</version>
45 |     </dependency>
46 |     <dependency>
47 |       <groupId>org.apache.flink</groupId>
48 |       <artifactId>flink-streaming-java_2.11</artifactId>
49 |       <version>${flink.version}</version>
50 |     </dependency>
51 |     <dependency>
52 |       <groupId>org.apache.flink</groupId>
53 |       <artifactId>flink-clients_2.11</artifactId>
54 |       <version>${flink.version}</version>
55 |     </dependency>
56 |
57 |     <!-- Kafka and filesystem connectors -->
58 |     <dependency>
59 |       <groupId>org.apache.flink</groupId>
60 |       <artifactId>flink-connector-kafka_2.11</artifactId>
61 |       <version>${flink.version}</version>
62 |     </dependency>
63 |     <dependency>
64 |       <groupId>org.apache.flink</groupId>
65 |       <artifactId>flink-connector-filesystem_2.11</artifactId>
66 |       <version>${flink.version}</version>
67 |     </dependency>
68 |
69 |     <!-- Logging -->
70 |     <dependency>
71 |       <groupId>org.slf4j</groupId>
72 |       <artifactId>slf4j-log4j12</artifactId>
73 |       <version>1.7.7</version>
74 |       <scope>runtime</scope>
75 |     </dependency>
76 |
77 |     <!-- Storage: HBase, Redis, MySQL -->
78 |     <dependency>
79 |       <groupId>org.apache.hbase</groupId>
80 |       <artifactId>hbase-client</artifactId>
81 |       <version>${hbase.version}</version>
82 |     </dependency>
83 |     <dependency>
84 |       <groupId>org.apache.hbase</groupId>
85 |       <artifactId>hbase-server</artifactId>
86 |       <version>${hbase.version}</version>
87 |     </dependency>
88 |     <dependency>
89 |       <groupId>redis.clients</groupId>
90 |       <artifactId>jedis</artifactId>
91 |       <version>3.0.1</version>
92 |     </dependency>
93 |     <dependency>
94 |       <groupId>org.apache.flink</groupId>
95 |       <artifactId>flink-connector-redis_2.10</artifactId>
96 |       <version>1.1.5</version>
97 |     </dependency>
98 |     <dependency>
99 |       <groupId>org.apache.flink</groupId>
100 |       <artifactId>flink-jdbc</artifactId>
101 |       <version>1.6.1</version>
102 |     </dependency>
103 |     <dependency>
104 |       <groupId>mysql</groupId>
105 |       <artifactId>mysql-connector-java</artifactId>
106 |       <version>8.0.28</version>
107 |     </dependency>
108 |     <dependency>
109 |       <groupId>com.alibaba</groupId>
110 |       <artifactId>druid</artifactId>
111 |       <version>1.1.10</version>
112 |     </dependency>
113 |     <dependency>
114 |       <groupId>org.jetbrains</groupId>
115 |       <artifactId>annotations</artifactId>
116 |       <version>RELEASE</version>
117 |       <scope>compile</scope>
118 |     </dependency>
119 |     <dependency>
120 |       <groupId>junit</groupId>
121 |       <artifactId>junit</artifactId>
122 |       <version>4.13.1</version>
123 |       <scope>test</scope>
124 |     </dependency>
125 |
126 |     <!-- HanLP word segmentation -->
127 |     <dependency>
128 |       <groupId>com.hankcs</groupId>
129 |       <artifactId>hanlp</artifactId>
130 |       <version>portable-1.7.5</version>
131 |     </dependency>
132 |   </dependencies>
133 |
134 |   <build>
135 |     <plugins>
136 |       <!-- Java compiler -->
137 |       <plugin>
138 |         <groupId>org.apache.maven.plugins</groupId>
139 |         <artifactId>maven-compiler-plugin</artifactId>
140 |         <version>3.1</version>
141 |         <configuration>
142 |           <source>${java.version}</source>
143 |           <target>${java.version}</target>
144 |         </configuration>
145 |       </plugin>
146 |
147 |       <!-- Shade plugin: build a fat jar, excluding shaded Flink and logging classes -->
148 |       <plugin>
149 |         <groupId>org.apache.maven.plugins</groupId>
150 |         <artifactId>maven-shade-plugin</artifactId>
151 |         <version>3.1.1</version>
152 |         <executions>
153 |           <execution>
154 |             <phase>package</phase>
155 |             <goals>
156 |               <goal>shade</goal>
157 |             </goals>
158 |             <configuration>
159 |               <artifactSet>
160 |                 <excludes>
161 |                   <exclude>org.apache.flink:force-shading</exclude>
162 |                   <exclude>com.google.code.findbugs:jsr305</exclude>
163 |                   <exclude>org.slf4j:*</exclude>
164 |                   <exclude>log4j:*</exclude>
165 |                 </excludes>
166 |               </artifactSet>
167 |               <filters>
168 |                 <filter>
169 |                   <artifact>*:*</artifact>
170 |                   <excludes>
171 |                     <exclude>META-INF/*.SF</exclude>
172 |                     <exclude>META-INF/*.DSA</exclude>
173 |                     <exclude>META-INF/*.RSA</exclude>
174 |                   </excludes>
175 |                 </filter>
176 |               </filters>
177 |               <transformers>
178 |                 <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
179 |                   <mainClass>com.ams.recommend.StreamingJob</mainClass>
180 |                 </transformer>
181 |               </transformers>
182 |             </configuration>
183 |           </execution>
184 |         </executions>
185 |       </plugin>
186 |     </plugins>
187 |
188 |     <pluginManagement>
189 |       <plugins>
190 |         <!-- Keep Eclipse m2e from interfering with the Maven build -->
191 |         <plugin>
192 |           <groupId>org.eclipse.m2e</groupId>
193 |           <artifactId>lifecycle-mapping</artifactId>
194 |           <version>1.0.0</version>
195 |           <configuration>
196 |             <lifecycleMappingMetadata>
197 |               <pluginExecutions>
198 |                 <pluginExecution>
199 |                   <pluginExecutionFilter>
200 |                     <groupId>org.apache.maven.plugins</groupId>
201 |                     <artifactId>maven-shade-plugin</artifactId>
202 |                     <versionRange>[3.1.1,)</versionRange>
203 |                     <goals>
204 |                       <goal>shade</goal>
205 |                     </goals>
206 |                   </pluginExecutionFilter>
207 |                   <action>
208 |                     <ignore/>
209 |                   </action>
210 |                 </pluginExecution>
211 |                 <pluginExecution>
212 |                   <pluginExecutionFilter>
213 |                     <groupId>org.apache.maven.plugins</groupId>
214 |                     <artifactId>maven-compiler-plugin</artifactId>
215 |                     <versionRange>[3.1,)</versionRange>
216 |                     <goals>
217 |                       <goal>testCompile</goal>
218 |                       <goal>compile</goal>
219 |                     </goals>
220 |                   </pluginExecutionFilter>
221 |                   <action>
222 |                     <ignore/>
223 |                   </action>
224 |                 </pluginExecution>
225 |               </pluginExecutions>
226 |             </lifecycleMappingMetadata>
227 |           </configuration>
228 |         </plugin>
229 |       </plugins>
230 |     </pluginManagement>
231 |   </build>
232 | </project>
--------------------------------------------------------------------------------
/src/main/java/com/ams/recommend/client/HBaseClient.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.client;
2 |
3 | import com.ams.recommend.util.Property;
4 | import org.apache.hadoop.conf.Configuration;
5 | import org.apache.hadoop.hbase.Cell;
6 | import org.apache.hadoop.hbase.HBaseConfiguration;
7 | import org.apache.hadoop.hbase.TableName;
8 | import org.apache.hadoop.hbase.client.*;
9 | import org.apache.hadoop.hbase.util.Bytes;
10 | import org.slf4j.Logger;
11 | import org.slf4j.LoggerFactory;
12 |
13 | import java.io.IOException;
14 | import java.util.ArrayList;
15 | import java.util.HashMap;
16 | import java.util.List;
17 | import java.util.Map;
18 |
19 | public class HBaseClient {
20 |
21 | private static final Logger logger = LoggerFactory.getLogger(HBaseClient.class);
22 |
23 | private static Configuration conf;
24 |
25 | static {
26 | conf = HBaseConfiguration.create();
27 | conf.set("hbase.rootdir", Property.getStrValue("hbase.rootdir"));
28 | conf.set("hbase.zookeeper.quorum", Property.getStrValue("hbase.zookeeper.quorum"));
29 | conf.set("hbase.client.scanner.timeout.period", Property.getStrValue("hbase.client.scanner.timeout.period"));
30 | conf.set("hbase.rpc.timeout", Property.getStrValue("hbase.rpc.timeout"));
31 | conf.set("hbase.client.ipc.pool.size", Property.getStrValue("hbase.client.ipc.pool.size"));
32 | }
33 |
34 | /**
35 | * Check whether a table exists.
36 | * @param tableName table name
37 | * @return true if the table exists
38 | */
39 | public static boolean existTable(String tableName) {
40 | boolean exist = false;
41 | try(Connection conn = ConnectionFactory.createConnection(conf)) {
42 | Admin admin = conn.getAdmin();
43 | exist = admin.tableExists(TableName.valueOf(tableName));
44 | admin.close();
45 | }catch (IOException e) {
46 | e.printStackTrace();
47 | }
48 | return exist;
49 | }
50 |
51 | /**
52 | * Create a table if it does not exist yet.
53 | * @param tableName table name
54 | */
55 | public static void createTableIfNotExist(String tableName, String... family) {
56 | if(!existTable(tableName)) createOrOverwriteTable(tableName, family);
57 | else {
58 | logger.warn("Table " + tableName + " already exists");
59 | return;
60 | }
61 | }
62 |
63 | /**
64 | * Create a table, dropping any existing table of the same name first.
65 | * @param tableName table name
66 | */
67 | public static void createOrOverwriteTable(String tableName, String... cfs) {
68 | try(Connection conn = ConnectionFactory.createConnection(conf)) {
69 | Admin admin = conn.getAdmin();
70 | TableName tName = TableName.valueOf(tableName);
71 |
72 | if(admin.tableExists(tName)) {
73 | admin.disableTable(tName); //disable the table first
74 | admin.deleteTable(tName); //then drop it so it can be recreated
75 | }
76 |
77 | List<ColumnFamilyDescriptor> columnFamilyDescriptors = new ArrayList<>();
78 | for (String cf : cfs) {
79 | columnFamilyDescriptors.add(ColumnFamilyDescriptorBuilder
80 | .newBuilder(Bytes.toBytes(cf))
81 | .build());
82 | }
83 |
84 | TableDescriptor tableDescriptor = TableDescriptorBuilder
85 | .newBuilder(tName)
86 | .setColumnFamilies(columnFamilyDescriptors) //set the column families
87 | .build();
88 | admin.createTable(tableDescriptor); //create the table
89 |
90 | admin.close(); //close the admin handle promptly
91 | } catch (IOException e) {
92 | e.printStackTrace();
93 | }
94 | }
95 |
96 | /**
97 | * Put a single cell value.
98 | * @param tableName table name
99 | * @param rowKey row key
100 | * @param family column family
101 | * @param column column qualifier
102 | * @param value cell value
103 | */
104 | public static void put(String tableName, String rowKey, String family, String column, String value) {
105 | TableName tName = TableName.valueOf(tableName);
106 | try(Connection conn = ConnectionFactory.createConnection(conf)) {
107 | Table table = conn.getTable(tName);
108 | Put put = new Put(Bytes.toBytes(rowKey))
109 | .addColumn(Bytes.toBytes(family), Bytes.toBytes(column), Bytes.toBytes(value));
110 | table.put(put);
111 |
112 | table.close(); //close the table handle promptly
113 | } catch (IOException e) {
114 | e.printStackTrace();
115 | }
116 | }
117 |
118 | public static void addOrUpdateColumn(String tableName, String rowKey, String family, String column) {
119 | String count = get(tableName, rowKey, family, column);
120 | if(count == null) count = "0";
121 |
122 | put(tableName, rowKey, family, column, String.valueOf(Long.valueOf(count) + 1));
123 | }
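//Note: addOrUpdateColumn's read-then-write above is not atomic under concurrent writers;
//HBase's Table#incrementColumnValue provides an atomic counter alternative.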
124 |
125 | public static String get(String tableName, String rowKey, String family, String column) {
126 | String res = null;
127 | TableName tName = TableName.valueOf(tableName);
128 | try(Connection conn = ConnectionFactory.createConnection(conf)) {
129 | Table table = conn.getTable(tName);
130 | Get get = new Get(Bytes.toBytes(rowKey))
131 | .addColumn(Bytes.toBytes(family), Bytes.toBytes(column));
132 | Result rs = table.get(get);
133 | res = Bytes.toString(rs.getValue(Bytes.toBytes(family), Bytes.toBytes(column)));
134 | table.close();
135 | } catch (IOException e) {
136 | e.printStackTrace();
137 | }
138 | return res;
139 | }
140 |
141 | /**
142 | * Get an entire row.
143 | * @return map from column qualifier to value
144 | */
145 | public static Map<String, String> getRow(String tableName, String rowKey) {
146 | Map<String, String> kv = new HashMap<>();
147 | TableName tName = TableName.valueOf(tableName);
148 | try(Connection conn = ConnectionFactory.createConnection(conf)) {
149 | Table table = conn.getTable(tName);
150 | Get get = new Get(Bytes.toBytes(rowKey));
151 | Result rs = table.get(get);
152 | for (Cell cell : rs.listCells()){
153 | String key = Bytes.toString(cell.getQualifierArray(),cell.getQualifierOffset(),cell.getQualifierLength());
154 | String value = Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength());
155 | kv.put(key, value);
156 | }
157 | table.close();
158 | } catch (IOException e) {
159 | e.printStackTrace();
160 | }
161 | return kv;
162 | }
163 |
164 | public static int getColumnSize(String tableName, String rowKey, String family) {
165 | int size = 0;
166 | TableName tName = TableName.valueOf(tableName);
167 | try(Connection conn = ConnectionFactory.createConnection(conf)) {
168 | Table table = conn.getTable(tName);
169 | Get get = new Get(Bytes.toBytes(rowKey));
170 | Result rs = table.get(get);
171 |
172 | if(rs.isEmpty()) return 0;
173 |
174 | Map<byte[], byte[]> familyMap = rs.getFamilyMap(Bytes.toBytes(family));
175 | size = familyMap.keySet().size();
176 | table.close();
177 | }catch (IOException e) {
178 | e.printStackTrace();
179 | }
180 | return size;
181 | }
182 |
183 | public static void createRow(String tableName, String rowKey, String c, String count, String value) {
184 | TableName tName = TableName.valueOf(tableName);
185 | try(Connection conn = ConnectionFactory.createConnection(conf)) {
186 | Table table = conn.getTable(tName);
187 | Put put = new Put(Bytes.toBytes(rowKey))
188 | .addColumn(Bytes.toBytes(c), Bytes.toBytes(count), Bytes.toBytes(value));
189 | table.put(put);
190 | table.close();
191 | }catch (IOException e) {
192 | e.printStackTrace();
193 | }
194 | }
195 |
196 |
197 | /**
198 | * List all row keys in a table.
199 | * @param tableName table name
200 | * @return all row keys
201 | */
202 | public static List<String> getAllKey(String tableName) throws IOException {
203 | List keys = new ArrayList<>();
204 | try(Connection conn = ConnectionFactory.createConnection(conf)) {
205 | Scan scan = new Scan();
206 | Table table = conn.getTable(TableName.valueOf(tableName));
207 | ResultScanner scanner = table.getScanner(scan);
208 | for (Result r : scanner) {
209 | keys.add(new String(r.getRow()));
210 | }
211 | }catch (IOException e) {
212 | e.printStackTrace();
213 | }
214 | return keys;
215 | }
216 | }
217 |
--------------------------------------------------------------------------------
/src/main/java/com/ams/recommend/client/MySQLClient.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.client;
2 |
3 | import com.alibaba.druid.pool.DruidDataSource;
4 | import com.ams.recommend.common.pojo.User;
5 | import com.ams.recommend.util.Property;
6 |
7 | import java.sql.Connection;
8 | import java.sql.PreparedStatement;
9 | import java.sql.ResultSet;
10 | import java.sql.SQLException;
11 |
12 | public class MySQLClient {
13 |
14 | private final static DruidDataSource dataSource;
15 |
16 | static {
17 | dataSource = new DruidDataSource();
18 | dataSource.setUrl(Property.getStrValue("mysql.url"));
19 | dataSource.setUsername(Property.getStrValue("mysql.name"));
20 | dataSource.setPassword(Property.getStrValue("mysql.password"));
21 | }
22 |
23 | /**
24 | * Look up an article's content by its id.
25 | * @param articleId article id
26 | * @return article content
27 | */
28 | public static String getContentById(String articleId) {
29 | String content = "";
30 | try(Connection conn = dataSource.getConnection()) {
31 | PreparedStatement pst = conn.prepareStatement("SELECT content FROM article WHERE id = ?");
32 | pst.setString(1, articleId);
33 | ResultSet rs = pst.executeQuery();
34 | if (rs.next()) content = rs.getString("content"); //must advance the cursor before reading
35 |
36 | pst.close();
37 | }catch (SQLException e) {
38 | e.printStackTrace();
39 | }
40 | return content;
41 | }
42 |
43 | /**
44 | * Look up an article's attributes (author, title, channel, keyword) by article id.
45 | * @param articleId article id
46 | * @return article attributes as a ResultSet; the caller must consume it promptly
47 | */
48 | public static ResultSet getUserPortraitById(String articleId) {
49 | ResultSet article = null;
50 | //no try-with-resources here: the pooled connection would close before the caller reads the ResultSet
51 | try {
52 | Connection conn = dataSource.getConnection();
53 | PreparedStatement pst = conn.prepareStatement("SELECT author, title, channel_id, keyword FROM article WHERE id = ?"); //channel_id is read by PortraitTask
54 | pst.setString(1, articleId);
55 | article = pst.executeQuery();
56 | }catch (SQLException e) {
57 | e.printStackTrace();
58 | }
59 | return article;
60 | }
61 |
62 | /**
63 | * Look up all of a user's attributes by user id.
64 | * @param userId user id
65 | * @return user attributes, or null if the user does not exist
66 | */
67 | public static User getUserById(String userId) {
68 | User user = null;
69 | try(Connection conn = dataSource.getConnection()) {
70 | PreparedStatement pst = conn.prepareStatement("SELECT * FROM user WHERE id = ?");
71 | pst.setString(1, userId);
72 | ResultSet rs = pst.executeQuery();
73 | if(rs != null && rs.next()) {
74 | user = new User(); //instantiate before populating; was null, causing an NPE
75 | user.setUserId(userId);
76 | user.setSex(rs.getInt("sex"));
77 | user.setAge(rs.getInt("age"));
78 | user.setJob(rs.getString("job"));
79 | user.setEducation(rs.getString("education"));
80 | }
81 |
82 | pst.close();
83 | }catch (SQLException e) {
84 | e.printStackTrace();
85 | }
86 | return user;
87 | }
88 |
89 | public static void putKeywordById(String id, String keyword) {
90 | try(Connection conn = dataSource.getConnection()) {
91 | PreparedStatement pst = conn.prepareStatement("UPDATE article SET keyword = ? WHERE id = ?"); //INSERT ... WHERE is not valid SQL
92 | pst.setString(1, keyword);
93 | pst.setString(2, id);
94 | pst.executeUpdate();
95 |
96 | pst.close();
97 | }catch (SQLException e) {
98 | e.printStackTrace();
99 | }
100 | }
101 |
102 | }
103 |
--------------------------------------------------------------------------------
/src/main/java/com/ams/recommend/client/RedisClient.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.client;
2 |
3 |
4 | import com.ams.recommend.util.Property;
5 | import redis.clients.jedis.Jedis;
6 |
7 | import java.util.ArrayList;
8 | import java.util.List;
9 |
10 | public class RedisClient {
11 | private static Jedis jedis;
12 |
13 | static {
14 | jedis = new Jedis(Property.getStrValue("redis.host"), Property.getIntValue("redis.port"));
15 | jedis.select(Property.getIntValue("redis.db"));
16 | }
17 |
18 | /**
19 | * Get the value stored in Redis for the given key.
20 | * @param key key
21 | * @return value
22 | */
23 | public String getData(String key){
24 | return jedis.get(key);
25 | }
26 |
27 | /**
28 | * Get the hot-article leaderboard.
29 | * @param topRange number of hot articles to fetch
30 | * @return ids of the top articles
31 | */
32 | public List<String> getTopList(int topRange){
33 | List<String> res = new ArrayList<>();
34 | for (int i = 1; i <= topRange; i++) { //ranks written by HotArticleTask start at 1
35 | res.add(getData(String.valueOf(i)));
36 | }
37 | return res;
38 | }
39 |
40 | }
41 |
--------------------------------------------------------------------------------
/src/main/java/com/ams/recommend/common/pojo/ArticlePortrait.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.common.pojo;
2 |
3 | public class ArticlePortrait {
4 |
5 | private int man;
6 | private int woman;
7 |
8 | private int age_10;
9 | private int age_20;
10 | private int age_30;
11 | private int age_40;
12 | private int age_50;
13 | private int age_60;
14 |
15 | public int getMan() {
16 | return man;
17 | }
18 |
19 | public void setMan(int man) {
20 | this.man = man;
21 | }
22 |
23 | public int getWoman() {
24 | return woman;
25 | }
26 |
27 | public void setWoman(int woman) {
28 | this.woman = woman;
29 | }
30 |
31 | public int getAge_10() {
32 | return age_10;
33 | }
34 |
35 | public void setAge_10(int age_10) {
36 | this.age_10 = age_10;
37 | }
38 |
39 | public int getAge_20() {
40 | return age_20;
41 | }
42 |
43 | public void setAge_20(int age_20) {
44 | this.age_20 = age_20;
45 | }
46 |
47 | public int getAge_30() {
48 | return age_30;
49 | }
50 |
51 | public void setAge_30(int age_30) {
52 | this.age_30 = age_30;
53 | }
54 |
55 | public int getAge_40() {
56 | return age_40;
57 | }
58 |
59 | public void setAge_40(int age_40) {
60 | this.age_40 = age_40;
61 | }
62 |
63 | public int getAge_50() {
64 | return age_50;
65 | }
66 |
67 | public void setAge_50(int age_50) {
68 | this.age_50 = age_50;
69 | }
70 |
71 | public int getAge_60() {
72 | return age_60;
73 | }
74 |
75 | public void setAge_60(int age_60) {
76 | this.age_60 = age_60;
77 | }
78 |
79 |
80 | public int getTotal(){
81 | int ret = 0; //accumulates the sum of squares over all portrait fields
82 | ret += (man*man) + (woman*woman) + (age_10*age_10) + (age_20*age_20) + (age_30*age_30) + (age_40*age_40) +
83 | (age_50*age_50) + (age_60*age_60);
84 | return ret;
85 | }
86 | }
87 |
--------------------------------------------------------------------------------
/src/main/java/com/ams/recommend/common/pojo/HotArticle.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.common.pojo;
2 |
3 | public class HotArticle {
4 |
5 | private String articleId; //article id
6 | private long pvCount; //article page-view count
7 | private long windowEnd; //window end timestamp
8 | private int rank; //position on the hot list
9 |
10 | public HotArticle() {
11 | }
12 |
13 | public HotArticle(String articleId, long pvCount, long windowEnd, int rank) {
14 | this.articleId = articleId;
15 | this.pvCount = pvCount;
16 | this.windowEnd = windowEnd;
17 | this.rank = rank;
18 | }
19 |
20 | public HotArticle(String articleId, long pvCount, long windowEnd) {
21 | this.articleId = articleId;
22 | this.pvCount = pvCount;
23 | this.windowEnd = windowEnd;
24 | this.rank = 0;
25 | }
26 |
27 | public String getArticleId() {
28 | return articleId;
29 | }
30 |
31 | public void setArticleId(String articleId) {
32 | this.articleId = articleId;
33 | }
34 |
35 | public long getPvCount() {
36 | return pvCount;
37 | }
38 |
39 | public void setPvCount(long pvCount) {
40 | this.pvCount = pvCount;
41 | }
42 |
43 | public long getWindowEnd() {
44 | return windowEnd;
45 | }
46 |
47 | public void setWindowEnd(long windowEnd) {
48 | this.windowEnd = windowEnd;
49 | }
50 |
51 | public int getRank() {
52 | return rank;
53 | }
54 |
55 | public void setRank(int rank) {
56 | this.rank = rank;
57 | }
58 |
59 | @Override
60 | public String toString() {
61 | return "HotArticle : " +
62 | "articleId='" + articleId + '\'' +
63 | ", pvCount=" + pvCount +
64 | ", windowEnd=" + windowEnd +
65 | ", rank=" + rank;
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/src/main/java/com/ams/recommend/common/pojo/Log.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.common.pojo;
2 |
3 | /**
4 | * Log entity
5 | */
6 | public class Log {
7 |
8 | private String userId;
9 | private String articleId;
10 | private Long time;
11 | private String action;
12 |
13 | public String getUserId() { return userId; }
14 |
15 | public void setUserId(String userId) { this.userId = userId; }
16 |
17 | public String getArticleId() { return articleId; }
18 |
19 | public void setArticleId(String articleId) {
20 | this.articleId = articleId;
21 | }
22 |
23 | public Long getTime() {
24 | return time;
25 | }
26 |
27 | public void setTime(Long time) {
28 | this.time = time;
29 | }
30 |
31 | public String getAction() {
32 | return action;
33 | }
34 |
35 | public void setAction(String action) {
36 | this.action = action;
37 | }
38 |
39 | @Override
40 | public String toString() {
41 | return "Log : " +
42 | "userId='" + userId + '\'' +
43 | ", articleId='" + articleId + '\'' +
44 | ", time=" + time +
45 | ", action='" + action + '\'';
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/src/main/java/com/ams/recommend/common/pojo/SpiderArticle.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.common.pojo;
2 |
3 | import org.apache.flink.api.java.tuple.Tuple2;
4 |
5 | import java.util.HashMap;
6 | import java.util.Map;
7 | import java.util.PriorityQueue;
8 |
9 | public class SpiderArticle {
10 |
11 | private String articleId;
12 | private long timestamp;
13 | private String content;
14 | private Map<String, Double> tfMap; //term -> term frequency (type parameters assumed; stripped in the dump)
15 | private PriorityQueue<Tuple2<String, Double>> tfidf; //(term, tf-idf weight)
16 |
17 | public SpiderArticle() {
18 | tfMap = new HashMap<>();
19 | tfidf = new PriorityQueue<>();
20 | }
21 |
22 | public SpiderArticle(String articleId, long timestamp, String content) {
23 | this.articleId = articleId;
24 | this.timestamp = timestamp;
25 | this.content = content;
26 | tfMap = new HashMap<>();
27 | tfidf = new PriorityQueue<>();
28 | }
29 |
30 | public String getArticleId() {
31 | return articleId;
32 | }
33 |
34 | public void setArticleId(String articleId) {
35 | this.articleId = articleId;
36 | }
37 |
38 | public long getTimestamp() {
39 | return timestamp;
40 | }
41 |
42 | public void setTimestamp(long timestamp) {
43 | this.timestamp = timestamp;
44 | }
45 |
46 | public String getContent() {
47 | return this.content;
48 | }
49 |
50 | public void setContent(String content) {
51 | this.content = content;
52 | }
53 | public Map<String, Double> getTfMap() {
54 | return tfMap;
55 | }
56 |
57 | public void setTfMap(Map<String, Double> tfMap) {
58 | this.tfMap = tfMap;
59 | }
60 |
61 | public PriorityQueue<Tuple2<String, Double>> getTfidf() {
62 | return tfidf;
63 | }
64 |
65 | public void setTfidf(PriorityQueue<Tuple2<String, Double>> tfidf) {
66 | this.tfidf = tfidf;
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/src/main/java/com/ams/recommend/common/pojo/User.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.common.pojo;
2 |
3 | public class User {
4 | private String userId;
5 | private int sex;
6 | private int age;
7 | private String job;
8 | private String education;
9 |
10 | public User(){}
11 |
12 | public User(String userId, int sex, int age, String job, String education) {
13 | this.userId = userId;
14 | this.sex = sex;
15 | this.age = age;
16 | this.job = job;
17 | this.education = education;
18 | }
19 |
20 | public String getUserId() {
21 | return userId;
22 | }
23 |
24 | public void setUserId(String userId) {
25 | this.userId = userId;
26 | }
27 |
28 | public int getSex() {
29 | return sex;
30 | }
31 |
32 | public void setSex(int sex) {
33 | this.sex = sex;
34 | }
35 |
36 | public int getAge() {
37 | return age;
38 | }
39 |
40 | public void setAge(int age) {
41 | this.age = age;
42 | }
43 |
44 | public String getJob() {
45 | return job;
46 | }
47 |
48 | public void setJob(String job) {
49 | this.job = job;
50 | }
51 |
52 | public String getEducation() {
53 | return education;
54 | }
55 |
56 | public void setEducation(String education) {
57 | this.education = education;
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/src/main/java/com/ams/recommend/common/pojo/WindowedArticle.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.common.pojo;
2 |
3 | import java.util.HashMap;
4 | import java.util.Map;
5 |
6 | public class WindowedArticle {
7 |
8 | private String articleId;
9 | private long timestamp;
10 | private long windowEnd;
11 | private Map<String, Double> tfMap; //term -> term frequency (type parameters assumed; stripped in the dump)
12 |
13 | public WindowedArticle() {
14 | tfMap = new HashMap<>();
15 | }
16 |
17 | public WindowedArticle(String articleId, long timestamp, long windowEnd) {
18 | this.articleId = articleId;
19 | this.timestamp = timestamp;
20 | this.windowEnd = windowEnd;
21 | tfMap = new HashMap<>();
22 | }
23 |
24 | public String getArticleId() {
25 | return articleId;
26 | }
27 |
28 | public void setArticleId(String articleId) {
29 | this.articleId = articleId;
30 | }
31 |
32 | public long getTimestamp() {
33 | return timestamp;
34 | }
35 |
36 | public void setTimestamp(long timestamp) {
37 | this.timestamp = timestamp;
38 | }
39 |
40 | public long getWindowEnd() {
41 | return windowEnd;
42 | }
43 |
44 | public void setWindowEnd(long windowEnd) {
45 | this.windowEnd = windowEnd;
46 | }
47 |
48 | public Map<String, Double> getTfMap() {
49 | return tfMap;
50 | }
51 |
52 | public void setTfMap(Map<String, Double> tfMap) {
53 | this.tfMap = tfMap;
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/src/main/java/com/ams/recommend/nearline/task/HistoryTask.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.nearline.task;
2 |
3 | import com.ams.recommend.client.HBaseClient;
4 | import com.ams.recommend.common.pojo.Log;
5 | import com.ams.recommend.util.Constants;
6 | import com.ams.recommend.util.LogUtil;
7 | import com.ams.recommend.util.Property;
8 | import org.apache.flink.api.common.functions.FlatMapFunction;
9 | import org.apache.flink.api.common.serialization.SimpleStringSchema;
10 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
11 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
12 |
13 | public class HistoryTask {
14 |
15 | public static void main(String[] args) throws Exception {
16 | final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
17 |
18 | FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>(
19 | "log",
20 | new SimpleStringSchema(),
21 | Property.getKafkaProperties("history")
22 | );
23 |
24 | env.addSource(consumer)
25 | .flatMap((FlatMapFunction<String, Log>) (value, out) -> {
26 | Log log = LogUtil.toLogEntry(value);
27 | if(null != log) {
28 | //record one more of this user's actions on the article
29 | HBaseClient.addOrUpdateColumn(
30 | Constants.ARTICLE_HIS_TABLE,
31 | log.getArticleId(),
32 | "p",
33 | log.getUserId());
34 | //increment the user's action count for the viewed article
35 | HBaseClient.addOrUpdateColumn(
36 | Constants.USER_HIS_TABLE,
37 | log.getUserId(),
38 | "p",
39 | log.getArticleId());
40 | }
41 | });
42 |
43 | env.execute("History Task");
44 | }
45 |
46 | }
47 |
--------------------------------------------------------------------------------
/src/main/java/com/ams/recommend/nearline/task/HotArticleTask.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.nearline.task;
2 |
3 | import com.ams.recommend.common.pojo.HotArticle;
4 | import com.ams.recommend.common.pojo.Log;
5 | import com.ams.recommend.util.LogUtil;
6 | import com.ams.recommend.util.Property;
7 | import org.apache.flink.api.common.functions.AggregateFunction;
8 | import org.apache.flink.api.common.functions.FlatMapFunction;
9 | import org.apache.flink.api.common.serialization.SimpleStringSchema;
10 | import org.apache.flink.api.common.state.ListState;
11 | import org.apache.flink.api.common.state.ListStateDescriptor;
12 | import org.apache.flink.configuration.Configuration;
13 | import org.apache.flink.streaming.api.TimeCharacteristic;
14 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
15 | import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
16 | import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor;
17 | import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
18 | import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
19 | import org.apache.flink.streaming.api.windowing.time.Time;
20 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
21 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
22 | import org.apache.flink.streaming.connectors.redis.RedisSink;
23 | import org.apache.flink.streaming.connectors.redis.common.config.FlinkJedisPoolConfig;
24 | import org.apache.flink.streaming.connectors.redis.common.mapper.RedisCommand;
25 | import org.apache.flink.streaming.connectors.redis.common.mapper.RedisCommandDescription;
26 | import org.apache.flink.streaming.connectors.redis.common.mapper.RedisMapper;
27 | import org.apache.flink.util.Collector;
28 | import org.slf4j.Logger;
29 | import org.slf4j.LoggerFactory;
30 |
31 | import java.util.Comparator;
32 | import java.util.LinkedList;
33 | import java.util.List;
34 | import java.util.PriorityQueue;
35 |
36 | public class HotArticleTask {
37 |
38 | private final static Logger logger = LoggerFactory.getLogger(HotArticleTask.class);
39 | private final static int HOTSIZE = 20; //number of articles on the hot list
40 |
41 | public static void main(String[] args) throws Exception {
42 | final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
43 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
44 | env.enableCheckpointing(5000L);
45 |
46 | FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>(
47 | "log",
48 | new SimpleStringSchema(),
49 | Property.getKafkaProperties("hot")
50 | );
51 |
52 | FlinkJedisPoolConfig redisConf = new FlinkJedisPoolConfig.Builder()
53 | .setHost(Property.getStrValue("redis.host"))
54 | .setPort(Property.getIntValue("redis.port"))
55 | .setDatabase(Property.getIntValue("redis.db"))
56 | .build();
57 |
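//Topology: Kafka "log" topic -> keep only action == "1" records -> event-time watermarks
//(5 min allowed out-of-orderness) -> per-article sliding window counts (60s window, 20s slide)
//-> per-windowEnd top-N ranking -> write rank -> articleId into Redis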
58 | env.addSource(consumer)
59 | .flatMap(new LogFlatMapFunction())
60 | .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<Log>(Time.minutes(5)) {
61 | @Override
62 | public long extractTimestamp(Log log) {
63 | logger.info("watermark : " + log.getTime() * 1000);
64 | return log.getTime() * 1000; //convert seconds to milliseconds
65 | }
66 | }).keyBy(log -> log.getArticleId())
67 | .timeWindow(Time.seconds(60), Time.seconds(20))
68 | .aggregate(new CountAgg(), new WindowResultFunction())
69 | .keyBy(hot -> hot.getWindowEnd())
70 | .process(new HotArticleProcessFunction(HOTSIZE))
71 | .flatMap(new TopFlatMapFunction())
72 | .addSink(new RedisSink<>(redisConf, new HotArticleSink()));
73 |
74 | env.execute("Hot article task");
75 | }
76 |
77 |
78 | private static class LogFlatMapFunction implements FlatMapFunction<String, Log> {
79 | @Override
80 | public void flatMap(String value, Collector<Log> out) throws Exception {
81 | Log log = LogUtil.toLogEntry(value);
82 | if(log != null && "1".equals(log.getAction())) { //guard against unparseable records
83 | out.collect(log);
84 | }
85 | }
86 | }
87 |
88 | private static class CountAgg implements AggregateFunction<Log, Long, Long> {
89 | @Override
90 | public Long createAccumulator() {
91 | return 0L;
92 | }
93 |
94 | @Override
95 | public Long add(Log value, Long accumulator) {
96 | return accumulator + 1;
97 | }
98 |
99 | @Override
100 | public Long getResult(Long accumulator) {
101 | return accumulator;
102 | }
103 |
104 | @Override
105 | public Long merge(Long a, Long b) {
106 | return a + b;
107 | }
108 | }
109 |
110 | /**
111 | * Emit each key's per-window aggregate wrapped in a HotArticle object.
112 | */
113 | private static class WindowResultFunction implements WindowFunction<Long, HotArticle, String, TimeWindow> {
114 | @Override
115 | public void apply(String articleId, TimeWindow window, Iterable<Long> input, Collector<HotArticle> out) throws Exception {
116 | Long pvCount = input.iterator().next();
117 | HotArticle article = new HotArticle(articleId, pvCount, window.getEnd());
118 | out.collect(article);
119 |
120 | logger.info(article.toString());
121 | }
122 | }
123 |
124 | private static class HotArticleProcessFunction extends KeyedProcessFunction<Long, HotArticle, List<HotArticle>> {
125 |
126 | private int hotSize;
127 | private ListState<HotArticle> hotArticleListState;
128 |
129 | public HotArticleProcessFunction(int hotSize) {
130 | if(hotSize < 1) throw new IllegalArgumentException("Hot list size should not be less than 1!");
131 | this.hotSize = hotSize;
132 | }
133 |
134 | @Override
135 | public void open(Configuration parameters) throws Exception {
136 | super.open(parameters);
137 | ListStateDescriptor<HotArticle> hotArticleListStateDescriptor =
138 | new ListStateDescriptor<>("hotArticle-state", HotArticle.class);
139 | hotArticleListState = getRuntimeContext().getListState(hotArticleListStateDescriptor);
140 | }
141 |
142 | @Override
143 | public void processElement(HotArticle hotArticle, Context ctx, Collector<List<HotArticle>> out) throws Exception {
144 | hotArticleListState.add(hotArticle);
145 | //register an event-time timer to fire just after the window closes
146 | ctx.timerService().registerEventTimeTimer(hotArticle.getWindowEnd() + 1);
147 | }
148 |
149 | @Override
150 | public void onTimer(long timestamp, OnTimerContext ctx, Collector<List<HotArticle>> out) throws Exception {
151 | PriorityQueue<HotArticle> hotArticles = new PriorityQueue<>(hotSize, new Comparator<HotArticle>() {
152 | @Override
153 | public int compare(HotArticle o1, HotArticle o2) {
154 | if(o1.getPvCount() > o2.getPvCount()) return -1;
155 | else if(o1.getPvCount() < o2.getPvCount()) return 1;
156 | else return 0;
157 | }
158 | });
159 |
160 | for(HotArticle hotArticle : hotArticleListState.get()) {
161 | hotArticles.add(hotArticle);
162 | }
163 | hotArticleListState.clear(); //clear state for the next window
164 | List<HotArticle> topList = new LinkedList<>();
165 | while (!hotArticles.isEmpty()) topList.add(hotArticles.poll()); //drain the heap so the list is sorted by PV, descending
166 | out.collect(topList);
167 | }
168 | }
169 |
170 | private static class HotArticleSink implements RedisMapper<HotArticle> {
171 | @Override
172 | public RedisCommandDescription getCommandDescription() {
173 | return new RedisCommandDescription(RedisCommand.SET, null);
174 | }
175 |
176 | @Override
177 | public String getKeyFromData(HotArticle hotArticle) {
178 | logger.info("Redis Key : " + hotArticle.getRank());
179 | return String.valueOf(hotArticle.getRank());
180 | }
181 |
182 | @Override
183 | public String getValueFromData(HotArticle hotArticle) {
184 | logger.info("Redis Value : " + hotArticle.getArticleId());
185 | return hotArticle.getArticleId();
186 | }
187 | }
188 |
189 | private static class TopFlatMapFunction implements FlatMapFunction<List<HotArticle>, HotArticle> {
190 | @Override
191 | public void flatMap(List<HotArticle> topArticles, Collector<HotArticle> out) throws Exception {
192 | StringBuilder builder = new StringBuilder();
193 |
194 | builder.append("\n========== Hot Articles ==========\n");
195 | int rank = 1;
196 | for(HotArticle topArticle : topArticles) {
197 | topArticle.setRank(rank++);
198 | builder.append("Article ID: " + topArticle.getArticleId())
199 | .append(", Rank: " + topArticle.getRank())
200 | .append(", PV: " + topArticle.getPvCount() + "\n");
201 |
202 | out.collect(topArticle);
203 | }
204 |
205 | logger.info(builder.toString());
206 | }
207 | }
208 | }
209 |
--------------------------------------------------------------------------------
/src/main/java/com/ams/recommend/nearline/task/LogTask.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.nearline.task;
2 |
3 | import com.ams.recommend.client.HBaseClient;
4 | import com.ams.recommend.common.pojo.Log;
5 | import com.ams.recommend.util.LogUtil;
6 | import com.ams.recommend.util.Property;
7 | import org.apache.flink.api.common.functions.FlatMapFunction;
8 | import org.apache.flink.api.common.serialization.SimpleStringSchema;
9 | import org.apache.flink.streaming.api.TimeCharacteristic;
10 | import org.apache.flink.streaming.api.datastream.DataStream;
11 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
12 | import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
13 | import org.apache.flink.streaming.api.windowing.time.Time;
14 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
15 | import org.apache.flink.util.Collector;
16 | import org.slf4j.Logger;
17 | import org.slf4j.LoggerFactory;
18 |
19 | import java.util.Properties;
20 |
21 | public class LogTask {
22 |
23 | private static final Logger logger = LoggerFactory.getLogger(LogTask.class);
24 |
25 | public static void main(String[] args) throws Exception {
26 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
27 | env.enableCheckpointing(600000L); //generate a checkpoint automatically every 10 minutes
28 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
29 |
30 | Properties kafkaProp = Property.getKafkaProperties("log"); //sets the consumer group id
31 | FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>("log",
32 | new SimpleStringSchema(),
33 | kafkaProp
34 | );
35 |
36 | DataStream<Log> logs = env
37 | .addSource(consumer)
38 | .flatMap(new LogFlatMapFunction())
39 | .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<Log>(Time.minutes(15)) { //allow 15 minutes of out-of-orderness
40 | @Override
41 | public long extractTimestamp(Log element) {
42 | return element.getTime() * 1000; //log timestamps are in seconds; Flink expects milliseconds
43 | }
44 | });
45 |
46 | env.execute("Collect log task");
47 | }
48 |
49 | private static class LogFlatMapFunction implements FlatMapFunction<String, Log> {
50 |
51 | @Override
52 | public void flatMap(String value, Collector<Log> out) throws Exception {
53 | //write the log from Kafka straight into HBase
54 | Log log = LogUtil.toLogEntry(value); //parse the raw line into a Log entity
55 |
56 | if(log != null) {
57 | final String rowKey = LogUtil.getLogRowKey(log.getTime());
58 | String tableName = Property.getStrValue("table.log.name");
59 | //create the table if it does not exist yet
60 | HBaseClient.createTableIfNotExist(tableName, "l");
61 | //user id
62 | HBaseClient.put(tableName, rowKey, "l"
63 | , "uid", log.getUserId());
64 | //article id
65 | HBaseClient.put(tableName, rowKey, "l"
66 | , "aid", log.getArticleId());
67 | //timestamp of the user action
68 | HBaseClient.put(tableName, rowKey, "l"
69 | , "ts", String.valueOf(log.getTime()));
70 | //the action itself
71 | HBaseClient.put(tableName, rowKey, "l"
72 | , "act", log.getAction());
73 |
74 | logger.info(log.toString());
75 | out.collect(log);
76 | }
77 | }
78 | }
79 |
80 | }
81 |
82 |
83 |
--------------------------------------------------------------------------------
/src/main/java/com/ams/recommend/nearline/task/PortraitTask.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.nearline.task;
2 |
3 | import com.ams.recommend.client.HBaseClient;
4 | import com.ams.recommend.client.MySQLClient;
5 | import com.ams.recommend.common.pojo.Log;
6 | import com.ams.recommend.common.pojo.User;
7 | import com.ams.recommend.util.Constants;
8 | import com.ams.recommend.util.LogUtil;
9 | import com.ams.recommend.util.Property;
10 | import com.ams.recommend.util.WordTokenizerUtil;
11 | import org.apache.flink.api.common.serialization.SimpleStringSchema;
12 | import org.apache.flink.api.common.state.StateTtlConfig;
13 | import org.apache.flink.api.common.state.ValueState;
14 | import org.apache.flink.api.common.state.ValueStateDescriptor;
15 | import org.apache.flink.configuration.Configuration;
16 | import org.apache.flink.streaming.api.TimeCharacteristic;
17 | import org.apache.flink.streaming.api.datastream.DataStream;
18 | import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
19 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
20 | import org.apache.flink.streaming.api.functions.ProcessFunction;
21 | import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
22 | import org.apache.flink.streaming.api.functions.sink.SinkFunction;
23 | import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
24 | import org.apache.flink.streaming.api.windowing.time.Time;
25 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
26 | import org.apache.flink.util.Collector;
27 | import org.apache.flink.util.OutputTag;
28 |
29 | import java.sql.ResultSet;
30 |
31 | /**
32 | * 画像Task
33 | */
34 | public class PortraitTask {
35 |
36 | private final static Long TTL = 180L; //reading time (seconds) above which a user is deemed to like an article; default 3 min
37 |
38 | private static final OutputTag<Log> outputTag = new OutputTag<Log>("side-output"){};
39 |
40 | public static void main(String[] args) throws Exception {
41 | final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
42 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
43 |
44 | DataStream<String> logSource = env.addSource(new FlinkKafkaConsumer<>(
45 | "log",
46 | new SimpleStringSchema(),
47 | Property.getKafkaProperties("portrait")
48 | ));
49 |
50 | SingleOutputStreamOperator<Log> logs = logSource
51 | .process(new LogProcessFunction())
52 | .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<Log>(Time.seconds(10)) {
53 | @Override
54 | public long extractTimestamp(Log element) {
55 | return element.getTime() * 1000; //log timestamps are in seconds; Flink expects milliseconds
56 | }
57 | });
58 |
59 | logs.keyBy("articleId")
60 | .addSink(new ArticlePortraitSink());
61 |
62 | logs.getSideOutput(outputTag)
63 | .keyBy("userId") //field name must match Log.userId
64 | .addSink(new UserPortraitSink());
65 |
66 | env.execute("Portrait Task");
67 | }
68 |
69 | private static class LogProcessFunction extends ProcessFunction<String, Log> {
70 | @Override
71 | public void processElement(String log, Context ctx, Collector<Log> out) throws Exception {
72 | Log logEntry = LogUtil.toLogEntry(log);
73 | if(logEntry == null) return; //skip unparseable records
74 | out.collect(logEntry);
75 | ctx.output(outputTag, logEntry); //side-output the same entry to a second stream
76 | }
77 | }
78 |
79 | /**
80 | * Record the profile of users who like this article: sex, age, job, education, etc.
81 | */
82 | private static class ArticlePortraitSink implements SinkFunction<Log> {
83 | @Override
84 | public void invoke(Log log, Context context) throws Exception {
85 | User user = MySQLClient.getUserById(log.getUserId());
86 | String articleId = log.getArticleId();
87 | String userId = log.getUserId();
88 | //sex
89 | HBaseClient.put(Constants.ARTICLE_PORTRAIT_TABLE,
90 | articleId,
91 | "sex",
92 | userId,
93 | String.valueOf(user.getSex())
94 | );
95 | //age bracket
96 | HBaseClient.put(Constants.ARTICLE_PORTRAIT_TABLE,
97 | articleId,
98 | "age",
99 | userId,
100 | Constants.rangeAge(user.getAge())
101 | );
102 | //job
103 | HBaseClient.put(Constants.ARTICLE_PORTRAIT_TABLE,
104 | articleId,
105 | "job",
106 | userId,
107 | user.getJob()
108 | );
109 | //education
110 | HBaseClient.put(Constants.ARTICLE_PORTRAIT_TABLE,
111 | articleId,
112 | "edu",
113 | userId,
114 | user.getEducation()
115 | );
116 | }
117 | }
118 |
119 | /**
120 |  * Aggregates user portrait information:
121 |  * author (article source), channel, title, keywords
122 |  */
123 | private static class UserPortraitSink implements SinkFunction<Log> {
124 | @Override
125 | public void invoke(Log log, Context context) throws Exception {
126 | ResultSet rs = MySQLClient.getUserPortraitById(log.getArticleId());
127 | 
128 | if(rs != null && rs.next()) {
129 | //article metadata fetched from MySQL
130 | String author = rs.getString("author");
131 | int channelId = rs.getInt("channel_id");
132 | String title = rs.getString("title");
133 | String keyword = rs.getString("keyword");
134 |
135 | String userId = log.getUserId();
136 | String articleId = log.getArticleId();
137 | //author
138 | HBaseClient.put(Constants.USER_PORTRAIT_TABLE,
139 | userId,
140 | "aut",
141 | articleId,
142 | author
143 | );
144 | //channel
145 | HBaseClient.put(Constants.USER_PORTRAIT_TABLE,
146 | userId,
147 | "cha",
148 | articleId,
149 | String.valueOf(channelId)
150 | );
151 | //title (segmented into words)
152 | HBaseClient.put(Constants.USER_PORTRAIT_TABLE,
153 | userId,
154 | "tit",
155 | articleId,
156 | WordTokenizerUtil.segment(title)
157 | );
158 | //keywords
159 | HBaseClient.put(Constants.USER_PORTRAIT_TABLE,
160 | userId,
161 | "kw",
162 | articleId,
163 | keyword
164 | );
165 | }
166 | }
167 | }
168 |
169 | }
170 |
--------------------------------------------------------------------------------
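For reference, the "log" topic messages consumed by PortraitTask (and the other log-driven tasks) are plain comma-separated strings in the order userId,articleId,timestamp,action, as parsed by LogUtil.toLogEntry. A minimal sketch, not part of the repo, with made-up ids and timestamp:

public class LogMessageSketch {
    public static void main(String[] args) {
        //userId, articleId, event time (ms), action ("1" = open/view)
        String msg = "42,1001,1589000000000,1";
        com.ams.recommend.common.pojo.Log log = com.ams.recommend.util.LogUtil.toLogEntry(msg);
        System.out.println(log.getUserId() + " read article " + log.getArticleId());
    }
}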
/src/main/java/com/ams/recommend/nearline/task/UserInterestTask.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.nearline.task;
2 |
3 | import com.ams.recommend.client.HBaseClient;
4 | import com.ams.recommend.common.pojo.Log;
5 | import com.ams.recommend.util.Constants;
6 | import com.ams.recommend.util.LogUtil;
7 | import com.ams.recommend.util.Property;
8 | import org.apache.flink.api.common.functions.MapFunction;
9 | import org.apache.flink.api.common.serialization.SimpleStringSchema;
10 | import org.apache.flink.api.common.state.StateTtlConfig;
11 | import org.apache.flink.api.common.state.ValueState;
12 | import org.apache.flink.api.common.state.ValueStateDescriptor;
13 | import org.apache.flink.configuration.Configuration;
14 | import org.apache.flink.streaming.api.TimeCharacteristic;
15 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
16 | import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
17 | import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
18 | import org.apache.flink.streaming.api.windowing.time.Time;
19 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
20 |
21 | public class UserInterestTask {
22 |
23 | private final static Long LIKE_TIME = 180_000L; //reading time (ms) that marks an article as "liked"; 3 min, matching PortraitTask
24 |
25 | public static void main(String[] args) throws Exception {
26 |
27 | final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
28 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
29 | env.enableCheckpointing(5000L);
30 |
31 | FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>(
32 | "log",
33 | new SimpleStringSchema(),
34 | Property.getKafkaProperties("user-interest")
35 | );
36 |
37 | env.addSource(consumer)
38 | .map(new LogMapFunction())
39 | .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<Log>(Time.seconds(1)) {
40 | @Override
41 | public long extractTimestamp(Log element) {
42 | return element.getTime();
43 | }
44 | })
45 | .keyBy(log -> log.getUserId())
46 | .addSink(new UserInterestSinkFunction());
47 |
48 | env.execute("User Interest Task");
49 | }
50 |
51 | private static class LogMapFunction implements MapFunction<String, Log> {
52 | @Override
53 | public Log map(String value) throws Exception {
54 | Log log = LogUtil.toLogEntry(value);
55 | 
56 | //may be null for malformed lines
57 | return log;
58 | }
59 | }
60 |
61 | private static class UserInterestSinkFunction extends RichSinkFunction<Log> {
62 |
63 | private ValueState<Long> lastTimeState;
64 |
65 | @Override
66 | public void open(Configuration parameters) throws Exception {
67 | super.open(parameters);
68 | //expire state after 3 hours: if a user still has not closed the page by then, they are probably idle and the view does not count
69 | StateTtlConfig ttlConfig = StateTtlConfig
70 | .newBuilder(org.apache.flink.api.common.time.Time.hours(3))
71 | .build();
72 | 
73 | ValueStateDescriptor<Long> desc = new ValueStateDescriptor<>("Open Page time", Long.class);
74 | desc.enableTimeToLive(ttlConfig);
75 | lastTimeState = getRuntimeContext().getState(desc);
76 | }
77 |
78 | @Override
79 | public void invoke(Log log, Context context) throws Exception {
80 | //actions: 1 = open/view; 2 = like; 3 = favorite; 4 = close
81 | String op = log.getAction();
82 | Long curTime = log.getTime();
83 | Long lastTime = lastTimeState.value();
84 |
85 | if("1".equals(op)) {
86 | lastTimeState.update(curTime);
87 | }else if("4".equals(op)) {
88 | if(curTime - lastTime > LIKE_TIME) { //游览时间长,表示用户对该文章感兴趣
89 | HBaseClient.addOrUpdateColumn(Constants.USER_PORTRAIT_TABLE, log.getUserId(), "i", log.getArticleId());
90 | }
91 | lastTimeState.clear();
92 | } else if("2".equals(op) || "3".equals(op)) { //点赞收藏
93 | HBaseClient.addOrUpdateColumn(Constants.USER_PORTRAIT_TABLE, log.getUserId(), "i", log.getArticleId());
94 | lastTimeState.clear();
95 | }
96 | }
97 |
98 | }
99 |
100 | }
101 |
--------------------------------------------------------------------------------
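A quick walk-through of the sink's open/close pairing, with made-up timestamps (a sketch, not part of the repo): an action-1 event stores the open time in keyed state, and a later action-4 event marks the article as interesting only if the page stayed open longer than LIKE_TIME:

public class InterestThresholdSketch {
    public static void main(String[] args) {
        long likeTime = 180_000L;             //3 min, same as LIKE_TIME
        long openTime = 1_589_000_000_000L;   //action "1": page opened
        long closeTime = openTime + 200_000L; //action "4": closed 200 s later
        boolean interested = closeTime - openTime > likeTime;
        System.out.println("interested = " + interested); //true: 200 s > 180 s
    }
}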
/src/main/java/com/ams/recommend/nearline/task/tfidf/SpiderTask.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.nearline.task.tfidf;
2 |
3 | import com.ams.recommend.client.MySQLClient;
4 | import com.ams.recommend.common.pojo.SpiderArticle;
5 | import com.ams.recommend.nearline.task.HotArticleTask;
6 | import com.ams.recommend.util.Property;
7 | import org.apache.flink.api.common.functions.MapFunction;
8 | import org.apache.flink.api.common.serialization.SimpleStringSchema;
9 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
10 | import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
11 | import org.apache.flink.streaming.api.windowing.time.Time;
12 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
13 | import org.slf4j.Logger;
14 | import org.slf4j.LoggerFactory;
15 |
16 | public class SpiderTask {
17 |
18 | private final static Logger logger = LoggerFactory.getLogger(SpiderTask.class);
19 | private static final Integer KEYWORD_SIZE = 10; //number of keywords to keep per crawled article
20 |
21 | public static void main(String[] args) throws Exception {
22 | final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
23 | env.enableCheckpointing(5000);
24 |
25 | FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>(
26 | "spider",
27 | new SimpleStringSchema(),
28 | Property.getKafkaProperties("tf-idf")
29 | );
30 |
31 | env.addSource(consumer)
32 | .map(new SpiderMapFunction())
33 | .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<SpiderArticle>(Time.minutes(10)) {
34 | @Override
35 | public long extractTimestamp(SpiderArticle element) {
36 | logger.info("spider article watermark : " + element.getTimestamp());
37 | return element.getTimestamp();
38 | }
39 | })
40 | .map(new TFIDFMapFunction(KEYWORD_SIZE))
41 | .addSink(new TFIDFSink());
42 |
43 | env.execute("Spider for tf-idf task");
44 | }
45 |
46 |
47 | private static class SpiderMapFunction implements MapFunction<String, SpiderArticle> {
48 | @Override
49 | public SpiderArticle map(String value) throws Exception {
50 | if(value == null) throw new IllegalArgumentException("Spider message is EMPTY!");
51 |
52 | SpiderArticle article = new SpiderArticle();
53 | String[] vs = value.split(",");
54 | String articleId = vs[0];
55 | long timestamp = Long.valueOf(vs[1]);
56 |
57 | article.setArticleId(articleId);
58 | article.setTimestamp(timestamp);
59 |
60 | String content = MySQLClient.getContentById(articleId);
61 | article.setContent(content);
62 |
63 | return article;
64 | }
65 | }
66 |
67 | }
68 |
--------------------------------------------------------------------------------
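The "spider" topic uses an even simpler payload: articleId,timestamp, split on the comma in SpiderMapFunction; the article body itself is then pulled from MySQL by id. A made-up example (a sketch, not part of the repo):

public class SpiderMessageSketch {
    public static void main(String[] args) {
        String msg = "1001,1589000000000"; //articleId, crawl timestamp (ms)
        String[] vs = msg.split(",");
        System.out.println("articleId=" + vs[0] + ", timestamp=" + Long.parseLong(vs[1]));
    }
}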
/src/main/java/com/ams/recommend/nearline/task/tfidf/TFIDFMapFunction.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.nearline.task.tfidf;
2 |
3 | import com.ams.recommend.client.HBaseClient;
4 | import com.ams.recommend.common.pojo.SpiderArticle;
5 | import com.ams.recommend.util.Constants;
6 | import com.ams.recommend.util.Property;
7 | import com.ams.recommend.util.WordTokenizerUtil;
8 | import org.apache.flink.api.common.functions.MapFunction;
9 | import org.apache.flink.api.java.tuple.Tuple2;
10 |
11 | import java.util.Map;
12 | import java.util.PriorityQueue;
13 |
14 | public class TFIDFMapFunction implements MapFunction<SpiderArticle, SpiderArticle> {
15 |
16 | private final String tableName = Property.getStrValue("table.word.name");
17 | private int keywordSize;
18 | private long totalArticleSize = 1L;
19 |
20 | public TFIDFMapFunction(int keywordSize) {
21 | if(keywordSize < 1) throw new IllegalArgumentException("keywords num should not less than 1.");
22 | this.keywordSize = keywordSize;
23 |
24 | //load the current total article count
25 | String sizeStr = HBaseClient.get(tableName, "articleSize", "c", "count");
26 | if(sizeStr != null) totalArticleSize = Long.valueOf(sizeStr);
27 | }
28 |
29 | @Override
30 | public SpiderArticle map(SpiderArticle article) throws Exception {
31 | //compute the TF of every word in the article
32 | Map<String, Double> tf = WordTokenizerUtil.tf(article.getContent());
33 | article.setTfMap(tf);
34 |
35 | PriorityQueue<Tuple2<String, Double>> tfidfQueue = new PriorityQueue<>(keywordSize, (a, b) -> Double.compare(a.f1, b.f1)); //min-heap on score; Tuple2 itself is not Comparable
36 |
37 | //compute TF-IDF for each word
38 | for(String word : tf.keySet()) {
39 | //number of articles that contain this word
40 | int size = HBaseClient.getColumnSize(tableName, word, "a");
41 | if(size == 0) size = 1;
42 | Double TF = tf.get(word);
43 | Double IDF = Math.log10((double) totalArticleSize / size); //cast to avoid integer division
44 | Double tfidf = TF * IDF;
45 | tfidfQueue.add(new Tuple2<>(word, tfidf)); if(tfidfQueue.size() > keywordSize) tfidfQueue.poll(); //keep only the top keywordSize words
46 | //update the article column of this word (rowKey)
47 | HBaseClient.addOrUpdateColumn(tableName, word, "a", article.getArticleId());
48 | //store this word's tf and tfidf for the article
49 | HBaseClient.put(Constants.ARTICLE_TFIDF_TABLE, article.getArticleId(), "tf", word, String.valueOf(TF));
50 | HBaseClient.put(Constants.ARTICLE_TFIDF_TABLE, article.getArticleId(), "ti", word, String.valueOf(tfidf));
51 | }
52 | article.setTfidf(tfidfQueue);
53 | //increment the total article count
54 | HBaseClient.addOrUpdateColumn(tableName, "articleSize", "c", "count");
55 |
56 | return article;
57 | }
58 |
59 | }
60 |
--------------------------------------------------------------------------------
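A worked TF-IDF example under the same smoothing the map function uses (a sketch with made-up counts; nothing here queries HBase): with 1000 crawled articles, a word appearing in 10 of them and making up 2% of the current article scores 0.02 * log10(1000/10) = 0.04:

public class TfIdfSketch {
    public static void main(String[] args) {
        long totalArticleSize = 1000L; //total crawled articles
        int size = 10;                 //articles containing the word (floored to 1 when 0)
        double tf = 0.02;              //word frequency within this article
        double idf = Math.log10((double) totalArticleSize / size); //log10(100) = 2.0
        System.out.println("tfidf = " + tf * idf); //0.04
    }
}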
/src/main/java/com/ams/recommend/nearline/task/tfidf/TFIDFSink.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.nearline.task.tfidf;
2 |
3 | import com.ams.recommend.client.MySQLClient;
4 | import com.ams.recommend.common.pojo.SpiderArticle;
5 | import org.apache.flink.api.java.tuple.Tuple2;
6 | import org.apache.flink.streaming.api.functions.sink.SinkFunction;
7 |
8 | import java.util.PriorityQueue;
9 |
10 | public class TFIDFSink implements SinkFunction<SpiderArticle> {
11 | @Override
12 | public void invoke(SpiderArticle article, Context context) throws Exception {
13 | PriorityQueue<Tuple2<String, Double>> topKeyword = article.getTfidf();
14 | StringBuilder stringBuilder = new StringBuilder();
15 | while(!topKeyword.isEmpty()){
16 | Tuple2<String, Double> tiKV = topKeyword.poll();
17 | stringBuilder.append(tiKV.f0); //append the keyword (f0), not its score (f1)
18 | if(!topKeyword.isEmpty())
19 | stringBuilder.append(" "); //space-separate the keywords
20 | }
21 | MySQLClient.putKeywordById(article.getArticleId(), stringBuilder.toString());
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/src/main/java/com/ams/recommend/offline/ArticleCoeff.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.offline;
2 |
3 | import com.ams.recommend.client.HBaseClient;
4 | import com.ams.recommend.common.pojo.ArticlePortrait;
5 | import com.ams.recommend.util.Constants;
6 |
7 | import java.io.IOException;
8 | import java.util.List;
9 |
10 | /**
11 |  * Article-to-article similarity based on article tags
12 |  * 1. cosine-style similarity over article portrait tags
13 |  * 2. similarity over article content (keywords)
14 |  * @author jackybai
15 |  */
16 | public class ArticleCoeff {
17 | /**
18 |  * Compute the score between one article and the other articles, and write the results to HBase
19 |  * @param id article id
20 |  * @param others ids of the other articles
21 |  */
22 | public void getArticleCoeff(String id, List<String> others) throws Exception {
23 | ArticlePortrait article = singleArticle(id);
24 | for (String articleId : others) {
25 | if (id.equals(articleId)) continue;
26 | ArticlePortrait entity = singleArticle(articleId);
27 | Double score = getScore(article, entity);
28 | HBaseClient.put(Constants.ARTICLE_TAG_TABLE, id, "p", articleId, score.toString());
29 | }
30 | }
31 |
32 | /**
33 |  * Fetch all tag data of one article;
34 |  * returns a zero-filled entity if the HBase lookup fails
35 |  * @param articleId article id
36 |  * @return article tag entity
37 |  */
38 | private ArticlePortrait singleArticle(String articleId) {
39 | ArticlePortrait entity = new ArticlePortrait();
40 | try {
41 | String woman = HBaseClient.get(Constants.ARTICLE_PORTRAIT_TABLE, articleId, "sex", Constants.SEX_WOMAN);
42 | String man = HBaseClient.get(Constants.ARTICLE_PORTRAIT_TABLE, articleId, "sex", Constants.SEX_MAN);
43 | String age_10 = HBaseClient.get(Constants.ARTICLE_PORTRAIT_TABLE, articleId, "age", Constants.AGE_10);
44 | String age_20 = HBaseClient.get(Constants.ARTICLE_PORTRAIT_TABLE, articleId, "age", Constants.AGE_20);
45 | String age_30 = HBaseClient.get(Constants.ARTICLE_PORTRAIT_TABLE, articleId, "age", Constants.AGE_30);
46 | String age_40 = HBaseClient.get(Constants.ARTICLE_PORTRAIT_TABLE, articleId, "age", Constants.AGE_40);
47 | String age_50 = HBaseClient.get(Constants.ARTICLE_PORTRAIT_TABLE, articleId, "age", Constants.AGE_50);
48 | String age_60 = HBaseClient.get(Constants.ARTICLE_PORTRAIT_TABLE, articleId, "age", Constants.AGE_60);
49 | entity.setMan(Integer.valueOf(man));
50 | entity.setWoman(Integer.valueOf(woman));
51 | entity.setAge_10(Integer.valueOf(age_10));
52 | entity.setAge_20(Integer.valueOf(age_20));
53 | entity.setAge_30(Integer.valueOf(age_30));
54 | entity.setAge_40(Integer.valueOf(age_40));
55 | entity.setAge_50(Integer.valueOf(age_50));
56 | entity.setAge_60(Integer.valueOf(age_60));
57 | } catch (Exception e) {
58 | System.err.println("articleId: " + articleId);
59 | e.printStackTrace();
60 | }
61 | return entity;
62 | }
63 |
64 | /**
65 |  * Compute the similarity between two articles from their tags
66 |  * @param article the article
67 |  * @param target the article to compare against
68 |  * @return similarity score
69 |  */
70 | private double getScore(ArticlePortrait article, ArticlePortrait target) {
71 | double sqrt = Math.sqrt(article.getTotal() + target.getTotal());
72 | if (sqrt == 0) {
73 | return 0.0;
74 | }
75 | int total = article.getMan() * target.getMan() + article.getWoman() * target.getWoman()
76 | + article.getAge_10() * target.getAge_10() + article.getAge_20() * target.getAge_20()
77 | + article.getAge_30() * target.getAge_30() + article.getAge_40() * target.getAge_40()
78 | + article.getAge_50() * target.getAge_50() + article.getAge_60() * target.getAge_60();
79 | return Math.sqrt(total) / sqrt;
80 | }
81 |
82 | public void calcuSimilar(String id, List<String> others) {
83 | //strategy 3 (content-based similarity) is not implemented yet
84 | }
85 | }
86 |
--------------------------------------------------------------------------------
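A toy run of getScore(), assuming ArticlePortrait.getTotal() is the sum of an article's tag counts (the pojo is not shown in this listing, so that is an assumption): with reader profiles a = (3 male, 1 female) and b = (2 male, 2 female), the dot product is 3*2 + 1*2 = 8, and the score is sqrt(8) / sqrt(4 + 4) = 1.0:

public class TagScoreSketch {
    public static void main(String[] args) {
        int dot = 3 * 2 + 1 * 2;            //sum over matching tag buckets
        int totalA = 3 + 1, totalB = 2 + 2; //assumed getTotal() semantics
        double score = Math.sqrt(dot) / Math.sqrt(totalA + totalB);
        System.out.println("score = " + score); //1.0
    }
}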
/src/main/java/com/ams/recommend/offline/ItemCfCoeff.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.offline;
2 |
3 | import com.ams.recommend.client.HBaseClient;
4 | import com.ams.recommend.util.Constants;
5 |
6 | import java.io.IOException;
7 | import java.util.List;
8 | import java.util.Map;
9 |
10 | /**
11 |  * Item-based collaborative-filtering similarity, where N(i) is the set of users who read article i:
12 |  *         |N(i) ∩ N(j)|
13 |  * w = ---------------------
14 |  *     sqrt(|N(i)| * |N(j)|)
15 |  * @author jackybai
16 |  */
17 | public class ItemCfCoeff {
18 |
19 | /**
20 |  * Compute the score between one article and the other articles, and write the results to HBase
21 |  *
22 |  * @param id article id
23 |  * @param others ids of the other articles
24 |  */
25 | public void getSingleItemCfCoeff(String id, List<String> others) throws Exception {
26 | for (String other : others) {
27 | if(id.equals(other)) continue;
28 | Double score = twoItemCfCoeff(id, other);
29 | HBaseClient.put(Constants.ARTICLE_CF_TABLE, id, "p", other, score.toString());
30 | }
31 | }
32 |
33 | /**
34 |  * Compute the similarity score between two articles
35 |  * @param id article id
36 |  * @param other the other article id
37 |  * @return similarity score
38 |  * @throws IOException
39 |  */
40 | private double twoItemCfCoeff(String id, String other) throws IOException {
41 | Map<String, String> p1 = HBaseClient.getRow(Constants.ARTICLE_HIS_TABLE, id);
42 | Map<String, String> p2 = HBaseClient.getRow(Constants.ARTICLE_HIS_TABLE, other);
43 | 
44 | int n = p1.size();
45 | int m = p2.size();
46 | int sum = 0;
47 | double total = Math.sqrt((double) n * m);
48 | 
49 | //count the users present in both rows (the intersection)
50 | for (String key : p1.keySet()) {
51 | if (p2.containsKey(key)) {
52 | sum++;
53 | }
54 | }
55 | 
56 | if (total == 0) {
57 | return 0.0;
58 | }
59 | return sum / total;
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
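A worked example of twoItemCfCoeff() with made-up reader sets (a sketch, not part of the repo): article i was read by {u1, u2, u3} and article j by {u2, u3}, so the intersection is 2 and w = 2 / sqrt(3 * 2) ≈ 0.816:

public class ItemCfSketch {
    public static void main(String[] args) {
        int n = 3, m = 2, intersection = 2;
        double w = intersection / Math.sqrt((double) n * m);
        System.out.println("w = " + w); //≈ 0.816
    }
}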
/src/main/java/com/ams/recommend/offline/SchedulerJob.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.offline;
2 |
3 | import com.ams.recommend.client.HBaseClient;
4 | import com.ams.recommend.nearline.task.LogTask;
5 | import com.ams.recommend.util.Constants;
6 | import org.apache.flink.api.common.time.Time;
7 | import org.slf4j.Logger;
8 | import org.slf4j.LoggerFactory;
9 |
10 | import java.io.IOException;
11 | import java.util.*;
12 | import java.util.concurrent.ExecutorService;
13 | import java.util.concurrent.Executors;
14 |
15 | public class SchedulerJob {
16 | private static final Logger logger = LoggerFactory.getLogger(SchedulerJob.class);
17 | private static ExecutorService executorService = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors() + 1);
18 |
19 | /**
20 |  * Every 12 hours, recompute article scores under three recommendation strategies:
21 |  * strategy 1: collaborative filtering
22 |  * strategy 2: cosine similarity over article tags
23 |  * strategy 3: similarity over article content
24 |  */
25 | public static void main(String[] args) {
26 | Timer qTimer = new Timer();
27 | qTimer.scheduleAtFixedRate(new RefreshTask(), 0, Time.hours(12).toMilliseconds());
28 | }
29 |
30 | private static class RefreshTask extends TimerTask {
31 | @Override
32 | public void run() {
33 | logger.info(new Date() + " starting refresh task");
34 | /* fetch the ids of all articles users have read */
35 | List<String> allArticleId;
36 | try {
37 | allArticleId = HBaseClient.getAllKey(Constants.ARTICLE_HIS_TABLE);
38 | } catch (IOException e) {
39 | System.err.println("Failed to fetch history article ids: " + e.getMessage());
40 | e.printStackTrace();
41 | return;
42 | }
43 |
44 | for (String id : allArticleId) {
45 | executorService.execute(new Task(id, allArticleId));
46 | }
47 | }
48 | }
49 |
50 | private static class Task implements Runnable {
51 | private String id;
52 | private List<String> others;
53 |
54 | public Task(String id, List<String> others) {
55 | this.id = id;
56 | this.others = others;
57 | }
58 |
59 | ItemCfCoeff item = new ItemCfCoeff();
60 | ArticleCoeff article = new ArticleCoeff();
61 |
62 | @Override
63 | public void run() {
64 | try {
65 | item.getSingleItemCfCoeff(id, others); //strategy 1: collaborative filtering
66 | article.getArticleCoeff(id, others); //strategy 2: article tags
67 | article.calcuSimilar(id, others); //strategy 3: article content
68 | } catch (Exception e) {
69 | e.printStackTrace();
70 | }
71 | }
72 | }
73 |
74 | }
75 |
--------------------------------------------------------------------------------
/src/main/java/com/ams/recommend/util/Constants.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.util;
2 |
3 | public class Constants {
4 |
5 | public static final String SEX_MAN = "1";
6 | public static final String SEX_WOMAN = "0";
7 | public static final String AGE_10 = "10s";
8 | public static final String AGE_20 = "20s";
9 | public static final String AGE_30 = "30s";
10 | public static final String AGE_40 = "40s";
11 | public static final String AGE_50 = "50s";
12 | public static final String AGE_60 = "60s";
13 |
14 | public static String rangeAge(int age) {
15 | if(age < 20) return AGE_10;
16 | else if(age < 30) return AGE_20;
17 | else if(age < 40) return AGE_30;
18 | else if(age < 50) return AGE_40;
19 | else if(age < 60) return AGE_50;
20 | else return AGE_60;
21 | }
22 |
23 | /*article portrait table*/
24 | public final static String ARTICLE_PORTRAIT_TABLE = Property.getStrValue("table.portrait.article.name");
25 | /*user portrait table*/
26 | public final static String USER_PORTRAIT_TABLE = Property.getStrValue("table.portrait.user.name");
27 | /*article word table*/
28 | public final static String WORD_TABLE = Property.getStrValue("table.word.name");
29 | /*article history table*/
30 | public final static String ARTICLE_HIS_TABLE = Property.getStrValue("table.article.history.name");
31 | /*user history table*/
32 | public final static String USER_HIS_TABLE = Property.getStrValue("table.user.history.name");
33 | /*article CF similarity table*/
34 | public final static String ARTICLE_CF_TABLE = Property.getStrValue("table.article.cf.name");
35 | /*article tag similarity table*/
36 | public final static String ARTICLE_TAG_TABLE = Property.getStrValue("table.article.tag.name");
37 | /*article TF-IDF table*/
38 | public final static String ARTICLE_TFIDF_TABLE = Property.getStrValue("table.article.tfidf.name");
39 | }
40 |
--------------------------------------------------------------------------------
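A few spot checks of the age bucketing (a sketch, not part of the repo; it assumes config.properties is on the classpath, since Constants loads it at class-init time). Each threshold is exclusive, so 29 still falls in the 20s bucket:

public class RangeAgeSketch {
    public static void main(String[] args) {
        System.out.println(com.ams.recommend.util.Constants.rangeAge(19)); //"10s"
        System.out.println(com.ams.recommend.util.Constants.rangeAge(29)); //"20s"
        System.out.println(com.ams.recommend.util.Constants.rangeAge(45)); //"40s"
        System.out.println(com.ams.recommend.util.Constants.rangeAge(63)); //"60s"
    }
}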
/src/main/java/com/ams/recommend/util/LogUtil.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.util;
2 |
3 | import com.ams.recommend.common.pojo.Log;
4 | import org.jetbrains.annotations.Nullable;
5 | import org.slf4j.Logger;
6 | import org.slf4j.LoggerFactory;
7 |
8 | public class LogUtil {
9 |
10 | private static Logger logger = LoggerFactory.getLogger(LogUtil.class);
11 |
12 | @Nullable //returns null when the log line is malformed
13 | public static Log toLogEntry(String log) {
14 | logger.info(log);
15 |
16 | Log logEntry = new Log();
17 |
18 | String[] logArr = log.split(","); //log fields are comma-separated: userId,articleId,timestamp,action
19 |
20 | if(logArr.length != 4) {
21 | logger.error("Log message is malformed: " + log);
22 | return null;
23 | }
24 |
25 | logEntry.setUserId(logArr[0]);
26 | logEntry.setArticleId(logArr[1]);
27 | logEntry.setTime(Long.valueOf(logArr[2]));
28 | logEntry.setAction(logArr[3]);
29 |
30 | return logEntry;
31 | }
32 |
33 | public static String getLogRowKey(Long time) {
34 | return String.valueOf(Long.MAX_VALUE - time);
35 | }
36 |
37 | }
38 |
--------------------------------------------------------------------------------
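The reversed rowkey from getLogRowKey() exists so that HBase's lexicographic rowkey order returns the newest logs first: for realistic millisecond timestamps, Long.MAX_VALUE - time always has the same digit count (19), so a larger timestamp yields a smaller, earlier-sorting key. A quick check with made-up timestamps (a sketch, not part of the repo):

public class RowKeySketch {
    public static void main(String[] args) {
        long older = 1_589_000_000_000L;
        long newer = older + 60_000L; //one minute later
        String kOlder = String.valueOf(Long.MAX_VALUE - older);
        String kNewer = String.valueOf(Long.MAX_VALUE - newer);
        System.out.println(kNewer.compareTo(kOlder) < 0); //true: newer sorts first
    }
}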
/src/main/java/com/ams/recommend/util/Property.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.util;
2 |
3 | import java.io.IOException;
4 | import java.io.InputStream;
5 | import java.io.InputStreamReader;
6 | import java.util.Properties;
7 |
8 | public class Property {
9 |
10 | private final static String CONF_NAME = "config.properties";
11 |
12 | private static Properties contextProperties;
13 |
14 | static {
15 | InputStream in = Thread.currentThread().getContextClassLoader().getResourceAsStream(CONF_NAME);
16 | contextProperties = new Properties();
17 | try {
18 | InputStreamReader inputStreamReader = new InputStreamReader(in, "UTF-8");
19 | contextProperties.load(inputStreamReader);
20 | System.out.println("===[AMS-recommendation-system]=== config file loaded successfully");
21 | } catch (IOException e) {
22 | System.err.println("===[AMS-recommendation-system]=== failed to load config file!");
23 | e.printStackTrace();
24 | }
25 | }
26 |
27 | public static String getStrValue(String key) {
28 | return contextProperties.getProperty(key);
29 | }
30 |
31 | public static int getIntValue(String key) {
32 | if(key == null || key.isEmpty()) throw new IllegalArgumentException("Key must not be null or empty");
33 |
34 | String strValue = getStrValue(key);
35 | return Integer.parseInt(strValue);
36 | }
37 |
38 | public static Properties getKafkaProperties(String groupId) {
39 | Properties properties = new Properties();
40 | properties.setProperty("bootstrap.servers", getStrValue("kafka.bootstrap.servers"));
41 | properties.setProperty("zookeeper.connect", getStrValue("kafka.zookeeper.connect"));
42 | properties.setProperty("group.id", groupId);
43 | return properties;
44 | }
45 |
46 | }
--------------------------------------------------------------------------------
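Minimal usage of Property against the keys defined in config.properties (a sketch, not part of the repo; the values in the comments are the ones from that file):

import java.util.Properties;

public class PropertySketch {
    public static void main(String[] args) {
        String table = com.ams.recommend.util.Property.getStrValue("table.portrait.user.name"); //"user_portrait"
        Properties kafka = com.ams.recommend.util.Property.getKafkaProperties("portrait");
        System.out.println(table + " / " + kafka.getProperty("bootstrap.servers")); //"master:9092"
    }
}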
/src/main/java/com/ams/recommend/util/WordTokenizerUtil.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.util;
2 |
3 | import com.hankcs.hanlp.seg.common.Term;
4 | import com.hankcs.hanlp.tokenizer.NotionalTokenizer;
5 |
6 | import java.util.HashMap;
7 | import java.util.List;
8 | import java.util.Map;
9 |
10 | public class WordTokenizerUtil {
11 |
12 | /**
13 |  * Compute the TF (term frequency) of each word in the text
14 |  */
15 | public static Map<String, Double> tf(String content) {
16 | Map<String, Double> wc = new HashMap<>();
17 | List<Term> terms = NotionalTokenizer.segment(content);
18 | int wordSize = terms.size();
19 | System.out.println("total words: " + wordSize);
20 | 
21 | for(Term term : terms) {
22 | if(wc.containsKey(term.word)) {
23 | wc.put(term.word, wc.get(term.word) + 1.0);
24 | } else wc.put(term.word, 1.0);
25 | }
26 |
27 | Map<String, Double> tf = new HashMap<>();
28 |
29 | for(Map.Entry<String, Double> w : wc.entrySet()) {
30 | tf.put(w.getKey(), (w.getValue() / wordSize));
31 | }
32 | return tf;
33 | }
34 |
35 | /**
36 |  * Segment text and filter out stop words
37 |  */
38 | public static String segment(String text) {
39 | StringBuilder builder = new StringBuilder();
40 | for(Term term : NotionalTokenizer.segment(text)) {
41 | builder.append(term.word).append(' ');
42 | }
43 | return builder.toString();
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
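TF here is a plain ratio: occurrences of the word divided by the total token count after stop-word filtering. For a made-up four-token text where one word appears twice, its TF is 2/4 = 0.5 (a sketch, not part of the repo):

public class TfRatioSketch {
    public static void main(String[] args) {
        double occurrences = 2.0;
        int tokens = 4; //tokens left after stop-word filtering
        System.out.println("tf = " + occurrences / tokens); //0.5
    }
}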
/src/main/resources/config.properties:
--------------------------------------------------------------------------------
1 | #============== hadoop ===================
2 | hadoop.user=hadoop
3 | fs.defaultFS=hdfs://master:8020
4 | #hdfs.ha.zookeeper.quorum=XXXX-apache00.XX01,XXXX-apache01.XX01,XXXX-apache02.XX01
5 |
6 | #============== hbase ===================
7 | hbase.rootdir=hdfs://hbase:9000/hbase
8 | hbase.zookeeper.quorum=master,slave1,slave2
9 | hbase.client.scanner.timeout.period=1000
10 | hbase.rpc.timeout=3000
11 | hbase.client.ipc.pool.size=1
12 | table.log.name=log
13 | table.user.history.name=ua_history
14 | table.article.history.name=au_history
15 | table.word.name=word
16 | table.portrait.article.name=article_portrait
17 | table.portrait.user.name=user_portrait
18 | table.article.cf.name=acf
19 | table.article.tag.name=atag
20 | table.article.tfidf.name=tfidf
21 |
22 | #============== mysql config ===================
23 | mysql.url=jdbc:mysql://mysql:3306/ams?serverTimezone=GMT%2B8
24 | mysql.name=root
25 | mysql.password=123456
26 |
27 | #============== redis config ===================
28 | redis.host=ams
29 | redis.port=6371
30 | redis.db=0
31 |
32 | #============== kafka config ===================
33 | kafka.bootstrap.servers=master:9092
34 | kafka.zookeeper.connect=master:2181
35 |
36 | #============== spider ===================
37 | spider.article.size = 1
--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | ### Root logger configuration
2 | log4j.rootLogger=DEBUG,stdout,file
3 | #ERROR: serious errors, mainly program faults
4 | #WARN: ordinary warnings, e.g. a lost session
5 | #INFO: general information, e.g. login/logout
6 | #DEBUG: program debugging output
7 | log4j.additivity.org.apache=true
8 |
9 | ### Appender configuration: where log messages go
10 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
11 | #org.apache.log4j.ConsoleAppender (console)
12 | #org.apache.log4j.FileAppender (file)
13 | #org.apache.log4j.DailyRollingFileAppender (one log file per day)
14 | #org.apache.log4j.RollingFileAppender (new file once the current one reaches a size limit)
15 | #org.apache.log4j.WriterAppender (stream log messages to an arbitrary destination)
16 | #log4j.appender.error.Target=System.out
17 | ### Print logs at INFO level and above
18 | log4j.appender.stdout.threshold=INFO
19 | ### Log message layout
20 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
21 | #org.apache.log4j.HTMLLayout (HTML table layout)
22 | #org.apache.log4j.PatternLayout (flexible, pattern-based layout)
23 | #org.apache.log4j.SimpleLayout (level and message string only)
24 | #org.apache.log4j.TTCCLayout (time, thread, category, etc.)
25 | ### Conversion pattern used to format each log line
26 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
27 |
28 | #%m the message given in the code
29 | #%p the priority, i.e. DEBUG, INFO, WARN, ERROR, FATAL
30 | #%r milliseconds elapsed since application start
31 | #%c the category, usually the fully qualified class name
32 | #%t the name of the thread that produced the log event
33 | #%n a line break: "\r\n" on Windows, "\n" on Unix
34 | #%d date/time of the log event, ISO8601 by default; a format can follow, e.g. %d{yyy MMM dd HH:mm:ss , SSS}
35 | #%l location of the log event: category, thread, and line number in the code
36 | #log4j.appender.file=org.apache.log4j.RollingFileAppender
37 | log4j.appender.file=org.apache.log4j.DailyRollingFileAppender
38 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
39 | log4j.appender.file.DatePattern='.'yyyy-MM-dd-HH-mm
40 | # '.'yyyy-MM: monthly
41 | # '.'yyyy-ww: weekly
42 | # '.'yyyy-MM-dd: daily
43 | # '.'yyyy-MM-dd-a: twice a day
44 | # '.'yyyy-MM-dd-HH: hourly
45 | # '.'yyyy-MM-dd-HH-mm: every minute
46 | #log4j.appender.file.MaxFileSize=1MB
47 | ### Maximum number of rolled-over files
48 | #log4j.appender.file.MaxBackupIndex=8
49 | log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} [%-5p](%-30c{1}) [TxId : %X{PtxId} , SpanId : %X{PspanId}] [ET:%X{ENV_TYPE},AN:%X{APP_NAME},SN:%X{SERVICE_NAME},CN:%X{CONTAINER_NAME},CI:%X{CONTAINER_IP}] %m%n
50 | log4j.appender.file.Threshold=DEBUG
51 | ### Append messages to the file; false overwrites the file contents
52 | log4j.appender.file.append=true
53 | ### Log file location
54 | #log4j.appender.file.File=E:/logs/file-debug-log.log
55 | log4j.appender.file.File=logs/debug-debug.log
56 | ### One log file per day
57 | #log4j.appender.file=org.apache.log4j.DailyRollingFileAppender
58 | #log4j.appender.file.layout=org.apache.log4j.PatternLayout
59 | #log4j.appender.file.maxFileSize=100
60 | #log4j.appender.file.maxBackupIndex=5
61 | #log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} [%-5p](%-30c{1}) [TxId : %X{PtxId} , SpanId : %X{PspanId}] [ET:%X{ENV_TYPE},AN:%X{APP_NAME},SN:%X{SERVICE_NAME},CN:%X{CONTAINER_NAME},CI:%X{CONTAINER_IP}] %m%n
62 | #log4j.appender.file.Threshold=DEBUG
63 | #log4j.appender.file.append=true
64 | #log4j.appender.file.File=E:/logs/debug-log.log
--------------------------------------------------------------------------------
/src/main/test/com/ams/recommend/client/HBaseClientTest.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.client;
2 |
3 | import org.junit.Assert;
4 | import org.junit.Test;
5 |
6 | import java.util.Map;
7 |
8 | public class HBaseClientTest {
9 |
10 | @Test
11 | public void testTableApi() {
12 | HBaseClient.createTableIfNotExist("log", "l");
13 | HBaseClient.createTableIfNotExist("u_interest", "i");
14 | Assert.assertTrue(HBaseClient.existTable("log")); //create first, then assert existence
15 | }
16 |
17 | @Test
18 | public void testGetRow() {
19 | Map<String, String> kvs = HBaseClient.getRow("log", "9223372035269505076");
20 | 
21 | for(Map.Entry<String, String> kv : kvs.entrySet()) {
22 | System.out.println("column : " + kv.getKey() + ", value : " + kv.getValue());
23 | }
24 | }
25 |
26 | @Test
27 | public void testPut() {
28 | HBaseClient.put("log",
29 | "9223372035269505076",
30 | "l",
31 | "uid",
32 | "50");
33 | }
34 |
35 | @Test
36 | public void testGet() {
37 | String res = HBaseClient.get("log",
38 | "9223372035269505076",
39 | "l",
40 | "uid"
41 | );
42 | Assert.assertEquals("50", res);
43 | }
44 |
45 | }
--------------------------------------------------------------------------------
/src/main/test/com/ams/recommend/client/RedisClientTest.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.client;
2 |
3 | import org.junit.Test;
4 |
5 | import java.util.List;
6 |
7 | public class RedisClientTest {
8 |
9 | @Test
10 | public void getTest() {
11 | RedisClient client = new RedisClient();
12 | int topRange = 10;
13 | List<String> data = client.getTopList(topRange);
14 | for(int i = 0; i < topRange; i++)
15 | System.out.println(i + " : " + data.get(i));
16 | }
17 |
18 | }
19 |
20 |
--------------------------------------------------------------------------------
/src/main/test/com/ams/recommend/util/LogUtilTest.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.util;
2 |
3 | import com.ams.recommend.common.pojo.Log;
4 | import org.junit.Assert;
5 | import org.junit.Test;
6 |
7 | import java.util.Date;
8 |
9 | public class LogUtilTest {
10 |
11 | @Test
12 | public void testToLogEntry() {
13 | Log log = new Log();
14 | log.setUserId("1");
15 | log.setArticleId("1");
16 | long timestamp = new Date().getTime();
17 | log.setTime(timestamp);
18 | log.setAction("1"); //view action
19 |
20 | Assert.assertEquals(log.toString(), LogUtil.toLogEntry("1,1," + timestamp + ",1").toString());
21 | }
22 |
23 | @Test
24 | public void rowKey() {
25 | long timestamp = new Date().getTime();
26 | Assert.assertEquals(String.valueOf(Long.MAX_VALUE - timestamp), LogUtil.getLogRowKey(timestamp));
27 | }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/src/main/test/com/ams/recommend/util/NotionalTokenizer.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.util;
2 | import com.hankcs.hanlp.HanLP;
3 | import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;
4 | import com.hankcs.hanlp.dictionary.stopword.Filter;
5 | import com.hankcs.hanlp.seg.Segment;
6 | import com.hankcs.hanlp.seg.common.Term;
7 |
8 | import java.util.List;
9 | import java.util.ListIterator;
10 |
11 | public class NotionalTokenizer
12 | {
13 | /**
14 |  * Preconfigured segmenter
15 |  */
16 | static final Segment SEGMENT = HanLP.newSegment();
17 | 
18 | public static List<Term> segment(String text)
19 | {
20 | return segment(text.toCharArray());
21 | }
22 |
23 | /**
24 |  * Segment text into terms
25 |  *
26 |  * @param text the text
27 |  * @return segmentation result
28 |  */
29 | public static List<Term> segment(char[] text)
30 | {
31 | List<Term> resultList = SEGMENT.seg(text);
32 | ListIterator<Term> listIterator = resultList.listIterator();
33 | while (listIterator.hasNext())
34 | {
35 | if (!CoreStopWordDictionary.shouldInclude(listIterator.next()))
36 | {
37 | listIterator.remove();
38 | }
39 | }
40 |
41 | return resultList;
42 | }
43 |
44 | /**
45 |  * Segment text into sentences of terms
46 |  *
47 |  * @param text
48 |  * @return
49 |  */
50 | public static List<List<Term>> seg2sentence(String text)
51 | {
52 | List<List<Term>> sentenceList = SEGMENT.seg2sentence(text);
53 | for (List<Term> sentence : sentenceList)
54 | {
55 | ListIterator<Term> listIterator = sentence.listIterator();
56 | while (listIterator.hasNext())
57 | {
58 | if (!CoreStopWordDictionary.shouldInclude(listIterator.next()))
59 | {
60 | listIterator.remove();
61 | }
62 | }
63 | }
64 |
65 | return sentenceList;
66 | }
67 |
68 | /**
69 |  * Segment text into sentences of terms
70 |  *
71 |  * @param text
72 |  * @param filterArrayChain custom filter chain
73 |  * @return
74 |  */
75 | public static List<List<Term>> seg2sentence(String text, Filter... filterArrayChain)
76 | {
77 | List<List<Term>> sentenceList = SEGMENT.seg2sentence(text);
78 | for (List<Term> sentence : sentenceList)
79 | {
80 | ListIterator<Term> listIterator = sentence.listIterator();
81 | while (listIterator.hasNext())
82 | {
83 | if (filterArrayChain != null)
84 | {
85 | Term term = listIterator.next();
86 | for (Filter filter : filterArrayChain)
87 | {
88 | if (!filter.shouldInclude(term))
89 | {
90 | listIterator.remove();
91 | break;
92 | }
93 | }
94 | }
95 | }
96 | }
97 |
98 | return sentenceList;
99 | }
100 |
101 | public static void main(String[] args) {
102 | System.out.println(
103 | NotionalTokenizer.segment("Hi,大家好,这里是本人的新博客基地," +
104 | "之前的博客是在CSDN平台,由于各种原因长时间没有更新,同时想要有更加独立的," +
105 | "更简洁界面的博客,因此将地址转到本站。"));
106 | }
107 | }
--------------------------------------------------------------------------------
/src/main/test/com/ams/recommend/util/TFTest.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.util;
2 |
3 | import com.ams.recommend.util.WordTokenizerUtil;
4 |
5 | import java.io.File;
6 | import java.io.FileNotFoundException;
7 | import java.util.*;
8 |
9 | public class TFTest {
10 |
11 | public static void main(String[] args) throws FileNotFoundException {
12 | StringBuilder sb = new StringBuilder();
13 |
14 | Scanner in = new Scanner(
15 | new File("/media/baith/123b86d4-6a94-41c8-994f-5786ea4c760c/download/bi.txt")
16 | );
17 |
18 | while(in.hasNext()) {
19 | sb.append(in.next());
20 | }
21 |
22 | Map<String, Double> tfs = WordTokenizerUtil.tf(sb.toString());
23 | List<Map.Entry<String, Double>> tflist = new LinkedList<>();
24 | tflist.addAll(tfs.entrySet());
25 | Collections.sort(tflist, (o1, o2) -> {
26 | if(o1.getValue() > o2.getValue()) return -1;
27 | else if(o1.getValue() < o2.getValue()) return 1;
28 | else return 0;
29 | });
30 |
31 | for(Map.Entry<String, Double> tf : tflist) {
32 | System.out.println(tf.getKey() + " : " + tf.getValue());
33 | }
34 | }
35 |
36 | }
37 |
--------------------------------------------------------------------------------