├── .gitignore
├── LICENSE
├── README.md
├── doc
│   ├── doc.md
│   ├── kafka
│   │   └── generator.sh
│   ├── 功能.png
│   ├── 系统整体架构.png
│   └── 设计思路.png
├── pom.xml
└── src
    ├── main
    │   ├── java
    │   │   └── com
    │   │       └── ams
    │   │           └── recommend
    │   │               ├── client
    │   │               │   ├── HBaseClient.java
    │   │               │   ├── MySQLClient.java
    │   │               │   └── RedisClient.java
    │   │               ├── common
    │   │               │   └── pojo
    │   │               │       ├── ArticlePortrait.java
    │   │               │       ├── HotArticle.java
    │   │               │       ├── Log.java
    │   │               │       ├── SpiderArticle.java
    │   │               │       ├── User.java
    │   │               │       └── WindowedArticle.java
    │   │               ├── nearline
    │   │               │   └── task
    │   │               │       ├── HistoryTask.java
    │   │               │       ├── HotArticleTask.java
    │   │               │       ├── LogTask.java
    │   │               │       ├── PortraitTask.java
    │   │               │       ├── UserInterestTask.java
    │   │               │       └── tfidf
    │   │               │           ├── SpiderTask.java
    │   │               │           ├── TFIDFMapFunction.java
    │   │               │           └── TFIDFSink.java
    │   │               ├── offline
    │   │               │   ├── ArticleCoeff.java
    │   │               │   ├── ItemCfCoeff.java
    │   │               │   └── SchedulerJob.java
    │   │               └── util
    │   │                   ├── Constants.java
    │   │                   ├── LogUtil.java
    │   │                   ├── Property.java
    │   │                   └── WordTokenizerUtil.java
    │   └── resources
    │       ├── config.properties
    │       └── log4j.properties
    └── test
        └── com
            └── ams
                └── recommend
                    ├── client
                    │   ├── HBaseClientTest.java
                    │   └── RedisClientTest.java
                    └── util
                        ├── LogUtilTest.java
                        ├── NotionalTokenizer.java
                        └── TFTest.java

/.gitignore:
--------------------------------------------------------------------------------
1 | .gradle
2 | /build/
3 | !gradle/wrapper/gradle-wrapper.jar
4 | 
5 | ### STS ###
6 | .apt_generated
7 | .classpath
8 | .factorypath
9 | .project
10 | .settings
11 | .springBeans
12 | 
13 | ### IntelliJ IDEA ###
14 | .idea
15 | *.iws
16 | *.iml
17 | *.ipr
18 | 
19 | ### NetBeans ###
20 | nbproject/private/
21 | build/
22 | nbbuild/
23 | dist/
24 | nbdist/
25 | .nb-gradle/
26 | /bin/
27 | 
28 | /logs/
29 | /target/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 | 
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 | 
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# AMS-recommendation-system
AMS real-time recommendation system

## AMS design approach

![](doc/设计思路.png)

## System architecture

![](doc/系统整体架构.png)

The recommendation system's data originates on the web side and includes user behavior logs, unprocessed article content, and similar inputs. Apache Kafka queues this data and feeds it to the downstream recommendation engine, which decouples the business system from the recommendation system and smooths out traffic peaks, improving resilience under load. The near-line engine is built on Apache Flink, whose high throughput and low latency allow events to be processed efficiently in real time: portrait construction, log ETL, real-time hot-article computation, and more. The offline engine, for now, uses multiple concurrent threads to compute similar articles, similar users, and so on reasonably efficiently. Finally, the engine's results are stored in HBase, whose high-throughput real-time reads and writes match the system's latency requirements.

## System features

![](doc/功能.png)

--------------------------------------------------------------------------------
/doc/doc.md:
--------------------------------------------------------------------------------
# AMS recommendation system: improvement ideas

- User behavior logs [online layer]

  - Clicks, views, likes, favorites, comments, etc. update the user portrait in real time as events and are handed to the recommendation engine

- A/B bucketing [online layer]
  - A/B split testing

- User interest -> context-based recommendation logic [near-line layer]
  - A user who is interested in an article will likely stay on the page for a while; if the gap between the close time and the open time exceeds 3 minutes, the user is judged to be interested in the article

- Real-time recommendation from user behavior (u2i2i) [near-line layer]
  - When a user likes or shows interest in an article, recommend articles similar to the current one

- Similar-article recommendation (i2i) -> content-based recommendation [offline layer]
  - Extract each article's keywords with TF-IDF, embed the top 10 keywords with Word2Vec from Alink, then train an ALS model

- User portrait [offline layer]
  - Tag each user in real time from three article attributes (author, article category, keywords)

- Article portrait [offline layer]
  - Tag each article in real time from user attributes (age, sex, article categories of interest)

--------------------------------------------------------------------------------
/doc/kafka/generator.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Generate log records automatically
function create_kafka_topic {
    $KAFKA_HOME/bin/kafka-topics.sh --create --zookeeper master:2181 --replication-factor 1 --partitions 1 --topic $1
}

function send_messages_to_kafka {
    msg=$(generator_message)
    echo -e $msg | $KAFKA_HOME/bin/kafka-console-producer.sh --broker-list master:9092 --topic $TOPIC
}

function rand {
    min=$1
    max=$(($2-$min+1))
    num=$(date +%s%N)
    echo $(($num%$max+$min))
}

function generator_message {
    userId=$(rand 1 100);
    articleId=$(rand 1 10);
    timestamp=`date '+%s'`;
    action=1;
    msg=$userId","$articleId","$timestamp","$action;
    echo $msg
}

TOPIC="log"
create_kafka_topic $TOPIC
while true
do
    send_messages_to_kafka
    sleep 0.1
done

--------------------------------------------------------------------------------
/doc/功能.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xxubai/ams-recommendation-system/ec22e41e9bb8120d6cc6a105eaf693274baca2f7/doc/功能.png

--------------------------------------------------------------------------------
/doc/系统整体架构.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xxubai/ams-recommendation-system/ec22e41e9bb8120d6cc6a105eaf693274baca2f7/doc/系统整体架构.png

--------------------------------------------------------------------------------
/doc/设计思路.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xxubai/ams-recommendation-system/ec22e41e9bb8120d6cc6a105eaf693274baca2f7/doc/设计思路.png

--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | 19 | 21 | 4.0.0 22 | 23 |
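<!-- Build properties used by this pom (values listed below): UTF-8 source
     encoding, flink.version 1.10.0, hbase.version 2.1.8, java.version 1.8,
     scala.binary.version 2.11; the ${flink.version} and ${java.version}
     references in the dependency and plugin sections resolve against these. -->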
ams-runtime-recommend-system 24 | ams-runtime-recommend-system 25 | 0.1 26 | jar 27 | 28 | AMS Runtime Recommend System 29 | http://www.myorganization.org 30 | 31 | 32 | UTF-8 33 | 1.10.0 34 | 2.1.8 35 | 1.8 36 | 2.11 37 | ${java.version} 38 | ${java.version} 39 | 40 | 41 | 42 | 43 | apache.snapshots 44 | Apache Development Snapshot Repository 45 | https://repository.apache.org/content/repositories/snapshots/ 46 | 47 | false 48 | 49 | 50 | true 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | org.apache.flink 60 | flink-java 61 | ${flink.version} 62 | 63 | 64 | org.apache.flink 65 | flink-streaming-java_2.11 66 | ${flink.version} 67 | 68 | 69 | org.apache.flink 70 | flink-clients_2.11 71 | ${flink.version} 72 | 73 | 74 | 75 | 76 | org.apache.flink 77 | flink-connector-kafka_2.11 78 | ${flink.version} 79 | 80 | 81 | org.apache.flink 82 | flink-connector-filesystem_2.11 83 | ${flink.version} 84 | 85 | 86 | 87 | 88 | 89 | org.slf4j 90 | slf4j-log4j12 91 | 1.7.7 92 | runtime 93 | 94 | 100 | 101 | 102 | 103 | org.apache.hbase 104 | hbase-client 105 | ${hbase.version} 106 | 107 | 108 | org.apache.hbase 109 | hbase-server 110 | ${hbase.version} 111 | 112 | 113 | redis.clients 114 | jedis 115 | 3.0.1 116 | 117 | 118 | org.apache.flink 119 | flink-connector-redis_2.10 120 | 1.1.5 121 | 122 | 123 | org.apache.flink 124 | flink-jdbc 125 | 1.6.1 126 | 127 | 128 | mysql 129 | mysql-connector-java 130 | 8.0.28 131 | 132 | 133 | com.alibaba 134 | druid 135 | 1.1.10 136 | 137 | 138 | org.jetbrains 139 | annotations 140 | RELEASE 141 | compile 142 | 143 | 144 | 145 | junit 146 | junit 147 | 4.13.1 148 | test 149 | 150 | 151 | 152 | 153 | com.hankcs 154 | hanlp 155 | portable-1.7.5 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | org.apache.maven.plugins 166 | maven-compiler-plugin 167 | 3.1 168 | 169 | ${java.version} 170 | ${java.version} 171 | 172 | 173 | 174 | 175 | 176 | 177 | org.apache.maven.plugins 178 | maven-shade-plugin 179 | 3.1.1 180 | 181 | 182 | 183 | package 184 | 185 | shade 186 | 187 | 188 | 189 | 190 | org.apache.flink:force-shading 191 | com.google.code.findbugs:jsr305 192 | org.slf4j:* 193 | log4j:* 194 | 195 | 196 | 197 | 198 | 200 | *:* 201 | 202 | META-INF/*.SF 203 | META-INF/*.DSA 204 | META-INF/*.RSA 205 | 206 | 207 | 208 | 209 | 210 | com.ams.recommend.StreamingJob 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | org.eclipse.m2e 225 | lifecycle-mapping 226 | 1.0.0 227 | 228 | 229 | 230 | 231 | 232 | org.apache.maven.plugins 233 | maven-shade-plugin 234 | [3.1.1,) 235 | 236 | shade 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | org.apache.maven.plugins 246 | maven-compiler-plugin 247 | [3.1,) 248 | 249 | testCompile 250 | compile 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | -------------------------------------------------------------------------------- /src/main/java/com/ams/recommend/client/HBaseClient.java: -------------------------------------------------------------------------------- 1 | package com.ams.recommend.client; 2 | 3 | import com.ams.recommend.util.Property; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.hbase.Cell; 6 | import org.apache.hadoop.hbase.HBaseConfiguration; 7 | import org.apache.hadoop.hbase.TableName; 8 | import org.apache.hadoop.hbase.client.*; 9 | import org.apache.hadoop.hbase.util.Bytes; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | 13 | import java.io.IOException; 14 | import java.util.ArrayList; 15 
| import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class HBaseClient {

    private static final Logger logger = LoggerFactory.getLogger(HBaseClient.class);

    private static Configuration conf;

    static {
        conf = HBaseConfiguration.create();
        conf.set("hbase.rootdir", Property.getStrValue("hbase.rootdir"));
        conf.set("hbase.zookeeper.quorum", Property.getStrValue("hbase.zookeeper.quorum"));
        conf.set("hbase.client.scanner.timeout.period", Property.getStrValue("hbase.client.scanner.timeout.period"));
        conf.set("hbase.rpc.timeout", Property.getStrValue("hbase.rpc.timeout"));
        conf.set("hbase.client.ipc.pool.size", Property.getStrValue("hbase.client.ipc.pool.size"));
    }

    /**
     * Check whether a table exists.
     * @param tableName
     * @return
     */
    public static boolean existTable(String tableName) {
        boolean exist = false;
        try(Connection conn = ConnectionFactory.createConnection(conf)) {
            Admin admin = conn.getAdmin();
            exist = admin.tableExists(TableName.valueOf(tableName));
            admin.close();
        }catch (IOException e) {
            e.printStackTrace();
        }
        return exist;
    }

    /**
     * Create a new table if it does not exist yet.
     * @param tableName
     */
    public static void createTableIfNotExist(String tableName, String... family) {
        if(!existTable(tableName)) createOrOverwriteTable(tableName, family);
        else {
            logger.warn("Table : " + tableName + " already exists");
            return;
        }
    }

    /**
     * Create a table, replacing any existing table of the same name.
     * @param tableName table name
     */
    public static void createOrOverwriteTable(String tableName, String... cfs) {
        try(Connection conn = ConnectionFactory.createConnection(conf)) {
            Admin admin = conn.getAdmin();
            TableName tName = TableName.valueOf(tableName);

            if(admin.tableExists(tName)) {
                admin.disableTable(tName); //disable the table first...
                admin.deleteTable(tName);  //...then drop it, so it can be recreated below
            }

            List<ColumnFamilyDescriptor> columnFamilyDescriptors = new ArrayList<>();
            for (String cf : cfs) {
                columnFamilyDescriptors.add(ColumnFamilyDescriptorBuilder
                        .newBuilder(Bytes.toBytes(cf))
                        .build());
            }

            TableDescriptor tableDescriptor = TableDescriptorBuilder
                    .newBuilder(tName)
                    .setColumnFamilies(columnFamilyDescriptors) //attach the column families
                    .build();
            admin.createTable(tableDescriptor); //create the table

            admin.close(); //close promptly
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Insert a single cell.
     * @param tableName
     * @param rowKey
     * @param family
     * @param column
     * @param value
     */
    public static void put(String tableName, String rowKey, String family, String column, String value) {
        TableName tName = TableName.valueOf(tableName);
        try(Connection conn = ConnectionFactory.createConnection(conf)) {
            Table table = conn.getTable(tName);
            Put put = new Put(Bytes.toBytes(rowKey))
                    .addColumn(Bytes.toBytes(family), Bytes.toBytes(column), Bytes.toBytes(value));
            table.put(put);

            table.close(); //close promptly
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Increment the counter stored in the given cell, treating a missing cell as 0.
     */
    public static void addOrUpdateColumn(String tableName, String rowKey, String family, String column) {
        String count = get(tableName, rowKey, family, column);
        if(count == null) count = "0";

        put(tableName, rowKey, family, column, String.valueOf(Long.valueOf(count) + 1));
    }

    public static String get(String
tableName, String rowKey, String family, String column) { 126 | String res = null; 127 | TableName tName = TableName.valueOf(tableName); 128 | try(Connection conn = ConnectionFactory.createConnection(conf)) { 129 | Table table = conn.getTable(tName); 130 | Get get = new Get(Bytes.toBytes(rowKey)) 131 | .addColumn(Bytes.toBytes(family), Bytes.toBytes(column)); 132 | Result rs = table.get(get); 133 | res = Bytes.toString(rs.getValue(Bytes.toBytes(family), Bytes.toBytes(column))); 134 | table.close(); 135 | } catch (IOException e) { 136 | e.printStackTrace(); 137 | } 138 | return res; 139 | } 140 | 141 | /** 142 | * 获取一整行 143 | * @return 144 | */ 145 | public static Map getRow(String tableName, String rowKey) { 146 | Map kv = new HashMap<>(); 147 | TableName tName = TableName.valueOf(tableName); 148 | try(Connection conn = ConnectionFactory.createConnection(conf)) { 149 | Table table = conn.getTable(tName); 150 | Get get = new Get(Bytes.toBytes(rowKey)); 151 | Result rs = table.get(get); 152 | for (Cell cell : rs.listCells()){ 153 | String key = Bytes.toString(cell.getQualifierArray(),cell.getQualifierOffset(),cell.getQualifierLength()); 154 | String value = Bytes.toString(cell.getValueArray(),cell.getValueOffset(),cell.getValueLength()); 155 | kv.put(key, value); 156 | } 157 | table.close(); 158 | } catch (IOException e) { 159 | e.printStackTrace(); 160 | } 161 | return kv; 162 | } 163 | 164 | public static int getColumnSize(String tableName, String rowKey, String family) { 165 | int size = 0; 166 | TableName tName = TableName.valueOf(tableName); 167 | try(Connection conn = ConnectionFactory.createConnection(conf)) { 168 | Table table = conn.getTable(tName); 169 | Get get = new Get(Bytes.toBytes(rowKey)); 170 | Result rs = table.get(get); 171 | 172 | if(rs.isEmpty()) return 0; 173 | 174 | Map familyMap = rs.getFamilyMap(Bytes.toBytes(family)); 175 | size = familyMap.keySet().size(); 176 | table.close(); 177 | }catch (IOException e) { 178 | e.printStackTrace(); 179 | } 180 | return size; 181 | } 182 | 183 | public static void createRow(String tableName, String rowKey, String c, String count, String value) { 184 | TableName tName = TableName.valueOf(tableName); 185 | try(Connection conn = ConnectionFactory.createConnection(conf)) { 186 | Table table = conn.getTable(tName); 187 | Put put = new Put(Bytes.toBytes(rowKey)) 188 | .addColumn(Bytes.toBytes(c), Bytes.toBytes(count), Bytes.toBytes(value)); 189 | table.put(put); 190 | table.close(); 191 | }catch (IOException e) { 192 | e.printStackTrace(); 193 | } 194 | } 195 | 196 | 197 | /** 198 | * 取出表中所有的key 199 | * @param tableName 200 | * @return 201 | */ 202 | public static List getAllKey(String tableName) throws IOException { 203 | List keys = new ArrayList<>(); 204 | try(Connection conn = ConnectionFactory.createConnection(conf)) { 205 | Scan scan = new Scan(); 206 | Table table = conn.getTable(TableName.valueOf(tableName)); 207 | ResultScanner scanner = table.getScanner(scan); 208 | for (Result r : scanner) { 209 | keys.add(new String(r.getRow())); 210 | } 211 | }catch (IOException e) { 212 | e.printStackTrace(); 213 | } 214 | return keys; 215 | } 216 | } 217 | -------------------------------------------------------------------------------- /src/main/java/com/ams/recommend/client/MySQLClient.java: -------------------------------------------------------------------------------- 1 | package com.ams.recommend.client; 2 | 3 | import com.alibaba.druid.pool.DruidDataSource; 4 | import com.ams.recommend.common.pojo.User; 5 | import 
com.ams.recommend.util.Property;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.Map;

public class MySQLClient {

    private final static DruidDataSource dataSource;

    static {
        dataSource = new DruidDataSource();
        dataSource.setUrl(Property.getStrValue("mysql.url"));
        dataSource.setUsername(Property.getStrValue("mysql.name"));
        dataSource.setPassword(Property.getStrValue("mysql.password"));
    }

    /**
     * Look up an article's content by its id.
     * @param articleId article id
     * @return article content, or an empty string if the article does not exist
     */
    public static String getContentById(String articleId) {
        String content = "";
        try(Connection conn = dataSource.getConnection()) {
            PreparedStatement pst = conn.prepareStatement("SELECT content FROM article WHERE id = ?");
            pst.setString(1, articleId);
            ResultSet rs = pst.executeQuery();
            if(rs.next()) content = rs.getString("content"); //move the cursor to the first row before reading

            pst.close();
        }catch (SQLException e) {
            e.printStackTrace();
        }
        return content;
    }

    /**
     * Look up the article attributes used to build the user portrait, by article id.
     * The row is copied into a Map before the connection closes, since a ResultSet
     * cannot be read after its connection has been returned to the pool.
     * @param articleId article id
     * @return article attributes (author, channel_id, title, keyword), or null if absent
     */
    public static Map<String, String> getUserPortraitById(String articleId) {
        Map<String, String> article = null;
        try(Connection conn = dataSource.getConnection()) {
            PreparedStatement pst = conn.prepareStatement("SELECT author, channel_id, title, keyword FROM article WHERE id = ?");
            pst.setString(1, articleId);
            ResultSet rs = pst.executeQuery();
            if(rs.next()) {
                article = new HashMap<>();
                article.put("author", rs.getString("author"));
                article.put("channel_id", String.valueOf(rs.getInt("channel_id")));
                article.put("title", rs.getString("title"));
                article.put("keyword", rs.getString("keyword"));
            }

            pst.close();
        }catch (SQLException e) {
            e.printStackTrace();
        }
        return article;
    }

    /**
     * Look up all user attributes by user id.
     * @param userId user id
     * @return user attributes, or null if the user does not exist
     */
    public static User getUserById(String userId) {
        User user = null;
        try(Connection conn = dataSource.getConnection()) {
            PreparedStatement pst = conn.prepareStatement("SELECT * FROM user WHERE id = ?");
            pst.setString(1, userId);
            ResultSet rs = pst.executeQuery();
            if(rs.next()) { //executeQuery never returns null; check for a row instead
                user = new User(); //materialize the user before populating it
                user.setUserId(userId);
                user.setSex(rs.getInt("sex"));
                user.setAge(rs.getInt("age"));
                user.setJob(rs.getString("job"));
                user.setEducation(rs.getString("education"));
            }

            pst.close();
        }catch (SQLException e) {
            e.printStackTrace();
        }
        return user;
    }

    public static void putKeywordById(String id, String keyword) {
        try(Connection conn = dataSource.getConnection()) {
            //update the keyword column of an existing article row
            PreparedStatement pst = conn.prepareStatement("UPDATE article SET keyword = ?
WHERE id = ?"); 92 | pst.setString(1, keyword); 93 | pst.setString(2, id); 94 | pst.executeUpdate(); 95 | 96 | pst.close(); 97 | }catch (SQLException e) { 98 | e.printStackTrace(); 99 | } 100 | } 101 | 102 | } 103 | -------------------------------------------------------------------------------- /src/main/java/com/ams/recommend/client/RedisClient.java: -------------------------------------------------------------------------------- 1 | package com.ams.recommend.client; 2 | 3 | 4 | import com.ams.recommend.util.Property; 5 | import redis.clients.jedis.Jedis; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | public class RedisClient { 11 | private static Jedis jedis; 12 | 13 | static { 14 | jedis = new Jedis(Property.getStrValue("redis.host"), Property.getIntValue("redis.port")); 15 | jedis.select(Property.getIntValue("redis.db")); 16 | } 17 | 18 | /** 19 | * 获取redis中对应的值 20 | * @param key 建 21 | * @return 值 22 | */ 23 | public String getData(String key){ 24 | return jedis.get(key); 25 | } 26 | 27 | /** 28 | * 获取热榜文章 29 | * @param topRange 热门文章数 30 | * @return 热门文章id 31 | */ 32 | public List getTopList(int topRange){ 33 | List res = new ArrayList<>(); 34 | for (int i = 0; i < topRange; i++) { 35 | res.add(getData(String.valueOf(i))); 36 | } 37 | return res; 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/com/ams/recommend/common/pojo/ArticlePortrait.java: -------------------------------------------------------------------------------- 1 | package com.ams.recommend.common.pojo; 2 | 3 | public class ArticlePortrait { 4 | 5 | private int man; 6 | private int woman; 7 | 8 | private int age_10; 9 | private int age_20; 10 | private int age_30; 11 | private int age_40; 12 | private int age_50; 13 | private int age_60; 14 | 15 | public int getMan() { 16 | return man; 17 | } 18 | 19 | public void setMan(int man) { 20 | this.man = man; 21 | } 22 | 23 | public int getWoman() { 24 | return woman; 25 | } 26 | 27 | public void setWoman(int woman) { 28 | this.woman = woman; 29 | } 30 | 31 | public int getAge_10() { 32 | return age_10; 33 | } 34 | 35 | public void setAge_10(int age_10) { 36 | this.age_10 = age_10; 37 | } 38 | 39 | public int getAge_20() { 40 | return age_20; 41 | } 42 | 43 | public void setAge_20(int age_20) { 44 | this.age_20 = age_20; 45 | } 46 | 47 | public int getAge_30() { 48 | return age_30; 49 | } 50 | 51 | public void setAge_30(int age_30) { 52 | this.age_30 = age_30; 53 | } 54 | 55 | public int getAge_40() { 56 | return age_40; 57 | } 58 | 59 | public void setAge_40(int age_40) { 60 | this.age_40 = age_40; 61 | } 62 | 63 | public int getAge_50() { 64 | return age_50; 65 | } 66 | 67 | public void setAge_50(int age_50) { 68 | this.age_50 = age_50; 69 | } 70 | 71 | public int getAge_60() { 72 | return age_60; 73 | } 74 | 75 | public void setAge_60(int age_60) { 76 | this.age_60 = age_60; 77 | } 78 | 79 | 80 | public int getTotal(){ 81 | int ret = 0; 82 | ret += (man*man) + (woman*woman) + (age_10*age_10) + (age_20*age_20) + (age_30*age_30) + (age_40*age_40) + 83 | (age_50*age_50) + (age_60*age_60); 84 | return ret; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/main/java/com/ams/recommend/common/pojo/HotArticle.java: -------------------------------------------------------------------------------- 1 | package com.ams.recommend.common.pojo; 2 | 3 | public class HotArticle { 4 | 5 | private String articleId; //文章id 6 | private long 
pvCount; //文章游览量 7 | private long windowEnd; //窗口结束时间戳 8 | private int rank; //热度榜名次 9 | 10 | public HotArticle() { 11 | } 12 | 13 | public HotArticle(String articleId, long pvCount, long windowEnd, int rank) { 14 | this.articleId = articleId; 15 | this.pvCount = pvCount; 16 | this.windowEnd = windowEnd; 17 | this.rank = rank; 18 | } 19 | 20 | public HotArticle(String articleId, long pvCount, long windowEnd) { 21 | this.articleId = articleId; 22 | this.pvCount = pvCount; 23 | this.windowEnd = windowEnd; 24 | this.rank = 0; 25 | } 26 | 27 | public String getArticleId() { 28 | return articleId; 29 | } 30 | 31 | public void setArticleId(String articleId) { 32 | this.articleId = articleId; 33 | } 34 | 35 | public long getPvCount() { 36 | return pvCount; 37 | } 38 | 39 | public void setPvCount(long pvCount) { 40 | this.pvCount = pvCount; 41 | } 42 | 43 | public long getWindowEnd() { 44 | return windowEnd; 45 | } 46 | 47 | public void setWindowEnd(long windowEnd) { 48 | this.windowEnd = windowEnd; 49 | } 50 | 51 | public int getRank() { 52 | return rank; 53 | } 54 | 55 | public void setRank(int rank) { 56 | this.rank = rank; 57 | } 58 | 59 | @Override 60 | public String toString() { 61 | return "HotArticle : " + 62 | "articleId='" + articleId + '\'' + 63 | ", pvCount=" + pvCount + 64 | ", windowEnd=" + windowEnd + 65 | ", rank=" + rank; 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/com/ams/recommend/common/pojo/Log.java: -------------------------------------------------------------------------------- 1 | package com.ams.recommend.common.pojo; 2 | 3 | /** 4 | * 日志实体 5 | */ 6 | public class Log { 7 | 8 | private String userId; 9 | private String articleId; 10 | private Long time; 11 | private String action; 12 | 13 | public String getUserId() { return userId; } 14 | 15 | public void setUserId(String userId) { this.userId = userId; } 16 | 17 | public String getArticleId() { return articleId; } 18 | 19 | public void setArticleId(String articleId) { 20 | this.articleId = articleId; 21 | } 22 | 23 | public Long getTime() { 24 | return time; 25 | } 26 | 27 | public void setTime(Long time) { 28 | this.time = time; 29 | } 30 | 31 | public String getAction() { 32 | return action; 33 | } 34 | 35 | public void setAction(String action) { 36 | this.action = action; 37 | } 38 | 39 | @Override 40 | public String toString() { 41 | return "Log" + 42 | "userId='" + userId + '\'' + 43 | ", articleId='" + articleId + '\'' + 44 | ", time=" + time + 45 | ", action='" + action + '\''; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/com/ams/recommend/common/pojo/SpiderArticle.java: -------------------------------------------------------------------------------- 1 | package com.ams.recommend.common.pojo; 2 | 3 | import org.apache.flink.api.java.tuple.Tuple2; 4 | 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | import java.util.PriorityQueue; 8 | 9 | public class SpiderArticle { 10 | 11 | private String articleId; 12 | private long timestamp; 13 | private String content; 14 | private Map tfMap; 15 | private PriorityQueue> tfidf; 16 | 17 | public SpiderArticle() { 18 | tfMap = new HashMap<>(); 19 | tfidf = new PriorityQueue(); 20 | } 21 | 22 | public SpiderArticle(String articleId, long timestamp, String content) { 23 | this.articleId = articleId; 24 | this.timestamp = timestamp; 25 | this.content = content; 26 | tfMap = new HashMap<>(); 27 | tfidf = new PriorityQueue(); 28 | } 
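    /**
     * Illustrative sketch only: how the term-frequency map could be filled
     * before the TF-IDF step. The real computation lives in TFIDFMapFunction
     * (not shown in this section); this assumes WordTokenizerUtil.segment(...)
     * returns a space-separated token string.
     */
    public void computeTermFrequencies() {
        if (content == null || content.isEmpty()) return;
        String[] tokens = com.ams.recommend.util.WordTokenizerUtil.segment(content).split("\\s+");
        Map<String, Double> counts = new HashMap<>();
        for (String token : tokens) {
            counts.merge(token, 1.0, Double::sum); //raw occurrence count per term
        }
        for (Map.Entry<String, Double> e : counts.entrySet()) {
            tfMap.put(e.getKey(), e.getValue() / tokens.length); //normalized term frequency
        }
    }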
29 | 30 | public String getArticleId() { 31 | return articleId; 32 | } 33 | 34 | public void setArticleId(String articleId) { 35 | this.articleId = articleId; 36 | } 37 | 38 | public long getTimestamp() { 39 | return timestamp; 40 | } 41 | 42 | public void setTimestamp(long timestamp) { 43 | this.timestamp = timestamp; 44 | } 45 | 46 | public String getContent() { 47 | return this.content; 48 | } 49 | 50 | public void setContent(String content) { 51 | this.content = content; 52 | } 53 | public Map getTfMap() { 54 | return tfMap; 55 | } 56 | 57 | public void setTfMap(Map tfMap) { 58 | this.tfMap = tfMap; 59 | } 60 | 61 | public PriorityQueue> getTfidf() { 62 | return tfidf; 63 | } 64 | 65 | public void setTfidf(PriorityQueue> tfidf) { 66 | this.tfidf = tfidf; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/main/java/com/ams/recommend/common/pojo/User.java: -------------------------------------------------------------------------------- 1 | package com.ams.recommend.common.pojo; 2 | 3 | public class User { 4 | private String userId; 5 | private int sex; 6 | private int age; 7 | private String job; 8 | private String education; 9 | 10 | public User(){} 11 | 12 | public User(String userId, int sex, int age, String job, String education) { 13 | this.userId = userId; 14 | this.sex = sex; 15 | this.age = age; 16 | this.job = job; 17 | this.education = education; 18 | } 19 | 20 | public String getUserId() { 21 | return userId; 22 | } 23 | 24 | public void setUserId(String userId) { 25 | this.userId = userId; 26 | } 27 | 28 | public int getSex() { 29 | return sex; 30 | } 31 | 32 | public void setSex(int sex) { 33 | this.sex = sex; 34 | } 35 | 36 | public int getAge() { 37 | return age; 38 | } 39 | 40 | public void setAge(int age) { 41 | this.age = age; 42 | } 43 | 44 | public String getJob() { 45 | return job; 46 | } 47 | 48 | public void setJob(String job) { 49 | this.job = job; 50 | } 51 | 52 | public String getEducation() { 53 | return education; 54 | } 55 | 56 | public void setEducation(String education) { 57 | this.education = education; 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/com/ams/recommend/common/pojo/WindowedArticle.java: -------------------------------------------------------------------------------- 1 | package com.ams.recommend.common.pojo; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | public class WindowedArticle { 7 | 8 | private String articleId; 9 | private long timestamp; 10 | private long windowEnd; 11 | private Map tfMap; 12 | 13 | public WindowedArticle() { 14 | tfMap = new HashMap<>(); 15 | } 16 | 17 | public WindowedArticle(String articleId, long timestamp, long windowEnd) { 18 | this.articleId = articleId; 19 | this.timestamp = timestamp; 20 | this.windowEnd = windowEnd; 21 | tfMap = new HashMap<>(); 22 | } 23 | 24 | public String getArticleId() { 25 | return articleId; 26 | } 27 | 28 | public void setArticleId(String articleId) { 29 | this.articleId = articleId; 30 | } 31 | 32 | public long getTimestamp() { 33 | return timestamp; 34 | } 35 | 36 | public void setTimestamp(long timestamp) { 37 | this.timestamp = timestamp; 38 | } 39 | 40 | public long getWindowEnd() { 41 | return windowEnd; 42 | } 43 | 44 | public void setWindowEnd(long windowEnd) { 45 | this.windowEnd = windowEnd; 46 | } 47 | 48 | public Map getTfMap() { 49 | return tfMap; 50 | } 51 | 52 | public void setTfMap(Map tfMap) { 53 | this.tfMap = tfMap; 54 | } 
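    /**
     * A minimal sketch of the classic TF-IDF weight this pipeline is built
     * around (see doc/doc.md). The +1 smoothing in the denominator is an
     * assumption, not taken from TFIDFMapFunction.
     */
    public static double tfIdf(double termFrequency, long totalDocs, long docsContainingTerm) {
        double idf = Math.log((double) totalDocs / (1.0 + docsContainingTerm)); //inverse document frequency
        return termFrequency * idf;
    }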
55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/com/ams/recommend/nearline/task/HistoryTask.java: -------------------------------------------------------------------------------- 1 | package com.ams.recommend.nearline.task; 2 | 3 | import com.ams.recommend.client.HBaseClient; 4 | import com.ams.recommend.common.pojo.Log; 5 | import com.ams.recommend.util.Constants; 6 | import com.ams.recommend.util.LogUtil; 7 | import com.ams.recommend.util.Property; 8 | import org.apache.flink.api.common.functions.FlatMapFunction; 9 | import org.apache.flink.api.common.serialization.SimpleStringSchema; 10 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 11 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer; 12 | 13 | public class HistoryTask { 14 | 15 | public static void main(String[] args) throws Exception { 16 | final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 17 | 18 | FlinkKafkaConsumer consumer = new FlinkKafkaConsumer( 19 | "log", 20 | new SimpleStringSchema(), 21 | Property.getKafkaProperties("history") 22 | ); 23 | 24 | env.addSource(consumer) 25 | .flatMap((FlatMapFunction) (value, out) -> { 26 | Log log = LogUtil.toLogEntry(value); 27 | if(null != log) { 28 | //文章相对应的用户操作更新1次记录 29 | HBaseClient.addOrUpdateColumn( 30 | Constants.ARTICLE_HIS_TABLE, 31 | log.getArticleId(), 32 | "p", 33 | log.getUserId()); 34 | //用户对游览的文章的操作次数加1 35 | HBaseClient.addOrUpdateColumn( 36 | Constants.USER_HIS_TABLE, 37 | log.getUserId(), 38 | "p", 39 | log.getArticleId()); 40 | } 41 | }); 42 | 43 | env.execute("History Task"); 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/com/ams/recommend/nearline/task/HotArticleTask.java: -------------------------------------------------------------------------------- 1 | package com.ams.recommend.nearline.task; 2 | 3 | import com.ams.recommend.common.pojo.HotArticle; 4 | import com.ams.recommend.common.pojo.Log; 5 | import com.ams.recommend.util.LogUtil; 6 | import com.ams.recommend.util.Property; 7 | import org.apache.flink.api.common.functions.AggregateFunction; 8 | import org.apache.flink.api.common.functions.FlatMapFunction; 9 | import org.apache.flink.api.common.serialization.SimpleStringSchema; 10 | import org.apache.flink.api.common.state.ListState; 11 | import org.apache.flink.api.common.state.ListStateDescriptor; 12 | import org.apache.flink.configuration.Configuration; 13 | import org.apache.flink.streaming.api.TimeCharacteristic; 14 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 15 | import org.apache.flink.streaming.api.functions.KeyedProcessFunction; 16 | import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor; 17 | import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor; 18 | import org.apache.flink.streaming.api.functions.windowing.WindowFunction; 19 | import org.apache.flink.streaming.api.windowing.time.Time; 20 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow; 21 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer; 22 | import org.apache.flink.streaming.connectors.redis.RedisSink; 23 | import org.apache.flink.streaming.connectors.redis.common.config.FlinkJedisPoolConfig; 24 | import org.apache.flink.streaming.connectors.redis.common.mapper.RedisCommand; 25 | import 
org.apache.flink.streaming.connectors.redis.common.mapper.RedisCommandDescription; 26 | import org.apache.flink.streaming.connectors.redis.common.mapper.RedisMapper; 27 | import org.apache.flink.util.Collector; 28 | import org.slf4j.Logger; 29 | import org.slf4j.LoggerFactory; 30 | 31 | import java.util.Comparator; 32 | import java.util.LinkedList; 33 | import java.util.List; 34 | import java.util.PriorityQueue; 35 | 36 | public class HotArticleTask { 37 | 38 | private final static Logger logger = LoggerFactory.getLogger(HotArticleTask.class); 39 | private final static int HOTSIZE = 20; //热榜的文章数 40 | 41 | public static void main(String[] args) throws Exception { 42 | final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 43 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); 44 | env.enableCheckpointing(5000L); 45 | 46 | FlinkKafkaConsumer consumer = new FlinkKafkaConsumer( 47 | "log", 48 | new SimpleStringSchema(), 49 | Property.getKafkaProperties("hot") 50 | ); 51 | 52 | FlinkJedisPoolConfig redisConf = new FlinkJedisPoolConfig.Builder() 53 | .setHost(Property.getStrValue("redis.host")) 54 | .setPort(Property.getIntValue("redis.port")) 55 | .setDatabase(Property.getIntValue("redis.db")) 56 | .build(); 57 | 58 | env.addSource(consumer) 59 | .flatMap(new LogFlatMapFunction()) 60 | .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor(Time.minutes(5)) { 61 | @Override 62 | public long extractTimestamp(Log log) { 63 | logger.info("watermark : " + log.getTime() * 1000); 64 | return log.getTime() * 1000; //转化为毫秒 65 | } 66 | }).keyBy(log -> log.getArticleId()) 67 | .timeWindow(Time.seconds(60), Time.seconds(20)) 68 | .aggregate(new CountAgg(), new WindowResultFunction()) 69 | .keyBy(hot -> hot.getWindowEnd()) 70 | .process(new HotArticleProcessFunction(HOTSIZE)) 71 | .flatMap(new TopFlatMapFunction()) 72 | .addSink(new RedisSink<>(redisConf, new HotArticleSink())); 73 | 74 | env.execute("Hot article task"); 75 | } 76 | 77 | 78 | private static class LogFlatMapFunction implements FlatMapFunction { 79 | @Override 80 | public void flatMap(String value, Collector out) throws Exception { 81 | Log log = LogUtil.toLogEntry(value); 82 | if("1".equals(log.getAction())) { 83 | out.collect(log); 84 | } 85 | } 86 | } 87 | 88 | private static class CountAgg implements AggregateFunction { 89 | @Override 90 | public Long createAccumulator() { 91 | return 0L; 92 | } 93 | 94 | @Override 95 | public Long add(Log value, Long accumulator) { 96 | return accumulator + 1; 97 | } 98 | 99 | @Override 100 | public Long getResult(Long accumulator) { 101 | return accumulator; 102 | } 103 | 104 | @Override 105 | public Long merge(Long a, Long b) { 106 | return a + b; 107 | } 108 | } 109 | 110 | /** 111 | * 将每个key每个窗口聚合后的结果带上热门文章对象进行输出 112 | */ 113 | private static class WindowResultFunction implements WindowFunction { 114 | @Override 115 | public void apply(String articleId, TimeWindow window, Iterable input, Collector out) throws Exception { 116 | Long pvCount = input.iterator().next(); 117 | HotArticle article = new HotArticle(articleId, pvCount, window.getEnd()); 118 | out.collect(article); 119 | 120 | logger.info(article.toString()); 121 | } 122 | } 123 | 124 | private static class HotArticleProcessFunction extends KeyedProcessFunction> { 125 | 126 | private int hotSize; 127 | private ListState hotArticleListState; 128 | 129 | public HotArticleProcessFunction(int hotSize) { 130 | if(hotSize < 1) throw new IllegalArgumentException("Article size 
should not be less than 1!");
            this.hotSize = hotSize;
        }

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
            ListStateDescriptor<HotArticle> hotArticleListStateDescriptor =
                    new ListStateDescriptor<>("hotArticle-state", HotArticle.class);
            hotArticleListState = getRuntimeContext().getListState(hotArticleListStateDescriptor);
        }

        @Override
        public void processElement(HotArticle hotArticle, Context ctx, Collector<List<HotArticle>> out) throws Exception {
            hotArticleListState.add(hotArticle);
            //register an event-time timer for just after this window closes
            ctx.timerService().registerEventTimeTimer(hotArticle.getWindowEnd() + 1);
        }

        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<List<HotArticle>> out) throws Exception {
            PriorityQueue<HotArticle> hotArticles = new PriorityQueue<>(hotSize, new Comparator<HotArticle>() {
                @Override
                public int compare(HotArticle o1, HotArticle o2) {
                    if(o1.getPvCount() > o2.getPvCount()) return -1;
                    else if(o1.getPvCount() < o2.getPvCount()) return 1;
                    else return 0;
                }
            });

            for(HotArticle hotArticle : hotArticleListState.get()) {
                hotArticles.add(hotArticle);
            }
            //clear the state for the next window
            hotArticleListState.clear();

            //poll the queue so the emitted list really is in descending PV order
            //(iterating a PriorityQueue directly does not yield sorted order),
            //and keep only the hotSize hottest articles
            List<HotArticle> topArticles = new LinkedList<>();
            while(!hotArticles.isEmpty() && topArticles.size() < hotSize) {
                topArticles.add(hotArticles.poll());
            }
            out.collect(topArticles);
        }
    }

    private static class HotArticleSink implements RedisMapper<HotArticle> {
        @Override
        public RedisCommandDescription getCommandDescription() {
            return new RedisCommandDescription(RedisCommand.SET, null);
        }

        @Override
        public String getKeyFromData(HotArticle hotArticle) {
            logger.info("Redis Key : " + hotArticle.getRank());
            return String.valueOf(hotArticle.getRank());
        }

        @Override
        public String getValueFromData(HotArticle hotArticle) {
            logger.info("Redis Value : " + hotArticle.getArticleId());
            return hotArticle.getArticleId();
        }
    }

    private static class TopFlatMapFunction implements FlatMapFunction<List<HotArticle>, HotArticle> {
        @Override
        public void flatMap(List<HotArticle> topArticles, Collector<HotArticle> out) throws Exception {
            StringBuilder builder = new StringBuilder();

            builder.append("\n========== Hot Articles ==========\n");
            int rank = 1;
            for(HotArticle topArticle : topArticles) {
                topArticle.setRank(rank++);
                builder.append("Article ID: " + topArticle.getArticleId())
                        .append(", Rank: " + topArticle.getRank())
                        .append(", PV: " + topArticle.getPvCount() + "\n");

                out.collect(topArticle);
            }

            logger.info(builder.toString());
        }
    }
}
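// Worked example of the windowing above: timeWindow(60s, 20s) assigns an
// event stamped 12:00:25 to the three sliding windows ending at 12:00:40,
// 12:01:00 and 12:01:20, so every article's PV count is refreshed every 20
// seconds over the trailing minute; onTimer then fires for a window once the
// watermark (max event time minus the 5-minute out-of-orderness bound)
// passes windowEnd + 1.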
--------------------------------------------------------------------------------
/src/main/java/com/ams/recommend/nearline/task/LogTask.java:
--------------------------------------------------------------------------------
package com.ams.recommend.nearline.task;

import com.ams.recommend.client.HBaseClient;
import com.ams.recommend.common.pojo.Log;
import com.ams.recommend.util.LogUtil;
import com.ams.recommend.util.Property;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.Collector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Properties;

public class LogTask {

    private static final Logger logger = LoggerFactory.getLogger(LogTask.class);

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(600000L); //checkpoint automatically every 10 minutes
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        Properties kafkaProp = Property.getKafkaProperties("log"); //consumer group id
        FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>("log",
                new SimpleStringSchema(),
                kafkaProp
        );

        DataStream<Log> logs = env
                .addSource(consumer)
                .flatMap(new LogFlatMapFunction())
                //watermark: tolerate up to 15 minutes of out-of-order events
                .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<Log>(Time.minutes(15)) {
                    @Override
                    public long extractTimestamp(Log element) {
                        return element.getTime() * 1000; //log time is in epoch seconds (see doc/kafka/generator.sh)
                    }
                });

        env.execute("Collect log task");
    }

    private static class LogFlatMapFunction implements FlatMapFunction<String, Log> {

        @Override
        public void flatMap(String value, Collector<Log> out) throws Exception {
            //write the raw log from Kafka straight into HBase
            Log log = LogUtil.toLogEntry(value); //parse the record into a Log entity

            if(log != null) {
                final String rowKey = LogUtil.getLogRowKey(log.getTime());
                String tableName = Property.getStrValue("table.log.name");
                //create the table if it does not exist yet
                HBaseClient.createTableIfNotExist(tableName, "l");
                //user id
                HBaseClient.put(tableName, rowKey, "l", "uid", log.getUserId());
                //article id
                HBaseClient.put(tableName, rowKey, "l", "aid", log.getArticleId());
                //timestamp of the action
                HBaseClient.put(tableName, rowKey, "l", "ts", String.valueOf(log.getTime()));
                //the action itself
                HBaseClient.put(tableName, rowKey, "l", "act", log.getAction());

                logger.info(log.toString());
                out.collect(log);
            }
        }
    }

}
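// LogUtil.toLogEntry(...) itself is outside this section; judging by
// doc/kafka/generator.sh the raw record is a CSV line of the form
// "userId,articleId,timestamp,action" with the timestamp in epoch seconds,
// so a minimal parser (a sketch, not necessarily the actual implementation)
// would be:
//
//   String[] f = value.split(",");
//   if (f.length != 4) return null;
//   Log log = new Log();
//   log.setUserId(f[0]);
//   log.setArticleId(f[1]);
//   log.setTime(Long.parseLong(f[2]));
//   log.setAction(f[3]);
//   return log;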
--------------------------------------------------------------------------------
/src/main/java/com/ams/recommend/nearline/task/PortraitTask.java:
--------------------------------------------------------------------------------
package com.ams.recommend.nearline.task;

import com.ams.recommend.client.HBaseClient;
import com.ams.recommend.client.MySQLClient;
import com.ams.recommend.common.pojo.Log;
import com.ams.recommend.common.pojo.User;
import com.ams.recommend.util.Constants;
import com.ams.recommend.util.LogUtil;
import com.ams.recommend.util.Property;
import com.ams.recommend.util.WordTokenizerUtil;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;

import java.util.Map;

/**
 * Portrait task
 */
public class PortraitTask {

    private final static Long TTL = 180L; //reading time (seconds) above which a user is judged to like an article; 3 minutes by default

    private static final OutputTag<Log> outputTag = new OutputTag<Log>("side-output"){};

    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        DataStream<String> logSource = env.addSource(new FlinkKafkaConsumer<>(
                "log",
                new SimpleStringSchema(),
                Property.getKafkaProperties("portrait")
        ));

        SingleOutputStreamOperator<Log> logs = logSource
                .process(new LogProcessFunction())
                .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<Log>(Time.seconds(10)) {
                    @Override
                    public long extractTimestamp(Log element) {
                        return element.getTime() * 1000; //log time is in epoch seconds (see doc/kafka/generator.sh)
                    }
                });

        logs.keyBy("articleId")
                .addSink(new ArticlePortraitSink());

        logs.getSideOutput(outputTag)
                .keyBy("userId") //must match the Log.userId field name
                .addSink(new UserPortraitSink());

        env.execute("Portrait Task");
    }

    private static class LogProcessFunction extends ProcessFunction<String, Log> {
        @Override
        public void processElement(String log, Context ctx, Collector<Log> out) throws Exception {
            Log logEntry = LogUtil.toLogEntry(log);
            if(logEntry == null) return; //skip malformed records
            out.collect(logEntry);
            //also emit to the side output for the user-portrait stream
            ctx.output(outputTag, logEntry);
        }
    }

    /**
     * Collect the article portrait: attributes of the users who like reading
     * this article, e.g. sex, age, job, education.
     */
    private static class ArticlePortraitSink implements SinkFunction<Log> {
        @Override
        public void invoke(Log log, Context context) throws Exception {
            User user = MySQLClient.getUserById(log.getUserId());
            if(user == null) return; //unknown user, nothing to record

            String articleId = log.getArticleId();
            String userId = log.getUserId();
            //sex
            HBaseClient.put(Constants.ARTICLE_PORTRAIT_TABLE, articleId, "sex", userId, String.valueOf(user.getSex()));
            //age bucket
            HBaseClient.put(Constants.ARTICLE_PORTRAIT_TABLE, articleId, "age", userId, Constants.rangeAge(user.getAge()));
            //job
            HBaseClient.put(Constants.ARTICLE_PORTRAIT_TABLE, articleId, "job", userId, user.getJob());
            //education
            HBaseClient.put(Constants.ARTICLE_PORTRAIT_TABLE, articleId, "edu", userId, user.getEducation());
        }
    }
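    // The puts above give the article-portrait table this layout (one row per
    // article, one qualifier per reading user):
    //
    //   rowkey = articleId
    //     cf "sex" : { userId -> user's sex }
    //     cf "age" : { userId -> age bucket from Constants.rangeAge(...) }
    //     cf "job" : { userId -> user's job }
    //     cf "edu" : { userId -> user's education }
    //
    // so HBaseClient.getColumnSize(table, articleId, "sex") would give the
    // number of distinct readers recorded for an article.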
/src/main/java/com/ams/recommend/nearline/task/UserInterestTask.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.nearline.task;
2 |
3 | import com.ams.recommend.client.HBaseClient;
4 | import com.ams.recommend.common.pojo.Log;
5 | import com.ams.recommend.util.Constants;
6 | import com.ams.recommend.util.LogUtil;
7 | import com.ams.recommend.util.Property;
8 | import org.apache.flink.api.common.functions.MapFunction;
9 | import org.apache.flink.api.common.serialization.SimpleStringSchema;
10 | import org.apache.flink.api.common.state.StateTtlConfig;
11 | import org.apache.flink.api.common.state.ValueState;
12 | import org.apache.flink.api.common.state.ValueStateDescriptor;
13 | import org.apache.flink.configuration.Configuration;
14 | import org.apache.flink.streaming.api.TimeCharacteristic;
15 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
16 | import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
17 | import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
18 | import org.apache.flink.streaming.api.windowing.time.Time;
19 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
20 |
21 | public class UserInterestTask {
22 |
23 |     private final static Long LIKE_TIME = 180_000L; // dwell time (ms) above which a user is judged to like an article: 3 minutes
24 |
25 |     public static void main(String[] args) throws Exception {
26 |
27 |         final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
28 |         env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
29 |         env.enableCheckpointing(5000L);
30 |
31 |         FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>(
32 |                 "log",
33 |                 new SimpleStringSchema(),
34 |                 Property.getKafkaProperties("user-interest")
35 |         );
36 |
37 |         env.addSource(consumer)
38 |                 .map(new LogMapFunction())
39 |                 .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<Log>(Time.seconds(1)) {
40 |                     @Override
41 |                     public long extractTimestamp(Log element) {
42 |                         return element.getTime();
43 |                     }
44 |                 })
45 |                 .keyBy(user -> user.getUserId())
46 |                 .addSink(new UserInterestSinkFunction());
47 |
48 |         env.execute("User Interest Task");
49 |     }
50 |
51 |     private static class LogMapFunction implements MapFunction<String, Log> {
52 |         @Override
53 |         public Log map(String value) throws Exception {
54 |             return LogUtil.toLogEntry(value); // may be null for malformed lines
55 |         }
56 |     }
57 |
58 |     private static class UserInterestSinkFunction extends RichSinkFunction<Log> {
59 |
60 |         private ValueState<Long> lastTimeState;
61 |
62 |         @Override
63 |         public void open(Configuration parameters) throws Exception {
64 |             super.open(parameters);
65 |             // Expire the state after 3 hours: if a user keeps the page open that long,
66 |             // the session is assumed idle and no longer counts as a valid read.
67 |             StateTtlConfig ttlConfig = StateTtlConfig
68 |                     .newBuilder(org.apache.flink.api.common.time.Time.hours(3))
69 |                     .build();
70 |
71 |             ValueStateDescriptor<Long> desc = new ValueStateDescriptor<>("Open Page time", Long.class);
72 |             desc.enableTimeToLive(ttlConfig);
73 |             lastTimeState = getRuntimeContext().getState(desc);
74 |         }
75 |
76 |         @Override
77 |         public void invoke(Log log, Context context) throws Exception {
78 |             // actions: 1 = open, 2 = like, 3 = favorite, 4 = close
79 |             String op = log.getAction();
80 |             Long curTime = log.getTime();
81 |             Long lastTime = lastTimeState.value();
82 |
83 |             if("1".equals(op)) {
84 |                 lastTimeState.update(curTime);
85 |             } else if("4".equals(op)) {
86 |                 // lastTime may be null if the open event expired or never arrived
87 |                 if(lastTime != null && curTime - lastTime > LIKE_TIME) { // a long dwell time signals interest
88 |                     HBaseClient.addOrUpdateColumn(Constants.USER_PORTRAIT_TABLE, log.getUserId(), "i", log.getArticleId());
89 |                 }
90 |                 lastTimeState.clear();
91 |             } else if("2".equals(op) || "3".equals(op)) { // likes and favorites signal interest directly
92 |                 HBaseClient.addOrUpdateColumn(Constants.USER_PORTRAIT_TABLE, log.getUserId(), "i", log.getArticleId());
93 |                 lastTimeState.clear();
94 |             }
95 |         }
96 |
97 |     }
98 |
99 | }
100 |
--------------------------------------------------------------------------------
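Note: a worked example of the dwell-time rule above, with hypothetical timestamps:

    long openTs  = 1_700_000_000_000L;              // action "1": page opened
    long closeTs = openTs + 200_000L;               // action "4": page closed 200 s later
    boolean liked = (closeTs - openTs) > 180_000L;  // true: 200 s exceeds the 3-minute threshold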
/src/main/java/com/ams/recommend/nearline/task/tfidf/SpiderTask.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.nearline.task.tfidf;
2 |
3 | import com.ams.recommend.client.MySQLClient;
4 | import com.ams.recommend.common.pojo.SpiderArticle;
5 | import com.ams.recommend.util.Property;
6 | import org.apache.flink.api.common.functions.MapFunction;
7 | import org.apache.flink.api.common.serialization.SimpleStringSchema;
8 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
9 | import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
10 | import org.apache.flink.streaming.api.windowing.time.Time;
11 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
12 | import org.slf4j.Logger;
13 | import org.slf4j.LoggerFactory;
14 |
15 | public class SpiderTask {
16 |
17 |     private final static Logger logger = LoggerFactory.getLogger(SpiderTask.class);
18 |     private static final Integer KEYWORD_SIZE = 10; // number of keywords to keep per crawled article
19 |
20 |     public static void main(String[] args) throws Exception {
21 |         final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
22 |         env.enableCheckpointing(5000);
23 |
24 |         FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>(
25 |                 "spider",
26 |                 new SimpleStringSchema(),
27 |                 Property.getKafkaProperties("tf-idf")
28 |         );
29 |
30 |         env.addSource(consumer)
31 |                 .map(new SpiderMapFunction())
32 |                 .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<SpiderArticle>(Time.minutes(10)) {
33 |                     @Override
34 |                     public long extractTimestamp(SpiderArticle element) {
35 |                         logger.info("spider article watermark : " + element.getTimestamp());
36 |                         return element.getTimestamp();
37 |                     }
38 |                 })
39 |                 .map(new TFIDFMapFunction(KEYWORD_SIZE))
40 |                 .addSink(new TFIDFSink());
41 |
42 |         env.execute("Spider for tf-idf task");
43 |     }
44 |
45 |     private static class SpiderMapFunction implements MapFunction<String, SpiderArticle> {
46 |         @Override
47 |         public SpiderArticle map(String value) throws Exception {
48 |             if(value == null) throw new IllegalArgumentException("Spider message is EMPTY!");
49 |
50 |             SpiderArticle article = new SpiderArticle();
51 |             String[] vs = value.split(","); // message format: articleId,timestamp
52 |             String articleId = vs[0];
53 |             long timestamp = Long.parseLong(vs[1]);
54 |
55 |             article.setArticleId(articleId);
56 |             article.setTimestamp(timestamp);
57 |
58 |             String content = MySQLClient.getContentById(articleId);
59 |             article.setContent(content);
60 |
61 |             return article;
62 |         }
63 |     }
64 |
65 | }
66 |
--------------------------------------------------------------------------------
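Note: the extractor above tolerates out-of-order spider events; the watermark trails the highest timestamp seen by the configured bound. A quick sketch of the arithmetic:

    long maxTimestampSeen = 1_700_000_600_000L;      // latest event time observed on the stream
    long bound = Time.minutes(10).toMilliseconds();  // the out-of-orderness bound
    long watermark = maxTimestampSeen - bound;       // events stamped below this are considered late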
/src/main/java/com/ams/recommend/nearline/task/tfidf/TFIDFMapFunction.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.nearline.task.tfidf;
2 |
3 | import com.ams.recommend.client.HBaseClient;
4 | import com.ams.recommend.common.pojo.SpiderArticle;
5 | import com.ams.recommend.util.Constants;
6 | import com.ams.recommend.util.Property;
7 | import com.ams.recommend.util.WordTokenizerUtil;
8 | import org.apache.flink.api.common.functions.MapFunction;
9 | import org.apache.flink.api.java.tuple.Tuple2;
10 |
11 | import java.util.Comparator;
12 | import java.util.Map;
13 | import java.util.PriorityQueue;
14 |
15 | public class TFIDFMapFunction implements MapFunction<SpiderArticle, SpiderArticle> {
16 |
17 |     private final String tableName = Property.getStrValue("table.word.name");
18 |     private int keywordSize;
19 |     private long totalArticleSize = 1L;
20 |
21 |     public TFIDFMapFunction(int keywordSize) {
22 |         if(keywordSize < 1) throw new IllegalArgumentException("keyword count must not be less than 1.");
23 |         this.keywordSize = keywordSize;
24 |
25 |         // read the current total number of articles
26 |         String sizeStr = HBaseClient.get(tableName, "articleSize", "c", "count");
27 |         if(sizeStr != null) totalArticleSize = Long.valueOf(sizeStr);
28 |     }
29 |
30 |     @Override
31 |     public SpiderArticle map(SpiderArticle article) throws Exception {
32 |         // compute the TF of every word in the article
33 |         Map<String, Double> tf = WordTokenizerUtil.tf(article.getContent());
34 |         article.setTfMap(tf);
35 |
36 |         // min-heap ordered by score; Tuple2 is not Comparable, so a comparator is required
37 |         PriorityQueue<Tuple2<String, Double>> tfidfQueue =
38 |                 new PriorityQueue<>(keywordSize, Comparator.comparingDouble((Tuple2<String, Double> t) -> t.f1));
39 |
40 |         // compute TF-IDF
41 |         for(String word : tf.keySet()) {
42 |             // number of articles containing this word
43 |             int size = HBaseClient.getColumnSize(tableName, word, "a");
44 |             if(size == 0) size = 1;
45 |             Double TF = tf.get(word);
46 |             Double IDF = Math.log10((double) totalArticleSize / size); // cast keeps the division floating-point
47 |             Double tfidf = TF * IDF;
48 |             tfidfQueue.add(new Tuple2<>(word, tfidf));
49 |             if(tfidfQueue.size() > keywordSize) tfidfQueue.poll(); // drop the lowest score, keep the top keywordSize
50 |             // update the article column of this word (rowKey)
51 |             HBaseClient.addOrUpdateColumn(tableName, word, "a", article.getArticleId());
52 |             // store this word's tf and tfidf for the article
53 |             HBaseClient.put(Constants.ARTICLE_TFIDF_TABLE, article.getArticleId(), "tf", word, String.valueOf(TF));
54 |             HBaseClient.put(Constants.ARTICLE_TFIDF_TABLE, article.getArticleId(), "ti", word, String.valueOf(tfidf));
55 |         }
56 |         article.setTfidf(tfidfQueue);
57 |         // increment the total article count
58 |         HBaseClient.addOrUpdateColumn(tableName, "articleSize", "c", "count");
59 |
60 |         return article;
61 |     }
62 |
63 | }
64 |
--------------------------------------------------------------------------------
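Note: a worked TF-IDF computation with hypothetical numbers, following map() above (without the (double) cast, 1000 / 600 would truncate to 1 and distort the IDF):

    double tf  = 0.05;                         // frequency of the word in this article
    long   n   = 1000L;                        // totalArticleSize
    int    df  = 10;                           // articles that contain the word
    double idf = Math.log10((double) n / df);  // log10(100.0) = 2.0
    double tfidf = tf * idf;                   // 0.05 * 2.0 = 0.1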
/src/main/java/com/ams/recommend/nearline/task/tfidf/TFIDFSink.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.nearline.task.tfidf;
2 |
3 | import com.ams.recommend.client.MySQLClient;
4 | import com.ams.recommend.common.pojo.SpiderArticle;
5 | import org.apache.flink.api.java.tuple.Tuple2;
6 | import org.apache.flink.streaming.api.functions.sink.SinkFunction;
7 |
8 | import java.util.PriorityQueue;
9 |
10 | public class TFIDFSink implements SinkFunction<SpiderArticle> {
11 |     @Override
12 |     public void invoke(SpiderArticle article, Context context) throws Exception {
13 |         PriorityQueue<Tuple2<String, Double>> topKeyword = article.getTfidf();
14 |         StringBuilder stringBuilder = new StringBuilder();
15 |         while(!topKeyword.isEmpty()) {
16 |             Tuple2<String, Double> tiKV = topKeyword.poll();
17 |             stringBuilder.append(tiKV.f0); // append the keyword; the score is not persisted
18 |             if(!topKeyword.isEmpty()) stringBuilder.append(" ");
19 |         }
20 |         MySQLClient.putKeywordById(article.getArticleId(), stringBuilder.toString());
21 |     }
22 | }
23 |
--------------------------------------------------------------------------------
/src/main/java/com/ams/recommend/offline/ArticleCoeff.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.offline;
2 |
3 | import com.ams.recommend.client.HBaseClient;
4 | import com.ams.recommend.common.pojo.ArticlePortrait;
5 | import com.ams.recommend.util.Constants;
6 |
7 | import java.io.IOException;
8 | import java.util.List;
9 |
10 | /**
11 |  * Article relevance based on article tags.
12 |  * 1. Cosine-style similarity from article tags.
13 |  * 2. Similarity from article content (keywords).
14 |  * @author jackybai
15 |  */
16 | public class ArticleCoeff {
17 |     /**
18 |      * Scores one article against the other candidate articles and writes the results to HBase.
19 |      * @param id the article id
20 |      * @param others ids of the other articles
21 |      */
22 |     public void getArticleCoeff(String id, List<String> others) throws Exception {
23 |         ArticlePortrait article = singleArticle(id);
24 |         for (String articleId : others) {
25 |             if (id.equals(articleId)) continue;
26 |             ArticlePortrait entity = singleArticle(articleId);
27 |             Double score = getScore(article, entity);
28 |             HBaseClient.put(Constants.ARTICLE_TAG_TABLE, id, "p", articleId, score.toString());
29 |         }
30 |     }
31 |
32 |     /**
33 |      * Loads all tag data of one article.
34 |      * @param articleId the article id
35 |      * @return the article tag entity
36 |      */
37 |     private ArticlePortrait singleArticle(String articleId) {
38 |         ArticlePortrait entity = new ArticlePortrait();
39 |         try {
40 |             String woman = HBaseClient.get(Constants.ARTICLE_PORTRAIT_TABLE, articleId, "sex", Constants.SEX_WOMAN);
41 |             String man = HBaseClient.get(Constants.ARTICLE_PORTRAIT_TABLE, articleId, "sex", Constants.SEX_MAN);
42 |             String age_10 = HBaseClient.get(Constants.ARTICLE_PORTRAIT_TABLE, articleId, "age", Constants.AGE_10);
43 |             String age_20 = HBaseClient.get(Constants.ARTICLE_PORTRAIT_TABLE, articleId, "age", Constants.AGE_20);
44 |             String age_30 = HBaseClient.get(Constants.ARTICLE_PORTRAIT_TABLE, articleId, "age", Constants.AGE_30);
45 |             String age_40 = HBaseClient.get(Constants.ARTICLE_PORTRAIT_TABLE, articleId, "age", Constants.AGE_40);
46 |             String age_50 = HBaseClient.get(Constants.ARTICLE_PORTRAIT_TABLE, articleId, "age", Constants.AGE_50);
47 |             String age_60 = HBaseClient.get(Constants.ARTICLE_PORTRAIT_TABLE, articleId, "age", Constants.AGE_60);
48 |             entity.setMan(Integer.valueOf(man));
49 |             entity.setWoman(Integer.valueOf(woman));
50 |             entity.setAge_10(Integer.valueOf(age_10));
51 |             entity.setAge_20(Integer.valueOf(age_20));
52 |             entity.setAge_30(Integer.valueOf(age_30));
53 |             entity.setAge_40(Integer.valueOf(age_40));
54 |             entity.setAge_50(Integer.valueOf(age_50));
55 |             entity.setAge_60(Integer.valueOf(age_60));
56 |         } catch (Exception e) {
57 |             System.err.println("articleId: " + articleId);
58 |             e.printStackTrace();
59 |         }
60 |         return entity;
61 |     }
62 |
63 |     /**
64 |      * Computes the tag-based relevance of two articles.
65 |      * @param article the article
66 |      * @param target the article to compare against
67 |      * @return the similarity score
68 |      */
69 |     private double getScore(ArticlePortrait article, ArticlePortrait target) {
70 |         double sqrt = Math.sqrt(article.getTotal() + target.getTotal());
71 |         if (sqrt == 0) {
72 |             return 0.0;
73 |         }
74 |         int total = article.getMan() * target.getMan() + article.getWoman() * target.getWoman()
75 |                 + article.getAge_10() * target.getAge_10() + article.getAge_20() * target.getAge_20()
76 |                 + article.getAge_30() * target.getAge_30() + article.getAge_40() * target.getAge_40()
77 |                 + article.getAge_50() * target.getAge_50() + article.getAge_60() * target.getAge_60();
78 |         return Math.sqrt(total) / sqrt;
79 |     }
80 |
81 |     public void calcuSimilar(String id, List<String> others) {
82 |         // TODO: strategy 3, content-based similarity, is not implemented yet
83 |     }
84 | }
85 |
--------------------------------------------------------------------------------
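Note: a worked example of getScore with two hypothetical portraits, assuming ArticlePortrait.getTotal() sums all of a portrait's tag counts:

    // A: man=3, woman=1, all age buckets 0 (total 4)
    // B: man=2, woman=2, all age buckets 0 (total 4)
    int dot = 3 * 2 + 1 * 2;                           // 8
    double score = Math.sqrt(dot) / Math.sqrt(4 + 4);  // sqrt(8) / sqrt(8) = 1.0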
/src/main/java/com/ams/recommend/offline/ItemCfCoeff.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.offline;
2 |
3 | import com.ams.recommend.client.HBaseClient;
4 | import com.ams.recommend.util.Constants;
5 |
6 | import java.io.IOException;
7 | import java.util.List;
8 | import java.util.Map;
9 |
10 | /**
11 |  * Article relevance based on item-to-item collaborative filtering:
12 |  *
13 |  *              |N(i) ∩ N(j)|
14 |  *     w = ---------------------
15 |  *         sqrt(|N(i)| * |N(j)|)
16 |  *
17 |  * where N(x) is the set of users who read article x.
18 |  * @author jackybai
19 |  */
20 | public class ItemCfCoeff {
21 |
22 |     /**
23 |      * Scores one article against the other candidate articles and writes the results to HBase.
24 |      *
25 |      * @param id the article id
26 |      * @param others ids of the other articles
27 |      */
28 |     public void getSingleItemCfCoeff(String id, List<String> others) throws Exception {
29 |         for (String other : others) {
30 |             if(id.equals(other)) continue;
31 |             Double score = twoItemCfCoeff(id, other);
32 |             HBaseClient.put(Constants.ARTICLE_CF_TABLE, id, "p", other, score.toString());
33 |         }
34 |     }
35 |
36 |     /**
37 |      * Computes the co-occurrence score of two articles.
38 |      */
39 |     private double twoItemCfCoeff(String id, String other) throws IOException {
40 |         Map<String, String> p1 = HBaseClient.getRow(Constants.ARTICLE_HIS_TABLE, id);
41 |         Map<String, String> p2 = HBaseClient.getRow(Constants.ARTICLE_HIS_TABLE, other);
42 |
43 |         int n = p1.size();
44 |         int m = p2.size();
45 |         double total = Math.sqrt((double) n * m);
46 |         if (total == 0) {
47 |             return 0.0;
48 |         }
49 |         // count the readers the two articles have in common
50 |         int sum = 0;
51 |         for (String key : p1.keySet()) {
52 |             if (p2.containsKey(key)) sum++;
53 |         }
54 |         return sum / total;
55 |     }
56 | }
57 |
--------------------------------------------------------------------------------
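Note: a worked example of twoItemCfCoeff with hypothetical reading histories: article i was read by 3 users, article j by 4 users, and 2 users read both:

    int n = 3, m = 4, common = 2;
    double w = common / Math.sqrt((double) n * m);  // 2 / sqrt(12) ≈ 0.577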
/src/main/java/com/ams/recommend/offline/SchedulerJob.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.offline;
2 |
3 | import com.ams.recommend.client.HBaseClient;
4 | import com.ams.recommend.util.Constants;
5 | import org.apache.flink.api.common.time.Time;
6 | import org.slf4j.Logger;
7 | import org.slf4j.LoggerFactory;
8 |
9 | import java.io.IOException;
10 | import java.util.*;
11 | import java.util.concurrent.ExecutorService;
12 | import java.util.concurrent.Executors;
13 |
14 | public class SchedulerJob {
15 |     private static final Logger logger = LoggerFactory.getLogger(SchedulerJob.class);
16 |     private static ExecutorService executorService = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors() + 1);
17 |
18 |     /**
19 |      * Every 12 hours, recompute article scores under three recommendation strategies:
20 |      * strategy 1: collaborative filtering;
21 |      * strategy 2: cosine-style similarity over article tags;
22 |      * strategy 3: similarity over article content.
23 |      */
24 |     public static void main(String[] args) {
25 |         Timer qTimer = new Timer();
26 |         qTimer.scheduleAtFixedRate(new RefreshTask(), 0, Time.hours(12).toMilliseconds());
27 |     }
28 |
29 |     private static class RefreshTask extends TimerTask {
30 |         @Override
31 |         public void run() {
32 |             logger.info(new Date() + " starting scheduled refresh");
33 |             /* collect the ids of all articles that users have read */
34 |             List<String> allArticleId;
35 |             try {
36 |                 allArticleId = HBaseClient.getAllKey(Constants.ARTICLE_HIS_TABLE);
37 |             } catch (IOException e) {
38 |                 System.err.println("failed to load historical article ids: " + e.getMessage());
39 |                 e.printStackTrace();
40 |                 return;
41 |             }
42 |
43 |             for (String id : allArticleId) {
44 |                 executorService.execute(new Task(id, allArticleId));
45 |             }
46 |         }
47 |     }
48 |
49 |     private static class Task implements Runnable {
50 |         private String id;
51 |         private List<String> others;
52 |
53 |         public Task(String id, List<String> others) {
54 |             this.id = id;
55 |             this.others = others;
56 |         }
57 |
58 |         private final ItemCfCoeff item = new ItemCfCoeff();
59 |         private final ArticleCoeff article = new ArticleCoeff();
60 |
61 |         @Override
62 |         public void run() {
63 |             try {
64 |                 item.getSingleItemCfCoeff(id, others);  // strategy 1: collaborative filtering
65 |                 article.getArticleCoeff(id, others);    // strategy 2: article tags
66 |                 article.calcuSimilar(id, others);       // strategy 3: article content
67 |             } catch (Exception e) {
68 |                 e.printStackTrace();
69 |             }
70 |         }
71 |     }
72 |
73 | }
74 |
--------------------------------------------------------------------------------
/src/main/java/com/ams/recommend/util/Constants.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.util;
2 |
3 | public class Constants {
4 |
5 |     public static final String SEX_MAN = "1";
6 |     public static final String SEX_WOMAN = "0";
7 |     public static final String AGE_10 = "10s";
8 |     public static final String AGE_20 = "20s";
9 |     public static final String AGE_30 = "30s";
10 |     public static final String AGE_40 = "40s";
11 |     public static final String AGE_50 = "50s";
12 |     public static final String AGE_60 = "60s";
13 |
14 |     public static String rangeAge(int age) {
15 |         if(age < 20) return AGE_10;
16 |         else if(age < 30) return AGE_20;
17 |         else if(age < 40) return AGE_30;
18 |         else if(age < 50) return AGE_40;
19 |         else if(age < 60) return AGE_50;
20 |         else return AGE_60;
21 |     }
22 |
23 |     /* article portrait table */
24 |     public final static String ARTICLE_PORTRAIT_TABLE = Property.getStrValue("table.portrait.article.name");
25 |     /* user portrait table */
26 |     public final static String USER_PORTRAIT_TABLE = Property.getStrValue("table.portrait.user.name");
27 |     /* article word table */
28 |     public final static String WORD_TABLE = Property.getStrValue("table.word.name");
29 |     /* article history table */
30 |     public final static String ARTICLE_HIS_TABLE = Property.getStrValue("table.article.history.name");
31 |     /* user history table */
32 |     public final static String USER_HIS_TABLE = Property.getStrValue("table.user.history.name");
33 |     /* article relevance table */
34 |     public final static String ARTICLE_CF_TABLE = Property.getStrValue("table.article.cf.name");
35 |     /* article tag relevance table */
36 |     public final static String ARTICLE_TAG_TABLE = Property.getStrValue("table.article.tag.name");
37 |     /* TF-IDF table */
38 |     public final static String ARTICLE_TFIDF_TABLE = Property.getStrValue("table.article.tfidf.name");
39 | }
40 |
--------------------------------------------------------------------------------
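Note: sample outputs of the age bucketing above:

    Constants.rangeAge(15); // "10s"
    Constants.rangeAge(25); // "20s"
    Constants.rangeAge(47); // "40s"
    Constants.rangeAge(70); // "60s"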
/src/main/java/com/ams/recommend/util/LogUtil.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.util;
2 |
3 | import com.ams.recommend.common.pojo.Log;
4 | import org.jetbrains.annotations.Nullable;
5 | import org.slf4j.Logger;
6 | import org.slf4j.LoggerFactory;
7 |
8 | public class LogUtil {
9 |
10 |     private static Logger logger = LoggerFactory.getLogger(LogUtil.class);
11 |
12 |     @Nullable // returns null for malformed input, so callers must check
13 |     public static Log toLogEntry(String log) {
14 |         logger.info(log);
15 |
16 |         Log logEntry = new Log();
17 |
18 |         String[] logArr = log.split(","); // log lines are comma-separated into four fields
19 |
20 |         if(logArr.length != 4) {
21 |             logger.error("Log message is malformed: " + log);
22 |             return null;
23 |         }
24 |
25 |         logEntry.setUserId(logArr[0]);
26 |         logEntry.setArticleId(logArr[1]);
27 |         logEntry.setTime(Long.valueOf(logArr[2]));
28 |         logEntry.setAction(logArr[3]);
29 |
30 |         return logEntry;
31 |     }
32 |
33 |     // Reversed-timestamp row key: newer logs get lexicographically smaller keys,
34 |     // so an HBase scan returns the most recent entries first.
35 |     public static String getLogRowKey(Long time) {
36 |         return String.valueOf(Long.MAX_VALUE - time);
37 |     }
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/java/com/ams/recommend/util/Property.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.util;
2 |
3 | import java.io.IOException;
4 | import java.io.InputStream;
5 | import java.io.InputStreamReader;
6 | import java.util.Properties;
7 |
8 | public class Property {
9 |
10 |     private final static String CONF_NAME = "config.properties";
11 |
12 |     private static Properties contextProperties;
13 |
14 |     static {
15 |         InputStream in = Thread.currentThread().getContextClassLoader().getResourceAsStream(CONF_NAME);
16 |         contextProperties = new Properties();
17 |         try {
18 |             InputStreamReader inputStreamReader = new InputStreamReader(in, "UTF-8");
19 |             contextProperties.load(inputStreamReader);
20 |             System.out.println("===[AMS-recommendation-system]=== configuration file loaded");
21 |         } catch (IOException e) {
22 |             System.err.println("===[AMS-recommendation-system]=== failed to load the configuration file!");
23 |             e.printStackTrace();
24 |         }
25 |     }
26 |
27 |     public static String getStrValue(String key) {
28 |         return contextProperties.getProperty(key);
29 |     }
30 |
31 |     public static int getIntValue(String key) {
32 |         if(key == null || key.isEmpty()) throw new IllegalArgumentException("Key must not be null or empty");
33 |
34 |         String strValue = getStrValue(key);
35 |         return Integer.parseInt(strValue);
36 |     }
37 |
38 |     public static Properties getKafkaProperties(String groupId) {
39 |         Properties properties = new Properties();
40 |         properties.setProperty("bootstrap.servers", getStrValue("kafka.bootstrap.servers"));
41 |         properties.setProperty("zookeeper.connect", getStrValue("kafka.zookeeper.connect"));
42 |         properties.setProperty("group.id", groupId);
43 |         return properties;
44 |     }
45 |
46 | }
--------------------------------------------------------------------------------
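Note: how a job typically obtains its consumer configuration from this class; the topic and group id below are hypothetical:

    Properties props = Property.getKafkaProperties("my-new-task");  // group.id isolates this job's offsets
    FlinkKafkaConsumer<String> consumer =
            new FlinkKafkaConsumer<>("log", new SimpleStringSchema(), props);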
/src/main/java/com/ams/recommend/util/WordTokenizerUtil.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.util;
2 |
3 | import com.hankcs.hanlp.seg.common.Term;
4 | import com.hankcs.hanlp.tokenizer.NotionalTokenizer;
5 |
6 | import java.util.HashMap;
7 | import java.util.List;
8 | import java.util.Map;
9 |
10 | public class WordTokenizerUtil {
11 |
12 |     /**
13 |      * Computes the TF (term frequency) of every word in the content.
14 |      */
15 |     public static Map<String, Double> tf(String content) {
16 |         Map<String, Double> wc = new HashMap<>();
17 |         List<Term> terms = NotionalTokenizer.segment(content);
18 |         int wordSize = terms.size();
19 |         System.out.println("total words: " + wordSize);
20 |
21 |         for(Term term : terms) {
22 |             if(wc.containsKey(term.word)) {
23 |                 wc.put(term.word, wc.get(term.word) + 1.0);
24 |             } else wc.put(term.word, 1.0);
25 |         }
26 |
27 |         Map<String, Double> tf = new HashMap<>();
28 |
29 |         for(Map.Entry<String, Double> w : wc.entrySet()) {
30 |             tf.put(w.getKey(), (w.getValue() / wordSize));
31 |         }
32 |         return tf;
33 |     }
34 |
35 |     /**
36 |      * Segments text and filters out stop words.
37 |      */
38 |     public static String segment(String text) {
39 |         StringBuilder builder = new StringBuilder();
40 |         for(Term term : NotionalTokenizer.segment(text)) {
41 |             builder.append(term.word).append(" ");
42 |         }
43 |         return builder.toString();
44 |     }
45 | }
46 |
--------------------------------------------------------------------------------
/src/main/resources/config.properties:
--------------------------------------------------------------------------------
1 | #============== hadoop ===================
2 | hadoop.user=hadoop
3 | fs.defaultFS=hdfs://master:8020
4 | #hdfs.ha.zookeeper.quorum=XXXX-apache00.XX01,XXXX-apache01.XX01,XXXX-apache02.XX01
5 |
6 | #============== hbase ===================
7 | hbase.rootdir=hdfs://hbase:9000/hbase
8 | hbase.zookeeper.quorum=master,slave1,slave2
9 | hbase.client.scanner.timeout.period=1000
10 | hbase.rpc.timeout=3000
11 | hbase.client.ipc.pool.size=1
12 | table.log.name=log
13 | table.user.history.name=ua_history
14 | table.article.history.name=au_history
15 | table.word.name=word
16 | table.portrait.article.name=article_portrait
17 | table.portrait.user.name=user_portrait
18 | table.article.cf.name=acf
19 | table.article.tag.name=atag
20 | table.article.tfidf.name=tfidf
21 |
22 | #============== mysql config ===================
23 | mysql.url=jdbc:mysql://mysql:3306/ams?serverTimezone=GMT%2B8
24 | mysql.name=root
25 | mysql.password=123456
26 |
27 | #============== redis config ===================
28 | redis.host=ams
29 | redis.port=6371
30 | redis.db=0
31 |
32 | #============== kafka config ===================
33 | kafka.bootstrap.servers=master:9092
34 | kafka.zookeeper.connect=master:2181
35 |
36 | #============== spider ===================
37 | spider.article.size=1
--------------------------------------------------------------------------------
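Note: the table.* keys above are the values Constants resolves once at class-load time, e.g.:

    String articlePortraitTable = Property.getStrValue("table.portrait.article.name");  // "article_portrait"
    String wordTable            = Property.getStrValue("table.word.name");              // "word"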
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | ### root logger configuration
2 | log4j.rootLogger=DEBUG,stdout,file
3 | # ERROR: serious errors, mostly program failures
4 | # WARN: general warnings, e.g. a lost session
5 | # INFO: general information, e.g. login/logout
6 | # DEBUG: debugging information
7 | log4j.additivity.org.apache=true
8 |
9 | ### appender configuration (log destinations)
10 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
11 | # org.apache.log4j.ConsoleAppender (console)
12 | # org.apache.log4j.FileAppender (file)
13 | # org.apache.log4j.DailyRollingFileAppender (one log file per day)
14 | # org.apache.log4j.RollingFileAppender (new file once a size limit is reached)
15 | # org.apache.log4j.WriterAppender (stream log events to an arbitrary destination)
16 | #log4j.appender.error.Target=System.out
17 | ### log INFO level and above to the console
18 | log4j.appender.stdout.threshold=INFO
19 | ### log layout
20 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
21 | # org.apache.log4j.HTMLLayout (HTML table layout)
22 | # org.apache.log4j.PatternLayout (freely configurable pattern)
23 | # org.apache.log4j.SimpleLayout (level and message string only)
24 | # org.apache.log4j.TTCCLayout (time, thread, category, etc.)
25 | ### log message pattern
26 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
27 |
28 | # %m the message given in the code
29 | # %p the priority: DEBUG, INFO, WARN, ERROR, FATAL
30 | # %r milliseconds elapsed since application start
31 | # %c the category, usually the fully qualified class name
32 | # %t the name of the thread that produced the event
33 | # %n a line separator: "\r\n" on Windows, "\n" on Unix
34 | # %d the date/time of the event, ISO8601 by default, or with an explicit format, e.g. %d{yyy MMM dd HH:mm:ss , SSS}
35 | # %l the location of the event: category, thread, and line number in the code
36 | #log4j.appender.file=org.apache.log4j.RollingFileAppender
37 | log4j.appender.file=org.apache.log4j.DailyRollingFileAppender
38 | log4j.appender.file.layout=org.apache.log4j.PatternLayout
39 | log4j.appender.file.DatePattern='.'yyyy-MM-dd-HH-mm
40 | # '.'yyyy-MM: monthly
41 | # '.'yyyy-ww: weekly
42 | # '.'yyyy-MM-dd: daily
43 | # '.'yyyy-MM-dd-a: twice a day
44 | # '.'yyyy-MM-dd-HH: hourly
45 | # '.'yyyy-MM-dd-HH-mm: every minute
46 | #log4j.appender.file.MaxFileSize=1MB
47 | ### maximum number of rolled files
48 | #log4j.appender.file.MaxBackupIndex=8
49 | log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} [%-5p](%-30c{1}) [TxId : %X{PtxId} , SpanId : %X{PspanId}] [ET:%X{ENV_TYPE},AN:%X{APP_NAME},SN:%X{SERVICE_NAME},CN:%X{CONTAINER_NAME},CI:%X{CONTAINER_IP}] %m%n
50 | log4j.appender.file.Threshold=DEBUG
51 | ### append to the file; false would overwrite it
52 | log4j.appender.file.append=true
53 | ### log file location
54 | #log4j.appender.file.File=E:/logs/file-debug-log.log
55 | log4j.appender.file.File=logs/debug-debug.log
56 | ### one log file per day
57 | #log4j.appender.file=org.apache.log4j.DailyRollingFileAppender
58 | #log4j.appender.file.layout=org.apache.log4j.PatternLayout
59 | #log4j.appender.file.maxFileSize=100
60 | #log4j.appender.file.maxBackupIndex=5
61 | #log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} [%-5p](%-30c{1}) [TxId : %X{PtxId} , SpanId : %X{PspanId}] [ET:%X{ENV_TYPE},AN:%X{APP_NAME},SN:%X{SERVICE_NAME},CN:%X{CONTAINER_NAME},CI:%X{CONTAINER_IP}] %m%n
62 | #log4j.appender.file.Threshold=DEBUG
63 | #log4j.appender.file.append=true
64 | #log4j.appender.file.File=E:/logs/debug-log.log
--------------------------------------------------------------------------------
/src/main/test/com/ams/recommend/client/HBaseClientTest.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.client;
2 |
3 | import org.junit.Assert;
4 | import org.junit.Test;
5 |
6 | import java.util.Map;
7 |
8 | public class HBaseClientTest {
9 |
10 |     @Test
11 |     public void testTableApi() {
12 |         Assert.assertTrue(HBaseClient.existTable("log"));
13 |         HBaseClient.createTableIfNotExist("log", "l");
14 |         HBaseClient.createTableIfNotExist("u_interest", "i");
15 |     }
16 |
17 |     @Test
18 |     public void testGetRow() {
19 |         Map<String, String> kvs = HBaseClient.getRow("log", "9223372035269505076");
20 |
21 |         for(Map.Entry<String, String> kv : kvs.entrySet()) {
22 |             System.out.println("column : " + kv.getKey() + ", value : " + kv.getValue());
23 |         }
24 |     }
25 |
26 |     @Test
27 |     public void testPut() {
28 |         HBaseClient.put("log",
29 |                 "9223372035269505076",
30 |                 "l",
31 |                 "uid",
32 |                 "50");
33 |     }
34 |
35 |     @Test
36 |     public void testGet() {
37 |         String res = HBaseClient.get("log",
38 |                 "9223372035269505076",
39 |                 "l",
40 |                 "uid"
41 |         );
42 |         Assert.assertEquals("50", res);
43 |     }
44 |
45 | }
--------------------------------------------------------------------------------
/src/main/test/com/ams/recommend/client/RedisClientTest.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.client;
2 |
3 | import org.junit.Test;
4 |
5 | import java.util.List;
6 |
7 | public class RedisClientTest {
8 |
9 |     @Test
10 |     public void getTest() {
11 |         RedisClient client = new RedisClient();
12 |         int topRange = 10;
13 |         List<String> data = client.getTopList(topRange);
14 |         for(int i = 0; i < topRange; i++)
15 |             System.out.println(i + " : " + data.get(i));
16 |     }
17 |
18 | }
19 |
--------------------------------------------------------------------------------
/src/main/test/com/ams/recommend/util/LogUtilTest.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.util;
2 |
3 | import com.ams.recommend.common.pojo.Log;
4 | import org.junit.Assert;
5 | import org.junit.Test;
6 |
7 | import java.util.Date;
8 |
9 | public class LogUtilTest {
10 |
11 |     @Test
12 |     public void testToLogEntry() {
13 |         Log log = new Log();
14 |         log.setUserId("1");
15 |         log.setArticleId("1");
16 |         long timestamp = new Date().getTime();
17 |         log.setTime(timestamp);
18 |         log.setAction("1"); // action "1" = open/read
19 |
20 |         Assert.assertEquals(log.toString(), LogUtil.toLogEntry("1,1," + timestamp + ",1").toString());
21 |     }
22 |
23 |     @Test
24 |     public void rowKey() {
25 |         long timestamp = new Date().getTime();
26 |         Assert.assertEquals(String.valueOf(Long.MAX_VALUE - timestamp), LogUtil.getLogRowKey(timestamp));
27 |     }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/src/main/test/com/ams/recommend/util/NotionalTokenizer.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.util;
2 |
3 | import com.hankcs.hanlp.HanLP;
4 | import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;
5 | import com.hankcs.hanlp.dictionary.stopword.Filter;
6 | import com.hankcs.hanlp.seg.Segment;
7 | import com.hankcs.hanlp.seg.common.Term;
8 |
9 | import java.util.List;
10 | import java.util.ListIterator;
11 |
12 | public class NotionalTokenizer
13 | {
14 |     /**
15 |      * The preconfigured segmenter.
16 |      */
17 |     static final Segment SEGMENT = HanLP.newSegment();
18 |
19 |     public static List<Term> segment(String text)
20 |     {
21 |         return segment(text.toCharArray());
22 |     }
23 |
24 |     /**
25 |      * Segments text into terms, dropping stop words.
26 |      *
27 |      * @param text the text
28 |      * @return the segmentation result
29 |      */
30 |     public static List<Term> segment(char[] text)
31 |     {
32 |         List<Term> resultList = SEGMENT.seg(text);
33 |         ListIterator<Term> listIterator = resultList.listIterator();
34 |         while (listIterator.hasNext())
35 |         {
36 |             if (!CoreStopWordDictionary.shouldInclude(listIterator.next()))
37 |             {
38 |                 listIterator.remove();
39 |             }
40 |         }
41 |
42 |         return resultList;
43 |     }
44 |
45 |     /**
46 |      * Segments text into sentences of terms, dropping stop words.
47 |      *
48 |      * @param text the text
49 |      * @return one term list per sentence
50 |      */
51 |     public static List<List<Term>> seg2sentence(String text)
52 |     {
53 |         List<List<Term>> sentenceList = SEGMENT.seg2sentence(text);
54 |         for (List<Term> sentence : sentenceList)
55 |         {
56 |             ListIterator<Term> listIterator = sentence.listIterator();
57 |             while (listIterator.hasNext())
58 |             {
59 |                 if (!CoreStopWordDictionary.shouldInclude(listIterator.next()))
60 |                 {
61 |                     listIterator.remove();
62 |                 }
63 |             }
64 |         }
65 |
66 |         return sentenceList;
67 |     }
68 |
69 |     /**
70 |      * Segments text into sentences of terms with a custom filter chain.
71 |      *
72 |      * @param text the text
73 |      * @param filterArrayChain the custom filter chain
74 |      * @return one term list per sentence
75 |      */
76 |     public static List<List<Term>> seg2sentence(String text, Filter... filterArrayChain)
77 |     {
78 |         List<List<Term>> sentenceList = SEGMENT.seg2sentence(text);
79 |         for (List<Term> sentence : sentenceList)
80 |         {
81 |             ListIterator<Term> listIterator = sentence.listIterator();
82 |             while (listIterator.hasNext())
83 |             {
84 |                 // always advance the iterator; skipping next() when the chain is null would loop forever
85 |                 Term term = listIterator.next();
86 |                 if (filterArrayChain == null) continue;
87 |                 for (Filter filter : filterArrayChain)
88 |                 {
89 |                     if (!filter.shouldInclude(term))
90 |                     {
91 |                         listIterator.remove();
92 |                         break;
93 |                     }
94 |                 }
95 |             }
96 |         }
97 |
98 |         return sentenceList;
99 |     }
100 |
101 |     public static void main(String[] args) {
102 |         System.out.println(
103 |                 NotionalTokenizer.segment("Hi,大家好,这里是本人的新博客基地," +
104 |                         "之前的博客是在CSDN平台,由于各种原因长时间没有更新,同时想要有更加独立的," +
105 |                         "更简洁界面的博客,因此将地址转到本站。"));
106 |     }
107 | }
--------------------------------------------------------------------------------
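Note: since Filter declares a single shouldInclude(Term) method, the filter-chain overload above can be fed lambdas; a hypothetical noun-only filter (HanLP noun natures start with "n"):

    List<List<Term>> sentences = NotionalTokenizer.seg2sentence(
            "商品和服务",                                     // sample text
            term -> term.nature.toString().startsWith("n")  // keep nouns only
    );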
/src/main/test/com/ams/recommend/util/TFTest.java:
--------------------------------------------------------------------------------
1 | package com.ams.recommend.util;
2 |
3 | import java.io.File;
4 | import java.io.FileNotFoundException;
5 | import java.util.*;
6 |
7 | public class TFTest {
8 |
9 |     public static void main(String[] args) throws FileNotFoundException {
10 |         StringBuilder sb = new StringBuilder();
11 |
12 |         Scanner in = new Scanner(
13 |                 new File("/media/baith/123b86d4-6a94-41c8-994f-5786ea4c760c/download/bi.txt")
14 |         );
15 |
16 |         while(in.hasNext()) {
17 |             sb.append(in.next());
18 |         }
19 |
20 |         Map<String, Double> tfs = WordTokenizerUtil.tf(sb.toString());
21 |         List<Map.Entry<String, Double>> tflist = new LinkedList<>(tfs.entrySet());
22 |         tflist.sort((o1, o2) -> o2.getValue().compareTo(o1.getValue())); // descending by TF
23 |
24 |         for(Map.Entry<String, Double> tf : tflist) {
25 |             System.out.println(tf.getKey() + " : " + tf.getValue());
26 |         }
27 |     }
28 |
29 | }
30 |
--------------------------------------------------------------------------------