├── .gitignore ├── .idea └── .name ├── README.md ├── data ├── discoveryWord │ └── dis-word-part-00023-opt └── recommend │ └── ml-1m │ ├── README │ ├── movies.dat │ ├── personalRatings.txt │ ├── ratings.dat │ └── users.dat ├── out └── artifacts │ └── spark_example_jar │ └── spark-example.jar ├── pom.xml ├── spark-example.iml └── src └── main └── java ├── META-INF └── MANIFEST.MF └── com └── blogchong ├── spark └── mllib │ ├── advance │ ├── ALSRecommendMovie │ │ └── AlsArithmeticPractice.scala │ ├── DiscoveryNewWord │ │ ├── AnsjDisWords.scala │ │ └── NGramSpark.scala │ └── LdaExtractTopics │ │ ├── Check │ │ ├── PredictsDocTopics.scala │ │ └── PredictsDocTopicsArgsParser.scala │ │ ├── Refer │ │ ├── LDAModelBuild.scala │ │ └── LDAModelBuildArgsParser.scala │ │ └── Train │ │ ├── LDAModelBuild.scala │ │ └── LDAModelBuildArgsParser.scala │ └── base │ ├── AlsArithmetic.scala │ ├── Kmeans.scala │ ├── KmeansArithmetic.scala │ ├── LdaArithmetic.scala │ └── Word2Vec.scala └── util ├── CharUtil.java └── NewTime.java /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /.classpath 3 | /.project 4 | /.cache 5 | /.settings 6 | /logs 7 | /.idea 8 | -------------------------------------------------------------------------------- /.idea/.name: -------------------------------------------------------------------------------- 1 | spark-example -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #这是一个Spark MLlib实例 2 | ##1 K-means基础实例 3 | ###1.1 数据准备 4 | **准备好如下数据：**
5 | >0.0 0.0 0.0
6 | 0.1 0.1 0.1
7 | 0.2 0.2 0.2
8 | 9.0 9.0 9.0
9 | 9.1 9.1 9.1
10 | 9.2 9.2 9.2
11 | 0.3 0.2 0.2
12 | 9.1 9.5 9.1
13 | 0.2 0.2 0.2
14 | 0.1 0.2 0.2
15 | 8.9 9.5 9.1
16 | 17 | 命名为kmeans_data.txt，且上传到hdfs的/spark/mllib/data/路径中。
18 | 19 | ###1.2 代码打包 20 | * 在Intellij中，点击file->选择project structure->选择Artifact->添加jar->把乱七八糟的依赖移除->勾选Build on make。
21 | * 点击Build->选择Build Artifact->选择ReBuild，然后在之前填写的路径下找到jar。
22 | * 上传到spark中。
23 | 24 | ###1.3 执行代码 25 | * 执行命令`./spark-submit --class com.blogchong.spark.mllib.base.Kmeans --master spark://192.168.5.200:7077 --num-executors 2 --driver-memory 124m --executor-memory 124m --total-executor-cores 2 /root/spark/hcy/spark-example.jar`
26 | //需要注意的是，在设置core数以及内存时，最好先参考一下spark-master-id:8080页面中的worker参数，别超过了就行。
27 | * 跑完了，直接到输出文件夹下，找到代码的输出结果即可。
28 | 29 | ##2 协同推荐ALS算法基础实例 30 | ###2.1 数据准备 31 | **用户评分数据，格式: 用户ID,电影ID,评分**
32 | >1,1,5.0
33 | 1,2,1.0
34 | 1,3,5.0
35 | 1,4,1.0
36 | 2,1,5.0
37 | 2,2,1.0
38 | 2,3,5.0
39 | 2,4,1.0
40 | 3,1,1.0
41 | 3,2,5.0
42 | 3,3,1.0
43 | 3,4,5.0
44 | 4,1,1.0
45 | 4,2,5.0
46 | 4,3,1.0
47 | 4,4,5.0
48 | 49 | 上传到hdfs的/spark/mllib/data/als路径中。
50 | 51 | ###2.2 代码打包 52 | * 在Intellij中，点击file->选择project structure->选择Artifact->添加jar->把乱七八糟的依赖移除->勾选Build on make。
53 | * 点击Build->选择Build Artifact->选择ReBuild，然后在之前填写的路径下找到jar。
54 | * 上传到spark中。
55 | 56 | ###2.3 执行代码 57 | * 执行命令`./spark-submit --class com.blogchong.spark.mllib.base.AlsArithmetic --master spark://192.168.5.200:7077 --num-executors 2 --driver-memory 124m --executor-memory 124m --total-executor-cores 2 /root/spark/hcy/spark-example.jar`
58 | //需要注意的是，在设置core数以及内存时，最好先参考一下spark-master-id:8080页面中的worker参数，别超过了就行。
59 | * 跑完了，直接到输出文件夹下，找到代码的输出结果即可。
60 | 61 | ###2.4 附加说明 62 | >在实际的调试过程中，我们会把ALS的几个重要参数，比如迭代次数，比如lambda值等，设置成一个范围，然后进行逐步调优，当MSE值，也就是均方误差值最小时，这个模型即我们需要的训练模型。
63 | 64 | ##3 协同推荐ALS算法进阶--电影推荐实例 65 | ###3.1 数据准备 66 | **当前用户(需要给这货做推荐)评分数据(11条)personalRatings.txt
67 | >格式用户ID::电影ID::评分::时间戳**
68 | >0::1::5::1409495135
69 | 0::780::4::1409495135
70 | 0::590::3::1409495135
71 | 0::1216::4::1409495135
72 | 0::648::5::1409495135
73 | 0::344::3::1409495135
74 | 0::165::4::1409495135
75 | 0::153::5::1409495135
76 | 0::597::4::1409495135
77 | 0::1586::5::1409495135
78 | 0::231::5::1409495135
79 | 80 | **电影信息数据(3706条)movies.dat
81 | >格式: 电影ID::电影名称::类型**
82 | >1::Toy Story (1995)::Animation|Children's|Comedy
83 | 2::Jumanji (1995)::Adventure|Children's|Fantasy
84 | 3::Grumpier Old Men (1995)::Comedy|Romance
85 | 4::Waiting to Exhale (1995)::Comedy|Drama
86 | 5::Father of the Bride Part II (1995)::Comedy
87 | 6::Heat (1995)::Action|Crime|Thriller
88 | 7::Sabrina (1995)::Comedy|Romance
89 | 8::Tom and Huck (1995)::Adventure|Children's
90 | 9::Sudden Death (1995)::Action
91 | 92 | **用户电影评分信息数据(1000209条)ratings.dat
93 | >格式: 用户ID::电影ID::评分::时间戳**
94 | >3::260::5::978297512
95 | 3::2858::4::978297039
96 | 3::3114::3::978298103
97 | 3::1049::4::978297805
98 | 3::1261::1::978297663
99 | 3::552::4::978297837
100 | 3::480::4::978297690
101 | 4::1265::2::978298316
102 | 4::1266::5::978297396
103 | 4::733::5::978297757
104 | 105 | 上传到hdfs的/spark/mllib/data/als2路径中。
106 | 107 | ###3.2 代码打包 108 | * 在Intellij中，点击file->选择project structure->选择Artifact->添加jar->把乱七八糟的依赖移除->勾选Build on make。
109 | * 点击Build->选择Build Artifact->选择ReBuild，然后在之前填写的路径下找到jar。
110 | * 上传到spark中。
111 | 112 | ###3.3 执行代码 113 | * 执行命令`./spark-submit --class com.blogchong.spark.mllib.advance.ALSRecommendMovie.AlsArithmeticPractice --master spark://192.168.5.200:7077 --num-executors 2 --driver-memory 400m --executor-memory 400m --total-executor-cores 2 /root/spark/hcy/spark-example.jar`
114 | //需要注意的是，在设置core数以及内存时，最好先参考一下spark-master-id:8080页面中的worker参数，别超过了就行。
115 | * 跑完了，直接到输出文件夹下，找到代码的输出结果即可。
116 | 117 | ###3.4 附加说明 118 | >在调试过程中，把ALS的几个重要参数，比如迭代次数，比如lambda值等，设置成一个范围，然后进行逐步调优，当MSE值，也就是均方误差值最小时，这个模型即我们需要的训练模型。
119 | 120 | ###3.5 输出结果 121 | >对于每次尝试的结果直接打印，最终给用户0推荐的结果按降序保存在/spark/mllib/result/als2/data/recommendations，模型文件保存在/spark/mllib/result/als2/model。 122 | 123 | 124 | ##4 LDA主题特征抽取实例 125 | 126 | ###执行命令 127 | 128 | > 批量文档训练：获取训练文档的Docs-Topics概率矩阵，以及Words-Topics概率矩阵，当然，还有Model文件，这是后期预测新文档的基础. 129 | 130 | * 训练执行命令`./spark-submit --class com.blogchong.spark.mllib.advance.LdaExtractTopics.Train.LDAModelBuild --master spark://192.168.25.10:7077 --conf "spark.driver.extraJavaOptions=-XX:MaxPermSize=512m" --conf "spark.executor.extraJavaOptions=-XX:MaxPermSize=512m" --executor-memory 6G --driver-memory 6G --num-executors 4 --executor-cores 4 --jars /root/hcyLda/spark-example-1.0-SNAPSHOT.jar XX PdataPath /hcy/lda/train/part-r-00000-write PmodelPath /hcy/lda/model PtopicSize 100 PmaxIterations 100 PwordsPath /hcy/lda/train/extract_index.dic PsaveVector true > ~/hcyLda/20151219.log 2>&1` 131 | 132 | > 新文档主题预测：利用上面训练得到的LocalLDAModel，进行新文档的主题预测，求docs-topics矩阵，然后结合Model中已有的Topics-words矩阵，求docs-words矩阵 133 | 134 | * 测试执行命令`./spark-submit --class com.blogchong.spark.mllib.advance.LdaExtractTopics.Check.PredictsDocTopics --master spark://192.168.25.10:7077 --conf "spark.driver.extraJavaOptions=-XX:MaxPermSize=512m" --conf "spark.executor.extraJavaOptions=-XX:MaxPermSize=512m" --executor-memory 6G --driver-memory 6G --num-executors 1 --executor-cores 1 --jars /root/hcyLda/spark-example-1.0-SNAPSHOT.jar XX PdataPath /hcy/lda/data/test.data PmodelPath /hcy/lda/model/2015-12-23-23-32-00/localLdaModel PtopicsPath /hcy/lda/data PtopicSize 200 PwordsPath /hcy/lda/train/extract_index.dic > ~/hcyLda/201512231544.log 2>&1` 135 | 136 | ##5 新词发现(基于Ansj工具) 137 | 138 | ###执行命令 139 | 140 | * 训练执行命令`./spark-submit --class com.blogchong.spark.mllib.advance.DiscoveryNewWord.AnsjDisWords --master spark://192.168.25.10:7077 --conf "spark.driver.extraJavaOptions=-XX:MaxPermSize=512m" --conf "spark.executor.extraJavaOptions=-XX:MaxPermSize=512m" --executor-memory 3G --driver-memory 3G --num-executors 1 
--executor-cores 1 --jars /root/hcyLda/newWord/spark-example-1.0-SNAPSHOT.jar,/root/hcyLda/newWord/ansj_seg-0.9.jar,/root/hcyLda/newWord/tree_split-1.0.1.jar --driver-library-path /root/hcyLda/newWord/ansj_seg-0.9.jar /root/hcyLda/newWord/tree_split-1.0.1.jar /hcy/newWord/data/11 /hcy/newWord/result` 141 | 142 | ##6 新词发现(基于NGram算法的Spark实现) 143 | 144 | ###执行命令 145 | 146 | * 执行命令`./spark-submit --class com.blogchong.spark.mllib.advance.DiscoveryNewWord.NGramSpark --master spark://192.168.25.10:7077 --conf "spark.driver.extraJavaOptions=-XX:MaxPermSize=512m" --conf "spark.executor.extraJavaOptions=-XX:MaxPermSize=512m" --executor-memory 6G --driver-memory 18G --num-executors 3 --executor-cores 3 --jars /root/hcyLda/newWord/spark-example-1.0-SNAPSHOT.jar,/root/hcyLda/newWord/ansj_seg-0.9.jar,/root/hcyLda/newWord/tree_split-1.0.1.jar,/root/hcyLda/newWord/json-lib-2.4-jdk13.jar,/root/hcyLda/newWord/ezmorph-1.0.6.jar /root/hcyLda/newWord/spark-example-1.0-SNAPSHOT.jar /hcy/newWord/data/userLibrary.dic /hcy/newWord/data/11 /hcy/newWord/result > ~/hcyLda/newWord/20160118.log` 147 | -------------------------------------------------------------------------------- /data/discoveryWord/dis-word-part-00023-opt: -------------------------------------------------------------------------------- 1 | font-family 2 | vertical-align 3 | 售票窗口 4 | 安全组 5 | 图形缓冲区 6 | 去雾 7 | 抗变 8 | 要素类 9 | gdt表 10 | 写者 11 | rtp数据 12 | 提取器 13 | user_agent 14 | nat转换 15 | 虚拟运行 16 | read_lock 17 | 流程浅析 18 | surfaceflinger服务 19 | 任务栈 20 | 意图对象 21 | 注释类型 22 | phone状态 23 | 传输地址 24 | render进程 25 | 界面外观 26 | build_system 27 | 错误响应 28 | power_supply 29 | 单面板 30 | 局部变量数组 31 | sql节点 32 | 全文目录 33 | 顺序锁 34 | gpu进程 35 | 分配粒度 36 | ffmpeg源代码 37 | 暗通道 38 | session数据 39 | wms服务 40 | 兼容级别 41 | store节点 42 | 霍夫变换 43 | start_code 44 | 线程pool 45 | 滤波法 46 | 重写规则 47 | schedule_timeout 48 | 输入子系统 49 | fd_set 50 | 纤程 51 | socket描述 52 | body对象 53 | 辅助索引 54 | xp_cmdshell 55 | 工作项 56 | signal_pending 57 | 资料表 58 | 数据捕获 59 | 管理节点 60 | 性能视图 
61 | using指示 62 | 败者树 63 | project元素 64 | 同步框架 65 | 会话bean 66 | current_session 67 | 段界限 68 | 年轻代 69 | 弱符号 70 | session文件 71 | 类厂 72 | 核心动画 73 | gatt服务 74 | settings跳转 75 | 父rdd 76 | zi类 77 | 过滤规则 78 | 划分数 79 | mysql_fetch_array 80 | 降序数组 81 | 立即数 82 | 界面线程 83 | 国际化消息 84 | arp请求 85 | 本尊 86 | 检索策略 87 | 脏页面 88 | riff文件 89 | compatible属性 90 | buf_type 91 | session隔离 92 | rtpsession类 93 | 操作视窗 94 | 贝叶斯算法 95 | open列表 96 | id_file 97 | 事件过滤器 98 | dll_thread_attach 99 | irq_desc 100 | 旋锁 101 | 远程对象 102 | 延迟检索 103 | crontab文件 104 | 提权 105 | set_fd 106 | thread-count 107 | dllmain函数 108 | 目标场景 109 | tcp段 110 | scn号 111 | withevent方法 112 | nat穿透 113 | pthread_rwlock_init 114 | hp-socket 115 | session初始化 116 | 标签处理器 117 | 强符号 118 | 事务id 119 | 活动对象 120 | 共享函数 121 | activity组件 122 | rtp包 123 | bitset对象 124 | 产品族 125 | 缓存项 126 | ngx_http 127 | aopproxy代理 128 | 推荐列表 129 | 互斥对象 130 | 子块 131 | lob类型 132 | 共享函数库 133 | activitymanagerservice服务 134 | 添加商品 135 | product_out 136 | system_window 137 | json_object 138 | record结构 139 | 导向图 140 | javaweb工程 141 | kitkatphone工作 142 | 副本集 143 | 层次聚类 144 | 返回页 145 | corp节点 146 | 帧缓存 147 | 位域 148 | overflow按钮 149 | 传输控制块 150 | 处理例程 151 | hash_map 152 | 丑数 153 | 深度测试 154 | browser进程 155 | 数据标签 156 | 快速窗口 157 | soap消息 158 | 数据节点 159 | 语法解析器 160 | 马氏距离 161 | 面板布局 162 | 消息块 163 | 屏蔽信号 164 | 数据库集成 165 | 拖尾 166 | criteria查询 167 | uitableviewcell对象 168 | 右值 169 | 游戏模式 170 | 旋转算法 171 | 平均滤波 172 | 商品列表 173 | 动态函数库 174 | 设计问题 175 | closed列表 176 | 输入分片 177 | 闭包表达式 178 | idle进程 179 | 页面空间 180 | 宽依赖 181 | 引用逸出 182 | 类型擦除 183 | socket描述字 184 | 游戏邦 185 | toast通知 186 | 实际用户 187 | 规则文件 188 | 窗口修饰 189 | 终结节点 190 | 应用空间 191 | 宿主语言 192 | cdata段 193 | 加密体制 194 | 视图名称 195 | 实时流 196 | 孤儿进程 197 | java栈 198 | dll_process_attach 199 | auto_ptr 200 | 短链 201 | 存储属性 202 | 中断处理 203 | alpha混合 204 | 描述字 205 | 地址结构 206 | deb-src 207 | 人脸图像 208 | pthread_detach 209 | 特殊权限 210 | 实时传输 211 | nfc标签 212 | 病毒编号 213 | 组合索引 214 | machine-name 215 | proto文件 216 | 记录锁 217 | 缓存行 218 | 页框 
219 | 模式串 220 | 纹素 221 | shelldll_defview 222 | 桥方法 223 | 外部存储介质 224 | 表变量 225 | 动态性能视图 226 | 栋栋 227 | 堆内存heap 228 | 导向滤波 229 | lock函数 230 | 脉冲干扰 231 | 捕获机制 232 | 奇圈 233 | 私有地址 234 | 取票 235 | hashentry数组 236 | 顶点函数 237 | job参数 238 | storm集群 239 | ntp时间 240 | slave进程 241 | 依赖范围 242 | 安装副本 243 | id_edit_copy 244 | 物理menu 245 | 序列化形式 246 | 汇编地址 247 | dll_process_attach 248 | 调用bean 249 | 路径表 250 | 日志清理 251 | stream_type 252 | 主站点 253 | 坐标轴范围 254 | 标记文本 255 | 帧缓冲 256 | 范围扫描 257 | 拖拽事件 258 | 校验器 259 | 自定义功能 260 | state模式 261 | file_operations 262 | 命令执行器 263 | region_id 264 | 中断处理 265 | 平衡因子 266 | 原根 267 | 残量 268 | hprof文件 269 | dbf文件 270 | ble设备 271 | pthread_cond_wait 272 | 钩子方法 273 | jni编程 274 | rtp会话 275 | 伪元素 276 | 睡眠模式 277 | 半连接 278 | 共用体 279 | 黑球 280 | poi检索 281 | 局部引用 282 | 硬链接 283 | 结构元素 284 | 框架层 285 | 后端服务器 286 | 内存池 287 | 特权级 288 | request函数 289 | 类地址 290 | 控制块 291 | 注册中心 292 | 特征点 293 | 职责链 294 | 周期数 295 | zygote进程 296 | 胜者树 297 | 双向队列 298 | instead_of触发器 299 | 桶排序 300 | before触发器 301 | spi设备 302 | 代码运行图 303 | 求解树 304 | 白球 305 | 公共长度 306 | 嵌套表 307 | 远端仓库 308 | video_capture 309 | survivor区域 310 | 行光标 311 | js数组 312 | 点云 313 | 按键音 314 | sensoreventlistener接口 315 | 情感倾向 316 | 更新脚本 317 | smali文件 318 | 物理仿真 319 | 参考帧 320 | 浸入式 321 | 奇异态 322 | 硬解码 323 | 转移次数 324 | 目的寄存器 325 | 帧间预测 326 | return-einval 327 | loop对象 328 | 加壳 329 | 列族 330 | binder驱动程序 331 | 位带操作 332 | wm_drawitem 333 | entry链 334 | connectionstrings配置 335 | 动态函数 336 | non-fast-forward 337 | 媒体流 338 | iic总线 339 | play应用 340 | tiddata块 341 | might_sleep 342 | 灰度变换 343 | next_date 344 | 隐函数 345 | 喷水装置 346 | 立即检索 347 | 伪目标 348 | 切比雪夫距离 349 | 并行化 350 | size-cells 351 | do_select 352 | autowired注释 353 | nagios监测 354 | dll_process_detach 355 | 子分区 356 | 优惠劵 357 | component类 358 | interface方式 359 | gui组件 360 | word_count 361 | resultsethandler实现 362 | gen_server 363 | cfs调度 364 | 面版 365 | slot函数 366 | 滑动块 367 | 阶码 368 | network-manager 369 | erlang节点 370 | scoped_ptr 371 | commit历史 372 | cmyk模式 373 | 缓冲设备 
374 | crash日志 375 | 设置gpio 376 | cast转换 377 | longjmp函数 378 | 内存示意图 379 | pthread_key 380 | ssl协议 381 | writablecomparable接口 382 | avio_open 383 | 自动回复 384 | 函数节流 385 | 写锁 386 | 委托类 387 | 就绪队列 388 | 工作内存 389 | 服务时间 390 | 条件变量 391 | socket通道 392 | 可变集合 393 | 颜色表 394 | wait_event 395 | 内核抢占 396 | 等价类 397 | url标签 398 | 抽象路径名 399 | 互斥量 400 | timestamp列 401 | 动态性能 402 | 逻辑块号 403 | 物理块号 404 | 数据报文 405 | 鸽巢 406 | 鸽巢原理 407 | survivor区 408 | 资源计数 409 | sheet页 410 | innodb表 411 | hello-hello 412 | out事件 413 | 负载类型 414 | 前台服务 415 | cms收集器 416 | jit编译器 417 | 逸出 418 | 控制终端 419 | 虚基类 420 | 状态模式 421 | core文件 422 | 画刷 423 | binder驱动 424 | 主串 425 | pthread_once 426 | 父坐标 427 | nulls说明 428 | session生命周期 429 | open_cache 430 | oaf框架 431 | 麦可网 432 | rmi调用 433 | 长度类型 434 | simplest_ffmpeg_remuxer 435 | 滑动动画 436 | http_core 437 | 灰度模式 438 | 偏移坐标 439 | dc算法 440 | 匈牙利树蜂 441 | 顺序文件 442 | android镜像 443 | list块 444 | 脚本编程 445 | 工厂等级 446 | 再哈希 447 | 频繁项 448 | 文章分页 449 | mysql_connect 450 | rtp时间戳 451 | 号码本 452 | 搜索区域 453 | 源代码程序 454 | drawitem消息 455 | component注释 456 | reqsk_queue 457 | 频率域 458 | 切词 459 | initrd文件 460 | 停止等待协议 461 | 增量索引 462 | 后缀操作符 463 | 输出属性 464 | 托管资源 465 | 非托管资源 466 | 并行收集 467 | 正样本 468 | surface对象 469 | ftp客户端 470 | swift教程 471 | speak方法 472 | 模式对话框 473 | data-widget-config 474 | block块 475 | 正交矩阵 476 | 直交矩阵 477 | heap视图 478 | 计算引擎 479 | 重力传感器 480 | 活动记录 481 | 实例表 482 | gap优化 483 | property_read 484 | public_key 485 | dir结构 486 | 空间容量 487 | 块划分 488 | fb设备 489 | 异步编程 490 | flash演示 491 | 描述符索引 492 | 内置锁 493 | rtp数据包 494 | 短信验证 495 | 死亡骑士 496 | 字节区 497 | 长腿 498 | rigidbody组件 499 | 虚拟运行时间 500 | 等待队列机制 501 | spi控制器 502 | 指针悬挂 503 | 特性分支 504 | rpm-gpg 505 | 平台游戏开发 506 | right-angled 507 | 魔术字 508 | 帧内预测 509 | loadview方法 510 | 地址offset 511 | 导航面板 512 | for死循环 513 | 隶属度 514 | 抢占式调度 515 | beanutils包 516 | 解答树 517 | 参数集 518 | 空闲节点 519 | 失性存储器 520 | 聚合器 521 | 非静态属性 522 | 需求说明书 523 | 计算上限 524 | 动态语句 525 | 反射型 526 | 超步 527 | 终止方法 528 | messagedigest对象 529 | 迷途指针 530 | rpc函数 531 | 空闲堆 
532 | 进程组 533 | 低通滤波 534 | 卷组 535 | time_wait 536 | 项集 537 | 内存索引 538 | piggy-bank 539 | platform_device 540 | 语法解析 541 | 用户账户 542 | 外部类 543 | 懒加载 544 | 非正式协议 545 | 程序进程 546 | 办理业务 547 | socket服务 548 | 类加载器 549 | storyboard文件 550 | 函数原形 551 | mediaplayer对象 552 | can总线 553 | repeater控件 554 | 匿名类型 555 | 静态嵌套 556 | 对齐模数 557 | root结点 558 | world对象 559 | 内存heap 560 | random对象 561 | key_value 562 | 分类标签 563 | 线性地址 564 | one标签 565 | 位图排序 566 | 货币系统 567 | 网站编号 568 | 跨立 569 | nginx负载 570 | 广播接收器 571 | 建造者 572 | 像素密度 573 | 映射区 574 | 根堆 575 | 慢查询 576 | lamp对象 577 | 二进制日志 578 | threadlocal实例 579 | 左偏树 580 | 右偏树 581 | 杀敌数 582 | 关联规则 583 | 加速度传感器 584 | property元素 585 | handle_irq 586 | 覆盖集 587 | 常量池 588 | 地址表 589 | dts文件 590 | 虚拟系统 591 | 优先数 592 | 优先数系 593 | 大顶堆 594 | 小顶堆 595 | 僵尸对象 596 | 操作集 597 | board_init_f 598 | com库 599 | xss攻击 600 | a星算法 601 | 日志组 602 | 序数组 603 | 页目录 604 | pthread_mutex_t 605 | 伸展树 606 | 内核对象 607 | 商品id 608 | character_set_server 609 | 隐式对象 610 | lambda表达式 611 | view对象 612 | 引用计数器 613 | 局部类 614 | 责任链模式 615 | 重定位 616 | dispatchtouchevent方法 617 | 异步通知 618 | 程序分析 619 | 虚继承 620 | 对端IP 621 | 工兵营地 622 | 布局管理 623 | 阻塞模式 624 | 块元素 625 | shared_ptr 626 | 程序集 627 | 弱引用 628 | 初始化块 629 | 虚基类 630 | 外键 631 | prototype模式 632 | description方法 633 | derived对象 634 | 强分类器 635 | merge操作 636 | 学生学籍 637 | binder_thread 638 | 分组密码 639 | 客户浏览器 640 | 光照模式 641 | 光栅化 642 | 全局引用 643 | crontab命令 644 | backtrace_symbols 645 | httpd服务 646 | webbrowser组件 647 | 脉宽 648 | 着色程序 649 | extern类型 650 | js嵌入 651 | 点轴 652 | javaclass文件 653 | 卷管理 654 | 内核锁 655 | 内核页 656 | vao对象 657 | 目标资源 658 | 注释标记 659 | 普通参数 660 | 营业情况 661 | pendingintent对象 662 | 节点target参数 663 | 槽位 664 | 导航窗 665 | 网络事件 666 | 全匹配 667 | sequence序列 668 | worker节点 669 | 标准关联 670 | 水王 671 | 静态构造 672 | 解码decode 673 | 预补偿 674 | 强顺序 675 | 模板测试 676 | 睡眠唤醒 677 | vip客户 678 | 优化函数 679 | 秩优化 680 | 查询串 681 | tabcontrol控件 682 | aoe网 683 | set_terminate 684 | 最大承载 685 | 裁减区域 686 | flood攻击 687 | where筛选 688 | 引导扇区 689 | mysql_field_name 690 | 
car类 691 | 音频资源 692 | 状态节点 693 | opengl命令 694 | 编码代码 695 | 费纳波契序列 696 | 裴波那契序列 697 | 内容页 698 | 弱分类 699 | 程序协议 700 | 执行模式 701 | 引用队列 702 | interruptible状态 703 | devices视图 704 | sub-string 705 | android内容提供器 706 | 逆向队列 707 | 弱分类器 708 | 验证框架 709 | user模型 710 | ddos攻击 711 | 交叉业务 712 | 弦图 713 | 双面板宽 714 | 托管堆 715 | 运输层 716 | dma控制器 717 | 特殊日期 718 | socket输出流 719 | token令牌 720 | 估价函数 721 | log系统 722 | osal_wmem_alloc 723 | 表状态 724 | gets命令 725 | 过滤机制 726 | 集合容器类 727 | getopt函数 728 | dense_rank 729 | 程序集清单 730 | write_lock 731 | item项 732 | 进程数目 733 | 高层组件 734 | 动态图层 735 | material主题 736 | 图像去雾算法 737 | 片段着色器 738 | 公共域 739 | the-sequence 740 | 底层文件 741 | 好莱坞原则 742 | sobel算子 743 | 指示点 744 | 转换文件 745 | phonewindow类 746 | 离线下载 747 | disconnect函数 748 | 动态列表 749 | 唤醒状态 750 | 弱学习算法 751 | 交集部分 752 | 存储段描述符 753 | beautifulsoup对象 754 | to_days 755 | 等级结构 756 | path对象 757 | 文件modulea 758 | 窄依赖 759 | 免费电话 760 | proc_dir_entry 761 | 左扩展 762 | 新建进程 763 | access_token 764 | 模型函数 765 | address-cells 766 | 扩展段 767 | 适配者模式 768 | 通道号 769 | df标志 770 | spin_unlock宏 771 | 接口隔离原则 772 | 观察序列 773 | 笨小熊 774 | 外部工具 775 | 动态联结 776 | atexit函数 777 | 表错误 778 | hello_world 779 | 消息长度 780 | 距离标号 781 | 水位标记 782 | dom对象 783 | rst包 784 | 目标元素 785 | hiddenfield控件 786 | 策略路由 787 | 导航控制器 788 | 标签技术 789 | node_identifier 790 | 局部环境 791 | 标签应用 792 | 附加属性 793 | 功能寄存器 794 | 编辑方式 795 | uevent_ops 796 | 信号屏蔽 797 | 剪枝算法 798 | 已用空间 799 | 通信组件 800 | dispatch_group 801 | mime-type 802 | mat文件 803 | device_package_overlays 804 | product_package_overlays 805 | dup函数 806 | 图层类 807 | singleinstance模式 808 | 局部类型 809 | 加权轮转调度算法 810 | 后置通知 811 | syn包 812 | 异常捕获机制 813 | 上下文定义 814 | apt工具 815 | 集合视图 816 | 失败指针 817 | http框架 818 | 内建命令 819 | 美观程度 820 | 云硬盘 821 | babab模式 822 | daemon线程 823 | usb总线 824 | usermanager接口 825 | cs设计 826 | 转换运算符 827 | 调整算法 828 | 自然对齐 829 | 混合模型 830 | 应用模式 831 | double-kwic-index 832 | 源矩形 833 | thread_info 834 | onhandleintent方法 835 | mcontentparent对象 836 | 字符编码方式 837 | dp设置 838 | 驾车路线 839 | 外部联接 840 | 
重建索引 841 | spark_worker_menory 842 | spark_worker_core 843 | 滚动块 844 | 输入源 845 | hittestwithevent方法 846 | 滚动框 847 | 驻留池 848 | high-low 849 | sst文件 850 | 物理卷 851 | 同步关系 852 | 客户接口 853 | 静态构造函数 854 | winmain函数 855 | 硬中断 856 | derived类 857 | 作业类 858 | drop事件 859 | 新闻阅读器 860 | 识别符 861 | 组合算法 862 | 属性页 863 | start-point 864 | 梯度上升算法 865 | 作用对象 866 | put_user 867 | 预载 868 | click事件函数 869 | hdf文件 870 | 衍合 871 | 设备描述符 872 | remote接口 873 | head对象 874 | 内核目录 875 | 幸运数 876 | usb设备 877 | 启动事务 878 | img镜像 879 | 窗口实例 880 | 聚簇索引 881 | param标签 882 | prehandle方法 883 | 适配器驱动 884 | 新朋友 885 | sharedpreferences数据 886 | rank函数 887 | 对象模板 888 | list_entry 889 | 事务信息 890 | 匿名内存 891 | 跳跃表 892 | ebx寄存器 893 | 链接标签 894 | 双括号 895 | 方法表 896 | 上层逻辑 897 | nat操作 898 | 增广路 899 | 僵尸进程 900 | 连接队列 901 | clone方法 902 | 对称加密 903 | 重复提交 904 | wait_queue 905 | const修饰 906 | 双连通分量 907 | logistic回归 908 | copy_from_user 909 | socket描述符 910 | 原型模式 911 | 外围类 912 | classloader类 913 | 虚拟网络 914 | local_module_class 915 | 逆序对 916 | objc_msgsend 917 | sql语法 918 | worker进程 919 | init进程 920 | 分析函数 921 | map集合 922 | 意图过滤器 923 | 倍增算法 924 | 字符串池 925 | null字符 926 | 根对象 927 | 地精商人 928 | 正式协议 929 | 中断处理程序 930 | 关联对象 931 | 字节代码 932 | finalize方法 933 | hive数据 934 | 公平调度 935 | 二次排序 936 | 号码管理 937 | 密钥库 938 | demo工程 939 | struct_board_info 940 | 运行地址 941 | 定位信息 942 | arp包 943 | 双缓冲 944 | udp数据报 945 | 共享中断 946 | 前置声明 947 | 主题角色 948 | 后缀表达式 949 | to_date 950 | 引用计数 951 | wifi状态 952 | 目的主机 953 | cookie对象 954 | 目录项 955 | 上下文容器 956 | 写缓存 957 | entry对象 958 | 目标窗口 959 | 视图函数 960 | 投票结果 961 | 软中断 962 | 分组函数 963 | 开关操作 964 | export_symbol 965 | 读锁 966 | 消息映射 967 | 自动变量 968 | keep-alive 969 | eax寄存器 970 | wait状态 971 | binder对象 972 | 装饰类 973 | 父类 974 | 属性表 975 | ioc容器 976 | usb驱动 977 | decimal类型 978 | static数据成员 979 | 注解处理器 980 | systemserver进程 981 | intent对象 982 | 线性加速度 983 | 推荐方案 984 | declare-styleable 985 | ip_hash 986 | 启动模式 987 | source类 988 | 容器集合 989 | object文件 990 | net服务器 991 | global对象 992 | api库 993 | pair对象 994 | 单调栈 995 | 
platform设备 996 | 反素数 997 | 处理层 998 | 威威 999 | 第三方应用程序 1000 | socket结构 1001 | 参数签名 1002 | 服务器证书 1003 | 设备模型 1004 | 数据权限 1005 | 过滤操作 1006 | initrc文件 1007 | epoll事件 1008 | 单类 1009 | 响应文件 1010 | 启动请求 1011 | am文件 1012 | 核心态 1013 | 普通表 1014 | 设备结点 1015 | case常量 1016 | curl_easy_perform 1017 | linux命令行 1018 | 候选对象 1019 | 非聚集索引 1020 | lu系统 1021 | 偏函数 1022 | 滤波方法 1023 | 功能快捷键 1024 | 相机应用 1025 | mother类 1026 | 最小点权覆盖 1027 | fun方法 1028 | 幂等性 1029 | 验证器 1030 | 积分图像 1031 | 类型转换器 1032 | 聊天消息 1033 | unsafe_unretained 1034 | 应用程序域 1035 | ubuntu镜像 1036 | 位图排序 1037 | 临时节点 1038 | jquery类 1039 | sighup信号 1040 | raii对象 1041 | roi区域 1042 | 动态sql 1043 | invalidate流程 1044 | 成功响应 1045 | 外连接 1046 | 实体bean 1047 | 工厂方法 1048 | 操作栏 1049 | lamp类 1050 | 大根堆 1051 | obj对象 1052 | 单向关联 1053 | 中文分词 1054 | 互斥锁 1055 | 梯度下降 1056 | dispatch_queue 1057 | 特征码 1058 | 目标组件 1059 | 嵌套类 1060 | aidl文件 1061 | 回文子串 1062 | 模板方法模式 1063 | 工厂方法模式 1064 | haar特征 1065 | ack包 1066 | 随机数种子 1067 | 连接串 1068 | 基础表 1069 | text-decoration 1070 | private_data 1071 | scanf函数 1072 | session变量 1073 | 共享锁 1074 | 账户信息 1075 | 组件对象模型 1076 | avi文件 1077 | int-identity 1078 | va_start 1079 | 拷贝构造函数 1080 | http模块 1081 | pthread_cleanup_push 1082 | pthread_cleanup_pop 1083 | 深度克隆 1084 | jstl标签 1085 | customer_id 1086 | 硬件地址 1087 | 整数函数 1088 | cgi程序 1089 | 子空间 1090 | rtp协议 1091 | pattern对象 1092 | 窗口布局 1093 | 导出函数 1094 | 调用约定 1095 | context-param 1096 | 时钟周期 1097 | 同步函数 1098 | 代理对象 1099 | 相度 1100 | 负载因子 1101 | const_iterator 1102 | bindservice方法 1103 | 式套 1104 | 控制文件 1105 | vip窗口 1106 | 懒汉模式 1107 | 提交状态 1108 | 虚拟继承 1109 | 外部状态 1110 | 删除器 1111 | 右孩子 1112 | 方法摘要 1113 | 执行耗时 1114 | 本地仓库 1115 | 关联映射 1116 | 深拷贝 1117 | 形图 1118 | 页首 1119 | run方法 1120 | apk包 1121 | sim卡 1122 | 强引用 1123 | 传输控制 1124 | looper对象 1125 | 智能指针 1126 | 临界区 1127 | 简单工厂模式 1128 | 类加载 1129 | 运行时区 1130 | 离屏渲染 1131 | 验证规则 1132 | from属性 1133 | socket实战 1134 | 喜爱程度 1135 | 密集型任务 1136 | singletop模式 1137 | repo仓库 1138 | 钥匙串 1139 | 出栈算法 1140 | 入栈算法 1141 | 空间使用率 1142 | xml编辑器 1143 | 
list_for_each 1144 | 带权路径 1145 | 父母节点 1146 | bab模式 1147 | 父加载器 1148 | 二级索引 1149 | 覆盖模型 1150 | 输出电压 1151 | 模式规则 1152 | io复用 1153 | 父函数 1154 | uitouch对象 1155 | matlab仿真 1156 | scrollby方法 1157 | ocx控件 1158 | remoting-config.xml 1159 | 基准时间 1160 | 包装集 1161 | 父窗 1162 | page_size 1163 | http_conf 1164 | 堆管理器 1165 | 环回 1166 | movie对象 1167 | 代理程序 1168 | cache-control 1169 | 卷积层 1170 | 协议处理器 1171 | 短信信息 1172 | 唯一索引 1173 | 命令列表 1174 | 相遇点 1175 | responsetext属性 1176 | 代理ip 1177 | 捕获组 1178 | 非捕获组 1179 | 手写栈 1180 | 计时器对象 1181 | invited_nodes 1182 | sed编辑器 1183 | 魔术方法 1184 | jsp-file 1185 | array_buffer 1186 | 左子节点 1187 | 二维rmq 1188 | gralloc模块 1189 | 组合函数 1190 | 指标计算 1191 | html元件 1192 | 核心协议 1193 | rpc消息 1194 | 抽屉原理 1195 | 数据库链接 1196 | 写句柄 1197 | bootstrap-datetimepicker 1198 | unix_timestamp 1199 | 深度解剖 1200 | queue消息队列 1201 | curl_global_init 1202 | 故事板 1203 | 解码流程 1204 | 聚类中心 1205 | 异常模式 1206 | 移动存储 1207 | day_of_week 1208 | doc类 1209 | 后验概率 1210 | 主键id 1211 | 异常模型 1212 | syslog函数 1213 | windows类 1214 | cast表达式 1215 | 控制元素 1216 | tcp_keepalive 1217 | 原生字符串 1218 | is_ref_gc 1219 | 区间外 1220 | 按秩合并 1221 | app包 1222 | 悍马 1223 | 慕课 1224 | qq互联 1225 | arm开发板 1226 | opengl库 1227 | 扩展分区 1228 | 实验效果 1229 | progress_display 1230 | 增益率 1231 | imwrite函数 1232 | 数据表结构 1233 | cursor_variable 1234 | wake_lock 1235 | target元素 1236 | 碎片率 1237 | 响应消息头 1238 | sock_init 1239 | 双端链表 1240 | 二级指针 1241 | 随机字符串 1242 | father类 1243 | 调度类 1244 | net-snmp 1245 | color函数 1246 | 向量类 1247 | 名称声明 1248 | tga文件 1249 | tftp-server 1250 | logistic函数 1251 | 过滤表达式 1252 | 控制项 1253 | inplace_merge 1254 | gatt服务器 1255 | udp数据包 1256 | 确认报文 1257 | 汇编函数 1258 | 奇数阶 1259 | redirect_uri 1260 | pthread_rwlockattr_init 1261 | lazy初始化 1262 | 待删 1263 | shell扩展 1264 | 保留表 1265 | 离散点 1266 | crond服务 1267 | 差异报告 1268 | 基因序列 1269 | 磁盘块 1270 | 连接描述符 1271 | findfirst函数 1272 | request_threaded_irq 1273 | cas命令 1274 | 嵌套属性 1275 | resultsethandler实现类 1276 | 右健 1277 | 世界矩阵 1278 | systemserver类 1279 | shell提示 1280 | android属性 1281 | 
隔离原则 1282 | 深度拷贝 1283 | 移动窗 1284 | compile-time 1285 | events域 1286 | 虚拟机进程 1287 | 变换方法 1288 | curl_easy_setopt 1289 | 内存片段 1290 | 帧指针 1291 | 衰减因子 1292 | 操作区域 1293 | transaction命令 1294 | 兰州烧饼 1295 | 缩放系数 1296 | jni对象 1297 | 数据手册 1298 | 字符串表达式 1299 | 等式约束 1300 | 颜色模式 1301 | 构建规则 1302 | 响应报头 1303 | 描述词组 1304 | abstractclass方式 1305 | select方法 1306 | 填充方式 1307 | 回归问题 1308 | heap内存 1309 | 地址字节 1310 | 跨平台游戏 1311 | 代理函数 1312 | xml声明 1313 | 点球大战 1314 | cx_oracle 1315 | os-ii 1316 | msg函数 1317 | 程序题 1318 | commandargument属性 1319 | must-revalidate 1320 | android签名 1321 | 注释驱动 1322 | 存储类别 1323 | alloc_netdev 1324 | account_type 1325 | 内容分发 1326 | overlapped结构 1327 | cume_dist 1328 | 消息中心 1329 | session数组 1330 | swap分区 1331 | 安装函数 1332 | 工作分配 1333 | 处理器缓存 1334 | org语句 1335 | 序转换到 1336 | ajax验证 1337 | block类 1338 | poll_wait 1339 | 登录窗 1340 | 双亲表示法 1341 | 队列机制 1342 | css_set 1343 | updated_at 1344 | 网站地图 1345 | 主机字节序 1346 | find操作 1347 | control文件 1348 | 哈密顿回路 1349 | tif文件 1350 | enable函数 1351 | 出栈序列 1352 | 符号链 1353 | wait_event_interruptible 1354 | unsafe类 1355 | 启发式函数 1356 | 快速失败 1357 | canny边缘检测 1358 | 依赖事务 1359 | 三角数 1360 | com_test 1361 | 信号捕捉 1362 | 采样模式 1363 | 产品等级 1364 | 环形链 1365 | 信息域 1366 | 数字下标 1367 | cycle函数 1368 | rtp库 1369 | 接收模式 1370 | 类厂对象 1371 | ontransact方法 1372 | 状态会话 1373 | 属性缓存 1374 | 填充算法 1375 | m_mmap_threshold 1376 | 当前会话 1377 | 上升算法 1378 | jdbc规范 1379 | query组件 1380 | 洗牌问题 1381 | register_encdec 1382 | cv_fourcc 1383 | usb主机 1384 | iterable对象 1385 | no_data_found 1386 | 执行入口 1387 | sobel导数 1388 | 标识对象 1389 | static_library 1390 | source命令 1391 | 服务消费者 1392 | dojo类 1393 | 触发操作 1394 | udp客户端 1395 | neon优化 1396 | scsi硬盘 1397 | 介绍页面 1398 | getopts命令 1399 | 地址库 1400 | ngx_event_s 1401 | flex项目 1402 | 产生序列 1403 | android.mk 1404 | 登录地址 1405 | 属性检查器 1406 | driver结构 1407 | 音频处理 1408 | preempt_count 1409 | ftl标签 1410 | sex男 1411 | one-two 1412 | pull方式 1413 | pyc文件 1414 | 脚本函数 1415 | 持久化对象 1416 | describe命令 1417 | boolean定义 1418 | 依赖属性 1419 | 排序准则 1420 | arg变量 
1421 | http块 1422 | 引导页面 1423 | 微信易 1424 | dispatch_get_main_queue 1425 | const_ty 1426 | flag_include_stopped_packages 1427 | servlet源 1428 | 弹性域 1429 | setsortcomparatorclass设置 1430 | 源码声明 1431 | 应用模块 1432 | cpu控制 1433 | 病毒特征 1434 | com_joomsport 1435 | date命令 1436 | mmap系统调用 1437 | 共享模块 1438 | 排序命令 1439 | node应用 1440 | 父线程 1441 | 幽灵引用 1442 | final标记 1443 | 触发模式 1444 | 实时索引 1445 | 实体管理 1446 | 主索引 1447 | isvalid属性 1448 | 临时页 1449 | 接口电路 1450 | 弱引用对象 1451 | sta模式 1452 | 媒体特性 1453 | 异常向量表 1454 | 权限标志 1455 | 栈限制 1456 | graphicspath类 1457 | html安全 1458 | 日历应用 1459 | 架构视图 1460 | 授权登陆 1461 | 缓冲区分配 1462 | 事件传播 1463 | 中断号 1464 | 中断控制器 1465 | 终结符表达式 1466 | 描述表 1467 | 压缩传输 1468 | sql模板 1469 | 初级篇 1470 | 顶点着色器 1471 | logo_linux 1472 | memcached服务器 1473 | variables变量 1474 | 槽函数 1475 | 轴偏移 1476 | 域属性 1477 | touch对象 1478 | hash值 1479 | 条件运算结果 1480 | 媒体数据 1481 | 事务操作 1482 | session连接 1483 | 访问控制符 1484 | mqtt协议 1485 | first_drv 1486 | 时间比例 1487 | 操作函数原形 1488 | 线程局部存储 1489 | iphone应用程序 1490 | 模板图像 1491 | 二路归并 1492 | query_time 1493 | head引用 1494 | typeid操作符 1495 | nat模块 1496 | jni库 1497 | 自动关联 1498 | aware接口 1499 | 硬件服务 1500 | 不可靠信号 1501 | 溢出结果 1502 | 函数重构 1503 | cookie登录 1504 | dlopen函数 1505 | 业务组件 1506 | 硬件抽象层 1507 | worker对象 1508 | getdefaultsensor方法 1509 | 吞吐率 1510 | 虚指针 1511 | process_request 1512 | slave端 1513 | 事务传播 1514 | 向量加法 1515 | 购买流程 1516 | 原料厂 1517 | length字段 1518 | soapobject对象 1519 | 闵可夫斯基距离 1520 | 重心公式 1521 | pop序列 1522 | key主键 1523 | pci设备 1524 | binder实体 1525 | 配置解析 1526 | row_count 1527 | config.mk -------------------------------------------------------------------------------- /data/recommend/ml-1m/README: -------------------------------------------------------------------------------- 1 | SUMMARY 2 | ================================================================================ 3 | 4 | These files contain 1,000,209 anonymous ratings of approximately 3,900 movies 5 | made by 6,040 MovieLens users who joined MovieLens in 2000. 
6 | 7 | USAGE LICENSE 8 | ================================================================================ 9 | 10 | Neither the University of Minnesota nor any of the researchers 11 | involved can guarantee the correctness of the data, its suitability 12 | for any particular purpose, or the validity of results based on the 13 | use of the data set. The data set may be used for any research 14 | purposes under the following conditions: 15 | 16 | * The user may not state or imply any endorsement from the 17 | University of Minnesota or the GroupLens Research Group. 18 | 19 | * The user must acknowledge the use of the data set in 20 | publications resulting from the use of the data set, and must 21 | send us an electronic or paper copy of those publications. 22 | 23 | * The user may not redistribute the data without separate 24 | permission. 25 | 26 | * The user may not use this information for any commercial or 27 | revenue-bearing purposes without first obtaining permission 28 | from a faculty member of the GroupLens Research Project at the 29 | University of Minnesota. 30 | 31 | If you have any further questions or comments, please contact GroupLens 32 | . 33 | 34 | ACKNOWLEDGEMENTS 35 | ================================================================================ 36 | 37 | Thanks to Shyong Lam and Jon Herlocker for cleaning up and generating the data 38 | set. 39 | 40 | FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT 41 | ================================================================================ 42 | 43 | The GroupLens Research Project is a research group in the Department of 44 | Computer Science and Engineering at the University of Minnesota. Members of 45 | the GroupLens Research Project are involved in many research projects related 46 | to the fields of information filtering, collaborative filtering, and 47 | recommender systems. The project is lead by professors John Riedl and Joseph 48 | Konstan. 
The project began to explore automated collaborative filtering in 49 | 1992, but is most well known for its world wide trial of an automated 50 | collaborative filtering system for Usenet news in 1996. Since then the project 51 | has expanded its scope to research overall information filtering solutions, 52 | integrating in content-based methods as well as improving current collaborative 53 | filtering technology. 54 | 55 | Further information on the GroupLens Research project, including research 56 | publications, can be found at the following web site: 57 | 58 | http://www.grouplens.org/ 59 | 60 | GroupLens Research currently operates a movie recommender based on 61 | collaborative filtering: 62 | 63 | http://www.movielens.org/ 64 | 65 | RATINGS FILE DESCRIPTION 66 | ================================================================================ 67 | 68 | All ratings are contained in the file "ratings.dat" and are in the 69 | following format: 70 | 71 | UserID::MovieID::Rating::Timestamp 72 | 73 | - UserIDs range between 1 and 6040 74 | - MovieIDs range between 1 and 3952 75 | - Ratings are made on a 5-star scale (whole-star ratings only) 76 | - Timestamp is represented in seconds since the epoch as returned by time(2) 77 | - Each user has at least 20 ratings 78 | 79 | USERS FILE DESCRIPTION 80 | ================================================================================ 81 | 82 | User information is in the file "users.dat" and is in the following 83 | format: 84 | 85 | UserID::Gender::Age::Occupation::Zip-code 86 | 87 | All demographic information is provided voluntarily by the users and is 88 | not checked for accuracy. Only users who have provided some demographic 89 | information are included in this data set. 
90 | 91 | - Gender is denoted by a "M" for male and "F" for female 92 | - Age is chosen from the following ranges: 93 | 94 | * 1: "Under 18" 95 | * 18: "18-24" 96 | * 25: "25-34" 97 | * 35: "35-44" 98 | * 45: "45-49" 99 | * 50: "50-55" 100 | * 56: "56+" 101 | 102 | - Occupation is chosen from the following choices: 103 | 104 | * 0: "other" or not specified 105 | * 1: "academic/educator" 106 | * 2: "artist" 107 | * 3: "clerical/admin" 108 | * 4: "college/grad student" 109 | * 5: "customer service" 110 | * 6: "doctor/health care" 111 | * 7: "executive/managerial" 112 | * 8: "farmer" 113 | * 9: "homemaker" 114 | * 10: "K-12 student" 115 | * 11: "lawyer" 116 | * 12: "programmer" 117 | * 13: "retired" 118 | * 14: "sales/marketing" 119 | * 15: "scientist" 120 | * 16: "self-employed" 121 | * 17: "technician/engineer" 122 | * 18: "tradesman/craftsman" 123 | * 19: "unemployed" 124 | * 20: "writer" 125 | 126 | MOVIES FILE DESCRIPTION 127 | ================================================================================ 128 | 129 | Movie information is in the file "movies.dat" and is in the following 130 | format: 131 | 132 | MovieID::Title::Genres 133 | 134 | - Titles are identical to titles provided by the IMDB (including 135 | year of release) 136 | - Genres are pipe-separated and are selected from the following genres: 137 | 138 | * Action 139 | * Adventure 140 | * Animation 141 | * Children's 142 | * Comedy 143 | * Crime 144 | * Documentary 145 | * Drama 146 | * Fantasy 147 | * Film-Noir 148 | * Horror 149 | * Musical 150 | * Mystery 151 | * Romance 152 | * Sci-Fi 153 | * Thriller 154 | * War 155 | * Western 156 | 157 | - Some MovieIDs do not correspond to a movie due to accidental duplicate 158 | entries and/or test entries 159 | - Movies are mostly entered by hand, so errors and inconsistencies may exist 160 | -------------------------------------------------------------------------------- /data/recommend/ml-1m/movies.dat: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/blogchong/spark-example/6333b20128368dce31cb22fd2dc1184001075e1b/data/recommend/ml-1m/movies.dat -------------------------------------------------------------------------------- /data/recommend/ml-1m/personalRatings.txt: -------------------------------------------------------------------------------- 1 | 0::1::5::1409495135 2 | 0::780::4::1409495135 3 | 0::590::3::1409495135 4 | 0::1216::4::1409495135 5 | 0::648::5::1409495135 6 | 0::344::3::1409495135 7 | 0::165::4::1409495135 8 | 0::153::5::1409495135 9 | 0::597::4::1409495135 10 | 0::1586::5::1409495135 11 | 0::231::5::1409495135 12 | -------------------------------------------------------------------------------- /out/artifacts/spark_example_jar/spark-example.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blogchong/spark-example/6333b20128368dce31cb22fd2dc1184001075e1b/out/artifacts/spark_example_jar/spark-example.jar -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.blogchong 8 | spark-example 9 | 1.0-SNAPSHOT 10 | 11 | 12 | UTF-8 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | net.sf.json-lib 31 | json-lib 32 | 2.4 33 | jdk15 34 | 35 | 36 | 37 | org.ansj 38 | ansj_seg 39 | 0.9 40 | 41 | 42 | org.ansj 43 | tree_split 44 | 1.0.1 45 | 46 | 47 | 48 | org.scala-lang 49 | scala-library 50 | 2.10.3 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | org.apache.spark 64 | spark-core_2.10 65 | 1.5.0 66 | 67 | 68 | org.apache.spark 69 | spark-mllib_2.10 70 | 1.5.1 71 | 72 | 73 | 74 | 75 | 76 | 77 | org.apache.maven.plugins 78 | maven-surefire-plugin 79 | 2.8.1 80 | 81 | 82 | **/*.java 83 | **/*.scala 84 | 85 | 86 | 87 | 88 | 
org.apache.maven.plugins 89 | maven-compiler-plugin 90 | 2.3.2 91 | 92 | -g 93 | true 94 | 95 | 96 | 97 | 98 | org.scala-tools 99 | maven-scala-plugin 100 | 2.15.2 101 | 102 | 103 | 104 | -g:vars 105 | 106 | 107 | true 108 | 109 | 110 | 111 | scala-compile-first 112 | process-resources 113 | 114 | compile 115 | 116 | 117 | 118 | scala-test-compile 119 | process-test-resources 120 | 121 | testCompile 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /spark-example.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /src/main/java/META-INF/MANIFEST.MF: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | Main-Class: com.blogchong.spark.mllib.Kmeans 3 | 4 | -------------------------------------------------------------------------------- /src/main/java/com/blogchong/spark/mllib/advance/ALSRecommendMovie/AlsArithmeticPractice.scala: 
--------------------------------------------------------------------------------
package com.blogchong.spark.mllib.advance.ALSRecommendMovie

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
import org.apache.spark.rdd._
import scala.io.Source

/**
 * Author: blogchong
 * Blog: www.blogchong.com
 * Mailbox: blogchong@163.com
 * Data: 2015/10/30
 * Describe: model-based collaborative filtering with ALS (alternating least
 * squares); a practice-oriented example on the MovieLens ratings.
 */
object AlsArithmeticPractice {

  /**
   * Trains ALS models over a small parameter grid, keeps the one with the lowest
   * validation RMSE, reports its test RMSE against a mean-rating baseline, prints
   * the top ten recommendations for user 0 and saves the predictions and the model.
   */
  def main(args: Array[String]) {

    // Keep Spark / Jetty log noise off the console.
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    val conf = new SparkConf().setAppName("ALSPractice")
    val sc = new SparkContext(conf)

    val personalRatingsPath = "/root/spark/spark-1.4.0-bin-hadoop2.6/data/mllib/als2/personalRatings.txt"
    val moviesPath = "hdfs://192.168.5.200:9000/spark/mllib/data/als2/movies.dat"
    val ratingsPath = "hdfs://192.168.5.200:9000/spark/mllib/data/als2/ratings.dat"
    val modelPath = "hdfs://192.168.5.200:9000/spark/mllib/result/als2/model"
    val outPath = "hdfs://192.168.5.200:9000/spark/mllib/result/als2/data/recommendations"

    // Ratings of the user we recommend for, read from the driver's local file system.
    val myRatings = loadRatings(personalRatingsPath)
    val personalRatingsData = sc.parallelize(myRatings, 1)

    // Sample ratings keyed by (timestamp % 10); the key is only used to split the data.
    val ratings = sc.textFile(ratingsPath).map { line =>
      val fields = line.split("::")
      (fields(3).toLong % 10, Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble))
    }

    // Movie catalogue: (movieId, title).
    val movies = sc.textFile(moviesPath).map { line =>
      val fields = line.split("::")
      (fields(0).toInt, fields(1))
    }

    val numRatings = ratings.count()
    val numUsers = ratings.map(_._2.user).distinct().count()
    val numMovies = ratings.map(_._2.product).distinct().count()

    println("==================样本数量===================")
    println("NumRatings: [" + numRatings + "]")
    println("NumUsers: [" + numUsers + "]")
    println("NumMovies: [" + numMovies + "]")

    // Split by key into training (keys 0-5, plus the personal ratings),
    // validation (keys 6-7) and test (keys 8-9). All three are reused, hence persisted.
    val numPartitions = 4
    val training = ratings.filter(x => x._1 < 6).values
      .union(personalRatingsData).repartition(numPartitions).persist()
    val validation = ratings.filter(x => x._1 >= 6 && x._1 < 8).values
      .repartition(numPartitions).persist()
    // `>= 8`, not `> 8`: otherwise key 8 lands in no split and the test set is halved.
    val test = ratings.filter(x => x._1 >= 8).values.persist()

    val numTraining = training.count()
    val numValidation = validation.count()
    val numTest = test.count()
    println("==================样本划分===================")
    println("NumTraining: [" + numTraining + "]")
    println("NumValidation: [" + numValidation + "]")
    println("NumTest: [" + numTest + "]")

    // Grid search: keep the model with the smallest RMSE on the validation set.
    val ranks = List(5, 8, 12, 15)
    val lambdas = List(0.1, 0.5, 5)
    val numIters = List(8, 10, 20)
    var bestModel: Option[MatrixFactorizationModel] = None
    var bestValidationRmse = Double.MaxValue
    var bestRank = 0
    var bestLambda = -1.0
    var bestNumIter = -1

    var count = 0
    for (rank <- ranks; lambda <- lambdas; numIter <- numIters) {
      val model = ALS.train(training, rank, numIter, lambda)
      val validationRmse = computeRmse(model, validation, numValidation)
      count += 1
      println("==============参数尝试次数:[" + count + "]=======================")
      println("RMSE(validation): [" + validationRmse + "]")
      println("rank: [" + rank + "]")
      println("lambda: [" + lambda + "]")
      println("numIter: [" + numIter + "]")

      if (validationRmse < bestValidationRmse) {
        bestModel = Some(model)
        bestValidationRmse = validationRmse
        bestLambda = lambda
        bestRank = rank
        bestNumIter = numIter
      }
    }

    // Evaluate the selected model on the held-out test set.
    val testRmse = computeRmse(bestModel.get, test, numTest)
    println("==============测试集预测==========================")
    println("rank: [" + bestRank + "]")
    println("lambda: [" + bestLambda + "]")
    println("numIter: [" + bestNumIter + "]")
    println("Rmse: [" + testRmse + "]")

    // Baseline: always predict the mean rating of training + validation.
    val meanRating = training.union(validation).map(_.rating).mean()
    val baseLineRmse = math.sqrt(
      test.map(x => (meanRating - x.rating) * (meanRating - x.rating)).reduce(_ + _) / numTest)
    val improvement = (baseLineRmse - testRmse) / baseLineRmse * 100
    println("=============模型预测改进系数========================================================")
    println("The best model improves the baseline by " + "%1.2f".format(improvement) + "%.")

    // Recommend for user 0 among the movies that user has not rated yet.
    val myRatedMovieIds = myRatings.map(_.product).toSet
    val candidates = movies.keys.filter(!myRatedMovieIds.contains(_))
    val candRDD: RDD[(Int, Int)] = candidates.map((0, _))
    val recommendations: RDD[Rating] = bestModel.get.predict(candRDD)
    val topTen = recommendations.collect().sortBy(-_.rating).take(10)

    println("Movies recommended for you:")
    topTen.zipWithIndex.foreach { case (r, idx) =>
      println("%2d".format(idx + 1) + ": [" + r.product + "]")
    }

    // Persist all predictions (best first) and the model itself.
    recommendations.sortBy(-_.rating).saveAsTextFile(outPath)
    bestModel.get.save(sc, modelPath)
    // To reuse the model later:
    //val sameModel = MatrixFactorizationModel.load(sc, modelPath)

    sc.stop()
  }

  /**
   * Loads the personal ratings file (`user::movie::rating::timestamp` per line)
   * from the local file system, keeping only ratings greater than zero.
   *
   * The lines are fully read before the file is closed, so the result is a
   * strict collection that is safe to use after this method returns.
   *
   * @param path local path of personalRatings.txt
   * @return the non-zero ratings; raises a RuntimeException via `sys.error` when none remain
   */
  def loadRatings(path: String): Seq[Rating] = {
    val source = Source.fromFile(path)
    try {
      val ratings = source.getLines().map { line =>
        val fields = line.split("::")
        Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble)
      }.filter(_.rating > 0.0).toList
      if (ratings.isEmpty) {
        sys.error("No ratings provided.")
      }
      ratings
    } finally {
      source.close()
    }
  }

  /**
   * Root-mean-square error between the model's predictions and the actual ratings.
   *
   * @param model trained matrix-factorization model
   * @param data  ratings to evaluate against
   * @param n     divisor for the mean; callers pass the number of ratings in `data`
   */
  def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating], n: Long): Double = {
    // Predict a rating for every (user, product) pair present in `data`.
    val predictions: RDD[Rating] = model.predict(data.map(x => (x.user, x.product)))

    // Join on (user, product) to pair each predicted value with the actual one.
    val predictionsAndRatings = predictions
      .map(x => ((x.user, x.product), x.rating))
      .join(data.map(x => ((x.user, x.product), x.rating)))
      .values

    // Sum of squared differences, divided by n, then the square root.
    math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / n)
  }

}
--------------------------------------------------------------------------------
/src/main/java/com/blogchong/spark/mllib/advance/DiscoveryNewWord/AnsjDisWords.scala:
--------------------------------------------------------------------------------
package com.blogchong.spark.mllib.advance.DiscoveryNewWord

import org.apache.spark.{SparkContext, SparkConf}
import scala.util.parsing.json.JSONObject
import scala.collection.JavaConversions._
import org.ansj.app.newWord.LearnTool
import org.ansj.domain.TermNatures
import org.ansj.splitWord.analysis.NlpAnalysis

/**
 * Author: blogchong
 * Blog: www.blogchong.com
 * Mailbox: blogchong@163.com
 * Data: 2016/1/10
 * Describe: new-word discovery experiment with the ansj toolkit
 */
object AnsjDisWords {

  /**
   * Feeds the title and body of every input record to a single ansj LearnTool
   * and saves the 100 best new-word candidates.
   *
   * Input lines are `<id>\t<json>` where the JSON carries "title" and "body".
   *
   * @param args args(0) = input path, args(1) = output path
   */
  def main(args: Array[String]) {
    require(args.length >= 2, "Usage: AnsjDisWords <inputPath> <outputPath>")

    val conf = new SparkConf().setAppName("新词发现")
    val sc = new SparkContext(conf)

    val inputPath = args(0)
    val outputPath = args(1)

    println("InputPath:" + inputPath)
    println("OutputPath:" + outputPath)

    val input = sc.textFile(inputPath)
    println("InputSize:" + input.count())

    val learnTool = LTSerializa2.getTool
    if (learnTool == null) {
      println("learnTool is NULL!")
    } else {
      println("learnTool is not NULL!")
    }

    // The LearnTool accumulates state in this JVM, so the text must be analysed on
    // the driver: a bare RDD.map is lazy and would never run, and executors would
    // only update their own copies. toLocalIterator streams one partition at a time.
    input.toLocalIterator.foreach { line =>
      val notes = line.split("\t")
      if (notes.length > 1) {
        // The second column is a JSON string and has to be parsed, not cast.
        val noteObj = net.sf.json.JSONObject.fromObject(notes(1))
        NlpAnalysis.parse(noteObj.optString("title"), learnTool)
        NlpAnalysis.parse(noteObj.optString("body"), learnTool)
      }
    }

    val newWords = learnTool.getTopTree(100, TermNatures.NW)

    if (newWords == null) {
      println("NewWords is NULL!")
    } else {
      println("NewWordsSize:" + newWords.size())
      sc.parallelize(newWords.map(f => f.getKey).toSeq).saveAsTextFile(outputPath)
    }

    sc.stop()
  }

  /** Holder of the single LearnTool instance shared by the whole run. */
  object LTSerializa2 {
    val learnTool2 = new LearnTool
    def getTool = {
      learnTool2
    }
  }

  /** Serializable wrapper around a LearnTool; not referenced by `main`. */
  class LTSerializa extends java.io.Serializable {
    val learnTool = new LearnTool
  }

}
--------------------------------------------------------------------------------
/src/main/java/com/blogchong/spark/mllib/advance/DiscoveryNewWord/NGramSpark.scala:
--------------------------------------------------------------------------------
package com.blogchong.spark.mllib.advance.DiscoveryNewWord

import org.apache.spark.{SparkContext, SparkConf}
import org.ansj.splitWord.analysis.{ToAnalysis}
import scala.collection.JavaConversions._
import java.util.Date
import com.blogchong.util.{CharUtil, NewTime}
import java.util.regex.Pattern
import java.util
import org.apache.log4j.Logger

/**
 * Author: blogchong
 * Blog: www.blogchong.com
 * Mailbox: blogchong@163.com
 * Data: 2016/1/12
 * Describe: N-gram based new-word discovery experiment
 */
object NGramSpark {

  val logger = Logger.getLogger(NGramSpark.getClass)

  // Punctuation / symbols stripped from every token; compiled once, not per call.
  private val specialChars = Pattern.compile(
    "[`~!@#$%^&*()+=|{}':→;ˇ'\\\\,\\[\\].<>/?~！≤@#￥%《》ø━……&*（）——+|{}【】‘；：”“’。，、？]")

  /**
   * Segments every document with ansj, counts single words plus 2- and 3-word
   * combinations, and saves the candidates that pass the TF/D threshold together
   * with their probability ratio against the product of their sub-word probabilities.
   *
   * All counting happens on the driver over the collected input.
   *
   * @param args args(0) = user dictionary path, args(1) = input path, args(2) = output path
   */
  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName("ansj分词，新词发现")
    val sc = new SparkContext(conf)

    val userDicPath = args(0)
    val inputPath = args(1)
    val outputPath = args(2)

    println("=============>UserDicPath:" + userDicPath)
    println("=============>InputPath:" + inputPath)
    println("=============>OutputPath:" + outputPath)

    val input = sc.textFile(inputPath).collect()
    println("=============>InputSize:" + input.size)

    // Known words: first column of every well-formed (3-column) dictionary line.
    val userDic: Set[String] = sc.textFile(userDicPath)
      .map(_.split("\t"))
      .filter(_.length == 3)
      .map(_(0))
      .collect().toSet

    // Minimum average term frequency per document (TF / D) for a candidate.
    val tdfThreshold = 2.5

    println("=============>UserDicSize:" + userDic.size)

    val mergeWordTFDMap = new util.HashMap[String, Double]()

    val mergeWordTFMap = new util.HashMap[String, Int]()          // combined word -> term frequency
    val mergeWordTF2WordsMap = new util.HashMap[String, String]() // combined word -> tab-joined sub-words
    val mergeWordDMap = new util.HashMap[String, Int]()           // combined word -> document frequency
    val wordMap = new util.HashMap[String, Int]()                 // single word -> term frequency

    var count = 0
    // Total number of counted words (single words plus recorded combinations).
    var wordCount = 0
    val output = input.map { line =>
      val newWordSet = new util.HashSet[String]()
      val notes = line.split("\t")
      val id = notes(0)
      val noteObj = net.sf.json.JSONObject.fromObject(notes(1))

      val title = noteObj.get("title").toString
      val body = noteObj.get("body").toString

      val titleWords = ToAnalysis.parse(title)
        .map(t => replaceStr(t.getName.toLowerCase.trim))
        .filter(_.length > 0)

      val bodyWords = ToAnalysis.parse(body)
        .map(t => replaceStr(t.getName.toLowerCase.trim).trim)
        .filter(_.length > 0)
        .union(titleWords)

      // Single-word statistics.
      bodyWords.foreach { w =>
        wordCount += 1
        increment(wordMap, w)
      }

      // Two-word and three-word combinations over a sliding window.
      wordCount += recordNGrams(bodyWords, 2, userDic, mergeWordTFMap, mergeWordTF2WordsMap, newWordSet)
      wordCount += recordNGrams(bodyWords, 3, userDic, mergeWordTFMap, mergeWordTF2WordsMap, newWordSet)

      // Document frequency: each combination counts once per document.
      newWordSet.foreach(w => increment(mergeWordDMap, w))

      if (count % 5000 == 0) {
        println("Begin--------------------------------------------------------")
        println("=============>count:" + count)
        println("=============>mergeWordTFMapSize:" + mergeWordTFMap.size())
        println("=============>mergeWordDMapSize:" + mergeWordDMap.size())
        println("End--------------------------------------------------------")
      }

      count = count + 1
      s"${id}\t${bodyWords.mkString(",")}"
    }

    println("=============>OutputSize:" + output.size)

    val saveTime = NewTime.dateToString(new Date, NewTime.`type`)

    // Optional: store the plain segmentation result.
    // sc.parallelize(output.toSeq).saveAsTextFile(outputPath + "/" + saveTime + "/splitwords")

    // Term frequency of combinations seen more than once.
    val scTFMap = mergeWordTFMap.toSeq.sortBy(_._2).filter(_._2 > 1)
    println("=============>新词词频MergeWordTFMapSize:" + scTFMap.size)
    // sc.parallelize(scTFMap).saveAsTextFile(outputPath + "/" + saveTime + "/tfNewWord")

    // Document frequency of combinations seen in more than one document.
    val scDMap = mergeWordDMap.toSeq.sortBy(_._2).filter(_._2 > 1)
    println("=============>新词文档频MergeWordDMapSize:" + scDMap.size)
    // sc.parallelize(scDMap).saveAsTextFile(outputPath + "/" + saveTime + "/dNewWord")

    // Average term frequency per containing document (TF / D).
    scDMap.foreach { case (word, docFreq) =>
      if (mergeWordTFMap.containsKey(word) && docFreq != 0) {
        mergeWordTFDMap.put(word, mergeWordTFMap.get(word).toDouble / docFreq.toDouble)
      }
    }

    // Keep candidates without whitespace (drops pure-English "a b" combinations)
    // whose TF/D reaches the threshold.
    val scTFDMap = mergeWordTFDMap.toSeq.sortBy(_._2)
      .filter(_._1.split("\\s").size == 1)
      .filter(_._2 >= tdfThreshold)
    println("=============>新词平均词频(过阈值过纯英文\\s组合之后)MergeWordTFDMapSize:" + scTFDMap.size)
    sc.parallelize(scTFDMap).saveAsTextFile(outputPath + "/" + saveTime + "/tfdNewWord")

    ////////////////////////////////// cohesion ("solidification") score //////////////////////////////////
    sc.parallelize(wordMap.toSeq).saveAsTextFile(outputPath + "/" + saveTime + "/wordMap")

    println("=============>词总量wordCount:" + wordCount)

    // Probability of every single word.
    val wordRateMap = new util.HashMap[String, Double]()
    wordMap.foreach { case (word, freq) =>
      wordRateMap.put(word, freq.toDouble / wordCount.toDouble)
    }
    println("=============>全量子词概率WordRateMapSize:" + wordRateMap.size())
    sc.parallelize(wordRateMap.toSeq.sortBy(_._2))
      .saveAsTextFile(outputPath + "/" + saveTime + "/wordRateMap")

    // Score of every candidate that passed the TF/D threshold.
    // NOTE(review): this divides the TF/D average, not the raw TF, by wordCount — confirm intent.
    val wordTfdRateMap = new util.HashMap[String, Double]()
    scTFDMap.foreach { case (word, tfd) =>
      wordTfdRateMap.put(word, tfd.toDouble / wordCount.toDouble)
    }
    println("=============>新词组合概率(过了TFD阈值)WordTfdRateMapSize:" + wordTfdRateMap.size())
    sc.parallelize(wordTfdRateMap.toSeq.sortBy(_._2))
      .saveAsTextFile(outputPath + "/" + saveTime + "/wordTfdRateMap")

    // Ratio between a candidate's score and the product of its sub-word probabilities.
    sc.parallelize(mergeWordTF2WordsMap.toSeq).saveAsTextFile(outputPath + "/" + saveTime + "/mergeWordTF2WordsMap")
    val proRate = new util.HashMap[String, Double]()
    wordTfdRateMap.foreach { case (word, wordRate) =>
      // Guard against a candidate without a recorded decomposition.
      Option(mergeWordTF2WordsMap.get(word)).foreach { subWords =>
        var rate: Double = 1.0
        subWords.split("\t").foreach { g =>
          rate = rate * wordRateMap.get(g)
        }
        proRate.put(word, wordRate / rate)
      }
    }
    println("=============>计算组合词概率和子词乘积比值ProRateSize:" + proRate.size())

    sc.parallelize(proRate.toSeq.sortBy(_._2))
      .saveAsTextFile(outputPath + "/" + saveTime + "/proRate")

    sc.stop()
  }

  /** Adds one to the counter stored under `key`, starting from zero. */
  private def increment(counts: util.Map[String, Int], key: String): Unit = {
    val previous = if (counts.containsKey(key)) counts.get(key) else 0
    counts.put(key, previous + 1)
  }

  /**
   * Slides a window of `n` words over one document and records every accepted
   * combination: its term frequency, its sub-word decomposition and its
   * membership in the document's candidate set.
   *
   * The window is local to the call, so combinations never span two documents.
   *
   * @return the number of combinations recorded
   */
  private def recordNGrams(words: Seq[String], n: Int, userDic: Set[String],
                           termFreq: util.Map[String, Int],
                           decomposition: util.Map[String, String],
                           docWords: util.Set[String]): Int = {
    val window: util.Queue[String] = new util.LinkedList[String]
    var recorded = 0
    words.foreach { w =>
      window.offer(w)
      if (window.size() > n) {
        window.poll() // drop the oldest word so the window keeps exactly n words
      }
      if (window.size() == n) {
        val (newWord, subWords) = joinWord(window, n)
        // Skip rejected combinations ("") and words already in the base dictionary.
        if (newWord.trim.nonEmpty && !userDic.contains(newWord)) {
          decomposition.put(newWord, subWords)
          increment(termFreq, newWord)
          docWords.add(newWord)
          recorded += 1
        }
      }
    }
    recorded
  }

  /**
   * Merges the first `num` words of `que` into one candidate word.
   *
   * The queue is only read, never modified, so callers can keep using it as a
   * sliding window.
   *
   * For `num` 2 or 3 the candidate is rejected (empty string) when any word is
   * numeric or is "的", or when "-"/"_" appears at an outer position (for 2 words:
   * at any position). Words with no Chinese part are joined with spaces, except a
   * 3-word group whose middle word is "-"/"_", which is concatenated directly.
   * Everything else is concatenated directly. For any other `num` all queued words
   * are concatenated.
   *
   * @return (candidate word or "", the sub-words joined by tabs)
   */
  def joinWord(que: util.Queue[String], num: Int): (String, String) = {
    def isJoiner(w: String) = w.equals("-") || w.equals("_")

    val queued = que.toList
    if (num == 2 || num == 3) {
      val parts = queued.take(num)
      val subWords = parts.mkString("\t")
      if (parts.size < num) {
        ("", subWords) // not enough words queued to build the combination
      } else {
        val rejected =
          parts.exists(w => CharUtil.isNumeric(w) || w.equals("的")) ||
            (if (num == 2) parts.exists(isJoiner) else isJoiner(parts.head) || isJoiner(parts.last))
        val merged =
          if (rejected) {
            ""
          } else if (!parts.exists(w => CharUtil.isChinese(w))) {
            if (num == 3 && isJoiner(parts(1))) parts.mkString("") else parts.mkString(" ")
          } else {
            parts.mkString("")
          }
        (merged, subWords)
      }
    } else {
      (queued.mkString(""), queued.mkString("\t"))
    }
  }

  /**
   * Cleans one token: a single non-Chinese character other than "-" or "_" is
   * dropped entirely; otherwise punctuation and symbols are removed and the rest trimmed.
   */
  def replaceStr(str: String) = {
    if (!CharUtil.isChinese(str) && !str.equals("-") && !str.equals("_") && str.size <= 1) {
      ""
    } else {
      specialChars.matcher(str).replaceAll("").trim
    }
  }

}
--------------------------------------------------------------------------------
/src/main/java/com/blogchong/spark/mllib/advance/LdaExtractTopics/Check/PredictsDocTopics.scala:
--------------------------------------------------------------------------------
package com.blogchong.spark.mllib.advance.LdaExtractTopics.Check

import org.apache.spark.mllib.clustering.{LocalLDAModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.JavaConversions._
import java.util.Date
import com.blogchong.util.NewTime
import java.util

/**
 * Author: blogchong
 * Blog: www.blogchong.com
 * Mailbox: blogchong@163.com
 * Data: 2015/10/23
 * Describe: infers the topic distribution of new documents with a saved LDA
 * model and stores the top weighted words per document for checking.
 */

object PredictsDocTopics {

  /**
   * Loads a saved LocalLDAModel, infers the topic distribution of every input
   * document and writes `<docId>\t<word:weight ...>` lines under
   * `<topicsPath>/<timestamp>/predictsTopics`.
   *
   * @param args key/value options understood by PredictsDocTopicsArgsParser
   */
  def main(args: Array[String]) = {

    args.foreach(println)

    val argsParser = new PredictsDocTopicsArgsParser

    require(args.length >= 3, argsParser.getUsageMessage(null))

    argsParser.parseArgs(args.toList)

    val dataPath = argsParser.dataPath
    val modelPath = argsParser.modelPath
    val wordsPath = argsParser.wordsPath
    val maxWordsTopic = argsParser.topicSize
    val topicsPath = argsParser.topicsPath

    val conf = new SparkConf().setAppName("PredictsDocTopics")
    val sc = new SparkContext(conf)

    // Dictionary file, one "<label>\t<word>" per line; read once, indexed both ways.
    val dictionary = sc.textFile(wordsPath).map { line =>
      val Array(label, word) = line.split("\t")
      (word, label.toInt)
    }.collect()

    val wordToLabelDis = sc.broadcast(dictionary.toMap)
    val labelToWordDis = sc.broadcast(dictionary.map(_.swap).toMap)

    // dataPath may list several comma-separated locations.
    val data = sc.union(dataPath.split(",").map(path => sc.textFile(path)))

    // Each line is "<id>\t<text>", where <id> is the business id of the document.
    // zipWithIndex supplies the Long label LDA needs; only dictionary words are kept.
    val docs = data.zipWithIndex().map { case (line, index) =>
      val fields = line.split("\t")
      val words = fields.drop(1).mkString(" ").split("\\s+")
        .filter(word => wordToLabelDis.value.contains(word))
      Doc(index, fields(0), words)
    }.filter(_.sentence.length > 0).cache() // reused for the corpus and for the id lookup

    val ldaModel = LocalLDAModel.load(sc, modelPath)
    // Vectors must have the width the model was trained with.
    val vocabSize = ldaModel.vocabSize

    // Bag-of-words vectors weighted by plain term frequency.
    val corpus = docs.map { doc =>
      val termCounts = doc.sentence.groupBy(w => w).map { case (word, occurrences) =>
        (wordToLabelDis.value(word), occurrences.length.toDouble)
      }.toSeq
      (doc.label, Vectors.sparse(vocabSize, termCounts))
    }.repartition(20)

    val predictsTopics = ldaModel.topicDistributions(corpus)

    val saveTime = NewTime.dateToString(new Date, NewTime.`type`)
    val topicWordMatrix = ldaModel.topics

    // The business id is attached by joining on the label: a driver-side map filled
    // inside a transformation would stay empty (lazy, and executors get copies).
    docs.map(doc => (doc.label, doc.id)).join(predictsTopics).map {
      case (_, (docId, topicDistribution)) =>
        // Word weights of the document = topic-word matrix x topic distribution.
        val topWords = topicWordMatrix.multiply(topicDistribution).toArray
          .zipWithIndex.sortBy(-_._1).take(maxWordsTopic)
          .map { case (weight, index) => s"${labelToWordDis.value(index)}:${weight}" }
        s"${docId}\t${topWords.mkString(" ")}"
    }.saveAsTextFile(topicsPath + "/" + saveTime + "/predictsTopics")

    sc.stop()
  }

}

/**
 * One input document.
 *
 * @param label    index assigned by zipWithIndex, used as the LDA document id
 * @param id       business id taken from the first column of the input line
 * @param sentence the document's words that are present in the dictionary
 */
case class Doc(label: Long, id: String, sentence: Array[String])
--------------------------------------------------------------------------------
/src/main/java/com/blogchong/spark/mllib/advance/LdaExtractTopics/Check/PredictsDocTopicsArgsParser.scala:
--------------------------------------------------------------------------------
package com.blogchong.spark.mllib.advance.LdaExtractTopics.Check

/**
 * Author: blogchong
 * Blog: www.blogchong.com
 * Mailbox: blogchong@163.com
 * Data: 2015/10/23
 * Describe: command-line argument handling for PredictsDocTopics
 */

class PredictsDocTopicsArgsParser {

  var dataPath: String = null
  var modelPath: String = null
  var topicsPath: String = null
  var wordsPath: String = null
  var topicSize: Int = 100

  /**
   * Consumes `key value` pairs from the argument list and stores each value in
   * the matching field, echoing it to stdout.
   *
   * @param inputArgs the raw command-line arguments
   * @throws IllegalArgumentException on an unknown key, a key without a value,
   *                                  or a non-integer PtopicSize
   */
  def parseArgs(inputArgs: List[String]): Unit = {

    var args = inputArgs

    while (args.nonEmpty) {
      args match {
        case "PdataPath" :: value :: tail =>
          dataPath = value
          println("PdataPath: " + dataPath)
          args = tail
        case "PmodelPath" :: value :: tail =>
          modelPath = value
          println("PmodelPath: " + modelPath)
          args = tail
        case "PwordsPath" :: value :: tail =>
          wordsPath = value
          println("PwordsPath: " + wordsPath)
          args = tail
        case "PtopicSize" :: value :: tail =>
          topicSize = try {
            value.toInt
          } catch {
            // Report a bad number the same way as any other bad argument.
            case _: NumberFormatException =>
              throw new IllegalArgumentException(getUsageMessage(args))
          }
          println("PtopicSize: " + topicSize)
          args = tail
        case "PtopicsPath" :: value :: tail =>
          topicsPath = value
          println("PtopicsPath: " + topicsPath)
          args = tail
        case _ =>
          throw new IllegalArgumentException(getUsageMessage(args))
      }
    }
  }

  /**
   * Builds the usage text, prefixed with the offending arguments when given.
   *
   * @param unknownParam remaining arguments that could not be parsed, or null
   */
  def getUsageMessage(unknownParam: List[String] = null): String = {
    val message = if (unknownParam != null) s"Unknown/unsupported param $unknownParam\n" else ""
    message +
      """
        |Usage: com.blogchong.spark.mllib.advance.LdaExtractTopics.Check.PredictsDocTopics [options]
        |Options:
        |  PdataPath     comma-separated location(s) of the documents to predict
        |  PmodelPath    the location of the saved LDA model
        |  PtopicSize    number of top words to output per document (default 100)
        |  PwordsPath    the location of the dictionary
        |  PtopicsPath   the location where the predicted topics are saved
      """.stripMargin
  }
}
-------------------------------------------------------------------------------- /src/main/java/com/blogchong/spark/mllib/advance/LdaExtractTopics/Refer/LDAModelBuild.scala: -------------------------------------------------------------------------------- 1 | //package com.blogchong.spark.mllib.advance.weilianzhu 2 | // 3 | //import org.apache.spark.mllib.clustering.{DistributedLDAModel, LDA} 4 | //import org.apache.spark.mllib.linalg.Vectors 5 | //import org.apache.spark.{SparkConf, SparkContext} 6 | //import scala.collection.mutable 7 | //import scala.collection.mutable.ArrayBuffer 8 | // 9 | ///** 10 | // * 11 | // */ 12 | //object LDAModelBuild { 13 | // def main(args: Array[String]) = { 14 | // val argsParser = new LDAModelBuildArgsParser 15 | // 16 | // require(args.length >= 3, argsParser.getUsageMessage(null)) 17 | // 18 | // argsParser.parseArgs(args.toList) 19 | // 20 | // val dataPath = argsParser.dataPath 21 | // val wordsPath = argsParser.wordsPath 22 | // val modelPath = argsParser.modelPath 23 | // val debug = argsParser.debug 24 | // 25 | // val conf = new SparkConf().setAppName("LDAModelBuild") 26 | // if (debug) { 27 | // conf.setMaster("local[2]") 28 | // } 29 | // val sc = new SparkContext(conf) 30 | // 31 | // //生成对应关系。要求为字典 \t 格式 32 | // val wordToLabelLocal = sc.textFile(wordsPath).map { f => 33 | // val Array(label, word) = f.split("\t") 34 | // (word, label.toInt) 35 | // }.collect.toMap 36 | // 37 | // //将字典广播出去 38 | // val keywordsDis = sc.broadcast(wordToLabelLocal.keys.toSet) 39 | // val wordToLabelDis = sc.broadcast(wordToLabelLocal) 40 | // val labelToWordToDis = sc.broadcast(wordToLabelLocal.map(f => (f._2, f._1)).toMap) 41 | // 42 | // 43 | // val dataPathCollections = dataPath.split(",") 44 | // 45 | // var data = sc.textFile(if (debug) 
dataPathCollections(0) + "_sample" else dataPathCollections(0)) 46 | // 47 | // if (dataPathCollections.length > 1) { 48 | // dataPathCollections.takeRight(dataPathCollections.length - 1).foreach { k => 49 | // data = data.union(sc.textFile(k)) 50 | // } 51 | // } 52 | // 53 | // //获取文档编号。每条内容的格式为\t\s.... 其中id为文档的业务编号。我们会再生成一个 54 | // //LDA需要的Long类型编号，并且对应 55 | // val docs = data.zipWithIndex.map(_.swap). 56 | // map { f => 57 | // val splitters = f._2.split("\t") 58 | // val id = splitters(0) 59 | // val sentence = splitters.takeRight(splitters.length - 1).mkString(" ") 60 | // Doc(f._1, id, sentence.split("\\s+").filter(word => keywordsDis.value.contains(word))) 61 | // 62 | // }.filter(f => f.sentence.length > 0) 63 | // 64 | // 65 | // //获得训练集，仅仅使用词频作为权重。把文档转化为向量 66 | // val corpus = docs.map { f => 67 | // val docVector = f.sentence.groupBy(f => f).map { 68 | // k => 69 | // val wordLabel = wordToLabelDis.value(k._1) 70 | // (wordLabel.toInt, k._2.size.toDouble) 71 | // }.toSeq 72 | // (f.label, Vectors.sparse(wordToLabelDis.value.size, docVector)) 73 | // }.repartition(20) 74 | // 75 | // //主题模型训练 76 | // val topicSize = argsParser.topicSize 77 | // val ldaModel = new LDA().setK(topicSize).setMaxIterations(argsParser.maxIterations) 78 | // .run(corpus).asInstanceOf[DistributedLDAModel] 79 | // 80 | // //保存模型 81 | //// val saveTime = new DateTime().toString("yyyy-MM-dd-HH-mm-ss") 82 | // val saveTime = "yyyy-MM-dd-HH-mm-ss" 83 | // 84 | // ldaModel.save(sc, modelPath + "/" + saveTime + "/model") 85 | // 86 | // //存储文档数字编号和id的对应关系 87 | // docs.map(f => s"${f.label},${f.id}").saveAsTextFile(modelPath + "/" + saveTime + "/docLabelToId") 88 | // 89 | // 90 | // 91 | // if (argsParser.saveVector) { 92 | // val hbaseTable = argsParser.hbaseTableName 93 | // 94 | // val docLabelToId = docs.map(f => (f.label, f.id)).collect().toMap 95 | // 96 | // //存储内容的主题分布 97 | // ldaModel.topTopicsPerDocument(topicSize).map { f => 98 | // (f._1, f._2 zip f._3) 99 | // }.foreach { 
f => 100 | // if (debug) { 101 | // println(s"${docLabelToId(f._1)},${f._2.map(k => k._1 + ":" + k._2).mkString(" ")}") 102 | // } else { 103 | // BaseSimpleHBaseClient.put(hbaseTable, 104 | // docLabelToId(f._1), f._2.map(k => k._1 + ":" + k._2).mkString(" ")) 105 | // } 106 | // } 107 | // 108 | // //存储词的主题分布 109 | // val mmm = new mutable.HashMap[String, mutable.ArrayBuffer[(Int, Double)]]() 110 | // val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = wordToLabelLocal.size) 111 | // topicIndices.map { case (terms, termWeights) => 112 | // terms.map(labelToWordToDis.value(_)).zip(termWeights) 113 | // }.zipWithIndex.map { case (topic, i) => 114 | // i + " " + topic.map { case (term, weight) => s"$term:$weight" }.mkString(" ") 115 | // }.foreach { f => 116 | // val line = f.asInstanceOf[String].split("\\s+") 117 | // val topic = line(0).toInt 118 | // 119 | // line.takeRight(line.length - 1).map { f => 120 | // val Array(t, w) = f.split(":") 121 | // val item = (topic, w.toDouble) 122 | // if (!mmm.contains(t)) { 123 | // mmm(t) = new ArrayBuffer[(Int, Double)]() 124 | // } 125 | // mmm(t) += item 126 | // } 127 | // } 128 | // 129 | // mmm.foreach { f => 130 | // val item = f._2.sortBy(k => k._1).map(k => s"${k._1}:${k._2}").mkString(" ") 131 | // if (debug) { 132 | // println(s"${f._1},${item}") 133 | // } else { 134 | // BaseSimpleHBaseClient.put(hbaseTable, 135 | // f._1, item 136 | // ) 137 | // } 138 | // } 139 | // 140 | // } 141 | // 142 | // sc.stop() 143 | // } 144 | // 145 | // 146 | //} 147 | // 148 | //case class Doc(label: Long, id: String, sentence: Array[String]) 149 | -------------------------------------------------------------------------------- /src/main/java/com/blogchong/spark/mllib/advance/LdaExtractTopics/Refer/LDAModelBuildArgsParser.scala: -------------------------------------------------------------------------------- 1 | //package com.blogchong.spark.mllib.advance.weilianzhu 2 | // 3 | ///** 4 | // * 5 | // */ 6 | //class 
LDAModelBuildArgsParser { 7 | // 8 | // var dataPath: String = null 9 | // var modelPath: String = null 10 | // var wordsPath: String = null 11 | // var debug: Boolean = false 12 | // var saveVector: Boolean = false 13 | // var topicSize: Int = 160 14 | // var maxIterations: Int = 160 15 | // var hbaseTableName: String = "spark_ml_lda_model_result" 16 | // 17 | // def parseArgs(inputArgs: List[String]): Unit = { 18 | // 19 | // var args = inputArgs 20 | // 21 | // while (!args.isEmpty) { 22 | // args match { 23 | // case ("PdataPath") :: value :: tail => 24 | // dataPath = value 25 | // args = tail 26 | // case ("PmodelPath") :: value :: tail => 27 | // modelPath = value 28 | // args = tail 29 | // 30 | // case ("PtopicSize") :: value :: tail => 31 | // topicSize = value.toInt 32 | // args = tail 33 | // 34 | // case ("PmaxIterations") :: value :: tail => 35 | // maxIterations = value.toInt 36 | // args = tail 37 | // 38 | // case ("Pdebug") :: value :: tail => 39 | // debug = value.toBoolean 40 | // args = tail 41 | // 42 | // case ("PwordsPath") :: value :: tail => 43 | // wordsPath = value 44 | // args = tail 45 | // 46 | // case ("PsaveVector") :: value :: tail => 47 | // saveVector = value.toBoolean 48 | // args = tail 49 | // case ("PhbaseTableName") :: value :: tail => 50 | // hbaseTableName = value 51 | // args = tail 52 | // case Nil => 53 | // 54 | // case _ => 55 | // throw new IllegalArgumentException(getUsageMessage(args)) 56 | // } 57 | // } 58 | // } 59 | // 60 | // def getUsageMessage(unknownParam: List[String] = null): String = { 61 | // val message = if (unknownParam != null) s"Unknown/unsupported param $unknownParam\n" else "" 62 | // message + 63 | // """ 64 | // |Usage: com.letv.batch.LDAModelBuild [options] 65 | // |Options: 66 | // | PdataPath the location where you put your training documents 67 | // | PmodelPath the location where you save your model 68 | // | PwordsPath the location of dictionary 69 | // | PtopicSize topic size of lda 70 | 
// | PmaxIterations maxIterations lda should run 71 | // | PsaveVector whether to save word vector and doc vector;default value is false 72 | // | PhbaseTableName hbase table name to save word vector ,doc vector;default value is spark_ml_lda_model_result 73 | // """.stripMargin 74 | // } 75 | //} 76 |
--------------------------------------------------------------------------------
/src/main/java/com/blogchong/spark/mllib/advance/LdaExtractTopics/Train/LDAModelBuild.scala:
--------------------------------------------------------------------------------
package com.blogchong.spark.mllib.advance.LdaExtractTopics.Train

import org.apache.spark.mllib.clustering.{DistributedLDAModel, LDA}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import java.util.Date
import com.blogchong.util.NewTime

/**
 * Author: blogchong
 * Blog: www.blogchong.com
 * Mailbox: blogchong@163.com
 * Date: 2015/10/23
 * Describe: LDA topic-model training experiment.
 *
 * Reads a `label<TAB>word` dictionary and one or more comma-separated document
 * paths, trains an LDA model on term-frequency vectors and writes the model,
 * the document label/id mapping and (optionally) the document-topic and
 * word-topic distributions below `modelPath/<timestamp>/`.
 */
object LDAModelBuild {

  // Vector dimension used by the original experiment. It is kept as a lower
  // bound so existing dictionaries still produce vectors of the same size.
  private val MinVocabSize = 50269

  /**
   * Job entry point.
   *
   * @param args alternating `Pname value` pairs, see LDAModelBuildArgsParser
   * @throws IllegalArgumentException when fewer than three tokens are given,
   *                                  an option is unknown, or one of
   *                                  PdataPath / PwordsPath / PmodelPath is missing
   */
  def main(args: Array[String]): Unit = {

    args.foreach(println)

    val argsParser = new LDAModelBuildArgsParser

    require(args.length >= 3, argsParser.getUsageMessage(null))

    argsParser.parseArgs(args.toList)

    val dataPath = argsParser.dataPath
    val wordsPath = argsParser.wordsPath
    val modelPath = argsParser.modelPath

    // Fail with the usage text instead of a NullPointerException further down.
    require(dataPath != null && wordsPath != null && modelPath != null,
      argsParser.getUsageMessage(null))

    val sc = new SparkContext(new SparkConf().setAppName("LDAModelBuild"))

    try {
      // Dictionary: every line is expected to be "label<TAB>word".
      val wordToLabelLocal = sc.textFile(wordsPath).map { line =>
        val Array(label, word) = line.split("\t")
        (word, label.toInt)
      }.collect.toMap

      // Ship the dictionary to the executors.
      val keywordsDis = sc.broadcast(wordToLabelLocal.keys.toSet)
      val wordToLabelDis = sc.broadcast(wordToLabelLocal)
      // The reverse mapping is only read on the driver, so it is not broadcast.
      val labelToWord = wordToLabelLocal.map(_.swap)

      // Every label must be a valid vector index; grow past the historical
      // size only when the dictionary actually needs it.
      val vocabSize =
        if (wordToLabelLocal.isEmpty) MinVocabSize
        else math.max(MinVocabSize, wordToLabelLocal.values.max + 1)

      // Union all input paths in the order they were given.
      val data = dataPath.split(",").map(path => sc.textFile(path)).reduce(_ union _)

      // Each line is "<id><TAB><text...>", where id is the business id of the
      // document. LDA needs a Long id, so the line index is used as the label
      // and kept next to the business id. Documents without any dictionary
      // word are dropped. Cached because the RDD is evaluated several times.
      val docs = data.zipWithIndex.map { case (line, index) =>
        val fields = line.split("\t")
        val keywords = keywordsDis.value
        val text = fields.drop(1).mkString(" ")
        Doc(index, fields(0), text.split("\\s+").filter(word => keywords.contains(word)))
      }.filter(_.sentence.nonEmpty).cache()

      // Training set: plain term frequency is the weight of every word.
      val corpus = docs.map { doc =>
        val termCounts = doc.sentence.groupBy(identity).map { case (word, occurrences) =>
          (wordToLabelDis.value(word), occurrences.length.toDouble)
        }.toSeq
        (doc.label, Vectors.sparse(vocabSize, termCounts))
      }.repartition(20)

      // Train the topic model.
      val topicSize = argsParser.topicSize
      val ldaModel = new LDA().setK(topicSize).setMaxIterations(argsParser.maxIterations)
        .run(corpus).asInstanceOf[DistributedLDAModel]

      // Save both the local and the original distributed form of the model.
      val saveTime = NewTime.dateToString(new Date, NewTime.`type`)
      val outputDir = modelPath + "/" + saveTime

      ldaModel.toLocal.save(sc, outputDir + "/localLdaModel")
      ldaModel.save(sc, outputDir + "/distributedLDAModel")

      // Mapping between the numeric document label and the business id.
      docs.map(doc => s"${doc.label},${doc.id}").saveAsTextFile(outputDir + "/docLabelToId")

      if (argsParser.saveVector) {

        val docLabelToId = docs.map(doc => (doc.label, doc.id)).collect().toMap

        // Topic distribution of every document.
        ldaModel.topTopicsPerDocument(topicSize).map { case (docLabel, topicIds, weights) =>
          val distribution = topicIds.zip(weights).map { case (t, w) => s"$t:$w" }.mkString(" ")
          s"${docLabelToId(docLabel)}\t$distribution"
        }.saveAsTextFile(outputDir + "/docLabelToTopics")

        // Topic distribution of every word. describeTopics returns local
        // arrays, so this table is built on the driver.
        val wordTopicWeights = new mutable.HashMap[String, ArrayBuffer[(Int, Double)]]()
        ldaModel.describeTopics(maxTermsPerTopic = wordToLabelLocal.size).zipWithIndex.foreach {
          case ((terms, termWeights), topic) =>
            terms.zip(termWeights).foreach { case (term, weight) =>
              // The vector space can be larger than the dictionary, so a
              // returned index may have no word; such indices are skipped.
              labelToWord.get(term).foreach { word =>
                wordTopicWeights.getOrElseUpdate(word, new ArrayBuffer[(Int, Double)]()) += ((topic, weight))
              }
            }
        }

        val wordToTopic = wordTopicWeights.map { case (word, weights) =>
          val item = weights.sortBy(_._1).map { case (t, w) => s"$t:$w" }.mkString(" ")
          s"$word\t$item"
        }

        sc.parallelize(wordToTopic.toSeq).saveAsTextFile(outputDir + "/wordToTopics")
      }
    } finally {
      // Release the cluster resources even when a stage fails.
      sc.stop()
    }
  }

}

/**
 * One training document.
 *
 * @param label    numeric id handed to LDA (the line index of the document)
 * @param id       business id taken from the first tab-separated field
 * @param sentence the words of the document that occur in the dictionary
 */
case class Doc(label: Long, id: String, sentence: Array[String])

-------------------------------------------------------------------------------- /src/main/java/com/blogchong/spark/mllib/advance/LdaExtractTopics/Train/LDAModelBuildArgsParser.scala: -------------------------------------------------------------------------------- 1 | package com.blogchong.spark.mllib.advance.LdaExtractTopics.Train 2 | 3 | /** 4 | * Author: blogchong 5 | * Blog: www.blogchong.com 6 | * Mailbox: blogchong@163.com 7 | * Data: 2015/10/23 8 | *
Describe: LDA主题训练实验，处理传参 9 | */

// Command-line argument handling for the LDA training job. Arguments are given
// as alternating `Pname value` pairs; each recognised pair is stored in the
// matching public field and echoed to stdout.
class LDAModelBuildArgsParser {

  // Populated by parseArgs; the path fields stay null when the option is not given.
  var dataPath: String = null
  var modelPath: String = null
  var wordsPath: String = null
  var saveVector: Boolean = false
  var topicSize: Int = 160
  var maxIterations: Int = 160

  /**
   * Consumes `inputArgs` two tokens at a time (option name, option value).
   *
   * @param inputArgs the raw command-line tokens
   * @throws IllegalArgumentException when an unknown option, an option without
   *                                  a value, or an unparsable numeric/boolean
   *                                  value is encountered
   */
  def parseArgs(inputArgs: List[String]): Unit = {
    @scala.annotation.tailrec
    def consume(remaining: List[String]): Unit = remaining match {
      case Nil => ()
      case "PdataPath" :: value :: tail =>
        dataPath = value
        println("PdataPath: " + dataPath)
        consume(tail)
      case "PmodelPath" :: value :: tail =>
        modelPath = value
        println("PmodelPath: " + modelPath)
        consume(tail)
      case "PtopicSize" :: value :: tail =>
        topicSize = toIntOrUsage(value, remaining)
        println("PtopicSize: " + topicSize)
        consume(tail)
      case "PmaxIterations" :: value :: tail =>
        maxIterations = toIntOrUsage(value, remaining)
        println("PmaxIterations: " + maxIterations)
        consume(tail)
      case "PwordsPath" :: value :: tail =>
        wordsPath = value
        println("PwordsPath: " + wordsPath)
        consume(tail)
      case "PsaveVector" :: value :: tail =>
        saveVector = value.toBoolean
        println("PsaveVector: " + saveVector)
        consume(tail)
      case _ =>
        throw new IllegalArgumentException(getUsageMessage(remaining))
    }

    consume(inputArgs)
  }

  // Converts a numeric option value. NumberFormatException already is an
  // IllegalArgumentException, so re-throwing with the usage text keeps the
  // exception type callers see while making the message actionable.
  private def toIntOrUsage(value: String, context: List[String]): Int =
    try value.toInt
    catch {
      case _: NumberFormatException =>
        throw new IllegalArgumentException(getUsageMessage(context))
    }

  /**
   * Builds the usage text.
   *
   * @param unknownParam the unparsed remainder of the argument list, or null
   *                     to get the plain usage text
   * @return the usage text, prefixed with the offending arguments when given
   */
  def getUsageMessage(unknownParam: List[String] = null): String = {
    val message = if (unknownParam != null) s"Unknown/unsupported param $unknownParam\n" else ""
    message +
      """
        |Usage: com.blogchong.spark.mllib.advance.LdaExtractTopics.Train.LDAModelBuild [options]
        |Options:
        |  PdataPath       the location where you put your training documents
        |  PmodelPath      the location where you save your model
        |  PwordsPath      the location of dictionary
        |  PtopicSize      topic size of lda
        |  PmaxIterations  maxIterations lda should run
        |  PsaveVector     whether to save word vector and doc vector;default value is false
      """.stripMargin
  }
}

--------------------------------------------------------------------------------
/src/main/java/com/blogchong/spark/mllib/base/AlsArithmetic.scala:
--------------------------------------------------------------------------------
package com.blogchong.spark.mllib.base

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.{ALS, Rating}

/**
 * Author: blogchong
 * Blog: www.blogchong.com
 * Mailbox: blogchong@163.com
 * Date: 2015/10/30
 * Describe: model-based collaborative filtering with the ALS
 * (alternating least squares) algorithm.
 */
object AlsArithmetic {

  /** Trains an ALS model on the sample ratings, prints its MSE and saves it. */
  def main(args: Array[String]): Unit = {

    // Keep unnecessary log output off the terminal.
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    val sc = new SparkContext(new SparkConf().setAppName("ALS"))

    try {
      // Three comma-separated columns per line: user id, item id, rating.
      val ratings = sc.textFile("hdfs://192.168.5.200:9000/spark/mllib/data/als/test.data")
        .map(_.split(",") match {
          case Array(user, item, rate) => Rating(user.toInt, item.toInt, rate.toDouble)
        })

      // The three main ALS parameters.
      val rank = 10
      val numIterations = 10
      val lambda = 0.1

      val model = ALS.train(ratings, rank, numIterations, lambda)

      // Evaluate on the training pairs: predict a rating for every (user, product).
      val usersProducts = ratings.map(r => (r.user, r.product))
      val predictions = model.predict(usersProducts).map(r => ((r.user, r.product), r.rating))
      val ratesAndPreds = ratings.map(r => ((r.user, r.product), r.rating)).join(predictions)

      // Quality measure: mean squared error of the predicted ratings.
      val MSE = ratesAndPreds.map { case (_, (actual, predicted)) =>
        val err = actual - predicted
        err * err
      }.mean()
      println("Mean Squared Error = " + MSE)

      // Persist the model; it can be reloaded with
      // MatrixFactorizationModel.load(sc, modelPath).
      val modelPath = "hdfs://192.168.5.200:9000/spark/mllib/result/als"
      model.save(sc, modelPath)
    } finally {
      sc.stop()
    }
  }
}

--------------------------------------------------------------------------------
/src/main/java/com/blogchong/spark/mllib/base/Kmeans.scala:
--------------------------------------------------------------------------------
package com.blogchong.spark.mllib.base

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

/**
 * Author: blogchong
 * Blog: www.blogchong.com
 * Mailbox: blogchong@163.com
 * Date: 2015/10/30
 * Describe: basic K-means clustering example.
 */
object Kmeans {

  /** Clusters the sample data set, prints the model and saves two prediction outputs. */
  def main(args: Array[String]): Unit = {

    // Keep unnecessary log output off the terminal.
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    val sc = new SparkContext(new SparkConf().setAppName("Kmeans"))

    try {
      // Load the data set: one space-separated vector per line.
      val data = sc.textFile("hdfs://192.168.5.200:9000/spark/mllib/data/kmeans_data.txt", 1)
      // Cached: the iterative training and both evaluations below re-read it.
      val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache()

      // Train a model with 2 clusters and 20 iterations.
      val numClusters = 2
      val numIterations = 20
      val model = KMeans.train(parsedData, numClusters, numIterations)

      println("Cluster centers:")
      model.clusterCenters.foreach(center => println(" " + center.toString))

      // Evaluate the model with the within-set sum of squared errors.
      val cost = model.computeCost(parsedData)
      println("Within Set Sum of Squared Errors = " + cost)

      // Classify a few single points.
      Seq("0.2 0.2 0.2", "0.25 0.25 0.25", "8 8 8").foreach { point =>
        val cluster = model.predict(Vectors.dense(point.split(' ').map(_.toDouble)))
        println("Vectors " + point + " is belongs to clusters:" + cluster)
      }

      // Cross evaluation 1: cluster index only.
      model.predict(parsedData).saveAsTextFile("hdfs://192.168.5.200:9000/spark/mllib/result/kmeams1")

      // Cross evaluation 2: the input line followed by its cluster index.
      data.map { line =>
        val prediction = model.predict(Vectors.dense(line.split(' ').map(_.toDouble)))
        line + " " + prediction
      }.saveAsTextFile("hdfs://192.168.5.200:9000/spark/mllib/result/kmeams2")
    } finally {
      sc.stop()
    }
  }
}
--------------------------------------------------------------------------------
/src/main/java/com/blogchong/spark/mllib/base/KmeansArithmetic.scala:
--------------------------------------------------------------------------------
package com.blogchong.spark.mllib.base

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

/**
 * Author: blogchong
 * Blog: www.blogchong.com
 * Mailbox: blogchong@163.com
 * Date: 2015/10/30
 * Describe: K-means clustering.
 *
 * This entry point ran a statement-for-statement copy of Kmeans.main (same
 * app name, input and output paths), so it now delegates to it.
 */
object KmeansArithmetic {

  /** Runs the K-means example; see Kmeans.main in this package. */
  def main(args: Array[String]): Unit = Kmeans.main(args)
}

-------------------------------------------------------------------------------- /src/main/java/com/blogchong/spark/mllib/base/LdaArithmetic.scala: -------------------------------------------------------------------------------- 1 | package com.blogchong.spark.mllib.base 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | import org.apache.spark.mllib.clustering.{LDA, DistributedLDAModel} 6 | import org.apache.spark.mllib.linalg.Vectors 7 | 8 | /** 9
| * Author: blogchong 10 | * Blog: www.blogchong.com 11 | * Mailbox: blogchong@163.com 12 | * Data: 2015/10/30 13 | * Describe:LDA主题模型基础实例 14 | */
object LdaArithmetic {

  /**
   * Basic LDA example: trains a 3-topic model on the sample data, prints the
   * topic/word matrix, saves the per-document topic distribution and the model.
   */
  def main(args: Array[String]): Unit = {

    // Keep unnecessary log output off the terminal.
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    val sc = new SparkContext(new SparkConf().setAppName("LDA"))

    try {
      val modelPath = "hdfs://192.168.5.200:9000/spark/mllib/result/lda/model"
      // Output location of the doc-topic distribution.
      val modelPath2 = "hdfs://192.168.5.200:9000/spark/mllib/result/lda/model2"

      // 1. Load the data as RDD[(Long, Vector)]: the Long is the document id,
      //    the Vector holds the word counts of the tokenised document.
      val data = sc.textFile("hdfs://192.168.5.200:9000/spark/mllib/data/sample_lda_data.txt", 1)
      val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
      // The line index serves as the unique document id.
      val corpus = parsedData.zipWithIndex.map(_.swap).cache()

      // 2. Configure and train the model.
      //    k:                  number of topics (cluster centres)
      //    docConcentration:   Dirichlet prior on per-document topic
      //                        distributions, must be > 1.0
      //    topicConcentration: Dirichlet prior on per-topic word
      //                        distributions, must be > 1.0
      //    maxIterations:      number of iterations
      //    seed:               random seed
      //    checkpointInterval: iterations between checkpoints
      //    optimizer:          "em" or "online"
      val ldaModel = new LDA().
        setK(3).
        setDocConcentration(5).
        setTopicConcentration(5).
        setMaxIterations(20).
        setSeed(0L).
        setCheckpointInterval(10).
        setOptimizer("em").
        run(corpus)

      // 3. Print, for every topic, its weight for each word of the vocabulary.
      println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize + " words):")
      val topics = ldaModel.topicsMatrix
      for (topic <- 0 until ldaModel.k) {
        val words = (0 until ldaModel.vocabSize).map(word => " " + topics(word, topic)).mkString
        println(s"$topic:$words")
      }

      // The "em" optimizer produces a DistributedLDAModel, which exposes the
      // per-document topic distribution.
      val dldaModel = ldaModel.asInstanceOf[DistributedLDAModel]
      dldaModel.topTopicsPerDocument(ldaModel.k).map { case (docId, topicIds, weights) =>
        val distribution = topicIds.zip(weights).map { case (t, w) => s"$t:$w" }.mkString(" ")
        s"$docId $distribution"
      }.repartition(1).saveAsTextFile(modelPath2)

      // Persist the model; it can be reloaded with
      // DistributedLDAModel.load(sc, modelPath).
      ldaModel.save(sc, modelPath)
    } finally {
      sc.stop()
    }
  }
}

--------------------------------------------------------------------------------
/src/main/java/com/blogchong/spark/mllib/base/Word2Vec.scala:
--------------------------------------------------------------------------------
package com.blogchong.spark.mllib.base

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}

/**
 * Author: blogchong
 * Blog: www.blogchong.com
 * Mailbox: blogchong@163.com
 * Date: 2015/11/23
 * Describe: basic example of feature extraction with Word2Vec.
 */
object Word2Vec {

  /** Fits a Word2Vec model on the sample corpus, prints synonyms of "as" and saves the model. */
  def main(args: Array[String]): Unit = {

    // Keep unnecessary log output off the terminal.
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // The application name used to be a copy-pasted "ALS".
    val sc = new SparkContext(new SparkConf().setAppName("Word2Vec"))

    try {
      val modelPath = "hdfs://192.168.5.200:9000/spark/mllib/result/feature/word2vec/model"

      // One sentence per line, tokens separated by a single space.
      val input = sc.textFile("hdfs://192.168.5.200:9000/spark/mllib/data/feature/word2vec2.txt")
        .map(line => line.split(" ").toSeq)

      val model = new Word2Vec().fit(input)

      model.findSynonyms("as", 40).foreach { case (synonym, cosineSimilarity) =>
        println(s"输出[$synonym $cosineSimilarity]")
      }

      // Persist the model; it can be reloaded with Word2VecModel.load(sc, modelPath).
      model.save(sc, modelPath)
    } finally {
      sc.stop()
    }
  }

}

--------------------------------------------------------------------------------
/src/main/java/com/blogchong/util/CharUtil.java:
--------------------------------------------------------------------------------
package com.blogchong.util;

/**
 * Author: blogchong
 * Blog: www.blogchong.com
 * Mailbox: blogchong@163.com
 * Date: 2015/11/6
 * Describe: general character utilities.
 */
public class CharUtil {

    // Bounds of the CJK Unified Ideographs block (U+4E00..U+9FFF). Both the
    // counting and the per-character check use this one range; previously
    // the per-character check stopped at U+9FA5.
    private static final int CJK_FIRST = 0x4E00;
    private static final int CJK_LAST = 0x9FFF;

    public static void main(String[] args) {

        String str = "0改000123";
        if (isNumeric(str)) {
            System.out.println("ok");
            System.out.println("Num1: " + countChineseNum(str));
            System.out.println("Num2: " + str.length());
        } else {
            System.out.println("no");
            System.out.println("Num: " + str.length());
        }

    }

    /**
     * Counts the Chinese characters in a string.
     *
     * @param str the text to inspect, may be null
     * @return the number of characters for which {@link #isChinese(char)} is
     *         true; 0 for null
     */
    public static int countChineseNum(String str) {
        if (str == null) {
            return 0;
        }
        int count = 0;
        for (int i = 0; i < str.length(); i++) {
            if (isChinese(str.charAt(i))) {
                count++;
            }
        }
        return count;
    }

    /**
     * Tells whether a string contains at least one Chinese character.
     *
     * @param str the text to inspect, may be null
     * @return true as soon as one Chinese character is found; false for null
     */
    public static boolean isChinese(String str) {
        if (str == null) {
            return false;
        }
        for (int i = 0; i < str.length(); i++) {
            if (isChinese(str.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether a character lies in the CJK Unified Ideographs block.
     *
     * @param c the character to test
     * @return true when c is between U+4E00 and U+9FFF inclusive
     */
    public static boolean isChinese(char c) {
        return c >= CJK_FIRST && c <= CJK_LAST;
    }

    /**
     * Tells whether a string consists of digits only.
     *
     * @param str the text to inspect, may be null
     * @return false for null or when any character is not a digit according to
     *         {@link Character#isDigit(char)}; true otherwise, which includes
     *         the empty string
     */
    public static boolean isNumeric(String str) {
        if (str == null) {
            return false;
        }
        for (int i = 0; i < str.length(); i++) {
            if (!Character.isDigit(str.charAt(i))) {
                return false;
            }
        }
        return true;
    }

}

--------------------------------------------------------------------------------
/src/main/java/com/blogchong/util/NewTime.java:
--------------------------------------------------------------------------------
package com.blogchong.util;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Author: blogchong
 * Blog: www.blogchong.com
 * Mailbox: blogchong@163.com
 * QQGroup: 191321336
 * Weixin: blogchong
 * Date: 2015/7/29
 * Describe: time conversion utilities.
 */
public class NewTime {

    // Default pattern used by the callers of this class.
    public static String type = "yyyy-MM-dd-HH-mm-ss";

    /** Demo: prints the current time and the time 32 days earlier. */
    public static void main(String[] args) throws Exception {

        long interval = 32;

        String strDate = dateToString(new Date(), type);

        // interval is a long, so the millisecond arithmetic cannot overflow int.
        long longDate = stringToLong(strDate, type);
        long longDate2 = longDate - (interval * 24 * 60 * 60 * 1000);
        String strDate2 = longToString(longDate2, type);

        System.out.println("初始时间1：" + strDate);
        System.out.println("初始时间2：" + strDate2);
    }

    /**
     * Formats a Date as text.
     *
     * @param data       the point in time to format
     * @param formatType a SimpleDateFormat pattern, e.g. yyyy-MM-dd HH:mm:ss
     * @return data rendered with formatType
     */
    public static String dateToString(Date data, String formatType) {
        return new SimpleDateFormat(formatType).format(data);
    }

    // long类型转换为String类型 56 | // currentTime要转换的long类型的时间 57 | //
formatType要转换的string类型的时间格式
    /**
     * Formats an epoch-millisecond value as text.
     *
     * The value is formatted directly; the former format, parse, format
     * round-trip could only add failure modes to the same result.
     *
     * @param currentTime milliseconds since the epoch
     * @param formatType  a SimpleDateFormat pattern
     * @return currentTime rendered with formatType
     * @throws ParseException never thrown; kept in the signature so existing
     *                        callers keep compiling
     */
    public static String longToString(long currentTime, String formatType)
            throws ParseException {
        return dateToString(new Date(currentTime), formatType);
    }

    /**
     * Parses text into a Date.
     *
     * @param strTime    the text to parse; it must match formatType
     * @param formatType a SimpleDateFormat pattern, e.g. yyyy-MM-dd HH:mm:ss
     * @return the parsed point in time
     * @throws ParseException when strTime does not match formatType
     */
    public static Date stringToDate(String strTime, String formatType)
            throws ParseException {
        return new SimpleDateFormat(formatType).parse(strTime);
    }

    /**
     * Converts an epoch-millisecond value into a Date.
     *
     * The value is formatted with formatType and parsed back, so the returned
     * Date only keeps the fields that formatType contains.
     *
     * @param currentTime milliseconds since the epoch
     * @param formatType  a SimpleDateFormat pattern
     * @return the Date obtained from the format/parse round-trip
     * @throws ParseException when the formatted text cannot be parsed back
     *                        with formatType
     */
    public static Date longToDate(long currentTime, String formatType)
            throws ParseException {
        String formatted = dateToString(new Date(currentTime), formatType);
        return stringToDate(formatted, formatType);
    }

    /**
     * Parses text into epoch milliseconds.
     *
     * @param strTime    the text to parse; it must match formatType
     * @param formatType a SimpleDateFormat pattern
     * @return milliseconds since the epoch of the parsed time
     * @throws ParseException when strTime does not match formatType
     */
    public static long stringToLong(String strTime, String formatType)
            throws ParseException {
        // SimpleDateFormat.parse(String) throws instead of returning null, so
        // the former null check could never take its zero-returning branch.
        return dateToLong(stringToDate(strTime, formatType));
    }

    /**
     * Converts a Date into epoch milliseconds.
     *
     * @param date the point in time to convert
     * @return milliseconds since the epoch
     */
    public static long dateToLong(Date date) {
        return date.getTime();
    }

}
--------------------------------------------------------------------------------