├── .gitignore ├── build.sbt ├── project └── plugins.sbt ├── scripts ├── CbcfService.sh ├── CommunityIbcfService.sh ├── InventoryIbcfService.sh ├── InventoryPortraitCleanService.sh ├── UserPortraitAttenuationService.sh ├── UserUbcfService.sh ├── community │ └── CommunityIBCF.sh ├── extract │ └── AccessLogToKafka.sh ├── inventory │ ├── InventoryIBCF.sh │ ├── InventoryPortraitClean.sh │ └── PropertyInventoryIndex.sh └── user │ ├── UserPortrait.sh │ ├── UserPortraitAttenuation.sh │ └── UserUbcf.sh └── src ├── main ├── java │ └── com │ │ └── angejia │ │ └── dw │ │ ├── common │ │ └── util │ │ │ ├── DateUtil.java │ │ │ ├── DebugUtil.java │ │ │ ├── FileUtil.java │ │ │ ├── JavaJsonUtil.java │ │ │ ├── PropertyUtil.java │ │ │ ├── mysql │ │ │ └── JavaMysqlClient.java │ │ │ └── parse │ │ │ ├── ParseMobileAgent.java │ │ │ └── ParseMobileToken.java │ │ ├── hadoop │ │ └── hive │ │ │ └── HiveClient.java │ │ └── service │ │ ├── Conf.java │ │ ├── property │ │ ├── PropertyInventoryService.java │ │ └── model │ │ │ └── Inventory.java │ │ └── user │ │ └── UserService.java ├── resources │ ├── conf_dev.properties │ ├── conf_online.properties │ └── log4j.properties └── scala │ └── com │ └── angejia │ └── dw │ ├── common │ └── util │ │ ├── JsonUtil.scala │ │ ├── ListenerFile.scala │ │ ├── RegexUtil.scala │ │ ├── ScFileUtil.scala │ │ ├── ScriptUtil.scala │ │ └── mysql │ │ └── MysqlClient.scala │ ├── hadoop │ ├── hbase │ │ └── HBaseClient.scala │ ├── hdfs │ │ ├── HDFSClient.scala │ │ └── HDFSClientTest.scala │ ├── kafka │ │ ├── KafkaConsumer.scala │ │ └── KafkaProducer.scala │ └── spark │ │ ├── CollaborativeFiltering.scala │ │ └── CollaborativeFilteringTest.scala │ ├── logs │ ├── UbaAppActionLogStreaming.scala │ ├── UbaWebActionLogStreaming.scala │ └── UbaWebVisitLogStreaming.scala │ └── recommend │ ├── Conf.scala │ ├── IBCF.scala │ ├── UBCF.scala │ ├── community │ └── CommunityIBCF.scala │ ├── extract │ └── ExtractFileToKafka.scala │ ├── inventory │ ├── InventoryIBCF.scala │ ├── InventoryIBCFspark.scala │ ├── InventoryItemCF.scala │ ├── InventoryItemCFBak.scala │ ├── InventoryItemCFTest.scala │ └── portrait │ │ ├── InventoryPortraitCommon.scala │ │ └── MarketingInventoryPortrait.scala │ └── user │ ├── UserUBCF.scala │ ├── UserUBCF20160517.scala │ └── portrait │ ├── UserPortrait.scala │ ├── UserPortraitAttenuation.scala │ ├── UserPortraitBrowse.scala │ ├── UserPortraitCommon.scala │ ├── UserPortraitFilter.scala │ ├── UserPortraitLikeInventory.scala │ ├── UserPortraitLinkInventory.scala │ ├── UserPortraitMemberDemand.scala │ ├── UserPortraitMemberDemand_20160808.scala │ ├── UserPortraitNeeds.scala │ ├── UserPortraitTagConf.scala │ ├── UserPortraitTags.scala │ ├── UserPortraitVisitItem.scala │ └── UserPortraitrModelState.scala └── test └── scala └── com └── angejia └── dw └── recommend ├── inventory └── portrait │ └── InventoryPortraitCommonTest.scala └── user └── portrait └── UserPortraitTest.scala /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .settings/ 3 | .classpath 4 | .project 5 | .springBeans 6 | target/ 7 | derby.log 8 | metastore_db 9 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | // 打开 assembly 插件功能 4 | assemblySettings 5 | 6 | // 配置 assembly 插件所有使用的 JAR 7 | jarName in assembly := "recommend-2.0.jar" 8 | 9 | // 项目名称 10 | name := "recommend-2.0" 11 | 12 | // 组织名称 13 | organization := 
"com.angejia.dw.recommend" 14 | 15 | // 项目版本号 16 | version := "2.0" 17 | 18 | // scala 版本 19 | scalaVersion := "2.11.8" 20 | 21 | // Eclipse 支持 22 | EclipseKeys.createSrc := EclipseCreateSrc.Default + EclipseCreateSrc.Resource 23 | 24 | // 非托管资源目录 25 | unmanagedResourceDirectories in Compile += { baseDirectory.value / "src/main/resources" } 26 | 27 | // 相关依赖 28 | libraryDependencies ++= Seq( 29 | // scala-library 30 | "org.scala-lang" % "scala-library" % "2.11.8", 31 | 32 | // hadoop 依赖 33 | "org.apache.hadoop" % "hadoop-common" % "2.6.0", 34 | "org.apache.hadoop" % "hadoop-hdfs" % "2.6.0", 35 | "org.apache.hadoop" % "hadoop-client" % "2.6.0", 36 | 37 | // Spark 依赖 : spark-core_2.11(spark 所属 scala 版本号) 2.0.2(spark 版本号) 38 | "org.apache.spark" % "spark-core_2.11" % "2.0.2", 39 | "org.apache.spark" % "spark-streaming_2.11" % "2.0.2", 40 | "org.apache.spark" % "spark-streaming-kafka-0-10_2.11" % "2.0.2" 41 | //"org.apache.spark" % "spark-streaming-kafka-0-8_2.11" % "2.0.2" 42 | exclude("org.apache.avro","*") 43 | exclude("org.slf4j","*"), 44 | "org.apache.spark" % "spark-mllib_2.11" % "2.0.2", 45 | // spark sql 46 | "org.apache.spark" % "spark-sql_2.11" % "2.0.2", 47 | "org.apache.spark" % "spark-hive_2.11" % "2.0.2", 48 | //"org.apache.avro" % "avro" % "1.7.4", 49 | //"org.apache.avro" % "avro-ipc" % "1.7.4" excludeAll(excludeNetty), 50 | 51 | // hive 相关 JDBC 52 | "org.apache.hive" % "hive-common" % "1.1.0", 53 | //"org.apache.hive" % "hive-exec" % "1.1.0", 54 | "org.apache.hive" % "hive-jdbc" % "1.1.0", 55 | "org.apache.hive" % "hive-cli" % "1.1.0", 56 | //"org.spark-project.hive" % "hive-beeline" % "1.2.1.spark2", 57 | 58 | // jblas 线性代数库,求向量点积 59 | "org.jblas" % "jblas" % "1.2.4", 60 | 61 | // Kafka 依赖 62 | "org.apache.kafka" % "kafka-log4j-appender" % "0.10.1.0" % "provided", 63 | "org.apache.kafka" % "kafka_2.11" % "0.10.1.0" 64 | exclude("javax.jms", "jms") 65 | exclude("com.sun.jdmk", "jmxtools") 66 | exclude("com.sun.jmx", "jmxri"), 67 | 68 | // Hbase 依赖 69 | //"org.apache.hbase" % "hbase" % "1.0.0", 70 | "org.apache.hbase" % "hbase-common" % "1.0.0", 71 | "org.apache.hbase" % "hbase-client" % "1.0.0", 72 | "org.apache.hbase" % "hbase-server" % "1.0.0", 73 | //"org.apache.hbase" % "hbase-protocol" % "1.0.0", 74 | //"org.apache.htrace" % "htrace-core" % "3.1.0-incubating", 75 | 76 | // Mysql 依赖 77 | "mysql" % "mysql-connector-java" % "5.1.38", 78 | 79 | // ES 客户端 80 | //"org.elasticsearch" % "elasticsearch" % "2.3.4", 81 | // 原始 elasticsearch 依赖因为 guava 包会产生冲突 , HBase 使用的是 12.0, ES 使用的是 19.0 的版本 82 | // 解决方法: http://blog.csdn.net/sunshine920103/article/details/51659936 83 | //"com.angejia.dw.elasticsearch" % "dw_elasticsearch" % "1.0", 84 | 85 | // play Json 包, 版本太高会冲突 86 | "com.typesafe.play" % "play-json_2.11" % "2.3.9", 87 | // spray Json 包 88 | "io.spray" % "spray-json_2.11" % "1.3.2", 89 | // smart Json 包 90 | "net.minidev" % "json-smart" % "2.2.1", 91 | 92 | // java Json 包 93 | "com.googlecode.json-simple" % "json-simple" % "1.1.1", 94 | 95 | // ORM 框架 Hibernate 96 | //"org.hibernate" % "hibernate-core" % "5.2.1.Final", 97 | //"org.hibernate.javax.persistence" % "hibernate-jpa-2.0-api" % "1.0.1.Final", 98 | //"commons-logging" % "commons-logging" % "1.2", 99 | //"commons-collections" % "commons-collections" % "3.2.2", 100 | //"cglib" % "cglib" % "3.2.4", 101 | //"dom4j" % "dom4j" % "1.6.1", 102 | 103 | // 其他 104 | "net.sf.jopt-simple" % "jopt-simple" % "4.9" % "provided", 105 | "joda-time" % "joda-time" % "2.9.2" % "provided", 106 | "commons-codec" % "commons-codec" % "1.10", 
107 | "log4j" % "log4j" % "1.2.9", 108 | "com.github.scopt" %% "scopt" % "3.5.0", 109 | 110 | // unit-testing framework 111 | "org.scalatest" %% "scalatest" % "3.0.0" % "test" 112 | ) 113 | 114 | 115 | // merge strategy: where assembly would otherwise fail on duplicate entries, keep the first one 116 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 117 | case entry => { 118 | val strategy = mergeStrategy(entry) 119 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 120 | else strategy 121 | } 122 | }} 123 | 124 | 125 | // remote resolvers 126 | resolvers ++= Seq( 127 | 128 | // HTTPS is unavailable for Maven Central 129 | "Maven Repository" at "http://repo.maven.apache.org/maven2", 130 | "Apache Repository" at "https://repository.apache.org/content/repositories/releases", 131 | "JBoss Repository" at "https://repository.jboss.org/nexus/content/repositories/releases/", 132 | "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", 133 | "Elasticsearch Repository" at "https://mvnrepository.com/artifact/org.elasticsearch/elasticsearch", 134 | "MQTT Repository" at "https://repo.eclipse.org/content/repositories/paho-releases/", 135 | 136 | // For Sonatype publishing 137 | // "sonatype-snapshots" at "https://oss.sonatype.org/content/repositories/snapshots", 138 | // "sonatype-staging" at "https://oss.sonatype.org/service/local/staging/deploy/maven2/", 139 | // also check the local Maven repository ~/.m2 "/usr/local/maven/repository" 140 | 141 | // local Maven repository path 142 | "Local Maven Repository" at "file:///usr/local/maven/repository", 143 | Resolver.mavenLocal 144 | ) 145 | 146 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns) 2 | 3 | resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/" 4 | 5 | // Eclipse plugin 6 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.5.0") 7 | 8 | // plugin that packages all dependencies into a single jar 9 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 10 | -------------------------------------------------------------------------------- /scripts/CbcfService.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Restart all CBCF services: ./CbcfService.sh "/home/dwadmin/app/recommend/recommend-2.0" 3 | 4 | # project path 5 | PROJECT_HOME=$1 6 | 7 | 8 | # remotely restart the user-portrait job 9 | echo "UserPortrait" 10 | ssh -q -t dwadmin@bi4 "bash -i ${PROJECT_HOME}/scripts/user/UserPortrait.sh \"${PROJECT_HOME}/target/scala-2.11/recommend-2.0.jar\" " 11 | 12 | # remotely restart the log-extraction job 13 | echo "AccessLogToKafka" 14 | ssh -q -t dwadmin@bi0 "bash -i ${PROJECT_HOME}/scripts/extract/AccessLogToKafka.sh \"${PROJECT_HOME}/target/scala-2.11/recommend-2.0.jar\" " 15 | -------------------------------------------------------------------------------- /scripts/CommunityIbcfService.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Restart all CommunityIBCF services: ./CommunityIbcfService.sh "/home/hadoop/app/recommend/recommend-2.0" 3 | 4 | # recommend-system project path 5 | PROJECT_HOME=$1 6 | 7 | # run this sh on the task3 node 8 | echo "CommunityIBCF" 9 | ssh -q -t hadoop@uhadoop-ociicy-task3 "bash -i ${PROJECT_HOME}/scripts/community/CommunityIBCF.sh \"${PROJECT_HOME}/target/scala-2.10/recommend-2.0.jar\" " 10 | 11 | 12 | # dw_etl project path 13 | #DW_ETL_HOME=/home/dwadmin/app/dw_etl 14 | # save the result data into Hive via HBase 15 | 
#${DW_ETL_HOME}/dw_service/index.py --service task --mo hive_task --par '{"sql":"source/real_time/rt_recommend_inventroy_ibcf_result.sql", "date":"today", "runEnv":"local"}' -------------------------------------------------------------------------------- /scripts/InventoryIbcfService.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Restart all InventoryIBCF services: ./InventoryIbcfService.sh "/home/hadoop/app/recommend/recommend-2.0" 3 | 4 | # recommend-system project path 5 | PROJECT_HOME=$1 6 | 7 | # run this sh on the task3 node 8 | echo "InventoryIBCF" 9 | ssh -q -t hadoop@uhadoop-ociicy-task3 "bash -i ${PROJECT_HOME}/scripts/inventory/InventoryIBCF.sh \"${PROJECT_HOME}/target/scala-2.10/recommend-2.0.jar\" " 10 | 11 | 12 | # dw_etl project path 13 | #DW_ETL_HOME=/home/dwadmin/app/dw_etl 14 | # save the result data into Hive via HBase 15 | #${DW_ETL_HOME}/dw_service/index.py --service task --mo hive_task --par '{"sql":"source/real_time/rt_recommend_inventroy_ibcf_result.sql", "date":"today", "runEnv":"local"}' -------------------------------------------------------------------------------- /scripts/InventoryPortraitCleanService.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Clean invalid inventory portraits: ./InventoryPortraitCleanService.sh "/home/dwadmin/app/recommend/recommend-2.0" 3 | 4 | # project path 5 | PROJECT_HOME=$1 6 | 7 | # clean invalid inventory portraits 8 | echo "InventoryPortraitClean" 9 | ssh -q -t dwadmin@bi4 "bash -i ${PROJECT_HOME}/scripts/inventory/InventoryPortraitClean.sh \"${PROJECT_HOME}/target/scala-2.10/recommend-2.0.jar\" " 10 | -------------------------------------------------------------------------------- /scripts/UserPortraitAttenuationService.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Attenuate user portraits: ./UserPortraitAttenuationService.sh "/home/dwadmin/app/recommend/recommend-2.0" 3 | 4 | # project path 5 | PROJECT_HOME=$1 6 | 7 | # attenuate user portraits 8 | echo "UserPortraitAttenuation" 9 | ssh -q -t dwadmin@bi4 "bash -i ${PROJECT_HOME}/scripts/user/UserPortraitAttenuation.sh \"${PROJECT_HOME}/target/scala-2.10/recommend-2.0.jar\" " 10 | -------------------------------------------------------------------------------- /scripts/UserUbcfService.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Restart all UserUBCF services: ./UserUbcfService.sh "/home/hadoop/app/recommend/recommend-2.0" 3 | 4 | # recommend-system project path 5 | PROJECT_HOME=$1 6 | 7 | # run this sh on the task3 node 8 | echo "UserUbcf" 9 | ssh -q -t hadoop@uhadoop-ociicy-task3 "bash -i ${PROJECT_HOME}/scripts/user/UserUbcf.sh \"${PROJECT_HOME}/target/scala-2.10/recommend-2.0.jar\" " 10 | -------------------------------------------------------------------------------- /scripts/community/CommunityIBCF.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Restart the CommunityIBCF algorithm job 3 | 4 | # example: ./CommunityIBCF.sh "/home/hadoop/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar" 5 | 6 | # path of the jar 7 | JAR_PATH=$1 8 | 9 | # remove temporary files left by previous spark-submit runs 10 | rm -rf /tmp/spark-* 11 | 12 | # kill the old process 13 | ps -aux | grep 'com.angejia.dw.recommend.community.CommunityIBCF' | awk '{print $2}' | while read pid; 14 | do 15 | echo "old pid: ${pid}" 16 | kill -15 $pid; 17 | done 18 | 19 | 20 | # submit the job to the cluster in yarn-client mode 21 | spark-submit \ 22 | --name CommunityIBCF \ 23 | --class com.angejia.dw.recommend.community.CommunityIBCF \ 24 | --master yarn-client \ 25 | --driver-cores 4 \ 26 | --driver-memory 10240M \ 27 | --executor-memory 2048M \ 28 | 
--num-executors 2 \ 29 | ${JAR_PATH} "online" "hdfs://uhadoop-ociicy-master1:8020/user/hive/real_time/rt_user_community_history/*" 30 | 31 | 32 | # 新的进程 33 | ps -aux | grep 'com.angejia.dw.recommend.community.CommunityIBCF' | awk '{print $2}' | while read pid; 34 | do 35 | echo "new pid: ${pid}" 36 | done 37 | 38 | -------------------------------------------------------------------------------- /scripts/extract/AccessLogToKafka.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 重启 抽取日志脚本 3 | 4 | # 案例 ./AccessLogToKafka.sh "/home/dwadmin/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar" 5 | 6 | # jar 的地址 7 | JAR_PATH=$1 8 | 9 | # 原来的 进程 10 | ps -aux | grep ExtractFileToKafkaAccessLog | awk '{print $2}' | while read pid; 11 | do 12 | echo "old pid: ${pid}" 13 | kill -9 $pid; 14 | done 15 | 16 | 17 | # 提交任务 18 | java -Xms2048M -Xmx2048M -DAPP_NAME=ExtractFileToKafkaAccessLog \ 19 | -cp ${JAR_PATH} com.angejia.dw.recommend.extract.ExtractFileToKafka "uhadoop-ociicy-master1:2181" "bi4:9092" "accessLog" "0" "accessLogBase" "/data/log/real_time/logs/access_log" "2000" >> /data/log/real_time/logs/access_log_run 2>&1 & 20 | 21 | 22 | # 新的进程 23 | ps -aux | grep ExtractFileToKafkaAccessLog | awk '{print $2}' | while read pid; 24 | do 25 | echo "new pid: ${pid}" 26 | done 27 | 28 | -------------------------------------------------------------------------------- /scripts/inventory/InventoryIBCF.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 重启 InventoryIBCF 算法程序 3 | 4 | # 案例 ./InventoryIBCF.sh "/home/hadoop/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar" 5 | 6 | # jar 的地址 7 | JAR_PATH=$1 8 | 9 | # 删除 spark 提交时的临时文件 10 | rm -rf /tmp/spark-* 11 | 12 | # 原来的 进程 13 | ps -aux | grep 'com.angejia.dw.recommend.inventory.InventoryIBCF' | awk '{print $2}' | while read pid; 14 | do 15 | echo "old pid: ${pid}" 16 | kill -15 $pid; 17 | done 18 | 19 | 20 | # 提交任务给集群 yarn 客户端模式 21 | spark-submit \ 22 | --name InventoryIBCF \ 23 | --class com.angejia.dw.recommend.inventory.InventoryIBCF \ 24 | --master yarn-client \ 25 | --conf spark.driver.maxResultSize=8192M \ 26 | --driver-cores 4 \ 27 | --driver-memory 10240M \ 28 | --executor-memory 2048M \ 29 | --num-executors 2 \ 30 | ${JAR_PATH} "online" "hdfs://uhadoop-ociicy-master2:8020/user/hive/real_time/rt_user_inventory_history/*" 31 | 32 | 33 | # 新的进程 34 | ps -aux | grep 'com.angejia.dw.recommend.inventory.InventoryIBCF' | awk '{print $2}' | while read pid; 35 | do 36 | echo "new pid: ${pid}" 37 | done 38 | 39 | -------------------------------------------------------------------------------- /scripts/inventory/InventoryPortraitClean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 清理无用房源画像 3 | 4 | # 案例 ./InventoryPortraitClean "/home/dwadmin/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar" 5 | 6 | # jar 的地址 7 | JAR_PATH=$1 8 | 9 | # 原来的 进程 10 | ps -aux | grep 'com.angejia.dw.recommend.inventory.portrait.InventoryPortraitClean' | awk '{print $2}' | while read pid; 11 | do 12 | echo "old pid: ${pid}" 13 | kill -9 $pid; 14 | done 15 | 16 | 17 | # 提交任务 18 | java -DAPP_NAME=InventoryPortraitClean \ 19 | -cp ${JAR_PATH} com.angejia.dw.recommend.inventory.portrait.InventoryPortraitClean "online" "" >> /data/log/recommend/InventoryPortraitClean 2>&1 & 20 | 21 | 22 | # 新的进程 23 | ps -aux | grep 
'com.angejia.dw.recommend.inventory.portrait.InventoryPortraitClean' | awk '{print $2}' | while read pid; 24 | do 25 | echo "new pid: ${pid}" 26 | done 27 | 28 | -------------------------------------------------------------------------------- /scripts/inventory/PropertyInventoryIndex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 房源索引服务 3 | 4 | # 案例 ./PropertyInventoryIndex.sh "/home/hadoop/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar" 5 | 6 | # jar 的地址 7 | JAR_PATH=$1 8 | 9 | 10 | # 原来的 进程 11 | ps -aux | grep 'com.angejia.dw.service.property.PropertyInventoryService' | awk '{print $2}' | while read pid; 12 | do 13 | echo "old pid: ${pid}" 14 | kill -15 $pid; 15 | done 16 | 17 | 18 | java -DAPP_NAME=PropertyInventoryIndexService \ 19 | -cp ~/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar \ 20 | com.angejia.dw.service.property.PropertyInventoryService \ 21 | "online" \ 22 | "/data/log/service/property/service_property_date_point" \ 23 | >> /data/log/service/property/service_property_extract 2>&1 24 | 25 | 26 | # 新的进程 27 | ps -aux | grep 'com.angejia.dw.service.property.PropertyInventoryService' | awk '{print $2}' | while read pid; 28 | do 29 | echo "new pid: ${pid}" 30 | done 31 | 32 | -------------------------------------------------------------------------------- /scripts/user/UserPortrait.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 重启 用户画像 程序 3 | 4 | # 案例 ./UserPortrait.sh "/home/dwadmin/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar" 5 | 6 | # jar 的地址 7 | JAR_PATH=$1 8 | 9 | # 删除 spark 提交时的临时文件 10 | rm -rf /tmp/spark-* 11 | 12 | # 原来的 进程 13 | ps -aux | grep 'com.angejia.dw.recommend.user.portrait.UserPortrait' | awk '{print $2}' | while read pid; 14 | do 15 | echo "old pid: ${pid}" 16 | kill -9 $pid; 17 | done 18 | 19 | 20 | # 提交任务 21 | spark-submit \ 22 | --name UserPortrait \ 23 | --class com.angejia.dw.recommend.user.portrait.UserPortrait \ 24 | --master yarn \ 25 | --deploy-mode client \ 26 | --driver-cores 2 \ 27 | --driver-memory 4096M \ 28 | --executor-memory 2048M \ 29 | --executor-cores 2 \ 30 | --num-executors 2 \ 31 | ${JAR_PATH} \ 32 | --env "online" \ 33 | --kafka-topic "accessLog" \ 34 | --kafka-consumer-gid "userPortrait" >> /data/log/recommend/UserPortrait 2>&1 & 35 | 36 | 37 | # 新的进程 38 | ps -aux | grep 'com.angejia.dw.recommend.user.portrait.UserPortrait' | awk '{print $2}' | while read pid; 39 | do 40 | echo "new pid: ${pid}" 41 | done 42 | 43 | -------------------------------------------------------------------------------- /scripts/user/UserPortraitAttenuation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 衰减用户画像 3 | 4 | # 案例 ./UserPortraitAttenuation.sh "/home/dwadmin/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar" 5 | 6 | # jar 的地址 7 | JAR_PATH=$1 8 | 9 | # 原来的 进程 10 | ps -aux | grep 'com.angejia.dw.recommend.user.portrait.UserPortraitAttenuation' | awk '{print $2}' | while read pid; 11 | do 12 | echo "old pid: ${pid}" 13 | kill -9 $pid; 14 | done 15 | 16 | 17 | # 提交任务 18 | java -DAPP_NAME=UserPortraitAttenuation \ 19 | -cp ${JAR_PATH} com.angejia.dw.recommend.user.portrait.UserPortraitAttenuation "online" "" >> /data/log/recommend/UserPortraitAttenuation 2>&1 & 20 | 21 | 22 | # 新的进程 23 | ps -aux | grep 'com.angejia.dw.recommend.user.portrait.UserPortraitAttenuation' | awk '{print $2}' | while read pid; 24 | do 25 | 
echo "new pid: ${pid}" 26 | done 27 | 28 | -------------------------------------------------------------------------------- /scripts/user/UserUbcf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 重启 UserUbcf 算法程序 3 | 4 | # 案例 ./UserUbcf.sh "/home/hadoop/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar" 5 | 6 | # jar 的地址 7 | JAR_PATH=$1 8 | 9 | # 删除 spark 提交时的临时文件 10 | rm -rf /tmp/spark-* 11 | 12 | # 原来的 进程 13 | ps -aux | grep 'com.angejia.dw.recommend.user.UserUBCF' | awk '{print $2}' | while read pid; 14 | do 15 | echo "old pid: ${pid}" 16 | kill -15 $pid; 17 | done 18 | 19 | 20 | # 提交任务给集群 yarn 客户端模式 21 | spark-submit \ 22 | --name UserUBCF \ 23 | --class com.angejia.dw.recommend.user.UserUBCF \ 24 | --master yarn-client \ 25 | --driver-cores 4 \ 26 | --driver-memory 10240M \ 27 | --executor-memory 2048M \ 28 | --num-executors 2 \ 29 | ${JAR_PATH} "online" "hdfs://uhadoop-ociicy-master2:8020/user/hive/real_time/rt_user_inventory_history/*" 30 | 31 | 32 | # 新的进程 33 | ps -aux | grep 'com.angejia.dw.recommend.user.UserUBCF' | awk '{print $2}' | while read pid; 34 | do 35 | echo "new pid: ${pid}" 36 | done 37 | 38 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/common/util/DateUtil.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util; 2 | 3 | import java.security.Timestamp; 4 | import java.text.ParseException; 5 | import java.text.SimpleDateFormat; 6 | import java.util.Calendar; 7 | import java.util.Date; 8 | import java.util.HashMap; 9 | import java.util.Map; 10 | 11 | 12 | /** 13 | * 日期转换 14 | * @author Jason 15 | */ 16 | public class DateUtil { 17 | 18 | //格式化日期模式,按照需求增加 19 | public static final String SIMPLE_FORMAT = "yyyy-MM-dd HH:mm:ss"; 20 | public static final String SIMPLE_Y_M_D_FORMAT = "yyyy-MM-dd"; 21 | public static final String SIMPLE_YMD_FORMAT = "yyyyMMdd"; 22 | public static final String SIMPLE_hms_FORMAT = "HH:mm:ss"; 23 | 24 | 25 | /** 26 | * String -> Date 27 | * @param time 28 | * @param time_format 29 | * @return Date 30 | */ 31 | public static Date StringToDate (String time,String time_format) { 32 | //设置格式化模式 33 | SimpleDateFormat format = new SimpleDateFormat(time_format); 34 | 35 | Date result = null; 36 | try { 37 | Date t = format.parse(time); 38 | result = t; 39 | } catch (ParseException e) { 40 | // TODO Auto-generated catch block 41 | e.printStackTrace(); 42 | } 43 | return result; 44 | } 45 | 46 | 47 | /** 48 | * String -> Timestamp 49 | * @param time 时间字符串 50 | * @param time_format 需要格式化的格式 51 | * @return Long 52 | * @throws ParseException 53 | * example : 54 | * DateUtil.StringToTimestamp("2010-06-25",DateUtil.SIMPLE_Y_M_D_FORMAT); 55 | */ 56 | public static Long StringToTimestamp (String time,String time_format) { 57 | //设置格式化模式 58 | SimpleDateFormat format = new SimpleDateFormat(time_format); 59 | 60 | Date t = DateUtil.StringToDate(time,time_format); 61 | 62 | return t.getTime(); 63 | 64 | } 65 | 66 | 67 | /** 68 | * Timestamp -> Sting 69 | * @param timestamp 70 | * @param time_format 71 | * @return String 72 | * example : 73 | * Long timestamp = DateUtil.StringToTimestamp("2010-06-25 00:24:00",DateUtil.SIMPLE_FORMAT); 74 | DateUtil.TimestampToSting(timestamp,DateUtil.SIMPLE_FORMAT); 75 | */ 76 | public static String TimestampToSting (Long timestamp,String time_format) { 77 | //根据时间戳拿到日期对象 78 | Date date = new Date(timestamp); 79 | 80 | //设置格式化模式 
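// NOTE: SimpleDateFormat is not thread-safe, which is why every call here builds a fresh
// formatter instance instead of sharing a static one.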
81 | SimpleDateFormat format = new SimpleDateFormat(time_format); 82 | 83 | //通过格式化对象,返回结果 84 | String result = format.format(date); 85 | 86 | return result; 87 | 88 | } 89 | 90 | 91 | /** 92 | * Timestamp -> Date 93 | * @param timestamp 94 | * @return Date 95 | */ 96 | public static Date TimestampToDate (Long timestamp) { 97 | 98 | Date date = new Date(timestamp); 99 | 100 | return date; 101 | } 102 | 103 | /** 104 | * String -> FormatString 105 | * @param time 106 | * @param current_time_format 107 | * @param new_time_format 108 | * @return String 109 | * example : 110 | * String date = DateUtil.StringToFormatString("2015-06-03", DateUtil.SIMPLE_YMD_FORMAT,DateUtil.SIMPLE_Y_M_D_FORMAT); 111 | */ 112 | public static String StringToFormatString (String time,String current_time_format,String new_time_format ) { 113 | //装换为时间戳 114 | Long timestamp = DateUtil.StringToTimestamp(time,current_time_format); 115 | 116 | //转换成字符串 117 | String string = DateUtil.TimestampToSting(timestamp,new_time_format); 118 | 119 | return string; 120 | } 121 | 122 | 123 | /** 124 | * date -> string 125 | * @param date 126 | * @param time_format 127 | * @return 128 | */ 129 | public static String DateToString (Date date,String time_format) { 130 | 131 | //设置格式化模式 132 | SimpleDateFormat simple_date_format = new SimpleDateFormat(time_format); 133 | 134 | //格式化日期 135 | String result = simple_date_format.format(date); 136 | 137 | return result; 138 | } 139 | 140 | 141 | /** 142 | * date -> Timestamp 143 | * @param date 144 | * @return Long 145 | */ 146 | public static Long DateToTimestamp (Date date) { 147 | return date.getTime(); 148 | } 149 | 150 | 151 | /** 152 | * 获取当前日期的 偏移天数 153 | * @param offset_day 154 | * @return 155 | */ 156 | public static Date getCalendarOffsetDateDay(int offset_day) { 157 | return DateUtil.calendarOffsetDateDay(offset_day,new Date()); 158 | } 159 | 160 | /** 161 | * 获取指定日期的 偏移天数 162 | * @param offset_day 163 | * @param curDate 164 | * @return 165 | */ 166 | public static Date getCalendarOffsetDateDay(int offset_day, Date curDate) { 167 | return DateUtil.calendarOffsetDateDay(offset_day,curDate); 168 | } 169 | 170 | /** 171 | * 获取指定偏移日期 172 | * @param offset_day 偏移天数,-1 表示昨天 1明天 2 后天,以此类推 173 | * @param curDate 指定日期 174 | * @return Date 175 | */ 176 | public static Date calendarOffsetDateDay (int offset_day, Date curDate) { 177 | Calendar c1 = Calendar.getInstance(); 178 | 179 | c1.setTime(curDate); // 设置当前日期 180 | c1.add(Calendar.DATE,offset_day); 181 | 182 | int year = c1.get(Calendar.YEAR); //获得年 183 | int month = c1.get(Calendar.MONTH) + 1; // 获得月份 184 | int date = c1.get(Calendar.DATE); // 获得日期 185 | int hours = c1.get(Calendar.HOUR_OF_DAY); // 获得小时 186 | int minute = c1.get(Calendar.MINUTE); // 获得分钟 187 | int second = c1.get(Calendar.SECOND); // 获得秒 188 | int day_of_week = c1.get(Calendar.DAY_OF_WEEK); //获得星期几(注意(这个与Date类是不同的):1代表星期日、2代表星期1、3代表星期二,以此类推) 189 | 190 | //Date 191 | return c1.getTime(); 192 | } 193 | 194 | 195 | /** 196 | * 获取当前时间戳 197 | * 1436768318923 198 | */ 199 | public static Long getNowTimestamp() { 200 | return System.currentTimeMillis(); 201 | } 202 | 203 | 204 | /** 205 | * 获取当前时间 , 可以指定格式 206 | * @return 207 | */ 208 | public static String getCurTime(String time_format) { 209 | return DateUtil.TimestampToSting(DateUtil.getNowTimestamp(),time_format); 210 | } 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | public static Date addDateOneDay(Date date) { 219 | if (null == date) { 220 | return date; 221 | } 222 | Calendar c = Calendar.getInstance(); 223 | 
c.setTime(date); //设置当前日期 224 | c.add(Calendar.DATE, 1); //日期加1天 225 | // c.add(Calendar.DATE, -1); //日期减1天 226 | date = c.getTime(); 227 | return date; 228 | } 229 | 230 | public static void main (String[] args) throws ParseException { 231 | 232 | Long timestamp = DateUtil.StringToTimestamp("2010-06-25 02:24:10",DateUtil.SIMPLE_FORMAT); 233 | String t2s = DateUtil.TimestampToSting(timestamp,DateUtil.SIMPLE_FORMAT); 234 | 235 | Date date = DateUtil.TimestampToDate(timestamp); 236 | System.out.println(t2s); 237 | 238 | } 239 | } 240 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/common/util/DebugUtil.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util; 2 | 3 | public class DebugUtil { 4 | 5 | public static void dump (Object obj,int i) { 6 | DebugUtil.print(obj); 7 | DebugUtil.exit(i); 8 | } 9 | 10 | public static void dump (Object obj) { 11 | DebugUtil.print(obj); 12 | } 13 | 14 | public static void exit(int i) { 15 | System.exit(i); 16 | } 17 | 18 | public static void print (Object obj) { 19 | System.out.println(DebugUtil.getType(obj)); 20 | System.out.println(obj); 21 | } 22 | 23 | public static String getType(Object o){ 24 | return o.getClass().toString(); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/common/util/FileUtil.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util; 2 | 3 | import java.io.*; 4 | 5 | public class FileUtil { 6 | 7 | /** 8 | * 向文件写数据 9 | * @param file_name 10 | * @param data 11 | * @param append 是否追加 12 | */ 13 | public static void fileOutputStream (String file_name,String data,boolean append) { 14 | 15 | try { 16 | 17 | File f = new File(file_name); 18 | FileOutputStream fop = new FileOutputStream(f,append); 19 | 20 | //构建OutputStreamWriter对象,参数可以指定编码,默认为操作系统默认编码,windows上是gbk 21 | OutputStreamWriter writer = new OutputStreamWriter(fop, "UTF-8"); 22 | 23 | //写入到缓冲区 24 | writer.append(data); 25 | 26 | //关闭写入流,同时会把缓冲区内容写入文件,所以上面的注释掉 27 | writer.close(); 28 | 29 | //关闭输出流,释放系统资源 30 | fop.close(); 31 | 32 | } catch (IOException e) { 33 | // TODO Auto-generated catch block 34 | e.printStackTrace(); 35 | } 36 | } 37 | 38 | 39 | /** 40 | * 读取文件 41 | * @param file_name 42 | * @return String 43 | */ 44 | public static String fileInputStream(String file_name){ 45 | 46 | StringBuilder sb = new StringBuilder(); 47 | 48 | try { 49 | File f = new File(file_name); 50 | 51 | //构建FileInputStream对象 52 | FileInputStream fip = new FileInputStream(f); 53 | 54 | // InputStreamReader 逐行读取六中的数据,编码与写入相同 55 | InputStreamReader reader = new InputStreamReader(fip, "UTF-8"); 56 | 57 | //一行行读去文件数据 58 | while (reader.ready()) { 59 | sb.append((char) reader.read()); 60 | } 61 | 62 | //关闭读取流 63 | reader.close(); 64 | 65 | //关闭输出流,释放系统资源 66 | fip.close(); 67 | 68 | } catch (IOException e) { 69 | // TODO Auto-generated catch block 70 | e.printStackTrace(); 71 | } 72 | 73 | return sb.toString(); 74 | } 75 | 76 | 77 | public static boolean deleteFile (String file_name) { 78 | boolean isDelete = false; 79 | try{ 80 | 81 | File file = new File(file_name); 82 | 83 | if(file.delete()){ 84 | isDelete = true; 85 | }else{ 86 | isDelete = false; 87 | } 88 | 89 | }catch(Exception e){ 90 | 91 | e.printStackTrace(); 92 | 93 | } 94 | return isDelete; 95 | } 96 | 97 | public static void main(String[] args) { 98 | 
//FileUtil a = new FileUtil(); 99 | //a.fileOutputStream("/tmp/aaa","中文输入"); 100 | //a.fileOutputStream("/tmp/aaa","\r\n"); 101 | //System.out.println(a.fileInputStream("/tmp/aaa")); 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/common/util/JavaJsonUtil.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util; 2 | 3 | import org.json.simple.JSONObject; 4 | import org.json.simple.JSONArray; 5 | import org.json.simple.parser.ParseException; 6 | import org.json.simple.parser.JSONParser; 7 | 8 | /** 9 | * Json 转换 10 | * @author Jason 11 | * JSONObject 就是 java.util.Map 12 | * JSONArray 就是 java.util.List 13 | * 使用 Map 或 List 的标准操作访问它们 14 | */ 15 | public class JavaJsonUtil { 16 | 17 | 18 | /** 19 | * JsonStr {} 转换成 java.util.Map 20 | * SONObject 就是 java.util.Map 21 | * @param strJson 22 | * @return 23 | */ 24 | public static JSONObject JsonStrToMap (String strJson) { 25 | JSONParser parser = new JSONParser(); 26 | JSONObject obj = null; 27 | 28 | try{ 29 | obj = (JSONObject) parser.parse(strJson); 30 | } catch(ParseException pe){ 31 | System.out.println("position: " + pe.getPosition()); 32 | System.out.println(pe); 33 | } 34 | //System.out.println(obj); 35 | return obj; 36 | } 37 | 38 | 39 | /** 40 | * JsonStr [{},{}] 转换成 java.util.List 41 | * JSONArray 就是 java.util.List 42 | * @param strJson 43 | * @return 44 | */ 45 | public static JSONArray JsonStrToArray(String strJson) { 46 | JSONParser parser = new JSONParser(); 47 | JSONArray obj = null; 48 | 49 | try{ 50 | obj = (JSONArray) parser.parse(strJson); 51 | } catch(ParseException pe){ 52 | System.out.println("position: " + pe.getPosition()); 53 | System.out.println(pe); 54 | } 55 | //System.out.println(obj); 56 | return obj; 57 | } 58 | 59 | /** 60 | * JSONObject 转换成 json 字符串 61 | * @param obj 62 | * @return 63 | */ 64 | public static String MapToJsonStr(JSONObject obj) { 65 | return obj.toJSONString(); 66 | } 67 | 68 | /** 69 | * JSONArray 转换成 Json 字符串 70 | * @param obj 71 | * @return 72 | */ 73 | public static String ArrayToJsonStr(JSONArray obj) { 74 | return obj.toJSONString(); 75 | } 76 | 77 | public static void main(String[] args) { 78 | JSONObject obja = JavaJsonUtil.JsonStrToMap("{\"a\":\"1\"}"); 79 | obja.put("a", "1"); 80 | //JavaJsonUtil.JsonStrToArray("[{\"a\":\"1\"},{\"a\":\"1\"}]"); 81 | //JavaJsonUtil.MapToJson(); 82 | System.exit(0); 83 | 84 | JSONParser parser=new JSONParser(); 85 | String s = "[0,{\"1\":{\"2\":{\"3\":{\"4\":[5,{\"6\":7}]}}}}]"; 86 | try{ 87 | Object obj = parser.parse(s); 88 | JSONArray array = (JSONArray)obj; 89 | System.out.println("The 2nd element of array"); 90 | System.out.println(array.get(1)); 91 | System.out.println(); 92 | JSONObject obj2 = (JSONObject)array.get(1); 93 | obj2.put(2, "a"); 94 | System.out.println("Field \"1\""); 95 | System.out.println(obj2.get(2)); 96 | 97 | s = "{}"; 98 | obj = parser.parse(s); 99 | System.out.println(obj); 100 | 101 | s= "[5,]"; 102 | obj = parser.parse(s); 103 | System.out.println(obj); 104 | 105 | s= "[5,,2]"; 106 | obj = parser.parse(s); 107 | System.out.println(obj); 108 | }catch(ParseException pe){ 109 | System.out.println("position: " + pe.getPosition()); 110 | System.out.println(pe); 111 | } 112 | } 113 | 114 | 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/common/util/PropertyUtil.java: 
-------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util; 2 | 3 | import java.io.BufferedInputStream; 4 | import java.io.FileInputStream; 5 | import java.io.FileNotFoundException; 6 | import java.io.FileOutputStream; 7 | import java.io.IOException; 8 | import java.io.InputStream; 9 | import java.io.OutputStream; 10 | import java.io.Reader; 11 | import java.util.Properties; 12 | import java.io.InputStreamReader; 13 | import java.io.BufferedReader; 14 | 15 | 16 | public class PropertyUtil { 17 | 18 | //属性文件的路径 19 | private String profilepath = ""; 20 | 21 | private Properties props = new Properties(); 22 | 23 | /** 24 | * 设置文件输入流(使用这种方式, 可以在 jar 内部读取文件等操作) 25 | */ 26 | public void setFileInputStream(Reader reader) throws IOException { 27 | this.props.load(reader); 28 | } 29 | 30 | 31 | /** 32 | * 设置文件路径(使用这种方式, jar 内部会读不到文件, 可以 jar 外部操作文件) 33 | * @param filePath 34 | */ 35 | public void setFilePath(String filePath) { 36 | this.profilepath = filePath; 37 | 38 | try { 39 | props.load(new FileInputStream(this.profilepath)); 40 | } catch (FileNotFoundException e) { 41 | e.printStackTrace(); 42 | System.exit(-1); 43 | } catch (IOException e) { 44 | System.exit(-1); 45 | } 46 | } 47 | 48 | 49 | /** 50 | * 读取属性文件中相应键的值 51 | * @param key 52 | * 主键 53 | * @return String 54 | */ 55 | public String getKeyValue(String key) { 56 | return props.getProperty(key); 57 | } 58 | 59 | 60 | /** 61 | * 根据主键key读取主键的值value 62 | * @param filePath 属性文件路径 63 | * @param key 键名 64 | */ 65 | public String readValue(String filePath, String key) { 66 | Properties props = new Properties(); 67 | try { 68 | InputStream in = new BufferedInputStream(new FileInputStream( 69 | filePath)); 70 | props.load(in); 71 | String value = props.getProperty(key); 72 | System.out.println(key +"键的值是:"+ value); 73 | return value; 74 | } catch (Exception e) { 75 | e.printStackTrace(); 76 | return null; 77 | } 78 | } 79 | 80 | 81 | /** 82 | * 更新(或插入)一对properties信息(主键及其键值) 83 | * 如果该主键已经存在,更新该主键的值; 84 | * 如果该主键不存在,则插件一对键值。 85 | * @param keyname 键名 86 | * @param keyvalue 键值 87 | */ 88 | public void writeProperties(String keyname,String keyvalue) { 89 | try { 90 | // 调用 Hashtable 的方法 put,使用 getProperty 方法提供并行性。 91 | // 强制要求为属性的键和值使用字符串。返回值是 Hashtable 调用 put 的结果。 92 | OutputStream fos = new FileOutputStream(profilepath); 93 | props.setProperty(keyname, keyvalue); 94 | // 以适合使用 load 方法加载到 Properties 表中的格式, 95 | // 将此 Properties 表中的属性列表(键和元素对)写入输出流 96 | props.store(fos, "Update '" + keyname + "' value"); 97 | } catch (IOException e) { 98 | System.err.println("属性文件更新错误"); 99 | } 100 | } 101 | 102 | 103 | /** 104 | * 更新properties文件的键值对 105 | * 如果该主键已经存在,更新该主键的值; 106 | * 如果该主键不存在,则插件一对键值。 107 | * @param keyname 键名 108 | * @param keyvalue 键值 109 | */ 110 | public void updateProperties(String keyname,String keyvalue) { 111 | try { 112 | props.load(new FileInputStream(profilepath)); 113 | // 调用 Hashtable 的方法 put,使用 getProperty 方法提供并行性。 114 | // 强制要求为属性的键和值使用字符串。返回值是 Hashtable 调用 put 的结果。 115 | OutputStream fos = new FileOutputStream(profilepath); 116 | props.setProperty(keyname, keyvalue); 117 | // 以适合使用 load 方法加载到 Properties 表中的格式, 118 | // 将此 Properties 表中的属性列表(键和元素对)写入输出流 119 | props.store(fos, "Update '" + keyname + "' value"); 120 | } catch (IOException e) { 121 | System.err.println("属性文件更新错误"); 122 | } 123 | } 124 | 125 | 126 | 127 | //测试代码 128 | public static void main(String[] args) throws IOException { 129 | System.out.println("123"); 130 | //返回读取指定资源的输入流 131 | InputStream is= 
PropertyUtil.class.getResourceAsStream("/conf_dev.properties"); 132 | BufferedReader br=new BufferedReader(new InputStreamReader(is)); 133 | String s=""; 134 | while((s=br.readLine())!=null) 135 | System.out.println(s); 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/common/util/mysql/JavaMysqlClient.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util.mysql; 2 | 3 | import java.util.Map; 4 | import java.util.HashMap; 5 | import java.util.List; 6 | import java.util.ArrayList; 7 | import java.util.Arrays; 8 | 9 | import java.sql.Connection; 10 | import java.sql.DriverManager; 11 | import java.sql.PreparedStatement; 12 | import java.sql.ResultSet; 13 | import java.sql.SQLException; 14 | import java.sql.Statement; 15 | 16 | public class JavaMysqlClient { 17 | 18 | private String url; 19 | private String user; 20 | private String psw; 21 | 22 | private Connection conn; 23 | 24 | static { 25 | try { 26 | Class.forName("com.mysql.jdbc.Driver"); 27 | } catch (ClassNotFoundException e) { 28 | e.printStackTrace(); 29 | throw new RuntimeException(e); 30 | } 31 | } 32 | 33 | public JavaMysqlClient(String url, String user, String psw) { 34 | this.url = url; 35 | this.user = user; 36 | this.psw = psw; 37 | } 38 | 39 | /** 40 | * Get (and lazily open) the database connection 41 | * 42 | * @return conn 43 | */ 44 | public Connection getConnection() { 45 | if (null == conn) { 46 | try { 47 | conn = DriverManager.getConnection(url, user, psw); 48 | } catch (SQLException e) { 49 | e.printStackTrace(); 50 | throw new RuntimeException(e); 51 | } 52 | } 53 | return conn; 54 | } 55 | 56 | /** 57 | * Query data 58 | * 59 | * @param sql 60 | * SQL to execute 61 | * @param fields 62 | * comma-separated field names to read from each row 63 | * @return List<Map<String, String>> 64 | * @throws SQLException 65 | */ 66 | public List<Map<String, String>> select(String sql, String fields) { 67 | 68 | List<Map<String, String>> rsList = new ArrayList<Map<String, String>>(); 69 | 70 | // TODO: accept the field list as a String[] instead of a comma-separated String 71 | String[] fieldsArr = fields.split(","); 72 | 73 | try { 74 | // prepare the statement on the shared connection 75 | PreparedStatement ptmt = getConnection().prepareStatement(sql); 76 | 77 | // execute the SQL and fetch the result set 78 | ResultSet rs = ptmt.executeQuery(); 79 | 80 | // holds the field values of one row 81 | Map<String, String> rowData; 82 | // iterate over the result set 83 | while (rs.next()) { 84 | rowData = new HashMap<String, String>(); 85 | for (String field : fieldsArr) { 86 | rowData.put(field, rs.getString(field)); 87 | } 88 | // append the row to the result list 89 | rsList.add(rowData); 90 | } 91 | 92 | } catch (SQLException e) { 93 | e.printStackTrace(); 94 | } 95 | 96 | return rsList; 97 | } 98 | 99 | /** 100 | * Get the row count of a query 101 | * 102 | * Expects SQL like: SELECT COUNT(*) AS cn FROM tbl 103 | *
104 | * 105 | * @param sql 106 | * @return int 107 | * @throws SQLException 108 | */ 109 | public int count(String sql) { 110 | int cn = 0; 111 | 112 | try { 113 | PreparedStatement ptmt = getConnection().prepareStatement(sql); 114 | ResultSet rs = ptmt.executeQuery(); 115 | rs.next(); 116 | cn = rs.getInt("cn"); 117 | } catch (SQLException e) { 118 | // TODO Auto-generated catch block 119 | e.printStackTrace(); 120 | } 121 | 122 | return cn; 123 | 124 | } 125 | 126 | /** 127 | * 执行 Sql 128 | * 129 | * @param sql 130 | * @return boolean 131 | * @throws SQLException 132 | */ 133 | public boolean execute(String sql) { 134 | boolean rs = false; 135 | try { 136 | PreparedStatement ptmt = getConnection().prepareStatement(sql); 137 | rs = ptmt.execute(sql); 138 | ptmt.close(); 139 | } catch (SQLException e) { 140 | e.printStackTrace(); 141 | } 142 | return rs; 143 | } 144 | 145 | /** 146 | * 释放资源 147 | * 148 | * @param conn 149 | * @param pstmt 150 | * @param rs 151 | */ 152 | public void closeResources(Connection conn, PreparedStatement pstmt, ResultSet rs) { 153 | if (null != rs) { 154 | try { 155 | rs.close(); 156 | } catch (SQLException e) { 157 | e.printStackTrace(); 158 | throw new RuntimeException(e); 159 | } finally { 160 | if (null != pstmt) { 161 | try { 162 | pstmt.close(); 163 | } catch (SQLException e) { 164 | e.printStackTrace(); 165 | throw new RuntimeException(e); 166 | } finally { 167 | if (null != conn) { 168 | try { 169 | conn.close(); 170 | } catch (SQLException e) { 171 | e.printStackTrace(); 172 | throw new RuntimeException(e); 173 | } 174 | } 175 | } 176 | } 177 | } 178 | } 179 | } 180 | 181 | } 182 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/common/util/parse/ParseMobileAgent.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util.parse; 2 | 3 | import org.apache.hadoop.hive.ql.exec.UDF; 4 | import java.util.regex.Pattern; 5 | import java.util.regex.Matcher; 6 | 7 | 8 | /** 9 | * 解析 accessLog 中的 Http 头的 数据 10 | * app=i-broker;av=1.0.0;ccid=1;gcid=1;ch=A01;lng=121.526063;lat=31.219871;ip=;mac=None;net=WIFI;p=iOS;pm=iPhone 4S;osv=7.1;dvid=32A02E76EC8C-4D78-B331-201503251125; 11 | * 这个数据是 APP 请求的时候发送的 12 | */ 13 | public class ParseMobileAgent { 14 | /** 15 | * @param s sting 16 | * @param p pattern 17 | * @return 18 | */ 19 | public static String evaluate(String s, String p) { 20 | if (s == null) { return ""; } 21 | String base_p = p+"=([^;]+)"; 22 | 23 | String result = ""; 24 | 25 | String first_result = parseAgent(s, ";"+base_p);//先执行严格匹配,防止取p的时候把app的值取出来 26 | 27 | if(first_result == ""){ 28 | String second_result = parseAgent(s, base_p); 29 | result = second_result; 30 | }else{ 31 | result = first_result; 32 | } 33 | 34 | return result; 35 | } 36 | public static String parseAgent(String s,String p){ 37 | if (s == null) { return ""; } 38 | Pattern pattern = Pattern.compile(p); 39 | Matcher matcher=pattern.matcher(s); 40 | 41 | if(matcher.find()){ 42 | return matcher.group(1); 43 | } 44 | return ""; 45 | } 46 | // public static void main(String[] args){ 47 | // String s = "app=i-broker;av=1.0.0;ccid=1;gcid=1;ch=A01;lng=121.526063;lat=31.219871;ip=;mac=None;net=WIFI;p=iOS;pm=iPhone 4S;osv=7.1;dvid=32A02E76EC8C-4D78-B331-201503251125;"; 48 | // ParseMobileAgent obj = new ParseMobileAgent(); 49 | // System.out.println(obj.evaluate(s,"app"));//开头的值 50 | // System.out.println(obj.evaluate(s,"p"));//取重复值 51 | // 
System.out.println(obj.evaluate(s,"gcid"));//中间的值 52 | // System.out.println(obj.evaluate(s,"dvid"));//结尾的值 53 | // System.out.println(obj.evaluate(s,"ip"));//取空值 54 | // System.out.println(obj.evaluate(s,"notexist"));//不存在的值 55 | // } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/common/util/parse/ParseMobileToken.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util.parse; 2 | 3 | import java.net.URLDecoder; 4 | import java.util.Map; 5 | 6 | import javax.crypto.Cipher; 7 | import javax.crypto.spec.IvParameterSpec; 8 | import javax.crypto.spec.SecretKeySpec; 9 | 10 | import org.apache.commons.codec.binary.Base64; 11 | import org.codehaus.jackson.JsonParseException; 12 | import org.codehaus.jackson.map.ObjectMapper; 13 | 14 | /** 15 | * 解密 token 16 | */ 17 | public class ParseMobileToken { 18 | 19 | public static String evaluate(String s, String index) throws Exception { 20 | if (s == null || s.length() <= 0) { 21 | return ""; 22 | } 23 | 24 | String token = Decrypt(s); 25 | 26 | if (token != null && token.length() > 0) {// json decode 27 | Map> maps; 28 | ObjectMapper objectMapper = new ObjectMapper(); 29 | try { 30 | try { 31 | maps = objectMapper.readValue(token, Map.class); 32 | if (maps.containsKey(index)) { 33 | return String.valueOf(maps.get(index)); 34 | } 35 | } catch (JsonParseException e) { 36 | maps = objectMapper.readValue(URLDecoder.decode(token, "utf-8"), Map.class); 37 | if (maps.containsKey(index)) { 38 | return String.valueOf(maps.get(index)); 39 | } 40 | } 41 | } catch (Exception e) { 42 | System.err.println(e.toString()); 43 | return ""; 44 | } 45 | } 46 | 47 | return ""; 48 | } 49 | 50 | public static String Decrypt(String data) throws Exception { 51 | try { 52 | String key = "12345678123456xx"; 53 | String iv = "12345678123456xx"; 54 | 55 | byte[] encrypted1 = new Base64().decode(data); 56 | 57 | Cipher cipher = Cipher.getInstance("AES/CBC/NoPadding"); 58 | SecretKeySpec keyspec = new SecretKeySpec(key.getBytes(), "AES"); 59 | IvParameterSpec ivspec = new IvParameterSpec(iv.getBytes()); 60 | 61 | cipher.init(Cipher.DECRYPT_MODE, keyspec, ivspec); 62 | try { 63 | byte[] original = cipher.doFinal(encrypted1); 64 | String originalString = new String(original); 65 | return originalString; 66 | } catch (Exception e) { 67 | System.err.println(e.toString()); 68 | return null; 69 | } 70 | } catch (Exception e) { 71 | e.printStackTrace(); 72 | return null; 73 | } 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/hadoop/hive/HiveClient.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.hadoop.hive; 2 | 3 | 4 | import java.util.Map; 5 | import java.util.HashMap; 6 | import java.util.List; 7 | import java.util.ArrayList; 8 | import java.util.Arrays; 9 | 10 | 11 | import java.sql.SQLException; 12 | import java.sql.Connection; 13 | import java.sql.ResultSet; 14 | import java.sql.Statement; 15 | import java.sql.DriverManager; 16 | import org.apache.hive.jdbc.HiveDriver; 17 | 18 | 19 | public class HiveClient { 20 | 21 | private static String driverName = "org.apache.hive.jdbc.HiveDriver"; 22 | 23 | /** 24 | * 获取连接 25 | */ 26 | private Connection connection; 27 | public Connection getConnection() { 28 | return connection; 29 | } 30 | public void setConnection(Connection conn) { 31 | 
this.connection = conn; 32 | } 33 | 34 | 35 | 36 | public HiveClient(String url, String user, String password) throws SQLException { 37 | // load the JDBC driver class 38 | try { 39 | Class.forName(driverName); 40 | } catch (ClassNotFoundException e) { 41 | e.printStackTrace(); 42 | } 43 | 44 | // open the connection 45 | Connection con = DriverManager.getConnection(url, user, password); 46 | this.setConnection(con); 47 | 48 | // create a statement handle on the connection 49 | //Statement stmt = con.createStatement(); 50 | //this.setStmt(stmt); 51 | } 52 | 53 | 54 | /** 55 | * Execute the given SQL 56 | * @param sql 57 | * @return Boolean 58 | */ 59 | public Boolean execute (String sql) { 60 | Boolean rs = false; 61 | try { 62 | Statement stmt = this.getConnection().createStatement(); 63 | rs = stmt.execute(sql); 64 | stmt.close(); 65 | } catch (SQLException e) { 66 | e.printStackTrace(); 67 | } 68 | return rs; 69 | } 70 | 71 | 72 | /** 73 | * Query data 74 | * @param sql 75 | * @param fields comma-separated field names 76 | * @return List<Map<String, String>> 77 | * @throws SQLException 78 | * 79 | * Iterate over the result (e.g. from Scala) like: 80 | * for (i <- 0 to rsData.size() - 1) { 81 | println(rsData.get(i).get("visit_item_invs_a")) 82 | } 83 | * 84 | */ 85 | public List<Map<String, String>> select(String sql, String fields) throws SQLException { 86 | // holds the result rows 87 | List<Map<String, String>> listResult = new ArrayList<Map<String, String>>(); 88 | 89 | ResultSet res = null; 90 | try { 91 | Statement stmt = this.getConnection().createStatement(); 92 | 93 | res = stmt.executeQuery(sql); 94 | 95 | // split the field list into an array 96 | String[] arrFields = fields.split(","); 97 | 98 | // iterate over each row 99 | while (res.next()) { 100 | // holds one row 101 | Map<String, String> mapRowData = new HashMap<String, String>(); 102 | 103 | // collect the field values 104 | for (String field : arrFields) { 105 | mapRowData.put(field, res.getString(field)); 106 | } 107 | // append to the list 108 | listResult.add(mapRowData); 109 | 110 | mapRowData = null; 111 | } 112 | 113 | stmt.close(); 114 | res.close(); 115 | 116 | } catch (SQLException e) { 117 | e.printStackTrace(); 118 | } 119 | 120 | //res.close(); 121 | /** 122 | for (Map<String, String> rs : listResult) { 123 | System.out.println(rs.get("broker_id")); 124 | } 125 | */ 126 | 127 | return listResult; 128 | } 129 | 130 | 131 | /** 132 | * Close the connection 133 | */ 134 | public void closeConnection() { 135 | try { 136 | this.getConnection().close(); 137 | } catch (SQLException e) { 138 | e.printStackTrace(); 139 | } 140 | } 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | /** 152 | * Test method 153 | * @param args 154 | * @throws SQLException 155 | */ 156 | public static void main(String[] args) throws SQLException { 157 | try { 158 | Class.forName(driverName); 159 | } catch (ClassNotFoundException e) { 160 | // TODO Auto-generated catch block 161 | e.printStackTrace(); 162 | System.exit(1); 163 | } 164 | //replace "hive" here with the name of the user the queries should run as 165 | Connection con = DriverManager.getConnection("jdbc:hive2://localhost:10000/default", "hive", ""); 166 | Statement stmt = con.createStatement(); 167 | String tableName = "testHiveDriverTable"; 168 | stmt.execute("drop table if exists " + tableName); 169 | stmt.execute("create table " + tableName + " (key int, value string)"); 170 | // show tables 171 | String sql = "show tables '" + tableName + "'"; 172 | System.out.println("Running: " + sql); 173 | ResultSet res = stmt.executeQuery(sql); 174 | if (res.next()) { 175 | System.out.println(res.getString(1)); 176 | } 177 | // describe table 178 | sql = "describe " + tableName; 179 | System.out.println("Running: " + sql); 180 | res = stmt.executeQuery(sql); 181 | while (res.next()) { 182 | System.out.println(res.getString(1) + "\t" + res.getString(2)); 183 | } 184
| 185 | // load data into table 186 | // NOTE: filepath has to be local to the hive server 187 | // NOTE: /tmp/a.txt is a ctrl-A separated file with two fields per line 188 | String filepath = "/tmp/a.txt"; 189 | sql = "load data local inpath '" + filepath + "' into table " + tableName; 190 | System.out.println("Running: " + sql); 191 | stmt.execute(sql); 192 | 193 | // select * query 194 | sql = "select * from " + tableName; 195 | System.out.println("Running: " + sql); 196 | res = stmt.executeQuery(sql); 197 | while (res.next()) { 198 | System.out.println(String.valueOf(res.getInt(1)) + "\t" + res.getString(2)); 199 | } 200 | 201 | // regular hive query 202 | sql = "select count(1) from " + tableName; 203 | System.out.println("Running: " + sql); 204 | res = stmt.executeQuery(sql); 205 | while (res.next()) { 206 | System.out.println(res.getString(1)); 207 | } 208 | } 209 | } 210 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/service/Conf.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.service; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.InputStreamReader; 6 | 7 | import java.util.Map; 8 | import java.util.HashMap; 9 | 10 | import com.angejia.dw.common.util.PropertyUtil; 11 | 12 | public class Conf { 13 | 14 | // configuration file reader 15 | PropertyUtil property = new PropertyUtil(); 16 | 17 | /** 18 | * Set the environment; each environment uses its own configuration file 19 | * @throws IOException 20 | */ 21 | public void setEnv(String env) { 22 | 23 | // name of the configuration file to read 24 | String confName = "/conf_" + env + ".properties"; 25 | 26 | // open an input stream for the resource file 27 | InputStream classPath = Conf.class.getResourceAsStream(confName); 28 | InputStreamReader inputStreamReader = new InputStreamReader(classPath); 29 | 30 | // hand the stream to the property reader 31 | try { 32 | property.setFileInputStream(inputStreamReader); 33 | } catch (IOException e) { 34 | // TODO Auto-generated catch block 35 | e.printStackTrace(); 36 | } 37 | } 38 | 39 | 40 | /** 41 | * Get the Spark configuration 42 | * @return Map<String, String> 43 | */ 44 | public Map<String, String> getSparkConf(){ 45 | Map<String, String> data = new HashMap<String, String>(); 46 | data.put("sparkThriftServerUrl", property.getKeyValue("spark.thrift.server.url")); 47 | data.put("sparkThriftServerUser", property.getKeyValue("spark.thrift.server.user")); 48 | data.put("sparkThriftServerPass", property.getKeyValue("spark.thrift.server.pass")); 49 | 50 | return data; 51 | } 52 | 53 | 54 | /** 55 | * Get the elasticsearch configuration 56 | * @return 57 | */ 58 | public Map<String, String> getElasticsearchMasterConf(){ 59 | Map<String, String> data = new HashMap<String, String>(); 60 | data.put("elasticsearchMasterHost", property.getKeyValue("elasticsearch.master.host")); 61 | data.put("elasticsearchMasterPort", property.getKeyValue("elasticsearch.master.port")); 62 | data.put("elasticsearchMasterCluster", property.getKeyValue("elasticsearch.master.cluster")); 63 | return data; 64 | } 65 | 66 | 67 | /** 68 | * Get the product (business) MySQL configuration 69 | * @return 70 | */ 71 | public Map<String, String> getProductMysqDBInfo(){ 72 | Map<String, String> data = new HashMap<String, String>(); 73 | data.put("host", property.getKeyValue("productMysqlDB.host")); 74 | data.put("account", property.getKeyValue("productMysqlDB.account")); 75 | data.put("password", property.getKeyValue("productMysqlDB.password")); 76 | data.put("defaultDB", property.getKeyValue("productMysqlDB.defaultDB")); 77 | return data; 78 | } 79 | 80 | /** 81 | * Get the DW MySQL configuration 82 | * @return 83 | */ 84 | public Map<String, String> getDwMysqDBInfo(){ 85 | Map<String, String> data = new HashMap<String, String>(); 86 | data.put("host", 
property.getKeyValue("biMysqlDB.host")); 87 | data.put("account", property.getKeyValue("biMysqlDB.account")); 88 | data.put("password", property.getKeyValue("biMysqlDB.password")); 89 | data.put("defaultDB", property.getKeyValue("biMysqlDB.defaultDB")); 90 | return data; 91 | } 92 | 93 | } 94 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/service/property/model/Inventory.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.service.property.model; 2 | 3 | public class Inventory { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/service/user/UserService.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.service.user; 2 | 3 | public class UserService { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /src/main/resources/conf_dev.properties: -------------------------------------------------------------------------------- 1 | # HDFS SERVER 2 | HDFSServer=namenode:8020 3 | 4 | # zookeeper 5 | zookeeperQuorum=namenode:2181,datanode01:2181,datanode02:2181 6 | 7 | # kafka 8 | kafkaServerBrokerList=dwtest:9092 9 | 10 | # product Mysql DB 11 | productMysqlDB.host=192.168.169.12 12 | productMysqlDB.account=angejia 13 | productMysqlDB.password=angejia123 14 | productMysqlDB.defaultDB=angejia 15 | 16 | # bi Mysql DB 17 | biMysqlDB.host=dwtest 18 | biMysqlDB.account=root 19 | biMysqlDB.password=root 20 | biMysqlDB.defaultDB=da_db 21 | 22 | # hive 23 | hive.metastore.uris=thrift://namenode:9083 24 | hive.thrift.server.url=jdbc:hive2://NameNode:10000/default 25 | hive.thrift.server.user=dwadmin 26 | hive.thrift.server.pass=dwadmin 27 | 28 | # spark 29 | spark.thrift.server.url=jdbc:hive2://NameNode:10000/default 30 | spark.thrift.server.user=dwadmin 31 | spark.thrift.server.pass=dwadmin 32 | 33 | # elasticsearch cluster 34 | elasticsearch.master.host=dwtest 35 | elasticsearch.master.port=9300 36 | elasticsearch.master.cluster=angejia-dw-es 37 | 38 | -------------------------------------------------------------------------------- /src/main/resources/conf_online.properties: -------------------------------------------------------------------------------- 1 | # HDFS SERVER 2 | HDFSServer=uhadoop-ociicy-master1:8020 3 | 4 | # zookeeper 5 | zookeeperQuorum=uhadoop-ociicy-master1:2181,uhadoop-ociicy-master2:2181,uhadoop-ociicy-core1:2181 6 | 7 | # ukafka cluster 8 | #kafkaServerBrokerList=ukafka-uiu1lt-1-bj03.service.ucloud.cn:9092,ukafka-uiu1lt-2-bj03.service.ucloud.cn:9092,ukafka-uiu1lt-3-bj03.service.ucloud.cn:9092 9 | kafkaServerBrokerList=bi4:9092 10 | 11 | # product Mysql DB 12 | productMysqlDB.host=agjdb2-bi 13 | productMysqlDB.account=angejia_dw 14 | productMysqlDB.password=Th872havAyaxEmEB 15 | productMysqlDB.defaultDB=angejia 16 | 17 | # bi Mysql DB 18 | biMysqlDB.host=angejia-bi-db 19 | biMysqlDB.account=hadoop 20 | biMysqlDB.password=angejia888 21 | biMysqlDB.defaultDB=da_db 22 | 23 | # hive 24 | hive.metastore.uris=thrift://uhadoop-ociicy-master1:9083,thrift://uhadoop-ociicy-master2:9083 25 | hive.thrift.server.url=jdbc:hive2://uhadoop-ociicy-master2:10000/dw_db 26 | hive.thrift.server.user=dwadmin 27 | hive.thrift.server.pass=dwadmin 28 | 29 | # spark 30 | spark.thrift.server.url=jdbc:hive2://uhadoop-ociicy-task4:10002/dw_db 31 | spark.thrift.server.user=hadoop 32 | 
spark.thrift.server.pass=hadoop 33 | 34 | # elasticsearch cluster 35 | elasticsearch.master.host=bi4 36 | elasticsearch.master.port=9300 37 | elasticsearch.master.cluster=angejia-dw-es 38 | 39 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=WARN,console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.spark-project.jetty=WARN 10 | log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR 11 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 12 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 13 | 14 | #log4j.logger.org.apache.spark.sql.SQLContext=TRACE 15 | #log4j.logger.org.apache.spark.sql.catalyst.analysis.Analyzer=TRACE 16 | #log4j.logger.org.apache.spark=TRACE 17 | #log4j.logger.org.apache.spark.storage.BlockManagerMasterActor=WARN 18 | #log4j.logger.org.apache.spark.HeartbeatReceiver=WARN 19 | #log4j.logger.org.apache.spark.scheduler.local.LocalActor=WARN -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/common/util/JsonUtil.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util 2 | 3 | import scala.util.parsing.json.JSON 4 | 5 | // play Json 6 | import play.api.libs.json._ 7 | import play.api.libs.json.JsValue 8 | import play.api.libs.json.JsString 9 | import play.api.libs.json.JsArray 10 | import play.api.libs.json.JsObject 11 | import play.api.libs.json.JsResult 12 | import play.api.libs.json.Reads._ 13 | import play.api.libs.json.Json.JsValueWrapper 14 | 15 | // spray Json 16 | import spray.json._ 17 | import DefaultJsonProtocol._ 18 | 19 | // smart Json 20 | import java.util 21 | import net.minidev.json.{JSONObject} 22 | import net.minidev.json.parser.JSONParser 23 | import scala.collection.JavaConversions.mapAsScalaMap 24 | import scala.collection.JavaConversions.mutableMapAsJavaMap 25 | 26 | object JsonUtil { 27 | 28 | /** 29 | * play 解析类库 30 | */ 31 | implicit val objectMapFormat = new Format[Map[String, Object]] { 32 | 33 | /** 34 | * 写, Map -> Json 操作 35 | */ 36 | def writes(map: Map[String, Object]): JsValue = 37 | Json.obj(map.map{case (s, o) => 38 | val ret:(String, JsValueWrapper) = o match { 39 | case _:String => s -> JsString(o.asInstanceOf[String]) 40 | case z:Map[String, String] => { 41 | s -> o.asInstanceOf[Map[String, String]] 42 | } 43 | case _ => s -> JsArray(o.asInstanceOf[List[String]].map(JsString(_))) 44 | } 45 | ret 46 | }.toSeq:_*) 47 | 48 | /** 49 | * 读,Json -> Map 50 | */ 51 | def reads(jv: JsValue): JsResult[Map[String, Object]] = 52 | JsSuccess(jv.as[Map[String, JsValue]].map{case (k, v) => 53 | k -> (v match { 54 | case s: JsString => s.as[String] 55 | case z: JsObject => { 56 | var rs: Map[String, String] = Map[String, String]() 57 | val jsonValue: JsValue = Json.parse(z.toString()) 58 | val mp = Json.fromJson[Map[String, String]](jsonValue) 59 | if (mp != null) { 60 | rs = mp.get 61 | } 62 | rs 63 | } 64 | case l => l.as[List[String]] 65 | }) 66 | }) 
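// --- Editor's sketch (not part of the original source): a round trip through the
// Format defined above. It assumes the Map values are String, List[String] or
// Map[String, String] -- the only shapes the writes/reads cases handle.
//   val sample: Map[String, Object] = Map(
//     "city"  -> "1",
//     "tags"  -> List("a", "b"),
//     "needs" -> Map("bedrooms" -> "2"))
//   val js: JsValue = Json.toJson(sample)                  // goes through writes above
//   val back = Json.fromJson[Map[String, Object]](js).get  // goes through reads above
//   assert(back("city") == "1")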
67 | } 68 | 69 | /** 70 | * play 类库 map -> json 71 | * 72 | * 使用前需要把 map 转换为不可变 map.toMap 73 | * 如果内部有嵌套 map 也需要转换为不可变 map. 74 | * val map = Map("a"-> Map("a"->"2").toMap).toMap 75 | * OR 76 | * val map = mapData.map(f => f._1 -> f._2.toMap).toMap 77 | */ 78 | def playMapToJson(map: Map[String, Object]) : String = { 79 | val jv: JsValue = Json.toJson(map) 80 | jv.toString() 81 | } 82 | 83 | /** 84 | * json -> map 不可变 Map : 85 | * 86 | * // 案例 87 | * val userNeedsBaseData = JsonUtil.playJsonToMap(userNeedsJson) // 返回的是一个 Map[String, Object] 88 | val userNeedsBaseDataFormat = userNeedsBaseData.map{case (k,v) => 89 | val curK = k 90 | // 把元祖 v 转换为 Map, 再把 map 转换为可变 Map 91 | val curV = scala.collection.mutable.Map(v.asInstanceOf[scala.collection.immutable.Map[String,String]].toSeq:_*) 92 | k -> curV 93 | } 94 | // 再把最外层的 Map 也转换为可变的 map 95 | var userNeedsBase = collection.mutable.Map(userNeedsBaseDataFormat.toSeq:_*).asInstanceOf[scala.collection.mutable.Map[String, Map[String, String]]] 96 | */ 97 | def playJsonToMap(jsonStr: String): Map[String, Object] = { 98 | val jsonValue: JsValue = Json.parse(jsonStr) 99 | val jr: JsResult[Map[String, Object]] = Json.fromJson[Map[String, Object]](jsonValue) 100 | jr.get 101 | } 102 | 103 | /** 104 | * Json -> JsValue 解析成 JsValue 对象 105 | * json.\("fieldName") 使用这种方式直接访问 106 | * 或者直接 .toString 即可访问完整的 107 | */ 108 | def playJsonToJsValue(jsonString: String) : JsValue = { 109 | val jsonValue: JsValue = Json.parse(jsonString) 110 | jsonValue 111 | } 112 | 113 | 114 | 115 | def playTest() : Unit = { 116 | // map 转换成为 String 117 | val map: Map[String, Object] = Map( 118 | "val1" -> "xxx", 119 | "val2" -> List("a", "b", "c"), 120 | "val3" -> "sss", 121 | "val4" -> List("d", "e", "f"), 122 | "val5" -> Map("a"->"1", "b"->"2", "c"->"3").toMap // 你懂得,转换为不可变 Map 123 | ) 124 | val jv: JsValue = Json.toJson(map) 125 | println(jv) // {"val1":"xxx","val3":"sss","val2":["a","b","c"],"val5":{"a":"1","b":"2","c":"3"},"val4":["d","e","f"]} 126 | 127 | // String 转换为 Map 128 | val jr: JsResult[Map[String, Object]] = Json.fromJson[Map[String, Object]](jv) 129 | println(jr.get) //Map(val1 -> xxx, val3 -> sss, val2 -> List(a, b, c), val5 -> Map(a -> 1, b -> 2, c -> 3), val4 -> List(d, e, f)) 130 | println(jr.get("val5").asInstanceOf[Map[String, String]].get("a")) 131 | 132 | 133 | val uesrTagData: Map[String, Map[String,String]] = Map[String, Map[String,String]]( 134 | "0" -> Map( 135 | "city" -> "1", 136 | "block" -> "1", 137 | "community" -> "1", 138 | "bedrooms" -> "2" 139 | ), 140 | "1" -> Map( 141 | "city" -> "1", 142 | "block" -> "1", 143 | "community" -> "1", 144 | "bedrooms" -> "2" 145 | ) 146 | ) 147 | val uesrTagToJson: JsValue = Json.toJson(uesrTagData) 148 | println(uesrTagToJson) 149 | val uesrTagToMap: JsResult[Map[String, Object]] = Json.fromJson[Map[String, Object]](uesrTagToJson) 150 | println(uesrTagToMap.getOrElse().asInstanceOf[Map[String, Map[String,String]]]) 151 | //exit 152 | } 153 | 154 | 155 | 156 | /** 157 | * scala 原生对象 json -> object 158 | */ 159 | def JsonToObj(jsonString: String) : Option[Any] = { 160 | val obj = JSON.parseFull(jsonString) 161 | obj 162 | } 163 | 164 | 165 | import scala.collection.mutable.Map 166 | /** 167 | * 将map转为json 168 | * @param map 输入格式 mutable.Map[String,Object] 169 | * @return 170 | * */ 171 | def smartMapToJsonStr(map : Map[String,Object]) : String = { 172 | val jsonString = JSONObject.toJSONString(map) 173 | jsonString 174 | } 175 | 176 | /** 177 | * 将 json 转化为 Map 178 | * @param json 输入json字符串 179 | * @return 180 | 
* */ 181 | def smartJsonStrToMap(json : String) : Map[String,Object] = { 182 | val map : Map[String,Object]= Map() 183 | val jsonParser =new JSONParser() 184 | 185 | //将string转化为jsonObject 186 | val jsonObj: JSONObject = jsonParser.parse(json).asInstanceOf[JSONObject] 187 | 188 | //获取所有键 189 | val jsonKey = jsonObj.keySet() 190 | 191 | val iter = jsonKey.iterator() 192 | 193 | while (iter.hasNext){ 194 | val field = iter.next() 195 | val value = jsonObj.get(field).toString 196 | 197 | if(value.startsWith("{")&&value.endsWith("}")){ 198 | val value = mapAsScalaMap(jsonObj.get(field).asInstanceOf[util.HashMap[String, String]]) 199 | map.put(field,value) 200 | }else{ 201 | map.put(field,value) 202 | } 203 | } 204 | map 205 | } 206 | 207 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/common/util/ListenerFile.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | class ListenerFile { 6 | 7 | /** 8 | * 读取小文件可以,但是针对大文件,会有部分问题 9 | * Exception in thread "main" java.lang.StackOverflowError 10 | * 调整 java -Xss512M 11 | * 从指定文件日期开始,监听文件的变化,并发送给回调函数 12 | * file : 需要监听文件路径,格式: /data/log/uba/lb/access.${date}.log 13 | * date : 指定文件日期(天) , 格式: 20160101 14 | * lineNum : 从多少行开始读 15 | * stepLength : 每次读取多少行, 如果每次读取 1 行写 0, 如果每次读取 100 行写 100 16 | * 17 | * callback : 回调函数 18 | * 19 | 20 | def readLogLine(result: Map[String,Any]): Unit = { 21 | println(result) 22 | } 23 | */ 24 | def listenerDateFile( 25 | file: String, 26 | date: String, 27 | lineNum: Int, 28 | stepLength: Int, 29 | callback: Map[String,Any] => Unit, // 回调函数 30 | isRecursive: Boolean = true // 是否递归调用 31 | ): Map[String,Any] = { 32 | 33 | // 当前运行日期 34 | var curDate = date 35 | 36 | // 当前读的文件 37 | val curReadFile = file.replace("${date}", date) 38 | 39 | // 等待执行 sh 命令 sed -n '5,7p' 40 | val startLine = lineNum 41 | val endLine = lineNum + stepLength - 1 42 | var readLineCmd = "sed -n " + startLine + "," + endLine + "p " + curReadFile 43 | val commandResult = ScriptUtil.runSyncCommand(readLineCmd) // 执行返回结果 44 | val commandCode = commandResult.get("code").get // 执行状态 45 | val curLine = commandResult.get("stdoutPut").get.toString() // 获取标准输出 46 | 47 | 48 | // 当前定位行数 49 | var curLineNum: Int = 0 50 | 51 | // 结果数据的行数 52 | val curLineResult = curLine.split("\n") 53 | val curLineResultLength = curLineResult.length // 一共读了多少行 54 | 55 | // 当行数分解的数组长度为 1, 并且第一元素的值为空 , 表示读取的数据为空了 56 | val rs = curLineResultLength <= 1 && curLineResult(0).length() == 0 // 为空是 true , 不为空是 false 57 | 58 | // 读取的行数为空, 日期是今天的 59 | if (rs == true && date == this.getCurDate()) { 60 | // 把位置定位到开始的时间 61 | curLineNum = startLine 62 | 63 | // 等待 3 秒后再执行 64 | TimeUnit.SECONDS.sleep(3); 65 | 66 | // 读取的行数为空, 日期不是当天的日期 67 | } else if (rs == true && date != this.getCurDate()) { 68 | // 当前日期增加 1 天,tomorrowDate 69 | curDate = this.getOffsetDate(1,date) 70 | 71 | // 文件从第一行开始读 72 | curLineNum = 1 73 | 74 | // 读取的行数不为空, 不是今天日期, 也不是隔天日期, 表示是正常累加的行数 75 | } else { 76 | if (curLineResultLength < stepLength) { 77 | TimeUnit.SECONDS.sleep(3); 78 | } 79 | 80 | // 则把行数定位到 开始行数 + 总共读取的行数 81 | curLineNum = startLine + curLineResultLength 82 | } 83 | 84 | 85 | // 返回的结果 86 | var result = Map( 87 | // 读到的文件 88 | "file" -> curReadFile, 89 | // 下一次开始读的行数 90 | "nextLineNum" -> curLineNum, 91 | // 读到的行内容 92 | "fileLineContent" -> curLine, 93 | // 文件模板 94 | "fileTemplate" -> file, 95 | // 日期 96 | "date" -> curDate, 97 
| // 读到的命令 98 | "readLineCmd" -> readLineCmd, 99 | // 命令返回的参数 100 | "commandResult" -> commandResult 101 | ) 102 | callback(result) // 回调函数 103 | 104 | if (isRecursive == true) { 105 | // 递归,从指定日期第 n 行开始读取数据 106 | this.listenerDateFile(file, curDate, curLineNum, stepLength,callback) 107 | } 108 | result 109 | 110 | } 111 | 112 | 113 | /** 114 | * While 方式监听文件变化 115 | * file : /data/log/uba/lb/access.${date}.log 监听的文件 116 | * date : 20160101 日期 117 | * lineNum: 行数 118 | * stepLength : 步长 119 | */ 120 | def listenerDateFileWhile( 121 | file: String, 122 | date: String, 123 | lineNum: Int, 124 | stepLength: Int, 125 | callback: Map[String,Any] => Unit // 回调函数 126 | ) : Unit = { 127 | var status = true 128 | 129 | // 当前运行时间 130 | var curDate = date 131 | 132 | // 当前读到的行数 133 | var curLineNum = lineNum 134 | 135 | var map = Map[String,Any](); 136 | while( status ){ 137 | map = this.listenerDateFile(file, curDate, curLineNum, stepLength, callback, false) 138 | curDate = map.get("date").get.toString() 139 | curLineNum = map.get("nextLineNum").get.toString().toInt 140 | } 141 | 142 | } 143 | 144 | 145 | // 日期增加 减少 1 天 146 | def getOffsetDate(offset: Int, dateStr: String): String = { 147 | // 字符日期转换为 Date 对象 148 | val date = DateUtil.StringToDate(dateStr, DateUtil.SIMPLE_YMD_FORMAT) 149 | // date 对象 + n 天 150 | val offsetDate = DateUtil.getCalendarOffsetDateDay(offset, date); 151 | // date 对象转换成 str 152 | DateUtil.DateToString(offsetDate, DateUtil.SIMPLE_YMD_FORMAT) 153 | } 154 | 155 | 156 | // 获取当前系统时间 157 | def getCurDate(): String = DateUtil.TimestampToSting(DateUtil.getNowTimestamp,DateUtil.SIMPLE_YMD_FORMAT) 158 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/common/util/RegexUtil.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util 2 | 3 | import scala.util.matching.Regex 4 | 5 | 6 | object RegexUtil { 7 | 8 | 9 | def findStrData(pattern: String, strData: String) : String = { 10 | var rs = "" 11 | 12 | // String 类的 r() 方法构造了一个Regex对象 13 | val patternObj: Regex = pattern.r 14 | 15 | // findFirstIn 方法找到首个匹配项 16 | val findRs: Option[String] = patternObj.findFirstIn(strData) 17 | 18 | if (!findRs.isEmpty) { 19 | //val patternObj(num,str) = string 20 | //rs = args 21 | 22 | // 模式匹配, 当前字符串匹配到了正则 23 | strData match{ 24 | case patternObj(str) => 25 | //println(str) 26 | rs = str 27 | case _=> 28 | println("Not matched") 29 | } 30 | } 31 | rs 32 | } 33 | 34 | 35 | def findStrDataBak(pattern: String, string: String) : String = { 36 | var rs = "" 37 | // 使用 Regex 构造对象 38 | val patternObj = new Regex(pattern) // 首字母可以是大写 S 或小写 s 39 | println(patternObj.findFirstIn(string)) 40 | 41 | println(patternObj findFirstIn string) 42 | rs 43 | } 44 | 45 | 46 | def test() : Unit = { 47 | 48 | // 构造 Regex 对象, 用 String 类的 r 方法即可 49 | val pattern = "Scala".r 50 | val str = "Scala is Scalable and cool" 51 | println(pattern findFirstIn str) // println(pattern.findFirstIn(str)) 52 | 53 | // 使用 Regex 构造对象 54 | val pattern2 = new Regex("(S|s)cala") // 首字母可以是大写 S 或小写 s 55 | val str2 = "Scala is scalable and cool" 56 | println(pattern2 findFirstIn str2) 57 | 58 | 59 | val filter_regex="/mobile/member/inventories/list[?](.*)".r 60 | val str3 = "/mobile/member/inventories/list?bedroom_id=2&city_id=1&sort_id=3&price_id=4&district_id=7&block_id=62&page=1&per_page=8" 61 | if (!filter_regex.findFirstIn(str3).isEmpty) { 62 | val filter_regex(pars) = str3 63 | println(pars) 64 | } 65 | 66 | } 
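  // --- Editor's sketch (not part of the original source): the same group-capture
  // idea as findStrData, but collecting every match via findAllMatchIn; the
  // pattern and input below are made-up examples.
  def findAllExample(): Unit = {
    val datePattern: Regex = """access\.(\d{8})\.log""".r
    val line = "rotate access.20160101.log to access.20160102.log"
    for (m <- datePattern.findAllMatchIn(line)) {
      println(m.group(1)) // prints 20160101, then 20160102
    }
  }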
67 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/common/util/ScFileUtil.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util 2 | 3 | import scala.io.Source 4 | 5 | object ScFileUtil { 6 | 7 | /** 8 | * Read a whole file into one string. 9 | * Typical use: val words = fileInputStream(path).split("\\s+") 10 | */ 11 | def fileInputStream(fileName: String, encoded: String = "UTF-8"): String = { 12 | val source = Source.fromFile(fileName, encoded) 13 | val contents = source.mkString 14 | contents 15 | } 16 | 17 | 18 | 19 | 20 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/common/util/ScriptUtil.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util 2 | 3 | import sys.process._ 4 | import java.io.BufferedReader 5 | import java.io.InputStreamReader 6 | 7 | object ScriptUtil { 8 | 9 | /** Run a shell command synchronously; returns its exit code, stdout and stderr. */ 10 | def runSyncCommand(cmd: String): Map[String,Any] = { 11 | val qb = Process(cmd) 12 | 13 | var out = "" 14 | var err = "" 15 | 16 | val exitCode = qb ! ProcessLogger( 17 | // callbacks collecting stdout and stderr line by line 18 | (s) => { 19 | out += s + "\n" 20 | }, 21 | (s) => { 22 | err += s + "\n" 23 | } 24 | ) 25 | 26 | val result: Map[String,Any] = Map( 27 | "code" -> exitCode, 28 | "stdoutPut" -> out, 29 | "erroutPut" -> err 30 | ) 31 | 32 | result 33 | } 34 | 35 | 36 | 37 | /** Run a command, collecting the stdout, stderr and exit status */ 38 | def runCommandBak(in: String): (List[String], List[String], Int) = { 39 | val qb = Process(in) 40 | var out = List[String]() 41 | var err = List[String]() 42 | 43 | val exit = qb ! ProcessLogger( 44 | (s) => out ::= s, 45 | (s) => err ::= s) 46 | 47 | (out.reverse, err.reverse, exit) 48 | } 49 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/common/util/mysql/MysqlClient.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util.mysql 2 | 3 | import scala.collection.mutable.ArrayBuffer 4 | import scala.collection.mutable.HashMap 5 | 6 | import java.sql.{ DriverManager, ResultSet } 7 | import com.mysql.jdbc.Driver 8 | 9 | class MysqlClient(ip: String, user: String, pwd: String, db: String) extends Serializable { 10 | 11 | // Change to Your Database Config 12 | lazy val conn_str = 13 | "jdbc:mysql://" + 14 | ip + ":3306/" + 15 | db + "?"
+ 16 | "user=" + user + 17 | "&password=" + pwd + 18 | "&zeroDateTimeBehavior=convertToNull" 19 | 20 | // Setup the connection 21 | lazy val conn = DriverManager.getConnection(conn_str) 22 | 23 | // 查询 24 | def select(sql: String): ArrayBuffer[HashMap[String, Any]] = { 25 | try { 26 | // Configure to be Read Only 27 | val statement = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY) 28 | 29 | // Execute Query 30 | val rs = statement.executeQuery(sql) 31 | 32 | val rsmd = rs.getMetaData 33 | val colNames = for (i <- 1 to rsmd.getColumnCount) yield rsmd.getColumnLabel(i) 34 | val result = ArrayBuffer[HashMap[String, Any]]() 35 | 36 | while (rs.next) { 37 | var row = new HashMap[String, Any]; 38 | for (n <- colNames) { 39 | row.put(n, rs.getObject(n)) 40 | } 41 | result += row 42 | } 43 | 44 | rs.close() 45 | statement.close() 46 | 47 | result 48 | } finally { 49 | //conn.close 50 | } 51 | } 52 | 53 | /** 54 | * 执行 55 | */ 56 | def exec(sql: String): Int = { 57 | try { 58 | val prep = conn.prepareStatement(sql) 59 | //prep.setString(1, "Nothing great was ever achieved without enthusiasm.") 60 | //prep.setString(2, "Ralph Waldo Emerson") 61 | prep.executeUpdate 62 | } finally { 63 | //conn.close 64 | } 65 | } 66 | 67 | def close() : Unit = { 68 | conn.close() 69 | } 70 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/hadoop/hdfs/HDFSClient.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.hadoop.hdfs 2 | 3 | import java.io.BufferedInputStream 4 | import java.io.File 5 | import java.io.FileInputStream 6 | import java.io.InputStream 7 | import org.apache.hadoop.conf.Configuration 8 | import org.apache.hadoop.fs.FileSystem 9 | import org.apache.hadoop.fs.Path 10 | import org.apache.hadoop.fs.FileStatus 11 | import org.apache.hadoop.fs.FileUtil 12 | 13 | /** 14 | * 操作 HDFS 类 15 | */ 16 | class HDFSClient(conf: Configuration) { 17 | 18 | //Initial Configuration 19 | 20 | //private var conf = new Configuration() 21 | //private var maprfsCoreSitePath = new Path("core-site.xml") 22 | //private var maprfsSitePath = new Path("maprfs-site.xml") 23 | 24 | //conf.addResource(maprfsCoreSitePath) 25 | //conf.addResource(maprfsSitePath) 26 | 27 | private var fileSystem = FileSystem.get(conf) 28 | 29 | /** 30 | * 创建 HDFS 目录 31 | */ 32 | def mkdirs(hdfsFolderPath: String): Unit = { 33 | var path = new Path(hdfsFolderPath) 34 | if (!fileSystem.exists(path)) { 35 | fileSystem.mkdirs(path) 36 | } 37 | } 38 | 39 | 40 | /** 41 | * 创建一个空的 HDFS 文件 42 | */ 43 | def createNewFile(hdfsFilePath:String): Path = { 44 | val path = new Path(hdfsFilePath) 45 | if (!fileSystem.exists(path)) { 46 | var out = fileSystem.createNewFile(path) 47 | } 48 | path 49 | } 50 | 51 | 52 | /** 53 | * 创建或者修改 HDFS 文件 54 | */ 55 | def createAndSave(hdfsPath: String): Unit = { 56 | var out = fileSystem.create(new Path(hdfsPath)) 57 | var in = new BufferedInputStream(new FileInputStream(hdfsPath)) 58 | var b = new Array[Byte](1024) 59 | var numBytes = in.read(b) 60 | while (numBytes > 0) { 61 | out.write(b, 0, numBytes) 62 | numBytes = in.read(b) 63 | } 64 | in.close() 65 | out.close() 66 | } 67 | 68 | 69 | /** 70 | * 追加本地文件到 HDFS 文件中 71 | */ 72 | def appendFileToHdfsFile(fromfilepath: String, hdfsFilePath: String): Unit = { 73 | val hdfsPath = this.createNewFile(hdfsFilePath) 74 | // hfds 75 | var out = fileSystem.append(hdfsPath) 76 | 77 | // 本地文件流 78 | var in = new 
BufferedInputStream(new FileInputStream(new File(fromfilepath))) 79 | var b = new Array[Byte](1024) 80 | var numBytes = in.read(b) 81 | while (numBytes > 0) { 82 | out.write(b, 0, numBytes) 83 | numBytes = in.read(b) 84 | } 85 | in.close() 86 | out.close() 87 | } 88 | 89 | 90 | /** 91 | * Append a string to an HDFS file (creating the file first if needed) 92 | */ 93 | def appendDataToHdfsFile(data: String, hdfsFilePath: String) : Unit = { 94 | val path = this.createNewFile(hdfsFilePath) 95 | 96 | var out = fileSystem.append(path) 97 | 98 | val by = data.getBytes() 99 | out.write(by, 0, by.length) 100 | out.close() 101 | } 102 | 103 | 104 | /** 105 | * Open an HDFS file and return its input stream 106 | */ 107 | def getFile(hdfsPath: String): InputStream = { 108 | var path = new Path(hdfsPath) 109 | fileSystem.open(path) 110 | 111 | } 112 | 113 | 114 | /** 115 | * Delete an HDFS file (recursively if it is a directory) 116 | */ 117 | def deleteFile(hdfsPath: String): Boolean = { 118 | var path = new Path(hdfsPath) 119 | fileSystem.delete(path, true) 120 | } 121 | 122 | 123 | /** 124 | * Close the FileSystem Handle 125 | */ 126 | def close() = { 127 | fileSystem.close 128 | } 129 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/hadoop/hdfs/HDFSClientTest.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.hadoop.hdfs 2 | 3 | import java.io.BufferedInputStream 4 | import java.io.File 5 | import java.io.FileInputStream 6 | import java.io.InputStream 7 | import org.apache.hadoop.conf.Configuration 8 | import org.apache.hadoop.fs.FileSystem 9 | import org.apache.hadoop.fs.Path 10 | import org.apache.hadoop.fs.FileStatus 11 | import org.apache.hadoop.fs.FileUtil 12 | 13 | import org.apache.hadoop.conf.Configuration 14 | import com.angejia.dw.hadoop.hdfs.HDFSClient 15 | 16 | object HDFSClientTest { 17 | 18 | 19 | def main(args: Array[String]) { 20 | val ob = new HDFSClientTest 21 | ob.run(args(0), args(1)) 22 | } 23 | } 24 | 25 | 26 | class HDFSClientTest { 27 | 28 | def run(hdfsPath: String, localPath: String) : Unit = { 29 | //fs.defaultFS 8020 30 | val conf = new Configuration() 31 | conf.set("fs.defaultFS", "hdfs://uhadoop-ociicy-master1:8020") // NameNode address to write to 32 | conf.setBoolean("dfs.support.append", true) // enable append mode 33 | 34 | 35 | val hdfsServer = new HDFSClient(conf) 36 | // create an empty file 37 | //hdfsServer.createNewFile("/user/hive/real_time/source_data/access_log/aaa.txt") 38 | // append data 39 | //hdfsServer.appendFileToHdfsFile("/data/tmp/test.log","/user/hive/real_time/source_data/access_log/aaa.txt") 40 | hdfsServer.appendDataToHdfsFile("你好呀",hdfsPath) 41 | hdfsServer.appendDataToHdfsFile("你好呀",hdfsPath) 42 | //hdfsServer.appendDataToHdfsFile("你好呀",hdfsPath) 43 | //hdfsServer.appendDataToHdfsFile("你好呀",hdfsPath) 44 | // read the content back 45 | val content = hdfsServer.getFile(hdfsPath) 46 | println(content) // note: prints the stream handle; read the stream to see the appended bytes 47 | } 48 | } 49 | 50 | 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/hadoop/kafka/KafkaConsumer.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.hadoop.kafka 2 | 3 | import kafka.message._ 4 | import kafka.serializer._ 5 | import kafka.utils._ 6 | import kafka.consumer.Consumer 7 | import kafka.consumer.ConsumerConfig 8 | import kafka.consumer.Whitelist 9 | import java.util.Properties 10 | import kafka.utils.Logging 11 | import scala.collection.JavaConversions._ 12 | 13 | class KafkaConsumer( 14 | topic: String, 15 | /** topic 16 | * The high-level
API hides the details of brokers from the consumer and allows consuming off the cluster of machines 17 | * without concern for the underlying topology. It also maintains the state of what has been consumed. The high-level API 18 | * also provides the ability to subscribe to topics that match a filter expression (i.e., either a whitelist or a blacklist 19 | * regular expression). This topic is a whitelist only but can change with re-factoring below on the filterSpec 20 | */ 21 | groupId: String, 22 | /** groupId 23 | * A string that uniquely identifies the group of consumer processes to which this consumer belongs. By setting the same 24 | * group id multiple processes indicate that they are all part of the same consumer group. 25 | */ 26 | zookeeperConnect: String, 27 | /** 28 | * Specifies the zookeeper connection string in the form hostname:port where host and port are the host and port of 29 | * a zookeeper server. To allow connecting through other zookeeper nodes when that zookeeper machine is down you can also 30 | * specify multiple hosts in the form hostname1:port1,hostname2:port2,hostname3:port3. The server may also have a zookeeper 31 | * chroot path as part of it's zookeeper connection string which puts its data under some path in the global zookeeper namespace. 32 | * If so the consumer should use the same chroot path in its connection string. For example to give a chroot path of /chroot/path 33 | * you would give the connection string as hostname1:port1,hostname2:port2,hostname3:port3/chroot/path. 34 | */ 35 | readFromStartOfStream: Boolean = true 36 | /** 37 | * What to do when there is no initial offset in Zookeeper or if an offset is out of range: 38 | * 1) smallest : automatically reset the offset to the smallest offset 39 | * 2) largest : automatically reset the offset to the largest offset 40 | * 3) anything else: throw exception to the consumer. If this is set to largest, the consumer may lose some 41 | messages when the number of partitions, for the topics it subscribes to, changes on the broker. 42 | **************************************************************************************** 43 | To prevent data loss during partition addition, set auto.offset.reset to smallest 44 | This make sense to change to true if you know you are listening for new data only as of 45 | after you connect to the stream new things are coming out. you can audit/reconcile in 46 | another consumer which this flag allows you to toggle if it is catch-up and new stuff or 47 | just new stuff coming out of the stream. This will also block waiting for new stuff so 48 | it makes a good listener. 
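        Editor's sketch (not part of the original): typical use of this class, reusing
        the dev settings that appear elsewhere in this repo (topic accessLog, group
        userPortrait, zookeeper dwtest:2181):
          val consumer = new KafkaConsumer("accessLog", "userPortrait", "dwtest:2181")
          consumer.read(bytes => println(new String(bytes, "UTF-8")))
          consumer.close()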
49 | //readFromStartOfStream: Boolean = true 50 | readFromStartOfStream: Boolean = false 51 | **************************************************************************************** 52 | */ 53 | ) extends Logging { 54 | 55 | val props = new Properties() 56 | props.put("group.id", groupId) 57 | props.put("zookeeper.connect", zookeeperConnect) 58 | props.put("auto.offset.reset", if(readFromStartOfStream) "smallest" else "largest") 59 | 60 | val config = new ConsumerConfig(props) 61 | val connector = Consumer.create(config) 62 | 63 | val filterSpec = new Whitelist(topic) 64 | 65 | info("setup:start topic=%s for zk=%s and groupId=%s".format(topic,zookeeperConnect,groupId)) 66 | val stream = connector.createMessageStreamsByFilter(filterSpec, 1, new DefaultDecoder(), new DefaultDecoder()).get(0) 67 | info("setup:complete topic=%s for zk=%s and groupId=%s".format(topic,zookeeperConnect,groupId)) 68 | 69 | def read(write: (Array[Byte])=>Unit) = { 70 | info("reading on stream now") 71 | for(messageAndTopic <- stream) { 72 | try { 73 | info("writing from stream") 74 | write(messageAndTopic.message) 75 | info("written to stream") 76 | } catch { 77 | case e: Throwable => 78 | if (true) { //this is objective even how to conditionalize on it 79 | error("Error processing message, skipping this message: ", e) 80 | } else { 81 | throw e 82 | } 83 | } 84 | } 85 | } 86 | 87 | def close() { 88 | connector.shutdown() 89 | } 90 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/hadoop/kafka/KafkaProducer.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.hadoop.kafka 2 | 3 | import java.util.{Properties, UUID} 4 | import kafka.producer.ProducerConfig 5 | import kafka.common._ 6 | import kafka.message._ 7 | import kafka.serializer._ 8 | import kafka.producer.Producer 9 | import kafka.producer.KeyedMessage 10 | import java.util.Properties 11 | import java.util.Date 12 | // import org.apache.log4j.Logger 13 | 14 | 15 | case class KafkaProducer( 16 | topic: String, 17 | brokerList: String, 18 | /** brokerList 19 | * This is for bootstrapping and the producer will only use it for getting metadata (topics, partitions and replicas). 20 | * The socket connections for sending the actual data will be established based on the broker information returned in 21 | * the metadata. The format is host1:port1,host2:port2, and the list can be a subset of brokers or a VIP pointing to a 22 | * subset of brokers. 23 | */ 24 | clientId: String = UUID.randomUUID().toString, 25 | /** clientId 26 | * The client id is a user-specified string sent in each request to help trace calls. It should logically identify 27 | * the application making the request. 28 | */ 29 | synchronously: Boolean = true, 30 | /** synchronously 31 | * This parameter specifies whether the messages are sent asynchronously in a background thread. 32 | * Valid values are false for asynchronous send and true for synchronous send. By setting the producer 33 | * to async we allow batching together of requests (which is great for throughput) but open the possibility 34 | * of a failure of the client machine dropping unsent data. 35 | */ 36 | compress: Boolean = true, 37 | /** compress 38 | * This parameter allows you to specify the compression codec for all data generated by this producer. 39 | * When set to true gzip is used. 
To override and use snappy you need to implement that as the default 40 | * codec for compression using SnappyCompressionCodec.codec instead of DefaultCompressionCodec.codec below. 41 | */ 42 | 43 | batchSize: Integer = 200, 44 | /** batchSize 45 | * The number of messages to send in one batch when using async mode. 46 | * The producer will wait until either this number of messages are ready 47 | * to send or queue.buffer.max.ms is reached. 48 | */ 49 | messageSendMaxRetries: Integer = 3, 50 | /** messageSendMaxRetries 51 | * This property will cause the producer to automatically retry a failed send request. 52 | * This property specifies the number of retries when such failures occur. Note that 53 | * setting a non-zero value here can lead to duplicates in the case of network errors 54 | * that cause a message to be sent but the acknowledgement to be lost. 55 | */ 56 | requestRequiredAcks: Integer = -1 57 | /** requestRequiredAcks 58 | * 0) which means that the producer never waits for an acknowledgement from the broker (the same behavior as 0.7). 59 | * This option provides the lowest latency but the weakest durability guarantees (some data will be lost when a server fails). 60 | * 1) which means that the producer gets an acknowledgement after the leader replica has received the data. This option provides 61 | * better durability as the client waits until the server acknowledges the request as successful (only messages that were 62 | * written to the now-dead leader but not yet replicated will be lost). 63 | * -1) which means that the producer gets an acknowledgement after all in-sync replicas have received the data. This option 64 | * provides the best durability, we guarantee that no messages will be lost as long as at least one in sync replica remains. 
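 *
 * Editor's sketch (not part of the original): with the defaults above, producing
 * to the accessLog topic looks roughly like
 *   val producer = KafkaProducer(topic = "accessLog", brokerList = "dwtest:9092")
 *   producer.send("a log line")        // partition key defaults to null
 *   producer.send("a keyed line", "0") // route by a partition key, as ExtractFileToKafka does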
65 | */ 66 | ) { 67 | 68 | val props = new Properties() 69 | 70 | val codec = if(compress) DefaultCompressionCodec.codec else NoCompressionCodec.codec 71 | 72 | props.put("compression.codec", codec.toString) 73 | props.put("producer.type", if(synchronously) "sync" else "async") 74 | props.put("metadata.broker.list", brokerList) 75 | props.put("batch.num.messages", batchSize.toString) 76 | props.put("message.send.max.retries", messageSendMaxRetries.toString) 77 | props.put("request.required.acks",requestRequiredAcks.toString) 78 | props.put("client.id",clientId.toString) 79 | 80 | val producer = new Producer[AnyRef, AnyRef](new ProducerConfig(props)) 81 | 82 | def kafkaMesssage(message: Array[Byte], partition: Array[Byte]): KeyedMessage[AnyRef, AnyRef] = { 83 | if (partition == null) { 84 | new KeyedMessage(topic,message) 85 | } else { 86 | new KeyedMessage(topic,partition,message) 87 | } 88 | } 89 | 90 | def send(message: String, partition: String = null): Unit = { 91 | //println(partition.getBytes("UTF8")) 92 | send(message.getBytes("UTF8"), if (partition == null) null else partition.getBytes("UTF8")) 93 | } 94 | 95 | def send(message: Array[Byte], partition: Array[Byte]): Unit = { 96 | try { 97 | producer.send(kafkaMesssage(message, partition)) 98 | } catch { 99 | case e: Exception => 100 | e.printStackTrace 101 | System.exit(1) 102 | } 103 | } 104 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/hadoop/spark/CollaborativeFiltering.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.hadoop.spark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.spark.mllib.recommendation.{ALS,MatrixFactorizationModel,Rating} 7 | import org.jblas.DoubleMatrix 8 | 9 | /** 10 | * Spark Mllib 协同过滤 11 | */ 12 | class CollaborativeFiltering extends Serializable { 13 | 14 | // 训练得出的模型 15 | var trainModel: MatrixFactorizationModel = null 16 | 17 | 18 | /** 19 | * 提取有效特征 20 | * rdd: RDD[Array[Int]] 一个处理过的 RDD 对象, 结构为 RDD[Array[Int]] 21 | * 分别是 用户,主题,评分 22 | */ 23 | def characteristics(rdd: RDD[Array[Int]]): RDD[Rating] = { 24 | //把 RDD 数据转换成 Rating 对象 25 | val ratings: RDD[Rating] = rdd.map { 26 | // 模式匹配 27 | case Array(user, item, rating) => Rating(user.toInt, item.toInt, rating.toInt) 28 | } 29 | ratings.cache() 30 | } 31 | 32 | 33 | /** 34 | * 训练推荐模型 35 | * 使用: (ALS)最小二乘法,是求解矩阵分解问题的最优方法 36 | * 矩阵分解: 37 | * 显示矩阵分解: ALS.train(ratings, rank, iterations, lambda) 用来处理直接获得的数据,一般是用户访问,收藏,评分等数据 38 | * 隐式矩阵分解: ALS.trainImplicit(ratings, rank, iterations) 用处理间接才能获得的数据,需要在用户与物品的交互中才能得到的数据,如看了电影的次数,购买了某个产品等 39 | * 40 | * 矩阵分解参数: 41 | * rank : ALS 模型中的因子个数,值越大会越好,但是训练模型和保存时的开销就越大 42 | * iterations : 运行迭代次数,经过少数次数迭代后 ALS 模型便已能收敛为一个比较合理的好模型 43 | * lambda : 控制模型的正规化过程, 从而控制模型的过拟合情况 44 | */ 45 | 46 | /** 47 | * 显示矩阵分解 48 | */ 49 | def train(ratings: RDD[Rating], rank: Int = 50, iterations: Int = 10, lambda: Double = 0.01) : MatrixFactorizationModel = { 50 | val model: MatrixFactorizationModel = ALS.train(ratings, rank, iterations, lambda) 51 | this.trainModel = model 52 | model 53 | } 54 | 55 | /** 56 | * 隐式矩阵分解 57 | */ 58 | def trainImplicit(ratings: RDD[Rating], rank: Int = 50, iterations: Int = 10) : MatrixFactorizationModel = { 59 | val model: MatrixFactorizationModel = ALS.trainImplicit(ratings, rank, iterations) 60 | this.trainModel = model 61 | model 62 | } 63 | 64 | 65 | /** 66 | * 使用推荐模型 67 | * 68 | 
* 用户推荐模型: 利用相似用户的评级来计算对某个用户的推荐 69 | * 给指定用户推荐物品,通常以 "前 K 个" 形式展现, 即通过模型求出用户可能喜好程度最高的前 K 个商品 70 | * 这个过程通过计算每个商品的预计得分, 按照得分机型排序实现 71 | * 72 | * 物品推荐模型: 依赖用户接触过的物品与候选物品之间的相似度来获得推荐 73 | * 给定一个物品, 有哪些物品与它相似,相似的确切定义取决于所使用的模型,相似度是通过某种方式比较表示两个物品的向量二得到的 74 | * 相似度衡量方法 75 | * 皮尔森相关系数(Pearson correlation) 76 | * 针对实数响亮的余弦相似度(cosine similarity) 77 | * 针对二元向量的杰卡德相似系数(Jaccard similarity) 78 | */ 79 | 80 | /** 81 | * 用户推荐 - 单个用户推荐最得分最高的 K 个物品 82 | * userId: 需要推荐的用户 Id 83 | * K: 匹配分数最高的前 K 个物品 84 | */ 85 | def userRecommendItem(userId: Int, K: Int) : Array[Rating] = { 86 | val topKRecs: Array[Rating] = this.trainModel.recommendProducts(userId, K) 87 | topKRecs 88 | } 89 | 90 | 91 | /** 92 | * 用户推荐 - 用户物品推荐预测得分 93 | */ 94 | def userPredict(user: Int, product: Int): Double = { 95 | val predictionScore = this.trainModel.predict(user, product) 96 | predictionScore 97 | } 98 | 99 | 100 | /** 101 | * 用户推荐 - 批量用户推荐物品推荐得分 102 | */ 103 | def userPredict(usersProducts: RDD[(Int,Int)]): RDD[Rating] = { 104 | val predictionScore = this.trainModel.predict(usersProducts) 105 | predictionScore 106 | } 107 | 108 | 109 | /** 110 | * 物品余弦相似度计算 111 | * 返回 (item ID, 因子分数) 这是一个 pair RDD 112 | */ 113 | def itemCosineSimilarity(itemId: Int) : RDD[(Int, Double)] = { 114 | // 线性代数库,求向量点积 ,创建一个 Array[Double] 类型的向量 115 | 116 | // item 因子 从模型中,取回对应的因子 117 | val itemFactor: Array[Double] = this.trainModel.productFeatures.lookup(itemId).head 118 | 119 | // item 向量 120 | val itemVector: DoubleMatrix = new org.jblas.DoubleMatrix(itemFactor) 121 | 122 | // 求出本物品与各个物品的余弦相似度 123 | val sims: RDD[(Int, Double)] = this.trainModel.productFeatures.map { case (id, factor) => 124 | val factorVector = new org.jblas.DoubleMatrix(factor) 125 | val sim = this.cosineSimilarity(factorVector,itemVector) 126 | (id, sim) 127 | } 128 | sims 129 | } 130 | 131 | 132 | /** 133 | * 物品推荐 - top 推荐 134 | */ 135 | def itemRecommendItem(sims: RDD[(Int, Double)], K: Int) : Array[(Int, Double)] = { 136 | // 按照物品相似度排序,取出与本物品最相似前 K 个物品 137 | val sortedSims: Array[(Int, Double)] = sims.top(K)( // top 是分布式计算出前 K 个结果 138 | Ordering.by[(Int, Double), Double] { 139 | case (id, similarity) => similarity 140 | } 141 | ) 142 | 143 | // 打印出这 10 个与给定物品最相似的物品 144 | //val result = sortedSims.take(10).mkString("\n") 145 | //println(result) 146 | sortedSims 147 | } 148 | 149 | 150 | /** 151 | * 计算连个向量之间的余弦相似度, 余弦相似度是两个向量在 n 维空间里两者夹角的读书 152 | * 它是两个向量的点积与各向量范数(或长度)的乘积的商 153 | * 相似度的取值在 -1 到 1 主键 154 | * 1 表示完全相似 155 | * 0 表示两者互不相关(即无相关性) 156 | * -1 表示两者不相关, 还表示它们完全不相同 157 | */ 158 | def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix) : Double = { 159 | vec1.dot(vec2) / (vec1.norm2() * vec2.norm2()) 160 | } 161 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/hadoop/spark/CollaborativeFilteringTest.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.hadoop.spark 2 | 3 | 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.mllib.recommendation.ALS 7 | import org.apache.spark.mllib.recommendation.Rating 8 | import org.jblas.DoubleMatrix 9 | 10 | 11 | 12 | /** 13 | * 测试数据 /data/log/recommend/ml-100k/u.data 14 | */ 15 | object CollaborativeFilteringTest { 16 | 17 | def main(args: Array[String]) { 18 | val inventoryIBCF = new CollaborativeFilteringTest() 19 | inventoryIBCF.run() 20 | } 21 | } 22 | 23 | /** 24 | * 看了又看算法 25 | */ 26 | class CollaborativeFilteringTest { 27 | 28 | 29 | def run(): Unit = 
{ 30 | this.suanfa() 31 | } 32 | 33 | 34 | def suanfa(): Unit = { 35 | // SPARK 运行环境配置 36 | val conf = new SparkConf() 37 | conf.setAppName("InventoryIBCF") 38 | conf.setMaster("local[2]") 39 | //conf.set("spark.ui.port", "36000") 40 | 41 | // SPARK 上下文配置 42 | val sc = new SparkContext(conf) 43 | 44 | 45 | /** 46 | * 提取有效特征 47 | * 1. 数据清洗 48 | * 2. 载入数据 49 | * 3. 格式化数据 50 | */ 51 | //原始数据 52 | val rawData = sc.textFile("/data/log/recommend/ml-100k/u.data") 53 | //println(rawData.first()) 54 | 55 | // 把行分割成数组,并且读取数组前 3 个原始 56 | // 参数(类型)推断 57 | val rawRatings = rawData.map(_.split("\t").take(3)) 58 | // 正常写法 59 | //val rawRatings = rawData.map(line => line.split("\t").take(3)) 60 | 61 | //把数据转换成 Rating 对象 62 | val ratings = rawRatings.map { 63 | // 模式匹配 64 | case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toInt) 65 | } 66 | 67 | 68 | /** 69 | * 训练推荐模型 70 | * 使用: (ALS)最小二乘法,是求解矩阵分解问题的最优方法 71 | * 矩阵分解: 72 | * 显示矩阵分解: ALS.train(ratings, rank, iterations, lambda) 用来处理直接获得的数据,一般是用户访问,收藏,评分等数据 73 | * 隐式矩阵分解: ALS.trainImplicit(ratings, rank, iterations) 用处理间接才能获得的数据,需要在用户与物品的交互中才能得到的数据,如看了电影的次数,购买了某个产品等 74 | * 75 | * 矩阵分解参数: 76 | * rank : ALS 模型中的因子个数,值越大会越好,但是训练模型和保存时的开销就越大 77 | * iterations : 运行迭代次数,经过少数次数迭代后 ALS 模型便已能收敛为一个比较合理的好模型 78 | * lambda : 控制模型的正规化过程, 从而控制模型的过拟合情况 79 | */ 80 | var rank = 50 81 | var iterations = 10 82 | var lambda = 0.01 83 | 84 | // 训练模型,返回 MatrixFactorizationModel 对象,返回 用户因子 RDD 和 物品因子 RDD 85 | val model = ALS.train(ratings, rank, iterations, lambda) 86 | 87 | //val productFeatures = model.productFeatures // 物品因子 88 | //val userFeatures = model.userFeatures // 用户因子 89 | val K = 10 // 推荐数量 90 | 91 | 92 | /** 93 | * 使用推荐模型 94 | * 95 | * 用户推荐模型: 利用相似用户的评级来计算对某个用户的推荐 96 | * 给指定用户推荐物品,通常以 "前 K 个" 形式展现, 即通过模型求出用户可能喜好程度最高的前 K 个商品 97 | * 这个过程通过计算每个商品的预计得分, 按照得分机型排序实现 98 | * 99 | * 物品推荐模型: 依赖用户接触过的物品与候选物品之间的相似度来获得推荐 100 | * 给定一个物品, 有哪些物品与它相似,相似的确切定义取决于所使用的模型,相似度是通过某种方式比较表示两个物品的向量二得到的 101 | * 相似度衡量方法 102 | * 皮尔森相关系数(Pearson correlation) 103 | * 针对实数响亮的余弦相似度(cosine similarity) 104 | * 针对二元向量的杰卡德相似系数(Jaccard similarity) 105 | */ 106 | 107 | /** 108 | * 用户推荐 109 | */ 110 | // 计算给定用户 -> 给定物品的预计得分 111 | model.predict(789, 123) 112 | 113 | // 以(user,item) ID对类型的 RDD 对象为输入, 返回多个用户和物品的预测, 114 | //model.predict(userFeatures) 115 | 116 | // 为每个用户生成前 K 个推荐物品 117 | val userId = 789 118 | 119 | val topKRecs = model.recommendProducts(userId, K) 120 | //println(topKRecs.mkString("\n")) 121 | 122 | /** 123 | * 物品推荐 124 | */ 125 | // 线性代数库,求向量点积 ,创建一个 Array[Double] 类型的向量 126 | val aMatrix = new DoubleMatrix(Array(1.0,2.0,3.0)) 127 | 128 | 129 | var itemId = 567 130 | val itemFactor = model.productFeatures.lookup(itemId).head 131 | val itemVector = new DoubleMatrix(itemFactor) 132 | 133 | // 计算物品与自己的相似度 - Test 134 | //val itemX = this.cosineSimilarity(itemVector, itemVector) 135 | //println(itemX) 136 | 137 | 138 | // 求出物品与各个物品的余弦相似度 139 | val sims = model.productFeatures.map { case (id, factor) => 140 | val factorVector = new DoubleMatrix(factor) 141 | val sim = cosineSimilarity(factorVector,itemVector) 142 | (id, sim) 143 | } 144 | 145 | 146 | // 按照物品相似度排序,取出与物品 567 最相似前 10 个物品 147 | val sortedSims = sims.top(K)( 148 | Ordering.by[(Int, Double), Double] { 149 | case (id, similarity) => similarity 150 | } 151 | ) 152 | 153 | // 打印出这 10 个与给定物品最相似的物品 154 | val result = sortedSims.take(10).mkString("\n") 155 | println(result) 156 | } 157 | 158 | 159 | /** 160 | * 计算连个向量之间的余弦相似度, 余弦相似度是两个向量在 n 维空间里两者夹角的读书 161 | * 它是两个向量的点积与各向量范数(或长度)的乘积的商 162 | * 相似度的取值在 -1 到 1 主键 163 | * 1 
表示完全相似 164 | * 0 表示两者互不相关(即无相关性) 165 | * -1 表示两者不相关, 还表示它们完全不相同 166 | */ 167 | def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix) : Double = { 168 | vec1.dot(vec2) / (vec1.norm2() * vec2.norm2()) 169 | } 170 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/logs/UbaAppActionLogStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.logs 2 | 3 | object UbaAppActionLogStreaming { 4 | 5 | 6 | def main(args: Array[String]) { 7 | 8 | } 9 | } 10 | 11 | 12 | class UbaAppActionLogStreaming { 13 | 14 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/logs/UbaWebActionLogStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.logs 2 | 3 | object UbaWebActionLogStreaming { 4 | 5 | 6 | def main(args: Array[String]) { 7 | 8 | } 9 | } 10 | 11 | 12 | class UbaWebActionLogStreaming { 13 | 14 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/logs/UbaWebVisitLogStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.logs 2 | 3 | object UbaWebVisitLogStreaming { 4 | 5 | 6 | def main(args: Array[String]) { 7 | 8 | } 9 | } 10 | 11 | 12 | class UbaWebVisitLogStreaming { 13 | 14 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/Conf.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend 2 | 3 | import scala.collection.mutable.Map 4 | import scala.io.Source 5 | import java.io.{ InputStream, BufferedReader, InputStreamReader, PushbackReader } 6 | 7 | import com.angejia.dw.common.util.PropertyUtil 8 | 9 | object Conf { 10 | 11 | // 读取配置文件 12 | val property = new PropertyUtil() 13 | 14 | // 项目根目录 15 | //var projectPath = Conf.getClass().getResource("/").getFile().toString() 16 | 17 | /** 18 | * 设置环境,根据不同的环境使用不同的配置文件 19 | */ 20 | def setEnv(env: String = "dev") : Unit = { 21 | 22 | // 读取的配置文件名称 23 | val confName = "/conf_" + env + ".properties" 24 | 25 | /** 26 | * 读取 resource 目录下的文件 27 | * val lines = Source.fromURL(getClass.getResource(confName)).getLines() 28 | lines.foreach(println) 29 | */ 30 | // 获取 resource 文件读输入流 31 | val inputStreamReader: InputStreamReader = Source.fromURL(getClass.getResource(confName)).reader() 32 | 33 | // 设置读取的流 34 | property.setFileInputStream(inputStreamReader) 35 | } 36 | 37 | 38 | /** 39 | * 获取 zookeeper 地址 40 | */ 41 | def getZookeeperQuorum(): String = { 42 | val zookeeperQuorum: String = property.getKeyValue("zookeeperQuorum") 43 | zookeeperQuorum 44 | } 45 | 46 | 47 | /** 48 | * 获取 kafka 服务器地址 49 | */ 50 | def getKafkaServerBrokerList() : String = { 51 | val kafkaServerBrokerList: String = property.getKeyValue("kafkaServerBrokerList") 52 | kafkaServerBrokerList 53 | } 54 | 55 | 56 | /** 57 | * 获取业务数据库配置信息 58 | */ 59 | def getProductMysqDBInfo(): Map[String,String] = { 60 | Map[String, String]( 61 | "host" -> property.getKeyValue("productMysqlDB.host"), 62 | "account" -> property.getKeyValue("productMysqlDB.account"), 63 | "password" -> property.getKeyValue("productMysqlDB.password"), 64 | "defaultDB" -> property.getKeyValue("productMysqlDB.defaultDB") 65 | ) 66 | } 67 | 68 | 69 | /** 70 | * 获取bi数据库配置信息 71 | */ 72 | def 
getBiMysqDBInfo(): Map[String,String] = { 73 | Map[String, String]( 74 | "host" -> property.getKeyValue("biMysqlDB.host"), 75 | "account" -> property.getKeyValue("biMysqlDB.account"), 76 | "password" -> property.getKeyValue("biMysqlDB.password"), 77 | "defaultDB" -> property.getKeyValue("biMysqlDB.defaultDB") 78 | ) 79 | } 80 | 81 | 82 | /** 83 | * hdfs 文件服务地址 84 | */ 85 | def getHDFSServer(): String = { 86 | val hdfsServer: String = property.getKeyValue("HDFSServer") 87 | hdfsServer 88 | } 89 | 90 | 91 | /** 92 | * 获取 hive 相关配置信息 93 | */ 94 | def getHiveConf(): Map[String,String] = { 95 | Map[String, String]( 96 | "hiveMetastoreUris" -> property.getKeyValue("hive.metastore.uris"), 97 | "hiveThriftServerUrl" -> property.getKeyValue("hive.thrift.server.url"), 98 | "hiveThriftServerUser" -> property.getKeyValue("hive.thrift.server.user"), 99 | "hiveThriftServerPass" -> property.getKeyValue("hive.thrift.server.pass") 100 | ) 101 | } 102 | 103 | 104 | /** 105 | * 获取 spark 相关配置信息 106 | */ 107 | def getSparkConf(): Map[String,String] = { 108 | Map[String, String]( 109 | "sparkThriftServerUrl" -> property.getKeyValue("spark.thrift.server.url"), 110 | "sparkThriftServerUser" -> property.getKeyValue("spark.thrift.server.user"), 111 | "sparkThriftServerPass" -> property.getKeyValue("spark.thrift.server.pass") 112 | ) 113 | } 114 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/IBCF.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend 2 | 3 | import scala.collection.mutable.Map 4 | 5 | import org.apache.log4j.{Level, Logger} 6 | 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.rdd.RDD 10 | 11 | 12 | /** 13 | * 基于 spark Rdd 实现 IBCF 算法 14 | */ 15 | class IBCF extends Serializable { 16 | 17 | /** 18 | * 模型的数据源文件 : HDFS 或者 普通文件 19 | */ 20 | var characteristicsFile: String = null 21 | def setCharacteristicsFile(characteristicsFile: String): Unit = { 22 | this.characteristicsFile = characteristicsFile 23 | } 24 | def getCharacteristicsFile(): String = { 25 | this.characteristicsFile 26 | } 27 | 28 | 29 | /** 30 | * 文件行分隔符 31 | */ 32 | var fileSeparator: String = "\t" 33 | def setFileSeparator(fileSeparator: String): Unit = { 34 | this.fileSeparator = fileSeparator 35 | } 36 | def getFileSeparator(): String = { 37 | this.fileSeparator 38 | } 39 | 40 | 41 | /** 42 | * 根据 SparkConf 初始化 SparkContext 上下文 43 | */ 44 | var sparkContext: SparkContext = null 45 | def setSparkContext(sparkConf: SparkConf): Unit = { 46 | this.sparkContext = new SparkContext(sparkConf) 47 | } 48 | def getSparkContext(): SparkContext = { 49 | this.sparkContext 50 | } 51 | 52 | 53 | /** 54 | * 计算矩阵并得到 item -> item 关系矩阵, 根据输入文件或者 HDFS 55 | */ 56 | def calculateByFile(sparkConf: SparkConf, characteristicsFile: String, fileSeparator: String): Map[String, Int] = { 57 | 58 | println("----- 初始化 sparkContext 上下文 -----") 59 | this.setSparkContext(sparkConf) 60 | 61 | // 设置加载文件 62 | this.setCharacteristicsFile(characteristicsFile) 63 | this.setFileSeparator(fileSeparator) 64 | 65 | // 加载数据, 生成 RDD 66 | val baseModelRDD = this.loadModelData(this.getCharacteristicsFile()) 67 | 68 | // 过滤模型 69 | val modelRDD = this.filterModelRDD(baseModelRDD, this.getFileSeparator()) 70 | 71 | // 生成 user -> item 矩阵 72 | val itemAndItemMatrixRDD = this.generateItemAndItemMatrix(modelRDD) 73 | 74 | // 合并所有 user -> item 矩阵 75 | val itemAndItemMatrixCollection = 
this.mergeItemAndItemMatrix(itemAndItemMatrixRDD) 76 | 77 | itemAndItemMatrixCollection 78 | } 79 | 80 | 81 | 82 | /** 83 | * Compute the item -> item relation matrix from an already-built base model RDD 84 | */ 85 | def calculateByBaseModelRDD(baseModelRDD: RDD[String], fileSeparator: String): Map[String, Int] = { 86 | 87 | this.setFileSeparator(fileSeparator) 88 | 89 | // filter the model 90 | val modelRDD = this.filterModelRDD(baseModelRDD, this.getFileSeparator()) 91 | 92 | // generate one item -> item matrix per user 93 | val itemAndItemMatrixRDD = this.generateItemAndItemMatrix(modelRDD) 94 | 95 | // merge all item -> item matrices 96 | val itemAndItemMatrixCollection = this.mergeItemAndItemMatrix(itemAndItemMatrixRDD) 97 | 98 | itemAndItemMatrixCollection 99 | } 100 | 101 | 102 | 103 | /** 104 | * Load the model data 105 | * 106 | */ 107 | def loadModelData(characteristicsFile: String) : RDD[String] = { 108 | 109 | println("----- loading model data: " + characteristicsFile + " -----") 110 | // read the data source 111 | val baseModelRDD = this.getSparkContext().textFile(characteristicsFile) 112 | 113 | baseModelRDD 114 | } 115 | 116 | 117 | /** 118 | * Filter the RDD model 119 | * modelRDD: base RDD built from the data source 120 | * fileSeparator : field separator of each line 121 | */ 122 | def filterModelRDD(modelRDD: RDD[String], fileSeparator: String): RDD[String] = { 123 | 124 | println("----- filtering invalid rows from the model -----") 125 | 126 | // screen out malformed raw rows 127 | val filterRDD = modelRDD.filter { line => 128 | var checkStatus = true 129 | if (line.isEmpty()) { 130 | checkStatus = false 131 | } 132 | val lineArr = line.split(fileSeparator) 133 | if (lineArr.length < 2) { 134 | checkStatus = false 135 | } else { 136 | val userId = lineArr.apply(0) 137 | val itemId = lineArr.apply(1) 138 | //if (userId.matches("[0-9]+") == false) { 139 | //checkStatus = false 140 | //} 141 | //if (itemId.matches("[0-9]+") == false) { 142 | // checkStatus = false 143 | //} 144 | if (userId == "" || itemId == "") { 145 | checkStatus = false 146 | } 147 | } 148 | checkStatus 149 | } 150 | 151 | filterRDD 152 | } 153 | 154 | 155 | 156 | /** 157 | * Build item -> item matrices from each user's item set 158 | * modelRDD : base data model 159 | * return RDD( 160 | * Map("50:57"->1, "57:50"->1, "51:55"->1), 161 | * Map("50:57"->2, "57:50"->3, "51:55"->4) 162 | * ) 163 | */ 164 | def generateItemAndItemMatrix(modelRDD: RDD[String]) : RDD[Map[String, Int]] = { 165 | 166 | println("----- grouping user -> items collections -----") 167 | // RDD of the item collection each user liked; split on the configured separator 168 | val userLikeItemsCollectionRDD = modelRDD.map { line => 169 | val lineArr = line.split(this.getFileSeparator()) 170 | val userId = lineArr.apply(0).toString() 171 | val itemId = lineArr.apply(1).toString() 172 | (userId, itemId) 173 | }.groupByKey() 174 | 175 | println("----- building item -> item matrices from the user -> items collections -----") 176 | // per-user item matrix B 177 | val itemAndItemMatrixRDD = userLikeItemsCollectionRDD.map{userItemsCollection => 178 | // the collection of items this user liked 179 | val userItems = userItemsCollection._2 180 | 181 | /** 182 | * Data shape: 183 | * Map("50:57"->1, "57:50"->1, "51:55"->1) 184 | */ 185 | // per item-pair counts for this user (matrix B) 186 | val itemAndItemMatrix : Map[String,Int] = Map[String,Int]() 187 | 188 | // pair up every two items of the current user; one matrix B per user 189 | for (i <- userItems) { 190 | for (j <- userItems) { 191 | // skip identical items 192 | if (i != j) { 193 | // each co-occurring pair counts one visit for this user 194 | //userItemMatrix += Map(i -> Map(j -> 1)) 195 | val key = i.toString() + ":" + j.toString() 196 | itemAndItemMatrix.put(key ,1) 197 | } 198 | } 199 | } 200 | 201 | itemAndItemMatrix 202 | } 203 | 204 | itemAndItemMatrixRDD 205 | } 206 | 207 | 208 | 209 | /** 210 | * Merge (sum) all item -> item matrices 211 | * itemAndItemMatrix : the per-user item -> item matrices 212 | */ 213 | def
mergeItemAndItemMatrix(itemAndItemMatrix: RDD[Map[String, Int]]): Map[String, Int] = { 214 | println("----- 合并累加所有 item -> item 矩阵 -----") 215 | 216 | var rsItemMatrix: Map[String, Int] = Map[String, Int]() 217 | 218 | if (itemAndItemMatrix.count() == 0) { 219 | return rsItemMatrix 220 | } 221 | 222 | // 合并最终的矩阵 223 | val itemAndItemMatrixCollection = itemAndItemMatrix.reduce{ (x, y) => 224 | 225 | var curMatrix = x 226 | var nextMatrix = y 227 | 228 | /** 229 | * 目标 : 230 | * 1. 把 curMatrix 和 nextMatrix 相同 key 的值相加 231 | * 2. 把 nextMatrix 不在 curMatrix 中的原样追加到 curMatrix 232 | */ 233 | for ((yK, yV) <- nextMatrix) { 234 | 235 | if (curMatrix.contains(yK) == true) { 236 | curMatrix(yK) += nextMatrix(yK) 237 | } else { 238 | curMatrix.put(yK,yV) 239 | } 240 | } 241 | 242 | curMatrix 243 | } 244 | 245 | itemAndItemMatrixCollection 246 | } 247 | 248 | 249 | /** 250 | * 根据 ItemId , groupBy ItemMatrix 251 | 把同类型的物品, 聚合到一起 252 | Map(51 -> 51:55:2, 51:52:2, 51:53:2, 51:56:1 253 | 56 -> 56:53:1, 56:55:1, 56:52:1,56:51:1 254 | ) 255 | */ 256 | def itemMatrixGroupByItemId(itemAndItemMatrixCollection: Map[String, Int]): Map[String, Iterable[Array[String]]] = { 257 | println("----- 根据 ItemId , groupBy ItemMatrix -----") 258 | 259 | val itemAndItemGroup = itemAndItemMatrixCollection.map( f => { 260 | val ids = f._1.split(":") 261 | val itemId = ids(0).toString() // 房源 ID 262 | val itemRsId = ids(1).toString() // 推荐房源 ID 263 | val itemRsIdCnt = f._2.toString() // 共同看过的人数 264 | // 转换成数组 265 | Array(itemId, itemRsId, itemRsIdCnt) 266 | //println(itemId, itemRsId, itemRsIdCnt) 267 | }).groupBy { 268 | // 然后, 按照 itmeId 把把同类的 Item ID groupBy 到一起 269 | f => f(0) 270 | } 271 | 272 | //itemAndItemGroup 273 | scala.collection.mutable.Map(itemAndItemGroup.toSeq:_*) 274 | } 275 | 276 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/community/CommunityIBCF.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.community 2 | 3 | import scala.collection.mutable.Map 4 | 5 | import org.apache.log4j.{Level, Logger} 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.SparkConf 8 | 9 | import com.angejia.dw.recommend.Conf 10 | import com.angejia.dw.hadoop.hbase.HBaseClient 11 | import com.angejia.dw.recommend.IBCF 12 | 13 | /** 14 | * Community IBCF 算法实现 15 | */ 16 | object CommunityIBCF { 17 | 18 | // hbase 数据表 19 | var hbaseResultTb: HBaseClient = null 20 | 21 | // 等待训练的数据文件 22 | var characteristicsFile: String = null 23 | 24 | // 文件分隔符 25 | var separator = "\t" 26 | 27 | def main (args: Array[String]) { 28 | 29 | for (ar <- args) { 30 | println(ar) 31 | } 32 | 33 | val env = args(0) 34 | this.init(env) 35 | 36 | this.characteristicsFile = args(1) 37 | 38 | this.calculate() 39 | 40 | } 41 | 42 | 43 | /** 44 | * 初始化 45 | * env: dev 开发环境, online 线上环境 46 | */ 47 | def init (env: String): Unit = { 48 | Conf.setEnv(env) 49 | 50 | // 连接 userUBCF 数据表 51 | this.hbaseResultTb = new HBaseClient("communityIBCF",Conf.getZookeeperQuorum()) 52 | } 53 | 54 | def calculate(): Unit = { 55 | 56 | /** 57 | * 初始化 spark 58 | */ 59 | val sparkConf = new SparkConf() 60 | sparkConf.setAppName("CommunityIBCF") 61 | sparkConf.setMaster("local[2]") 62 | 63 | /** 64 | * 初始化推荐模型 65 | */ 66 | val communityIBCF = new IBCF() 67 | 68 | // 合并累加 ItemAndItemMatrix 矩阵 69 | val itemAndItemMatrixCollection = communityIBCF.calculateByFile(sparkConf, characteristicsFile, separator) 70 | 71 | // 根据 
ItemId , groupBy ItemMatrix 72 | val itemAndItemGroup = communityIBCF.itemMatrixGroupByItemId(itemAndItemMatrixCollection) 73 | 74 | var communityLine = 0 // 推荐行数 75 | println("----- 把聚合后的数据格式化成字符串保存在 Hbase -----") 76 | itemAndItemGroup.foreach(line => { 77 | val invetoryId = line._1 78 | val invetoryRsInfo = line._2 79 | 80 | // 把里面的 array 按照:组合, 最外层按照,组合 81 | val invetoryRsToString = invetoryRsInfo.map(f => f.mkString(":")).mkString(",") 82 | 83 | this.communityRecommendWriteHbase(invetoryId, invetoryRsToString) 84 | 85 | communityLine += 1 86 | }) 87 | 88 | println("") 89 | println("----- HBase Table communityIBCF: ", 90 | "写入了: " + communityLine + " 行") 91 | 92 | } 93 | 94 | 95 | /** 96 | * 保存推荐结果到 Hbase 97 | */ 98 | def communityRecommendWriteHbase(rowKey: String, value: String): Unit = { 99 | this.hbaseResultTb.insert(rowKey, "recommend", "communityRecommend", value) 100 | } 101 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/extract/ExtractFileToKafka.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.extract 2 | 3 | import java.lang.{Runtime,Thread} 4 | import com.angejia.dw.common.util.{ListenerFile,ScFileUtil,FileUtil} 5 | import com.angejia.dw.hadoop.kafka.{KafkaProducer,KafkaConsumer} 6 | 7 | /** 8 | * 动态抽取日志到 kafka 9 | */ 10 | 11 | object ExtractFileToKafka { 12 | 13 | def main(args: Array[String]) { 14 | val zookeeperConnect = args(0) 15 | val kafkaBrokerList = args(1) 16 | val kafkaTopic = args(2) 17 | val kafkaTopicPartition = args(3) 18 | val kafkaConsumerGroupId = args(4) 19 | val listenerConfFile = args(5) 20 | val stepLength = args(6) 21 | 22 | val extractAccessLog = new ExtractFileToKafka 23 | extractAccessLog.zookeeperConnect = zookeeperConnect 24 | extractAccessLog.kafkaBrokerList = kafkaBrokerList 25 | extractAccessLog.kafkaTopic = kafkaTopic 26 | extractAccessLog.kafkaTopicPartition = kafkaTopicPartition 27 | extractAccessLog.kafkaConsumerGroupId = kafkaConsumerGroupId 28 | extractAccessLog.listenerConfFile = listenerConfFile 29 | extractAccessLog.stepLength = stepLength 30 | //extractAccessLog.stepLength = 1000.toString() 31 | extractAccessLog.runExtractAccessLog() 32 | } 33 | 34 | def runTest(): Unit = { 35 | val extractAccessLog = new ExtractFileToKafka 36 | extractAccessLog.zookeeperConnect = "dwtest:2181" 37 | extractAccessLog.kafkaBrokerList = "dwtest:9092" 38 | extractAccessLog.kafkaTopic = "accessLog" 39 | extractAccessLog.kafkaTopicPartition = "0" 40 | extractAccessLog.kafkaConsumerGroupId = "userPortrait" 41 | extractAccessLog.listenerConfFile = "/data/log/recommend/accesslog" 42 | extractAccessLog.runExtractAccessLog() 43 | } 44 | } 45 | 46 | /** 47 | * 等待完成功能 48 | * 1. 程序退出、失败,记录最后一次更新点 49 | * 2. 可以读取最后一次更新的文件,作为标记开始的日期 50 | * 3. 
当目标文件不存在(等待) 51 | */ 52 | class ExtractFileToKafka { 53 | 54 | // zookeeper 服务器 55 | var zookeeperConnect : String = null 56 | 57 | // kafka broker 服务器 58 | var kafkaBrokerList : String = null 59 | 60 | // kafka Topic 61 | var kafkaTopic : String = null 62 | 63 | // kafka Topic Partition 64 | var kafkaTopicPartition : String = null 65 | 66 | // kafka Consumer GroupId 67 | var kafkaConsumerGroupId : String = null 68 | 69 | // kafkaProducer 连接对象 70 | var kafkaProducer: KafkaProducer = null 71 | 72 | // kafkaConsumer 连接对象 73 | var kafkaConsumer: KafkaConsumer = null 74 | 75 | // 监听的配置文件 76 | var listenerConfFile: String = null 77 | 78 | // 每次读取行的长度 79 | var stepLength: String = null 80 | 81 | 82 | /** 83 | * 监听日志文件,并且发送日志到 kafka 中 84 | */ 85 | def runExtractAccessLog(): Unit = { 86 | // 读取配置文件 87 | val readFileConf = ScFileUtil.fileInputStream(this.listenerConfFile) 88 | val readFileConfArgs = readFileConf.split("\\s+") 89 | val lsFile = readFileConfArgs(0) // 监听文件 90 | val lsFileDate = readFileConfArgs(1) // 监听文件的日期 91 | val lsFileLineNum: String = readFileConfArgs(2) // 监听文件读到的行数 92 | 93 | val listenerFile = new ListenerFile() 94 | 95 | //listenerFile.listenerDateFile(lsFile, lsFileDate, lsFileLineNum.toInt, stepLength.toInt, this.readLogLine) 96 | listenerFile.listenerDateFileWhile(lsFile, lsFileDate, lsFileLineNum.toInt, stepLength.toInt ,this.readLogLine) 97 | } 98 | 99 | 100 | 101 | /** 102 | * 回调函数,发送日志到 Kafka 103 | */ 104 | def readLogLine(result: Map[String,Any]): Unit = { 105 | val file = result.get("file").get.toString() // 当前读的到的文件 106 | val fileTemplate = result.get("fileTemplate").get.toString() // 文件模板 107 | val date = result.get("date").get.toString() // 当前读到的文件日期 108 | val nextLineNum = result.get("nextLineNum").get.toString() // 下一次开始读取的行数 109 | val readLineCmd = result.get("readLineCmd").get.toString() // 当前读到的行数 110 | val fileLineContent = result.get("fileLineContent").get.toString() // 当前读到的文件内容 111 | 112 | val curLog = "NextReadFile: " + file + " " + date + " " + nextLineNum 113 | println(readLineCmd) 114 | println(curLog) 115 | //println(fileLineContent) 116 | 117 | if (fileLineContent.length() != 0) { 118 | // 发送日志到 kafka 119 | this.producerToKafka(fileLineContent) 120 | 121 | // 文件记录点 122 | val filePoint = fileTemplate + " " + date + " " + nextLineNum 123 | FileUtil.fileOutputStream(this.listenerConfFile, filePoint, false) 124 | 125 | // 记录一份到 本地日志 : 调试用 126 | //val localFile = "/var/log/ExtractFileToKafka/ExtractFileToKafka_" + date + ".log" 127 | //FileUtil.fileOutputStream(localFile, fileLineContent, true) 128 | 129 | println("filePoint: " + filePoint) 130 | } 131 | 132 | println("----------") 133 | 134 | } 135 | 136 | 137 | /** 138 | * 获取 KafkaProducer 连接对象 139 | */ 140 | def getKafkaProducerObj(): KafkaProducer = { 141 | if (this.kafkaProducer == null ) { 142 | this.kafkaProducer = new KafkaProducer(this.kafkaTopic,this.kafkaBrokerList) 143 | } 144 | this.kafkaProducer 145 | } 146 | 147 | 148 | /** 149 | * 获取 KafkaConsumer 连接对象 150 | */ 151 | def getConsumerKafka(): KafkaConsumer = { 152 | if (this.kafkaConsumer == null) { 153 | this.kafkaConsumer = new KafkaConsumer(this.kafkaTopic,this.kafkaConsumerGroupId,this.zookeeperConnect) 154 | } 155 | this.kafkaConsumer 156 | } 157 | 158 | 159 | /** 160 | * 发送数据给 Kafka 161 | */ 162 | def producerToKafka(content: String): Unit = { 163 | this.getKafkaProducerObj().send(content,this.kafkaTopicPartition) 164 | } 165 | 166 | 167 | 168 | /** 169 | * 消费数据 170 | */ 171 | def consumerKafka () : Unit = { 172 | 
this.getConsumerKafka().read(this.consumerData) 173 | } 174 | 175 | 176 | 177 | /** 178 | * 回调函数 179 | */ 180 | def consumerData(a: Array[Byte]): Unit = { 181 | println(a(0).toBinaryString) 182 | println(123) 183 | } 184 | 185 | } 186 | 187 | 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/inventory/InventoryIBCF.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.inventory 2 | 3 | import scala.collection.mutable.Map 4 | 5 | import org.apache.log4j.{Level, Logger} 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.SparkConf 8 | 9 | import com.angejia.dw.recommend.Conf 10 | import com.angejia.dw.hadoop.hbase.HBaseClient 11 | import com.angejia.dw.recommend.IBCF 12 | import com.angejia.dw.common.util.{FileUtil} 13 | 14 | /** 15 | * IBCF 算法实现 16 | * create 'inventoryIBCF',{NAME=>'baseInfo'},{NAME=>'recommend'} 17 | */ 18 | object InventoryIBCF { 19 | 20 | // hbase 数据表 21 | var hbaseResultTb: HBaseClient = null 22 | 23 | // 等待训练的数据文件 24 | var characteristicsFile: String = null 25 | 26 | // 文件分隔符 27 | var separator = "\t" 28 | 29 | def main (args: Array[String]) { 30 | 31 | for (ar <- args) { 32 | println(ar) 33 | } 34 | 35 | val env = args(0) 36 | this.init(env) 37 | 38 | this.characteristicsFile = args(1) 39 | //this.separator = " " 40 | //this.characteristicsFile = "/data/log/recommend/recommend_user_inventory_history/mytest" 41 | 42 | this.calculate() 43 | 44 | } 45 | 46 | 47 | /** 48 | * 初始化 49 | * env: dev 开发环境, online 线上环境 50 | */ 51 | def init (env: String): Unit = { 52 | Conf.setEnv(env) 53 | 54 | println(Conf.getZookeeperQuorum()) 55 | // 连接 userUBCF 数据表 56 | this.hbaseResultTb = new HBaseClient("inventoryIBCF",Conf.getZookeeperQuorum()) 57 | println(Conf.getZookeeperQuorum()) 58 | } 59 | 60 | 61 | def calculate(): Unit = { 62 | 63 | /** 64 | * 初始化 spark 65 | */ 66 | val sparkConf = new SparkConf() 67 | sparkConf.setAppName("InventoryIBCF") 68 | sparkConf.setMaster("local[2]") 69 | 70 | /** 71 | * 初始化推荐模型 72 | */ 73 | val inventoryIBCF = new IBCF() 74 | 75 | // 合并累加 ItemAndItemMatrix 矩阵 76 | val itemAndItemMatrixCollection = inventoryIBCF.calculateByFile(sparkConf, characteristicsFile, separator) 77 | 78 | // 根据 ItemId , groupBy ItemMatrix 79 | val itemAndItemGroup = inventoryIBCF.itemMatrixGroupByItemId(itemAndItemMatrixCollection) 80 | 81 | var inventoryLine = 0 // 推荐行数 82 | println("----- 把聚合后的数据格式化成字符串保存在 Hbase -----") 83 | itemAndItemGroup.foreach(line => { 84 | val invetoryId = line._1 85 | val invetoryRsInfo = line._2 86 | 87 | // 把里面的 array 按照:组合, 最外层按照,组合 88 | val invetoryRsToString = invetoryRsInfo.map(f => f.mkString(":")).mkString(",") 89 | 90 | this.inventoryRecommendWriteHbase(invetoryId, invetoryRsToString) 91 | 92 | inventoryLine += 1 93 | }) 94 | 95 | println("") 96 | println("----- HBase Table inventoryIBCF: ", 97 | "写入了: " + inventoryLine + " 行") 98 | 99 | } 100 | 101 | 102 | /** 103 | * 保存推荐结果到 Hbase 104 | */ 105 | def inventoryRecommendWriteHbase(rowKey: String, value: String): Unit = { 106 | this.hbaseResultTb.insert(rowKey, "recommend", "inventoryRecommend", value) 107 | } 108 | 109 | 110 | 111 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/inventory/InventoryIBCFspark.scala: -------------------------------------------------------------------------------- 1 | package 
com.angejia.dw.recommend.inventory 2 | 3 | import play.api.libs.json._ 4 | import org.apache.log4j.{Level, Logger} 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.SparkConf 7 | 8 | import com.angejia.dw.hadoop.spark.CollaborativeFiltering 9 | import com.angejia.dw.common.util.{FileUtil} 10 | import com.angejia.dw.hadoop.hbase.HBaseClient 11 | 12 | 13 | /** 14 | * 这个是 spark mlib 算法实现的,不靠谱呵呵哒 15 | * 16 | * create 'inventoryRecommend',{NAME=>'inventoryRecommendInventory'} 17 | 18 | 19 | spark-submit \ 20 | --name InventoryIBCF \ 21 | --class com.angejia.dw.recommend.inventory.InventoryIBCF \ 22 | --master local[2] \ 23 | ~/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar "DataNode01" "inventoryRecommend" "/data/log/recommend/ml-100k/u.data" 24 | 25 | 参数: 26 | [zookeeperIds] [HBaseTableName] [characteristicsFile] 27 | */ 28 | object InventoryIBCFspark { 29 | 30 | var zookeeperIds: String = null 31 | var HBaseTableName: String = null 32 | var HBaseClientService: HBaseClient = null 33 | var characteristicsFile: String = null 34 | 35 | def main(args: Array[String]) { 36 | 37 | for (ar <- args) { 38 | println(ar) 39 | } 40 | 41 | // Hbase 配置 42 | zookeeperIds = args(0) 43 | HBaseTableName = args(1) 44 | HBaseClientService = new HBaseClient(HBaseTableName,zookeeperIds) 45 | 46 | // 等待训练的文件 47 | characteristicsFile = args(2) 48 | //characteristicsFile = "/data/log/recommend/recommend_user_inventory_history/mytest" 49 | 50 | val inventoryIBCF = new InventoryIBCFspark() 51 | inventoryIBCF.characteristicsFile = characteristicsFile 52 | inventoryIBCF.run() 53 | } 54 | 55 | 56 | /** 57 | * 写 HBase 58 | */ 59 | def resultWriteHBase(rowKey: String, value: String) : Unit = { 60 | HBaseClientService.insert(rowKey, "inventoryRecommendInventory", "inventoryIds", value) 61 | } 62 | 63 | } 64 | 65 | /** 66 | * 看了又看算法 67 | */ 68 | class InventoryIBCFspark extends Serializable { 69 | 70 | // 提取特征的文件 71 | var characteristicsFile: String = null 72 | 73 | def run(): Unit = { 74 | Logger.getRootLogger.setLevel(Level.WARN) 75 | 76 | // 训练模型 77 | val inventoryTrainModel = this.inventoryTrain() 78 | } 79 | 80 | 81 | /** 82 | * 训练 inventory 模型 83 | */ 84 | //def inventoryTrain() : MatrixFactorizationModel = { 85 | def inventoryTrain() : Unit = { 86 | 87 | println("----- 开始初始化 -----") 88 | // SPARK 运行环境配置 89 | val conf = new SparkConf() 90 | conf.setAppName("InventoryIBCF") 91 | conf.setMaster("local[2]") 92 | 93 | conf.set("spark.cores.max", "4") // 16 map workers, that is 2 workers per machine (see my cluster config below) 94 | //conf.set("spark.akka.frameSize", "100000") 95 | conf.set("spark.driver.maxResultSize", "2g") 96 | conf.set("spark.executor.memory", "2g") 97 | conf.set("spark.reducer.maxMbInFlight", "100000") 98 | conf.set("spark.storage.memoryFraction", "0.9") 99 | conf.set("spark.shuffle.file.buffer.kb", "1000") 100 | conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.HttpBroadcastFactory") 101 | 102 | val sc = new SparkContext(conf) 103 | 104 | // 读取数据源 105 | val sourceDataRDD = sc.textFile(characteristicsFile) 106 | //println(sourceDataRDD.first()) 107 | 108 | // 把行分割成数组,并且读取数组前 3 个元素 ,格式化后作为入参 109 | val ratingsRDD = sourceDataRDD.map(line => { 110 | val curLine = line.split("\t").map { x => x.toInt} 111 | Array(curLine(0), curLine(1) , curLine(2)) 112 | }) 113 | ratingsRDD.take(2).foreach(x => println(x(0) + ":" + x(1) + ":" + x(2))) 114 | 115 | 116 | // 把需要推荐的房源 id 抽取出来,去重 117 | val needRecommendInventoryIdsRDD = ratingsRDD.map(_(1)).distinct() 118 | 
needRecommendInventoryIdsRDD.take(2).foreach(x => println(x)) 119 | 120 | 121 | // 算出所有房源相关度 (调试) 122 | //val inventoryIds = List(308524, 213775, 276360, 206754) 123 | //val needRecommendInventoryIdsRDD = sc.parallelize(inventoryIds) 124 | //needRecommendInventoryIdsRDD.take(4).foreach {x => println(x)} 125 | 126 | // 计算需要推荐的房源的次数 127 | val inventoryIdsRddCount = needRecommendInventoryIdsRDD.count().toInt 128 | println("共需要推荐: " + inventoryIdsRddCount) 129 | 130 | 131 | println("----- 提取特征 -----") 132 | // IBCF 算法类 133 | val collaborativeFiltering = new CollaborativeFiltering() 134 | 135 | // 提取特征 136 | val characteristicsRDD = collaborativeFiltering.characteristics(ratingsRDD) 137 | 138 | 139 | println("----- 训练模型 -----") 140 | // 训练模型 141 | collaborativeFiltering.train(characteristicsRDD, 50, 10, 0.01) 142 | // 广播变量 143 | val collaborativeFilteringSignPrefixesRdd = sc.broadcast(collaborativeFiltering) 144 | 145 | // 累加器 146 | val blankLines = sc.accumulator(0) 147 | 148 | 149 | println("----- inventoryId 计算推荐的 inventoryIds -----") 150 | val inventoryResInventorysRDD = needRecommendInventoryIdsRDD.map { inventoryId => 151 | val itemCosineSimilarity = collaborativeFilteringSignPrefixesRdd.value.itemCosineSimilarity(inventoryId) 152 | 153 | var result = "" 154 | itemCosineSimilarity.take(100).foreach{ inventroyRes => 155 | val inventoryRId = inventroyRes._1 // 推荐的房源 ID 156 | val inventoryRSouce = inventroyRes._2 // 推荐的房源 分数 157 | result += inventoryId + ":" + inventoryRId + ":" + inventoryRSouce + "," 158 | } 159 | blankLines += 1 160 | println(blankLines) 161 | println("wirete: " + result) 162 | // 结果写到 HBase 163 | //InventoryIBCF.resultWriteHBase(inventoryId.toString(),result.toString()) 164 | }.take(inventoryIdsRddCount) 165 | 166 | 167 | 168 | /** 169 | println("----- inventoryId 计算推荐的 inventoryIds -----") 170 | // 为每个 inventoryId 计算推荐的 inventoryIds 171 | val inventoryResInventorysRDD = needRecommendInventoryIdsRDD.map { inventoryId => 172 | 173 | /** 物品余弦相似度计算 174 | * RDD[(Int, Double)] 返回详细值: 175 | (1,0.537378279119025) 176 | (3,0.37167637258108627) 177 | (5,0.6282701874791976) 178 | */ 179 | val itemCosineSimilarity = collaborativeFilteringSignPrefixesRdd.value.itemCosineSimilarity(inventoryId) 180 | 181 | val id = inventoryId 182 | val result: Array[(Int, Double)] = itemCosineSimilarity.take(5000) 183 | 184 | val t: (Int,Array[(Int, Double)]) = (id, result) 185 | t 186 | } 187 | 188 | // 计算需要推荐的房源的次数 189 | val inventoryIdsRddCount = needRecommendInventoryIdsRDD.count().toInt 190 | 191 | 192 | println("----- 准备开始写入数据 -----") 193 | /** 格式化数据 194 | * 写入到 Hbase 中 195 | */ 196 | val data = inventoryResInventorysRDD.map(inventoryData => { 197 | val inventoryId = inventoryData._1 // 123234 这种数据结构s 198 | val ResInventorys = inventoryData._2 // (2,0.5074470833019032) 这种数据结构 199 | 200 | var result = "" 201 | ResInventorys.foreach{ tuple => 202 | val inventoryRId = tuple._1 // 推荐的房源 ID 203 | val inventoryRSouce = tuple._2 // 推荐的房源 分数 204 | result += inventoryId + ":" + inventoryRId + ":" + inventoryRSouce + "," 205 | } 206 | 207 | // 结果写到 HBase 208 | InventoryIBCF.resultWriteHBase(inventoryId.toString(),result.toString()) 209 | 210 | result 211 | }).take(inventoryIdsRddCount) 212 | **/ 213 | 214 | } 215 | 216 | 217 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/inventory/InventoryItemCF.scala: -------------------------------------------------------------------------------- 1 | package 
com.angejia.dw.recommend.inventory 2 | 3 | import scala.collection.mutable.Map 4 | 5 | import org.apache.log4j.{Level, Logger} 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.SparkConf 8 | 9 | import com.angejia.dw.hadoop.hbase.HBaseClient 10 | import com.angejia.dw.common.util.{FileUtil} 11 | 12 | /** 13 | * IBCF 算法实现 14 | 参数: 15 | [zookeeperIds] [HBaseTableName] [characteristicsFile] 16 | */ 17 | object InventoryItemCF { 18 | 19 | var zookeeperIds: String = null 20 | var HBaseTableName: String = null 21 | var HBaseClientService: HBaseClient = null 22 | var characteristicsFile: String = null 23 | 24 | def main (args: Array[String]) { 25 | 26 | for (ar <- args) { 27 | println(ar) 28 | } 29 | 30 | // Hbase 配置 31 | zookeeperIds = args(0) 32 | HBaseTableName = args(1) 33 | HBaseClientService = new HBaseClient(HBaseTableName,zookeeperIds) 34 | 35 | // 等待训练的文件 36 | characteristicsFile = args(2) 37 | //characteristicsFile = "/data/log/recommend/recommend_user_inventory_history/00000*" 38 | //characteristicsFile = "/data/log/recommend/recommend_user_inventory_history/mytest2" 39 | //characteristicsFile = "/data/log/recommend/recommend_user_inventory_history/000000_1" 40 | 41 | this.calculate() 42 | } 43 | 44 | 45 | /** 46 | * 算法逻辑 47 | */ 48 | def calculate() : Unit = { 49 | println("----- 初始化 -----") 50 | val conf = new SparkConf() 51 | conf.setAppName("InventoryIBCF") 52 | conf.setMaster("local[2]") 53 | 54 | val sc = new SparkContext(conf) 55 | 56 | println("----- 加载数据源: " + characteristicsFile + " -----") 57 | // 读取数据源 58 | val sourceDataRDD = sc.textFile(characteristicsFile) 59 | //println(sourceDataRDD.first()) 60 | 61 | 62 | println("----- 过滤数据源 -----") 63 | 64 | // 筛选过滤原始数据 65 | val filterSourceDataRDD = sourceDataRDD.filter { line => 66 | var checkStatus = true 67 | if (line.isEmpty()) { 68 | checkStatus = false 69 | } 70 | val lineArr = line.split("\t") 71 | if (lineArr.length < 2) { 72 | checkStatus = false 73 | } else { 74 | val userId = lineArr.apply(0) 75 | val inventoryId = lineArr.apply(1) 76 | if (userId.matches("[0-9]+") == false) { 77 | checkStatus = false 78 | } 79 | if (inventoryId.matches("[0-9]+") == false) { 80 | checkStatus = false 81 | } 82 | } 83 | checkStatus 84 | } 85 | 86 | println("----- 归并用户 item 集合 -----") 87 | // 用户喜欢 items 的集合 RDD 88 | val userLikeItemsCollectionRDD = filterSourceDataRDD.map { line => 89 | val lineArr = line.split("\t") 90 | val userId = lineArr.apply(0).toInt 91 | val inventoryId = lineArr.apply(1).toInt 92 | (userId, inventoryId) 93 | }.groupByKey() 94 | 95 | 96 | println("----- 用户物品集合生成矩阵 -----") 97 | // 用户的 Item 矩阵 B 98 | val userItemMatrixsRDD = userLikeItemsCollectionRDD.map{userAndItems => 99 | // 用户喜欢物品的集合 100 | val userItems = userAndItems._2 101 | 102 | /** 103 | * 数据结构 104 | Map("50:57"->1, "57:50"->1, "51:55"->1) 105 | */ 106 | // 保存用户,每个物品对的矩阵 B 107 | val userItemMatrix : Map[String,Int] = Map[String,Int]() 108 | 109 | // 二二配对当前用户的物品, 为每个用户,产出一个 B 矩阵 110 | for (i <- userItems) { 111 | for (j <- userItems) { 112 | // 排除相同的物品 113 | if (i != j) { 114 | // 默认为每个用户 +1 个访问次数 115 | //userItemMatrix += Map(i -> Map(j -> 1)) 116 | val key = i.toString() + ":" + j.toString() 117 | userItemMatrix.put(key ,1) 118 | } 119 | } 120 | } 121 | 122 | userItemMatrix 123 | } 124 | 125 | //userItemMatrixs.take(10) 126 | //exit() 127 | 128 | 129 | println("----- 合并所有物品矩阵 -----") 130 | // 合并最终的矩阵 131 | val itemAndItemMatriCollection = userItemMatrixsRDD.reduce{ (x, y) => 132 | 133 | var curMatrix = x 134 | var nextMatrix = y 135 | 136 
| /** 137 | * 目标 : 138 | * 1. 把 curMatrix 和 nextMatrix 相同 key 的值相加 139 | * 2. 把 nextMatrix 不在 curMatrix 中的原样追加到 curMatrix 140 | */ 141 | for ((yK, yV) <- nextMatrix) { 142 | 143 | if (curMatrix.contains(yK) == true) { 144 | curMatrix(yK) += nextMatrix(yK) 145 | } else { 146 | curMatrix.put(yK,yV) 147 | } 148 | } 149 | 150 | curMatrix 151 | } 152 | //exit() 153 | //println(itemAndItemMatrix.toBuffer) 154 | 155 | 156 | /** 把同类型的物品, 聚合到一起 157 | (51, 51:55:2, 51:52:2, 51:53:2, 51:56:1) 158 | (56, 56:53:1, 56:55:1, 56:52:1,56:51:1) 159 | */ 160 | 161 | println("----- 聚合同类型物品 -----") 162 | val itemAndItemGroup = itemAndItemMatriCollection.map( f => { 163 | val ids = f._1.split(":") 164 | val invetoryId = ids(0).toString() // 房源 ID 165 | val invetoryRsId = ids(1).toString() // 推荐房源 ID 166 | val invetoryRsIdCnt = f._2.toString() // 共同看过的人数 167 | // 转换成数组 168 | Array(invetoryId, invetoryRsId, invetoryRsIdCnt) 169 | //println(invetoryId, invetoryRsId , invetoryRsIdCnt) 170 | // 把 171 | }).groupBy { 172 | // 然后, 按照 invetoryId 把把同类的房源 ID groupBy 到一起 173 | f => f(0) 174 | } 175 | 176 | val blankLines = sc.accumulator(0) 177 | 178 | println("----- 把聚合后的数据格式化成字符串 -----") 179 | val itemAndItemGroupToStringRDD = itemAndItemGroup.map(line => { 180 | val invetoryId = line._1 181 | val invetoryRsInfo = line._2 182 | // 把里面的 array 按照:组合, 最外层按照,组合 183 | val invetoryRsToString = invetoryRsInfo.map(f => f.mkString(":")).mkString(",") 184 | 185 | blankLines += 1 186 | // println("write[" + blankLines + "]: " + invetoryId) 187 | //结果写到 Hbase (invetoryId, invetoryRsToString) 188 | resultWriteHBase(invetoryId, invetoryRsToString) 189 | }) 190 | 191 | println("") 192 | println("----- HBase Table inventoryRecommend: ", 193 | " inventoryRecommend 写入了: " + blankLines + " 行") 194 | 195 | //itemAndItemGroupToStringRDD 196 | /* 197 | //val blankLines = sc.accumulator(0) 198 | println("----- 写到 Hbase -----") 199 | var n = 0 200 | itemAndItemGroupToStringRDD.foreach(f => { 201 | n += 1 202 | println("write[" + n + "]: " + f._1) 203 | resultWriteHBase(f._1, f._2) 204 | }) 205 | 206 | 207 | */ 208 | } 209 | 210 | // 拆解 Map 返回 211 | def dismantlingMap (data: Map[Int,Int]): (Int, Int) = { 212 | val keys = data.keySet.toArray 213 | 214 | (keys(0), data.get(keys(0)).get) 215 | } 216 | 217 | 218 | def resultWriteHBase(rowKey: String, value: String) : Unit = { 219 | //FileUtil.fileOutputStream("/data/log/recommend/result",rowKey + "--" + value + "\n",true) 220 | HBaseClientService.insert(rowKey, "inventoryRecommendInventory", "inventoryIds", value) 221 | } 222 | 223 | 224 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/inventory/InventoryItemCFBak.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.inventory 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.SparkConf 6 | import scala.collection.mutable.Map 7 | import scala.collection.mutable.LinkedList 8 | import scala.collection.mutable.ArrayBuffer 9 | 10 | import com.angejia.dw.hadoop.hbase.HBaseClient 11 | import com.angejia.dw.common.util.{FileUtil} 12 | 13 | 14 | object InventoryItemCFBak { 15 | 16 | var zookeeperIds: String = null 17 | var HBaseTableName: String = null 18 | var HBaseClientService: HBaseClient = null 19 | var characteristicsFile: String = null 20 | 21 | def main (args: Array[String]) { 22 | 23 | for (ar <- args) { 24 | println(ar) 25 | } 26 | 27 | 
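        // A non-executed sketch of the nested-map pair matrix this backup variant
        // builds, unlike the flat "i:j" string keys used in InventoryItemCF. For a
        // user who viewed items 51 and 55, the intended per-user matrix is
        // (illustrative values):
        //
        //   val m: Map[Int, Map[Int, Int]] = Map(51 -> Map(55 -> 1), 55 -> Map(51 -> 1))
        //
        // Merging such maps correctly has to accumulate counts per (i, j) pair;
        // the put(i, Map(j -> 1)) used further down overwrites earlier j entries
        // for the same i, which is presumably why this version was kept only as a
        // backup.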
28 | 29 | // Hbase 配置 30 | zookeeperIds = args(0) 31 | HBaseTableName = args(1) 32 | //HBaseClientService = new HBaseClient(HBaseTableName,zookeeperIds) 33 | 34 | // 等待训练的文件 35 | characteristicsFile = args(2) 36 | characteristicsFile = "/data/log/recommend/recommend_user_inventory_history/00000*" 37 | //characteristicsFile = "/data/log/recommend/recommend_user_inventory_history/mytest2" 38 | 39 | this.calculate() 40 | } 41 | 42 | def calculate() : Unit = { 43 | println("----- 初始化 -----") 44 | val conf = new SparkConf() 45 | conf.setAppName("InventoryIBCF") 46 | conf.setMaster("local[2]") 47 | 48 | val sc = new SparkContext(conf) 49 | 50 | println("----- 加载数据源: " + characteristicsFile + " -----") 51 | // 读取数据源 52 | val sourceDataRDD = sc.textFile(characteristicsFile) 53 | //println(sourceDataRDD.first()) 54 | 55 | 56 | println("----- 归并用户 item 集合 -----") 57 | // 用户喜欢 items 的集合 RDD 58 | val userLikeItemsCollectionRDD = sourceDataRDD.map(line => { 59 | val curLine = line.split("\t").map { x => x.toInt} 60 | //println(curLine(0) + " " + curLine(1) + " " + curLine(2)) 61 | (curLine(0), curLine(1)) 62 | }).groupByKey() 63 | 64 | 65 | println("----- 用户物品集合生成矩阵 -----") 66 | // 用户的 Item 矩阵 B 67 | val userItemMatrixs = userLikeItemsCollectionRDD.map{userAndItems => 68 | val userItems = userAndItems._2 69 | 70 | /* 71 | val map : Map[Int,Map[Int,Int]] = Map[Int,Map[Int,Int]]() 72 | var am : Map[Int,Int] = Map[Int,Int]() 73 | val a = Map(1 -> 2) 74 | map.put(1,a) 75 | println(map(1)(1)) 76 | * */ 77 | 78 | // 保存用户,每个物品对的矩阵 B 79 | //var itemMatrix = ArrayBuffer[ArrayBuffer[Int]]() 80 | //var itemMatrix = Array.ofDim[Int](55,55) 81 | var userItemMatrix : Map[Int,Map[Int,Int]] = Map[Int,Map[Int,Int]]() 82 | 83 | // 二二配对当前用户的物品, 为每个用户,产出一个 B 矩阵 84 | for (i <- userItems) { 85 | for (j <- userItems) { 86 | // 排除相同的物品 87 | if (i != j) { 88 | // 默认为每个用户 +1 个访问次数 89 | userItemMatrix.put(i,Map(j -> 1)) 90 | } 91 | } 92 | } 93 | userItemMatrix 94 | } 95 | 96 | 97 | println("----- 合并用户矩阵 -----") 98 | 99 | // 合并最终的矩阵 100 | val itemAndItemMatrix = userItemMatrixs.reduce{ (x, y) => 101 | // var curRsMatrix : Map[Int,Map[Int,Int]] = Map[Int,Map[Int,Int]]() 102 | 103 | var curMatrix = x 104 | var nextMatrix = y 105 | 106 | // 合并 2 个 map ,把用户矩阵相加 107 | for ((yK, yV) <- nextMatrix) { 108 | val iKey = yK 109 | val tmp = yV.keySet.toArray 110 | val jKey = tmp(0).toInt 111 | 112 | // 当前 x 的 map key 和 // 子 map 的 key 相同 113 | if (x.contains(iKey) == true && x.get(iKey).get.contains(jKey) == true) { 114 | curMatrix(iKey)(jKey) += 1 115 | } else { 116 | // 追加矩阵 117 | curMatrix.put(iKey,yV) 118 | } 119 | //println("yK:" + yK + " yV:" + yV) 120 | } 121 | 122 | //println("-----") 123 | curMatrix 124 | } 125 | 126 | //println(itemAndItemMatrix.toBuffer) 127 | 128 | println("----- 写到 Hbase -----") 129 | itemAndItemMatrix.foreach{ f => 130 | val invetoryId = f._1 131 | val invetoryRsInfo = this.dismantlingMap(f._2) 132 | val invetoryRsId = invetoryRsInfo._1 133 | val invetoryRsIdCnt = invetoryRsInfo._2 134 | 135 | println(f) 136 | println(invetoryId , invetoryRsId , invetoryRsIdCnt) 137 | println("-----") 138 | val key = invetoryId.toString() + ":" 139 | val value = invetoryRsId.toString() + ":" + invetoryRsIdCnt.toString() 140 | this.resultWriteHBase(key,value) 141 | //this.resultWriteHBase() 142 | 143 | } 144 | 145 | //itemAndItemMatrix.foreach(f => println(f)) 146 | //itemAndItemMatrix.take(10) 147 | 148 | 149 | } 150 | 151 | 152 | // 拆解 Map 返回 153 | def dismantlingMap (data: Map[Int,Int]): (Int, Int) = { 154 | val keys = 
data.keySet.toArray 155 | 156 | (keys(0), data.get(keys(0)).get) 157 | } 158 | 159 | 160 | def resultWriteHBase(rowKey: String, value: String) : Unit = { 161 | //FileUtil.fileOutputStream("/data/log/recommend/result","",false) 162 | FileUtil.fileOutputStream("/data/log/recommend/result",rowKey + value + "\n",true) 163 | //HBaseClientService.insert(rowKey, "inventoryRecommendInventory", "inventoryIds", value) 164 | } 165 | 166 | 167 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/inventory/InventoryItemCFTest.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.inventory 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.SparkConf 6 | import scala.collection.mutable.Map 7 | 8 | import com.angejia.dw.hadoop.hbase.HBaseClient 9 | import com.angejia.dw.common.util.{FileUtil} 10 | 11 | /** 12 | * create 'inventoryRecommend',{NAME=>'inventoryRecommendInventory'} 13 | 14 | 15 | spark-submit \ 16 | --name InventoryIBCF \ 17 | --class com.angejia.dw.recommend.inventory.InventoryIBCF \ 18 | --master local[2] \ 19 | ~/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar "DataNode01" "inventoryRecommend" "/data/log/recommend/ml-100k/u.data" 20 | 21 | 参数: 22 | [zookeeperIds] [HBaseTableName] [characteristicsFile] 23 | */ 24 | object InventoryItemCFTest { 25 | 26 | var characteristicsFile: String = null 27 | 28 | def main (args: Array[String]) { 29 | 30 | for (ar <- args) { 31 | println(ar) 32 | } 33 | 34 | 35 | // 等待训练的文件 36 | characteristicsFile = args(0) 37 | //characteristicsFile = "/data/log/recommend/recommend_user_inventory_history/00000*" 38 | 39 | this.calculate() 40 | } 41 | 42 | 43 | /** 44 | * 算法逻辑 45 | */ 46 | def calculate() : Unit = { 47 | println("----- 初始化 -----") 48 | val conf = new SparkConf() 49 | conf.setAppName("InventoryIBCF") 50 | conf.setMaster("local[2]") 51 | 52 | val sc = new SparkContext(conf) 53 | 54 | println("----- 加载数据源: " + characteristicsFile + " -----") 55 | // 读取数据源 56 | val sourceDataRDD = sc.textFile(characteristicsFile) 57 | //println(sourceDataRDD.first()) 58 | 59 | 60 | println("----- 归并用户 item 集合 -----") 61 | // 用户喜欢 items 的集合 RDD 62 | val userLikeItemsCollectionRDD = sourceDataRDD.map(line => { 63 | val curLine = line.split("\t").map { x => x.toInt} 64 | (curLine(0), curLine(1)) 65 | }).groupByKey() 66 | 67 | 68 | println("----- 用户物品集合生成矩阵 -----") 69 | // 用户的 Item 矩阵 B 70 | val userItemMatrixs = userLikeItemsCollectionRDD.map{userAndItems => 71 | // 用户喜欢物品的集合 72 | val userItems = userAndItems._2 73 | 74 | /** 75 | * 数据结构 76 | Map("50:57"->1, "57:50"->1, "51:55"->1) 77 | */ 78 | // 保存用户,每个物品对的矩阵 B 79 | val userItemMatrix : Map[String,Int] = Map[String,Int]() 80 | 81 | // 二二配对当前用户的物品, 为每个用户,产出一个 B 矩阵 82 | for (i <- userItems) { 83 | for (j <- userItems) { 84 | // 排除相同的物品 85 | if (i != j) { 86 | // 默认为每个用户 +1 个访问次数 87 | //userItemMatrix += Map(i -> Map(j -> 1)) 88 | val key = i.toString() + ":" + j.toString() 89 | userItemMatrix.put(key ,1) 90 | } 91 | } 92 | } 93 | 94 | userItemMatrix 95 | } 96 | 97 | //userItemMatrixs.take(10) 98 | //exit() 99 | 100 | 101 | println("----- 合并所有用户物品矩阵 -----") 102 | // 合并最终的矩阵 103 | val itemAndItemMatrixRDD = userItemMatrixs.reduce{ (x, y) => 104 | 105 | var curMatrix = x 106 | var nextMatrix = y 107 | 108 | /** 109 | * 目标 : 110 | * 1. 把 curMatrix 和 nextMatrix 相同 key 的值相加 111 | * 2. 
把 nextMatrix 不在 curMatrix 中的原样追加到 curMatrix 112 | */ 113 | for ((yK, yV) <- nextMatrix) { 114 | 115 | if (curMatrix.contains(yK) == true) { 116 | curMatrix(yK) += nextMatrix(yK) 117 | } else { 118 | curMatrix.put(yK,yV) 119 | } 120 | } 121 | 122 | curMatrix 123 | } 124 | //exit() 125 | //println(itemAndItemMatrix.toBuffer) 126 | 127 | 128 | /** 把同类型的物品, 聚合到一起 129 | (51, 51:55:2, 51:52:2, 51:53:2, 51:56:1) 130 | (56, 56:53:1, 56:55:1, 56:52:1,56:51:1) 131 | */ 132 | 133 | println("----- 聚合同类型物品 -----") 134 | val itemAndItemGroupRDD = itemAndItemMatrixRDD.map( f => { 135 | val ids = f._1.split(":") 136 | val invetoryId = ids(0).toString() // 房源 ID 137 | val invetoryRsId = ids(1).toString() // 推荐房源 ID 138 | val invetoryRsIdCnt = f._2.toString() // 共同看过的人数 139 | // 转换成数组 140 | Array(invetoryId, invetoryRsId, invetoryRsIdCnt) 141 | //println(invetoryId, invetoryRsId , invetoryRsIdCnt) 142 | // 把 143 | }).groupBy { 144 | // 然后, 按照 invetoryId 把把同类的房源 ID groupBy 到一起 145 | f => f(0) 146 | } 147 | 148 | val blankLines = sc.accumulator(0) 149 | 150 | println("----- 把聚合后的数据格式化成字符串 -----") 151 | val itemAndItemGroupToStringRDD = itemAndItemGroupRDD.map(line => { 152 | val invetoryId = line._1 153 | val invetoryRsInfo = line._2 154 | // 把里面的 array 按照:组合, 最外层按照,组合 155 | val invetoryRsToString = invetoryRsInfo.map(f => f.mkString(":")).mkString(",") 156 | 157 | blankLines += 1 158 | println("write[" + blankLines + "]: " + invetoryId) 159 | 160 | }) 161 | 162 | 163 | } 164 | 165 | 166 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/inventory/portrait/InventoryPortraitCommon.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.inventory.portrait 2 | 3 | import scala.collection.mutable.HashMap 4 | import scala.collection.mutable.Map 5 | 6 | import com.angejia.dw.common.util.mysql.MysqlClient 7 | import com.angejia.dw.recommend.user.portrait.UserPortraitCommon 8 | import com.angejia.dw.recommend.user.portrait.UserPortraitTags 9 | 10 | object InventoryPortraitCommon { 11 | 12 | // mysql 数据库连接对象 13 | var mysqlClient: MysqlClient = null 14 | 15 | /** 16 | * 通过房源 Id 获取房源画像基础数据 17 | */ 18 | private def getInventoryPortraitByInventoryId(inventoryId: String): HashMap[String, String] = { 19 | var querySql = sqlStmt.format(inventoryId.toInt) 20 | val res = mysqlClient.select(querySql) 21 | val result = new HashMap[String, String]() 22 | if (!res.isEmpty) { 23 | for ((k, v) <- res(0)) { 24 | result.put(k, v.toString) 25 | } 26 | } 27 | result 28 | } 29 | 30 | /** 31 | * 通过房源 Id 获取, 获取标签code 与房源属性的 Mapping 数据 32 | */ 33 | def getUserTagsInventoryMappingByInventoryId(inventoryId: String): Map[String, String] = { 34 | val rs = Map[String, String]() 35 | 36 | val inventoryPortrait = this.getInventoryPortraitByInventoryId(inventoryId) 37 | if (!inventoryPortrait.isEmpty) { 38 | val cityId = inventoryPortrait.getOrElse("city_id", "0") 39 | rs.put(UserPortraitCommon.cityTagCode, cityId) 40 | 41 | val districtId = inventoryPortrait.getOrElse("district_id", "0") 42 | rs.put(UserPortraitCommon.districtTagCode, districtId) 43 | 44 | val blockId = inventoryPortrait.getOrElse("block_id", "0") 45 | rs.put(UserPortraitCommon.blockTagCode, blockId) 46 | 47 | val communityId = inventoryPortrait.getOrElse("community_id", "0") 48 | rs.put(UserPortraitCommon.communityTagCode, communityId) 49 | 50 | val bedrooms = inventoryPortrait.getOrElse("bedrooms", "0") 51 | 
rs.put(UserPortraitCommon.bedroomsTagCode, bedrooms) 52 | 53 | // 价格转换为价格段 54 | val price = inventoryPortrait.getOrElse("price", "0") 55 | val priceTierId = UserPortraitTags.getPriceTier(price) 56 | rs.put(UserPortraitCommon.priceTagCode, priceTierId) 57 | } 58 | 59 | rs 60 | } 61 | 62 | val sqlStmt = """ 63 | SELECT 64 | community.city_id AS city_id 65 | ,community.district_id AS district_id 66 | ,community.block_id AS block_id 67 | ,house.community_id AS community_id 68 | ,inventory.id AS inventory_id 69 | ,inventory.price AS price 70 | ,inventory.area AS area 71 | ,inventory.is_real AS is_real 72 | ,inventory.survey_status AS survey_status 73 | ,inventory.source AS source 74 | ,inventory.has_checked AS has_checked 75 | ,inventory.created_at AS created_at 76 | ,inventory.updated_at AS updated_at 77 | ,inventory.verify_status AS verify_status 78 | ,inventory.status AS status 79 | ,house.orientation AS orientation 80 | ,property.bedrooms AS bedrooms 81 | ,house.floor AS floor 82 | ,house.total_floors AS total_floors 83 | FROM 84 | property.inventory AS inventory 85 | LEFT JOIN property.property AS property on inventory.property_id = property.id 86 | LEFT JOIN property.house AS house on property.house_id = house.id 87 | LEFT JOIN angejia.community AS community on house.community_id = community.id 88 | WHERE inventory.id = %d 89 | """ 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/inventory/portrait/MarketingInventoryPortrait.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.inventory.portrait 2 | 3 | import scala.collection.mutable.Map 4 | import scala.collection.mutable.HashMap 5 | import com.angejia.dw.hadoop.hbase.HBaseClient 6 | 7 | import com.angejia.dw.recommend.user.portrait.UserPortraitCommon 8 | import com.angejia.dw.recommend.user.portrait.UserPortraitTags 9 | import com.angejia.dw.common.util.mysql.MysqlClient 10 | 11 | object MarketingInventoryPortrait { 12 | /** 13 | * 通过房源 Id 获取房源画像基础数据 14 | */ 15 | private def getInventoryPortraitByInventoryId(inventoryId: String): HashMap[String, String] = { 16 | var querySql = sqlStmt.format(inventoryId.toInt) 17 | val res = UserPortraitCommon.mysqlClient.select(querySql) 18 | val result = new HashMap[String, String]() 19 | if (!res.isEmpty) { 20 | for ((k, v) <- res(0)) { 21 | result.put(k, v.toString) 22 | } 23 | } 24 | result 25 | } 26 | 27 | /** 28 | * 通过房源 Id 获取, 获取标签code 与房源属性的 Mapping 数据 29 | */ 30 | def getUserTagsInventoryMappingByInventoryId(inventoryId: String): Map[String, String] = { 31 | val rs = Map[String, String]() 32 | 33 | val inventoryPortrait = this.getInventoryPortraitByInventoryId(inventoryId) 34 | if (!inventoryPortrait.isEmpty) { 35 | val cityId = inventoryPortrait.getOrElse("city_id", "0") 36 | rs.put(UserPortraitCommon.cityTagCode, cityId) 37 | 38 | val districtId = inventoryPortrait.getOrElse("district_id", "0") 39 | rs.put(UserPortraitCommon.districtTagCode, districtId) 40 | 41 | val blockId = inventoryPortrait.getOrElse("block_id", "0") 42 | rs.put(UserPortraitCommon.blockTagCode, blockId) 43 | 44 | val communityId = inventoryPortrait.getOrElse("community_id", "0") 45 | rs.put(UserPortraitCommon.communityTagCode, communityId) 46 | 47 | val bedrooms = inventoryPortrait.getOrElse("bedrooms", "0") 48 | rs.put(UserPortraitCommon.bedroomsTagCode, bedrooms) 49 | 50 | // 价格转换为价格段 51 | val price = inventoryPortrait.getOrElse("price", "0") 52 | val priceTierId 
= UserPortraitTags.getPriceTier(price) 53 | rs.put(UserPortraitCommon.priceTagCode, priceTierId) 54 | } 55 | 56 | rs 57 | } 58 | 59 | val sqlStmt = """ 60 | SELECT 61 | city_id 62 | , district_id 63 | , block_id 64 | , community_id 65 | , id 66 | , price 67 | , area 68 | , publish_time AS created_at 69 | , orientation 70 | , bedrooms 71 | , floor 72 | , total_floors 73 | FROM angejia.marketing_inventory 74 | WHERE id = %d 75 | """ 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/user/UserUBCF.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.user 2 | 3 | import scala.collection.mutable.Map 4 | import scala.collection.mutable.HashMap 5 | import scala.collection.mutable.ListBuffer 6 | 7 | import org.apache.log4j.{ Level, Logger } 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.SparkConf 10 | 11 | import com.angejia.dw.recommend.Conf 12 | import com.angejia.dw.hadoop.hbase.HBaseClient 13 | import com.angejia.dw.recommend.UBCF 14 | 15 | /** 16 | * UBCF 算法实现 17 | * 18 | * create 'userUBCF',{NAME=>'relation'},{NAME=>'recommend'},{NAME=>'baseInfo'} 19 | * relation 用户关系 20 | * recommend 用户推荐 21 | */ 22 | object UserUBCF { 23 | 24 | // hbase 数据表 25 | var hbaseResultTb: HBaseClient = null 26 | 27 | // 等待训练的数据文件 28 | var characteristicsFile: String = null 29 | 30 | // 文件分隔符 31 | var separator = "\t" 32 | 33 | def main(args: Array[String]) { 34 | 35 | for (ar <- args) { 36 | println(ar) 37 | } 38 | 39 | val env = args(0) 40 | this.init(env) 41 | 42 | this.characteristicsFile = args(1) 43 | 44 | //this.separator = " " 45 | //this.characteristicsFile = "/data/log/recommend/recommend_user_inventory_history/mytest" 46 | 47 | this.calculate() 48 | } 49 | 50 | /** 51 | * 初始化 52 | * env: dev 开发环境, online 线上环境 53 | */ 54 | def init(env: String): Unit = { 55 | Conf.setEnv(env) 56 | 57 | // 连接 userUBCF 数据表 58 | this.hbaseResultTb = new HBaseClient("userUBCF", Conf.getZookeeperQuorum()) 59 | } 60 | 61 | def calculate(): Unit = { 62 | /** 63 | * 初始化 spark 64 | */ 65 | val sparkConf = new SparkConf() 66 | sparkConf.setAppName("UserUBCF") 67 | sparkConf.setMaster("local[2]") 68 | 69 | /** 70 | * 初始化推荐模型 71 | */ 72 | val userUBCF = new UBCF() 73 | 74 | // user -> uesr 相似度矩阵 75 | val userAndUserMatrixCollection = userUBCF.calculateByFile(sparkConf, characteristicsFile, separator) 76 | 77 | // 根据 userId , groupBy userRelationMatrix 78 | val userRelationGroup = userUBCF.userRelationMatrixGroupByUserId(userAndUserMatrixCollection) 79 | 80 | // user -> items 集合 81 | val userItemsCollectionRDD = userUBCF.userAndItemsCollection() 82 | 83 | println("----- userId -> items 集合本地化 ") 84 | val userItemsCollectionMap: scala.collection.immutable.Map[String, Iterable[(String, Int)]] = userItemsCollectionRDD.collect().toMap 85 | 86 | println("----- Start : 基于 userRelationGroup 集合, 持久化数据到 Hbase ----- \n") 87 | 88 | println("----- user 关联结果, user 推荐结果, 持久化到 Hbase Table ") 89 | var userRelationMatrixLineNum = 0 // user 关系矩阵行数 90 | var userRecommendLineNum = 0 // user 推荐行数 91 | 92 | // 遍历 user -> users 相似度集合 93 | userRelationGroup.foreach { userRelationInfo => 94 | 95 | val curUserId = userRelationInfo._1 // userId 96 | val curUserRelationUsers = userRelationInfo._2 // 关联的 users 集合 97 | 98 | // ---------- Start user -> user 集合关系保存到 Hbase ---------- 99 | 100 | val userRelationToString = curUserRelationUsers.map(userRelation => 
userRelation.mkString(":")).mkString(",") 101 | this.userRelationMatrixWriteHbase(curUserId, userRelationToString) 102 | 103 | // ---------- End user -> user 集合关系保存到 Hbase ---------- 104 | 105 | // ---------- Start 为 user 推荐 items ---------- 106 | 107 | // 保存当前 user 最终推荐的 items 108 | val userRecommendItemsRs = ListBuffer[Array[String]]() 109 | //val userRecommendItems = ListBuffer[Map[String,String]]() 110 | 111 | // 当前 user 已经推荐的 item Ids , 用来保存已经存在 itemIds 112 | //val userRecommendItemsPool = ListBuffer[String]() 113 | val userRecommendItemsPool = Map[String, String]() 114 | 115 | // 当前 user 自身的 items 116 | val curUserItems = userItemsCollectionMap.getOrElse(curUserId, null) 117 | if (curUserItems != null) { 118 | curUserItems.foreach { itemInfo => 119 | val itemId = itemInfo._1 120 | val itemPf = itemInfo._2 121 | 122 | // 组合推荐结果 123 | val rsInfo = Array( 124 | curUserId, // 当期 userId 125 | "0", // 关联 userId (因为是自身的所以用 0 表示) 126 | "0", // 关联 user 相似度分数 (因为是自身的所以用 0 表示) 127 | itemId.toString(), // 关联 user ItemId 128 | itemPf.toString() // 关联 user item 喜欢次数 129 | ) 130 | 131 | userRecommendItemsRs.append(rsInfo) 132 | } 133 | } 134 | 135 | // 当前 user Relation user 下的 items 集合, 把当前 user items 不存在的 item 追加进去, 最终汇总后作为推荐结果 136 | curUserRelationUsers.foreach { relationUserInfo => 137 | val relationUserId = relationUserInfo.apply(1) // 关联 uesrId 138 | val relationUserPf = relationUserInfo.apply(2) // 关联 user 的相似度分数 139 | 140 | // 相似度分数大于 1 才会推荐 141 | if (relationUserPf.toInt > 1) { 142 | // 关联 user 的 items 143 | val relationUserItems = userItemsCollectionMap.getOrElse(relationUserId, null) 144 | if (relationUserItems != null) { 145 | 146 | relationUserItems.foreach { itemInfo => 147 | val itemId = itemInfo._1 148 | val itemPf = itemInfo._2 149 | 150 | // 若推荐的 items 已经存在, 则不推荐了 151 | if (!userRecommendItemsPool.contains(itemId.toString())) { 152 | // 组合推荐结果 153 | val rsInfo = Array( 154 | curUserId, // 当期 userId 155 | relationUserId, // 关联 userId 156 | relationUserPf, // 关联 user 相似度分数 157 | itemId.toString(), // 关联 user ItemId 158 | itemPf.toString() // 关联 user item 喜欢次数 159 | ) 160 | 161 | // userRecommendItemsPool.append(itemId.toString()) 162 | userRecommendItemsPool.put(itemId.toString(), "exist") 163 | 164 | userRecommendItemsRs.append(rsInfo) 165 | } 166 | // println(rsInfo.toBuffer) 167 | } 168 | } 169 | } 170 | } 171 | 172 | // 当前推荐结果转换成 字符串 173 | val userRecommendItemsToString = userRecommendItemsRs.map(recommendItemInfo => recommendItemInfo.mkString(":")).mkString(",") 174 | // println(curUserId, userRecommendItemsRs.size) 175 | 176 | // userRecommendItemsRs.foreach{ f => println(f.toBuffer) } 177 | this.userRecommendWriteHbase(curUserId, userRecommendItemsToString) 178 | 179 | // ---------- End 为 user 推荐 items ---------- 180 | if (!userRecommendItemsRs.isEmpty) userRecommendLineNum += 1 181 | userRelationMatrixLineNum += 1 182 | } 183 | println("") 184 | println("----- HBase Table userUBCF: ", 185 | " userRelationMatrixLineNum 写入了: " + userRelationMatrixLineNum + " 行", 186 | " userRecommendLineNum 写入了: " + userRecommendLineNum + " 行") 187 | println("") 188 | 189 | println("----- End : 基于 userRelationGroup 集合, 持久化数据到 Hbase -----") 190 | 191 | } 192 | 193 | /** 194 | * uesr 关系矩阵, 写入到 Hbase 195 | */ 196 | def userRelationMatrixWriteHbase(rowKey: String, value: String): Unit = { 197 | this.hbaseResultTb.insert(rowKey, "relation", "userRelation", value) 198 | } 199 | 200 | /** 201 | * 保存推荐结果到 Hbase 202 | */ 203 | def userRecommendWriteHbase(rowKey: String, value: String): Unit = { 204 | 
this.hbaseResultTb.insert(rowKey, "recommend", "userRecommend", value) 205 | } 206 | 207 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/user/portrait/UserPortraitBrowse.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.user.portrait 2 | 3 | import scala.collection.mutable.ArrayBuffer 4 | import scala.collection.mutable.HashMap 5 | import scala.collection.mutable.Map 6 | 7 | import com.angejia.dw.common.util.DateUtil 8 | import com.angejia.dw.common.util.RegexUtil 9 | 10 | /** 11 | * 浏览房源数据行为 12 | * 根据 URL 找出 房源 Id 13 | * 1. 查找 Hbase 房源 ID 的,城市,区域,板块,小区,户型,所属的价格区间段 14 | * 2. 读取 Hbase 中的以上标签原始 Json 数据 15 | * 3. 对找出的标签进行 浏览加分后写回 Hbase 16 | */ 17 | object UserPortraitBrowse { 18 | 19 | val actionName = "Browse" 20 | 21 | // 当前处理的 userId 22 | var userId: String = new String() 23 | 24 | def setUserId(userId: String): Unit = { 25 | this.userId = userId 26 | } 27 | 28 | def getUserId(): String = { 29 | this.userId 30 | } 31 | 32 | // 请求的 URI 33 | var requestUri = new String() 34 | def setRequestUri(uri: String): Unit = { 35 | this.requestUri = uri 36 | } 37 | def getRequestUri(): String = { 38 | this.requestUri 39 | } 40 | 41 | var inventoryId = new String() 42 | 43 | // 房源单页 uri 正则匹配 44 | val browseRegex = """^/mobile/member/(?:inventories|inventory/detail)/\d+/(\d+)""" 45 | 46 | /** 47 | * 本行为中用到的用户标签 48 | */ 49 | var userTags = Map[String, String]() 50 | 51 | def run(): String = { 52 | // 推荐状态 53 | var reStatus = "no" 54 | 55 | // 清空 56 | this.inventoryId = "" 57 | 58 | /** 59 | * 解析出 Url 中的article id 60 | */ 61 | val articleId = RegexUtil.findStrData(this.browseRegex, this.getRequestUri()) 62 | 63 | if (articleId.isEmpty() || articleId == "") return reStatus 64 | 65 | /* 66 | * 根据article获取inventory id和resource 67 | * 若resource为1,为安个家二手房 68 | * 若resource为2,则为营销房源 69 | */ 70 | val sql = "SELECT inventory_id,resource " + 71 | "FROM angejia.article " + 72 | "WHERE id = " + articleId 73 | val article: ArrayBuffer[HashMap[String, Any]] = UserPortraitCommon.mysqlClient.select(sql) 74 | 75 | if (article.length != 1) { 76 | return reStatus 77 | } 78 | 79 | inventoryId = article(0).getOrElse("inventory_id", "").toString 80 | val resource = article(0).getOrElse("resource", "").toString 81 | 82 | this.userTags = Map[String, String]( 83 | UserPortraitCommon.cityTagCode -> new String(), 84 | UserPortraitCommon.districtTagCode -> new String(), 85 | UserPortraitCommon.blockTagCode -> new String(), 86 | UserPortraitCommon.communityTagCode -> new String(), 87 | UserPortraitCommon.bedroomsTagCode -> new String(), 88 | UserPortraitCommon.priceTagCode -> new String()) 89 | 90 | if (resource == "1") { 91 | // 安个家房源 92 | println(DateUtil.getCurTime(DateUtil.SIMPLE_FORMAT) + "|" 93 | + getUserId() + ": UserPortraitBrowse", inventoryId, this.getRequestUri(), "angejia") 94 | this.updateUserNeedsByInventoryId(inventoryId) 95 | this.scoreByInventoryId(inventoryId) 96 | } else { 97 | // 营销房源 98 | println(DateUtil.getCurTime(DateUtil.SIMPLE_FORMAT) + "|" 99 | + getUserId() + ": UserPortraitBrowse", inventoryId, this.getRequestUri(), "marketing") 100 | this.updateUserNeedsByMarketingInventoryId(inventoryId) 101 | this.scoreByMarketingInventoryId(inventoryId) 102 | } 103 | 104 | reStatus = "yes" 105 | reStatus 106 | } 107 | 108 | /** 109 | * 一组标签进行合并(安个家房源) 110 | */ 111 | private def updateUserNeedsByInventoryId(inventoryId: String): Unit = { 112 | val inventoryIds = 
Array(inventoryId) 113 | 114 | // 合并 115 | UserPortraitNeeds.setUserId(this.getUserId()) 116 | UserPortraitNeeds.userNeedsMergeByInventoryIds(inventoryIds) 117 | } 118 | 119 | /** 120 | * 一组标签进行合并(营销房源) 121 | */ 122 | private def updateUserNeedsByMarketingInventoryId(inventoryId: String): Unit = { 123 | val inventoryIds = Array(inventoryId) 124 | 125 | // 合并 126 | UserPortraitNeeds.setUserId(this.getUserId()) 127 | UserPortraitNeeds.userNeedsMergeByMarketingInventoryIds(inventoryIds) 128 | } 129 | 130 | /** 131 | * 打分入口(安个家房源) 132 | */ 133 | def scoreByInventoryId(inventoryId: String): Unit = { 134 | UserPortraitTags.setUserId(this.getUserId()) 135 | UserPortraitTags.scoreByInventoryIdAndAction(inventoryId, this.actionName) 136 | } 137 | 138 | /** 139 | * 打分入口(营销房源) 140 | */ 141 | def scoreByMarketingInventoryId(inventoryId: String): Unit = { 142 | UserPortraitTags.setUserId(this.getUserId()) 143 | UserPortraitTags.scoreByMarketingInventoryIdAndAction(inventoryId, this.actionName) 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/user/portrait/UserPortraitCommon.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.user.portrait 2 | 3 | import scala.collection.mutable.Map 4 | import scala.collection.mutable.HashMap 5 | 6 | import javax.crypto.Cipher 7 | import javax.crypto.spec.IvParameterSpec 8 | import javax.crypto.spec.SecretKeySpec 9 | import sun.misc.BASE64Decoder 10 | 11 | import com.angejia.dw.hadoop.hbase.HBaseClient 12 | import com.angejia.dw.common.util.mysql.MysqlClient 13 | import com.angejia.dw.hadoop.hive.HiveClient 14 | 15 | import com.angejia.dw.common.util.JsonUtil 16 | 17 | object UserPortraitCommon { 18 | 19 | // mysql 数据库连接对象 20 | var mysqlClient: MysqlClient = null 21 | 22 | // spark 通过 thriftServer 连接 hive 数据仓库的对象 23 | var sparkHiveClient: HiveClient = null 24 | 25 | // 用户画像表连接对象 26 | var userPortraitTable: HBaseClient = null 27 | 28 | // 用户画像标签 列族 29 | val TagColumnFamily = "tags" 30 | 31 | // 用户画像维度 列族 32 | val DimColumnFamily = "dimension" 33 | 34 | // 用户画像需求 列族 35 | val NeedsColumnFamily = "needs" 36 | 37 | // 用户画像推荐 列族(保存推荐信息) 38 | val ModelStateColumnFamily = "modelState" 39 | 40 | // 标签配置! 
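    // Non-executed sketch of the assumed shape of one tag config entry in
    // UserPortraitTagConf (the key names "TagCode" and "filterScore" appear
    // elsewhere in this package; the concrete values here are illustrative only):
    //
    //   val CITY_TAG = Map(
    //       "TagCode"     -> "cityId", // HBase column qualifier for the tag
    //       "filterScore" -> "1"       // score granted by a filter action
    //   )
    //
    // The vals below unpack "TagCode" from each config map, plus the id-to-value
    // mapping tables for the bedrooms and price tags.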
41 |     // city
42 |     val cityTagConf = UserPortraitTagConf.CITY_TAG // tag config
43 |     val cityTagCode = cityTagConf.getOrElse("TagCode", "") // tag code
44 | 
45 |     // district
46 |     val districtTagConf = UserPortraitTagConf.DISTRICT_TAG
47 |     val districtTagCode = districtTagConf.getOrElse("TagCode", "")
48 | 
49 |     // block
50 |     val blockTagConf = UserPortraitTagConf.BLOCK_TAG
51 |     val blockTagCode = blockTagConf.getOrElse("TagCode", "")
52 | 
53 |     // community
54 |     val communityTagConf = UserPortraitTagConf.COMMUNITY_TAG
55 |     val communityTagCode = communityTagConf.getOrElse("TagCode", "")
56 | 
57 |     // bedrooms (floor plan)
58 |     val bedroomsTagConf = UserPortraitTagConf.BEDROOMS_TAG
59 |     val bedroomsTagCode = bedroomsTagConf.getOrElse("TagCode", "").toString()
60 |     // mapping from bedroom bucket id to bedroom count
61 |     val bedroomsType = bedroomsTagConf.get("bedroomsType").get.asInstanceOf[collection.Map[String, String]]
62 | 
63 |     // price
64 |     val priceTagConf = UserPortraitTagConf.PRICE_TAG
65 |     val priceTagCode = priceTagConf.getOrElse("TagCode", "").toString()
66 |     // price tiers
67 |     val priceTier = priceTagConf.get("PriceTier").get.asInstanceOf[collection.Map[String, String]]
68 |     // mapping from price bucket id to price tier
69 |     val priceTierType = priceTagConf.get("PriceTierType").get.asInstanceOf[collection.Map[String, String]]
70 | 
71 |     /**
72 |      * Fetch the "tags" column family of a user's portrait
73 |      * userId: user ID
74 |      * returns one column per tag code
75 |      */
76 |     def getUserPortraitTagsByUserId(userId: String): HashMap[String, String] = {
77 | 
78 |         val hbaseData: HashMap[String, String] = UserPortraitCommon.userPortraitTable.select(
79 |             userId,
80 |             UserPortraitCommon.TagColumnFamily,
81 |             Array(UserPortraitCommon.cityTagCode,
82 |                 UserPortraitCommon.districtTagCode,
83 |                 UserPortraitCommon.blockTagCode,
84 |                 UserPortraitCommon.communityTagCode,
85 |                 UserPortraitCommon.bedroomsTagCode,
86 |                 UserPortraitCommon.priceTagCode))
87 | 
88 |         hbaseData
89 |     }
90 | 
91 |     /**
92 |      * Fetch the "dimension" column family of a user's portrait
93 |      * userId: user ID
94 |      * one column per dimension listed below
95 |      */
96 |     def getUserPortraitDimByUserId(userId: String): HashMap[String, String] = {
97 | 
98 |         val hbaseData: HashMap[String, String] = UserPortraitCommon.userPortraitTable.select(
99 |             userId,
100 |             UserPortraitCommon.DimColumnFamily,
101 |             Array( //"userDemand", // legacy demand dimension
102 |                 "memberDemand", // member demand form
103 |                 "likeInventorys", // liked (favorited) inventories
104 |                 "visitItemInventorys", // inventories visited with an agent
105 |                 "linkInventorys" // inventories the user was linked to
106 |             ))
107 | 
108 |         hbaseData
109 |     }
110 | 
111 |     /**
112 |      * Fetch the "needs" column family of a user's portrait
113 |      * userId: user ID
114 |      * contains the accumulated action needs
115 |      */
116 |     def getUserPortraitNeedsByUserId(userId: String): HashMap[String, String] = {
117 | 
118 |         val hbaseData: HashMap[String, String] = UserPortraitCommon.userPortraitTable.select(
119 |             userId,
120 |             UserPortraitCommon.NeedsColumnFamily, // column family
121 |             Array("actionNeeds" // columns to fetch
122 |             ))
123 | 
124 |         hbaseData
125 |     }
126 | 
127 |     /**
128 |      * Fetch the "modelState" (modeling state) column family of a user's portrait
129 |      */
130 |     def getUserPortraitModelStateByUserId(userId: String): HashMap[String, String] = {
131 | 
132 |         val hbaseData: HashMap[String, String] = UserPortraitCommon.userPortraitTable.select(
133 |             userId,
134 |             UserPortraitCommon.ModelStateColumnFamily, // column family
135 |             // columns to fetch
136 |             Array("visitItemInventorysRecord", // last processed accompanied-visit record
137 |                 "linkInventorysRecord", // last processed link record
138 |                 "memberDemandTime" // last processed demand-form change
139 |             ))
140 | 
141 |         hbaseData
142 |     }
143 | 
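    // A non-executed usage sketch tying the getters above to mapKeyDefaultValue
    // below ("42" is an illustrative userId). Cells absent from the HBase row can
    // surface here as missing keys or null values, which is exactly what
    // mapKeyDefaultValue normalizes:
    //
    //   val tags   = UserPortraitCommon.getUserPortraitTagsByUserId("42")
    //   val cityId = UserPortraitCommon.mapKeyDefaultValue(tags, UserPortraitCommon.cityTagCode, "0")
    //
144 |     /**
145 |      * Look up `key` in `map`, falling back to `default` when the key is missing or its value is null
146 |      */
147 |     def mapKeyDefaultValue(map: HashMap[String, String], key: String, default: String = ""): String = {
148 |         var rs: String = ""
149 |         if (map.contains(key)) {
150 |             if (map.getOrElse(key, null) == null) {
151 |                 rs = default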
152 | } else { 153 | rs = map.get(key).get.toString() 154 | } 155 | } else { 156 | rs = default 157 | } 158 | rs 159 | } 160 | 161 | /** 162 | * 解密 auth 163 | */ 164 | def Decrypt(data: String): String = { 165 | try { 166 | val key = "12345678123456xx" 167 | val iv = "12345678123456xx" 168 | 169 | val encrypted1 = new BASE64Decoder().decodeBuffer(data) 170 | 171 | val cipher = Cipher.getInstance("AES/CBC/NoPadding"); 172 | val keyspec = new SecretKeySpec(key.getBytes(), "AES"); 173 | val ivspec = new IvParameterSpec(iv.getBytes()); 174 | 175 | cipher.init(Cipher.DECRYPT_MODE, keyspec, ivspec); 176 | val original = cipher.doFinal(encrypted1) 177 | val originalString = new String(original) 178 | return originalString; 179 | 180 | } catch { 181 | case e: Exception => 182 | e.printStackTrace(); 183 | return ""; 184 | } 185 | } 186 | 187 | /** 188 | * Json 转换成 Map, 不可变的 Map 189 | */ 190 | def jsonStrToMap(jsonString: String): Map[String, Object] = { 191 | JsonUtil.smartJsonStrToMap(jsonString) 192 | } 193 | 194 | /** 195 | * Map 转换成 String, 不可变的 Map 196 | */ 197 | def mapToJsonStr(map: Map[String, Object]): String = { 198 | JsonUtil.smartMapToJsonStr(map) 199 | } 200 | 201 | /** 202 | * 两层 json 转换 -> 可变的 Map 203 | * jsonStr : 204 | * {} 205 | * 或者 206 | * { 207 | * "x": {"a":"1","b":"2"}, 208 | * "y": {"c":"3","d":"4"} 209 | * } 210 | * 211 | * return : 212 | * Map( 213 | * "x" => Map("a"=> 1, "b"=> 2), 214 | * "y" => Map("c"=> 3, "d"=> 4) 215 | * ) 216 | */ 217 | def jsonStrToMapByTwolayers(jsonStr: String): Map[String, Map[String, Object]] = { 218 | // 返回的是一个 Map[String, Object] , Object = Map[String, String] 219 | val baseMap = JsonUtil.playJsonToMap(jsonStr) 220 | // json 转换为可变 map 221 | val mapChildToVariable = baseMap.map { 222 | case (k, v) => 223 | val curK = k 224 | // 把 v 转换为 Map[String,String] 225 | val curV = v.asInstanceOf[scala.collection.immutable.Map[String, Object]] 226 | // 再把 map 转换为可变 Map 227 | val formatV = scala.collection.mutable.Map(curV.toSeq: _*) 228 | k -> formatV 229 | } 230 | // 转变 map 数据 231 | val mapVariable = collection.mutable.Map(mapChildToVariable.toSeq: _*).asInstanceOf[scala.collection.mutable.Map[String, Map[String, Object]]] 232 | mapVariable 233 | } 234 | 235 | /** 236 | * 两层 map 转换 -> 为 json 237 | */ 238 | def mapToJsonStrByTwolayers(mapData: Map[String, Map[String, Object]]): String = { 239 | val mapDataToMap = mapData.map { 240 | case (k, v) => k -> v.toMap // 转换成不可变 Map 241 | }.toMap 242 | // 转换为 json 字符串 243 | val mapToStr = JsonUtil.playMapToJson(mapDataToMap) 244 | mapToStr 245 | } 246 | } 247 | -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/user/portrait/UserPortraitFilter.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.user.portrait 2 | 3 | import scala.collection.mutable.Map 4 | import scala.collection.mutable.ListBuffer 5 | 6 | import com.angejia.dw.common.util.JsonUtil 7 | import com.angejia.dw.common.util.RegexUtil 8 | 9 | /** 10 | * 筛选房源逻辑处理 11 | * 根据 URL - 找出 城市,区域,板块,户型,价格区间(转换成价格段) 12 | * 1. 收集 url 出现的以上标签, 13 | * 比如: val cityIds = Set(1) 14 | * 2. 读取 Hbase 中的以上标签原始 Json 数据 15 | * 3. 
对找出的标签进行 筛选逻辑加分后写回 Hbase 16 | */ 17 | object UserPortraitFilter { 18 | 19 | val actionName = "Filter" 20 | 21 | // 当前处理的 userId 22 | var userId: String = new String() 23 | def setUserId(userId: String): Unit = { 24 | this.userId = userId 25 | } 26 | def getUserId(): String = { 27 | this.userId 28 | } 29 | 30 | // 请求的 URI 31 | var requestUri = new String() 32 | def setRequestUri(uri: String): Unit = { 33 | this.requestUri = uri 34 | } 35 | def getRequestUri(): String = { 36 | this.requestUri 37 | } 38 | 39 | /** 40 | * 本行为中用到的用户标签 41 | */ 42 | var userTags = Map[String, String]() 43 | 44 | // URI 搜索匹配正则 45 | val filterRegex = "/mobile/member/inventories/list[?](.*)" 46 | 47 | def run(): String = { 48 | // 推荐状态 49 | var reStatus = "no" 50 | 51 | /** 52 | * 解析出 Url 中的筛选字段数据 53 | */ 54 | val urlPars = RegexUtil.findStrData(this.filterRegex, this.getRequestUri()) 55 | if (urlPars.isEmpty()) return reStatus 56 | println(getUserId() + ": UserPortraitFilter ", this.getRequestUri()) 57 | 58 | this.userTags = Map[String, String]( 59 | UserPortraitCommon.cityTagCode -> new String(), 60 | UserPortraitCommon.districtTagCode -> new String(), 61 | UserPortraitCommon.blockTagCode -> new String(), 62 | UserPortraitCommon.communityTagCode -> new String(), 63 | UserPortraitCommon.bedroomsTagCode -> new String(), 64 | UserPortraitCommon.priceTagCode -> new String()) 65 | 66 | /** 67 | * 处理 url 中出现的标签 68 | */ 69 | urlPars.split("&").foreach { keyValueStr => 70 | 71 | val keyValue = keyValueStr.split("=") 72 | // 表示一对 key,value 73 | if (keyValue.size == 2) { 74 | 75 | val key: String = keyValue(0).toString() 76 | val value: String = keyValue(1).toString() 77 | 78 | key match { 79 | case "city_id" => { 80 | userTags.update(UserPortraitCommon.cityTagCode, value) 81 | } 82 | case "district_id" => { 83 | userTags.update(UserPortraitCommon.districtTagCode, value) 84 | } 85 | case "block_id" => { 86 | userTags.update(UserPortraitCommon.blockTagCode, value) 87 | } 88 | case "community_id" => { 89 | userTags.update(UserPortraitCommon.communityTagCode, value) 90 | } 91 | case "bedroom_id" => { 92 | // 通过户型 key 找到实际户型 93 | val bedrooms = UserPortraitCommon.bedroomsType.getOrElse(value, "0").toString() 94 | userTags.update(UserPortraitCommon.bedroomsTagCode, bedrooms) 95 | } 96 | case "price_id" => { 97 | // 通过价格段 key 找到实际的户型 98 | val priceTierId = UserPortraitCommon.priceTierType.getOrElse(value, "0").toString() 99 | userTags.update(UserPortraitCommon.priceTagCode, priceTierId) 100 | } 101 | case _ => "filter nothing" 102 | } 103 | 104 | } 105 | } 106 | 107 | this.userNeeds() 108 | 109 | this.score() 110 | 111 | reStatus = "yes" 112 | reStatus 113 | } 114 | 115 | /** 116 | * 一组标签进行合并 117 | */ 118 | def userNeeds(): Unit = { 119 | val uesrActions = ListBuffer[Map[String, String]]() 120 | uesrActions.append(userTags) 121 | 122 | // 对标签动作进行累加 123 | UserPortraitNeeds.setUserId(this.getUserId()) 124 | UserPortraitNeeds.userActionNeedsMergeAction(uesrActions) 125 | } 126 | 127 | /** 128 | * 标签打分 129 | */ 130 | def score(): Unit = { 131 | 132 | // 设置操作标签的用户 133 | UserPortraitTags.setUserId(this.getUserId()) 134 | 135 | // 城市标签打分 136 | val cityId = userTags.getOrElse(UserPortraitCommon.cityTagCode, "0") 137 | UserPortraitTags.cityTag(Set(cityId), Set(), UserPortraitCommon.cityTagConf.getOrElse("filterScore", "0")) 138 | 139 | // 区域标签打分 140 | val districtId = userTags.getOrElse(UserPortraitCommon.districtTagCode, "0") 141 | UserPortraitTags.districtTag(Set(districtId), Set(), 
--------------------------------------------------------------------------------
/src/main/scala/com/angejia/dw/recommend/user/portrait/UserPortraitLinkInventory.scala:
--------------------------------------------------------------------------------
package com.angejia.dw.recommend.user.portrait

import scala.collection.mutable.HashMap
import scala.collection.mutable.Map

import com.angejia.dw.common.util.DateUtil

/**
 * Inventories the user has been linked (connected) with.
 * 1. Fetch the inventories the user was linked with
 * 2. Score those inventories
 */
object UserPortraitLinkInventory {

    val actionName = "LinkInventory"

    def run(userId: String, date: String): String = {
        var userPortraitLinkInventory = new UserPortraitLinkInventory()
        userPortraitLinkInventory.setUserId(userId)
        userPortraitLinkInventory.setDwDate(date)
        var reStatus = userPortraitLinkInventory.run()

        userPortraitLinkInventory = null
        reStatus
    }
}
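// ---------------------------------------------------------------------------
// Editor's sketch (not part of the original source): a hypothetical call to the
// companion entry point above. The userId and partition date are made up, and a
// real run needs the Hive/HBase environment wired up behind UserPortraitCommon.
// ---------------------------------------------------------------------------
object UserPortraitLinkInventoryRunSketch {
    def main(args: Array[String]): Unit = {
        // "yes" when new linked inventories were merged and scored, otherwise "no"
        val status = UserPortraitLinkInventory.run("760198", "2016-11-23")
        println(status)
    }
}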
/**
 * Processing flow
 */
class UserPortraitLinkInventory {

    // HBase column name: dimension:linkInventorys
    val column: String = "linkInventorys"

    // userId currently being processed
    var userId: String = new String()
    def setUserId(userId: String): Unit = {
        if (userId.isEmpty()) {
            return
        }
        this.userId = userId
    }
    def getUserId(): String = {
        if (this.userId.isEmpty()) {
            return "0"
        }
        this.userId
    }

    // Partition date
    var dwDate: String = new String()
    def setDwDate(date: String): Unit = {
        this.dwDate = date
    }
    def getDwDate(): String = {
        this.dwDate
    }

    /**
     * Run the whole flow
     */
    def run(): String = {
        // Result status
        var reStatus = "no"

        // Today's date
        val offsetDate = DateUtil.getCalendarOffsetDateDay(0) // get today's date
        val todayYmd = DateUtil.DateToString(offsetDate, DateUtil.SIMPLE_Y_M_D_FORMAT) // format it

        // Today's modeling state
        val modelState = this.getModelStateByDate(todayYmd)
        if (modelState == true) return reStatus

        // Latest linked-inventory data
        val newLinkInventoryIds = this.getNewLinkInventoryIds()
        // No link data: record today's modeling state and exit
        if (newLinkInventoryIds.isEmpty) {
            this.saveModelStateByDate(todayYmd)
            return reStatus
        }

        // Previously stored linked-inventory data
        val linkInventoryIds = this.getLinkInventoryIds()

        // Detect newly added inventory ids
        val diffInventoryIds = this.diffInventoryIds(newLinkInventoryIds, linkInventoryIds)

        // Nothing changed: record today's modeling state and exit
        if (diffInventoryIds.isEmpty) {
            this.saveModelStateByDate(todayYmd)
            return reStatus
        }
        println(getUserId() + ": UserPortraitLinkInventory ", diffInventoryIds.mkString(","))

        // Merge the need tags
        this.userNeeds(diffInventoryIds)

        // Score
        this.score(diffInventoryIds)

        // Write the latest data back to HBase
        this.updateInventorysToHbase(newLinkInventoryIds)

        // Everything succeeded: save today's modeling state
        this.saveModelStateByDate(todayYmd)

        // Return
        reStatus = "yes"
        reStatus
    }

    /**
     * Fetch the user's latest link data from Hive
     */
    def getNewLinkInventoryIds(): Map[String, Object] = {
        var rs: Map[String, Object] = Map[String, Object]()

        // Read the linked inventories of this user
        val querySql = "SELECT link_invs_a FROM dw_db.dw_user_sd WHERE user_id = '" + this.getUserId() + "' AND p_dt = '" + this.getDwDate() + "' limit 1"
        //println(querySql)
        val userSdData = UserPortraitCommon.sparkHiveClient.select(querySql, "link_invs_a")

        if (!userSdData.isEmpty()) {
            // All linked-inventory ids
            val linkInventoryIvns: String = userSdData.get(0).get("link_invs_a")
            if (linkInventoryIvns != null) {
                rs.put(column, linkInventoryIvns)
            }
        }
        rs
    }

    /**
     * Fetch the user's previously stored link data from HBase
     */
    def getLinkInventoryIds(): Map[String, Object] = {

        var rs: Map[String, Object] = Map[String, Object]()

        // Fetch the user's dimension data
        val linkInventorys: HashMap[String, String] = UserPortraitCommon.getUserPortraitDimByUserId(this.getUserId())

        // Get the linked-inventory dimension data as a JSON string
        val dimLinkInventorysJsonStr = UserPortraitCommon.mapKeyDefaultValue(linkInventorys, column, "{}")

        // Convert the JSON string into a Map
        rs = UserPortraitCommon.jsonStrToMap(dimLinkInventorysJsonStr)

        rs
    }

    // Set difference of inventory ids
    def diffInventoryIds(newInventorys: Map[String, Object], oldInventorys: Map[String, Object]): Array[String] = {
        var newInventoryIds: Set[String] = Set[String]()
        var oldInventoryIds: Set[String] = Set[String]()

        if (!newInventorys.isEmpty) {
            newInventoryIds = newInventorys.getOrElse(column, "").toString().split(",").toSet
        }
        if (!oldInventorys.isEmpty) {
            oldInventoryIds = oldInventorys.getOrElse(column, "").toString().split(",").toSet
        }

        // Difference
        val diffInventoryIds = newInventoryIds -- oldInventoryIds

        diffInventoryIds.toArray
    }

    /**
     * Merge a set of tags
     */
    def userNeeds(inventoryIds: Array[String]): Unit = {
        // Merge
        UserPortraitNeeds.setUserId(this.getUserId())
        UserPortraitNeeds.userNeedsMergeByInventoryIds(inventoryIds)
    }

    /**
     * Score the inventory attributes by inventory id
     */
    def score(inventoryIds: Array[String]): Unit = {

        // For each inventory id, score the user's tags
        inventoryIds.foreach { inventoryId =>
            UserPortraitTags.setUserId(this.getUserId())
            // Score
            // val score = UserPortraitCommon.cityTagConf.getOrElse("linkInventoryScore", "0").toString()
            // UserPortraitTags.tagScoreByInventoryId(inventoryId, score)
            UserPortraitTags.tagsScoreByInventoryAndAction(inventoryId, UserPortraitLinkInventory.actionName)
        }

    }

    /**
     * Serialize the data to JSON and save it into
     * dimension:linkInventorys of the user-portrait table.
     */
    def updateInventorysToHbase(inventorys: Map[String, Object]) = {
        if (!inventorys.isEmpty) {
            // Map -> JSON string
            val toString = UserPortraitCommon.mapToJsonStr(inventorys)
            UserPortraitCommon.userPortraitTable.update(this.getUserId(), UserPortraitCommon.DimColumnFamily, column, toString)
        }
    }

    /**
     * Modeling state for a given date
     * dateYmd: date, e.g. 2016-04-10
     * return
     *   true : already modeled
     *   false : not yet modeled
     */
    def getModelStateByDate(dateYmd: String): Boolean = {
        var status = false
        UserPortraitrModelState.setUserId(this.getUserId())
        UserPortraitrModelState.setLinkInventorysRecord()
        val linkInventorysRecord = UserPortraitrModelState.getLinkInventorysRecord() // whether this date was already processed
        if (linkInventorysRecord.contains(dateYmd)) {
            status = true
        }
        status
    }

    /**
     * Save the modeling state for this attribute
     * dateYmd: date, e.g. 2016-04-10
     */
    def saveModelStateByDate(dateYmd: String): Unit = {
        // Write the modeling state to HBase
        UserPortraitrModelState.setUserId(this.getUserId())
        var newLinkInventorysRecord: Map[String, Map[String, String]] = Map[String, Map[String, String]]()
        newLinkInventorysRecord.put(dateYmd, Map("status" -> "1")) // Map[date -> Map[status -> 1]]
        UserPortraitrModelState.saveLinkInventorysRecord(newLinkInventorysRecord) // update HBase
    }

}
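// ---------------------------------------------------------------------------
// Editor's sketch (not part of the original source): the incremental-update core
// of diffInventoryIds above, reduced to a self-contained example; ids are made up.
// ---------------------------------------------------------------------------
object DiffInventoryIdsSketch {
    def main(args: Array[String]): Unit = {
        val newIds = "101,102,103".split(",").toSet
        val oldIds = "101,102".split(",").toSet
        // Only ids missing from the stored snapshot are merged and scored again
        println((newIds -- oldIds).mkString(",")) // 103
    }
}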
--------------------------------------------------------------------------------
/src/main/scala/com/angejia/dw/recommend/user/portrait/UserPortraitTagConf.scala:
--------------------------------------------------------------------------------
package com.angejia.dw.recommend.user.portrait

import scala.collection.mutable.Map

/**
 * User portrait: core tag configuration
 */

object UserPortraitTagConf {

    // City tag
    val CITY_TAG = Map(
        "TagCode" -> "city",
        "TagName" -> "城市",

        // Score from the user's home-selection (demand) list
        "userDemandScore" -> "0",

        // Score granted by an inventory filter request
        "filterScore" -> "0",

        // Score for browsing an inventory
        "browseScore" -> "0",

        // Score for favoriting an inventory
        "likeInventoryScore" -> "0",

        // Score for an accompanied viewing of an inventory
        "visitItemInventoryScore" -> "0",

        // Score for a link (contact) event
        "linkInventoryScore" -> "0",

        // Score decay percentage
        "attenuationPercentage" -> "0.1")

    // District tag
    val DISTRICT_TAG = Map(
        "TagCode" -> "district",
        "TagName" -> "区域",

        // Score from the user's home-selection (demand) list
        "userDemandScore" -> "0",

        // Score granted by an inventory filter request
        "filterScore" -> "0",

        // Score for browsing an inventory
        "browseScore" -> "0",

        // Score for favoriting an inventory
        "likeInventoryScore" -> "0",

        // Score for an accompanied viewing of an inventory
        "visitItemInventoryScore" -> "0",

        // Score for a link (contact) event
        "linkInventoryScore" -> "0",

        // Score decay percentage
        "attenuationPercentage" -> "0.1")

    // Block tag
    val BLOCK_TAG = Map(
        "TagCode" -> "block",
        "TagName" -> "版块",

        // Score from the user's home-selection (demand) list
        "userDemandScore" -> "20",

        // Score granted by an inventory filter request
        //"filterScore" -> "2",
        "filterScore" -> "10",

        // Score for browsing an inventory
        "browseScore" -> "1",

        // Score for favoriting an inventory
        "likeInventoryScore" -> "5",

        // Score for an accompanied viewing of an inventory
        "visitItemInventoryScore" -> "50",

        // Score for a link (contact) event
        "linkInventoryScore" -> "30",

        // Score decay percentage
        "attenuationPercentage" -> "0.1")

    // Community tag
    val COMMUNITY_TAG = Map(
        "TagCode" -> "community",
        "TagName" -> "小区",

        // Score from the user's home-selection (demand) list
        "userDemandScore" -> "20",

        // Score granted by an inventory filter request
        //"filterScore" -> "10",
        "filterScore" -> "10", // 1.0

        // Score for browsing an inventory
        //"browseScore" -> "2",
        "browseScore" -> "5", // 1.0

        // Score for favoriting an inventory
        "likeInventoryScore" -> "5",

        // Score for an accompanied viewing of an inventory
        "visitItemInventoryScore" -> "50",

        // Score for a link (contact) event
        "linkInventoryScore" -> "30",

        // Score decay percentage
        "attenuationPercentage" -> "0.1")

    // Bedrooms tag
    val BEDROOMS_TAG = Map(
        "TagCode" -> "bedrooms",
        "TagName" -> "户型",

        // Score from the user's home-selection (demand) list
        "userDemandScore" -> "20",

        // Score granted by an inventory filter request
        //"filterScore" -> "2",
        "filterScore" -> "10", // 1.0

        // Score for browsing an inventory
        //"browseScore" -> "2",
        "browseScore" -> "5", // 1.0

        // Score for favoriting an inventory
        "likeInventoryScore" -> "5",

        // Score for an accompanied viewing of an inventory
        "visitItemInventoryScore" -> "50",

        // Score for a link (contact) event
        "linkInventoryScore" -> "30",

        // Score decay percentage
        "attenuationPercentage" -> "0.1",

        // Bedroom-key mapping (used by the filter list)
        "bedroomsType" -> Map[String, String](
            "2" -> "1",
            "3" -> "2",
            "4" -> "3",
            "5" -> "4",
            "6" -> "5",
            "7" -> "6"))

    // Price-tier tag
    val PRICE_TAG = Map(
        "TagCode" -> "price",
        "TagName" -> "价格段",

        // Score from the user's home-selection (demand) list
        "userDemandScore" -> "20",

        // Score granted by an inventory filter request
        //"filterScore" -> "2",
        "filterScore" -> "10", // 1.0

        // Score for browsing an inventory
        //"browseScore" -> "2",
        "browseScore" -> "5", // 1.0

        // Score for favoriting an inventory
        "likeInventoryScore" -> "5",

        // Score for an accompanied viewing of an inventory
        "visitItemInventoryScore" -> "50",

        // Score for a link (contact) event
        "linkInventoryScore" -> "30",

        // Score decay percentage
        "attenuationPercentage" -> "0.1",

        // Price-tier key mapping
        "PriceTierType" -> Map[String, String](
            "2" -> "1",
            "3" -> "2",
            "4" -> "3",
            "5" -> "4",
            "6" -> "5",
            "7" -> "6",
            "8" -> "7",
            "9" -> "8",
            "10" -> "9"),

        // Price-tier buckets (price range -> tier id)
        "PriceTier" -> Map[String, String](
            "0-1500000" -> "1",
            "1500000-2000000" -> "2",
            "2000000-2500000" -> "3",
            "2500000-3000000" -> "4",
            "3000000-4000000" -> "5",
            "4000000-5000000" -> "6",
            "5000000-7000000" -> "7",
            "7000000-10000000" -> "8",
            "10000000-1000000000" -> "9"))
}
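// ---------------------------------------------------------------------------
// Editor's sketch (not part of the original source): illustrative reads against
// the configuration above, using the same getOrElse-with-"0" pattern the scoring
// code uses. The sample listing price is made up.
// ---------------------------------------------------------------------------
object UserPortraitTagConfLookupSketch {
    def main(args: Array[String]): Unit = {
        // Filter-action score for the block tag
        println(UserPortraitTagConf.BLOCK_TAG.getOrElse("filterScore", "0")) // 10
        // A 2,200,000 listing falls into the "2000000-2500000" bucket -> tier "3"
        val priceTiers = UserPortraitTagConf.PRICE_TAG("PriceTier").asInstanceOf[Map[String, String]]
        println(priceTiers.getOrElse("2000000-2500000", "0")) // 3
    }
}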
"attenuationPercentage" -> "0.1") 117 | 118 | // 户型标签 119 | val BEDROOMS_TAG = Map( 120 | "TagCode" -> "bedrooms", 121 | "TagName" -> "户型", 122 | 123 | // 用户选房单分数 124 | "userDemandScore" -> "20", 125 | 126 | // 房源筛选给的分数 127 | //"filterScore" -> "2", 128 | "filterScore" -> "10", // 1.0 129 | 130 | // 浏览房源分数 131 | //"browseScore" -> "2", 132 | "browseScore" -> "5", // 1.0 133 | 134 | // 收藏房源分数 135 | "likeInventoryScore" -> "5", 136 | 137 | // 发生带看房源发生的分数 138 | "visitItemInventoryScore" -> "50", 139 | 140 | // 发生连接的分数 141 | "linkInventoryScore" -> "30", 142 | 143 | // 分数衰减百分比 144 | "attenuationPercentage" -> "0.1", 145 | 146 | // 户型映射(筛选列表时) 147 | "bedroomsType" -> Map[String, String]( 148 | "2" -> "1", 149 | "3" -> "2", 150 | "4" -> "3", 151 | "5" -> "4", 152 | "6" -> "5", 153 | "7" -> "6")) 154 | 155 | // 价格段标签 156 | val PRICE_TAG = Map( 157 | "TagCode" -> "price", 158 | "TagName" -> "价格段", 159 | 160 | // 用户选房单分数 161 | "userDemandScore" -> "20", 162 | 163 | // 房源筛选给的分数 164 | //"filterScore" -> "2", 165 | "filterScore" -> "10", // 1.0 166 | 167 | // 浏览房源分数 168 | //"browseScore" -> "2", 169 | "browseScore" -> "5", // 1.0 170 | 171 | // 收藏房源分数 172 | "likeInventoryScore" -> "5", 173 | 174 | // 发生带看房源发生的分数 175 | "visitItemInventoryScore" -> "50", 176 | 177 | // 发生连接的分数 178 | "linkInventoryScore" -> "30", 179 | 180 | // 分数衰减百分比 181 | "attenuationPercentage" -> "0.1", 182 | 183 | // 价格段映射 184 | "PriceTierType" -> Map[String, String]( 185 | "2" -> "1", 186 | "3" -> "2", 187 | "4" -> "3", 188 | "5" -> "4", 189 | "6" -> "5", 190 | "7" -> "6", 191 | "8" -> "7", 192 | "9" -> "8", 193 | "10" -> "9"), 194 | 195 | // 价格段数据 196 | "PriceTier" -> Map[String, String]( 197 | "0-1500000" -> "1", 198 | "1500000-2000000" -> "2", 199 | "2000000-2500000" -> "3", 200 | "2500000-3000000" -> "4", 201 | "3000000-4000000" -> "5", 202 | "4000000-5000000" -> "6", 203 | "5000000-7000000" -> "7", 204 | "7000000-10000000" -> "8", 205 | "10000000-1000000000" -> "9")) 206 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/user/portrait/UserPortraitVisitItem.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.user.portrait 2 | 3 | import scala.collection.mutable.Map 4 | import scala.collection.mutable.HashMap 5 | import scala.collection.mutable.ArrayBuffer 6 | import com.angejia.dw.common.util.DateUtil 7 | 8 | /** 9 | * 用户发生过带看的房源 10 | * 1. 获取用户被带看过得房源 11 | * 2. 
/**
 * Processing flow
 */
class UserPortraitVisitItem {

    // HBase column name: dimension:visitItemInventorys
    val column: String = "visitItemInventorys"

    // userId currently being processed
    var userId: String = new String()
    def setUserId(userId: String): Unit = {
        if (userId.isEmpty()) {
            return
        }
        this.userId = userId
    }
    def getUserId(): String = {
        if (this.userId.isEmpty()) {
            return "0"
        }
        this.userId
    }

    // Partition date
    var dwDate: String = new String()
    def setDwDate(date: String): Unit = {
        this.dwDate = date
    }
    def getDwDate(): String = {
        this.dwDate
    }

    /**
     * Run the whole flow
     */
    def run(): String = {
        // Result status
        var reStatus = "no"

        // Today's date
        val offsetDate = DateUtil.getCalendarOffsetDateDay(0) // get today's date
        val todayYmd = DateUtil.DateToString(offsetDate, DateUtil.SIMPLE_Y_M_D_FORMAT) // format it

        // Today's modeling state
        val modelState = this.getModelStateByDate(todayYmd)
        if (modelState == true) return reStatus

        // Latest accompanied-viewing data
        val newVisitItemIds = this.getNewVisitItemIds()
        // No viewing data: record today's modeling state and exit
        if (newVisitItemIds.isEmpty) {
            this.saveModelStateByDate(todayYmd)
            return reStatus
        }

        // Previously stored accompanied-viewing data
        val visitItemIds = this.getVisitItemIds()

        // Detect newly added inventory ids
        val diffInventoryIds = this.diffInventoryIds(newVisitItemIds, visitItemIds)

        // Nothing changed: record today's modeling state and exit
        if (diffInventoryIds.isEmpty) {
            this.saveModelStateByDate(todayYmd)
            return reStatus
        }
        println(getUserId() + ": UserPortraitVisitItem ", diffInventoryIds.mkString(","))

        // Merge the need tags
        this.userNeeds(diffInventoryIds)

        // Score
        this.score(diffInventoryIds)

        // Write the latest data back to HBase
        this.updateInventorysToHbase(newVisitItemIds)

        // Everything succeeded: save today's modeling state
        this.saveModelStateByDate(todayYmd)

        // Return
        reStatus = "yes"
        reStatus
    }

    /**
     * Fetch the user's latest accompanied-viewing data from Hive
     */
    def getNewVisitItemIds(): Map[String, Object] = {
        var rs: Map[String, Object] = Map[String, Object]()

        // Read the accompanied-viewing inventories of this user
        val querySql = "SELECT visit_item_invs_a FROM dw_user_sd WHERE user_id = '" + this.getUserId() + "' AND p_dt = '" + this.getDwDate() + "' limit 1"
        //println(querySql)
        val userSdData = UserPortraitCommon.sparkHiveClient.select(querySql, "visit_item_invs_a")

        if (!userSdData.isEmpty()) {
            // All accompanied-viewing inventory ids
            val visitItemIvns: String = userSdData.get(0).get("visit_item_invs_a")
            if (visitItemIvns != null) {
                rs.put(column, visitItemIvns)
            }
        }
        rs
    }

    /**
     * Fetch the user's previously stored accompanied-viewing data from HBase
     */
    def getVisitItemIds(): Map[String, Object] = {

        var rs: Map[String, Object] = Map[String, Object]()

        // Fetch the user's dimension data
        val visitItemInventorys: HashMap[String, String] = UserPortraitCommon.getUserPortraitDimByUserId(this.getUserId())

        // Get the accompanied-viewing dimension data as a JSON string
        val dimVisitItemInventorysJsonStr = UserPortraitCommon.mapKeyDefaultValue(visitItemInventorys, column, "{}")

        // Convert the JSON string into a Map
        rs = UserPortraitCommon.jsonStrToMap(dimVisitItemInventorysJsonStr)

        rs
    }

    // Set difference of inventory ids
    def diffInventoryIds(newInventorys: Map[String, Object], oldInventorys: Map[String, Object]): Array[String] = {
        var newInventoryIds: Set[String] = Set[String]()
        var oldInventoryIds: Set[String] = Set[String]()

        if (!newInventorys.isEmpty) {
            newInventoryIds = newInventorys.getOrElse(column, "").toString().split(",").toSet
        }
        if (!oldInventorys.isEmpty) {
            oldInventoryIds = oldInventorys.getOrElse(column, "").toString().split(",").toSet
        }

        // Difference
        val diffInventoryIds = newInventoryIds -- oldInventoryIds

        diffInventoryIds.toArray
    }

    /**
     * Merge a set of tags
     */
    def userNeeds(inventoryIds: Array[String]): Unit = {
        // Merge
        UserPortraitNeeds.setUserId(this.getUserId())
        UserPortraitNeeds.userNeedsMergeByInventoryIds(inventoryIds)
    }

    /**
     * Score the inventory attributes by inventory id
     */
    def score(inventoryIds: Array[String]): Unit = {
        // For each inventory id, score the user's tags
        inventoryIds.foreach { inventoryId =>
            UserPortraitTags.setUserId(this.getUserId())
            // Score
            // val score = UserPortraitCommon.cityTagConf.getOrElse("visitItemInventoryScore", "0").toString()
            // UserPortraitTags.tagScoreByInventoryId(inventoryId, score)
            UserPortraitTags.tagsScoreByInventoryAndAction(inventoryId, UserPortraitVisitItem.actionName)
        }
    }

    /**
     * Serialize the data to JSON and save it into
     * dimension:visitItemInventorys of the user-portrait table.
     */
    def updateInventorysToHbase(inventorys: Map[String, Object]) = {
        if (!inventorys.isEmpty) {
            // Map -> JSON string
            val toString = UserPortraitCommon.mapToJsonStr(inventorys)
            UserPortraitCommon.userPortraitTable.update(this.getUserId(), UserPortraitCommon.DimColumnFamily, column, toString)
        }
    }

    /**
     * Modeling state for a given date
     * dateYmd: date, e.g. 2016-04-10
     * return
     *   true : already modeled
     *   false : not yet modeled
     */
    def getModelStateByDate(dateYmd: String): Boolean = {
        var status = false
        UserPortraitrModelState.setUserId(this.getUserId())
        UserPortraitrModelState.setVisitItemInventorysRecord()
        val visitItemInventorysRecord = UserPortraitrModelState.getVisitItemInventorysRecord() // whether this date was already modeled
        if (visitItemInventorysRecord.contains(dateYmd)) {
            status = true
        }
        status
    }

    /**
     * Save the modeling state for this attribute
     * dateYmd: date, e.g. 2016-04-10
     */
    def saveModelStateByDate(dateYmd: String): Unit = {
        // Write the modeling state to HBase
        UserPortraitrModelState.setUserId(this.getUserId())
        var newVisitItemInventorysRecord: Map[String, Map[String, String]] = Map[String, Map[String, String]]()
        newVisitItemInventorysRecord.put(dateYmd, Map("status" -> "1")) // Map[date -> Map[status -> 1]]
        UserPortraitrModelState.saveVisitItemInventorysRecord(newVisitItemInventorysRecord) // update HBase
    }

}
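// ---------------------------------------------------------------------------
// Editor's sketch (not part of the original source): the once-per-day guard that
// run() above applies via getModelStateByDate/saveModelStateByDate, condensed to
// a self-contained example; the HBase-backed record is replaced by an in-memory set.
// ---------------------------------------------------------------------------
object DailyModelStateGuardSketch {
    def main(args: Array[String]): Unit = {
        val modeledDates = scala.collection.mutable.Set[String]()
        def runOnce(today: String): String = {
            if (modeledDates.contains(today)) return "no" // already modeled today
            // ... diff, merge and score new inventories here ...
            modeledDates += today // the saveModelStateByDate step
            "yes"
        }
        println(runOnce("2016-04-10")) // yes
        println(runOnce("2016-04-10")) // no: a second run the same day is skipped
    }
}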
--------------------------------------------------------------------------------
/src/main/scala/com/angejia/dw/recommend/user/portrait/UserPortraitrModelState.scala:
--------------------------------------------------------------------------------
package com.angejia.dw.recommend.user.portrait

import scala.collection.mutable.Map
import scala.collection.mutable.HashMap

import com.angejia.dw.common.util.JsonUtil

/**
 * User portrait: modeling state
 */
object UserPortraitrModelState {

    // userId currently being processed
    var userId: String = new String()
    def setUserId(userId: String): Unit = {
        if (userId.isEmpty()) {
            return
        }
        this.userId = userId
    }
    def getUserId(): String = {
        if (this.userId.isEmpty()) {
            return "0"
        }
        this.userId
    }

    /**
     * All column data under the user's modelState column family
     */
    var modelState: HashMap[String, String] = HashMap[String, String]()
    def setModelState(): Unit = {
        this.modelState = UserPortraitCommon.getUserPortraitModelStateByUserId(this.getUserId())
    }
    def getModelState(): HashMap[String, String] = {
        this.modelState
    }

    /**
     * visitItemInventorysRecord: the user's accompanied-viewing modeling record.
     * Initialized from the modelState:visitItemInventorysRecord column.
     * return
     *   Map[String, Map[String, String]]
     */
    var visitItemInventorysRecord: Map[String, Map[String, String]] = Map[String, Map[String, String]]()

    def setVisitItemInventorysRecord(): Unit = {
        this.setModelState()

        // Read the existing record from HBase as a JSON string
        val jsonString = UserPortraitCommon.mapKeyDefaultValue(this.getModelState(), "visitItemInventorysRecord", "{}")

        // Convert it into a mutable Map
        this.visitItemInventorysRecord = this.toolJsonStringToChangeMap(jsonString)
    }

    def getVisitItemInventorysRecord(): Map[String, Map[String, String]] = {
        this.visitItemInventorysRecord
    }

    /**
     * Save visitItemInventorysRecord back to HBase
     */
    def saveVisitItemInventorysRecord(visitItemInventorysRecord: Map[String, Map[String, String]]): String = {
        // Update the data
        this.toolSaveMapDataToHbaseColumn("visitItemInventorysRecord", visitItemInventorysRecord)
    }

    /**
     * linkInventorysRecord: the user's linked-inventory modeling record.
     * Initialized from the modelState:linkInventorysRecord column.
     * return
     *   Map[String, Map[String, String]]
     */
    var linkInventorysRecord: Map[String, Map[String, String]] = Map[String, Map[String, String]]()

    def setLinkInventorysRecord(): Unit = {
        this.setModelState()

        // Read the existing record from HBase as a JSON string
        val jsonString = UserPortraitCommon.mapKeyDefaultValue(this.getModelState(), "linkInventorysRecord", "{}")

        // Convert it into a mutable Map
        this.linkInventorysRecord = this.toolJsonStringToChangeMap(jsonString)
    }

    def getLinkInventorysRecord(): Map[String, Map[String, String]] = {
        this.linkInventorysRecord
    }

    /**
     * Save linkInventorysRecord back to HBase
     */
    def saveLinkInventorysRecord(linkInventorysRecord: Map[String, Map[String, String]]): String = {
        // Update the data
        this.toolSaveMapDataToHbaseColumn("linkInventorysRecord", linkInventorysRecord)
    }
    /**
     * Convert a JSON string into a mutable Map
     * return
     *   Map[String, Map[String, String]]
     */
    def toolJsonStringToChangeMap(jsonStringInput: String): scala.collection.mutable.Map[String, scala.collection.mutable.Map[String, String]] = {
        //import scala.collection.mutable.Map
        var jsonString = jsonStringInput
        if (jsonString.isEmpty()) {
            jsonString = "{}"
        }

        // Parse the string into an (immutable) Map
        val mapData = JsonUtil.playJsonToMap(jsonString) // returns a Map[String, Object]

        /**
         * This conversion took a surprisingly long time to get right.
         */
        // Rebuild the JSON map as a mutable map
        val jsonToMap = mapData.map {
            case (k, v) =>
                // Cast the value v to an immutable Map[String, String]
                val curV = v.asInstanceOf[scala.collection.immutable.Map[String, String]]
                // Then rebuild it as a mutable Map
                val formatV = scala.collection.mutable.Map(curV.toSeq: _*)
                k -> formatV
        }
        // Rebuild the outer map as a mutable Map as well
        val rs = collection.mutable.Map(jsonToMap.toSeq: _*).asInstanceOf[scala.collection.mutable.Map[String, Map[String, String]]]

        rs
    }

    /**
     * Save a map into an HBase column
     * column : a column under the UserPortraitCommon.ModelStateColumnFamily column family
     * mapData : the map data
     */
    def toolSaveMapDataToHbaseColumn(column: String, mapData: Map[String, Map[String, String]]): String = {
        val map = mapData.map {
            case (k, v) => k -> v.toMap // convert to an immutable Map
        }.toMap
        // Serialize to a JSON string
        val jsonString = JsonUtil.playMapToJson(map)
        // Update the data
        UserPortraitCommon.userPortraitTable.update(this.getUserId(), UserPortraitCommon.ModelStateColumnFamily, column, jsonString)
    }

}
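// ---------------------------------------------------------------------------
// Editor's sketch (not part of the original source): the conversion inside
// toolJsonStringToChangeMap above, reduced to its core: turning an immutable
// two-level map into a mutable one, level by level. The sample data is made up.
// ---------------------------------------------------------------------------
object NestedMapToMutableSketch {
    import scala.collection.{ immutable, mutable }
    def main(args: Array[String]): Unit = {
        val frozen: immutable.Map[String, immutable.Map[String, String]] =
            immutable.Map("2016-04-10" -> immutable.Map("status" -> "1"))
        // Rebuild each inner map, then the outer one, as mutable
        val thawed: mutable.Map[String, mutable.Map[String, String]] =
            mutable.Map(frozen.toSeq.map { case (k, v) => k -> mutable.Map(v.toSeq: _*) }: _*)
        thawed("2016-04-10")("status") = "0" // now updatable in place
        println(thawed)
    }
}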
--------------------------------------------------------------------------------
/src/test/scala/com/angejia/dw/recommend/inventory/portrait/InventoryPortraitCommonTest.scala:
--------------------------------------------------------------------------------
package com.angejia.dw.recommend.inventory.portrait

import collection.mutable.Stack
import collection.mutable.HashMap
import org.scalatest._
import com.angejia.dw.common.util.mysql.MysqlClient
import com.angejia.dw.recommend.Conf

class InventoryPortraitCommonTest extends FlatSpec with Matchers {
    Conf.setEnv("dev")
    val productMysqDBInfo = Conf.getProductMysqDBInfo()
    InventoryPortraitCommon.mysqlClient = new MysqlClient(
        productMysqDBInfo.get("host").get,
        productMysqDBInfo.get("account").get,
        productMysqDBInfo.get("password").get,
        productMysqDBInfo.get("defaultDB").get)

    "getUserTagsInventoryMappingByInventoryId" should "work" in {
        val res = InventoryPortraitCommon.getUserTagsInventoryMappingByInventoryId("1")

        res should contain key ("price")
        res should contain key ("district")
        res should contain key ("city")
        res should contain key ("block")
        res should contain key ("bedrooms")
        res should contain key ("community")
    }
}
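// ---------------------------------------------------------------------------
// Editor's sketch (not part of the original source): the ScalaTest matcher
// idioms used by the specs in this project, shown on a standalone map so the
// assertions can be read without the MySQL fixture above.
// ---------------------------------------------------------------------------
class MapMatchersSketch extends FlatSpec with Matchers {
    "a parsed result map" should "expose its fields" in {
        val res = Map("cityId" -> "2")
        res should contain key ("cityId")
        res.get("cityId") should equal(Some("2"))
    }
}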
"ExhuBkt16RSHa0/0C+y9x4sAxoD7EKhTTIoMs76vHmF4082Rl0cMQrHG/n4sx5Swb2xKFoKuT0q71Fh1+vcW/wo7KxexQroTAfJbSze3I5pDxw6TdZ/8HoE2wwmq0Zfcoevuqh00RFkCTG88w2CddEjoOiL6xJ1PI0GK4i4D5MYk3WvuFyoYcBZ+Nk5i4yVhrD8GCRVu8uRiXCoQAyX8mahVxbtae3MEyOZD4E2goMoiul9tEcXyK0XYB+aJhEZVL6jdCR5kg9bihbVyulmWOIvjLnqkZCjBcEFrpv9kcG9zNTY9MUTUyJxPg2MuvzeZZ2FG5Yv8GKIhOSSNl1pKY/v35cpXBzldw55381DmC1s=\t" 15 | + "app=a-angejia;av=4.8.1;ccid=2;gcid=;ch=B14;lng=;lat=;net=WIFI;p=android;pm=Android-OPPO R9s;osv=6.0.1;dvid=86253103615747802:00:00:00:00:00;uid=760198\t" 16 | + "-\t-") 17 | val res = UserPortrait.formatLogData(log) 18 | 19 | res should contain key ("logRequestUri") 20 | res.get("logRequestUri") should equal(Some("/mobile/member/inventories/1/2")) 21 | 22 | res should contain key ("logHost") 23 | res.get("logHost") should equal(Some("api.angejia.com")) 24 | 25 | res should contain key ("logTime") 26 | res.get("logTime") should equal(Some("[2016-11-23T16:59:15+08:00]")) 27 | 28 | res should contain key ("userId") 29 | //res.get("userId") should equal(Some("728924")) 30 | 31 | res should contain key ("appAgent") 32 | res.get("appAgent") should equal(Some("app=a-angejia;av=4.8.1;ccid=2;gcid=;ch=B14;lng=;lat=;net=WIFI;p=android;pm=Android-OPPO R9s;osv=6.0.1;dvid=86253103615747802:00:00:00:00:00;uid=760198")) 33 | 34 | res should contain key ("cityId") 35 | res.get("cityId") should equal(Some("2")) 36 | 37 | res should contain key ("auth") 38 | res.get("auth") should equal(Some("ExhuBkt16RSHa0/0C+y9x4sAxoD7EKhTTIoMs76vHmF4082Rl0cMQrHG/n4sx5Swb2xKFoKuT0q71Fh1+vcW/wo7KxexQroTAfJbSze3I5pDxw6TdZ/8HoE2wwmq0Zfcoevuqh00RFkCTG88w2CddEjoOiL6xJ1PI0GK4i4D5MYk3WvuFyoYcBZ+Nk5i4yVhrD8GCRVu8uRiXCoQAyX8mahVxbtae3MEyOZD4E2goMoiul9tEcXyK0XYB+aJhEZVL6jdCR5kg9bihbVyulmWOIvjLnqkZCjBcEFrpv9kcG9zNTY9MUTUyJxPg2MuvzeZZ2FG5Yv8GKIhOSSNl1pKY/v35cpXBzldw55381DmC1s=")) 39 | 40 | res should contain key ("logType") 41 | res.get("logType") should equal(Some("accessLog")) 42 | } 43 | 44 | "old log" should "be parsed" in { 45 | val log =("0.064\t0.064\t153.99.123.51\t1529\t127.0.0.1:9000\t" 46 | +"[2016-11-01T00:00:00+08:00]\tapi.angejia.com\t" 47 | +"GET /mobile/member/inventories/1/2 HTTP/1.1\t" 48 | +"200\t2195\t-\tAngejia/4.6.2 CFNetwork/808.0.2 Darwin/16.0.0\t7.42\t153.99.123.51\t" 49 | +"kCp+SLcl85sKrn/1jntFnhRXZlG79zMr6wEAy7Vkd9TyJ46da3IxyJPRLdd/ngMk/KqLmF8p26/izeoN7/Pgo7NB5VO21FyaHKrN370snfqWOv5CYb1x7fFJNJQYwwX54ketZAJ1mMSWj7LzbhSj9Kedl56dUi/9OL64djEld2iecKGWtNk2Rc4I2FWjoLiavAsJh/6RCOJ84tcc7KLB+IeCjz/uW3JlrZoJO3qvDfMiCv28y6geQjRNVljmBo3P\t" 50 | +"app=i-angejia;av=4.6;ccid=1;gcid=1;ch=A01;lng=0.000000;lat=0.000000;ip=192.168.1.100;mac=None;net=WIFI;p=iOS;pm=iPhone9,1;osv=10.0.1;dvid=09DF78A6-935D-46E6-9BB5-201610241142;uid=728924;idfa=D2CAED51-9235-4B51-9CC6-7ECC3AE7DD91\t" 51 | +"-") 52 | val res = UserPortrait.formatLogData(log) 53 | 54 | res should contain key ("logRequestUri") 55 | res.get("logRequestUri") should equal(Some("/mobile/member/inventories/1/2")) 56 | 57 | res should contain key ("logHost") 58 | res.get("logHost") should equal(Some("api.angejia.com")) 59 | 60 | res should contain key ("logTime") 61 | res.get("logTime") should equal(Some("[2016-11-01T00:00:00+08:00]")) 62 | 63 | res should contain key ("userId") 64 | res.get("userId") should equal(Some("728924")) 65 | 66 | res should contain key ("appAgent") 67 | res.get("appAgent") should 
equal(Some("app=i-angejia;av=4.6;ccid=1;gcid=1;ch=A01;lng=0.000000;lat=0.000000;ip=192.168.1.100;mac=None;net=WIFI;p=iOS;pm=iPhone9,1;osv=10.0.1;dvid=09DF78A6-935D-46E6-9BB5-201610241142;uid=728924;idfa=D2CAED51-9235-4B51-9CC6-7ECC3AE7DD91")) 68 | 69 | res should contain key ("cityId") 70 | res.get("cityId") should equal(Some("1")) 71 | 72 | res should contain key ("auth") 73 | res.get("auth") should equal(Some("kCp+SLcl85sKrn/1jntFnhRXZlG79zMr6wEAy7Vkd9TyJ46da3IxyJPRLdd/ngMk/KqLmF8p26/izeoN7/Pgo7NB5VO21FyaHKrN370snfqWOv5CYb1x7fFJNJQYwwX54ketZAJ1mMSWj7LzbhSj9Kedl56dUi/9OL64djEld2iecKGWtNk2Rc4I2FWjoLiavAsJh/6RCOJ84tcc7KLB+IeCjz/uW3JlrZoJO3qvDfMiCv28y6geQjRNVljmBo3P")) 74 | 75 | res should contain key ("logType") 76 | res.get("logType") should equal(Some("accessLog")) 77 | } 78 | } 79 | --------------------------------------------------------------------------------