├── .gitignore ├── build.sbt ├── project └── plugins.sbt ├── scripts ├── CbcfService.sh ├── CommunityIbcfService.sh ├── InventoryIbcfService.sh ├── InventoryPortraitCleanService.sh ├── UserPortraitAttenuationService.sh ├── UserUbcfService.sh ├── community │ └── CommunityIBCF.sh ├── extract │ └── AccessLogToKafka.sh ├── inventory │ ├── InventoryIBCF.sh │ ├── InventoryPortraitClean.sh │ └── PropertyInventoryIndex.sh └── user │ ├── UserPortrait.sh │ ├── UserPortraitAttenuation.sh │ └── UserUbcf.sh └── src ├── main ├── java │ └── com │ │ └── angejia │ │ └── dw │ │ ├── common │ │ └── util │ │ │ ├── DateUtil.java │ │ │ ├── DebugUtil.java │ │ │ ├── FileUtil.java │ │ │ ├── JavaJsonUtil.java │ │ │ ├── PropertyUtil.java │ │ │ ├── mysql │ │ │ └── JavaMysqlClient.java │ │ │ └── parse │ │ │ ├── ParseMobileAgent.java │ │ │ └── ParseMobileToken.java │ │ ├── hadoop │ │ └── hive │ │ │ └── HiveClient.java │ │ └── service │ │ ├── Conf.java │ │ ├── property │ │ ├── PropertyInventoryService.java │ │ └── model │ │ │ └── Inventory.java │ │ └── user │ │ └── UserService.java ├── resources │ ├── conf_dev.properties │ ├── conf_online.properties │ └── log4j.properties └── scala │ └── com │ └── angejia │ └── dw │ ├── common │ └── util │ │ ├── JsonUtil.scala │ │ ├── ListenerFile.scala │ │ ├── RegexUtil.scala │ │ ├── ScFileUtil.scala │ │ ├── ScriptUtil.scala │ │ └── mysql │ │ └── MysqlClient.scala │ ├── hadoop │ ├── hbase │ │ └── HBaseClient.scala │ ├── hdfs │ │ ├── HDFSClient.scala │ │ └── HDFSClientTest.scala │ ├── kafka │ │ ├── KafkaConsumer.scala │ │ └── KafkaProducer.scala │ └── spark │ │ ├── CollaborativeFiltering.scala │ │ └── CollaborativeFilteringTest.scala │ ├── logs │ ├── UbaAppActionLogStreaming.scala │ ├── UbaWebActionLogStreaming.scala │ └── UbaWebVisitLogStreaming.scala │ └── recommend │ ├── Conf.scala │ ├── IBCF.scala │ ├── UBCF.scala │ ├── community │ └── CommunityIBCF.scala │ ├── extract │ └── ExtractFileToKafka.scala │ ├── inventory │ ├── InventoryIBCF.scala │ ├── InventoryIBCFspark.scala │ ├── InventoryItemCF.scala │ ├── InventoryItemCFBak.scala │ ├── InventoryItemCFTest.scala │ └── portrait │ │ ├── InventoryPortraitCommon.scala │ │ └── MarketingInventoryPortrait.scala │ └── user │ ├── UserUBCF.scala │ ├── UserUBCF20160517.scala │ └── portrait │ ├── UserPortrait.scala │ ├── UserPortraitAttenuation.scala │ ├── UserPortraitBrowse.scala │ ├── UserPortraitCommon.scala │ ├── UserPortraitFilter.scala │ ├── UserPortraitLikeInventory.scala │ ├── UserPortraitLinkInventory.scala │ ├── UserPortraitMemberDemand.scala │ ├── UserPortraitMemberDemand_20160808.scala │ ├── UserPortraitNeeds.scala │ ├── UserPortraitTagConf.scala │ ├── UserPortraitTags.scala │ ├── UserPortraitVisitItem.scala │ └── UserPortraitrModelState.scala └── test └── scala └── com └── angejia └── dw └── recommend ├── inventory └── portrait │ └── InventoryPortraitCommonTest.scala └── user └── portrait └── UserPortraitTest.scala /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .settings/ 3 | .classpath 4 | .project 5 | .springBeans 6 | target/ 7 | derby.log 8 | metastore_db 9 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import AssemblyKeys._ 2 | 3 | // 打开 assembly 插件功能 4 | assemblySettings 5 | 6 | // 配置 assembly 插件所有使用的 JAR 7 | jarName in assembly := "recommend-2.0.jar" 8 | 9 | // 项目名称 10 | name := "recommend-2.0" 11 | 12 | // 组织名称 13 | organization := 
"com.angejia.dw.recommend" 14 | 15 | // 项目版本号 16 | version := "2.0" 17 | 18 | // scala 版本 19 | scalaVersion := "2.11.8" 20 | 21 | // Eclipse 支持 22 | EclipseKeys.createSrc := EclipseCreateSrc.Default + EclipseCreateSrc.Resource 23 | 24 | // 非托管资源目录 25 | unmanagedResourceDirectories in Compile += { baseDirectory.value / "src/main/resources" } 26 | 27 | // 相关依赖 28 | libraryDependencies ++= Seq( 29 | // scala-library 30 | "org.scala-lang" % "scala-library" % "2.11.8", 31 | 32 | // hadoop 依赖 33 | "org.apache.hadoop" % "hadoop-common" % "2.6.0", 34 | "org.apache.hadoop" % "hadoop-hdfs" % "2.6.0", 35 | "org.apache.hadoop" % "hadoop-client" % "2.6.0", 36 | 37 | // Spark 依赖 : spark-core_2.11(spark 所属 scala 版本号) 2.0.2(spark 版本号) 38 | "org.apache.spark" % "spark-core_2.11" % "2.0.2", 39 | "org.apache.spark" % "spark-streaming_2.11" % "2.0.2", 40 | "org.apache.spark" % "spark-streaming-kafka-0-10_2.11" % "2.0.2" 41 | //"org.apache.spark" % "spark-streaming-kafka-0-8_2.11" % "2.0.2" 42 | exclude("org.apache.avro","*") 43 | exclude("org.slf4j","*"), 44 | "org.apache.spark" % "spark-mllib_2.11" % "2.0.2", 45 | // spark sql 46 | "org.apache.spark" % "spark-sql_2.11" % "2.0.2", 47 | "org.apache.spark" % "spark-hive_2.11" % "2.0.2", 48 | //"org.apache.avro" % "avro" % "1.7.4", 49 | //"org.apache.avro" % "avro-ipc" % "1.7.4" excludeAll(excludeNetty), 50 | 51 | // hive 相关 JDBC 52 | "org.apache.hive" % "hive-common" % "1.1.0", 53 | //"org.apache.hive" % "hive-exec" % "1.1.0", 54 | "org.apache.hive" % "hive-jdbc" % "1.1.0", 55 | "org.apache.hive" % "hive-cli" % "1.1.0", 56 | //"org.spark-project.hive" % "hive-beeline" % "1.2.1.spark2", 57 | 58 | // jblas 线性代数库,求向量点积 59 | "org.jblas" % "jblas" % "1.2.4", 60 | 61 | // Kafka 依赖 62 | "org.apache.kafka" % "kafka-log4j-appender" % "0.10.1.0" % "provided", 63 | "org.apache.kafka" % "kafka_2.11" % "0.10.1.0" 64 | exclude("javax.jms", "jms") 65 | exclude("com.sun.jdmk", "jmxtools") 66 | exclude("com.sun.jmx", "jmxri"), 67 | 68 | // Hbase 依赖 69 | //"org.apache.hbase" % "hbase" % "1.0.0", 70 | "org.apache.hbase" % "hbase-common" % "1.0.0", 71 | "org.apache.hbase" % "hbase-client" % "1.0.0", 72 | "org.apache.hbase" % "hbase-server" % "1.0.0", 73 | //"org.apache.hbase" % "hbase-protocol" % "1.0.0", 74 | //"org.apache.htrace" % "htrace-core" % "3.1.0-incubating", 75 | 76 | // Mysql 依赖 77 | "mysql" % "mysql-connector-java" % "5.1.38", 78 | 79 | // ES 客户端 80 | //"org.elasticsearch" % "elasticsearch" % "2.3.4", 81 | // 原始 elasticsearch 依赖因为 guava 包会产生冲突 , HBase 使用的是 12.0, ES 使用的是 19.0 的版本 82 | // 解决方法: http://blog.csdn.net/sunshine920103/article/details/51659936 83 | //"com.angejia.dw.elasticsearch" % "dw_elasticsearch" % "1.0", 84 | 85 | // play Json 包, 版本太高会冲突 86 | "com.typesafe.play" % "play-json_2.11" % "2.3.9", 87 | // spray Json 包 88 | "io.spray" % "spray-json_2.11" % "1.3.2", 89 | // smart Json 包 90 | "net.minidev" % "json-smart" % "2.2.1", 91 | 92 | // java Json 包 93 | "com.googlecode.json-simple" % "json-simple" % "1.1.1", 94 | 95 | // ORM 框架 Hibernate 96 | //"org.hibernate" % "hibernate-core" % "5.2.1.Final", 97 | //"org.hibernate.javax.persistence" % "hibernate-jpa-2.0-api" % "1.0.1.Final", 98 | //"commons-logging" % "commons-logging" % "1.2", 99 | //"commons-collections" % "commons-collections" % "3.2.2", 100 | //"cglib" % "cglib" % "3.2.4", 101 | //"dom4j" % "dom4j" % "1.6.1", 102 | 103 | // 其他 104 | "net.sf.jopt-simple" % "jopt-simple" % "4.9" % "provided", 105 | "joda-time" % "joda-time" % "2.9.2" % "provided", 106 | "commons-codec" % "commons-codec" % "1.10", 
107 | "log4j" % "log4j" % "1.2.9", 108 | "com.github.scopt" %% "scopt" % "3.5.0", 109 | 110 | // unit-testing framework 111 | "org.scalatest" %% "scalatest" % "3.0.0" % "test" 112 | ) 113 | 114 | 115 | // merge strategy: where assembly would otherwise fail on duplicate entries, keep the first one 116 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { mergeStrategy => { 117 | case entry => { 118 | val strategy = mergeStrategy(entry) 119 | if (strategy == MergeStrategy.deduplicate) MergeStrategy.first 120 | else strategy 121 | } 122 | }} 123 | 124 | 125 | // remote resolvers 126 | resolvers ++= Seq( 127 | 128 | // HTTPS is unavailable for Maven Central 129 | "Maven Repository" at "http://repo.maven.apache.org/maven2", 130 | "Apache Repository" at "https://repository.apache.org/content/repositories/releases", 131 | "JBoss Repository" at "https://repository.jboss.org/nexus/content/repositories/releases/", 132 | "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/", 133 | "Elasticsearch Repository" at "https://mvnrepository.com/artifact/org.elasticsearch/elasticsearch", 134 | "MQTT Repository" at "https://repo.eclipse.org/content/repositories/paho-releases/", 135 | 136 | // For Sonatype publishing 137 | // "sonatype-snapshots" at "https://oss.sonatype.org/content/repositories/snapshots", 138 | // "sonatype-staging" at "https://oss.sonatype.org/service/local/staging/deploy/maven2/", 139 | // also check the local Maven repository ~/.m2 "/usr/local/maven/repository" 140 | 141 | // local Maven repository path 142 | "Local Maven Repository" at "file:///usr/local/maven/repository", 143 | Resolver.mavenLocal 144 | ) 145 | 146 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns) 2 | 3 | resolvers += "Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/" 4 | 5 | // Eclipse plugin 6 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.5.0") 7 | 8 | // plugin that packages all dependencies into a single jar 9 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") 10 | -------------------------------------------------------------------------------- /scripts/CbcfService.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Restart all CBCF services: ./CbcfService.sh "/home/dwadmin/app/recommend/recommend-2.0" 3 | 4 | # project path 5 | PROJECT_HOME=$1 6 | 7 | 8 | # remotely restart the user-portrait job 9 | echo "UserPortrait" 10 | ssh -q -t dwadmin@bi4 "bash -i ${PROJECT_HOME}/scripts/user/UserPortrait.sh \"${PROJECT_HOME}/target/scala-2.11/recommend-2.0.jar\" " 11 | 12 | # remotely restart the log-extraction job 13 | echo "AccessLogToKafka" 14 | ssh -q -t dwadmin@bi0 "bash -i ${PROJECT_HOME}/scripts/extract/AccessLogToKafka.sh \"${PROJECT_HOME}/target/scala-2.11/recommend-2.0.jar\" " 15 | -------------------------------------------------------------------------------- /scripts/CommunityIbcfService.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Restart all CommunityIBCF services: ./CommunityIbcfService.sh "/home/hadoop/app/recommend/recommend-2.0" 3 | 4 | # recommend-system project path 5 | PROJECT_HOME=$1 6 | 7 | # run this sh on the task3 node 8 | echo "CommunityIBCF" 9 | ssh -q -t hadoop@uhadoop-ociicy-task3 "bash -i ${PROJECT_HOME}/scripts/community/CommunityIBCF.sh \"${PROJECT_HOME}/target/scala-2.10/recommend-2.0.jar\" " 10 | 11 | 12 | # dw_etl project path 13 | #DW_ETL_HOME=/home/dwadmin/app/dw_etl 14 | # save the result data into Hive via HBase 15 | 
#${DW_ETL_HOME}/dw_service/index.py --service task --mo hive_task --par '{"sql":"source/real_time/rt_recommend_inventroy_ibcf_result.sql", "date":"today", "runEnv":"local"}' -------------------------------------------------------------------------------- /scripts/InventoryIbcfService.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Restart all InventoryIBCF services: ./InventoryIbcfService.sh "/home/hadoop/app/recommend/recommend-2.0" 3 | 4 | # recommend-system project path 5 | PROJECT_HOME=$1 6 | 7 | # run this sh on the task3 node 8 | echo "InventoryIBCF" 9 | ssh -q -t hadoop@uhadoop-ociicy-task3 "bash -i ${PROJECT_HOME}/scripts/inventory/InventoryIBCF.sh \"${PROJECT_HOME}/target/scala-2.10/recommend-2.0.jar\" " 10 | 11 | 12 | # dw_etl project path 13 | #DW_ETL_HOME=/home/dwadmin/app/dw_etl 14 | # save the result data into Hive via HBase 15 | #${DW_ETL_HOME}/dw_service/index.py --service task --mo hive_task --par '{"sql":"source/real_time/rt_recommend_inventroy_ibcf_result.sql", "date":"today", "runEnv":"local"}' -------------------------------------------------------------------------------- /scripts/InventoryPortraitCleanService.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Clean invalid inventory portraits: ./InventoryPortraitCleanService.sh "/home/dwadmin/app/recommend/recommend-2.0" 3 | 4 | # project path 5 | PROJECT_HOME=$1 6 | 7 | # clean invalid inventory portraits 8 | echo "InventoryPortraitClean" 9 | ssh -q -t dwadmin@bi4 "bash -i ${PROJECT_HOME}/scripts/inventory/InventoryPortraitClean.sh \"${PROJECT_HOME}/target/scala-2.10/recommend-2.0.jar\" " 10 | -------------------------------------------------------------------------------- /scripts/UserPortraitAttenuationService.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Attenuate user portraits: ./UserPortraitAttenuationService.sh "/home/dwadmin/app/recommend/recommend-2.0" 3 | 4 | # project path 5 | PROJECT_HOME=$1 6 | 7 | # attenuate user portraits 8 | echo "UserPortraitAttenuation" 9 | ssh -q -t dwadmin@bi4 "bash -i ${PROJECT_HOME}/scripts/user/UserPortraitAttenuation.sh \"${PROJECT_HOME}/target/scala-2.10/recommend-2.0.jar\" " 10 | -------------------------------------------------------------------------------- /scripts/UserUbcfService.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Restart all UserUBCF services: ./UserUbcfService.sh "/home/hadoop/app/recommend/recommend-2.0" 3 | 4 | # recommend-system project path 5 | PROJECT_HOME=$1 6 | 7 | # run this sh on the task3 node 8 | echo "UserUbcf" 9 | ssh -q -t hadoop@uhadoop-ociicy-task3 "bash -i ${PROJECT_HOME}/scripts/user/UserUbcf.sh \"${PROJECT_HOME}/target/scala-2.10/recommend-2.0.jar\" " 10 | -------------------------------------------------------------------------------- /scripts/community/CommunityIBCF.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Restart the CommunityIBCF algorithm job 3 | 4 | # example: ./CommunityIBCF.sh "/home/hadoop/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar" 5 | 6 | # path of the jar 7 | JAR_PATH=$1 8 | 9 | # remove temporary files left by previous spark-submit runs 10 | rm -rf /tmp/spark-* 11 | 12 | # kill the old process 13 | ps -aux | grep 'com.angejia.dw.recommend.community.CommunityIBCF' | awk '{print $2}' | while read pid; 14 | do 15 | echo "old pid: ${pid}" 16 | kill -15 $pid; 17 | done 18 | 19 | 20 | # submit the job to the cluster in yarn-client mode 21 | spark-submit \ 22 | --name CommunityIBCF \ 23 | --class com.angejia.dw.recommend.community.CommunityIBCF \ 24 | --master yarn-client \ 25 | --driver-cores 4 \ 26 | --driver-memory 10240M \ 27 | --executor-memory 2048M \ 28 | 
--num-executors 2 \ 29 | ${JAR_PATH} "online" "hdfs://uhadoop-ociicy-master1:8020/user/hive/real_time/rt_user_community_history/*" 30 | 31 | 32 | # 新的进程 33 | ps -aux | grep 'com.angejia.dw.recommend.community.CommunityIBCF' | awk '{print $2}' | while read pid; 34 | do 35 | echo "new pid: ${pid}" 36 | done 37 | 38 | -------------------------------------------------------------------------------- /scripts/extract/AccessLogToKafka.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 重启 抽取日志脚本 3 | 4 | # 案例 ./AccessLogToKafka.sh "/home/dwadmin/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar" 5 | 6 | # jar 的地址 7 | JAR_PATH=$1 8 | 9 | # 原来的 进程 10 | ps -aux | grep ExtractFileToKafkaAccessLog | awk '{print $2}' | while read pid; 11 | do 12 | echo "old pid: ${pid}" 13 | kill -9 $pid; 14 | done 15 | 16 | 17 | # 提交任务 18 | java -Xms2048M -Xmx2048M -DAPP_NAME=ExtractFileToKafkaAccessLog \ 19 | -cp ${JAR_PATH} com.angejia.dw.recommend.extract.ExtractFileToKafka "uhadoop-ociicy-master1:2181" "bi4:9092" "accessLog" "0" "accessLogBase" "/data/log/real_time/logs/access_log" "2000" >> /data/log/real_time/logs/access_log_run 2>&1 & 20 | 21 | 22 | # 新的进程 23 | ps -aux | grep ExtractFileToKafkaAccessLog | awk '{print $2}' | while read pid; 24 | do 25 | echo "new pid: ${pid}" 26 | done 27 | 28 | -------------------------------------------------------------------------------- /scripts/inventory/InventoryIBCF.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 重启 InventoryIBCF 算法程序 3 | 4 | # 案例 ./InventoryIBCF.sh "/home/hadoop/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar" 5 | 6 | # jar 的地址 7 | JAR_PATH=$1 8 | 9 | # 删除 spark 提交时的临时文件 10 | rm -rf /tmp/spark-* 11 | 12 | # 原来的 进程 13 | ps -aux | grep 'com.angejia.dw.recommend.inventory.InventoryIBCF' | awk '{print $2}' | while read pid; 14 | do 15 | echo "old pid: ${pid}" 16 | kill -15 $pid; 17 | done 18 | 19 | 20 | # 提交任务给集群 yarn 客户端模式 21 | spark-submit \ 22 | --name InventoryIBCF \ 23 | --class com.angejia.dw.recommend.inventory.InventoryIBCF \ 24 | --master yarn-client \ 25 | --conf spark.driver.maxResultSize=8192M \ 26 | --driver-cores 4 \ 27 | --driver-memory 10240M \ 28 | --executor-memory 2048M \ 29 | --num-executors 2 \ 30 | ${JAR_PATH} "online" "hdfs://uhadoop-ociicy-master2:8020/user/hive/real_time/rt_user_inventory_history/*" 31 | 32 | 33 | # 新的进程 34 | ps -aux | grep 'com.angejia.dw.recommend.inventory.InventoryIBCF' | awk '{print $2}' | while read pid; 35 | do 36 | echo "new pid: ${pid}" 37 | done 38 | 39 | -------------------------------------------------------------------------------- /scripts/inventory/InventoryPortraitClean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 清理无用房源画像 3 | 4 | # 案例 ./InventoryPortraitClean "/home/dwadmin/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar" 5 | 6 | # jar 的地址 7 | JAR_PATH=$1 8 | 9 | # 原来的 进程 10 | ps -aux | grep 'com.angejia.dw.recommend.inventory.portrait.InventoryPortraitClean' | awk '{print $2}' | while read pid; 11 | do 12 | echo "old pid: ${pid}" 13 | kill -9 $pid; 14 | done 15 | 16 | 17 | # 提交任务 18 | java -DAPP_NAME=InventoryPortraitClean \ 19 | -cp ${JAR_PATH} com.angejia.dw.recommend.inventory.portrait.InventoryPortraitClean "online" "" >> /data/log/recommend/InventoryPortraitClean 2>&1 & 20 | 21 | 22 | # 新的进程 23 | ps -aux | grep 
'com.angejia.dw.recommend.inventory.portrait.InventoryPortraitClean' | awk '{print $2}' | while read pid; 24 | do 25 | echo "new pid: ${pid}" 26 | done 27 | 28 | -------------------------------------------------------------------------------- /scripts/inventory/PropertyInventoryIndex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 房源索引服务 3 | 4 | # 案例 ./PropertyInventoryIndex.sh "/home/hadoop/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar" 5 | 6 | # jar 的地址 7 | JAR_PATH=$1 8 | 9 | 10 | # 原来的 进程 11 | ps -aux | grep 'com.angejia.dw.service.property.PropertyInventoryService' | awk '{print $2}' | while read pid; 12 | do 13 | echo "old pid: ${pid}" 14 | kill -15 $pid; 15 | done 16 | 17 | 18 | java -DAPP_NAME=PropertyInventoryIndexService \ 19 | -cp ~/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar \ 20 | com.angejia.dw.service.property.PropertyInventoryService \ 21 | "online" \ 22 | "/data/log/service/property/service_property_date_point" \ 23 | >> /data/log/service/property/service_property_extract 2>&1 24 | 25 | 26 | # 新的进程 27 | ps -aux | grep 'com.angejia.dw.service.property.PropertyInventoryService' | awk '{print $2}' | while read pid; 28 | do 29 | echo "new pid: ${pid}" 30 | done 31 | 32 | -------------------------------------------------------------------------------- /scripts/user/UserPortrait.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 重启 用户画像 程序 3 | 4 | # 案例 ./UserPortrait.sh "/home/dwadmin/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar" 5 | 6 | # jar 的地址 7 | JAR_PATH=$1 8 | 9 | # 删除 spark 提交时的临时文件 10 | rm -rf /tmp/spark-* 11 | 12 | # 原来的 进程 13 | ps -aux | grep 'com.angejia.dw.recommend.user.portrait.UserPortrait' | awk '{print $2}' | while read pid; 14 | do 15 | echo "old pid: ${pid}" 16 | kill -9 $pid; 17 | done 18 | 19 | 20 | # 提交任务 21 | spark-submit \ 22 | --name UserPortrait \ 23 | --class com.angejia.dw.recommend.user.portrait.UserPortrait \ 24 | --master yarn \ 25 | --deploy-mode client \ 26 | --driver-cores 2 \ 27 | --driver-memory 4096M \ 28 | --executor-memory 2048M \ 29 | --executor-cores 2 \ 30 | --num-executors 2 \ 31 | ${JAR_PATH} \ 32 | --env "online" \ 33 | --kafka-topic "accessLog" \ 34 | --kafka-consumer-gid "userPortrait" >> /data/log/recommend/UserPortrait 2>&1 & 35 | 36 | 37 | # 新的进程 38 | ps -aux | grep 'com.angejia.dw.recommend.user.portrait.UserPortrait' | awk '{print $2}' | while read pid; 39 | do 40 | echo "new pid: ${pid}" 41 | done 42 | 43 | -------------------------------------------------------------------------------- /scripts/user/UserPortraitAttenuation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 衰减用户画像 3 | 4 | # 案例 ./UserPortraitAttenuation.sh "/home/dwadmin/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar" 5 | 6 | # jar 的地址 7 | JAR_PATH=$1 8 | 9 | # 原来的 进程 10 | ps -aux | grep 'com.angejia.dw.recommend.user.portrait.UserPortraitAttenuation' | awk '{print $2}' | while read pid; 11 | do 12 | echo "old pid: ${pid}" 13 | kill -9 $pid; 14 | done 15 | 16 | 17 | # 提交任务 18 | java -DAPP_NAME=UserPortraitAttenuation \ 19 | -cp ${JAR_PATH} com.angejia.dw.recommend.user.portrait.UserPortraitAttenuation "online" "" >> /data/log/recommend/UserPortraitAttenuation 2>&1 & 20 | 21 | 22 | # 新的进程 23 | ps -aux | grep 'com.angejia.dw.recommend.user.portrait.UserPortraitAttenuation' | awk '{print $2}' | while read pid; 24 | do 25 | 
echo "new pid: ${pid}" 26 | done 27 | 28 | -------------------------------------------------------------------------------- /scripts/user/UserUbcf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 重启 UserUbcf 算法程序 3 | 4 | # 案例 ./UserUbcf.sh "/home/hadoop/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar" 5 | 6 | # jar 的地址 7 | JAR_PATH=$1 8 | 9 | # 删除 spark 提交时的临时文件 10 | rm -rf /tmp/spark-* 11 | 12 | # 原来的 进程 13 | ps -aux | grep 'com.angejia.dw.recommend.user.UserUBCF' | awk '{print $2}' | while read pid; 14 | do 15 | echo "old pid: ${pid}" 16 | kill -15 $pid; 17 | done 18 | 19 | 20 | # 提交任务给集群 yarn 客户端模式 21 | spark-submit \ 22 | --name UserUBCF \ 23 | --class com.angejia.dw.recommend.user.UserUBCF \ 24 | --master yarn-client \ 25 | --driver-cores 4 \ 26 | --driver-memory 10240M \ 27 | --executor-memory 2048M \ 28 | --num-executors 2 \ 29 | ${JAR_PATH} "online" "hdfs://uhadoop-ociicy-master2:8020/user/hive/real_time/rt_user_inventory_history/*" 30 | 31 | 32 | # 新的进程 33 | ps -aux | grep 'com.angejia.dw.recommend.user.UserUBCF' | awk '{print $2}' | while read pid; 34 | do 35 | echo "new pid: ${pid}" 36 | done 37 | 38 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/common/util/DateUtil.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util; 2 | 3 | import java.security.Timestamp; 4 | import java.text.ParseException; 5 | import java.text.SimpleDateFormat; 6 | import java.util.Calendar; 7 | import java.util.Date; 8 | import java.util.HashMap; 9 | import java.util.Map; 10 | 11 | 12 | /** 13 | * 日期转换 14 | * @author Jason 15 | */ 16 | public class DateUtil { 17 | 18 | //格式化日期模式,按照需求增加 19 | public static final String SIMPLE_FORMAT = "yyyy-MM-dd HH:mm:ss"; 20 | public static final String SIMPLE_Y_M_D_FORMAT = "yyyy-MM-dd"; 21 | public static final String SIMPLE_YMD_FORMAT = "yyyyMMdd"; 22 | public static final String SIMPLE_hms_FORMAT = "HH:mm:ss"; 23 | 24 | 25 | /** 26 | * String -> Date 27 | * @param time 28 | * @param time_format 29 | * @return Date 30 | */ 31 | public static Date StringToDate (String time,String time_format) { 32 | //设置格式化模式 33 | SimpleDateFormat format = new SimpleDateFormat(time_format); 34 | 35 | Date result = null; 36 | try { 37 | Date t = format.parse(time); 38 | result = t; 39 | } catch (ParseException e) { 40 | // TODO Auto-generated catch block 41 | e.printStackTrace(); 42 | } 43 | return result; 44 | } 45 | 46 | 47 | /** 48 | * String -> Timestamp 49 | * @param time 时间字符串 50 | * @param time_format 需要格式化的格式 51 | * @return Long 52 | * @throws ParseException 53 | * example : 54 | * DateUtil.StringToTimestamp("2010-06-25",DateUtil.SIMPLE_Y_M_D_FORMAT); 55 | */ 56 | public static Long StringToTimestamp (String time,String time_format) { 57 | //设置格式化模式 58 | SimpleDateFormat format = new SimpleDateFormat(time_format); 59 | 60 | Date t = DateUtil.StringToDate(time,time_format); 61 | 62 | return t.getTime(); 63 | 64 | } 65 | 66 | 67 | /** 68 | * Timestamp -> Sting 69 | * @param timestamp 70 | * @param time_format 71 | * @return String 72 | * example : 73 | * Long timestamp = DateUtil.StringToTimestamp("2010-06-25 00:24:00",DateUtil.SIMPLE_FORMAT); 74 | DateUtil.TimestampToSting(timestamp,DateUtil.SIMPLE_FORMAT); 75 | */ 76 | public static String TimestampToSting (Long timestamp,String time_format) { 77 | //根据时间戳拿到日期对象 78 | Date date = new Date(timestamp); 79 | 80 | //设置格式化模式 
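// NOTE: SimpleDateFormat is not thread-safe, which is why every call here builds a fresh
// formatter instance instead of sharing a static one.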
81 | SimpleDateFormat format = new SimpleDateFormat(time_format); 82 | 83 | //通过格式化对象,返回结果 84 | String result = format.format(date); 85 | 86 | return result; 87 | 88 | } 89 | 90 | 91 | /** 92 | * Timestamp -> Date 93 | * @param timestamp 94 | * @return Date 95 | */ 96 | public static Date TimestampToDate (Long timestamp) { 97 | 98 | Date date = new Date(timestamp); 99 | 100 | return date; 101 | } 102 | 103 | /** 104 | * String -> FormatString 105 | * @param time 106 | * @param current_time_format 107 | * @param new_time_format 108 | * @return String 109 | * example : 110 | * String date = DateUtil.StringToFormatString("2015-06-03", DateUtil.SIMPLE_YMD_FORMAT,DateUtil.SIMPLE_Y_M_D_FORMAT); 111 | */ 112 | public static String StringToFormatString (String time,String current_time_format,String new_time_format ) { 113 | //装换为时间戳 114 | Long timestamp = DateUtil.StringToTimestamp(time,current_time_format); 115 | 116 | //转换成字符串 117 | String string = DateUtil.TimestampToSting(timestamp,new_time_format); 118 | 119 | return string; 120 | } 121 | 122 | 123 | /** 124 | * date -> string 125 | * @param date 126 | * @param time_format 127 | * @return 128 | */ 129 | public static String DateToString (Date date,String time_format) { 130 | 131 | //设置格式化模式 132 | SimpleDateFormat simple_date_format = new SimpleDateFormat(time_format); 133 | 134 | //格式化日期 135 | String result = simple_date_format.format(date); 136 | 137 | return result; 138 | } 139 | 140 | 141 | /** 142 | * date -> Timestamp 143 | * @param date 144 | * @return Long 145 | */ 146 | public static Long DateToTimestamp (Date date) { 147 | return date.getTime(); 148 | } 149 | 150 | 151 | /** 152 | * 获取当前日期的 偏移天数 153 | * @param offset_day 154 | * @return 155 | */ 156 | public static Date getCalendarOffsetDateDay(int offset_day) { 157 | return DateUtil.calendarOffsetDateDay(offset_day,new Date()); 158 | } 159 | 160 | /** 161 | * 获取指定日期的 偏移天数 162 | * @param offset_day 163 | * @param curDate 164 | * @return 165 | */ 166 | public static Date getCalendarOffsetDateDay(int offset_day, Date curDate) { 167 | return DateUtil.calendarOffsetDateDay(offset_day,curDate); 168 | } 169 | 170 | /** 171 | * 获取指定偏移日期 172 | * @param offset_day 偏移天数,-1 表示昨天 1明天 2 后天,以此类推 173 | * @param curDate 指定日期 174 | * @return Date 175 | */ 176 | public static Date calendarOffsetDateDay (int offset_day, Date curDate) { 177 | Calendar c1 = Calendar.getInstance(); 178 | 179 | c1.setTime(curDate); // 设置当前日期 180 | c1.add(Calendar.DATE,offset_day); 181 | 182 | int year = c1.get(Calendar.YEAR); //获得年 183 | int month = c1.get(Calendar.MONTH) + 1; // 获得月份 184 | int date = c1.get(Calendar.DATE); // 获得日期 185 | int hours = c1.get(Calendar.HOUR_OF_DAY); // 获得小时 186 | int minute = c1.get(Calendar.MINUTE); // 获得分钟 187 | int second = c1.get(Calendar.SECOND); // 获得秒 188 | int day_of_week = c1.get(Calendar.DAY_OF_WEEK); //获得星期几(注意(这个与Date类是不同的):1代表星期日、2代表星期1、3代表星期二,以此类推) 189 | 190 | //Date 191 | return c1.getTime(); 192 | } 193 | 194 | 195 | /** 196 | * 获取当前时间戳 197 | * 1436768318923 198 | */ 199 | public static Long getNowTimestamp() { 200 | return System.currentTimeMillis(); 201 | } 202 | 203 | 204 | /** 205 | * 获取当前时间 , 可以指定格式 206 | * @return 207 | */ 208 | public static String getCurTime(String time_format) { 209 | return DateUtil.TimestampToSting(DateUtil.getNowTimestamp(),time_format); 210 | } 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | public static Date addDateOneDay(Date date) { 219 | if (null == date) { 220 | return date; 221 | } 222 | Calendar c = Calendar.getInstance(); 223 | 
c.setTime(date); //设置当前日期 224 | c.add(Calendar.DATE, 1); //日期加1天 225 | // c.add(Calendar.DATE, -1); //日期减1天 226 | date = c.getTime(); 227 | return date; 228 | } 229 | 230 | public static void main (String[] args) throws ParseException { 231 | 232 | Long timestamp = DateUtil.StringToTimestamp("2010-06-25 02:24:10",DateUtil.SIMPLE_FORMAT); 233 | String t2s = DateUtil.TimestampToSting(timestamp,DateUtil.SIMPLE_FORMAT); 234 | 235 | Date date = DateUtil.TimestampToDate(timestamp); 236 | System.out.println(t2s); 237 | 238 | } 239 | } 240 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/common/util/DebugUtil.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util; 2 | 3 | public class DebugUtil { 4 | 5 | public static void dump (Object obj,int i) { 6 | DebugUtil.print(obj); 7 | DebugUtil.exit(i); 8 | } 9 | 10 | public static void dump (Object obj) { 11 | DebugUtil.print(obj); 12 | } 13 | 14 | public static void exit(int i) { 15 | System.exit(i); 16 | } 17 | 18 | public static void print (Object obj) { 19 | System.out.println(DebugUtil.getType(obj)); 20 | System.out.println(obj); 21 | } 22 | 23 | public static String getType(Object o){ 24 | return o.getClass().toString(); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/common/util/FileUtil.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util; 2 | 3 | import java.io.*; 4 | 5 | public class FileUtil { 6 | 7 | /** 8 | * 向文件写数据 9 | * @param file_name 10 | * @param data 11 | * @param append 是否追加 12 | */ 13 | public static void fileOutputStream (String file_name,String data,boolean append) { 14 | 15 | try { 16 | 17 | File f = new File(file_name); 18 | FileOutputStream fop = new FileOutputStream(f,append); 19 | 20 | //构建OutputStreamWriter对象,参数可以指定编码,默认为操作系统默认编码,windows上是gbk 21 | OutputStreamWriter writer = new OutputStreamWriter(fop, "UTF-8"); 22 | 23 | //写入到缓冲区 24 | writer.append(data); 25 | 26 | //关闭写入流,同时会把缓冲区内容写入文件,所以上面的注释掉 27 | writer.close(); 28 | 29 | //关闭输出流,释放系统资源 30 | fop.close(); 31 | 32 | } catch (IOException e) { 33 | // TODO Auto-generated catch block 34 | e.printStackTrace(); 35 | } 36 | } 37 | 38 | 39 | /** 40 | * 读取文件 41 | * @param file_name 42 | * @return String 43 | */ 44 | public static String fileInputStream(String file_name){ 45 | 46 | StringBuilder sb = new StringBuilder(); 47 | 48 | try { 49 | File f = new File(file_name); 50 | 51 | //构建FileInputStream对象 52 | FileInputStream fip = new FileInputStream(f); 53 | 54 | // InputStreamReader 逐行读取六中的数据,编码与写入相同 55 | InputStreamReader reader = new InputStreamReader(fip, "UTF-8"); 56 | 57 | //一行行读去文件数据 58 | while (reader.ready()) { 59 | sb.append((char) reader.read()); 60 | } 61 | 62 | //关闭读取流 63 | reader.close(); 64 | 65 | //关闭输出流,释放系统资源 66 | fip.close(); 67 | 68 | } catch (IOException e) { 69 | // TODO Auto-generated catch block 70 | e.printStackTrace(); 71 | } 72 | 73 | return sb.toString(); 74 | } 75 | 76 | 77 | public static boolean deleteFile (String file_name) { 78 | boolean isDelete = false; 79 | try{ 80 | 81 | File file = new File(file_name); 82 | 83 | if(file.delete()){ 84 | isDelete = true; 85 | }else{ 86 | isDelete = false; 87 | } 88 | 89 | }catch(Exception e){ 90 | 91 | e.printStackTrace(); 92 | 93 | } 94 | return isDelete; 95 | } 96 | 97 | public static void main(String[] args) { 98 | 
//FileUtil a = new FileUtil(); 99 | //a.fileOutputStream("/tmp/aaa","中文输入"); 100 | //a.fileOutputStream("/tmp/aaa","\r\n"); 101 | //System.out.println(a.fileInputStream("/tmp/aaa")); 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/common/util/JavaJsonUtil.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util; 2 | 3 | import org.json.simple.JSONObject; 4 | import org.json.simple.JSONArray; 5 | import org.json.simple.parser.ParseException; 6 | import org.json.simple.parser.JSONParser; 7 | 8 | /** 9 | * Json 转换 10 | * @author Jason 11 | * JSONObject 就是 java.util.Map 12 | * JSONArray 就是 java.util.List 13 | * 使用 Map 或 List 的标准操作访问它们 14 | */ 15 | public class JavaJsonUtil { 16 | 17 | 18 | /** 19 | * JsonStr {} 转换成 java.util.Map 20 | * SONObject 就是 java.util.Map 21 | * @param strJson 22 | * @return 23 | */ 24 | public static JSONObject JsonStrToMap (String strJson) { 25 | JSONParser parser = new JSONParser(); 26 | JSONObject obj = null; 27 | 28 | try{ 29 | obj = (JSONObject) parser.parse(strJson); 30 | } catch(ParseException pe){ 31 | System.out.println("position: " + pe.getPosition()); 32 | System.out.println(pe); 33 | } 34 | //System.out.println(obj); 35 | return obj; 36 | } 37 | 38 | 39 | /** 40 | * JsonStr [{},{}] 转换成 java.util.List 41 | * JSONArray 就是 java.util.List 42 | * @param strJson 43 | * @return 44 | */ 45 | public static JSONArray JsonStrToArray(String strJson) { 46 | JSONParser parser = new JSONParser(); 47 | JSONArray obj = null; 48 | 49 | try{ 50 | obj = (JSONArray) parser.parse(strJson); 51 | } catch(ParseException pe){ 52 | System.out.println("position: " + pe.getPosition()); 53 | System.out.println(pe); 54 | } 55 | //System.out.println(obj); 56 | return obj; 57 | } 58 | 59 | /** 60 | * JSONObject 转换成 json 字符串 61 | * @param obj 62 | * @return 63 | */ 64 | public static String MapToJsonStr(JSONObject obj) { 65 | return obj.toJSONString(); 66 | } 67 | 68 | /** 69 | * JSONArray 转换成 Json 字符串 70 | * @param obj 71 | * @return 72 | */ 73 | public static String ArrayToJsonStr(JSONArray obj) { 74 | return obj.toJSONString(); 75 | } 76 | 77 | public static void main(String[] args) { 78 | JSONObject obja = JavaJsonUtil.JsonStrToMap("{\"a\":\"1\"}"); 79 | obja.put("a", "1"); 80 | //JavaJsonUtil.JsonStrToArray("[{\"a\":\"1\"},{\"a\":\"1\"}]"); 81 | //JavaJsonUtil.MapToJson(); 82 | System.exit(0); 83 | 84 | JSONParser parser=new JSONParser(); 85 | String s = "[0,{\"1\":{\"2\":{\"3\":{\"4\":[5,{\"6\":7}]}}}}]"; 86 | try{ 87 | Object obj = parser.parse(s); 88 | JSONArray array = (JSONArray)obj; 89 | System.out.println("The 2nd element of array"); 90 | System.out.println(array.get(1)); 91 | System.out.println(); 92 | JSONObject obj2 = (JSONObject)array.get(1); 93 | obj2.put(2, "a"); 94 | System.out.println("Field \"1\""); 95 | System.out.println(obj2.get(2)); 96 | 97 | s = "{}"; 98 | obj = parser.parse(s); 99 | System.out.println(obj); 100 | 101 | s= "[5,]"; 102 | obj = parser.parse(s); 103 | System.out.println(obj); 104 | 105 | s= "[5,,2]"; 106 | obj = parser.parse(s); 107 | System.out.println(obj); 108 | }catch(ParseException pe){ 109 | System.out.println("position: " + pe.getPosition()); 110 | System.out.println(pe); 111 | } 112 | } 113 | 114 | 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/common/util/PropertyUtil.java: 
-------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util; 2 | 3 | import java.io.BufferedInputStream; 4 | import java.io.FileInputStream; 5 | import java.io.FileNotFoundException; 6 | import java.io.FileOutputStream; 7 | import java.io.IOException; 8 | import java.io.InputStream; 9 | import java.io.OutputStream; 10 | import java.io.Reader; 11 | import java.util.Properties; 12 | import java.io.InputStreamReader; 13 | import java.io.BufferedReader; 14 | 15 | 16 | public class PropertyUtil { 17 | 18 | //属性文件的路径 19 | private String profilepath = ""; 20 | 21 | private Properties props = new Properties(); 22 | 23 | /** 24 | * 设置文件输入流(使用这种方式, 可以在 jar 内部读取文件等操作) 25 | */ 26 | public void setFileInputStream(Reader reader) throws IOException { 27 | this.props.load(reader); 28 | } 29 | 30 | 31 | /** 32 | * 设置文件路径(使用这种方式, jar 内部会读不到文件, 可以 jar 外部操作文件) 33 | * @param filePath 34 | */ 35 | public void setFilePath(String filePath) { 36 | this.profilepath = filePath; 37 | 38 | try { 39 | props.load(new FileInputStream(this.profilepath)); 40 | } catch (FileNotFoundException e) { 41 | e.printStackTrace(); 42 | System.exit(-1); 43 | } catch (IOException e) { 44 | System.exit(-1); 45 | } 46 | } 47 | 48 | 49 | /** 50 | * 读取属性文件中相应键的值 51 | * @param key 52 | * 主键 53 | * @return String 54 | */ 55 | public String getKeyValue(String key) { 56 | return props.getProperty(key); 57 | } 58 | 59 | 60 | /** 61 | * 根据主键key读取主键的值value 62 | * @param filePath 属性文件路径 63 | * @param key 键名 64 | */ 65 | public String readValue(String filePath, String key) { 66 | Properties props = new Properties(); 67 | try { 68 | InputStream in = new BufferedInputStream(new FileInputStream( 69 | filePath)); 70 | props.load(in); 71 | String value = props.getProperty(key); 72 | System.out.println(key +"键的值是:"+ value); 73 | return value; 74 | } catch (Exception e) { 75 | e.printStackTrace(); 76 | return null; 77 | } 78 | } 79 | 80 | 81 | /** 82 | * 更新(或插入)一对properties信息(主键及其键值) 83 | * 如果该主键已经存在,更新该主键的值; 84 | * 如果该主键不存在,则插件一对键值。 85 | * @param keyname 键名 86 | * @param keyvalue 键值 87 | */ 88 | public void writeProperties(String keyname,String keyvalue) { 89 | try { 90 | // 调用 Hashtable 的方法 put,使用 getProperty 方法提供并行性。 91 | // 强制要求为属性的键和值使用字符串。返回值是 Hashtable 调用 put 的结果。 92 | OutputStream fos = new FileOutputStream(profilepath); 93 | props.setProperty(keyname, keyvalue); 94 | // 以适合使用 load 方法加载到 Properties 表中的格式, 95 | // 将此 Properties 表中的属性列表(键和元素对)写入输出流 96 | props.store(fos, "Update '" + keyname + "' value"); 97 | } catch (IOException e) { 98 | System.err.println("属性文件更新错误"); 99 | } 100 | } 101 | 102 | 103 | /** 104 | * 更新properties文件的键值对 105 | * 如果该主键已经存在,更新该主键的值; 106 | * 如果该主键不存在,则插件一对键值。 107 | * @param keyname 键名 108 | * @param keyvalue 键值 109 | */ 110 | public void updateProperties(String keyname,String keyvalue) { 111 | try { 112 | props.load(new FileInputStream(profilepath)); 113 | // 调用 Hashtable 的方法 put,使用 getProperty 方法提供并行性。 114 | // 强制要求为属性的键和值使用字符串。返回值是 Hashtable 调用 put 的结果。 115 | OutputStream fos = new FileOutputStream(profilepath); 116 | props.setProperty(keyname, keyvalue); 117 | // 以适合使用 load 方法加载到 Properties 表中的格式, 118 | // 将此 Properties 表中的属性列表(键和元素对)写入输出流 119 | props.store(fos, "Update '" + keyname + "' value"); 120 | } catch (IOException e) { 121 | System.err.println("属性文件更新错误"); 122 | } 123 | } 124 | 125 | 126 | 127 | //测试代码 128 | public static void main(String[] args) throws IOException { 129 | System.out.println("123"); 130 | //返回读取指定资源的输入流 131 | InputStream is= 
PropertyUtil.class.getResourceAsStream("/conf_dev.properties"); 132 | BufferedReader br=new BufferedReader(new InputStreamReader(is)); 133 | String s=""; 134 | while((s=br.readLine())!=null) 135 | System.out.println(s); 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/common/util/mysql/JavaMysqlClient.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util.mysql; 2 | 3 | import java.util.Map; 4 | import java.util.HashMap; 5 | import java.util.List; 6 | import java.util.ArrayList; 7 | import java.util.Arrays; 8 | 9 | import java.sql.Connection; 10 | import java.sql.DriverManager; 11 | import java.sql.PreparedStatement; 12 | import java.sql.ResultSet; 13 | import java.sql.SQLException; 14 | import java.sql.Statement; 15 | 16 | public class JavaMysqlClient { 17 | 18 | private String url; 19 | private String user; 20 | private String psw; 21 | 22 | private Connection conn; 23 | 24 | static { 25 | try { 26 | Class.forName("com.mysql.jdbc.Driver"); 27 | } catch (ClassNotFoundException e) { 28 | e.printStackTrace(); 29 | throw new RuntimeException(e); 30 | } 31 | } 32 | 33 | public JavaMysqlClient(String url, String user, String psw) { 34 | this.url = url; 35 | this.user = user; 36 | this.psw = psw; 37 | } 38 | 39 | /** 40 | * Get (and lazily open) the database connection 41 | * 42 | * @return conn 43 | */ 44 | public Connection getConnection() { 45 | if (null == conn) { 46 | try { 47 | conn = DriverManager.getConnection(url, user, psw); 48 | } catch (SQLException e) { 49 | e.printStackTrace(); 50 | throw new RuntimeException(e); 51 | } 52 | } 53 | return conn; 54 | } 55 | 56 | /** 57 | * Query data 58 | * 59 | * @param sql 60 | * SQL to execute 61 | * @param fields 62 | * comma-separated field names to read from each row 63 | * @return List<Map<String, String>> 64 | * @throws SQLException 65 | */ 66 | public List<Map<String, String>> select(String sql, String fields) { 67 | 68 | List<Map<String, String>> rsList = new ArrayList<Map<String, String>>(); 69 | 70 | // TODO: accept the field list as a String[] instead of a comma-separated String 71 | String[] fieldsArr = fields.split(","); 72 | 73 | try { 74 | // prepare the statement on the shared connection 75 | PreparedStatement ptmt = getConnection().prepareStatement(sql); 76 | 77 | // execute the SQL and fetch the result set 78 | ResultSet rs = ptmt.executeQuery(); 79 | 80 | // holds the field values of one row 81 | Map<String, String> rowData; 82 | // iterate over the result set 83 | while (rs.next()) { 84 | rowData = new HashMap<String, String>(); 85 | for (String field : fieldsArr) { 86 | rowData.put(field, rs.getString(field)); 87 | } 88 | // append the row to the result list 89 | rsList.add(rowData); 90 | } 91 | 92 | } catch (SQLException e) { 93 | e.printStackTrace(); 94 | } 95 | 96 | return rsList; 97 | } 98 | 99 | /** 100 | * Get the row count of a query 101 | * 102 | * Expects SQL like: SELECT COUNT(*) AS cn FROM tbl 103 | *
104 | * 105 | * @param sql 106 | * @return int 107 | * @throws SQLException 108 | */ 109 | public int count(String sql) { 110 | int cn = 0; 111 | 112 | try { 113 | PreparedStatement ptmt = getConnection().prepareStatement(sql); 114 | ResultSet rs = ptmt.executeQuery(); 115 | rs.next(); 116 | cn = rs.getInt("cn"); 117 | } catch (SQLException e) { 118 | // TODO Auto-generated catch block 119 | e.printStackTrace(); 120 | } 121 | 122 | return cn; 123 | 124 | } 125 | 126 | /** 127 | * 执行 Sql 128 | * 129 | * @param sql 130 | * @return boolean 131 | * @throws SQLException 132 | */ 133 | public boolean execute(String sql) { 134 | boolean rs = false; 135 | try { 136 | PreparedStatement ptmt = getConnection().prepareStatement(sql); 137 | rs = ptmt.execute(sql); 138 | ptmt.close(); 139 | } catch (SQLException e) { 140 | e.printStackTrace(); 141 | } 142 | return rs; 143 | } 144 | 145 | /** 146 | * 释放资源 147 | * 148 | * @param conn 149 | * @param pstmt 150 | * @param rs 151 | */ 152 | public void closeResources(Connection conn, PreparedStatement pstmt, ResultSet rs) { 153 | if (null != rs) { 154 | try { 155 | rs.close(); 156 | } catch (SQLException e) { 157 | e.printStackTrace(); 158 | throw new RuntimeException(e); 159 | } finally { 160 | if (null != pstmt) { 161 | try { 162 | pstmt.close(); 163 | } catch (SQLException e) { 164 | e.printStackTrace(); 165 | throw new RuntimeException(e); 166 | } finally { 167 | if (null != conn) { 168 | try { 169 | conn.close(); 170 | } catch (SQLException e) { 171 | e.printStackTrace(); 172 | throw new RuntimeException(e); 173 | } 174 | } 175 | } 176 | } 177 | } 178 | } 179 | } 180 | 181 | } 182 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/common/util/parse/ParseMobileAgent.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util.parse; 2 | 3 | import org.apache.hadoop.hive.ql.exec.UDF; 4 | import java.util.regex.Pattern; 5 | import java.util.regex.Matcher; 6 | 7 | 8 | /** 9 | * 解析 accessLog 中的 Http 头的 数据 10 | * app=i-broker;av=1.0.0;ccid=1;gcid=1;ch=A01;lng=121.526063;lat=31.219871;ip=;mac=None;net=WIFI;p=iOS;pm=iPhone 4S;osv=7.1;dvid=32A02E76EC8C-4D78-B331-201503251125; 11 | * 这个数据是 APP 请求的时候发送的 12 | */ 13 | public class ParseMobileAgent { 14 | /** 15 | * @param s sting 16 | * @param p pattern 17 | * @return 18 | */ 19 | public static String evaluate(String s, String p) { 20 | if (s == null) { return ""; } 21 | String base_p = p+"=([^;]+)"; 22 | 23 | String result = ""; 24 | 25 | String first_result = parseAgent(s, ";"+base_p);//先执行严格匹配,防止取p的时候把app的值取出来 26 | 27 | if(first_result == ""){ 28 | String second_result = parseAgent(s, base_p); 29 | result = second_result; 30 | }else{ 31 | result = first_result; 32 | } 33 | 34 | return result; 35 | } 36 | public static String parseAgent(String s,String p){ 37 | if (s == null) { return ""; } 38 | Pattern pattern = Pattern.compile(p); 39 | Matcher matcher=pattern.matcher(s); 40 | 41 | if(matcher.find()){ 42 | return matcher.group(1); 43 | } 44 | return ""; 45 | } 46 | // public static void main(String[] args){ 47 | // String s = "app=i-broker;av=1.0.0;ccid=1;gcid=1;ch=A01;lng=121.526063;lat=31.219871;ip=;mac=None;net=WIFI;p=iOS;pm=iPhone 4S;osv=7.1;dvid=32A02E76EC8C-4D78-B331-201503251125;"; 48 | // ParseMobileAgent obj = new ParseMobileAgent(); 49 | // System.out.println(obj.evaluate(s,"app"));//开头的值 50 | // System.out.println(obj.evaluate(s,"p"));//取重复值 51 | // 
System.out.println(obj.evaluate(s,"gcid"));//中间的值 52 | // System.out.println(obj.evaluate(s,"dvid"));//结尾的值 53 | // System.out.println(obj.evaluate(s,"ip"));//取空值 54 | // System.out.println(obj.evaluate(s,"notexist"));//不存在的值 55 | // } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/common/util/parse/ParseMobileToken.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util.parse; 2 | 3 | import java.net.URLDecoder; 4 | import java.util.Map; 5 | 6 | import javax.crypto.Cipher; 7 | import javax.crypto.spec.IvParameterSpec; 8 | import javax.crypto.spec.SecretKeySpec; 9 | 10 | import org.apache.commons.codec.binary.Base64; 11 | import org.codehaus.jackson.JsonParseException; 12 | import org.codehaus.jackson.map.ObjectMapper; 13 | 14 | /** 15 | * 解密 token 16 | */ 17 | public class ParseMobileToken { 18 | 19 | public static String evaluate(String s, String index) throws Exception { 20 | if (s == null || s.length() <= 0) { 21 | return ""; 22 | } 23 | 24 | String token = Decrypt(s); 25 | 26 | if (token != null && token.length() > 0) {// json decode 27 | Map> maps; 28 | ObjectMapper objectMapper = new ObjectMapper(); 29 | try { 30 | try { 31 | maps = objectMapper.readValue(token, Map.class); 32 | if (maps.containsKey(index)) { 33 | return String.valueOf(maps.get(index)); 34 | } 35 | } catch (JsonParseException e) { 36 | maps = objectMapper.readValue(URLDecoder.decode(token, "utf-8"), Map.class); 37 | if (maps.containsKey(index)) { 38 | return String.valueOf(maps.get(index)); 39 | } 40 | } 41 | } catch (Exception e) { 42 | System.err.println(e.toString()); 43 | return ""; 44 | } 45 | } 46 | 47 | return ""; 48 | } 49 | 50 | public static String Decrypt(String data) throws Exception { 51 | try { 52 | String key = "12345678123456xx"; 53 | String iv = "12345678123456xx"; 54 | 55 | byte[] encrypted1 = new Base64().decode(data); 56 | 57 | Cipher cipher = Cipher.getInstance("AES/CBC/NoPadding"); 58 | SecretKeySpec keyspec = new SecretKeySpec(key.getBytes(), "AES"); 59 | IvParameterSpec ivspec = new IvParameterSpec(iv.getBytes()); 60 | 61 | cipher.init(Cipher.DECRYPT_MODE, keyspec, ivspec); 62 | try { 63 | byte[] original = cipher.doFinal(encrypted1); 64 | String originalString = new String(original); 65 | return originalString; 66 | } catch (Exception e) { 67 | System.err.println(e.toString()); 68 | return null; 69 | } 70 | } catch (Exception e) { 71 | e.printStackTrace(); 72 | return null; 73 | } 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/hadoop/hive/HiveClient.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.hadoop.hive; 2 | 3 | 4 | import java.util.Map; 5 | import java.util.HashMap; 6 | import java.util.List; 7 | import java.util.ArrayList; 8 | import java.util.Arrays; 9 | 10 | 11 | import java.sql.SQLException; 12 | import java.sql.Connection; 13 | import java.sql.ResultSet; 14 | import java.sql.Statement; 15 | import java.sql.DriverManager; 16 | import org.apache.hive.jdbc.HiveDriver; 17 | 18 | 19 | public class HiveClient { 20 | 21 | private static String driverName = "org.apache.hive.jdbc.HiveDriver"; 22 | 23 | /** 24 | * 获取连接 25 | */ 26 | private Connection connection; 27 | public Connection getConnection() { 28 | return connection; 29 | } 30 | public void setConnection(Connection conn) { 31 | 
this.connection = conn; 32 | } 33 | 34 | 35 | 36 | public HiveClient(String url, String user, String password) throws SQLException { 37 | // load the JDBC driver class 38 | try { 39 | Class.forName(driverName); 40 | } catch (ClassNotFoundException e) { 41 | e.printStackTrace(); 42 | } 43 | 44 | // open the connection 45 | Connection con = DriverManager.getConnection(url, user, password); 46 | this.setConnection(con); 47 | 48 | // create a statement handle on the connection 49 | //Statement stmt = con.createStatement(); 50 | //this.setStmt(stmt); 51 | } 52 | 53 | 54 | /** 55 | * Execute the given SQL 56 | * @param sql 57 | * @return Boolean 58 | */ 59 | public Boolean execute (String sql) { 60 | Boolean rs = false; 61 | try { 62 | Statement stmt = this.getConnection().createStatement(); 63 | rs = stmt.execute(sql); 64 | stmt.close(); 65 | } catch (SQLException e) { 66 | e.printStackTrace(); 67 | } 68 | return rs; 69 | } 70 | 71 | 72 | /** 73 | * Query data 74 | * @param sql 75 | * @param fields comma-separated field names 76 | * @return List<Map<String, String>> 77 | * @throws SQLException 78 | * 79 | * Iterate over the result (e.g. from Scala) like: 80 | * for (i <- 0 to rsData.size() - 1) { 81 | println(rsData.get(i).get("visit_item_invs_a")) 82 | } 83 | * 84 | */ 85 | public List<Map<String, String>> select(String sql, String fields) throws SQLException { 86 | // holds the result rows 87 | List<Map<String, String>> listResult = new ArrayList<Map<String, String>>(); 88 | 89 | ResultSet res = null; 90 | try { 91 | Statement stmt = this.getConnection().createStatement(); 92 | 93 | res = stmt.executeQuery(sql); 94 | 95 | // split the field list into an array 96 | String[] arrFields = fields.split(","); 97 | 98 | // iterate over each row 99 | while (res.next()) { 100 | // holds one row 101 | Map<String, String> mapRowData = new HashMap<String, String>(); 102 | 103 | // collect the field values 104 | for (String field : arrFields) { 105 | mapRowData.put(field, res.getString(field)); 106 | } 107 | // append to the list 108 | listResult.add(mapRowData); 109 | 110 | mapRowData = null; 111 | } 112 | 113 | stmt.close(); 114 | res.close(); 115 | 116 | } catch (SQLException e) { 117 | e.printStackTrace(); 118 | } 119 | 120 | //res.close(); 121 | /** 122 | for (Map<String, String> rs : listResult) { 123 | System.out.println(rs.get("broker_id")); 124 | } 125 | */ 126 | 127 | return listResult; 128 | } 129 | 130 | 131 | /** 132 | * Close the connection 133 | */ 134 | public void closeConnection() { 135 | try { 136 | this.getConnection().close(); 137 | } catch (SQLException e) { 138 | e.printStackTrace(); 139 | } 140 | } 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | /** 152 | * Test method 153 | * @param args 154 | * @throws SQLException 155 | */ 156 | public static void main(String[] args) throws SQLException { 157 | try { 158 | Class.forName(driverName); 159 | } catch (ClassNotFoundException e) { 160 | // TODO Auto-generated catch block 161 | e.printStackTrace(); 162 | System.exit(1); 163 | } 164 | //replace "hive" here with the name of the user the queries should run as 165 | Connection con = DriverManager.getConnection("jdbc:hive2://localhost:10000/default", "hive", ""); 166 | Statement stmt = con.createStatement(); 167 | String tableName = "testHiveDriverTable"; 168 | stmt.execute("drop table if exists " + tableName); 169 | stmt.execute("create table " + tableName + " (key int, value string)"); 170 | // show tables 171 | String sql = "show tables '" + tableName + "'"; 172 | System.out.println("Running: " + sql); 173 | ResultSet res = stmt.executeQuery(sql); 174 | if (res.next()) { 175 | System.out.println(res.getString(1)); 176 | } 177 | // describe table 178 | sql = "describe " + tableName; 179 | System.out.println("Running: " + sql); 180 | res = stmt.executeQuery(sql); 181 | while (res.next()) { 182 | System.out.println(res.getString(1) + "\t" + res.getString(2)); 183 | } 184
| 185 | // load data into table 186 | // NOTE: filepath has to be local to the hive server 187 | // NOTE: /tmp/a.txt is a ctrl-A separated file with two fields per line 188 | String filepath = "/tmp/a.txt"; 189 | sql = "load data local inpath '" + filepath + "' into table " + tableName; 190 | System.out.println("Running: " + sql); 191 | stmt.execute(sql); 192 | 193 | // select * query 194 | sql = "select * from " + tableName; 195 | System.out.println("Running: " + sql); 196 | res = stmt.executeQuery(sql); 197 | while (res.next()) { 198 | System.out.println(String.valueOf(res.getInt(1)) + "\t" + res.getString(2)); 199 | } 200 | 201 | // regular hive query 202 | sql = "select count(1) from " + tableName; 203 | System.out.println("Running: " + sql); 204 | res = stmt.executeQuery(sql); 205 | while (res.next()) { 206 | System.out.println(res.getString(1)); 207 | } 208 | } 209 | } 210 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/service/Conf.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.service; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.InputStreamReader; 6 | 7 | import java.util.Map; 8 | import java.util.HashMap; 9 | 10 | import com.angejia.dw.common.util.PropertyUtil; 11 | 12 | public class Conf { 13 | 14 | // configuration file reader 15 | PropertyUtil property = new PropertyUtil(); 16 | 17 | /** 18 | * Set the environment; each environment uses its own configuration file 19 | * @throws IOException 20 | */ 21 | public void setEnv(String env) { 22 | 23 | // name of the configuration file to read 24 | String confName = "/conf_" + env + ".properties"; 25 | 26 | // open an input stream for the resource file 27 | InputStream classPath = Conf.class.getResourceAsStream(confName); 28 | InputStreamReader inputStreamReader = new InputStreamReader(classPath); 29 | 30 | // hand the stream to the property reader 31 | try { 32 | property.setFileInputStream(inputStreamReader); 33 | } catch (IOException e) { 34 | // TODO Auto-generated catch block 35 | e.printStackTrace(); 36 | } 37 | } 38 | 39 | 40 | /** 41 | * Get the Spark configuration 42 | * @return Map<String, String> 43 | */ 44 | public Map<String, String> getSparkConf(){ 45 | Map<String, String> data = new HashMap<String, String>(); 46 | data.put("sparkThriftServerUrl", property.getKeyValue("spark.thrift.server.url")); 47 | data.put("sparkThriftServerUser", property.getKeyValue("spark.thrift.server.user")); 48 | data.put("sparkThriftServerPass", property.getKeyValue("spark.thrift.server.pass")); 49 | 50 | return data; 51 | } 52 | 53 | 54 | /** 55 | * Get the elasticsearch configuration 56 | * @return 57 | */ 58 | public Map<String, String> getElasticsearchMasterConf(){ 59 | Map<String, String> data = new HashMap<String, String>(); 60 | data.put("elasticsearchMasterHost", property.getKeyValue("elasticsearch.master.host")); 61 | data.put("elasticsearchMasterPort", property.getKeyValue("elasticsearch.master.port")); 62 | data.put("elasticsearchMasterCluster", property.getKeyValue("elasticsearch.master.cluster")); 63 | return data; 64 | } 65 | 66 | 67 | /** 68 | * Get the product (business) MySQL configuration 69 | * @return 70 | */ 71 | public Map<String, String> getProductMysqDBInfo(){ 72 | Map<String, String> data = new HashMap<String, String>(); 73 | data.put("host", property.getKeyValue("productMysqlDB.host")); 74 | data.put("account", property.getKeyValue("productMysqlDB.account")); 75 | data.put("password", property.getKeyValue("productMysqlDB.password")); 76 | data.put("defaultDB", property.getKeyValue("productMysqlDB.defaultDB")); 77 | return data; 78 | } 79 | 80 | /** 81 | * Get the DW MySQL configuration 82 | * @return 83 | */ 84 | public Map<String, String> getDwMysqDBInfo(){ 85 | Map<String, String> data = new HashMap<String, String>(); 86 | data.put("host", 
property.getKeyValue("biMysqlDB.host")); 87 | data.put("account", property.getKeyValue("biMysqlDB.account")); 88 | data.put("password", property.getKeyValue("biMysqlDB.password")); 89 | data.put("defaultDB", property.getKeyValue("biMysqlDB.defaultDB")); 90 | return data; 91 | } 92 | 93 | } 94 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/service/property/model/Inventory.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.service.property.model; 2 | 3 | public class Inventory { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /src/main/java/com/angejia/dw/service/user/UserService.java: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.service.user; 2 | 3 | public class UserService { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /src/main/resources/conf_dev.properties: -------------------------------------------------------------------------------- 1 | # HDFS SERVER 2 | HDFSServer=namenode:8020 3 | 4 | # zookeeper 5 | zookeeperQuorum=namenode:2181,datanode01:2181,datanode02:2181 6 | 7 | # kafka 8 | kafkaServerBrokerList=dwtest:9092 9 | 10 | # product Mysql DB 11 | productMysqlDB.host=192.168.169.12 12 | productMysqlDB.account=angejia 13 | productMysqlDB.password=angejia123 14 | productMysqlDB.defaultDB=angejia 15 | 16 | # bi Mysql DB 17 | biMysqlDB.host=dwtest 18 | biMysqlDB.account=root 19 | biMysqlDB.password=root 20 | biMysqlDB.defaultDB=da_db 21 | 22 | # hive 23 | hive.metastore.uris=thrift://namenode:9083 24 | hive.thrift.server.url=jdbc:hive2://NameNode:10000/default 25 | hive.thrift.server.user=dwadmin 26 | hive.thrift.server.pass=dwadmin 27 | 28 | # spark 29 | spark.thrift.server.url=jdbc:hive2://NameNode:10000/default 30 | spark.thrift.server.user=dwadmin 31 | spark.thrift.server.pass=dwadmin 32 | 33 | # elasticsearch cluster 34 | elasticsearch.master.host=dwtest 35 | elasticsearch.master.port=9300 36 | elasticsearch.master.cluster=angejia-dw-es 37 | 38 | -------------------------------------------------------------------------------- /src/main/resources/conf_online.properties: -------------------------------------------------------------------------------- 1 | # HDFS SERVER 2 | HDFSServer=uhadoop-ociicy-master1:8020 3 | 4 | # zookeeper 5 | zookeeperQuorum=uhadoop-ociicy-master1:2181,uhadoop-ociicy-master2:2181,uhadoop-ociicy-core1:2181 6 | 7 | # ukafka cluster 8 | #kafkaServerBrokerList=ukafka-uiu1lt-1-bj03.service.ucloud.cn:9092,ukafka-uiu1lt-2-bj03.service.ucloud.cn:9092,ukafka-uiu1lt-3-bj03.service.ucloud.cn:9092 9 | kafkaServerBrokerList=bi4:9092 10 | 11 | # product Mysql DB 12 | productMysqlDB.host=agjdb2-bi 13 | productMysqlDB.account=angejia_dw 14 | productMysqlDB.password=Th872havAyaxEmEB 15 | productMysqlDB.defaultDB=angejia 16 | 17 | # bi Mysql DB 18 | biMysqlDB.host=angejia-bi-db 19 | biMysqlDB.account=hadoop 20 | biMysqlDB.password=angejia888 21 | biMysqlDB.defaultDB=da_db 22 | 23 | # hive 24 | hive.metastore.uris=thrift://uhadoop-ociicy-master1:9083,thrift://uhadoop-ociicy-master2:9083 25 | hive.thrift.server.url=jdbc:hive2://uhadoop-ociicy-master2:10000/dw_db 26 | hive.thrift.server.user=dwadmin 27 | hive.thrift.server.pass=dwadmin 28 | 29 | # spark 30 | spark.thrift.server.url=jdbc:hive2://uhadoop-ociicy-task4:10002/dw_db 31 | spark.thrift.server.user=hadoop 32 | 
spark.thrift.server.pass=hadoop 33 | 34 | # elasticsearch cluster 35 | elasticsearch.master.host=bi4 36 | elasticsearch.master.port=9300 37 | elasticsearch.master.cluster=angejia-dw-es 38 | 39 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=WARN,console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.spark-project.jetty=WARN 10 | log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR 11 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 12 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 13 | 14 | #log4j.logger.org.apache.spark.sql.SQLContext=TRACE 15 | #log4j.logger.org.apache.spark.sql.catalyst.analysis.Analyzer=TRACE 16 | #log4j.logger.org.apache.spark=TRACE 17 | #log4j.logger.org.apache.spark.storage.BlockManagerMasterActor=WARN 18 | #log4j.logger.org.apache.spark.HeartbeatReceiver=WARN 19 | #log4j.logger.org.apache.spark.scheduler.local.LocalActor=WARN -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/common/util/JsonUtil.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util 2 | 3 | import scala.util.parsing.json.JSON 4 | 5 | // play Json 6 | import play.api.libs.json._ 7 | import play.api.libs.json.JsValue 8 | import play.api.libs.json.JsString 9 | import play.api.libs.json.JsArray 10 | import play.api.libs.json.JsObject 11 | import play.api.libs.json.JsResult 12 | import play.api.libs.json.Reads._ 13 | import play.api.libs.json.Json.JsValueWrapper 14 | 15 | // spray Json 16 | import spray.json._ 17 | import DefaultJsonProtocol._ 18 | 19 | // smart Json 20 | import java.util 21 | import net.minidev.json.{JSONObject} 22 | import net.minidev.json.parser.JSONParser 23 | import scala.collection.JavaConversions.mapAsScalaMap 24 | import scala.collection.JavaConversions.mutableMapAsJavaMap 25 | 26 | object JsonUtil { 27 | 28 | /** 29 | * play 解析类库 30 | */ 31 | implicit val objectMapFormat = new Format[Map[String, Object]] { 32 | 33 | /** 34 | * 写, Map -> Json 操作 35 | */ 36 | def writes(map: Map[String, Object]): JsValue = 37 | Json.obj(map.map{case (s, o) => 38 | val ret:(String, JsValueWrapper) = o match { 39 | case _:String => s -> JsString(o.asInstanceOf[String]) 40 | case z:Map[String, String] => { 41 | s -> o.asInstanceOf[Map[String, String]] 42 | } 43 | case _ => s -> JsArray(o.asInstanceOf[List[String]].map(JsString(_))) 44 | } 45 | ret 46 | }.toSeq:_*) 47 | 48 | /** 49 | * 读,Json -> Map 50 | */ 51 | def reads(jv: JsValue): JsResult[Map[String, Object]] = 52 | JsSuccess(jv.as[Map[String, JsValue]].map{case (k, v) => 53 | k -> (v match { 54 | case s: JsString => s.as[String] 55 | case z: JsObject => { 56 | var rs: Map[String, String] = Map[String, String]() 57 | val jsonValue: JsValue = Json.parse(z.toString()) 58 | val mp = Json.fromJson[Map[String, String]](jsonValue) 59 | if (mp != null) { 60 | rs = mp.get 61 | } 62 | rs 63 | } 64 | case l => l.as[List[String]] 65 | }) 66 | }) 
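// --- Editor's sketch (not part of the original source): a round trip through the
// Format defined above. It assumes the Map values are String, List[String] or
// Map[String, String] -- the only shapes the writes/reads cases handle.
//   val sample: Map[String, Object] = Map(
//     "city"  -> "1",
//     "tags"  -> List("a", "b"),
//     "needs" -> Map("bedrooms" -> "2"))
//   val js: JsValue = Json.toJson(sample)                  // goes through writes above
//   val back = Json.fromJson[Map[String, Object]](js).get  // goes through reads above
//   assert(back("city") == "1")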
67 | } 68 | 69 | /** 70 | * play 类库 map -> json 71 | * 72 | * 使用前需要把 map 转换为不可变 map.toMap 73 | * 如果内部有嵌套 map 也需要转换为不可变 map. 74 | * val map = Map("a"-> Map("a"->"2").toMap).toMap 75 | * OR 76 | * val map = mapData.map(f => f._1 -> f._2.toMap).toMap 77 | */ 78 | def playMapToJson(map: Map[String, Object]) : String = { 79 | val jv: JsValue = Json.toJson(map) 80 | jv.toString() 81 | } 82 | 83 | /** 84 | * json -> map 不可变 Map : 85 | * 86 | * // 案例 87 | * val userNeedsBaseData = JsonUtil.playJsonToMap(userNeedsJson) // 返回的是一个 Map[String, Object] 88 | val userNeedsBaseDataFormat = userNeedsBaseData.map{case (k,v) => 89 | val curK = k 90 | // 把元祖 v 转换为 Map, 再把 map 转换为可变 Map 91 | val curV = scala.collection.mutable.Map(v.asInstanceOf[scala.collection.immutable.Map[String,String]].toSeq:_*) 92 | k -> curV 93 | } 94 | // 再把最外层的 Map 也转换为可变的 map 95 | var userNeedsBase = collection.mutable.Map(userNeedsBaseDataFormat.toSeq:_*).asInstanceOf[scala.collection.mutable.Map[String, Map[String, String]]] 96 | */ 97 | def playJsonToMap(jsonStr: String): Map[String, Object] = { 98 | val jsonValue: JsValue = Json.parse(jsonStr) 99 | val jr: JsResult[Map[String, Object]] = Json.fromJson[Map[String, Object]](jsonValue) 100 | jr.get 101 | } 102 | 103 | /** 104 | * Json -> JsValue 解析成 JsValue 对象 105 | * json.\("fieldName") 使用这种方式直接访问 106 | * 或者直接 .toString 即可访问完整的 107 | */ 108 | def playJsonToJsValue(jsonString: String) : JsValue = { 109 | val jsonValue: JsValue = Json.parse(jsonString) 110 | jsonValue 111 | } 112 | 113 | 114 | 115 | def playTest() : Unit = { 116 | // map 转换成为 String 117 | val map: Map[String, Object] = Map( 118 | "val1" -> "xxx", 119 | "val2" -> List("a", "b", "c"), 120 | "val3" -> "sss", 121 | "val4" -> List("d", "e", "f"), 122 | "val5" -> Map("a"->"1", "b"->"2", "c"->"3").toMap // 你懂得,转换为不可变 Map 123 | ) 124 | val jv: JsValue = Json.toJson(map) 125 | println(jv) // {"val1":"xxx","val3":"sss","val2":["a","b","c"],"val5":{"a":"1","b":"2","c":"3"},"val4":["d","e","f"]} 126 | 127 | // String 转换为 Map 128 | val jr: JsResult[Map[String, Object]] = Json.fromJson[Map[String, Object]](jv) 129 | println(jr.get) //Map(val1 -> xxx, val3 -> sss, val2 -> List(a, b, c), val5 -> Map(a -> 1, b -> 2, c -> 3), val4 -> List(d, e, f)) 130 | println(jr.get("val5").asInstanceOf[Map[String, String]].get("a")) 131 | 132 | 133 | val uesrTagData: Map[String, Map[String,String]] = Map[String, Map[String,String]]( 134 | "0" -> Map( 135 | "city" -> "1", 136 | "block" -> "1", 137 | "community" -> "1", 138 | "bedrooms" -> "2" 139 | ), 140 | "1" -> Map( 141 | "city" -> "1", 142 | "block" -> "1", 143 | "community" -> "1", 144 | "bedrooms" -> "2" 145 | ) 146 | ) 147 | val uesrTagToJson: JsValue = Json.toJson(uesrTagData) 148 | println(uesrTagToJson) 149 | val uesrTagToMap: JsResult[Map[String, Object]] = Json.fromJson[Map[String, Object]](uesrTagToJson) 150 | println(uesrTagToMap.getOrElse().asInstanceOf[Map[String, Map[String,String]]]) 151 | //exit 152 | } 153 | 154 | 155 | 156 | /** 157 | * scala 原生对象 json -> object 158 | */ 159 | def JsonToObj(jsonString: String) : Option[Any] = { 160 | val obj = JSON.parseFull(jsonString) 161 | obj 162 | } 163 | 164 | 165 | import scala.collection.mutable.Map 166 | /** 167 | * 将map转为json 168 | * @param map 输入格式 mutable.Map[String,Object] 169 | * @return 170 | * */ 171 | def smartMapToJsonStr(map : Map[String,Object]) : String = { 172 | val jsonString = JSONObject.toJSONString(map) 173 | jsonString 174 | } 175 | 176 | /** 177 | * 将 json 转化为 Map 178 | * @param json 输入json字符串 179 | * @return 180 | 
* */ 181 | def smartJsonStrToMap(json : String) : Map[String,Object] = { 182 | val map : Map[String,Object]= Map() 183 | val jsonParser =new JSONParser() 184 | 185 | //将string转化为jsonObject 186 | val jsonObj: JSONObject = jsonParser.parse(json).asInstanceOf[JSONObject] 187 | 188 | //获取所有键 189 | val jsonKey = jsonObj.keySet() 190 | 191 | val iter = jsonKey.iterator() 192 | 193 | while (iter.hasNext){ 194 | val field = iter.next() 195 | val value = jsonObj.get(field).toString 196 | 197 | if(value.startsWith("{")&&value.endsWith("}")){ 198 | val value = mapAsScalaMap(jsonObj.get(field).asInstanceOf[util.HashMap[String, String]]) 199 | map.put(field,value) 200 | }else{ 201 | map.put(field,value) 202 | } 203 | } 204 | map 205 | } 206 | 207 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/common/util/ListenerFile.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | class ListenerFile { 6 | 7 | /** 8 | * 读取小文件可以,但是针对大文件,会有部分问题 9 | * Exception in thread "main" java.lang.StackOverflowError 10 | * 调整 java -Xss512M 11 | * 从指定文件日期开始,监听文件的变化,并发送给回调函数 12 | * file : 需要监听文件路径,格式: /data/log/uba/lb/access.${date}.log 13 | * date : 指定文件日期(天) , 格式: 20160101 14 | * lineNum : 从多少行开始读 15 | * stepLength : 每次读取多少行, 如果每次读取 1 行写 0, 如果每次读取 100 行写 100 16 | * 17 | * callback : 回调函数 18 | * 19 | 20 | def readLogLine(result: Map[String,Any]): Unit = { 21 | println(result) 22 | } 23 | */ 24 | def listenerDateFile( 25 | file: String, 26 | date: String, 27 | lineNum: Int, 28 | stepLength: Int, 29 | callback: Map[String,Any] => Unit, // 回调函数 30 | isRecursive: Boolean = true // 是否递归调用 31 | ): Map[String,Any] = { 32 | 33 | // 当前运行日期 34 | var curDate = date 35 | 36 | // 当前读的文件 37 | val curReadFile = file.replace("${date}", date) 38 | 39 | // 等待执行 sh 命令 sed -n '5,7p' 40 | val startLine = lineNum 41 | val endLine = lineNum + stepLength - 1 42 | var readLineCmd = "sed -n " + startLine + "," + endLine + "p " + curReadFile 43 | val commandResult = ScriptUtil.runSyncCommand(readLineCmd) // 执行返回结果 44 | val commandCode = commandResult.get("code").get // 执行状态 45 | val curLine = commandResult.get("stdoutPut").get.toString() // 获取标准输出 46 | 47 | 48 | // 当前定位行数 49 | var curLineNum: Int = 0 50 | 51 | // 结果数据的行数 52 | val curLineResult = curLine.split("\n") 53 | val curLineResultLength = curLineResult.length // 一共读了多少行 54 | 55 | // 当行数分解的数组长度为 1, 并且第一元素的值为空 , 表示读取的数据为空了 56 | val rs = curLineResultLength <= 1 && curLineResult(0).length() == 0 // 为空是 true , 不为空是 false 57 | 58 | // 读取的行数为空, 日期是今天的 59 | if (rs == true && date == this.getCurDate()) { 60 | // 把位置定位到开始的时间 61 | curLineNum = startLine 62 | 63 | // 等待 3 秒后再执行 64 | TimeUnit.SECONDS.sleep(3); 65 | 66 | // 读取的行数为空, 日期不是当天的日期 67 | } else if (rs == true && date != this.getCurDate()) { 68 | // 当前日期增加 1 天,tomorrowDate 69 | curDate = this.getOffsetDate(1,date) 70 | 71 | // 文件从第一行开始读 72 | curLineNum = 1 73 | 74 | // 读取的行数不为空, 不是今天日期, 也不是隔天日期, 表示是正常累加的行数 75 | } else { 76 | if (curLineResultLength < stepLength) { 77 | TimeUnit.SECONDS.sleep(3); 78 | } 79 | 80 | // 则把行数定位到 开始行数 + 总共读取的行数 81 | curLineNum = startLine + curLineResultLength 82 | } 83 | 84 | 85 | // 返回的结果 86 | var result = Map( 87 | // 读到的文件 88 | "file" -> curReadFile, 89 | // 下一次开始读的行数 90 | "nextLineNum" -> curLineNum, 91 | // 读到的行内容 92 | "fileLineContent" -> curLine, 93 | // 文件模板 94 | "fileTemplate" -> file, 95 | // 日期 96 | "date" -> curDate, 97 
| // 读到的命令 98 | "readLineCmd" -> readLineCmd, 99 | // 命令返回的参数 100 | "commandResult" -> commandResult 101 | ) 102 | callback(result) // 回调函数 103 | 104 | if (isRecursive == true) { 105 | // 递归,从指定日期第 n 行开始读取数据 106 | this.listenerDateFile(file, curDate, curLineNum, stepLength,callback) 107 | } 108 | result 109 | 110 | } 111 | 112 | 113 | /** 114 | * While 方式监听文件变化 115 | * file : /data/log/uba/lb/access.${date}.log 监听的文件 116 | * date : 20160101 日期 117 | * lineNum: 行数 118 | * stepLength : 步长 119 | */ 120 | def listenerDateFileWhile( 121 | file: String, 122 | date: String, 123 | lineNum: Int, 124 | stepLength: Int, 125 | callback: Map[String,Any] => Unit // 回调函数 126 | ) : Unit = { 127 | var status = true 128 | 129 | // 当前运行时间 130 | var curDate = date 131 | 132 | // 当前读到的行数 133 | var curLineNum = lineNum 134 | 135 | var map = Map[String,Any](); 136 | while( status ){ 137 | map = this.listenerDateFile(file, curDate, curLineNum, stepLength, callback, false) 138 | curDate = map.get("date").get.toString() 139 | curLineNum = map.get("nextLineNum").get.toString().toInt 140 | } 141 | 142 | } 143 | 144 | 145 | // 日期增加 减少 1 天 146 | def getOffsetDate(offset: Int, dateStr: String): String = { 147 | // 字符日期转换为 Date 对象 148 | val date = DateUtil.StringToDate(dateStr, DateUtil.SIMPLE_YMD_FORMAT) 149 | // date 对象 + n 天 150 | val offsetDate = DateUtil.getCalendarOffsetDateDay(offset, date); 151 | // date 对象转换成 str 152 | DateUtil.DateToString(offsetDate, DateUtil.SIMPLE_YMD_FORMAT) 153 | } 154 | 155 | 156 | // 获取当前系统时间 157 | def getCurDate(): String = DateUtil.TimestampToSting(DateUtil.getNowTimestamp,DateUtil.SIMPLE_YMD_FORMAT) 158 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/common/util/RegexUtil.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util 2 | 3 | import scala.util.matching.Regex 4 | 5 | 6 | object RegexUtil { 7 | 8 | 9 | def findStrData(pattern: String, strData: String) : String = { 10 | var rs = "" 11 | 12 | // String 类的 r() 方法构造了一个Regex对象 13 | val patternObj: Regex = pattern.r 14 | 15 | // findFirstIn 方法找到首个匹配项 16 | val findRs: Option[String] = patternObj.findFirstIn(strData) 17 | 18 | if (!findRs.isEmpty) { 19 | //val patternObj(num,str) = string 20 | //rs = args 21 | 22 | // 模式匹配, 当前字符串匹配到了正则 23 | strData match{ 24 | case patternObj(str) => 25 | //println(str) 26 | rs = str 27 | case _=> 28 | println("Not matched") 29 | } 30 | } 31 | rs 32 | } 33 | 34 | 35 | def findStrDataBak(pattern: String, string: String) : String = { 36 | var rs = "" 37 | // 使用 Regex 构造对象 38 | val patternObj = new Regex(pattern) // 首字母可以是大写 S 或小写 s 39 | println(patternObj.findFirstIn(string)) 40 | 41 | println(patternObj findFirstIn string) 42 | rs 43 | } 44 | 45 | 46 | def test() : Unit = { 47 | 48 | // 构造 Regex 对象, 用 String 类的 r 方法即可 49 | val pattern = "Scala".r 50 | val str = "Scala is Scalable and cool" 51 | println(pattern findFirstIn str) // println(pattern.findFirstIn(str)) 52 | 53 | // 使用 Regex 构造对象 54 | val pattern2 = new Regex("(S|s)cala") // 首字母可以是大写 S 或小写 s 55 | val str2 = "Scala is scalable and cool" 56 | println(pattern2 findFirstIn str2) 57 | 58 | 59 | val filter_regex="/mobile/member/inventories/list[?](.*)".r 60 | val str3 = "/mobile/member/inventories/list?bedroom_id=2&city_id=1&sort_id=3&price_id=4&district_id=7&block_id=62&page=1&per_page=8" 61 | if (!filter_regex.findFirstIn(str3).isEmpty) { 62 | val filter_regex(pars) = str3 63 | println(pars) 64 | } 65 | 66 | } 
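  // --- Editor's sketch (not part of the original source): the same group-capture
  // idea as findStrData, but collecting every match via findAllMatchIn; the
  // pattern and input below are made-up examples.
  def findAllExample(): Unit = {
    val datePattern: Regex = """access\.(\d{8})\.log""".r
    val line = "rotate access.20160101.log to access.20160102.log"
    for (m <- datePattern.findAllMatchIn(line)) {
      println(m.group(1)) // prints 20160101, then 20160102
    }
  }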
67 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/common/util/ScFileUtil.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util 2 | 3 | import scala.io.Source 4 | 5 | object ScFileUtil { 6 | 7 | /** 8 | * Read a whole file into one string. 9 | * Typical use: val words = fileInputStream(path).split("\\s+") 10 | */ 11 | def fileInputStream(fileName: String, encoded: String = "UTF-8"): String = { 12 | val source = Source.fromFile(fileName, encoded) 13 | val contents = source.mkString 14 | contents 15 | } 16 | 17 | 18 | 19 | 20 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/common/util/ScriptUtil.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util 2 | 3 | import sys.process._ 4 | import java.io.BufferedReader 5 | import java.io.InputStreamReader 6 | 7 | object ScriptUtil { 8 | 9 | /** Run a shell command synchronously; returns its exit code, stdout and stderr. */ 10 | def runSyncCommand(cmd: String): Map[String,Any] = { 11 | val qb = Process(cmd) 12 | 13 | var out = "" 14 | var err = "" 15 | 16 | val exitCode = qb ! ProcessLogger( 17 | // callbacks collecting stdout and stderr line by line 18 | (s) => { 19 | out += s + "\n" 20 | }, 21 | (s) => { 22 | err += s + "\n" 23 | } 24 | ) 25 | 26 | val result: Map[String,Any] = Map( 27 | "code" -> exitCode, 28 | "stdoutPut" -> out, 29 | "erroutPut" -> err 30 | ) 31 | 32 | result 33 | } 34 | 35 | 36 | 37 | /** Run a command, collecting the stdout, stderr and exit status */ 38 | def runCommandBak(in: String): (List[String], List[String], Int) = { 39 | val qb = Process(in) 40 | var out = List[String]() 41 | var err = List[String]() 42 | 43 | val exit = qb ! ProcessLogger( 44 | (s) => out ::= s, 45 | (s) => err ::= s) 46 | 47 | (out.reverse, err.reverse, exit) 48 | } 49 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/common/util/mysql/MysqlClient.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.common.util.mysql 2 | 3 | import scala.collection.mutable.ArrayBuffer 4 | import scala.collection.mutable.HashMap 5 | 6 | import java.sql.{ DriverManager, ResultSet } 7 | import com.mysql.jdbc.Driver 8 | 9 | class MysqlClient(ip: String, user: String, pwd: String, db: String) extends Serializable { 10 | 11 | // Change to Your Database Config 12 | lazy val conn_str = 13 | "jdbc:mysql://" + 14 | ip + ":3306/" + 15 | db + "?"
+ 16 | "user=" + user + 17 | "&password=" + pwd + 18 | "&zeroDateTimeBehavior=convertToNull" 19 | 20 | // Setup the connection 21 | lazy val conn = DriverManager.getConnection(conn_str) 22 | 23 | // 查询 24 | def select(sql: String): ArrayBuffer[HashMap[String, Any]] = { 25 | try { 26 | // Configure to be Read Only 27 | val statement = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY) 28 | 29 | // Execute Query 30 | val rs = statement.executeQuery(sql) 31 | 32 | val rsmd = rs.getMetaData 33 | val colNames = for (i <- 1 to rsmd.getColumnCount) yield rsmd.getColumnLabel(i) 34 | val result = ArrayBuffer[HashMap[String, Any]]() 35 | 36 | while (rs.next) { 37 | var row = new HashMap[String, Any]; 38 | for (n <- colNames) { 39 | row.put(n, rs.getObject(n)) 40 | } 41 | result += row 42 | } 43 | 44 | rs.close() 45 | statement.close() 46 | 47 | result 48 | } finally { 49 | //conn.close 50 | } 51 | } 52 | 53 | /** 54 | * 执行 55 | */ 56 | def exec(sql: String): Int = { 57 | try { 58 | val prep = conn.prepareStatement(sql) 59 | //prep.setString(1, "Nothing great was ever achieved without enthusiasm.") 60 | //prep.setString(2, "Ralph Waldo Emerson") 61 | prep.executeUpdate 62 | } finally { 63 | //conn.close 64 | } 65 | } 66 | 67 | def close() : Unit = { 68 | conn.close() 69 | } 70 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/hadoop/hdfs/HDFSClient.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.hadoop.hdfs 2 | 3 | import java.io.BufferedInputStream 4 | import java.io.File 5 | import java.io.FileInputStream 6 | import java.io.InputStream 7 | import org.apache.hadoop.conf.Configuration 8 | import org.apache.hadoop.fs.FileSystem 9 | import org.apache.hadoop.fs.Path 10 | import org.apache.hadoop.fs.FileStatus 11 | import org.apache.hadoop.fs.FileUtil 12 | 13 | /** 14 | * 操作 HDFS 类 15 | */ 16 | class HDFSClient(conf: Configuration) { 17 | 18 | //Initial Configuration 19 | 20 | //private var conf = new Configuration() 21 | //private var maprfsCoreSitePath = new Path("core-site.xml") 22 | //private var maprfsSitePath = new Path("maprfs-site.xml") 23 | 24 | //conf.addResource(maprfsCoreSitePath) 25 | //conf.addResource(maprfsSitePath) 26 | 27 | private var fileSystem = FileSystem.get(conf) 28 | 29 | /** 30 | * 创建 HDFS 目录 31 | */ 32 | def mkdirs(hdfsFolderPath: String): Unit = { 33 | var path = new Path(hdfsFolderPath) 34 | if (!fileSystem.exists(path)) { 35 | fileSystem.mkdirs(path) 36 | } 37 | } 38 | 39 | 40 | /** 41 | * 创建一个空的 HDFS 文件 42 | */ 43 | def createNewFile(hdfsFilePath:String): Path = { 44 | val path = new Path(hdfsFilePath) 45 | if (!fileSystem.exists(path)) { 46 | var out = fileSystem.createNewFile(path) 47 | } 48 | path 49 | } 50 | 51 | 52 | /** 53 | * 创建或者修改 HDFS 文件 54 | */ 55 | def createAndSave(hdfsPath: String): Unit = { 56 | var out = fileSystem.create(new Path(hdfsPath)) 57 | var in = new BufferedInputStream(new FileInputStream(hdfsPath)) 58 | var b = new Array[Byte](1024) 59 | var numBytes = in.read(b) 60 | while (numBytes > 0) { 61 | out.write(b, 0, numBytes) 62 | numBytes = in.read(b) 63 | } 64 | in.close() 65 | out.close() 66 | } 67 | 68 | 69 | /** 70 | * 追加本地文件到 HDFS 文件中 71 | */ 72 | def appendFileToHdfsFile(fromfilepath: String, hdfsFilePath: String): Unit = { 73 | val hdfsPath = this.createNewFile(hdfsFilePath) 74 | // hfds 75 | var out = fileSystem.append(hdfsPath) 76 | 77 | // 本地文件流 78 | var in = new 
BufferedInputStream(new FileInputStream(new File(fromfilepath))) 79 | var b = new Array[Byte](1024) 80 | var numBytes = in.read(b) 81 | while (numBytes > 0) { 82 | out.write(b, 0, numBytes) 83 | numBytes = in.read(b) 84 | } 85 | in.close() 86 | out.close() 87 | } 88 | 89 | 90 | /** 91 | * Append a string to an HDFS file (creating the file first if needed) 92 | */ 93 | def appendDataToHdfsFile(data: String, hdfsFilePath: String) : Unit = { 94 | val path = this.createNewFile(hdfsFilePath) 95 | 96 | var out = fileSystem.append(path) 97 | 98 | val by = data.getBytes() 99 | out.write(by, 0, by.length) 100 | out.close() 101 | } 102 | 103 | 104 | /** 105 | * Open an HDFS file and return its input stream 106 | */ 107 | def getFile(hdfsPath: String): InputStream = { 108 | var path = new Path(hdfsPath) 109 | fileSystem.open(path) 110 | 111 | } 112 | 113 | 114 | /** 115 | * Delete an HDFS file (recursively if it is a directory) 116 | */ 117 | def deleteFile(hdfsPath: String): Boolean = { 118 | var path = new Path(hdfsPath) 119 | fileSystem.delete(path, true) 120 | } 121 | 122 | 123 | /** 124 | * Close the FileSystem Handle 125 | */ 126 | def close() = { 127 | fileSystem.close 128 | } 129 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/hadoop/hdfs/HDFSClientTest.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.hadoop.hdfs 2 | 3 | import java.io.BufferedInputStream 4 | import java.io.File 5 | import java.io.FileInputStream 6 | import java.io.InputStream 7 | import org.apache.hadoop.conf.Configuration 8 | import org.apache.hadoop.fs.FileSystem 9 | import org.apache.hadoop.fs.Path 10 | import org.apache.hadoop.fs.FileStatus 11 | import org.apache.hadoop.fs.FileUtil 12 | 13 | import org.apache.hadoop.conf.Configuration 14 | import com.angejia.dw.hadoop.hdfs.HDFSClient 15 | 16 | object HDFSClientTest { 17 | 18 | 19 | def main(args: Array[String]) { 20 | val ob = new HDFSClientTest 21 | ob.run(args(0), args(1)) 22 | } 23 | } 24 | 25 | 26 | class HDFSClientTest { 27 | 28 | def run(hdfsPath: String, localPath: String) : Unit = { 29 | //fs.defaultFS 8020 30 | val conf = new Configuration() 31 | conf.set("fs.defaultFS", "hdfs://uhadoop-ociicy-master1:8020") // NameNode address to write to 32 | conf.setBoolean("dfs.support.append", true) // enable append mode 33 | 34 | 35 | val hdfsServer = new HDFSClient(conf) 36 | // create an empty file 37 | //hdfsServer.createNewFile("/user/hive/real_time/source_data/access_log/aaa.txt") 38 | // append data 39 | //hdfsServer.appendFileToHdfsFile("/data/tmp/test.log","/user/hive/real_time/source_data/access_log/aaa.txt") 40 | hdfsServer.appendDataToHdfsFile("你好呀",hdfsPath) 41 | hdfsServer.appendDataToHdfsFile("你好呀",hdfsPath) 42 | //hdfsServer.appendDataToHdfsFile("你好呀",hdfsPath) 43 | //hdfsServer.appendDataToHdfsFile("你好呀",hdfsPath) 44 | // read the content back 45 | val content = hdfsServer.getFile(hdfsPath) 46 | println(content) // note: prints the stream handle; read the stream to see the appended bytes 47 | } 48 | } 49 | 50 | 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/hadoop/kafka/KafkaConsumer.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.hadoop.kafka 2 | 3 | import kafka.message._ 4 | import kafka.serializer._ 5 | import kafka.utils._ 6 | import kafka.consumer.Consumer 7 | import kafka.consumer.ConsumerConfig 8 | import kafka.consumer.Whitelist 9 | import java.util.Properties 10 | import kafka.utils.Logging 11 | import scala.collection.JavaConversions._ 12 | 13 | class KafkaConsumer( 14 | topic: String, 15 | /** topic 16 | * The high-level
API hides the details of brokers from the consumer and allows consuming off the cluster of machines 17 | * without concern for the underlying topology. It also maintains the state of what has been consumed. The high-level API 18 | * also provides the ability to subscribe to topics that match a filter expression (i.e., either a whitelist or a blacklist 19 | * regular expression). This topic is a whitelist only but can change with re-factoring below on the filterSpec 20 | */ 21 | groupId: String, 22 | /** groupId 23 | * A string that uniquely identifies the group of consumer processes to which this consumer belongs. By setting the same 24 | * group id multiple processes indicate that they are all part of the same consumer group. 25 | */ 26 | zookeeperConnect: String, 27 | /** 28 | * Specifies the zookeeper connection string in the form hostname:port where host and port are the host and port of 29 | * a zookeeper server. To allow connecting through other zookeeper nodes when that zookeeper machine is down you can also 30 | * specify multiple hosts in the form hostname1:port1,hostname2:port2,hostname3:port3. The server may also have a zookeeper 31 | * chroot path as part of it's zookeeper connection string which puts its data under some path in the global zookeeper namespace. 32 | * If so the consumer should use the same chroot path in its connection string. For example to give a chroot path of /chroot/path 33 | * you would give the connection string as hostname1:port1,hostname2:port2,hostname3:port3/chroot/path. 34 | */ 35 | readFromStartOfStream: Boolean = true 36 | /** 37 | * What to do when there is no initial offset in Zookeeper or if an offset is out of range: 38 | * 1) smallest : automatically reset the offset to the smallest offset 39 | * 2) largest : automatically reset the offset to the largest offset 40 | * 3) anything else: throw exception to the consumer. If this is set to largest, the consumer may lose some 41 | messages when the number of partitions, for the topics it subscribes to, changes on the broker. 42 | **************************************************************************************** 43 | To prevent data loss during partition addition, set auto.offset.reset to smallest 44 | This make sense to change to true if you know you are listening for new data only as of 45 | after you connect to the stream new things are coming out. you can audit/reconcile in 46 | another consumer which this flag allows you to toggle if it is catch-up and new stuff or 47 | just new stuff coming out of the stream. This will also block waiting for new stuff so 48 | it makes a good listener. 
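        Editor's sketch (not part of the original): typical use of this class, reusing
        the dev settings that appear elsewhere in this repo (topic accessLog, group
        userPortrait, zookeeper dwtest:2181):
          val consumer = new KafkaConsumer("accessLog", "userPortrait", "dwtest:2181")
          consumer.read(bytes => println(new String(bytes, "UTF-8")))
          consumer.close()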
49 | //readFromStartOfStream: Boolean = true 50 | readFromStartOfStream: Boolean = false 51 | **************************************************************************************** 52 | */ 53 | ) extends Logging { 54 | 55 | val props = new Properties() 56 | props.put("group.id", groupId) 57 | props.put("zookeeper.connect", zookeeperConnect) 58 | props.put("auto.offset.reset", if(readFromStartOfStream) "smallest" else "largest") 59 | 60 | val config = new ConsumerConfig(props) 61 | val connector = Consumer.create(config) 62 | 63 | val filterSpec = new Whitelist(topic) 64 | 65 | info("setup:start topic=%s for zk=%s and groupId=%s".format(topic,zookeeperConnect,groupId)) 66 | val stream = connector.createMessageStreamsByFilter(filterSpec, 1, new DefaultDecoder(), new DefaultDecoder()).get(0) 67 | info("setup:complete topic=%s for zk=%s and groupId=%s".format(topic,zookeeperConnect,groupId)) 68 | 69 | def read(write: (Array[Byte])=>Unit) = { 70 | info("reading on stream now") 71 | for(messageAndTopic <- stream) { 72 | try { 73 | info("writing from stream") 74 | write(messageAndTopic.message) 75 | info("written to stream") 76 | } catch { 77 | case e: Throwable => 78 | if (true) { //this is objective even how to conditionalize on it 79 | error("Error processing message, skipping this message: ", e) 80 | } else { 81 | throw e 82 | } 83 | } 84 | } 85 | } 86 | 87 | def close() { 88 | connector.shutdown() 89 | } 90 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/hadoop/kafka/KafkaProducer.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.hadoop.kafka 2 | 3 | import java.util.{Properties, UUID} 4 | import kafka.producer.ProducerConfig 5 | import kafka.common._ 6 | import kafka.message._ 7 | import kafka.serializer._ 8 | import kafka.producer.Producer 9 | import kafka.producer.KeyedMessage 10 | import java.util.Properties 11 | import java.util.Date 12 | // import org.apache.log4j.Logger 13 | 14 | 15 | case class KafkaProducer( 16 | topic: String, 17 | brokerList: String, 18 | /** brokerList 19 | * This is for bootstrapping and the producer will only use it for getting metadata (topics, partitions and replicas). 20 | * The socket connections for sending the actual data will be established based on the broker information returned in 21 | * the metadata. The format is host1:port1,host2:port2, and the list can be a subset of brokers or a VIP pointing to a 22 | * subset of brokers. 23 | */ 24 | clientId: String = UUID.randomUUID().toString, 25 | /** clientId 26 | * The client id is a user-specified string sent in each request to help trace calls. It should logically identify 27 | * the application making the request. 28 | */ 29 | synchronously: Boolean = true, 30 | /** synchronously 31 | * This parameter specifies whether the messages are sent asynchronously in a background thread. 32 | * Valid values are false for asynchronous send and true for synchronous send. By setting the producer 33 | * to async we allow batching together of requests (which is great for throughput) but open the possibility 34 | * of a failure of the client machine dropping unsent data. 35 | */ 36 | compress: Boolean = true, 37 | /** compress 38 | * This parameter allows you to specify the compression codec for all data generated by this producer. 39 | * When set to true gzip is used. 
To override and use snappy you need to implement that as the default 40 | * codec for compression using SnappyCompressionCodec.codec instead of DefaultCompressionCodec.codec below. 41 | */ 42 | 43 | batchSize: Integer = 200, 44 | /** batchSize 45 | * The number of messages to send in one batch when using async mode. 46 | * The producer will wait until either this number of messages are ready 47 | * to send or queue.buffer.max.ms is reached. 48 | */ 49 | messageSendMaxRetries: Integer = 3, 50 | /** messageSendMaxRetries 51 | * This property will cause the producer to automatically retry a failed send request. 52 | * This property specifies the number of retries when such failures occur. Note that 53 | * setting a non-zero value here can lead to duplicates in the case of network errors 54 | * that cause a message to be sent but the acknowledgement to be lost. 55 | */ 56 | requestRequiredAcks: Integer = -1 57 | /** requestRequiredAcks 58 | * 0) which means that the producer never waits for an acknowledgement from the broker (the same behavior as 0.7). 59 | * This option provides the lowest latency but the weakest durability guarantees (some data will be lost when a server fails). 60 | * 1) which means that the producer gets an acknowledgement after the leader replica has received the data. This option provides 61 | * better durability as the client waits until the server acknowledges the request as successful (only messages that were 62 | * written to the now-dead leader but not yet replicated will be lost). 63 | * -1) which means that the producer gets an acknowledgement after all in-sync replicas have received the data. This option 64 | * provides the best durability, we guarantee that no messages will be lost as long as at least one in sync replica remains. 
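 *
 * Editor's sketch (not part of the original): with the defaults above, producing
 * to the accessLog topic looks roughly like
 *   val producer = KafkaProducer(topic = "accessLog", brokerList = "dwtest:9092")
 *   producer.send("a log line")        // partition key defaults to null
 *   producer.send("a keyed line", "0") // route by a partition key, as ExtractFileToKafka does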
65 | */ 66 | ) { 67 | 68 | val props = new Properties() 69 | 70 | val codec = if(compress) DefaultCompressionCodec.codec else NoCompressionCodec.codec 71 | 72 | props.put("compression.codec", codec.toString) 73 | props.put("producer.type", if(synchronously) "sync" else "async") 74 | props.put("metadata.broker.list", brokerList) 75 | props.put("batch.num.messages", batchSize.toString) 76 | props.put("message.send.max.retries", messageSendMaxRetries.toString) 77 | props.put("request.required.acks",requestRequiredAcks.toString) 78 | props.put("client.id",clientId.toString) 79 | 80 | val producer = new Producer[AnyRef, AnyRef](new ProducerConfig(props)) 81 | 82 | def kafkaMesssage(message: Array[Byte], partition: Array[Byte]): KeyedMessage[AnyRef, AnyRef] = { 83 | if (partition == null) { 84 | new KeyedMessage(topic,message) 85 | } else { 86 | new KeyedMessage(topic,partition,message) 87 | } 88 | } 89 | 90 | def send(message: String, partition: String = null): Unit = { 91 | //println(partition.getBytes("UTF8")) 92 | send(message.getBytes("UTF8"), if (partition == null) null else partition.getBytes("UTF8")) 93 | } 94 | 95 | def send(message: Array[Byte], partition: Array[Byte]): Unit = { 96 | try { 97 | producer.send(kafkaMesssage(message, partition)) 98 | } catch { 99 | case e: Exception => 100 | e.printStackTrace 101 | System.exit(1) 102 | } 103 | } 104 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/hadoop/spark/CollaborativeFiltering.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.hadoop.spark 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.spark.mllib.recommendation.{ALS,MatrixFactorizationModel,Rating} 7 | import org.jblas.DoubleMatrix 8 | 9 | /** 10 | * Spark Mllib 协同过滤 11 | */ 12 | class CollaborativeFiltering extends Serializable { 13 | 14 | // 训练得出的模型 15 | var trainModel: MatrixFactorizationModel = null 16 | 17 | 18 | /** 19 | * 提取有效特征 20 | * rdd: RDD[Array[Int]] 一个处理过的 RDD 对象, 结构为 RDD[Array[Int]] 21 | * 分别是 用户,主题,评分 22 | */ 23 | def characteristics(rdd: RDD[Array[Int]]): RDD[Rating] = { 24 | //把 RDD 数据转换成 Rating 对象 25 | val ratings: RDD[Rating] = rdd.map { 26 | // 模式匹配 27 | case Array(user, item, rating) => Rating(user.toInt, item.toInt, rating.toInt) 28 | } 29 | ratings.cache() 30 | } 31 | 32 | 33 | /** 34 | * 训练推荐模型 35 | * 使用: (ALS)最小二乘法,是求解矩阵分解问题的最优方法 36 | * 矩阵分解: 37 | * 显示矩阵分解: ALS.train(ratings, rank, iterations, lambda) 用来处理直接获得的数据,一般是用户访问,收藏,评分等数据 38 | * 隐式矩阵分解: ALS.trainImplicit(ratings, rank, iterations) 用处理间接才能获得的数据,需要在用户与物品的交互中才能得到的数据,如看了电影的次数,购买了某个产品等 39 | * 40 | * 矩阵分解参数: 41 | * rank : ALS 模型中的因子个数,值越大会越好,但是训练模型和保存时的开销就越大 42 | * iterations : 运行迭代次数,经过少数次数迭代后 ALS 模型便已能收敛为一个比较合理的好模型 43 | * lambda : 控制模型的正规化过程, 从而控制模型的过拟合情况 44 | */ 45 | 46 | /** 47 | * 显示矩阵分解 48 | */ 49 | def train(ratings: RDD[Rating], rank: Int = 50, iterations: Int = 10, lambda: Double = 0.01) : MatrixFactorizationModel = { 50 | val model: MatrixFactorizationModel = ALS.train(ratings, rank, iterations, lambda) 51 | this.trainModel = model 52 | model 53 | } 54 | 55 | /** 56 | * 隐式矩阵分解 57 | */ 58 | def trainImplicit(ratings: RDD[Rating], rank: Int = 50, iterations: Int = 10) : MatrixFactorizationModel = { 59 | val model: MatrixFactorizationModel = ALS.trainImplicit(ratings, rank, iterations) 60 | this.trainModel = model 61 | model 62 | } 63 | 64 | 65 | /** 66 | * 使用推荐模型 67 | * 68 | 
* 用户推荐模型: 利用相似用户的评级来计算对某个用户的推荐 69 | * 给指定用户推荐物品,通常以 "前 K 个" 形式展现, 即通过模型求出用户可能喜好程度最高的前 K 个商品 70 | * 这个过程通过计算每个商品的预计得分, 按照得分机型排序实现 71 | * 72 | * 物品推荐模型: 依赖用户接触过的物品与候选物品之间的相似度来获得推荐 73 | * 给定一个物品, 有哪些物品与它相似,相似的确切定义取决于所使用的模型,相似度是通过某种方式比较表示两个物品的向量二得到的 74 | * 相似度衡量方法 75 | * 皮尔森相关系数(Pearson correlation) 76 | * 针对实数响亮的余弦相似度(cosine similarity) 77 | * 针对二元向量的杰卡德相似系数(Jaccard similarity) 78 | */ 79 | 80 | /** 81 | * 用户推荐 - 单个用户推荐最得分最高的 K 个物品 82 | * userId: 需要推荐的用户 Id 83 | * K: 匹配分数最高的前 K 个物品 84 | */ 85 | def userRecommendItem(userId: Int, K: Int) : Array[Rating] = { 86 | val topKRecs: Array[Rating] = this.trainModel.recommendProducts(userId, K) 87 | topKRecs 88 | } 89 | 90 | 91 | /** 92 | * 用户推荐 - 用户物品推荐预测得分 93 | */ 94 | def userPredict(user: Int, product: Int): Double = { 95 | val predictionScore = this.trainModel.predict(user, product) 96 | predictionScore 97 | } 98 | 99 | 100 | /** 101 | * 用户推荐 - 批量用户推荐物品推荐得分 102 | */ 103 | def userPredict(usersProducts: RDD[(Int,Int)]): RDD[Rating] = { 104 | val predictionScore = this.trainModel.predict(usersProducts) 105 | predictionScore 106 | } 107 | 108 | 109 | /** 110 | * 物品余弦相似度计算 111 | * 返回 (item ID, 因子分数) 这是一个 pair RDD 112 | */ 113 | def itemCosineSimilarity(itemId: Int) : RDD[(Int, Double)] = { 114 | // 线性代数库,求向量点积 ,创建一个 Array[Double] 类型的向量 115 | 116 | // item 因子 从模型中,取回对应的因子 117 | val itemFactor: Array[Double] = this.trainModel.productFeatures.lookup(itemId).head 118 | 119 | // item 向量 120 | val itemVector: DoubleMatrix = new org.jblas.DoubleMatrix(itemFactor) 121 | 122 | // 求出本物品与各个物品的余弦相似度 123 | val sims: RDD[(Int, Double)] = this.trainModel.productFeatures.map { case (id, factor) => 124 | val factorVector = new org.jblas.DoubleMatrix(factor) 125 | val sim = this.cosineSimilarity(factorVector,itemVector) 126 | (id, sim) 127 | } 128 | sims 129 | } 130 | 131 | 132 | /** 133 | * 物品推荐 - top 推荐 134 | */ 135 | def itemRecommendItem(sims: RDD[(Int, Double)], K: Int) : Array[(Int, Double)] = { 136 | // 按照物品相似度排序,取出与本物品最相似前 K 个物品 137 | val sortedSims: Array[(Int, Double)] = sims.top(K)( // top 是分布式计算出前 K 个结果 138 | Ordering.by[(Int, Double), Double] { 139 | case (id, similarity) => similarity 140 | } 141 | ) 142 | 143 | // 打印出这 10 个与给定物品最相似的物品 144 | //val result = sortedSims.take(10).mkString("\n") 145 | //println(result) 146 | sortedSims 147 | } 148 | 149 | 150 | /** 151 | * 计算连个向量之间的余弦相似度, 余弦相似度是两个向量在 n 维空间里两者夹角的读书 152 | * 它是两个向量的点积与各向量范数(或长度)的乘积的商 153 | * 相似度的取值在 -1 到 1 主键 154 | * 1 表示完全相似 155 | * 0 表示两者互不相关(即无相关性) 156 | * -1 表示两者不相关, 还表示它们完全不相同 157 | */ 158 | def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix) : Double = { 159 | vec1.dot(vec2) / (vec1.norm2() * vec2.norm2()) 160 | } 161 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/hadoop/spark/CollaborativeFilteringTest.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.hadoop.spark 2 | 3 | 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.mllib.recommendation.ALS 7 | import org.apache.spark.mllib.recommendation.Rating 8 | import org.jblas.DoubleMatrix 9 | 10 | 11 | 12 | /** 13 | * 测试数据 /data/log/recommend/ml-100k/u.data 14 | */ 15 | object CollaborativeFilteringTest { 16 | 17 | def main(args: Array[String]) { 18 | val inventoryIBCF = new CollaborativeFilteringTest() 19 | inventoryIBCF.run() 20 | } 21 | } 22 | 23 | /** 24 | * 看了又看算法 25 | */ 26 | class CollaborativeFilteringTest { 27 | 28 | 29 | def run(): Unit = 
{ 30 | this.suanfa() 31 | } 32 | 33 | 34 | def suanfa(): Unit = { 35 | // SPARK 运行环境配置 36 | val conf = new SparkConf() 37 | conf.setAppName("InventoryIBCF") 38 | conf.setMaster("local[2]") 39 | //conf.set("spark.ui.port", "36000") 40 | 41 | // SPARK 上下文配置 42 | val sc = new SparkContext(conf) 43 | 44 | 45 | /** 46 | * 提取有效特征 47 | * 1. 数据清洗 48 | * 2. 载入数据 49 | * 3. 格式化数据 50 | */ 51 | //原始数据 52 | val rawData = sc.textFile("/data/log/recommend/ml-100k/u.data") 53 | //println(rawData.first()) 54 | 55 | // 把行分割成数组,并且读取数组前 3 个原始 56 | // 参数(类型)推断 57 | val rawRatings = rawData.map(_.split("\t").take(3)) 58 | // 正常写法 59 | //val rawRatings = rawData.map(line => line.split("\t").take(3)) 60 | 61 | //把数据转换成 Rating 对象 62 | val ratings = rawRatings.map { 63 | // 模式匹配 64 | case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toInt) 65 | } 66 | 67 | 68 | /** 69 | * 训练推荐模型 70 | * 使用: (ALS)最小二乘法,是求解矩阵分解问题的最优方法 71 | * 矩阵分解: 72 | * 显示矩阵分解: ALS.train(ratings, rank, iterations, lambda) 用来处理直接获得的数据,一般是用户访问,收藏,评分等数据 73 | * 隐式矩阵分解: ALS.trainImplicit(ratings, rank, iterations) 用处理间接才能获得的数据,需要在用户与物品的交互中才能得到的数据,如看了电影的次数,购买了某个产品等 74 | * 75 | * 矩阵分解参数: 76 | * rank : ALS 模型中的因子个数,值越大会越好,但是训练模型和保存时的开销就越大 77 | * iterations : 运行迭代次数,经过少数次数迭代后 ALS 模型便已能收敛为一个比较合理的好模型 78 | * lambda : 控制模型的正规化过程, 从而控制模型的过拟合情况 79 | */ 80 | var rank = 50 81 | var iterations = 10 82 | var lambda = 0.01 83 | 84 | // 训练模型,返回 MatrixFactorizationModel 对象,返回 用户因子 RDD 和 物品因子 RDD 85 | val model = ALS.train(ratings, rank, iterations, lambda) 86 | 87 | //val productFeatures = model.productFeatures // 物品因子 88 | //val userFeatures = model.userFeatures // 用户因子 89 | val K = 10 // 推荐数量 90 | 91 | 92 | /** 93 | * 使用推荐模型 94 | * 95 | * 用户推荐模型: 利用相似用户的评级来计算对某个用户的推荐 96 | * 给指定用户推荐物品,通常以 "前 K 个" 形式展现, 即通过模型求出用户可能喜好程度最高的前 K 个商品 97 | * 这个过程通过计算每个商品的预计得分, 按照得分机型排序实现 98 | * 99 | * 物品推荐模型: 依赖用户接触过的物品与候选物品之间的相似度来获得推荐 100 | * 给定一个物品, 有哪些物品与它相似,相似的确切定义取决于所使用的模型,相似度是通过某种方式比较表示两个物品的向量二得到的 101 | * 相似度衡量方法 102 | * 皮尔森相关系数(Pearson correlation) 103 | * 针对实数响亮的余弦相似度(cosine similarity) 104 | * 针对二元向量的杰卡德相似系数(Jaccard similarity) 105 | */ 106 | 107 | /** 108 | * 用户推荐 109 | */ 110 | // 计算给定用户 -> 给定物品的预计得分 111 | model.predict(789, 123) 112 | 113 | // 以(user,item) ID对类型的 RDD 对象为输入, 返回多个用户和物品的预测, 114 | //model.predict(userFeatures) 115 | 116 | // 为每个用户生成前 K 个推荐物品 117 | val userId = 789 118 | 119 | val topKRecs = model.recommendProducts(userId, K) 120 | //println(topKRecs.mkString("\n")) 121 | 122 | /** 123 | * 物品推荐 124 | */ 125 | // 线性代数库,求向量点积 ,创建一个 Array[Double] 类型的向量 126 | val aMatrix = new DoubleMatrix(Array(1.0,2.0,3.0)) 127 | 128 | 129 | var itemId = 567 130 | val itemFactor = model.productFeatures.lookup(itemId).head 131 | val itemVector = new DoubleMatrix(itemFactor) 132 | 133 | // 计算物品与自己的相似度 - Test 134 | //val itemX = this.cosineSimilarity(itemVector, itemVector) 135 | //println(itemX) 136 | 137 | 138 | // 求出物品与各个物品的余弦相似度 139 | val sims = model.productFeatures.map { case (id, factor) => 140 | val factorVector = new DoubleMatrix(factor) 141 | val sim = cosineSimilarity(factorVector,itemVector) 142 | (id, sim) 143 | } 144 | 145 | 146 | // 按照物品相似度排序,取出与物品 567 最相似前 10 个物品 147 | val sortedSims = sims.top(K)( 148 | Ordering.by[(Int, Double), Double] { 149 | case (id, similarity) => similarity 150 | } 151 | ) 152 | 153 | // 打印出这 10 个与给定物品最相似的物品 154 | val result = sortedSims.take(10).mkString("\n") 155 | println(result) 156 | } 157 | 158 | 159 | /** 160 | * 计算连个向量之间的余弦相似度, 余弦相似度是两个向量在 n 维空间里两者夹角的读书 161 | * 它是两个向量的点积与各向量范数(或长度)的乘积的商 162 | * 相似度的取值在 -1 到 1 主键 163 | * 1 
表示完全相似 164 | * 0 表示两者互不相关(即无相关性) 165 | * -1 表示两者不相关, 还表示它们完全不相同 166 | */ 167 | def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix) : Double = { 168 | vec1.dot(vec2) / (vec1.norm2() * vec2.norm2()) 169 | } 170 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/logs/UbaAppActionLogStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.logs 2 | 3 | object UbaAppActionLogStreaming { 4 | 5 | 6 | def main(args: Array[String]) { 7 | 8 | } 9 | } 10 | 11 | 12 | class UbaAppActionLogStreaming { 13 | 14 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/logs/UbaWebActionLogStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.logs 2 | 3 | object UbaWebActionLogStreaming { 4 | 5 | 6 | def main(args: Array[String]) { 7 | 8 | } 9 | } 10 | 11 | 12 | class UbaWebActionLogStreaming { 13 | 14 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/logs/UbaWebVisitLogStreaming.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.logs 2 | 3 | object UbaWebVisitLogStreaming { 4 | 5 | 6 | def main(args: Array[String]) { 7 | 8 | } 9 | } 10 | 11 | 12 | class UbaWebVisitLogStreaming { 13 | 14 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/Conf.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend 2 | 3 | import scala.collection.mutable.Map 4 | import scala.io.Source 5 | import java.io.{ InputStream, BufferedReader, InputStreamReader, PushbackReader } 6 | 7 | import com.angejia.dw.common.util.PropertyUtil 8 | 9 | object Conf { 10 | 11 | // 读取配置文件 12 | val property = new PropertyUtil() 13 | 14 | // 项目根目录 15 | //var projectPath = Conf.getClass().getResource("/").getFile().toString() 16 | 17 | /** 18 | * 设置环境,根据不同的环境使用不同的配置文件 19 | */ 20 | def setEnv(env: String = "dev") : Unit = { 21 | 22 | // 读取的配置文件名称 23 | val confName = "/conf_" + env + ".properties" 24 | 25 | /** 26 | * 读取 resource 目录下的文件 27 | * val lines = Source.fromURL(getClass.getResource(confName)).getLines() 28 | lines.foreach(println) 29 | */ 30 | // 获取 resource 文件读输入流 31 | val inputStreamReader: InputStreamReader = Source.fromURL(getClass.getResource(confName)).reader() 32 | 33 | // 设置读取的流 34 | property.setFileInputStream(inputStreamReader) 35 | } 36 | 37 | 38 | /** 39 | * 获取 zookeeper 地址 40 | */ 41 | def getZookeeperQuorum(): String = { 42 | val zookeeperQuorum: String = property.getKeyValue("zookeeperQuorum") 43 | zookeeperQuorum 44 | } 45 | 46 | 47 | /** 48 | * 获取 kafka 服务器地址 49 | */ 50 | def getKafkaServerBrokerList() : String = { 51 | val kafkaServerBrokerList: String = property.getKeyValue("kafkaServerBrokerList") 52 | kafkaServerBrokerList 53 | } 54 | 55 | 56 | /** 57 | * 获取业务数据库配置信息 58 | */ 59 | def getProductMysqDBInfo(): Map[String,String] = { 60 | Map[String, String]( 61 | "host" -> property.getKeyValue("productMysqlDB.host"), 62 | "account" -> property.getKeyValue("productMysqlDB.account"), 63 | "password" -> property.getKeyValue("productMysqlDB.password"), 64 | "defaultDB" -> property.getKeyValue("productMysqlDB.defaultDB") 65 | ) 66 | } 67 | 68 | 69 | /** 70 | * 获取bi数据库配置信息 71 | */ 72 | def 
getBiMysqDBInfo(): Map[String,String] = { 73 | Map[String, String]( 74 | "host" -> property.getKeyValue("biMysqlDB.host"), 75 | "account" -> property.getKeyValue("biMysqlDB.account"), 76 | "password" -> property.getKeyValue("biMysqlDB.password"), 77 | "defaultDB" -> property.getKeyValue("biMysqlDB.defaultDB") 78 | ) 79 | } 80 | 81 | 82 | /** 83 | * hdfs 文件服务地址 84 | */ 85 | def getHDFSServer(): String = { 86 | val hdfsServer: String = property.getKeyValue("HDFSServer") 87 | hdfsServer 88 | } 89 | 90 | 91 | /** 92 | * 获取 hive 相关配置信息 93 | */ 94 | def getHiveConf(): Map[String,String] = { 95 | Map[String, String]( 96 | "hiveMetastoreUris" -> property.getKeyValue("hive.metastore.uris"), 97 | "hiveThriftServerUrl" -> property.getKeyValue("hive.thrift.server.url"), 98 | "hiveThriftServerUser" -> property.getKeyValue("hive.thrift.server.user"), 99 | "hiveThriftServerPass" -> property.getKeyValue("hive.thrift.server.pass") 100 | ) 101 | } 102 | 103 | 104 | /** 105 | * 获取 spark 相关配置信息 106 | */ 107 | def getSparkConf(): Map[String,String] = { 108 | Map[String, String]( 109 | "sparkThriftServerUrl" -> property.getKeyValue("spark.thrift.server.url"), 110 | "sparkThriftServerUser" -> property.getKeyValue("spark.thrift.server.user"), 111 | "sparkThriftServerPass" -> property.getKeyValue("spark.thrift.server.pass") 112 | ) 113 | } 114 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/IBCF.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend 2 | 3 | import scala.collection.mutable.Map 4 | 5 | import org.apache.log4j.{Level, Logger} 6 | 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.rdd.RDD 10 | 11 | 12 | /** 13 | * 基于 spark Rdd 实现 IBCF 算法 14 | */ 15 | class IBCF extends Serializable { 16 | 17 | /** 18 | * 模型的数据源文件 : HDFS 或者 普通文件 19 | */ 20 | var characteristicsFile: String = null 21 | def setCharacteristicsFile(characteristicsFile: String): Unit = { 22 | this.characteristicsFile = characteristicsFile 23 | } 24 | def getCharacteristicsFile(): String = { 25 | this.characteristicsFile 26 | } 27 | 28 | 29 | /** 30 | * 文件行分隔符 31 | */ 32 | var fileSeparator: String = "\t" 33 | def setFileSeparator(fileSeparator: String): Unit = { 34 | this.fileSeparator = fileSeparator 35 | } 36 | def getFileSeparator(): String = { 37 | this.fileSeparator 38 | } 39 | 40 | 41 | /** 42 | * 根据 SparkConf 初始化 SparkContext 上下文 43 | */ 44 | var sparkContext: SparkContext = null 45 | def setSparkContext(sparkConf: SparkConf): Unit = { 46 | this.sparkContext = new SparkContext(sparkConf) 47 | } 48 | def getSparkContext(): SparkContext = { 49 | this.sparkContext 50 | } 51 | 52 | 53 | /** 54 | * 计算矩阵并得到 item -> item 关系矩阵, 根据输入文件或者 HDFS 55 | */ 56 | def calculateByFile(sparkConf: SparkConf, characteristicsFile: String, fileSeparator: String): Map[String, Int] = { 57 | 58 | println("----- 初始化 sparkContext 上下文 -----") 59 | this.setSparkContext(sparkConf) 60 | 61 | // 设置加载文件 62 | this.setCharacteristicsFile(characteristicsFile) 63 | this.setFileSeparator(fileSeparator) 64 | 65 | // 加载数据, 生成 RDD 66 | val baseModelRDD = this.loadModelData(this.getCharacteristicsFile()) 67 | 68 | // 过滤模型 69 | val modelRDD = this.filterModelRDD(baseModelRDD, this.getFileSeparator()) 70 | 71 | // 生成 user -> item 矩阵 72 | val itemAndItemMatrixRDD = this.generateItemAndItemMatrix(modelRDD) 73 | 74 | // 合并所有 user -> item 矩阵 75 | val itemAndItemMatrixCollection = 
this.mergeItemAndItemMatrix(itemAndItemMatrixRDD) 76 | 77 | itemAndItemMatrixCollection 78 | } 79 | 80 | 81 | 82 | /** 83 | * Compute the item -> item relation matrix from an already-built base model RDD 84 | */ 85 | def calculateByBaseModelRDD(baseModelRDD: RDD[String], fileSeparator: String): Map[String, Int] = { 86 | 87 | this.setFileSeparator(fileSeparator) 88 | 89 | // filter the model 90 | val modelRDD = this.filterModelRDD(baseModelRDD, this.getFileSeparator()) 91 | 92 | // generate one item -> item matrix per user 93 | val itemAndItemMatrixRDD = this.generateItemAndItemMatrix(modelRDD) 94 | 95 | // merge all item -> item matrices 96 | val itemAndItemMatrixCollection = this.mergeItemAndItemMatrix(itemAndItemMatrixRDD) 97 | 98 | itemAndItemMatrixCollection 99 | } 100 | 101 | 102 | 103 | /** 104 | * Load the model data 105 | * 106 | */ 107 | def loadModelData(characteristicsFile: String) : RDD[String] = { 108 | 109 | println("----- loading model data: " + characteristicsFile + " -----") 110 | // read the data source 111 | val baseModelRDD = this.getSparkContext().textFile(characteristicsFile) 112 | 113 | baseModelRDD 114 | } 115 | 116 | 117 | /** 118 | * Filter the RDD model 119 | * modelRDD: base RDD built from the data source 120 | * fileSeparator : field separator of each line 121 | */ 122 | def filterModelRDD(modelRDD: RDD[String], fileSeparator: String): RDD[String] = { 123 | 124 | println("----- filtering invalid rows from the model -----") 125 | 126 | // screen out malformed raw rows 127 | val filterRDD = modelRDD.filter { line => 128 | var checkStatus = true 129 | if (line.isEmpty()) { 130 | checkStatus = false 131 | } 132 | val lineArr = line.split(fileSeparator) 133 | if (lineArr.length < 2) { 134 | checkStatus = false 135 | } else { 136 | val userId = lineArr.apply(0) 137 | val itemId = lineArr.apply(1) 138 | //if (userId.matches("[0-9]+") == false) { 139 | //checkStatus = false 140 | //} 141 | //if (itemId.matches("[0-9]+") == false) { 142 | // checkStatus = false 143 | //} 144 | if (userId == "" || itemId == "") { 145 | checkStatus = false 146 | } 147 | } 148 | checkStatus 149 | } 150 | 151 | filterRDD 152 | } 153 | 154 | 155 | 156 | /** 157 | * Build item -> item matrices from each user's item set 158 | * modelRDD : base data model 159 | * return RDD( 160 | * Map("50:57"->1, "57:50"->1, "51:55"->1), 161 | * Map("50:57"->2, "57:50"->3, "51:55"->4) 162 | * ) 163 | */ 164 | def generateItemAndItemMatrix(modelRDD: RDD[String]) : RDD[Map[String, Int]] = { 165 | 166 | println("----- grouping user -> items collections -----") 167 | // RDD of the item collection each user liked; split on the configured separator 168 | val userLikeItemsCollectionRDD = modelRDD.map { line => 169 | val lineArr = line.split(this.getFileSeparator()) 170 | val userId = lineArr.apply(0).toString() 171 | val itemId = lineArr.apply(1).toString() 172 | (userId, itemId) 173 | }.groupByKey() 174 | 175 | println("----- building item -> item matrices from the user -> items collections -----") 176 | // per-user item matrix B 177 | val itemAndItemMatrixRDD = userLikeItemsCollectionRDD.map{userItemsCollection => 178 | // the collection of items this user liked 179 | val userItems = userItemsCollection._2 180 | 181 | /** 182 | * Data shape: 183 | * Map("50:57"->1, "57:50"->1, "51:55"->1) 184 | */ 185 | // per item-pair counts for this user (matrix B) 186 | val itemAndItemMatrix : Map[String,Int] = Map[String,Int]() 187 | 188 | // pair up every two items of the current user; one matrix B per user 189 | for (i <- userItems) { 190 | for (j <- userItems) { 191 | // skip identical items 192 | if (i != j) { 193 | // each co-occurring pair counts one visit for this user 194 | //userItemMatrix += Map(i -> Map(j -> 1)) 195 | val key = i.toString() + ":" + j.toString() 196 | itemAndItemMatrix.put(key ,1) 197 | } 198 | } 199 | } 200 | 201 | itemAndItemMatrix 202 | } 203 | 204 | itemAndItemMatrixRDD 205 | } 206 | 207 | 208 | 209 | /** 210 | * Merge (sum) all item -> item matrices 211 | * itemAndItemMatrix : the per-user item -> item matrices 212 | */ 213 | def
mergeItemAndItemMatrix(itemAndItemMatrix: RDD[Map[String, Int]]): Map[String, Int] = { 214 | println("----- 合并累加所有 item -> item 矩阵 -----") 215 | 216 | var rsItemMatrix: Map[String, Int] = Map[String, Int]() 217 | 218 | if (itemAndItemMatrix.count() == 0) { 219 | return rsItemMatrix 220 | } 221 | 222 | // 合并最终的矩阵 223 | val itemAndItemMatrixCollection = itemAndItemMatrix.reduce{ (x, y) => 224 | 225 | var curMatrix = x 226 | var nextMatrix = y 227 | 228 | /** 229 | * 目标 : 230 | * 1. 把 curMatrix 和 nextMatrix 相同 key 的值相加 231 | * 2. 把 nextMatrix 不在 curMatrix 中的原样追加到 curMatrix 232 | */ 233 | for ((yK, yV) <- nextMatrix) { 234 | 235 | if (curMatrix.contains(yK) == true) { 236 | curMatrix(yK) += nextMatrix(yK) 237 | } else { 238 | curMatrix.put(yK,yV) 239 | } 240 | } 241 | 242 | curMatrix 243 | } 244 | 245 | itemAndItemMatrixCollection 246 | } 247 | 248 | 249 | /** 250 | * 根据 ItemId , groupBy ItemMatrix 251 | 把同类型的物品, 聚合到一起 252 | Map(51 -> 51:55:2, 51:52:2, 51:53:2, 51:56:1 253 | 56 -> 56:53:1, 56:55:1, 56:52:1,56:51:1 254 | ) 255 | */ 256 | def itemMatrixGroupByItemId(itemAndItemMatrixCollection: Map[String, Int]): Map[String, Iterable[Array[String]]] = { 257 | println("----- 根据 ItemId , groupBy ItemMatrix -----") 258 | 259 | val itemAndItemGroup = itemAndItemMatrixCollection.map( f => { 260 | val ids = f._1.split(":") 261 | val itemId = ids(0).toString() // 房源 ID 262 | val itemRsId = ids(1).toString() // 推荐房源 ID 263 | val itemRsIdCnt = f._2.toString() // 共同看过的人数 264 | // 转换成数组 265 | Array(itemId, itemRsId, itemRsIdCnt) 266 | //println(itemId, itemRsId, itemRsIdCnt) 267 | }).groupBy { 268 | // 然后, 按照 itmeId 把把同类的 Item ID groupBy 到一起 269 | f => f(0) 270 | } 271 | 272 | //itemAndItemGroup 273 | scala.collection.mutable.Map(itemAndItemGroup.toSeq:_*) 274 | } 275 | 276 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/community/CommunityIBCF.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.community 2 | 3 | import scala.collection.mutable.Map 4 | 5 | import org.apache.log4j.{Level, Logger} 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.SparkConf 8 | 9 | import com.angejia.dw.recommend.Conf 10 | import com.angejia.dw.hadoop.hbase.HBaseClient 11 | import com.angejia.dw.recommend.IBCF 12 | 13 | /** 14 | * Community IBCF 算法实现 15 | */ 16 | object CommunityIBCF { 17 | 18 | // hbase 数据表 19 | var hbaseResultTb: HBaseClient = null 20 | 21 | // 等待训练的数据文件 22 | var characteristicsFile: String = null 23 | 24 | // 文件分隔符 25 | var separator = "\t" 26 | 27 | def main (args: Array[String]) { 28 | 29 | for (ar <- args) { 30 | println(ar) 31 | } 32 | 33 | val env = args(0) 34 | this.init(env) 35 | 36 | this.characteristicsFile = args(1) 37 | 38 | this.calculate() 39 | 40 | } 41 | 42 | 43 | /** 44 | * 初始化 45 | * env: dev 开发环境, online 线上环境 46 | */ 47 | def init (env: String): Unit = { 48 | Conf.setEnv(env) 49 | 50 | // 连接 userUBCF 数据表 51 | this.hbaseResultTb = new HBaseClient("communityIBCF",Conf.getZookeeperQuorum()) 52 | } 53 | 54 | def calculate(): Unit = { 55 | 56 | /** 57 | * 初始化 spark 58 | */ 59 | val sparkConf = new SparkConf() 60 | sparkConf.setAppName("CommunityIBCF") 61 | sparkConf.setMaster("local[2]") 62 | 63 | /** 64 | * 初始化推荐模型 65 | */ 66 | val communityIBCF = new IBCF() 67 | 68 | // 合并累加 ItemAndItemMatrix 矩阵 69 | val itemAndItemMatrixCollection = communityIBCF.calculateByFile(sparkConf, characteristicsFile, separator) 70 | 71 | // 根据 
ItemId , groupBy ItemMatrix 72 | val itemAndItemGroup = communityIBCF.itemMatrixGroupByItemId(itemAndItemMatrixCollection) 73 | 74 | var communityLine = 0 // 推荐行数 75 | println("----- 把聚合后的数据格式化成字符串保存在 Hbase -----") 76 | itemAndItemGroup.foreach(line => { 77 | val invetoryId = line._1 78 | val invetoryRsInfo = line._2 79 | 80 | // 把里面的 array 按照:组合, 最外层按照,组合 81 | val invetoryRsToString = invetoryRsInfo.map(f => f.mkString(":")).mkString(",") 82 | 83 | this.communityRecommendWriteHbase(invetoryId, invetoryRsToString) 84 | 85 | communityLine += 1 86 | }) 87 | 88 | println("") 89 | println("----- HBase Table communityIBCF: ", 90 | "写入了: " + communityLine + " 行") 91 | 92 | } 93 | 94 | 95 | /** 96 | * 保存推荐结果到 Hbase 97 | */ 98 | def communityRecommendWriteHbase(rowKey: String, value: String): Unit = { 99 | this.hbaseResultTb.insert(rowKey, "recommend", "communityRecommend", value) 100 | } 101 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/extract/ExtractFileToKafka.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.extract 2 | 3 | import java.lang.{Runtime,Thread} 4 | import com.angejia.dw.common.util.{ListenerFile,ScFileUtil,FileUtil} 5 | import com.angejia.dw.hadoop.kafka.{KafkaProducer,KafkaConsumer} 6 | 7 | /** 8 | * 动态抽取日志到 kafka 9 | */ 10 | 11 | object ExtractFileToKafka { 12 | 13 | def main(args: Array[String]) { 14 | val zookeeperConnect = args(0) 15 | val kafkaBrokerList = args(1) 16 | val kafkaTopic = args(2) 17 | val kafkaTopicPartition = args(3) 18 | val kafkaConsumerGroupId = args(4) 19 | val listenerConfFile = args(5) 20 | val stepLength = args(6) 21 | 22 | val extractAccessLog = new ExtractFileToKafka 23 | extractAccessLog.zookeeperConnect = zookeeperConnect 24 | extractAccessLog.kafkaBrokerList = kafkaBrokerList 25 | extractAccessLog.kafkaTopic = kafkaTopic 26 | extractAccessLog.kafkaTopicPartition = kafkaTopicPartition 27 | extractAccessLog.kafkaConsumerGroupId = kafkaConsumerGroupId 28 | extractAccessLog.listenerConfFile = listenerConfFile 29 | extractAccessLog.stepLength = stepLength 30 | //extractAccessLog.stepLength = 1000.toString() 31 | extractAccessLog.runExtractAccessLog() 32 | } 33 | 34 | def runTest(): Unit = { 35 | val extractAccessLog = new ExtractFileToKafka 36 | extractAccessLog.zookeeperConnect = "dwtest:2181" 37 | extractAccessLog.kafkaBrokerList = "dwtest:9092" 38 | extractAccessLog.kafkaTopic = "accessLog" 39 | extractAccessLog.kafkaTopicPartition = "0" 40 | extractAccessLog.kafkaConsumerGroupId = "userPortrait" 41 | extractAccessLog.listenerConfFile = "/data/log/recommend/accesslog" 42 | extractAccessLog.runExtractAccessLog() 43 | } 44 | } 45 | 46 | /** 47 | * 等待完成功能 48 | * 1. 程序退出、失败,记录最后一次更新点 49 | * 2. 可以读取最后一次更新的文件,作为标记开始的日期 50 | * 3. 
当目标文件不存在(等待) 51 | */ 52 | class ExtractFileToKafka { 53 | 54 | // zookeeper 服务器 55 | var zookeeperConnect : String = null 56 | 57 | // kafka broker 服务器 58 | var kafkaBrokerList : String = null 59 | 60 | // kafka Topic 61 | var kafkaTopic : String = null 62 | 63 | // kafka Topic Partition 64 | var kafkaTopicPartition : String = null 65 | 66 | // kafka Consumer GroupId 67 | var kafkaConsumerGroupId : String = null 68 | 69 | // kafkaProducer 连接对象 70 | var kafkaProducer: KafkaProducer = null 71 | 72 | // kafkaConsumer 连接对象 73 | var kafkaConsumer: KafkaConsumer = null 74 | 75 | // 监听的配置文件 76 | var listenerConfFile: String = null 77 | 78 | // 每次读取行的长度 79 | var stepLength: String = null 80 | 81 | 82 | /** 83 | * 监听日志文件,并且发送日志到 kafka 中 84 | */ 85 | def runExtractAccessLog(): Unit = { 86 | // 读取配置文件 87 | val readFileConf = ScFileUtil.fileInputStream(this.listenerConfFile) 88 | val readFileConfArgs = readFileConf.split("\\s+") 89 | val lsFile = readFileConfArgs(0) // 监听文件 90 | val lsFileDate = readFileConfArgs(1) // 监听文件的日期 91 | val lsFileLineNum: String = readFileConfArgs(2) // 监听文件读到的行数 92 | 93 | val listenerFile = new ListenerFile() 94 | 95 | //listenerFile.listenerDateFile(lsFile, lsFileDate, lsFileLineNum.toInt, stepLength.toInt, this.readLogLine) 96 | listenerFile.listenerDateFileWhile(lsFile, lsFileDate, lsFileLineNum.toInt, stepLength.toInt ,this.readLogLine) 97 | } 98 | 99 | 100 | 101 | /** 102 | * 回调函数,发送日志到 Kafka 103 | */ 104 | def readLogLine(result: Map[String,Any]): Unit = { 105 | val file = result.get("file").get.toString() // 当前读的到的文件 106 | val fileTemplate = result.get("fileTemplate").get.toString() // 文件模板 107 | val date = result.get("date").get.toString() // 当前读到的文件日期 108 | val nextLineNum = result.get("nextLineNum").get.toString() // 下一次开始读取的行数 109 | val readLineCmd = result.get("readLineCmd").get.toString() // 当前读到的行数 110 | val fileLineContent = result.get("fileLineContent").get.toString() // 当前读到的文件内容 111 | 112 | val curLog = "NextReadFile: " + file + " " + date + " " + nextLineNum 113 | println(readLineCmd) 114 | println(curLog) 115 | //println(fileLineContent) 116 | 117 | if (fileLineContent.length() != 0) { 118 | // 发送日志到 kafka 119 | this.producerToKafka(fileLineContent) 120 | 121 | // 文件记录点 122 | val filePoint = fileTemplate + " " + date + " " + nextLineNum 123 | FileUtil.fileOutputStream(this.listenerConfFile, filePoint, false) 124 | 125 | // 记录一份到 本地日志 : 调试用 126 | //val localFile = "/var/log/ExtractFileToKafka/ExtractFileToKafka_" + date + ".log" 127 | //FileUtil.fileOutputStream(localFile, fileLineContent, true) 128 | 129 | println("filePoint: " + filePoint) 130 | } 131 | 132 | println("----------") 133 | 134 | } 135 | 136 | 137 | /** 138 | * 获取 KafkaProducer 连接对象 139 | */ 140 | def getKafkaProducerObj(): KafkaProducer = { 141 | if (this.kafkaProducer == null ) { 142 | this.kafkaProducer = new KafkaProducer(this.kafkaTopic,this.kafkaBrokerList) 143 | } 144 | this.kafkaProducer 145 | } 146 | 147 | 148 | /** 149 | * 获取 KafkaConsumer 连接对象 150 | */ 151 | def getConsumerKafka(): KafkaConsumer = { 152 | if (this.kafkaConsumer == null) { 153 | this.kafkaConsumer = new KafkaConsumer(this.kafkaTopic,this.kafkaConsumerGroupId,this.zookeeperConnect) 154 | } 155 | this.kafkaConsumer 156 | } 157 | 158 | 159 | /** 160 | * 发送数据给 Kafka 161 | */ 162 | def producerToKafka(content: String): Unit = { 163 | this.getKafkaProducerObj().send(content,this.kafkaTopicPartition) 164 | } 165 | 166 | 167 | 168 | /** 169 | * 消费数据 170 | */ 171 | def consumerKafka () : Unit = { 172 | 
this.getConsumerKafka().read(this.consumerData) 173 | } 174 | 175 | 176 | 177 | /** 178 | * 回调函数 179 | */ 180 | def consumerData(a: Array[Byte]): Unit = { 181 | println(a(0).toBinaryString) 182 | println(123) 183 | } 184 | 185 | } 186 | 187 | 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/inventory/InventoryIBCF.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.inventory 2 | 3 | import scala.collection.mutable.Map 4 | 5 | import org.apache.log4j.{Level, Logger} 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.SparkConf 8 | 9 | import com.angejia.dw.recommend.Conf 10 | import com.angejia.dw.hadoop.hbase.HBaseClient 11 | import com.angejia.dw.recommend.IBCF 12 | import com.angejia.dw.common.util.{FileUtil} 13 | 14 | /** 15 | * IBCF 算法实现 16 | * create 'inventoryIBCF',{NAME=>'baseInfo'},{NAME=>'recommend'} 17 | */ 18 | object InventoryIBCF { 19 | 20 | // hbase 数据表 21 | var hbaseResultTb: HBaseClient = null 22 | 23 | // 等待训练的数据文件 24 | var characteristicsFile: String = null 25 | 26 | // 文件分隔符 27 | var separator = "\t" 28 | 29 | def main (args: Array[String]) { 30 | 31 | for (ar <- args) { 32 | println(ar) 33 | } 34 | 35 | val env = args(0) 36 | this.init(env) 37 | 38 | this.characteristicsFile = args(1) 39 | //this.separator = " " 40 | //this.characteristicsFile = "/data/log/recommend/recommend_user_inventory_history/mytest" 41 | 42 | this.calculate() 43 | 44 | } 45 | 46 | 47 | /** 48 | * 初始化 49 | * env: dev 开发环境, online 线上环境 50 | */ 51 | def init (env: String): Unit = { 52 | Conf.setEnv(env) 53 | 54 | println(Conf.getZookeeperQuorum()) 55 | // 连接 userUBCF 数据表 56 | this.hbaseResultTb = new HBaseClient("inventoryIBCF",Conf.getZookeeperQuorum()) 57 | println(Conf.getZookeeperQuorum()) 58 | } 59 | 60 | 61 | def calculate(): Unit = { 62 | 63 | /** 64 | * 初始化 spark 65 | */ 66 | val sparkConf = new SparkConf() 67 | sparkConf.setAppName("InventoryIBCF") 68 | sparkConf.setMaster("local[2]") 69 | 70 | /** 71 | * 初始化推荐模型 72 | */ 73 | val inventoryIBCF = new IBCF() 74 | 75 | // 合并累加 ItemAndItemMatrix 矩阵 76 | val itemAndItemMatrixCollection = inventoryIBCF.calculateByFile(sparkConf, characteristicsFile, separator) 77 | 78 | // 根据 ItemId , groupBy ItemMatrix 79 | val itemAndItemGroup = inventoryIBCF.itemMatrixGroupByItemId(itemAndItemMatrixCollection) 80 | 81 | var inventoryLine = 0 // 推荐行数 82 | println("----- 把聚合后的数据格式化成字符串保存在 Hbase -----") 83 | itemAndItemGroup.foreach(line => { 84 | val invetoryId = line._1 85 | val invetoryRsInfo = line._2 86 | 87 | // 把里面的 array 按照:组合, 最外层按照,组合 88 | val invetoryRsToString = invetoryRsInfo.map(f => f.mkString(":")).mkString(",") 89 | 90 | this.inventoryRecommendWriteHbase(invetoryId, invetoryRsToString) 91 | 92 | inventoryLine += 1 93 | }) 94 | 95 | println("") 96 | println("----- HBase Table inventoryIBCF: ", 97 | "写入了: " + inventoryLine + " 行") 98 | 99 | } 100 | 101 | 102 | /** 103 | * 保存推荐结果到 Hbase 104 | */ 105 | def inventoryRecommendWriteHbase(rowKey: String, value: String): Unit = { 106 | this.hbaseResultTb.insert(rowKey, "recommend", "inventoryRecommend", value) 107 | } 108 | 109 | 110 | 111 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/inventory/InventoryIBCFspark.scala: -------------------------------------------------------------------------------- 1 | package 
com.angejia.dw.recommend.inventory 2 | 3 | import play.api.libs.json._ 4 | import org.apache.log4j.{Level, Logger} 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.SparkConf 7 | 8 | import com.angejia.dw.hadoop.spark.CollaborativeFiltering 9 | import com.angejia.dw.common.util.{FileUtil} 10 | import com.angejia.dw.hadoop.hbase.HBaseClient 11 | 12 | 13 | /** 14 | * 这个是 spark mlib 算法实现的,不靠谱呵呵哒 15 | * 16 | * create 'inventoryRecommend',{NAME=>'inventoryRecommendInventory'} 17 | 18 | 19 | spark-submit \ 20 | --name InventoryIBCF \ 21 | --class com.angejia.dw.recommend.inventory.InventoryIBCF \ 22 | --master local[2] \ 23 | ~/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar "DataNode01" "inventoryRecommend" "/data/log/recommend/ml-100k/u.data" 24 | 25 | 参数: 26 | [zookeeperIds] [HBaseTableName] [characteristicsFile] 27 | */ 28 | object InventoryIBCFspark { 29 | 30 | var zookeeperIds: String = null 31 | var HBaseTableName: String = null 32 | var HBaseClientService: HBaseClient = null 33 | var characteristicsFile: String = null 34 | 35 | def main(args: Array[String]) { 36 | 37 | for (ar <- args) { 38 | println(ar) 39 | } 40 | 41 | // Hbase 配置 42 | zookeeperIds = args(0) 43 | HBaseTableName = args(1) 44 | HBaseClientService = new HBaseClient(HBaseTableName,zookeeperIds) 45 | 46 | // 等待训练的文件 47 | characteristicsFile = args(2) 48 | //characteristicsFile = "/data/log/recommend/recommend_user_inventory_history/mytest" 49 | 50 | val inventoryIBCF = new InventoryIBCFspark() 51 | inventoryIBCF.characteristicsFile = characteristicsFile 52 | inventoryIBCF.run() 53 | } 54 | 55 | 56 | /** 57 | * 写 HBase 58 | */ 59 | def resultWriteHBase(rowKey: String, value: String) : Unit = { 60 | HBaseClientService.insert(rowKey, "inventoryRecommendInventory", "inventoryIds", value) 61 | } 62 | 63 | } 64 | 65 | /** 66 | * 看了又看算法 67 | */ 68 | class InventoryIBCFspark extends Serializable { 69 | 70 | // 提取特征的文件 71 | var characteristicsFile: String = null 72 | 73 | def run(): Unit = { 74 | Logger.getRootLogger.setLevel(Level.WARN) 75 | 76 | // 训练模型 77 | val inventoryTrainModel = this.inventoryTrain() 78 | } 79 | 80 | 81 | /** 82 | * 训练 inventory 模型 83 | */ 84 | //def inventoryTrain() : MatrixFactorizationModel = { 85 | def inventoryTrain() : Unit = { 86 | 87 | println("----- 开始初始化 -----") 88 | // SPARK 运行环境配置 89 | val conf = new SparkConf() 90 | conf.setAppName("InventoryIBCF") 91 | conf.setMaster("local[2]") 92 | 93 | conf.set("spark.cores.max", "4") // 16 map workers, that is 2 workers per machine (see my cluster config below) 94 | //conf.set("spark.akka.frameSize", "100000") 95 | conf.set("spark.driver.maxResultSize", "2g") 96 | conf.set("spark.executor.memory", "2g") 97 | conf.set("spark.reducer.maxMbInFlight", "100000") 98 | conf.set("spark.storage.memoryFraction", "0.9") 99 | conf.set("spark.shuffle.file.buffer.kb", "1000") 100 | conf.set("spark.broadcast.factory", "org.apache.spark.broadcast.HttpBroadcastFactory") 101 | 102 | val sc = new SparkContext(conf) 103 | 104 | // 读取数据源 105 | val sourceDataRDD = sc.textFile(characteristicsFile) 106 | //println(sourceDataRDD.first()) 107 | 108 | // 把行分割成数组,并且读取数组前 3 个元素 ,格式化后作为入参 109 | val ratingsRDD = sourceDataRDD.map(line => { 110 | val curLine = line.split("\t").map { x => x.toInt} 111 | Array(curLine(0), curLine(1) , curLine(2)) 112 | }) 113 | ratingsRDD.take(2).foreach(x => println(x(0) + ":" + x(1) + ":" + x(2))) 114 | 115 | 116 | // 把需要推荐的房源 id 抽取出来,去重 117 | val needRecommendInventoryIdsRDD = ratingsRDD.map(_(1)).distinct() 118 | 
needRecommendInventoryIdsRDD.take(2).foreach(x => println(x)) 119 | 120 | 121 | // 算出所有房源相关度 (调试) 122 | //val inventoryIds = List(308524, 213775, 276360, 206754) 123 | //val needRecommendInventoryIdsRDD = sc.parallelize(inventoryIds) 124 | //needRecommendInventoryIdsRDD.take(4).foreach {x => println(x)} 125 | 126 | // 计算需要推荐的房源的次数 127 | val inventoryIdsRddCount = needRecommendInventoryIdsRDD.count().toInt 128 | println("共需要推荐: " + inventoryIdsRddCount) 129 | 130 | 131 | println("----- 提取特征 -----") 132 | // IBCF 算法类 133 | val collaborativeFiltering = new CollaborativeFiltering() 134 | 135 | // 提取特征 136 | val characteristicsRDD = collaborativeFiltering.characteristics(ratingsRDD) 137 | 138 | 139 | println("----- 训练模型 -----") 140 | // 训练模型 141 | collaborativeFiltering.train(characteristicsRDD, 50, 10, 0.01) 142 | // 广播变量 143 | val collaborativeFilteringSignPrefixesRdd = sc.broadcast(collaborativeFiltering) 144 | 145 | // 累加器 146 | val blankLines = sc.accumulator(0) 147 | 148 | 149 | println("----- inventoryId 计算推荐的 inventoryIds -----") 150 | val inventoryResInventorysRDD = needRecommendInventoryIdsRDD.map { inventoryId => 151 | val itemCosineSimilarity = collaborativeFilteringSignPrefixesRdd.value.itemCosineSimilarity(inventoryId) 152 | 153 | var result = "" 154 | itemCosineSimilarity.take(100).foreach{ inventroyRes => 155 | val inventoryRId = inventroyRes._1 // 推荐的房源 ID 156 | val inventoryRSouce = inventroyRes._2 // 推荐的房源 分数 157 | result += inventoryId + ":" + inventoryRId + ":" + inventoryRSouce + "," 158 | } 159 | blankLines += 1 160 | println(blankLines) 161 | println("wirete: " + result) 162 | // 结果写到 HBase 163 | //InventoryIBCF.resultWriteHBase(inventoryId.toString(),result.toString()) 164 | }.take(inventoryIdsRddCount) 165 | 166 | 167 | 168 | /** 169 | println("----- inventoryId 计算推荐的 inventoryIds -----") 170 | // 为每个 inventoryId 计算推荐的 inventoryIds 171 | val inventoryResInventorysRDD = needRecommendInventoryIdsRDD.map { inventoryId => 172 | 173 | /** 物品余弦相似度计算 174 | * RDD[(Int, Double)] 返回详细值: 175 | (1,0.537378279119025) 176 | (3,0.37167637258108627) 177 | (5,0.6282701874791976) 178 | */ 179 | val itemCosineSimilarity = collaborativeFilteringSignPrefixesRdd.value.itemCosineSimilarity(inventoryId) 180 | 181 | val id = inventoryId 182 | val result: Array[(Int, Double)] = itemCosineSimilarity.take(5000) 183 | 184 | val t: (Int,Array[(Int, Double)]) = (id, result) 185 | t 186 | } 187 | 188 | // 计算需要推荐的房源的次数 189 | val inventoryIdsRddCount = needRecommendInventoryIdsRDD.count().toInt 190 | 191 | 192 | println("----- 准备开始写入数据 -----") 193 | /** 格式化数据 194 | * 写入到 Hbase 中 195 | */ 196 | val data = inventoryResInventorysRDD.map(inventoryData => { 197 | val inventoryId = inventoryData._1 // 123234 这种数据结构s 198 | val ResInventorys = inventoryData._2 // (2,0.5074470833019032) 这种数据结构 199 | 200 | var result = "" 201 | ResInventorys.foreach{ tuple => 202 | val inventoryRId = tuple._1 // 推荐的房源 ID 203 | val inventoryRSouce = tuple._2 // 推荐的房源 分数 204 | result += inventoryId + ":" + inventoryRId + ":" + inventoryRSouce + "," 205 | } 206 | 207 | // 结果写到 HBase 208 | InventoryIBCF.resultWriteHBase(inventoryId.toString(),result.toString()) 209 | 210 | result 211 | }).take(inventoryIdsRddCount) 212 | **/ 213 | 214 | } 215 | 216 | 217 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/inventory/InventoryItemCF.scala: -------------------------------------------------------------------------------- 1 | package 
com.angejia.dw.recommend.inventory 2 | 3 | import scala.collection.mutable.Map 4 | 5 | import org.apache.log4j.{Level, Logger} 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.SparkConf 8 | 9 | import com.angejia.dw.hadoop.hbase.HBaseClient 10 | import com.angejia.dw.common.util.{FileUtil} 11 | 12 | /** 13 | * IBCF 算法实现 14 | 参数: 15 | [zookeeperIds] [HBaseTableName] [characteristicsFile] 16 | */ 17 | object InventoryItemCF { 18 | 19 | var zookeeperIds: String = null 20 | var HBaseTableName: String = null 21 | var HBaseClientService: HBaseClient = null 22 | var characteristicsFile: String = null 23 | 24 | def main (args: Array[String]) { 25 | 26 | for (ar <- args) { 27 | println(ar) 28 | } 29 | 30 | // Hbase 配置 31 | zookeeperIds = args(0) 32 | HBaseTableName = args(1) 33 | HBaseClientService = new HBaseClient(HBaseTableName,zookeeperIds) 34 | 35 | // 等待训练的文件 36 | characteristicsFile = args(2) 37 | //characteristicsFile = "/data/log/recommend/recommend_user_inventory_history/00000*" 38 | //characteristicsFile = "/data/log/recommend/recommend_user_inventory_history/mytest2" 39 | //characteristicsFile = "/data/log/recommend/recommend_user_inventory_history/000000_1" 40 | 41 | this.calculate() 42 | } 43 | 44 | 45 | /** 46 | * 算法逻辑 47 | */ 48 | def calculate() : Unit = { 49 | println("----- 初始化 -----") 50 | val conf = new SparkConf() 51 | conf.setAppName("InventoryIBCF") 52 | conf.setMaster("local[2]") 53 | 54 | val sc = new SparkContext(conf) 55 | 56 | println("----- 加载数据源: " + characteristicsFile + " -----") 57 | // 读取数据源 58 | val sourceDataRDD = sc.textFile(characteristicsFile) 59 | //println(sourceDataRDD.first()) 60 | 61 | 62 | println("----- 过滤数据源 -----") 63 | 64 | // 筛选过滤原始数据 65 | val filterSourceDataRDD = sourceDataRDD.filter { line => 66 | var checkStatus = true 67 | if (line.isEmpty()) { 68 | checkStatus = false 69 | } 70 | val lineArr = line.split("\t") 71 | if (lineArr.length < 2) { 72 | checkStatus = false 73 | } else { 74 | val userId = lineArr.apply(0) 75 | val inventoryId = lineArr.apply(1) 76 | if (userId.matches("[0-9]+") == false) { 77 | checkStatus = false 78 | } 79 | if (inventoryId.matches("[0-9]+") == false) { 80 | checkStatus = false 81 | } 82 | } 83 | checkStatus 84 | } 85 | 86 | println("----- 归并用户 item 集合 -----") 87 | // 用户喜欢 items 的集合 RDD 88 | val userLikeItemsCollectionRDD = filterSourceDataRDD.map { line => 89 | val lineArr = line.split("\t") 90 | val userId = lineArr.apply(0).toInt 91 | val inventoryId = lineArr.apply(1).toInt 92 | (userId, inventoryId) 93 | }.groupByKey() 94 | 95 | 96 | println("----- 用户物品集合生成矩阵 -----") 97 | // 用户的 Item 矩阵 B 98 | val userItemMatrixsRDD = userLikeItemsCollectionRDD.map{userAndItems => 99 | // 用户喜欢物品的集合 100 | val userItems = userAndItems._2 101 | 102 | /** 103 | * 数据结构 104 | Map("50:57"->1, "57:50"->1, "51:55"->1) 105 | */ 106 | // 保存用户,每个物品对的矩阵 B 107 | val userItemMatrix : Map[String,Int] = Map[String,Int]() 108 | 109 | // 二二配对当前用户的物品, 为每个用户,产出一个 B 矩阵 110 | for (i <- userItems) { 111 | for (j <- userItems) { 112 | // 排除相同的物品 113 | if (i != j) { 114 | // 默认为每个用户 +1 个访问次数 115 | //userItemMatrix += Map(i -> Map(j -> 1)) 116 | val key = i.toString() + ":" + j.toString() 117 | userItemMatrix.put(key ,1) 118 | } 119 | } 120 | } 121 | 122 | userItemMatrix 123 | } 124 | 125 | //userItemMatrixs.take(10) 126 | //exit() 127 | 128 | 129 | println("----- 合并所有物品矩阵 -----") 130 | // 合并最终的矩阵 131 | val itemAndItemMatriCollection = userItemMatrixsRDD.reduce{ (x, y) => 132 | 133 | var curMatrix = x 134 | var nextMatrix = y 135 | 136 
| /** 137 | * 目标 : 138 | * 1. 把 curMatrix 和 nextMatrix 相同 key 的值相加 139 | * 2. 把 nextMatrix 不在 curMatrix 中的原样追加到 curMatrix 140 | */ 141 | for ((yK, yV) <- nextMatrix) { 142 | 143 | if (curMatrix.contains(yK) == true) { 144 | curMatrix(yK) += nextMatrix(yK) 145 | } else { 146 | curMatrix.put(yK,yV) 147 | } 148 | } 149 | 150 | curMatrix 151 | } 152 | //exit() 153 | //println(itemAndItemMatrix.toBuffer) 154 | 155 | 156 | /** 把同类型的物品, 聚合到一起 157 | (51, 51:55:2, 51:52:2, 51:53:2, 51:56:1) 158 | (56, 56:53:1, 56:55:1, 56:52:1,56:51:1) 159 | */ 160 | 161 | println("----- 聚合同类型物品 -----") 162 | val itemAndItemGroup = itemAndItemMatriCollection.map( f => { 163 | val ids = f._1.split(":") 164 | val invetoryId = ids(0).toString() // 房源 ID 165 | val invetoryRsId = ids(1).toString() // 推荐房源 ID 166 | val invetoryRsIdCnt = f._2.toString() // 共同看过的人数 167 | // 转换成数组 168 | Array(invetoryId, invetoryRsId, invetoryRsIdCnt) 169 | //println(invetoryId, invetoryRsId , invetoryRsIdCnt) 170 | // 把 171 | }).groupBy { 172 | // 然后, 按照 invetoryId 把把同类的房源 ID groupBy 到一起 173 | f => f(0) 174 | } 175 | 176 | val blankLines = sc.accumulator(0) 177 | 178 | println("----- 把聚合后的数据格式化成字符串 -----") 179 | val itemAndItemGroupToStringRDD = itemAndItemGroup.map(line => { 180 | val invetoryId = line._1 181 | val invetoryRsInfo = line._2 182 | // 把里面的 array 按照:组合, 最外层按照,组合 183 | val invetoryRsToString = invetoryRsInfo.map(f => f.mkString(":")).mkString(",") 184 | 185 | blankLines += 1 186 | // println("write[" + blankLines + "]: " + invetoryId) 187 | //结果写到 Hbase (invetoryId, invetoryRsToString) 188 | resultWriteHBase(invetoryId, invetoryRsToString) 189 | }) 190 | 191 | println("") 192 | println("----- HBase Table inventoryRecommend: ", 193 | " inventoryRecommend 写入了: " + blankLines + " 行") 194 | 195 | //itemAndItemGroupToStringRDD 196 | /* 197 | //val blankLines = sc.accumulator(0) 198 | println("----- 写到 Hbase -----") 199 | var n = 0 200 | itemAndItemGroupToStringRDD.foreach(f => { 201 | n += 1 202 | println("write[" + n + "]: " + f._1) 203 | resultWriteHBase(f._1, f._2) 204 | }) 205 | 206 | 207 | */ 208 | } 209 | 210 | // 拆解 Map 返回 211 | def dismantlingMap (data: Map[Int,Int]): (Int, Int) = { 212 | val keys = data.keySet.toArray 213 | 214 | (keys(0), data.get(keys(0)).get) 215 | } 216 | 217 | 218 | def resultWriteHBase(rowKey: String, value: String) : Unit = { 219 | //FileUtil.fileOutputStream("/data/log/recommend/result",rowKey + "--" + value + "\n",true) 220 | HBaseClientService.insert(rowKey, "inventoryRecommendInventory", "inventoryIds", value) 221 | } 222 | 223 | 224 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/inventory/InventoryItemCFBak.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.inventory 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.SparkConf 6 | import scala.collection.mutable.Map 7 | import scala.collection.mutable.LinkedList 8 | import scala.collection.mutable.ArrayBuffer 9 | 10 | import com.angejia.dw.hadoop.hbase.HBaseClient 11 | import com.angejia.dw.common.util.{FileUtil} 12 | 13 | 14 | object InventoryItemCFBak { 15 | 16 | var zookeeperIds: String = null 17 | var HBaseTableName: String = null 18 | var HBaseClientService: HBaseClient = null 19 | var characteristicsFile: String = null 20 | 21 | def main (args: Array[String]) { 22 | 23 | for (ar <- args) { 24 | println(ar) 25 | } 26 | 27 | 
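        // A non-executed sketch of the nested-map pair matrix this backup variant
        // builds, unlike the flat "i:j" string keys used in InventoryItemCF. For a
        // user who viewed items 51 and 55, the intended per-user matrix is
        // (illustrative values):
        //
        //   val m: Map[Int, Map[Int, Int]] = Map(51 -> Map(55 -> 1), 55 -> Map(51 -> 1))
        //
        // Merging such maps correctly has to accumulate counts per (i, j) pair;
        // the put(i, Map(j -> 1)) used further down overwrites earlier j entries
        // for the same i, which is presumably why this version was kept only as a
        // backup.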
28 | 29 | // Hbase 配置 30 | zookeeperIds = args(0) 31 | HBaseTableName = args(1) 32 | //HBaseClientService = new HBaseClient(HBaseTableName,zookeeperIds) 33 | 34 | // 等待训练的文件 35 | characteristicsFile = args(2) 36 | characteristicsFile = "/data/log/recommend/recommend_user_inventory_history/00000*" 37 | //characteristicsFile = "/data/log/recommend/recommend_user_inventory_history/mytest2" 38 | 39 | this.calculate() 40 | } 41 | 42 | def calculate() : Unit = { 43 | println("----- 初始化 -----") 44 | val conf = new SparkConf() 45 | conf.setAppName("InventoryIBCF") 46 | conf.setMaster("local[2]") 47 | 48 | val sc = new SparkContext(conf) 49 | 50 | println("----- 加载数据源: " + characteristicsFile + " -----") 51 | // 读取数据源 52 | val sourceDataRDD = sc.textFile(characteristicsFile) 53 | //println(sourceDataRDD.first()) 54 | 55 | 56 | println("----- 归并用户 item 集合 -----") 57 | // 用户喜欢 items 的集合 RDD 58 | val userLikeItemsCollectionRDD = sourceDataRDD.map(line => { 59 | val curLine = line.split("\t").map { x => x.toInt} 60 | //println(curLine(0) + " " + curLine(1) + " " + curLine(2)) 61 | (curLine(0), curLine(1)) 62 | }).groupByKey() 63 | 64 | 65 | println("----- 用户物品集合生成矩阵 -----") 66 | // 用户的 Item 矩阵 B 67 | val userItemMatrixs = userLikeItemsCollectionRDD.map{userAndItems => 68 | val userItems = userAndItems._2 69 | 70 | /* 71 | val map : Map[Int,Map[Int,Int]] = Map[Int,Map[Int,Int]]() 72 | var am : Map[Int,Int] = Map[Int,Int]() 73 | val a = Map(1 -> 2) 74 | map.put(1,a) 75 | println(map(1)(1)) 76 | * */ 77 | 78 | // 保存用户,每个物品对的矩阵 B 79 | //var itemMatrix = ArrayBuffer[ArrayBuffer[Int]]() 80 | //var itemMatrix = Array.ofDim[Int](55,55) 81 | var userItemMatrix : Map[Int,Map[Int,Int]] = Map[Int,Map[Int,Int]]() 82 | 83 | // 二二配对当前用户的物品, 为每个用户,产出一个 B 矩阵 84 | for (i <- userItems) { 85 | for (j <- userItems) { 86 | // 排除相同的物品 87 | if (i != j) { 88 | // 默认为每个用户 +1 个访问次数 89 | userItemMatrix.put(i,Map(j -> 1)) 90 | } 91 | } 92 | } 93 | userItemMatrix 94 | } 95 | 96 | 97 | println("----- 合并用户矩阵 -----") 98 | 99 | // 合并最终的矩阵 100 | val itemAndItemMatrix = userItemMatrixs.reduce{ (x, y) => 101 | // var curRsMatrix : Map[Int,Map[Int,Int]] = Map[Int,Map[Int,Int]]() 102 | 103 | var curMatrix = x 104 | var nextMatrix = y 105 | 106 | // 合并 2 个 map ,把用户矩阵相加 107 | for ((yK, yV) <- nextMatrix) { 108 | val iKey = yK 109 | val tmp = yV.keySet.toArray 110 | val jKey = tmp(0).toInt 111 | 112 | // 当前 x 的 map key 和 // 子 map 的 key 相同 113 | if (x.contains(iKey) == true && x.get(iKey).get.contains(jKey) == true) { 114 | curMatrix(iKey)(jKey) += 1 115 | } else { 116 | // 追加矩阵 117 | curMatrix.put(iKey,yV) 118 | } 119 | //println("yK:" + yK + " yV:" + yV) 120 | } 121 | 122 | //println("-----") 123 | curMatrix 124 | } 125 | 126 | //println(itemAndItemMatrix.toBuffer) 127 | 128 | println("----- 写到 Hbase -----") 129 | itemAndItemMatrix.foreach{ f => 130 | val invetoryId = f._1 131 | val invetoryRsInfo = this.dismantlingMap(f._2) 132 | val invetoryRsId = invetoryRsInfo._1 133 | val invetoryRsIdCnt = invetoryRsInfo._2 134 | 135 | println(f) 136 | println(invetoryId , invetoryRsId , invetoryRsIdCnt) 137 | println("-----") 138 | val key = invetoryId.toString() + ":" 139 | val value = invetoryRsId.toString() + ":" + invetoryRsIdCnt.toString() 140 | this.resultWriteHBase(key,value) 141 | //this.resultWriteHBase() 142 | 143 | } 144 | 145 | //itemAndItemMatrix.foreach(f => println(f)) 146 | //itemAndItemMatrix.take(10) 147 | 148 | 149 | } 150 | 151 | 152 | // 拆解 Map 返回 153 | def dismantlingMap (data: Map[Int,Int]): (Int, Int) = { 154 | val keys = 
data.keySet.toArray 155 | 156 | (keys(0), data.get(keys(0)).get) 157 | } 158 | 159 | 160 | def resultWriteHBase(rowKey: String, value: String) : Unit = { 161 | //FileUtil.fileOutputStream("/data/log/recommend/result","",false) 162 | FileUtil.fileOutputStream("/data/log/recommend/result",rowKey + value + "\n",true) 163 | //HBaseClientService.insert(rowKey, "inventoryRecommendInventory", "inventoryIds", value) 164 | } 165 | 166 | 167 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/inventory/InventoryItemCFTest.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.inventory 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.SparkConf 6 | import scala.collection.mutable.Map 7 | 8 | import com.angejia.dw.hadoop.hbase.HBaseClient 9 | import com.angejia.dw.common.util.{FileUtil} 10 | 11 | /** 12 | * create 'inventoryRecommend',{NAME=>'inventoryRecommendInventory'} 13 | 14 | 15 | spark-submit \ 16 | --name InventoryIBCF \ 17 | --class com.angejia.dw.recommend.inventory.InventoryIBCF \ 18 | --master local[2] \ 19 | ~/app/recommend/recommend-2.0/target/scala-2.10/recommend-2.0.jar "DataNode01" "inventoryRecommend" "/data/log/recommend/ml-100k/u.data" 20 | 21 | 参数: 22 | [zookeeperIds] [HBaseTableName] [characteristicsFile] 23 | */ 24 | object InventoryItemCFTest { 25 | 26 | var characteristicsFile: String = null 27 | 28 | def main (args: Array[String]) { 29 | 30 | for (ar <- args) { 31 | println(ar) 32 | } 33 | 34 | 35 | // 等待训练的文件 36 | characteristicsFile = args(0) 37 | //characteristicsFile = "/data/log/recommend/recommend_user_inventory_history/00000*" 38 | 39 | this.calculate() 40 | } 41 | 42 | 43 | /** 44 | * 算法逻辑 45 | */ 46 | def calculate() : Unit = { 47 | println("----- 初始化 -----") 48 | val conf = new SparkConf() 49 | conf.setAppName("InventoryIBCF") 50 | conf.setMaster("local[2]") 51 | 52 | val sc = new SparkContext(conf) 53 | 54 | println("----- 加载数据源: " + characteristicsFile + " -----") 55 | // 读取数据源 56 | val sourceDataRDD = sc.textFile(characteristicsFile) 57 | //println(sourceDataRDD.first()) 58 | 59 | 60 | println("----- 归并用户 item 集合 -----") 61 | // 用户喜欢 items 的集合 RDD 62 | val userLikeItemsCollectionRDD = sourceDataRDD.map(line => { 63 | val curLine = line.split("\t").map { x => x.toInt} 64 | (curLine(0), curLine(1)) 65 | }).groupByKey() 66 | 67 | 68 | println("----- 用户物品集合生成矩阵 -----") 69 | // 用户的 Item 矩阵 B 70 | val userItemMatrixs = userLikeItemsCollectionRDD.map{userAndItems => 71 | // 用户喜欢物品的集合 72 | val userItems = userAndItems._2 73 | 74 | /** 75 | * 数据结构 76 | Map("50:57"->1, "57:50"->1, "51:55"->1) 77 | */ 78 | // 保存用户,每个物品对的矩阵 B 79 | val userItemMatrix : Map[String,Int] = Map[String,Int]() 80 | 81 | // 二二配对当前用户的物品, 为每个用户,产出一个 B 矩阵 82 | for (i <- userItems) { 83 | for (j <- userItems) { 84 | // 排除相同的物品 85 | if (i != j) { 86 | // 默认为每个用户 +1 个访问次数 87 | //userItemMatrix += Map(i -> Map(j -> 1)) 88 | val key = i.toString() + ":" + j.toString() 89 | userItemMatrix.put(key ,1) 90 | } 91 | } 92 | } 93 | 94 | userItemMatrix 95 | } 96 | 97 | //userItemMatrixs.take(10) 98 | //exit() 99 | 100 | 101 | println("----- 合并所有用户物品矩阵 -----") 102 | // 合并最终的矩阵 103 | val itemAndItemMatrixRDD = userItemMatrixs.reduce{ (x, y) => 104 | 105 | var curMatrix = x 106 | var nextMatrix = y 107 | 108 | /** 109 | * 目标 : 110 | * 1. 把 curMatrix 和 nextMatrix 相同 key 的值相加 111 | * 2. 
把 nextMatrix 不在 curMatrix 中的原样追加到 curMatrix 112 | */ 113 | for ((yK, yV) <- nextMatrix) { 114 | 115 | if (curMatrix.contains(yK) == true) { 116 | curMatrix(yK) += nextMatrix(yK) 117 | } else { 118 | curMatrix.put(yK,yV) 119 | } 120 | } 121 | 122 | curMatrix 123 | } 124 | //exit() 125 | //println(itemAndItemMatrix.toBuffer) 126 | 127 | 128 | /** 把同类型的物品, 聚合到一起 129 | (51, 51:55:2, 51:52:2, 51:53:2, 51:56:1) 130 | (56, 56:53:1, 56:55:1, 56:52:1,56:51:1) 131 | */ 132 | 133 | println("----- 聚合同类型物品 -----") 134 | val itemAndItemGroupRDD = itemAndItemMatrixRDD.map( f => { 135 | val ids = f._1.split(":") 136 | val invetoryId = ids(0).toString() // 房源 ID 137 | val invetoryRsId = ids(1).toString() // 推荐房源 ID 138 | val invetoryRsIdCnt = f._2.toString() // 共同看过的人数 139 | // 转换成数组 140 | Array(invetoryId, invetoryRsId, invetoryRsIdCnt) 141 | //println(invetoryId, invetoryRsId , invetoryRsIdCnt) 142 | // 把 143 | }).groupBy { 144 | // 然后, 按照 invetoryId 把把同类的房源 ID groupBy 到一起 145 | f => f(0) 146 | } 147 | 148 | val blankLines = sc.accumulator(0) 149 | 150 | println("----- 把聚合后的数据格式化成字符串 -----") 151 | val itemAndItemGroupToStringRDD = itemAndItemGroupRDD.map(line => { 152 | val invetoryId = line._1 153 | val invetoryRsInfo = line._2 154 | // 把里面的 array 按照:组合, 最外层按照,组合 155 | val invetoryRsToString = invetoryRsInfo.map(f => f.mkString(":")).mkString(",") 156 | 157 | blankLines += 1 158 | println("write[" + blankLines + "]: " + invetoryId) 159 | 160 | }) 161 | 162 | 163 | } 164 | 165 | 166 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/inventory/portrait/InventoryPortraitCommon.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.inventory.portrait 2 | 3 | import scala.collection.mutable.HashMap 4 | import scala.collection.mutable.Map 5 | 6 | import com.angejia.dw.common.util.mysql.MysqlClient 7 | import com.angejia.dw.recommend.user.portrait.UserPortraitCommon 8 | import com.angejia.dw.recommend.user.portrait.UserPortraitTags 9 | 10 | object InventoryPortraitCommon { 11 | 12 | // mysql 数据库连接对象 13 | var mysqlClient: MysqlClient = null 14 | 15 | /** 16 | * 通过房源 Id 获取房源画像基础数据 17 | */ 18 | private def getInventoryPortraitByInventoryId(inventoryId: String): HashMap[String, String] = { 19 | var querySql = sqlStmt.format(inventoryId.toInt) 20 | val res = mysqlClient.select(querySql) 21 | val result = new HashMap[String, String]() 22 | if (!res.isEmpty) { 23 | for ((k, v) <- res(0)) { 24 | result.put(k, v.toString) 25 | } 26 | } 27 | result 28 | } 29 | 30 | /** 31 | * 通过房源 Id 获取, 获取标签code 与房源属性的 Mapping 数据 32 | */ 33 | def getUserTagsInventoryMappingByInventoryId(inventoryId: String): Map[String, String] = { 34 | val rs = Map[String, String]() 35 | 36 | val inventoryPortrait = this.getInventoryPortraitByInventoryId(inventoryId) 37 | if (!inventoryPortrait.isEmpty) { 38 | val cityId = inventoryPortrait.getOrElse("city_id", "0") 39 | rs.put(UserPortraitCommon.cityTagCode, cityId) 40 | 41 | val districtId = inventoryPortrait.getOrElse("district_id", "0") 42 | rs.put(UserPortraitCommon.districtTagCode, districtId) 43 | 44 | val blockId = inventoryPortrait.getOrElse("block_id", "0") 45 | rs.put(UserPortraitCommon.blockTagCode, blockId) 46 | 47 | val communityId = inventoryPortrait.getOrElse("community_id", "0") 48 | rs.put(UserPortraitCommon.communityTagCode, communityId) 49 | 50 | val bedrooms = inventoryPortrait.getOrElse("bedrooms", "0") 51 | 
rs.put(UserPortraitCommon.bedroomsTagCode, bedrooms) 52 | 53 | // 价格转换为价格段 54 | val price = inventoryPortrait.getOrElse("price", "0") 55 | val priceTierId = UserPortraitTags.getPriceTier(price) 56 | rs.put(UserPortraitCommon.priceTagCode, priceTierId) 57 | } 58 | 59 | rs 60 | } 61 | 62 | val sqlStmt = """ 63 | SELECT 64 | community.city_id AS city_id 65 | ,community.district_id AS district_id 66 | ,community.block_id AS block_id 67 | ,house.community_id AS community_id 68 | ,inventory.id AS inventory_id 69 | ,inventory.price AS price 70 | ,inventory.area AS area 71 | ,inventory.is_real AS is_real 72 | ,inventory.survey_status AS survey_status 73 | ,inventory.source AS source 74 | ,inventory.has_checked AS has_checked 75 | ,inventory.created_at AS created_at 76 | ,inventory.updated_at AS updated_at 77 | ,inventory.verify_status AS verify_status 78 | ,inventory.status AS status 79 | ,house.orientation AS orientation 80 | ,property.bedrooms AS bedrooms 81 | ,house.floor AS floor 82 | ,house.total_floors AS total_floors 83 | FROM 84 | property.inventory AS inventory 85 | LEFT JOIN property.property AS property on inventory.property_id = property.id 86 | LEFT JOIN property.house AS house on property.house_id = house.id 87 | LEFT JOIN angejia.community AS community on house.community_id = community.id 88 | WHERE inventory.id = %d 89 | """ 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/inventory/portrait/MarketingInventoryPortrait.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.inventory.portrait 2 | 3 | import scala.collection.mutable.Map 4 | import scala.collection.mutable.HashMap 5 | import com.angejia.dw.hadoop.hbase.HBaseClient 6 | 7 | import com.angejia.dw.recommend.user.portrait.UserPortraitCommon 8 | import com.angejia.dw.recommend.user.portrait.UserPortraitTags 9 | import com.angejia.dw.common.util.mysql.MysqlClient 10 | 11 | object MarketingInventoryPortrait { 12 | /** 13 | * 通过房源 Id 获取房源画像基础数据 14 | */ 15 | private def getInventoryPortraitByInventoryId(inventoryId: String): HashMap[String, String] = { 16 | var querySql = sqlStmt.format(inventoryId.toInt) 17 | val res = UserPortraitCommon.mysqlClient.select(querySql) 18 | val result = new HashMap[String, String]() 19 | if (!res.isEmpty) { 20 | for ((k, v) <- res(0)) { 21 | result.put(k, v.toString) 22 | } 23 | } 24 | result 25 | } 26 | 27 | /** 28 | * 通过房源 Id 获取, 获取标签code 与房源属性的 Mapping 数据 29 | */ 30 | def getUserTagsInventoryMappingByInventoryId(inventoryId: String): Map[String, String] = { 31 | val rs = Map[String, String]() 32 | 33 | val inventoryPortrait = this.getInventoryPortraitByInventoryId(inventoryId) 34 | if (!inventoryPortrait.isEmpty) { 35 | val cityId = inventoryPortrait.getOrElse("city_id", "0") 36 | rs.put(UserPortraitCommon.cityTagCode, cityId) 37 | 38 | val districtId = inventoryPortrait.getOrElse("district_id", "0") 39 | rs.put(UserPortraitCommon.districtTagCode, districtId) 40 | 41 | val blockId = inventoryPortrait.getOrElse("block_id", "0") 42 | rs.put(UserPortraitCommon.blockTagCode, blockId) 43 | 44 | val communityId = inventoryPortrait.getOrElse("community_id", "0") 45 | rs.put(UserPortraitCommon.communityTagCode, communityId) 46 | 47 | val bedrooms = inventoryPortrait.getOrElse("bedrooms", "0") 48 | rs.put(UserPortraitCommon.bedroomsTagCode, bedrooms) 49 | 50 | // 价格转换为价格段 51 | val price = inventoryPortrait.getOrElse("price", "0") 52 | val priceTierId 
= UserPortraitTags.getPriceTier(price) 53 | rs.put(UserPortraitCommon.priceTagCode, priceTierId) 54 | } 55 | 56 | rs 57 | } 58 | 59 | val sqlStmt = """ 60 | SELECT 61 | city_id 62 | , district_id 63 | , block_id 64 | , community_id 65 | , id 66 | , price 67 | , area 68 | , publish_time AS created_at 69 | , orientation 70 | , bedrooms 71 | , floor 72 | , total_floors 73 | FROM angejia.marketing_inventory 74 | WHERE id = %d 75 | """ 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/user/UserUBCF.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.user 2 | 3 | import scala.collection.mutable.Map 4 | import scala.collection.mutable.HashMap 5 | import scala.collection.mutable.ListBuffer 6 | 7 | import org.apache.log4j.{ Level, Logger } 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.SparkConf 10 | 11 | import com.angejia.dw.recommend.Conf 12 | import com.angejia.dw.hadoop.hbase.HBaseClient 13 | import com.angejia.dw.recommend.UBCF 14 | 15 | /** 16 | * UBCF 算法实现 17 | * 18 | * create 'userUBCF',{NAME=>'relation'},{NAME=>'recommend'},{NAME=>'baseInfo'} 19 | * relation 用户关系 20 | * recommend 用户推荐 21 | */ 22 | object UserUBCF { 23 | 24 | // hbase 数据表 25 | var hbaseResultTb: HBaseClient = null 26 | 27 | // 等待训练的数据文件 28 | var characteristicsFile: String = null 29 | 30 | // 文件分隔符 31 | var separator = "\t" 32 | 33 | def main(args: Array[String]) { 34 | 35 | for (ar <- args) { 36 | println(ar) 37 | } 38 | 39 | val env = args(0) 40 | this.init(env) 41 | 42 | this.characteristicsFile = args(1) 43 | 44 | //this.separator = " " 45 | //this.characteristicsFile = "/data/log/recommend/recommend_user_inventory_history/mytest" 46 | 47 | this.calculate() 48 | } 49 | 50 | /** 51 | * 初始化 52 | * env: dev 开发环境, online 线上环境 53 | */ 54 | def init(env: String): Unit = { 55 | Conf.setEnv(env) 56 | 57 | // 连接 userUBCF 数据表 58 | this.hbaseResultTb = new HBaseClient("userUBCF", Conf.getZookeeperQuorum()) 59 | } 60 | 61 | def calculate(): Unit = { 62 | /** 63 | * 初始化 spark 64 | */ 65 | val sparkConf = new SparkConf() 66 | sparkConf.setAppName("UserUBCF") 67 | sparkConf.setMaster("local[2]") 68 | 69 | /** 70 | * 初始化推荐模型 71 | */ 72 | val userUBCF = new UBCF() 73 | 74 | // user -> uesr 相似度矩阵 75 | val userAndUserMatrixCollection = userUBCF.calculateByFile(sparkConf, characteristicsFile, separator) 76 | 77 | // 根据 userId , groupBy userRelationMatrix 78 | val userRelationGroup = userUBCF.userRelationMatrixGroupByUserId(userAndUserMatrixCollection) 79 | 80 | // user -> items 集合 81 | val userItemsCollectionRDD = userUBCF.userAndItemsCollection() 82 | 83 | println("----- userId -> items 集合本地化 ") 84 | val userItemsCollectionMap: scala.collection.immutable.Map[String, Iterable[(String, Int)]] = userItemsCollectionRDD.collect().toMap 85 | 86 | println("----- Start : 基于 userRelationGroup 集合, 持久化数据到 Hbase ----- \n") 87 | 88 | println("----- user 关联结果, user 推荐结果, 持久化到 Hbase Table ") 89 | var userRelationMatrixLineNum = 0 // user 关系矩阵行数 90 | var userRecommendLineNum = 0 // user 推荐行数 91 | 92 | // 遍历 user -> users 相似度集合 93 | userRelationGroup.foreach { userRelationInfo => 94 | 95 | val curUserId = userRelationInfo._1 // userId 96 | val curUserRelationUsers = userRelationInfo._2 // 关联的 users 集合 97 | 98 | // ---------- Start user -> user 集合关系保存到 Hbase ---------- 99 | 100 | val userRelationToString = curUserRelationUsers.map(userRelation => 
userRelation.mkString(":")).mkString(",") 101 | this.userRelationMatrixWriteHbase(curUserId, userRelationToString) 102 | 103 | // ---------- End user -> user 集合关系保存到 Hbase ---------- 104 | 105 | // ---------- Start 为 user 推荐 items ---------- 106 | 107 | // 保存当前 user 最终推荐的 items 108 | val userRecommendItemsRs = ListBuffer[Array[String]]() 109 | //val userRecommendItems = ListBuffer[Map[String,String]]() 110 | 111 | // 当前 user 已经推荐的 item Ids , 用来保存已经存在 itemIds 112 | //val userRecommendItemsPool = ListBuffer[String]() 113 | val userRecommendItemsPool = Map[String, String]() 114 | 115 | // 当前 user 自身的 items 116 | val curUserItems = userItemsCollectionMap.getOrElse(curUserId, null) 117 | if (curUserItems != null) { 118 | curUserItems.foreach { itemInfo => 119 | val itemId = itemInfo._1 120 | val itemPf = itemInfo._2 121 | 122 | // 组合推荐结果 123 | val rsInfo = Array( 124 | curUserId, // 当期 userId 125 | "0", // 关联 userId (因为是自身的所以用 0 表示) 126 | "0", // 关联 user 相似度分数 (因为是自身的所以用 0 表示) 127 | itemId.toString(), // 关联 user ItemId 128 | itemPf.toString() // 关联 user item 喜欢次数 129 | ) 130 | 131 | userRecommendItemsRs.append(rsInfo) 132 | } 133 | } 134 | 135 | // 当前 user Relation user 下的 items 集合, 把当前 user items 不存在的 item 追加进去, 最终汇总后作为推荐结果 136 | curUserRelationUsers.foreach { relationUserInfo => 137 | val relationUserId = relationUserInfo.apply(1) // 关联 uesrId 138 | val relationUserPf = relationUserInfo.apply(2) // 关联 user 的相似度分数 139 | 140 | // 相似度分数大于 1 才会推荐 141 | if (relationUserPf.toInt > 1) { 142 | // 关联 user 的 items 143 | val relationUserItems = userItemsCollectionMap.getOrElse(relationUserId, null) 144 | if (relationUserItems != null) { 145 | 146 | relationUserItems.foreach { itemInfo => 147 | val itemId = itemInfo._1 148 | val itemPf = itemInfo._2 149 | 150 | // 若推荐的 items 已经存在, 则不推荐了 151 | if (!userRecommendItemsPool.contains(itemId.toString())) { 152 | // 组合推荐结果 153 | val rsInfo = Array( 154 | curUserId, // 当期 userId 155 | relationUserId, // 关联 userId 156 | relationUserPf, // 关联 user 相似度分数 157 | itemId.toString(), // 关联 user ItemId 158 | itemPf.toString() // 关联 user item 喜欢次数 159 | ) 160 | 161 | // userRecommendItemsPool.append(itemId.toString()) 162 | userRecommendItemsPool.put(itemId.toString(), "exist") 163 | 164 | userRecommendItemsRs.append(rsInfo) 165 | } 166 | // println(rsInfo.toBuffer) 167 | } 168 | } 169 | } 170 | } 171 | 172 | // 当前推荐结果转换成 字符串 173 | val userRecommendItemsToString = userRecommendItemsRs.map(recommendItemInfo => recommendItemInfo.mkString(":")).mkString(",") 174 | // println(curUserId, userRecommendItemsRs.size) 175 | 176 | // userRecommendItemsRs.foreach{ f => println(f.toBuffer) } 177 | this.userRecommendWriteHbase(curUserId, userRecommendItemsToString) 178 | 179 | // ---------- End 为 user 推荐 items ---------- 180 | if (!userRecommendItemsRs.isEmpty) userRecommendLineNum += 1 181 | userRelationMatrixLineNum += 1 182 | } 183 | println("") 184 | println("----- HBase Table userUBCF: ", 185 | " userRelationMatrixLineNum 写入了: " + userRelationMatrixLineNum + " 行", 186 | " userRecommendLineNum 写入了: " + userRecommendLineNum + " 行") 187 | println("") 188 | 189 | println("----- End : 基于 userRelationGroup 集合, 持久化数据到 Hbase -----") 190 | 191 | } 192 | 193 | /** 194 | * uesr 关系矩阵, 写入到 Hbase 195 | */ 196 | def userRelationMatrixWriteHbase(rowKey: String, value: String): Unit = { 197 | this.hbaseResultTb.insert(rowKey, "relation", "userRelation", value) 198 | } 199 | 200 | /** 201 | * 保存推荐结果到 Hbase 202 | */ 203 | def userRecommendWriteHbase(rowKey: String, value: String): Unit = { 204 | 
this.hbaseResultTb.insert(rowKey, "recommend", "userRecommend", value) 205 | } 206 | 207 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/user/portrait/UserPortraitBrowse.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.user.portrait 2 | 3 | import scala.collection.mutable.ArrayBuffer 4 | import scala.collection.mutable.HashMap 5 | import scala.collection.mutable.Map 6 | 7 | import com.angejia.dw.common.util.DateUtil 8 | import com.angejia.dw.common.util.RegexUtil 9 | 10 | /** 11 | * 浏览房源数据行为 12 | * 根据 URL 找出 房源 Id 13 | * 1. 查找 Hbase 房源 ID 的,城市,区域,板块,小区,户型,所属的价格区间段 14 | * 2. 读取 Hbase 中的以上标签原始 Json 数据 15 | * 3. 对找出的标签进行 浏览加分后写回 Hbase 16 | */ 17 | object UserPortraitBrowse { 18 | 19 | val actionName = "Browse" 20 | 21 | // 当前处理的 userId 22 | var userId: String = new String() 23 | 24 | def setUserId(userId: String): Unit = { 25 | this.userId = userId 26 | } 27 | 28 | def getUserId(): String = { 29 | this.userId 30 | } 31 | 32 | // 请求的 URI 33 | var requestUri = new String() 34 | def setRequestUri(uri: String): Unit = { 35 | this.requestUri = uri 36 | } 37 | def getRequestUri(): String = { 38 | this.requestUri 39 | } 40 | 41 | var inventoryId = new String() 42 | 43 | // 房源单页 uri 正则匹配 44 | val browseRegex = """^/mobile/member/(?:inventories|inventory/detail)/\d+/(\d+)""" 45 | 46 | /** 47 | * 本行为中用到的用户标签 48 | */ 49 | var userTags = Map[String, String]() 50 | 51 | def run(): String = { 52 | // 推荐状态 53 | var reStatus = "no" 54 | 55 | // 清空 56 | this.inventoryId = "" 57 | 58 | /** 59 | * 解析出 Url 中的article id 60 | */ 61 | val articleId = RegexUtil.findStrData(this.browseRegex, this.getRequestUri()) 62 | 63 | if (articleId.isEmpty() || articleId == "") return reStatus 64 | 65 | /* 66 | * 根据article获取inventory id和resource 67 | * 若resource为1,为安个家二手房 68 | * 若resource为2,则为营销房源 69 | */ 70 | val sql = "SELECT inventory_id,resource " + 71 | "FROM angejia.article " + 72 | "WHERE id = " + articleId 73 | val article: ArrayBuffer[HashMap[String, Any]] = UserPortraitCommon.mysqlClient.select(sql) 74 | 75 | if (article.length != 1) { 76 | return reStatus 77 | } 78 | 79 | inventoryId = article(0).getOrElse("inventory_id", "").toString 80 | val resource = article(0).getOrElse("resource", "").toString 81 | 82 | this.userTags = Map[String, String]( 83 | UserPortraitCommon.cityTagCode -> new String(), 84 | UserPortraitCommon.districtTagCode -> new String(), 85 | UserPortraitCommon.blockTagCode -> new String(), 86 | UserPortraitCommon.communityTagCode -> new String(), 87 | UserPortraitCommon.bedroomsTagCode -> new String(), 88 | UserPortraitCommon.priceTagCode -> new String()) 89 | 90 | if (resource == "1") { 91 | // 安个家房源 92 | println(DateUtil.getCurTime(DateUtil.SIMPLE_FORMAT) + "|" 93 | + getUserId() + ": UserPortraitBrowse", inventoryId, this.getRequestUri(), "angejia") 94 | this.updateUserNeedsByInventoryId(inventoryId) 95 | this.scoreByInventoryId(inventoryId) 96 | } else { 97 | // 营销房源 98 | println(DateUtil.getCurTime(DateUtil.SIMPLE_FORMAT) + "|" 99 | + getUserId() + ": UserPortraitBrowse", inventoryId, this.getRequestUri(), "marketing") 100 | this.updateUserNeedsByMarketingInventoryId(inventoryId) 101 | this.scoreByMarketingInventoryId(inventoryId) 102 | } 103 | 104 | reStatus = "yes" 105 | reStatus 106 | } 107 | 108 | /** 109 | * 一组标签进行合并(安个家房源) 110 | */ 111 | private def updateUserNeedsByInventoryId(inventoryId: String): Unit = { 112 | val inventoryIds = 
Array(inventoryId) 113 | 114 | // 合并 115 | UserPortraitNeeds.setUserId(this.getUserId()) 116 | UserPortraitNeeds.userNeedsMergeByInventoryIds(inventoryIds) 117 | } 118 | 119 | /** 120 | * 一组标签进行合并(营销房源) 121 | */ 122 | private def updateUserNeedsByMarketingInventoryId(inventoryId: String): Unit = { 123 | val inventoryIds = Array(inventoryId) 124 | 125 | // 合并 126 | UserPortraitNeeds.setUserId(this.getUserId()) 127 | UserPortraitNeeds.userNeedsMergeByMarketingInventoryIds(inventoryIds) 128 | } 129 | 130 | /** 131 | * 打分入口(安个家房源) 132 | */ 133 | def scoreByInventoryId(inventoryId: String): Unit = { 134 | UserPortraitTags.setUserId(this.getUserId()) 135 | UserPortraitTags.scoreByInventoryIdAndAction(inventoryId, this.actionName) 136 | } 137 | 138 | /** 139 | * 打分入口(营销房源) 140 | */ 141 | def scoreByMarketingInventoryId(inventoryId: String): Unit = { 142 | UserPortraitTags.setUserId(this.getUserId()) 143 | UserPortraitTags.scoreByMarketingInventoryIdAndAction(inventoryId, this.actionName) 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/user/portrait/UserPortraitCommon.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.user.portrait 2 | 3 | import scala.collection.mutable.Map 4 | import scala.collection.mutable.HashMap 5 | 6 | import javax.crypto.Cipher 7 | import javax.crypto.spec.IvParameterSpec 8 | import javax.crypto.spec.SecretKeySpec 9 | import sun.misc.BASE64Decoder 10 | 11 | import com.angejia.dw.hadoop.hbase.HBaseClient 12 | import com.angejia.dw.common.util.mysql.MysqlClient 13 | import com.angejia.dw.hadoop.hive.HiveClient 14 | 15 | import com.angejia.dw.common.util.JsonUtil 16 | 17 | object UserPortraitCommon { 18 | 19 | // mysql 数据库连接对象 20 | var mysqlClient: MysqlClient = null 21 | 22 | // spark 通过 thriftServer 连接 hive 数据仓库的对象 23 | var sparkHiveClient: HiveClient = null 24 | 25 | // 用户画像表连接对象 26 | var userPortraitTable: HBaseClient = null 27 | 28 | // 用户画像标签 列族 29 | val TagColumnFamily = "tags" 30 | 31 | // 用户画像维度 列族 32 | val DimColumnFamily = "dimension" 33 | 34 | // 用户画像需求 列族 35 | val NeedsColumnFamily = "needs" 36 | 37 | // 用户画像推荐 列族(保存推荐信息) 38 | val ModelStateColumnFamily = "modelState" 39 | 40 | // 标签配置! 
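    // Non-executed sketch of the assumed shape of one tag config entry in
    // UserPortraitTagConf (the key names "TagCode" and "filterScore" appear
    // elsewhere in this package; the concrete values here are illustrative only):
    //
    //   val CITY_TAG = Map(
    //       "TagCode"     -> "cityId", // HBase column qualifier for the tag
    //       "filterScore" -> "1"       // score granted by a filter action
    //   )
    //
    // The vals below unpack "TagCode" from each config map, plus the id-to-value
    // mapping tables for the bedrooms and price tags.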
41 |     // city
42 |     val cityTagConf = UserPortraitTagConf.CITY_TAG // tag config
43 |     val cityTagCode = cityTagConf.getOrElse("TagCode", "") // tag code
44 | 
45 |     // district
46 |     val districtTagConf = UserPortraitTagConf.DISTRICT_TAG
47 |     val districtTagCode = districtTagConf.getOrElse("TagCode", "")
48 | 
49 |     // block
50 |     val blockTagConf = UserPortraitTagConf.BLOCK_TAG
51 |     val blockTagCode = blockTagConf.getOrElse("TagCode", "")
52 | 
53 |     // community
54 |     val communityTagConf = UserPortraitTagConf.COMMUNITY_TAG
55 |     val communityTagCode = communityTagConf.getOrElse("TagCode", "")
56 | 
57 |     // bedrooms (floor plan)
58 |     val bedroomsTagConf = UserPortraitTagConf.BEDROOMS_TAG
59 |     val bedroomsTagCode = bedroomsTagConf.getOrElse("TagCode", "").toString()
60 |     // mapping from bedroom bucket id to bedroom count
61 |     val bedroomsType = bedroomsTagConf.get("bedroomsType").get.asInstanceOf[collection.Map[String, String]]
62 | 
63 |     // price
64 |     val priceTagConf = UserPortraitTagConf.PRICE_TAG
65 |     val priceTagCode = priceTagConf.getOrElse("TagCode", "").toString()
66 |     // price tiers
67 |     val priceTier = priceTagConf.get("PriceTier").get.asInstanceOf[collection.Map[String, String]]
68 |     // mapping from price bucket id to price tier
69 |     val priceTierType = priceTagConf.get("PriceTierType").get.asInstanceOf[collection.Map[String, String]]
70 | 
71 |     /**
72 |      * Fetch the "tags" column family of a user's portrait
73 |      * userId: user ID
74 |      * returns one column per tag code
75 |      */
76 |     def getUserPortraitTagsByUserId(userId: String): HashMap[String, String] = {
77 | 
78 |         val hbaseData: HashMap[String, String] = UserPortraitCommon.userPortraitTable.select(
79 |             userId,
80 |             UserPortraitCommon.TagColumnFamily,
81 |             Array(UserPortraitCommon.cityTagCode,
82 |                 UserPortraitCommon.districtTagCode,
83 |                 UserPortraitCommon.blockTagCode,
84 |                 UserPortraitCommon.communityTagCode,
85 |                 UserPortraitCommon.bedroomsTagCode,
86 |                 UserPortraitCommon.priceTagCode))
87 | 
88 |         hbaseData
89 |     }
90 | 
91 |     /**
92 |      * Fetch the "dimension" column family of a user's portrait
93 |      * userId: user ID
94 |      * one column per dimension listed below
95 |      */
96 |     def getUserPortraitDimByUserId(userId: String): HashMap[String, String] = {
97 | 
98 |         val hbaseData: HashMap[String, String] = UserPortraitCommon.userPortraitTable.select(
99 |             userId,
100 |             UserPortraitCommon.DimColumnFamily,
101 |             Array( //"userDemand", // legacy demand dimension
102 |                 "memberDemand", // member demand form
103 |                 "likeInventorys", // liked (favorited) inventories
104 |                 "visitItemInventorys", // inventories visited with an agent
105 |                 "linkInventorys" // inventories the user was linked to
106 |             ))
107 | 
108 |         hbaseData
109 |     }
110 | 
111 |     /**
112 |      * Fetch the "needs" column family of a user's portrait
113 |      * userId: user ID
114 |      * contains the accumulated action needs
115 |      */
116 |     def getUserPortraitNeedsByUserId(userId: String): HashMap[String, String] = {
117 | 
118 |         val hbaseData: HashMap[String, String] = UserPortraitCommon.userPortraitTable.select(
119 |             userId,
120 |             UserPortraitCommon.NeedsColumnFamily, // column family
121 |             Array("actionNeeds" // columns to fetch
122 |             ))
123 | 
124 |         hbaseData
125 |     }
126 | 
127 |     /**
128 |      * Fetch the "modelState" (modeling state) column family of a user's portrait
129 |      */
130 |     def getUserPortraitModelStateByUserId(userId: String): HashMap[String, String] = {
131 | 
132 |         val hbaseData: HashMap[String, String] = UserPortraitCommon.userPortraitTable.select(
133 |             userId,
134 |             UserPortraitCommon.ModelStateColumnFamily, // column family
135 |             // columns to fetch
136 |             Array("visitItemInventorysRecord", // last processed accompanied-visit record
137 |                 "linkInventorysRecord", // last processed link record
138 |                 "memberDemandTime" // last processed demand-form change
139 |             ))
140 | 
141 |         hbaseData
142 |     }
143 | 
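    // A non-executed usage sketch tying the getters above to mapKeyDefaultValue
    // below ("42" is an illustrative userId). Cells absent from the HBase row can
    // surface here as missing keys or null values, which is exactly what
    // mapKeyDefaultValue normalizes:
    //
    //   val tags   = UserPortraitCommon.getUserPortraitTagsByUserId("42")
    //   val cityId = UserPortraitCommon.mapKeyDefaultValue(tags, UserPortraitCommon.cityTagCode, "0")
    //
144 |     /**
145 |      * Look up `key` in `map`, falling back to `default` when the key is missing or its value is null
146 |      */
147 |     def mapKeyDefaultValue(map: HashMap[String, String], key: String, default: String = ""): String = {
148 |         var rs: String = ""
149 |         if (map.contains(key)) {
150 |             if (map.getOrElse(key, null) == null) {
151 |                 rs = default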
152 | } else { 153 | rs = map.get(key).get.toString() 154 | } 155 | } else { 156 | rs = default 157 | } 158 | rs 159 | } 160 | 161 | /** 162 | * 解密 auth 163 | */ 164 | def Decrypt(data: String): String = { 165 | try { 166 | val key = "12345678123456xx" 167 | val iv = "12345678123456xx" 168 | 169 | val encrypted1 = new BASE64Decoder().decodeBuffer(data) 170 | 171 | val cipher = Cipher.getInstance("AES/CBC/NoPadding"); 172 | val keyspec = new SecretKeySpec(key.getBytes(), "AES"); 173 | val ivspec = new IvParameterSpec(iv.getBytes()); 174 | 175 | cipher.init(Cipher.DECRYPT_MODE, keyspec, ivspec); 176 | val original = cipher.doFinal(encrypted1) 177 | val originalString = new String(original) 178 | return originalString; 179 | 180 | } catch { 181 | case e: Exception => 182 | e.printStackTrace(); 183 | return ""; 184 | } 185 | } 186 | 187 | /** 188 | * Json 转换成 Map, 不可变的 Map 189 | */ 190 | def jsonStrToMap(jsonString: String): Map[String, Object] = { 191 | JsonUtil.smartJsonStrToMap(jsonString) 192 | } 193 | 194 | /** 195 | * Map 转换成 String, 不可变的 Map 196 | */ 197 | def mapToJsonStr(map: Map[String, Object]): String = { 198 | JsonUtil.smartMapToJsonStr(map) 199 | } 200 | 201 | /** 202 | * 两层 json 转换 -> 可变的 Map 203 | * jsonStr : 204 | * {} 205 | * 或者 206 | * { 207 | * "x": {"a":"1","b":"2"}, 208 | * "y": {"c":"3","d":"4"} 209 | * } 210 | * 211 | * return : 212 | * Map( 213 | * "x" => Map("a"=> 1, "b"=> 2), 214 | * "y" => Map("c"=> 3, "d"=> 4) 215 | * ) 216 | */ 217 | def jsonStrToMapByTwolayers(jsonStr: String): Map[String, Map[String, Object]] = { 218 | // 返回的是一个 Map[String, Object] , Object = Map[String, String] 219 | val baseMap = JsonUtil.playJsonToMap(jsonStr) 220 | // json 转换为可变 map 221 | val mapChildToVariable = baseMap.map { 222 | case (k, v) => 223 | val curK = k 224 | // 把 v 转换为 Map[String,String] 225 | val curV = v.asInstanceOf[scala.collection.immutable.Map[String, Object]] 226 | // 再把 map 转换为可变 Map 227 | val formatV = scala.collection.mutable.Map(curV.toSeq: _*) 228 | k -> formatV 229 | } 230 | // 转变 map 数据 231 | val mapVariable = collection.mutable.Map(mapChildToVariable.toSeq: _*).asInstanceOf[scala.collection.mutable.Map[String, Map[String, Object]]] 232 | mapVariable 233 | } 234 | 235 | /** 236 | * 两层 map 转换 -> 为 json 237 | */ 238 | def mapToJsonStrByTwolayers(mapData: Map[String, Map[String, Object]]): String = { 239 | val mapDataToMap = mapData.map { 240 | case (k, v) => k -> v.toMap // 转换成不可变 Map 241 | }.toMap 242 | // 转换为 json 字符串 243 | val mapToStr = JsonUtil.playMapToJson(mapDataToMap) 244 | mapToStr 245 | } 246 | } 247 | -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/user/portrait/UserPortraitFilter.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.user.portrait 2 | 3 | import scala.collection.mutable.Map 4 | import scala.collection.mutable.ListBuffer 5 | 6 | import com.angejia.dw.common.util.JsonUtil 7 | import com.angejia.dw.common.util.RegexUtil 8 | 9 | /** 10 | * 筛选房源逻辑处理 11 | * 根据 URL - 找出 城市,区域,板块,户型,价格区间(转换成价格段) 12 | * 1. 收集 url 出现的以上标签, 13 | * 比如: val cityIds = Set(1) 14 | * 2. 读取 Hbase 中的以上标签原始 Json 数据 15 | * 3. 
对找出的标签进行 筛选逻辑加分后写回 Hbase 16 | */ 17 | object UserPortraitFilter { 18 | 19 | val actionName = "Filter" 20 | 21 | // 当前处理的 userId 22 | var userId: String = new String() 23 | def setUserId(userId: String): Unit = { 24 | this.userId = userId 25 | } 26 | def getUserId(): String = { 27 | this.userId 28 | } 29 | 30 | // 请求的 URI 31 | var requestUri = new String() 32 | def setRequestUri(uri: String): Unit = { 33 | this.requestUri = uri 34 | } 35 | def getRequestUri(): String = { 36 | this.requestUri 37 | } 38 | 39 | /** 40 | * 本行为中用到的用户标签 41 | */ 42 | var userTags = Map[String, String]() 43 | 44 | // URI 搜索匹配正则 45 | val filterRegex = "/mobile/member/inventories/list[?](.*)" 46 | 47 | def run(): String = { 48 | // 推荐状态 49 | var reStatus = "no" 50 | 51 | /** 52 | * 解析出 Url 中的筛选字段数据 53 | */ 54 | val urlPars = RegexUtil.findStrData(this.filterRegex, this.getRequestUri()) 55 | if (urlPars.isEmpty()) return reStatus 56 | println(getUserId() + ": UserPortraitFilter ", this.getRequestUri()) 57 | 58 | this.userTags = Map[String, String]( 59 | UserPortraitCommon.cityTagCode -> new String(), 60 | UserPortraitCommon.districtTagCode -> new String(), 61 | UserPortraitCommon.blockTagCode -> new String(), 62 | UserPortraitCommon.communityTagCode -> new String(), 63 | UserPortraitCommon.bedroomsTagCode -> new String(), 64 | UserPortraitCommon.priceTagCode -> new String()) 65 | 66 | /** 67 | * 处理 url 中出现的标签 68 | */ 69 | urlPars.split("&").foreach { keyValueStr => 70 | 71 | val keyValue = keyValueStr.split("=") 72 | // 表示一对 key,value 73 | if (keyValue.size == 2) { 74 | 75 | val key: String = keyValue(0).toString() 76 | val value: String = keyValue(1).toString() 77 | 78 | key match { 79 | case "city_id" => { 80 | userTags.update(UserPortraitCommon.cityTagCode, value) 81 | } 82 | case "district_id" => { 83 | userTags.update(UserPortraitCommon.districtTagCode, value) 84 | } 85 | case "block_id" => { 86 | userTags.update(UserPortraitCommon.blockTagCode, value) 87 | } 88 | case "community_id" => { 89 | userTags.update(UserPortraitCommon.communityTagCode, value) 90 | } 91 | case "bedroom_id" => { 92 | // 通过户型 key 找到实际户型 93 | val bedrooms = UserPortraitCommon.bedroomsType.getOrElse(value, "0").toString() 94 | userTags.update(UserPortraitCommon.bedroomsTagCode, bedrooms) 95 | } 96 | case "price_id" => { 97 | // 通过价格段 key 找到实际的户型 98 | val priceTierId = UserPortraitCommon.priceTierType.getOrElse(value, "0").toString() 99 | userTags.update(UserPortraitCommon.priceTagCode, priceTierId) 100 | } 101 | case _ => "filter nothing" 102 | } 103 | 104 | } 105 | } 106 | 107 | this.userNeeds() 108 | 109 | this.score() 110 | 111 | reStatus = "yes" 112 | reStatus 113 | } 114 | 115 | /** 116 | * 一组标签进行合并 117 | */ 118 | def userNeeds(): Unit = { 119 | val uesrActions = ListBuffer[Map[String, String]]() 120 | uesrActions.append(userTags) 121 | 122 | // 对标签动作进行累加 123 | UserPortraitNeeds.setUserId(this.getUserId()) 124 | UserPortraitNeeds.userActionNeedsMergeAction(uesrActions) 125 | } 126 | 127 | /** 128 | * 标签打分 129 | */ 130 | def score(): Unit = { 131 | 132 | // 设置操作标签的用户 133 | UserPortraitTags.setUserId(this.getUserId()) 134 | 135 | // 城市标签打分 136 | val cityId = userTags.getOrElse(UserPortraitCommon.cityTagCode, "0") 137 | UserPortraitTags.cityTag(Set(cityId), Set(), UserPortraitCommon.cityTagConf.getOrElse("filterScore", "0")) 138 | 139 | // 区域标签打分 140 | val districtId = userTags.getOrElse(UserPortraitCommon.districtTagCode, "0") 141 | UserPortraitTags.districtTag(Set(districtId), Set(), 
--------------------------------------------------------------------------------
/src/main/scala/com/angejia/dw/recommend/user/portrait/UserPortraitLinkInventory.scala:
--------------------------------------------------------------------------------
package com.angejia.dw.recommend.user.portrait

import scala.collection.mutable.HashMap
import scala.collection.mutable.Map

import com.angejia.dw.common.util.DateUtil

/**
 * Inventories the user has been linked (connected) with.
 * 1. Fetch the inventories the user was linked with
 * 2. Score those inventories
 */
object UserPortraitLinkInventory {

    val actionName = "LinkInventory"

    def run(userId: String, date: String): String = {
        var userPortraitLinkInventory = new UserPortraitLinkInventory()
        userPortraitLinkInventory.setUserId(userId)
        userPortraitLinkInventory.setDwDate(date)
        var reStatus = userPortraitLinkInventory.run()

        userPortraitLinkInventory = null
        reStatus
    }
}
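// ---------------------------------------------------------------------------
// Editor's sketch (not part of the original source): a hypothetical call to the
// companion entry point above. The userId and partition date are made up, and a
// real run needs the Hive/HBase environment wired up behind UserPortraitCommon.
// ---------------------------------------------------------------------------
object UserPortraitLinkInventoryRunSketch {
    def main(args: Array[String]): Unit = {
        // "yes" when new linked inventories were merged and scored, otherwise "no"
        val status = UserPortraitLinkInventory.run("760198", "2016-11-23")
        println(status)
    }
}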
/**
 * Processing flow
 */
class UserPortraitLinkInventory {

    // HBase column name: dimension:linkInventorys
    val column: String = "linkInventorys"

    // userId currently being processed
    var userId: String = new String()
    def setUserId(userId: String): Unit = {
        if (userId.isEmpty()) {
            return
        }
        this.userId = userId
    }
    def getUserId(): String = {
        if (this.userId.isEmpty()) {
            return "0"
        }
        this.userId
    }

    // Partition date
    var dwDate: String = new String()
    def setDwDate(date: String): Unit = {
        this.dwDate = date
    }
    def getDwDate(): String = {
        this.dwDate
    }

    /**
     * Run the whole flow
     */
    def run(): String = {
        // Result status
        var reStatus = "no"

        // Today's date
        val offsetDate = DateUtil.getCalendarOffsetDateDay(0) // get today's date
        val todayYmd = DateUtil.DateToString(offsetDate, DateUtil.SIMPLE_Y_M_D_FORMAT) // format it

        // Today's modeling state
        val modelState = this.getModelStateByDate(todayYmd)
        if (modelState == true) return reStatus

        // Latest linked-inventory data
        val newLinkInventoryIds = this.getNewLinkInventoryIds()
        // No link data: record today's modeling state and exit
        if (newLinkInventoryIds.isEmpty) {
            this.saveModelStateByDate(todayYmd)
            return reStatus
        }

        // Previously stored linked-inventory data
        val linkInventoryIds = this.getLinkInventoryIds()

        // Detect newly added inventory ids
        val diffInventoryIds = this.diffInventoryIds(newLinkInventoryIds, linkInventoryIds)

        // Nothing changed: record today's modeling state and exit
        if (diffInventoryIds.isEmpty) {
            this.saveModelStateByDate(todayYmd)
            return reStatus
        }
        println(getUserId() + ": UserPortraitLinkInventory ", diffInventoryIds.mkString(","))

        // Merge the need tags
        this.userNeeds(diffInventoryIds)

        // Score
        this.score(diffInventoryIds)

        // Write the latest data back to HBase
        this.updateInventorysToHbase(newLinkInventoryIds)

        // Everything succeeded: save today's modeling state
        this.saveModelStateByDate(todayYmd)

        // Return
        reStatus = "yes"
        reStatus
    }

    /**
     * Fetch the user's latest link data from Hive
     */
    def getNewLinkInventoryIds(): Map[String, Object] = {
        var rs: Map[String, Object] = Map[String, Object]()

        // Read the linked inventories of this user
        val querySql = "SELECT link_invs_a FROM dw_db.dw_user_sd WHERE user_id = '" + this.getUserId() + "' AND p_dt = '" + this.getDwDate() + "' limit 1"
        //println(querySql)
        val userSdData = UserPortraitCommon.sparkHiveClient.select(querySql, "link_invs_a")

        if (!userSdData.isEmpty()) {
            // All linked-inventory ids
            val linkInventoryIvns: String = userSdData.get(0).get("link_invs_a")
            if (linkInventoryIvns != null) {
                rs.put(column, linkInventoryIvns)
            }
        }
        rs
    }

    /**
     * Fetch the user's previously stored link data from HBase
     */
    def getLinkInventoryIds(): Map[String, Object] = {

        var rs: Map[String, Object] = Map[String, Object]()

        // Fetch the user's dimension data
        val linkInventorys: HashMap[String, String] = UserPortraitCommon.getUserPortraitDimByUserId(this.getUserId())

        // Get the linked-inventory dimension data as a JSON string
        val dimLinkInventorysJsonStr = UserPortraitCommon.mapKeyDefaultValue(linkInventorys, column, "{}")

        // Convert the JSON string into a Map
        rs = UserPortraitCommon.jsonStrToMap(dimLinkInventorysJsonStr)

        rs
    }

    // Set difference of inventory ids
    def diffInventoryIds(newInventorys: Map[String, Object], oldInventorys: Map[String, Object]): Array[String] = {
        var newInventoryIds: Set[String] = Set[String]()
        var oldInventoryIds: Set[String] = Set[String]()

        if (!newInventorys.isEmpty) {
            newInventoryIds = newInventorys.getOrElse(column, "").toString().split(",").toSet
        }
        if (!oldInventorys.isEmpty) {
            oldInventoryIds = oldInventorys.getOrElse(column, "").toString().split(",").toSet
        }

        // Difference
        val diffInventoryIds = newInventoryIds -- oldInventoryIds

        diffInventoryIds.toArray
    }

    /**
     * Merge a set of tags
     */
    def userNeeds(inventoryIds: Array[String]): Unit = {
        // Merge
        UserPortraitNeeds.setUserId(this.getUserId())
        UserPortraitNeeds.userNeedsMergeByInventoryIds(inventoryIds)
    }

    /**
     * Score the inventory attributes by inventory id
     */
    def score(inventoryIds: Array[String]): Unit = {

        // For each inventory id, score the user's tags
        inventoryIds.foreach { inventoryId =>
            UserPortraitTags.setUserId(this.getUserId())
            // Score
            // val score = UserPortraitCommon.cityTagConf.getOrElse("linkInventoryScore", "0").toString()
            // UserPortraitTags.tagScoreByInventoryId(inventoryId, score)
            UserPortraitTags.tagsScoreByInventoryAndAction(inventoryId, UserPortraitLinkInventory.actionName)
        }

    }

    /**
     * Serialize the data to JSON and save it into
     * dimension:linkInventorys of the user-portrait table.
     */
    def updateInventorysToHbase(inventorys: Map[String, Object]) = {
        if (!inventorys.isEmpty) {
            // Map -> JSON string
            val toString = UserPortraitCommon.mapToJsonStr(inventorys)
            UserPortraitCommon.userPortraitTable.update(this.getUserId(), UserPortraitCommon.DimColumnFamily, column, toString)
        }
    }

    /**
     * Modeling state for a given date
     * dateYmd: date, e.g. 2016-04-10
     * return
     *   true : already modeled
     *   false : not yet modeled
     */
    def getModelStateByDate(dateYmd: String): Boolean = {
        var status = false
        UserPortraitrModelState.setUserId(this.getUserId())
        UserPortraitrModelState.setLinkInventorysRecord()
        val linkInventorysRecord = UserPortraitrModelState.getLinkInventorysRecord() // whether this date was already processed
        if (linkInventorysRecord.contains(dateYmd)) {
            status = true
        }
        status
    }

    /**
     * Save the modeling state for this attribute
     * dateYmd: date, e.g. 2016-04-10
     */
    def saveModelStateByDate(dateYmd: String): Unit = {
        // Write the modeling state to HBase
        UserPortraitrModelState.setUserId(this.getUserId())
        var newLinkInventorysRecord: Map[String, Map[String, String]] = Map[String, Map[String, String]]()
        newLinkInventorysRecord.put(dateYmd, Map("status" -> "1")) // Map[date -> Map[status -> 1]]
        UserPortraitrModelState.saveLinkInventorysRecord(newLinkInventorysRecord) // update HBase
    }

}
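// ---------------------------------------------------------------------------
// Editor's sketch (not part of the original source): the incremental-update core
// of diffInventoryIds above, reduced to a self-contained example; ids are made up.
// ---------------------------------------------------------------------------
object DiffInventoryIdsSketch {
    def main(args: Array[String]): Unit = {
        val newIds = "101,102,103".split(",").toSet
        val oldIds = "101,102".split(",").toSet
        // Only ids missing from the stored snapshot are merged and scored again
        println((newIds -- oldIds).mkString(",")) // 103
    }
}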
--------------------------------------------------------------------------------
/src/main/scala/com/angejia/dw/recommend/user/portrait/UserPortraitTagConf.scala:
--------------------------------------------------------------------------------
package com.angejia.dw.recommend.user.portrait

import scala.collection.mutable.Map

/**
 * User portrait: core tag configuration
 */

object UserPortraitTagConf {

    // City tag
    val CITY_TAG = Map(
        "TagCode" -> "city",
        "TagName" -> "城市",

        // Score from the user's home-selection (demand) list
        "userDemandScore" -> "0",

        // Score granted by an inventory filter request
        "filterScore" -> "0",

        // Score for browsing an inventory
        "browseScore" -> "0",

        // Score for favoriting an inventory
        "likeInventoryScore" -> "0",

        // Score for an accompanied viewing of an inventory
        "visitItemInventoryScore" -> "0",

        // Score for a link (contact) event
        "linkInventoryScore" -> "0",

        // Score decay percentage
        "attenuationPercentage" -> "0.1")

    // District tag
    val DISTRICT_TAG = Map(
        "TagCode" -> "district",
        "TagName" -> "区域",

        // Score from the user's home-selection (demand) list
        "userDemandScore" -> "0",

        // Score granted by an inventory filter request
        "filterScore" -> "0",

        // Score for browsing an inventory
        "browseScore" -> "0",

        // Score for favoriting an inventory
        "likeInventoryScore" -> "0",

        // Score for an accompanied viewing of an inventory
        "visitItemInventoryScore" -> "0",

        // Score for a link (contact) event
        "linkInventoryScore" -> "0",

        // Score decay percentage
        "attenuationPercentage" -> "0.1")

    // Block tag
    val BLOCK_TAG = Map(
        "TagCode" -> "block",
        "TagName" -> "版块",

        // Score from the user's home-selection (demand) list
        "userDemandScore" -> "20",

        // Score granted by an inventory filter request
        //"filterScore" -> "2",
        "filterScore" -> "10",

        // Score for browsing an inventory
        "browseScore" -> "1",

        // Score for favoriting an inventory
        "likeInventoryScore" -> "5",

        // Score for an accompanied viewing of an inventory
        "visitItemInventoryScore" -> "50",

        // Score for a link (contact) event
        "linkInventoryScore" -> "30",

        // Score decay percentage
        "attenuationPercentage" -> "0.1")

    // Community tag
    val COMMUNITY_TAG = Map(
        "TagCode" -> "community",
        "TagName" -> "小区",

        // Score from the user's home-selection (demand) list
        "userDemandScore" -> "20",

        // Score granted by an inventory filter request
        //"filterScore" -> "10",
        "filterScore" -> "10", // 1.0

        // Score for browsing an inventory
        //"browseScore" -> "2",
        "browseScore" -> "5", // 1.0

        // Score for favoriting an inventory
        "likeInventoryScore" -> "5",

        // Score for an accompanied viewing of an inventory
        "visitItemInventoryScore" -> "50",

        // Score for a link (contact) event
        "linkInventoryScore" -> "30",

        // Score decay percentage
        "attenuationPercentage" -> "0.1")

    // Bedrooms tag
    val BEDROOMS_TAG = Map(
        "TagCode" -> "bedrooms",
        "TagName" -> "户型",

        // Score from the user's home-selection (demand) list
        "userDemandScore" -> "20",

        // Score granted by an inventory filter request
        //"filterScore" -> "2",
        "filterScore" -> "10", // 1.0

        // Score for browsing an inventory
        //"browseScore" -> "2",
        "browseScore" -> "5", // 1.0

        // Score for favoriting an inventory
        "likeInventoryScore" -> "5",

        // Score for an accompanied viewing of an inventory
        "visitItemInventoryScore" -> "50",

        // Score for a link (contact) event
        "linkInventoryScore" -> "30",

        // Score decay percentage
        "attenuationPercentage" -> "0.1",

        // Bedroom-key mapping (used by the filter list)
        "bedroomsType" -> Map[String, String](
            "2" -> "1",
            "3" -> "2",
            "4" -> "3",
            "5" -> "4",
            "6" -> "5",
            "7" -> "6"))

    // Price-tier tag
    val PRICE_TAG = Map(
        "TagCode" -> "price",
        "TagName" -> "价格段",

        // Score from the user's home-selection (demand) list
        "userDemandScore" -> "20",

        // Score granted by an inventory filter request
        //"filterScore" -> "2",
        "filterScore" -> "10", // 1.0

        // Score for browsing an inventory
        //"browseScore" -> "2",
        "browseScore" -> "5", // 1.0

        // Score for favoriting an inventory
        "likeInventoryScore" -> "5",

        // Score for an accompanied viewing of an inventory
        "visitItemInventoryScore" -> "50",

        // Score for a link (contact) event
        "linkInventoryScore" -> "30",

        // Score decay percentage
        "attenuationPercentage" -> "0.1",

        // Price-tier key mapping
        "PriceTierType" -> Map[String, String](
            "2" -> "1",
            "3" -> "2",
            "4" -> "3",
            "5" -> "4",
            "6" -> "5",
            "7" -> "6",
            "8" -> "7",
            "9" -> "8",
            "10" -> "9"),

        // Price-tier buckets (price range -> tier id)
        "PriceTier" -> Map[String, String](
            "0-1500000" -> "1",
            "1500000-2000000" -> "2",
            "2000000-2500000" -> "3",
            "2500000-3000000" -> "4",
            "3000000-4000000" -> "5",
            "4000000-5000000" -> "6",
            "5000000-7000000" -> "7",
            "7000000-10000000" -> "8",
            "10000000-1000000000" -> "9"))
}
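// ---------------------------------------------------------------------------
// Editor's sketch (not part of the original source): illustrative reads against
// the configuration above, using the same getOrElse-with-"0" pattern the scoring
// code uses. The sample listing price is made up.
// ---------------------------------------------------------------------------
object UserPortraitTagConfLookupSketch {
    def main(args: Array[String]): Unit = {
        // Filter-action score for the block tag
        println(UserPortraitTagConf.BLOCK_TAG.getOrElse("filterScore", "0")) // 10
        // A 2,200,000 listing falls into the "2000000-2500000" bucket -> tier "3"
        val priceTiers = UserPortraitTagConf.PRICE_TAG("PriceTier").asInstanceOf[Map[String, String]]
        println(priceTiers.getOrElse("2000000-2500000", "0")) // 3
    }
}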
"attenuationPercentage" -> "0.1") 117 | 118 | // 户型标签 119 | val BEDROOMS_TAG = Map( 120 | "TagCode" -> "bedrooms", 121 | "TagName" -> "户型", 122 | 123 | // 用户选房单分数 124 | "userDemandScore" -> "20", 125 | 126 | // 房源筛选给的分数 127 | //"filterScore" -> "2", 128 | "filterScore" -> "10", // 1.0 129 | 130 | // 浏览房源分数 131 | //"browseScore" -> "2", 132 | "browseScore" -> "5", // 1.0 133 | 134 | // 收藏房源分数 135 | "likeInventoryScore" -> "5", 136 | 137 | // 发生带看房源发生的分数 138 | "visitItemInventoryScore" -> "50", 139 | 140 | // 发生连接的分数 141 | "linkInventoryScore" -> "30", 142 | 143 | // 分数衰减百分比 144 | "attenuationPercentage" -> "0.1", 145 | 146 | // 户型映射(筛选列表时) 147 | "bedroomsType" -> Map[String, String]( 148 | "2" -> "1", 149 | "3" -> "2", 150 | "4" -> "3", 151 | "5" -> "4", 152 | "6" -> "5", 153 | "7" -> "6")) 154 | 155 | // 价格段标签 156 | val PRICE_TAG = Map( 157 | "TagCode" -> "price", 158 | "TagName" -> "价格段", 159 | 160 | // 用户选房单分数 161 | "userDemandScore" -> "20", 162 | 163 | // 房源筛选给的分数 164 | //"filterScore" -> "2", 165 | "filterScore" -> "10", // 1.0 166 | 167 | // 浏览房源分数 168 | //"browseScore" -> "2", 169 | "browseScore" -> "5", // 1.0 170 | 171 | // 收藏房源分数 172 | "likeInventoryScore" -> "5", 173 | 174 | // 发生带看房源发生的分数 175 | "visitItemInventoryScore" -> "50", 176 | 177 | // 发生连接的分数 178 | "linkInventoryScore" -> "30", 179 | 180 | // 分数衰减百分比 181 | "attenuationPercentage" -> "0.1", 182 | 183 | // 价格段映射 184 | "PriceTierType" -> Map[String, String]( 185 | "2" -> "1", 186 | "3" -> "2", 187 | "4" -> "3", 188 | "5" -> "4", 189 | "6" -> "5", 190 | "7" -> "6", 191 | "8" -> "7", 192 | "9" -> "8", 193 | "10" -> "9"), 194 | 195 | // 价格段数据 196 | "PriceTier" -> Map[String, String]( 197 | "0-1500000" -> "1", 198 | "1500000-2000000" -> "2", 199 | "2000000-2500000" -> "3", 200 | "2500000-3000000" -> "4", 201 | "3000000-4000000" -> "5", 202 | "4000000-5000000" -> "6", 203 | "5000000-7000000" -> "7", 204 | "7000000-10000000" -> "8", 205 | "10000000-1000000000" -> "9")) 206 | } -------------------------------------------------------------------------------- /src/main/scala/com/angejia/dw/recommend/user/portrait/UserPortraitVisitItem.scala: -------------------------------------------------------------------------------- 1 | package com.angejia.dw.recommend.user.portrait 2 | 3 | import scala.collection.mutable.Map 4 | import scala.collection.mutable.HashMap 5 | import scala.collection.mutable.ArrayBuffer 6 | import com.angejia.dw.common.util.DateUtil 7 | 8 | /** 9 | * 用户发生过带看的房源 10 | * 1. 获取用户被带看过得房源 11 | * 2. 
/**
 * Processing flow
 */
class UserPortraitVisitItem {

    // HBase column name: dimension:visitItemInventorys
    val column: String = "visitItemInventorys"

    // userId currently being processed
    var userId: String = new String()
    def setUserId(userId: String): Unit = {
        if (userId.isEmpty()) {
            return
        }
        this.userId = userId
    }
    def getUserId(): String = {
        if (this.userId.isEmpty()) {
            return "0"
        }
        this.userId
    }

    // Partition date
    var dwDate: String = new String()
    def setDwDate(date: String): Unit = {
        this.dwDate = date
    }
    def getDwDate(): String = {
        this.dwDate
    }

    /**
     * Run the whole flow
     */
    def run(): String = {
        // Result status
        var reStatus = "no"

        // Today's date
        val offsetDate = DateUtil.getCalendarOffsetDateDay(0) // get today's date
        val todayYmd = DateUtil.DateToString(offsetDate, DateUtil.SIMPLE_Y_M_D_FORMAT) // format it

        // Today's modeling state
        val modelState = this.getModelStateByDate(todayYmd)
        if (modelState == true) return reStatus

        // Latest accompanied-viewing data
        val newVisitItemIds = this.getNewVisitItemIds()
        // No viewing data: record today's modeling state and exit
        if (newVisitItemIds.isEmpty) {
            this.saveModelStateByDate(todayYmd)
            return reStatus
        }

        // Previously stored accompanied-viewing data
        val visitItemIds = this.getVisitItemIds()

        // Detect newly added inventory ids
        val diffInventoryIds = this.diffInventoryIds(newVisitItemIds, visitItemIds)

        // Nothing changed: record today's modeling state and exit
        if (diffInventoryIds.isEmpty) {
            this.saveModelStateByDate(todayYmd)
            return reStatus
        }
        println(getUserId() + ": UserPortraitVisitItem ", diffInventoryIds.mkString(","))

        // Merge the need tags
        this.userNeeds(diffInventoryIds)

        // Score
        this.score(diffInventoryIds)

        // Write the latest data back to HBase
        this.updateInventorysToHbase(newVisitItemIds)

        // Everything succeeded: save today's modeling state
        this.saveModelStateByDate(todayYmd)

        // Return
        reStatus = "yes"
        reStatus
    }

    /**
     * Fetch the user's latest accompanied-viewing data from Hive
     */
    def getNewVisitItemIds(): Map[String, Object] = {
        var rs: Map[String, Object] = Map[String, Object]()

        // Read the accompanied-viewing inventories of this user
        val querySql = "SELECT visit_item_invs_a FROM dw_user_sd WHERE user_id = '" + this.getUserId() + "' AND p_dt = '" + this.getDwDate() + "' limit 1"
        //println(querySql)
        val userSdData = UserPortraitCommon.sparkHiveClient.select(querySql, "visit_item_invs_a")

        if (!userSdData.isEmpty()) {
            // All accompanied-viewing inventory ids
            val visitItemIvns: String = userSdData.get(0).get("visit_item_invs_a")
            if (visitItemIvns != null) {
                rs.put(column, visitItemIvns)
            }
        }
        rs
    }

    /**
     * Fetch the user's previously stored accompanied-viewing data from HBase
     */
    def getVisitItemIds(): Map[String, Object] = {

        var rs: Map[String, Object] = Map[String, Object]()

        // Fetch the user's dimension data
        val visitItemInventorys: HashMap[String, String] = UserPortraitCommon.getUserPortraitDimByUserId(this.getUserId())

        // Get the accompanied-viewing dimension data as a JSON string
        val dimVisitItemInventorysJsonStr = UserPortraitCommon.mapKeyDefaultValue(visitItemInventorys, column, "{}")

        // Convert the JSON string into a Map
        rs = UserPortraitCommon.jsonStrToMap(dimVisitItemInventorysJsonStr)

        rs
    }

    // Set difference of inventory ids
    def diffInventoryIds(newInventorys: Map[String, Object], oldInventorys: Map[String, Object]): Array[String] = {
        var newInventoryIds: Set[String] = Set[String]()
        var oldInventoryIds: Set[String] = Set[String]()

        if (!newInventorys.isEmpty) {
            newInventoryIds = newInventorys.getOrElse(column, "").toString().split(",").toSet
        }
        if (!oldInventorys.isEmpty) {
            oldInventoryIds = oldInventorys.getOrElse(column, "").toString().split(",").toSet
        }

        // Difference
        val diffInventoryIds = newInventoryIds -- oldInventoryIds

        diffInventoryIds.toArray
    }

    /**
     * Merge a set of tags
     */
    def userNeeds(inventoryIds: Array[String]): Unit = {
        // Merge
        UserPortraitNeeds.setUserId(this.getUserId())
        UserPortraitNeeds.userNeedsMergeByInventoryIds(inventoryIds)
    }

    /**
     * Score the inventory attributes by inventory id
     */
    def score(inventoryIds: Array[String]): Unit = {
        // For each inventory id, score the user's tags
        inventoryIds.foreach { inventoryId =>
            UserPortraitTags.setUserId(this.getUserId())
            // Score
            // val score = UserPortraitCommon.cityTagConf.getOrElse("visitItemInventoryScore", "0").toString()
            // UserPortraitTags.tagScoreByInventoryId(inventoryId, score)
            UserPortraitTags.tagsScoreByInventoryAndAction(inventoryId, UserPortraitVisitItem.actionName)
        }
    }

    /**
     * Serialize the data to JSON and save it into
     * dimension:visitItemInventorys of the user-portrait table.
     */
    def updateInventorysToHbase(inventorys: Map[String, Object]) = {
        if (!inventorys.isEmpty) {
            // Map -> JSON string
            val toString = UserPortraitCommon.mapToJsonStr(inventorys)
            UserPortraitCommon.userPortraitTable.update(this.getUserId(), UserPortraitCommon.DimColumnFamily, column, toString)
        }
    }

    /**
     * Modeling state for a given date
     * dateYmd: date, e.g. 2016-04-10
     * return
     *   true : already modeled
     *   false : not yet modeled
     */
    def getModelStateByDate(dateYmd: String): Boolean = {
        var status = false
        UserPortraitrModelState.setUserId(this.getUserId())
        UserPortraitrModelState.setVisitItemInventorysRecord()
        val visitItemInventorysRecord = UserPortraitrModelState.getVisitItemInventorysRecord() // whether this date was already modeled
        if (visitItemInventorysRecord.contains(dateYmd)) {
            status = true
        }
        status
    }

    /**
     * Save the modeling state for this attribute
     * dateYmd: date, e.g. 2016-04-10
     */
    def saveModelStateByDate(dateYmd: String): Unit = {
        // Write the modeling state to HBase
        UserPortraitrModelState.setUserId(this.getUserId())
        var newVisitItemInventorysRecord: Map[String, Map[String, String]] = Map[String, Map[String, String]]()
        newVisitItemInventorysRecord.put(dateYmd, Map("status" -> "1")) // Map[date -> Map[status -> 1]]
        UserPortraitrModelState.saveVisitItemInventorysRecord(newVisitItemInventorysRecord) // update HBase
    }

}
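// ---------------------------------------------------------------------------
// Editor's sketch (not part of the original source): the once-per-day guard that
// run() above applies via getModelStateByDate/saveModelStateByDate, condensed to
// a self-contained example; the HBase-backed record is replaced by an in-memory set.
// ---------------------------------------------------------------------------
object DailyModelStateGuardSketch {
    def main(args: Array[String]): Unit = {
        val modeledDates = scala.collection.mutable.Set[String]()
        def runOnce(today: String): String = {
            if (modeledDates.contains(today)) return "no" // already modeled today
            // ... diff, merge and score new inventories here ...
            modeledDates += today // the saveModelStateByDate step
            "yes"
        }
        println(runOnce("2016-04-10")) // yes
        println(runOnce("2016-04-10")) // no: a second run the same day is skipped
    }
}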
--------------------------------------------------------------------------------
/src/main/scala/com/angejia/dw/recommend/user/portrait/UserPortraitrModelState.scala:
--------------------------------------------------------------------------------
package com.angejia.dw.recommend.user.portrait

import scala.collection.mutable.Map
import scala.collection.mutable.HashMap

import com.angejia.dw.common.util.JsonUtil

/**
 * User portrait: modeling state
 */
object UserPortraitrModelState {

    // userId currently being processed
    var userId: String = new String()
    def setUserId(userId: String): Unit = {
        if (userId.isEmpty()) {
            return
        }
        this.userId = userId
    }
    def getUserId(): String = {
        if (this.userId.isEmpty()) {
            return "0"
        }
        this.userId
    }

    /**
     * All column data under the user's modelState column family
     */
    var modelState: HashMap[String, String] = HashMap[String, String]()
    def setModelState(): Unit = {
        this.modelState = UserPortraitCommon.getUserPortraitModelStateByUserId(this.getUserId())
    }
    def getModelState(): HashMap[String, String] = {
        this.modelState
    }

    /**
     * visitItemInventorysRecord: the user's accompanied-viewing modeling record.
     * Initialized from the modelState:visitItemInventorysRecord column.
     * return
     *   Map[String, Map[String, String]]
     */
    var visitItemInventorysRecord: Map[String, Map[String, String]] = Map[String, Map[String, String]]()

    def setVisitItemInventorysRecord(): Unit = {
        this.setModelState()

        // Read the existing record from HBase as a JSON string
        val jsonString = UserPortraitCommon.mapKeyDefaultValue(this.getModelState(), "visitItemInventorysRecord", "{}")

        // Convert it into a mutable Map
        this.visitItemInventorysRecord = this.toolJsonStringToChangeMap(jsonString)
    }

    def getVisitItemInventorysRecord(): Map[String, Map[String, String]] = {
        this.visitItemInventorysRecord
    }

    /**
     * Save visitItemInventorysRecord back to HBase
     */
    def saveVisitItemInventorysRecord(visitItemInventorysRecord: Map[String, Map[String, String]]): String = {
        // Update the data
        this.toolSaveMapDataToHbaseColumn("visitItemInventorysRecord", visitItemInventorysRecord)
    }

    /**
     * linkInventorysRecord: the user's linked-inventory modeling record.
     * Initialized from the modelState:linkInventorysRecord column.
     * return
     *   Map[String, Map[String, String]]
     */
    var linkInventorysRecord: Map[String, Map[String, String]] = Map[String, Map[String, String]]()

    def setLinkInventorysRecord(): Unit = {
        this.setModelState()

        // Read the existing record from HBase as a JSON string
        val jsonString = UserPortraitCommon.mapKeyDefaultValue(this.getModelState(), "linkInventorysRecord", "{}")

        // Convert it into a mutable Map
        this.linkInventorysRecord = this.toolJsonStringToChangeMap(jsonString)
    }

    def getLinkInventorysRecord(): Map[String, Map[String, String]] = {
        this.linkInventorysRecord
    }

    /**
     * Save linkInventorysRecord back to HBase
     */
    def saveLinkInventorysRecord(linkInventorysRecord: Map[String, Map[String, String]]): String = {
        // Update the data
        this.toolSaveMapDataToHbaseColumn("linkInventorysRecord", linkInventorysRecord)
    }
    /**
     * Convert a JSON string into a mutable Map
     * return
     *   Map[String, Map[String, String]]
     */
    def toolJsonStringToChangeMap(jsonStringInput: String): scala.collection.mutable.Map[String, scala.collection.mutable.Map[String, String]] = {
        //import scala.collection.mutable.Map
        var jsonString = jsonStringInput
        if (jsonString.isEmpty()) {
            jsonString = "{}"
        }

        // Parse the string into an (immutable) Map
        val mapData = JsonUtil.playJsonToMap(jsonString) // returns a Map[String, Object]

        /**
         * This conversion took a surprisingly long time to get right.
         */
        // Rebuild the JSON map as a mutable map
        val jsonToMap = mapData.map {
            case (k, v) =>
                // Cast the value v to an immutable Map[String, String]
                val curV = v.asInstanceOf[scala.collection.immutable.Map[String, String]]
                // Then rebuild it as a mutable Map
                val formatV = scala.collection.mutable.Map(curV.toSeq: _*)
                k -> formatV
        }
        // Rebuild the outer map as a mutable Map as well
        val rs = collection.mutable.Map(jsonToMap.toSeq: _*).asInstanceOf[scala.collection.mutable.Map[String, Map[String, String]]]

        rs
    }

    /**
     * Save a map into an HBase column
     * column : a column under the UserPortraitCommon.ModelStateColumnFamily column family
     * mapData : the map data
     */
    def toolSaveMapDataToHbaseColumn(column: String, mapData: Map[String, Map[String, String]]): String = {
        val map = mapData.map {
            case (k, v) => k -> v.toMap // convert to an immutable Map
        }.toMap
        // Serialize to a JSON string
        val jsonString = JsonUtil.playMapToJson(map)
        // Update the data
        UserPortraitCommon.userPortraitTable.update(this.getUserId(), UserPortraitCommon.ModelStateColumnFamily, column, jsonString)
    }

}
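// ---------------------------------------------------------------------------
// Editor's sketch (not part of the original source): the conversion inside
// toolJsonStringToChangeMap above, reduced to its core: turning an immutable
// two-level map into a mutable one, level by level. The sample data is made up.
// ---------------------------------------------------------------------------
object NestedMapToMutableSketch {
    import scala.collection.{ immutable, mutable }
    def main(args: Array[String]): Unit = {
        val frozen: immutable.Map[String, immutable.Map[String, String]] =
            immutable.Map("2016-04-10" -> immutable.Map("status" -> "1"))
        // Rebuild each inner map, then the outer one, as mutable
        val thawed: mutable.Map[String, mutable.Map[String, String]] =
            mutable.Map(frozen.toSeq.map { case (k, v) => k -> mutable.Map(v.toSeq: _*) }: _*)
        thawed("2016-04-10")("status") = "0" // now updatable in place
        println(thawed)
    }
}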
--------------------------------------------------------------------------------
/src/test/scala/com/angejia/dw/recommend/inventory/portrait/InventoryPortraitCommonTest.scala:
--------------------------------------------------------------------------------
package com.angejia.dw.recommend.inventory.portrait

import collection.mutable.Stack
import collection.mutable.HashMap
import org.scalatest._
import com.angejia.dw.common.util.mysql.MysqlClient
import com.angejia.dw.recommend.Conf

class InventoryPortraitCommonTest extends FlatSpec with Matchers {
    Conf.setEnv("dev")
    val productMysqDBInfo = Conf.getProductMysqDBInfo()
    InventoryPortraitCommon.mysqlClient = new MysqlClient(
        productMysqDBInfo.get("host").get,
        productMysqDBInfo.get("account").get,
        productMysqDBInfo.get("password").get,
        productMysqDBInfo.get("defaultDB").get)

    "getUserTagsInventoryMappingByInventoryId" should "work" in {
        val res = InventoryPortraitCommon.getUserTagsInventoryMappingByInventoryId("1")

        res should contain key ("price")
        res should contain key ("district")
        res should contain key ("city")
        res should contain key ("block")
        res should contain key ("bedrooms")
        res should contain key ("community")
    }
}
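// ---------------------------------------------------------------------------
// Editor's sketch (not part of the original source): the ScalaTest matcher
// idioms used by the specs in this project, shown on a standalone map so the
// assertions can be read without the MySQL fixture above.
// ---------------------------------------------------------------------------
class MapMatchersSketch extends FlatSpec with Matchers {
    "a parsed result map" should "expose its fields" in {
        val res = Map("cityId" -> "2")
        res should contain key ("cityId")
        res.get("cityId") should equal(Some("2"))
    }
}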
"ExhuBkt16RSHa0/0C+y9x4sAxoD7EKhTTIoMs76vHmF4082Rl0cMQrHG/n4sx5Swb2xKFoKuT0q71Fh1+vcW/wo7KxexQroTAfJbSze3I5pDxw6TdZ/8HoE2wwmq0Zfcoevuqh00RFkCTG88w2CddEjoOiL6xJ1PI0GK4i4D5MYk3WvuFyoYcBZ+Nk5i4yVhrD8GCRVu8uRiXCoQAyX8mahVxbtae3MEyOZD4E2goMoiul9tEcXyK0XYB+aJhEZVL6jdCR5kg9bihbVyulmWOIvjLnqkZCjBcEFrpv9kcG9zNTY9MUTUyJxPg2MuvzeZZ2FG5Yv8GKIhOSSNl1pKY/v35cpXBzldw55381DmC1s=\t" 15 | + "app=a-angejia;av=4.8.1;ccid=2;gcid=;ch=B14;lng=;lat=;net=WIFI;p=android;pm=Android-OPPO R9s;osv=6.0.1;dvid=86253103615747802:00:00:00:00:00;uid=760198\t" 16 | + "-\t-") 17 | val res = UserPortrait.formatLogData(log) 18 | 19 | res should contain key ("logRequestUri") 20 | res.get("logRequestUri") should equal(Some("/mobile/member/inventories/1/2")) 21 | 22 | res should contain key ("logHost") 23 | res.get("logHost") should equal(Some("api.angejia.com")) 24 | 25 | res should contain key ("logTime") 26 | res.get("logTime") should equal(Some("[2016-11-23T16:59:15+08:00]")) 27 | 28 | res should contain key ("userId") 29 | //res.get("userId") should equal(Some("728924")) 30 | 31 | res should contain key ("appAgent") 32 | res.get("appAgent") should equal(Some("app=a-angejia;av=4.8.1;ccid=2;gcid=;ch=B14;lng=;lat=;net=WIFI;p=android;pm=Android-OPPO R9s;osv=6.0.1;dvid=86253103615747802:00:00:00:00:00;uid=760198")) 33 | 34 | res should contain key ("cityId") 35 | res.get("cityId") should equal(Some("2")) 36 | 37 | res should contain key ("auth") 38 | res.get("auth") should equal(Some("ExhuBkt16RSHa0/0C+y9x4sAxoD7EKhTTIoMs76vHmF4082Rl0cMQrHG/n4sx5Swb2xKFoKuT0q71Fh1+vcW/wo7KxexQroTAfJbSze3I5pDxw6TdZ/8HoE2wwmq0Zfcoevuqh00RFkCTG88w2CddEjoOiL6xJ1PI0GK4i4D5MYk3WvuFyoYcBZ+Nk5i4yVhrD8GCRVu8uRiXCoQAyX8mahVxbtae3MEyOZD4E2goMoiul9tEcXyK0XYB+aJhEZVL6jdCR5kg9bihbVyulmWOIvjLnqkZCjBcEFrpv9kcG9zNTY9MUTUyJxPg2MuvzeZZ2FG5Yv8GKIhOSSNl1pKY/v35cpXBzldw55381DmC1s=")) 39 | 40 | res should contain key ("logType") 41 | res.get("logType") should equal(Some("accessLog")) 42 | } 43 | 44 | "old log" should "be parsed" in { 45 | val log =("0.064\t0.064\t153.99.123.51\t1529\t127.0.0.1:9000\t" 46 | +"[2016-11-01T00:00:00+08:00]\tapi.angejia.com\t" 47 | +"GET /mobile/member/inventories/1/2 HTTP/1.1\t" 48 | +"200\t2195\t-\tAngejia/4.6.2 CFNetwork/808.0.2 Darwin/16.0.0\t7.42\t153.99.123.51\t" 49 | +"kCp+SLcl85sKrn/1jntFnhRXZlG79zMr6wEAy7Vkd9TyJ46da3IxyJPRLdd/ngMk/KqLmF8p26/izeoN7/Pgo7NB5VO21FyaHKrN370snfqWOv5CYb1x7fFJNJQYwwX54ketZAJ1mMSWj7LzbhSj9Kedl56dUi/9OL64djEld2iecKGWtNk2Rc4I2FWjoLiavAsJh/6RCOJ84tcc7KLB+IeCjz/uW3JlrZoJO3qvDfMiCv28y6geQjRNVljmBo3P\t" 50 | +"app=i-angejia;av=4.6;ccid=1;gcid=1;ch=A01;lng=0.000000;lat=0.000000;ip=192.168.1.100;mac=None;net=WIFI;p=iOS;pm=iPhone9,1;osv=10.0.1;dvid=09DF78A6-935D-46E6-9BB5-201610241142;uid=728924;idfa=D2CAED51-9235-4B51-9CC6-7ECC3AE7DD91\t" 51 | +"-") 52 | val res = UserPortrait.formatLogData(log) 53 | 54 | res should contain key ("logRequestUri") 55 | res.get("logRequestUri") should equal(Some("/mobile/member/inventories/1/2")) 56 | 57 | res should contain key ("logHost") 58 | res.get("logHost") should equal(Some("api.angejia.com")) 59 | 60 | res should contain key ("logTime") 61 | res.get("logTime") should equal(Some("[2016-11-01T00:00:00+08:00]")) 62 | 63 | res should contain key ("userId") 64 | res.get("userId") should equal(Some("728924")) 65 | 66 | res should contain key ("appAgent") 67 | res.get("appAgent") should 
equal(Some("app=i-angejia;av=4.6;ccid=1;gcid=1;ch=A01;lng=0.000000;lat=0.000000;ip=192.168.1.100;mac=None;net=WIFI;p=iOS;pm=iPhone9,1;osv=10.0.1;dvid=09DF78A6-935D-46E6-9BB5-201610241142;uid=728924;idfa=D2CAED51-9235-4B51-9CC6-7ECC3AE7DD91")) 68 | 69 | res should contain key ("cityId") 70 | res.get("cityId") should equal(Some("1")) 71 | 72 | res should contain key ("auth") 73 | res.get("auth") should equal(Some("kCp+SLcl85sKrn/1jntFnhRXZlG79zMr6wEAy7Vkd9TyJ46da3IxyJPRLdd/ngMk/KqLmF8p26/izeoN7/Pgo7NB5VO21FyaHKrN370snfqWOv5CYb1x7fFJNJQYwwX54ketZAJ1mMSWj7LzbhSj9Kedl56dUi/9OL64djEld2iecKGWtNk2Rc4I2FWjoLiavAsJh/6RCOJ84tcc7KLB+IeCjz/uW3JlrZoJO3qvDfMiCv28y6geQjRNVljmBo3P")) 74 | 75 | res should contain key ("logType") 76 | res.get("logType") should equal(Some("accessLog")) 77 | } 78 | } 79 | --------------------------------------------------------------------------------