├── apply ├── bin │ ├── batchRun.sh │ ├── deploy.sh │ ├── lzo.sh │ └── start.sh ├── dependency-reduced-pom.xml ├── pom.xml └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── lakala │ │ │ └── audit │ │ │ └── rabbitmqMsg │ │ │ ├── consumer │ │ │ └── Receiver.java │ │ │ ├── entityV │ │ │ └── RequestMessageV.java │ │ │ └── produce │ │ │ └── Sender.java │ ├── resources │ │ ├── dev │ │ │ └── config.properties │ │ ├── extract_data_hql │ │ ├── log4j.xml │ │ ├── product │ │ │ ├── config.properties │ │ │ ├── hdfs-site.xml │ │ │ └── hive-site.xml │ │ └── test │ │ │ └── config.properties │ └── scala │ │ ├── ApplyPageRank.scala │ │ ├── CastToInt.scala │ │ ├── ExploreLPAData.scala │ │ ├── GraphOneDegreeApplyPerDiem.scala │ │ ├── GraphxBSP.scala │ │ ├── JudgeIsMobile.scala │ │ ├── LoadCallhistoryData.scala │ │ ├── LoadHiveData.scala │ │ ├── LoadHiveData2.scala │ │ ├── RunGraphx.scala │ │ ├── RunLoadApplyGraphx.scala │ │ ├── RunLoadApplyGraphx2.scala │ │ ├── RunLoadApplyGraphx3.scala │ │ ├── TestSql.scala │ │ ├── com │ │ └── lakala │ │ │ └── datacenter │ │ │ ├── abstractions │ │ │ └── PregelProgram.scala │ │ │ ├── apply │ │ │ ├── buildGraph │ │ │ │ ├── BuildGraphData.scala │ │ │ │ ├── GraphOperators.scala │ │ │ │ └── NewEdgeArr.scala │ │ │ └── model │ │ │ │ ├── ApplyInfo.scala │ │ │ │ ├── BaseEntity.scala │ │ │ │ ├── CallHistoryEntity.scala │ │ │ │ ├── EdgeEntity.scala │ │ │ │ └── NDegreeEntity.scala │ │ │ ├── faund │ │ │ ├── ApplyRandomForest.scala │ │ │ ├── DatasetTitanic.scala │ │ │ ├── ScalaRandomForest.scala │ │ │ ├── SparkConfUtil.scala │ │ │ └── Titanic.scala │ │ │ ├── grograms │ │ │ └── ApplyDegreeCentralityProgram.scala │ │ │ ├── grogress │ │ │ └── ExportNDegreeData.scala │ │ │ ├── jaccard │ │ │ ├── Jaccard.scala │ │ │ └── PowerIterationClustering.scala │ │ │ ├── louvain │ │ │ ├── HDFSLouvainRunner.scala │ │ │ ├── LouvainCore.scala │ │ │ ├── LouvainHarness.scala │ │ │ ├── VertexData.scala │ │ │ └── VertexState.scala │ │ │ ├── main │ │ │ ├── Analytics.scala │ │ │ ├── CallHistoryPageRank.scala │ │ │ ├── Driver.scala │ │ │ ├── LPAAlgorithm.scala │ │ │ ├── LPCoarseAlgorithm.scala │ │ │ ├── LiveCommunityDetection.scala │ │ │ ├── LouvainDGA.scala │ │ │ ├── PICCallAlgorithm.scala │ │ │ ├── PSCANAlgorithm.scala │ │ │ └── SemiSupervisedLabelPropagation.scala │ │ │ ├── talk │ │ │ ├── builtin │ │ │ │ └── ShortestPathSample.scala │ │ │ └── types │ │ │ │ ├── City.scala │ │ │ │ ├── Person.scala │ │ │ │ └── VertexAttribute.scala │ │ │ └── utils │ │ │ ├── SparkCommon.scala │ │ │ └── UtilsToos.scala │ │ └── edu │ │ └── gatech │ │ └── cse8803 │ │ ├── clustering │ │ └── PowerIterationClustering.scala │ │ ├── graphconstruct │ │ └── GraphLoader.scala │ │ ├── ioutils │ │ └── CSVUtils.scala │ │ ├── jaccard │ │ └── Jaccard.scala │ │ ├── main │ │ └── Main.scala │ │ ├── model │ │ └── models.scala │ │ └── randomwalk │ │ └── randomwalk.scala │ └── test │ └── scala │ ├── CollectionUtil.scala │ ├── CreateApplyData.scala │ ├── CreateApplyData2.scala │ ├── Driver.scala │ ├── EdgeTuplesTest.scala │ ├── GraphNdegUtil.scala │ ├── GraphXExample.scala │ ├── GraphxBSP.scala │ ├── GraphxBSP2.scala │ ├── GraphxBSP3.scala │ ├── Median.scala │ ├── NDegreeResult.scala │ ├── NNTest.scala │ ├── NumOnce.scala │ ├── ParsesTest.scala │ ├── TestCSV.scala │ ├── TestRunGraphx.scala │ ├── TrustRank.scala │ ├── UDF_test.scala │ ├── apply │ ├── NDegreeCallMiddlePath.scala │ └── NDegreeMiddlePathResult.scala │ ├── entity │ ├── CallEntity.scala │ ├── CallVertex.scala │ └── TwoDegree.scala │ └── utils │ ├── CollectionUtil.scala │ ├── 
GraphNdegUtil.scala │ └── GraphNdegUtil2.scala ├── common ├── pom.xml └── src │ ├── main │ ├── resources │ │ └── css │ │ │ └── style.css │ └── scala │ │ └── com │ │ └── lakala │ │ └── datacenter │ │ └── common │ │ ├── graphstream │ │ └── SimpleGraphViewer.scala │ │ └── utils │ │ └── DateTimeUtils.scala │ └── test │ ├── data │ ├── cities_edges.txt │ ├── cities_vertices.txt │ ├── likeness_edges.txt │ ├── maxvalue_edges.txt │ ├── maxvalue_vertices.txt │ ├── papers_edges.txt │ ├── people_vertices.txt │ ├── relationships_edges.txt │ ├── us_cities_edges.txt │ ├── us_cities_vertices.txt │ ├── users_dense_edges.txt │ ├── users_disjoint_edges.txt │ ├── users_edges.txt │ └── users_vertices.txt │ └── scala │ └── TestGraphViewer.scala ├── core ├── pom.xml └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── lakala │ │ │ └── datacenter │ │ │ └── core │ │ │ ├── config │ │ │ └── ConfigurationLoader.java │ │ │ ├── hdfs │ │ │ └── FileUtil.java │ │ │ ├── messaging │ │ │ ├── Sender.java │ │ │ └── Worker.java │ │ │ ├── models │ │ │ ├── PartitionDescription.java │ │ │ ├── ProcessorMessage.java │ │ │ └── ProcessorMode.java │ │ │ └── processor │ │ │ └── GraphProcessor.java │ └── scala │ │ └── com │ │ └── lakala │ │ └── datacenter │ │ └── core │ │ ├── abstractions │ │ └── PregelProgram.scala │ │ ├── algorithms │ │ └── Algorithms.scala │ │ ├── grograms │ │ ├── BetweennessCentralityProgram.scala │ │ ├── EdgeBetweennessProgram.scala │ │ ├── MaximumValueProgram.scala │ │ └── ShortestPathProgram.scala │ │ └── utils │ │ └── UtilsToos.scala │ └── test │ ├── java │ └── com │ │ └── lakala │ │ └── datacenter │ │ └── core │ │ ├── hdfs │ │ └── FileUtilTest.java │ │ ├── messaging │ │ └── SenderTest.java │ │ └── processor │ │ └── GraphProcessorTest.java │ └── scala │ └── com │ └── lakala │ └── datacenter │ └── core │ └── grograms │ ├── GraphProcessorTest.scala │ ├── ShortestPathProgramTests.scala │ └── ShortestPathTests.scala ├── neo4j ├── bin │ └── start2.sh ├── pom.xml └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── lakala │ │ │ └── datacenter │ │ │ └── enums │ │ │ ├── DataAttributeType.java │ │ │ ├── GraphEnum.java │ │ │ ├── Labels.java │ │ │ └── RelationshipTypes.java │ ├── resources │ │ ├── css │ │ │ └── style.css │ │ ├── dev │ │ │ ├── config.properties │ │ │ └── hive-site.xml │ │ ├── log4j.xml │ │ ├── product │ │ │ └── config.properties │ │ └── test │ │ │ └── config.properties │ └── scala │ │ └── com │ │ └── lakala │ │ └── datacenter │ │ ├── abstractions │ │ └── DataGenerator.scala │ │ ├── constant │ │ └── StreamingConstant.scala │ │ ├── cypher │ │ └── NeoData.scala │ │ ├── grogram │ │ └── Neo4jDataGenerator.scala │ │ ├── load │ │ └── spark │ │ │ ├── ClusterGraphDatabase.scala │ │ │ ├── ExplortApplyData.scala │ │ │ ├── ExplortApplyData2.scala │ │ │ ├── LoadHiveData.scala │ │ │ ├── Neo4j.scala │ │ │ ├── Neo4jConfig.scala │ │ │ ├── Neo4jDataFrame.scala │ │ │ ├── Neo4jGraph.scala │ │ │ ├── Neo4jJavaIntegration.scala │ │ │ ├── Neo4jPartition.scala │ │ │ ├── Neo4jRowRDD.scala │ │ │ └── Neo4jTupleRDD.scala │ │ ├── main │ │ ├── HandleTask.scala │ │ ├── Main.scala │ │ ├── MessageParam.scala │ │ └── TrialConsumerKafka.scala │ │ ├── realtimeBuildGraphx │ │ ├── MsgOffsetStreamListener.scala │ │ ├── SendMsg.scala │ │ └── SparkStreamingOnKafkaDirect.scala │ │ └── utils │ │ ├── ArgsCommon.scala │ │ ├── RedisUtils.scala │ │ └── UtilsTools.scala │ └── test │ ├── java │ ├── ApplyInfoConsumer.java │ ├── ConsumerKafka.java │ ├── DataAttributeType.java │ ├── JavaKafkaSimpleConsumerAPI.java │ ├── JavaKafkaSimpleConsumerAPITest.java │ ├── 
KafkaBrokerInfo.java │ ├── KafkaConsumer.java │ ├── KafkaProducer.java │ ├── KafkaProducer2.java │ ├── KafkaTopicPartitionInfo.java │ ├── LogSession.java │ ├── OperatorKafka.java │ ├── SendKafkaMsgTest.java │ └── TestCypher.java │ └── scala │ ├── BroadcastAccumulatorStreaming.scala │ ├── ClientRedisTest.scala │ ├── CollectionUtil.scala │ ├── ConsumerGroupExample.scala │ ├── GraphNdegUtil.scala │ ├── Main.scala │ ├── StreamingFromKafka.scala │ ├── TestApiNeo4j.scala │ ├── TestCypher.scala │ ├── TestKafka.scala │ ├── TestRedis.scala │ └── org │ └── neo4j │ └── spark │ ├── ExplortApplyDataTest.scala │ ├── MainTest.scala │ ├── Neo4jContstanTest.scala │ ├── Neo4jDataFrameScalaTest.scala │ ├── Neo4jGraphScalaTest.scala │ ├── Neo4jRestSparkTest.scala │ └── Neo4jSparkTest.scala └── pom.xml /apply/bin/batchRun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -lt 1 ] ; then 4 | echo "USAGE: $0 begin_date [end_date]" 5 | exit 1; 6 | fi 7 | 8 | beginDate=$1 9 | yesterday=$(date --date="1 days ago" '+%Y-%m-%d') 10 | endDate=$yesterday 11 | if [ $# -gt 1 ] ; then 12 | endDate=$2 13 | fi 14 | 15 | beginTime=`date -d $beginDate '+%s'` 16 | yesterdayTime=`date -d $yesterday '+%s'` 17 | endTime=`date -d $endDate '+%s'` 18 | if [ $beginTime -gt $yesterdayTime ] ; then 19 | echo "begin_date can only be yesterday[$endDate] at the latest" 20 | exit 1; 21 | fi 22 | if [ $endTime -gt $yesterdayTime ] ; then 23 | echo "end_date can only be yesterday[$yesterday] at the latest" 24 | exit 1; 25 | fi 26 | if [ $beginTime -gt $endTime ] ; then 27 | echo "begin_date can only be end_date[$endDate] at the latest" 28 | exit 1; 29 | fi 30 | 31 | #echo $beginDate 32 | #echo $endDate 33 | currentDate=$beginDate 34 | currentTime=$beginTime 35 | 36 | cd "`dirname "$0"`" 37 | 38 | while [ $currentTime -le $endTime ] 39 | do 40 | #echo $currentDate 41 | sh start.sh $currentDate 42 | currentDate=`date -d "$currentDate +1 day" '+%Y-%m-%d'` 43 | currentTime=`date -d $currentDate '+%s'` 44 | done 45 | -------------------------------------------------------------------------------- /apply/bin/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ $# != 1 ] ; then 4 | echo "USAGE: $0 ENV(dev|test|product)" 5 | exit 1; 6 | fi 7 | 8 | cd "$(cd "`dirname "$0"`"/../..; pwd)" 9 | mvn -U clean package dependency:copy-dependencies -DskipTests -P$1 -Papply 10 | -------------------------------------------------------------------------------- /apply/bin/lzo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #################################### 3 | ## lzo hadoop textfile 4 | ## usage:sh lzo.sh source_dir 5 | ## example:sh lzo.sh /user/flume 6 | #################################### 7 | startTime=`date +%s` 8 | echo "the script begin at $(date +%H:%M:%S)" 9 | source_dir=$1 10 | cd /tmp 11 | hadoop fs -get ${source_dir} /tmp 12 | filepaths=() 13 | function getfilePath(){ 14 | for file in ` ls $1 ` 15 | do 16 | if [ -d $1"/"$file ] 17 | then 18 | getfilePath $1"/"$file 19 | else 20 | filepaths[${#filepaths[@]}]=$1"/"$file 21 | fi 22 | done 23 | } 24 | path=/tmp/${source_dir##*/} 25 | getfilePath $path 26 | #echo ${filepaths[*]} 27 | for filepath in ${filepaths[@]} 28 | do 29 | lzop ${filepath} 30 | rm -rf ${filepath} 31 | done 32 | hadoop fs -mv ${source_dir} ${source_dir}.bak 33 | hadoop fs -put $path ${source_dir%/*} 34 | for filepath in ${filepaths[@]} 35 
| do 36 | hadoop jar /usr/hdp/2.2.6.0-2800/hadoop/lib/hadoop-lzo-0.6.0.2.2.6.0-2800.jar com.hadoop.compression.lzo.LzoIndexer ${source_dir%/*}/${filepath#*/tmp/}.lzo 37 | #2>&1 > /data/hdfs_logs/${source_dir##*/}.log 38 | done 39 | rm -rf $path 40 | endTime=`date +%s` 41 | echo "the script end at $(date +%H:%M:%S)" 42 | echo "total second is" $(($endTime-$startTime)) -------------------------------------------------------------------------------- /apply/bin/start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ## usage: sh bin/start.sh -i /logs/device/* -d 2016-01-11 3 | 4 | SPARK_HOME=/usr/hdp/current/spark-client 5 | HIVE_HOME=/usr/hdp/current/hive-client 6 | PROJECT_HOME="$(cd "`dirname "$0"`"/..; pwd)" 7 | HDP_VERSION=2.4.0.0-169 8 | APP_CACHE_DIR=/tmp/device 9 | 10 | stdate=${1:-`date -d '1 days ago' +"%Y-%m-%d"`} 11 | #inputdir=/logs/device/* 12 | #inputfile=/logs/device/*/2016-01-{1[1-9],2[0-1]} 13 | while getopts "d:i:" opt ; do 14 | case $opt in 15 | d)stdate=$OPTARG ;; 16 | i)inputdir=$OPTARG ;; 17 | ?)echo "==> please input arg: stdate(d), inputdir(i)" && exit 1 ;; 18 | esac 19 | done 20 | 21 | #echo "==> ready for geoip...." 22 | #hadoop fs -mkdir -p $APP_CACHE_DIR/geoip 23 | #hadoop fs -test -e $APP_CACHE_DIR/geoip/GeoLite2-City.mmdb 24 | #if [ $? -ne 0 ]; then 25 | # echo "GeoLite2-City.mmdb not exists!" 26 | # hadoop fs -put $PROJECT_HOME/../tcloud-log-analysis/src/main/bundleApp/coord-common/geoip/GeoLite2-City.mmdb $APP_CACHE_DIR/geoip/ 27 | #fi 28 | 29 | ## https://issues.apache.org/jira/browse/ZEPPELIN-93 30 | ## https://github.com/caskdata/cdap/pull/4106 31 | spark-submit \ 32 | --class RunLoadApplyGraphx3 \ 33 | --master yarn \ 34 | --deploy-mode cluster \ 35 | --queue dc \ 36 | --driver-memory 2G \ 37 | --executor-memory 8G \ 38 | --num-executors 4 \ 39 | --executor-cores 3 \ 40 | --conf "spark.rpc.askTimeout=300s" \ 41 | --driver-java-options "-XX:-UseGCOverheadLimit -Xms2G -Xmx2G -XX:MaxPermSize=2G -Dhdp.version=$HDP_VERSION -Dspark.yarn.am.extraJavaOptions=-Dhdp.version=$HDP_VERSION" \ 42 | --verbose \ 43 | --files $PROJECT_HOME/target/classes/hive-site.xml \ 44 | --driver-class-path $PROJECT_HOME/target/dependency/mysql-connector-java-5.1.36.jar \ 45 | --jars $PROJECT_HOME/target/dependency/mysql-connector-java-5.1.36.jar,$SPARK_HOME/lib/datanucleus-api-jdo-3.2.6.jar,$SPARK_HOME/lib/datanucleus-core-3.2.10.jar,$SPARK_HOME/lib/datanucleus-rdbms-3.2.9.jar \ 46 | $PROJECT_HOME/target/data-analysis-sdk.jar \ 47 | $stdate 48 | 49 | ## --packages com.databricks:spark-csv_2.10:1.3.0 \ 50 | ## 2>&1 > output.txt -------------------------------------------------------------------------------- /apply/dependency-reduced-pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | graphx-analysis 5 | com.lakala.datacenter 6 | 1.0.0-SNAPSHOT 7 | 8 | 4.0.0 9 | graphx-analysis-apply 10 | graphx-analysis-apply 11 | http://maven.apache.org 12 | 13 | graphx-analysis-apply 14 | 15 | 16 | UTF-8 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /apply/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | com.lakala.datacenter 5 | graphx-analysis 6 | 1.0.0-SNAPSHOT 7 | 8 | 4.0.0 9 | 10 | graphx-analysis-apply 11 | jar 12 | 13 | graphx-analysis-apply 14 | http://maven.apache.org 15 | 16 | 17 | UTF-8 18 | 19 | 20 | 21 | 22 | 23 | com.lakala.datacenter 24 | graphx-analysis-core 25 | 
${project.version} 26 | 27 | 28 | 29 | graphx-analysis-apply 30 | 31 | 32 | -------------------------------------------------------------------------------- /apply/src/main/java/com/lakala/audit/rabbitmqMsg/consumer/Receiver.java: -------------------------------------------------------------------------------- 1 | package com.lakala.audit.rabbitmqMsg.consumer; 2 | 3 | import com.google.gson.Gson; 4 | import com.lakala.audit.rabbitmqMsg.entityV.RequestMessageV; 5 | import com.rabbitmq.client.Channel; 6 | import com.rabbitmq.client.Connection; 7 | import com.rabbitmq.client.ConnectionFactory; 8 | import com.rabbitmq.client.QueueingConsumer; 9 | 10 | import java.io.IOException; 11 | import java.util.concurrent.TimeoutException; 12 | 13 | /** 14 | * Created by Administrator on 2017/8/1 0001. 15 | */ 16 | public class Receiver { 17 | private final static String AUDIT_QUEUE_NAME = "audit_mq"; 18 | // private final static String USERNAME = "lys"; 19 | // private final static String PASSWORD = "123456"; 20 | // private final static String VIRTUALHOST = "/"; 21 | // private final static String HOST = "localhost"; 22 | 23 | private final static String HOST = "192.168.0.182"; 24 | private final static String USERNAME = "antifraud"; 25 | private final static String PASSWORD = "antifraud"; 26 | private final static String VIRTUALHOST = "antifraud"; 27 | private final static int PORTNUMBER = 5672; 28 | 29 | public static void main(String[] args) { 30 | try { 31 | work(); 32 | } catch (IOException e) { 33 | e.printStackTrace(); 34 | } catch (InterruptedException e) { 35 | e.printStackTrace(); 36 | } catch (TimeoutException e) { 37 | e.printStackTrace(); 38 | } 39 | 40 | } 41 | 42 | public static void work() throws java.io.IOException, 43 | java.lang.InterruptedException, TimeoutException { 44 | ConnectionFactory factory = new ConnectionFactory(); 45 | // factory.setHost("192.168.0.182"); 46 | factory.setHost(HOST); 47 | factory.setPort(PORTNUMBER); 48 | factory.setUsername(USERNAME); 49 | factory.setPassword(PASSWORD); 50 | factory.setVirtualHost(VIRTUALHOST); 51 | Connection connection = factory.newConnection(); 52 | Channel channel = connection.createChannel(); 53 | 54 | channel.queueDeclare(AUDIT_QUEUE_NAME, false, false, false, null); 55 | channel.basicQos(20); 56 | 57 | QueueingConsumer consumer = new QueueingConsumer(channel); 58 | channel.basicConsume(AUDIT_QUEUE_NAME, false, consumer); 59 | 60 | System.out.println(" [*] Waiting for messages. To exit press CTRL+C"); 61 | 62 | while (true) { 63 | QueueingConsumer.Delivery delivery = consumer.nextDelivery(); 64 | String message = new String(delivery.getBody()); 65 | 66 | System.out.println(" [x] Received '" + message + "'"); 67 | 68 | Gson gson = new Gson(); 69 | RequestMessageV requestMessageV = gson.fromJson(message, RequestMessageV.class); 70 | //TODO 数据解析放到redis 71 | 72 | System.out.println(requestMessageV.getOrderno()); 73 | System.out.println(" [x] Done '" + message + "'"); 74 | channel.basicAck(delivery.getEnvelope().getDeliveryTag(), false); 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /apply/src/main/java/com/lakala/audit/rabbitmqMsg/entityV/RequestMessageV.java: -------------------------------------------------------------------------------- 1 | package com.lakala.audit.rabbitmqMsg.entityV; 2 | 3 | /** 4 | * Created by Administrator on 2017/8/1 0001. 
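 * Payload POJO for messages on the audit queue: an order number plus a status flag.
 * It is serialized to and from JSON with Gson by the Sender and Receiver in this module,
 * e.g. {"orderno":"XNA20170505131153011496369566130","statue":"Q"}.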
5 | */ 6 | public class RequestMessageV { 7 | public RequestMessageV() { 8 | } 9 | 10 | public RequestMessageV(String orderno, String statue) { 11 | this.orderno = orderno; 12 | this.statue = statue; 13 | } 14 | 15 | String orderno; 16 | String statue; 17 | 18 | public String getOrderno() { 19 | return orderno; 20 | } 21 | 22 | public void setOrderno(String orderno) { 23 | this.orderno = orderno; 24 | } 25 | 26 | public String getStatue() { 27 | return statue; 28 | } 29 | 30 | public void setStatue(String statue) { 31 | this.statue = statue; 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /apply/src/main/java/com/lakala/audit/rabbitmqMsg/produce/Sender.java: -------------------------------------------------------------------------------- 1 | package com.lakala.audit.rabbitmqMsg.produce; 2 | 3 | import com.google.gson.Gson; 4 | import com.lakala.audit.rabbitmqMsg.entityV.RequestMessageV; 5 | import com.rabbitmq.client.Channel; 6 | import com.rabbitmq.client.Connection; 7 | import com.rabbitmq.client.ConnectionFactory; 8 | 9 | import java.io.IOException; 10 | import java.util.concurrent.TimeoutException; 11 | 12 | 13 | /** 14 | * Created by Administrator on 2017/8/1 0001. 15 | */ 16 | public class Sender { 17 | private final static String AUDIT_QUEUE_NAME = "audit_mq"; 18 | // private final static String USERNAME = "lys"; 19 | // private final static String PASSWORD = "123456"; 20 | // private final static String VIRTUALHOST = "/"; 21 | // private final static String HOST = "localhost"; 22 | 23 | private final static String HOST = "192.168.0.182"; 24 | private final static String USERNAME = "antifraud"; 25 | private final static String PASSWORD = "antifraud"; 26 | private final static String VIRTUALHOST = "antifraud"; 27 | private final static int PORTNUMBER = 5672; 28 | 29 | public static void main(String[] args) { 30 | Gson gson = new Gson(); 31 | RequestMessageV requestMessageV = new RequestMessageV("XNA20170505131153011496369566130", "Q"); 32 | String message = gson.toJson(requestMessageV); 33 | System.out.println(message); 34 | //message={"orderno":"XNA20170505131153011496369566130","statue":"Q"} 35 | try { 36 | send(message); 37 | } catch (IOException e) { 38 | e.printStackTrace(); 39 | } catch (InterruptedException e) { 40 | e.printStackTrace(); 41 | } catch (TimeoutException e) { 42 | e.printStackTrace(); 43 | } 44 | 45 | } 46 | 47 | public static void send(String message) throws java.io.IOException, 48 | java.lang.InterruptedException, TimeoutException { 49 | 50 | ConnectionFactory factory = new ConnectionFactory(); 51 | factory.setHost(HOST); 52 | factory.setPort(PORTNUMBER); 53 | factory.setUsername(USERNAME); 54 | factory.setPassword(PASSWORD); 55 | factory.setVirtualHost(VIRTUALHOST); 56 | Connection connection = factory.newConnection(); 57 | Channel channel = connection.createChannel(); 58 | channel.queueDeclare(AUDIT_QUEUE_NAME, false, false, false, null); 59 | channel.basicPublish("", AUDIT_QUEUE_NAME, null, message.getBytes("UTF-8")); 60 | System.out.println("已经发送消息....." 
+ message); 61 | channel.close(); 62 | connection.close(); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /apply/src/main/resources/dev/config.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luhm2017/graphx-analysis/9e3a96ec0df5da8208655face4ff0b0e6b3ed497/apply/src/main/resources/dev/config.properties -------------------------------------------------------------------------------- /apply/src/main/resources/extract_data_hql: -------------------------------------------------------------------------------- 1 | use lkl_card_score; 2 | set mapreduce.job.queuename=szoffline; 3 | -- 抽取出每个人的逾期数据 4 | create table fraud_mobile_performance AS SELECT if(a.label=0,1,0) AS good,if(a.label=1,1,0) AS bad,if(a.label=2,1,0) AS unknown,a.apply_time,b.history_due_day,b.current_due_day,b.mobile from fqz_order_performance_data_new a inner join creditloan.s_c_apply_user b ON a.cert_no= b.cert_no AND a.year='2017' AND a.month ='09' AND a.day='19' AND b.year='2017' AND b.month='09' AND b.day='19'; 5 | -- 社区 黑名单 6 | create table fraud_community_mobile_black as select a.community_mobile,a.mobile,if(b.mobile is not null,0,1) as lable from louvain_result2 a left outer join creditloan.s_c_loan_blacklist b on a.community_mobile = b.mobile and b.year='2017' and b.month='09' and b.day='19'; 7 | -- 社区id每个人的逾期情况按社区id,是否黑名单分组 8 | create table fraud_community_mobile_black_performance as SELECT a.community_mobile,a.lable,sum(if(b.good>=0,b.good,0)) goods,sum(if(b.bad>=0,b.bad,0)) bads,sum(if(b.unknown>=0,b.unknown,0)) unknowns,sum(if(b.history_due_day>=0,b.history_due_day,0)) history_due_days,sum(if(b.current_due_day>=0,b.current_due_day,0)) current_due_days from fraud_community_mobile_black AS a LEFT JOIN fraud_mobile_performance AS b on a.community_mobile=b.mobile GROUP BY a.community_mobile,a.lable; 9 | -------------------------------------------------------------------------------- /apply/src/main/resources/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /apply/src/main/resources/product/config.properties: -------------------------------------------------------------------------------- 1 | hdfs_root_path=hdfs://ns1/ -------------------------------------------------------------------------------- /apply/src/main/resources/product/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | dfs.nameservices 7 | ns1 8 | 9 | 10 | dfs.client.failover.proxy.provider.ns1 11 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 12 | 13 | 14 | dfs.ha.automatic-failover.enabled.ns1 15 | true 16 | 17 | 18 | ha.zookeeper.quorum 19 | 20 | datanode4.lakala.com:2181,datanode5.lakala.com:2181,datanode6.lakala.com:2181,datanode7.lakala.com:2181,datanode8.lakala.com:2181 21 | 22 | 23 | 24 | dfs.ha.namenodes.ns1 25 | namenode114,namenode148 26 | 27 | 28 | dfs.namenode.rpc-address.ns1.namenode114 29 | namenode.lakala.com:8020 30 | 31 | 32 | dfs.namenode.servicerpc-address.ns1.namenode114 33 | namenode.lakala.com:8022 34 | 35 | 36 | dfs.namenode.http-address.ns1.namenode114 37 | namenode.lakala.com:50070 38 | 39 | 40 | 
dfs.namenode.https-address.ns1.namenode114 41 | namenode.lakala.com:50470 42 | 43 | 44 | dfs.namenode.rpc-address.ns1.namenode148 45 | namenodestandby.lakala.com:8020 46 | 47 | 48 | dfs.namenode.servicerpc-address.ns1.namenode148 49 | namenodestandby.lakala.com:8022 50 | 51 | 52 | dfs.namenode.http-address.ns1.namenode148 53 | namenodestandby.lakala.com:50070 54 | 55 | 56 | dfs.namenode.https-address.ns1.namenode148 57 | namenodestandby.lakala.com:50470 58 | 59 | 60 | dfs.replication 61 | 3 62 | 63 | 64 | dfs.blocksize 65 | 134217728 66 | 67 | 68 | dfs.client.use.datanode.hostname 69 | false 70 | 71 | 72 | fs.permissions.umask-mode 73 | 022 74 | 75 | 76 | dfs.namenode.acls.enabled 77 | true 78 | 79 | 80 | dfs.client.use.legacy.blockreader 81 | false 82 | 83 | 84 | dfs.client.read.shortcircuit 85 | false 86 | 87 | 88 | dfs.domain.socket.path 89 | /var/run/hdfs-sockets/dn 90 | 91 | 92 | dfs.client.read.shortcircuit.skip.checksum 93 | false 94 | 95 | 96 | dfs.client.domain.socket.data.traffic 97 | false 98 | 99 | 100 | dfs.datanode.hdfs-blocks-metadata.enabled 101 | true 102 | 103 | -------------------------------------------------------------------------------- /apply/src/main/resources/test/config.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luhm2017/graphx-analysis/9e3a96ec0df5da8208655face4ff0b0e6b3ed497/apply/src/main/resources/test/config.properties -------------------------------------------------------------------------------- /apply/src/main/scala/CastToInt.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.sql.api.java.UDF1 2 | 3 | import scala.util.matching.Regex 4 | 5 | /** 6 | * Created by linyanshi on 2017/9/14 0014. 7 | */ 8 | class CastToInt extends UDF1[String, Long] { 9 | val pattern = new Regex("[0-9]{1,}") 10 | 11 | override def call(value: String): Long = { 12 | if (pattern.pattern.matcher(value).matches() && value.toLong < 86400l) value.trim.toLong 13 | else if (pattern.pattern.matcher(value).matches() && value.toLong >= 86400l) 86400l 14 | else 0L 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /apply/src/main/scala/ExploreLPAData.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.graphx.Edge 2 | import org.apache.spark.{SparkConf, SparkContext} 3 | 4 | /** 5 | * Created by linyanshi on 2017/9/14 0014. 6 | */ 7 | object ExploreLPAData { 8 | def main(args: Array[String]): Unit = { 9 | val conf = new SparkConf().setAppName("ExploreLPAData").set("spark.eventLog.enabled", "true") 10 | val sc = new SparkContext(conf) 11 | val rdd = sc.textFile(args(0), 100).mapPartitions(lines => lines.map { line => 12 | val arr = line.split(",") 13 | Edge(arr(1).toLong,arr(2).toLong) 14 | }) 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /apply/src/main/scala/JudgeIsMobile.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.sql.api.java.UDF1 2 | 3 | import scala.util.matching.Regex 4 | 5 | /** 6 | * Created by linyanshi on 2017/9/14 0014. 
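 * Spark SQL UDF that returns true when the input string matches a mainland-China
 * mobile-number pattern. It is registered as a Hive UDF in LoadCallhistoryData:
 * {{{
 *   hc.udf.register("isMobile", new JudgeIsMobile(), DataTypes.BooleanType)
 * }}}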
7 | */ 8 | class JudgeIsMobile extends UDF1[String,Boolean]{ 9 | val pattern = new Regex("^((17[0-9])|(14[0-9])|(13[0-9])|(15[^4,\\D])|(18[0,5-9]))\\d{8}$") 10 | override def call(value: String): Boolean = { 11 | pattern.pattern.matcher(value).matches() 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /apply/src/main/scala/LoadCallhistoryData.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.sql.hive.HiveContext 2 | import org.apache.spark.sql.types.DataTypes 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * Created by linyanshi on 2017/9/14 0014. 7 | */ 8 | object LoadCallhistoryData { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setAppName("LoadCallhistoryData") 11 | val sc = new SparkContext(conf) 12 | val hc = new HiveContext(sc) 13 | val date = args(0).split("-") 14 | val year = date(0) 15 | val month = date(1) 16 | val day = date(2) 17 | hc.sql("use datacenter") 18 | hc.udf.register("isMobile", new JudgeIsMobile(), DataTypes.BooleanType) 19 | hc.udf.register("castInt", new CastToInt(), DataTypes.LongType) 20 | val hql = 21 | s"""SELECT a.deviceid,a.loginname,a.caller_phone,sum(castInt(a.duration)) AS duration,max(a.date) AS date,max(a.collecttime) AS collecttime 22 | |FROM r_callhistory_week a WHERE a.year='${year}' AND a.month='${month}' AND a.day='${day}' 23 | | AND a.loginname is not null AND a.caller_phone is not null AND isMobile(a.loginname) 24 | | AND isMobile(a.caller_phone) AND a.duration is not null AND a.collecttime <>'null' 25 | | group by a.deviceid,a.loginname,a.caller_phone 26 | """.stripMargin 27 | hc.sql(hql).repartition(100).mapPartitions(rows => rows.map { row => s"${row.getAs("deviceid")},${row.getAs("loginname")},${row.getAs("caller_phone")},${row.getAs("duration")},${row.getAs("date")},${row.getAs("collecttime")}" }) 28 | .saveAsTextFile(args(1)) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /apply/src/main/scala/TestSql.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.sql.{DataFrame, SQLContext} 2 | import org.apache.spark.storage.StorageLevel 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * Created by Administrator on 2017/7/27 0027. 
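 * Local smoke test: builds a one-column DataFrame, computes its aggregate statistics
 * (mean, stddev, min, max, skewness, kurtosis, variance), cross-joins them into a single
 * row and joins that row onto a small test DataFrame.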
7 | */ 8 | object TestSql { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("test") 11 | val sc = new SparkContext(conf) 12 | val sqlContext = new SQLContext(sc) 13 | val list = List("1","2","3","3","5") 14 | import sqlContext.implicits._ 15 | val vertexInfoDF = sc.parallelize(list).toDF().persist(StorageLevel.MEMORY_AND_DISK_SER) 16 | // 用聚合顶点信息来创建特征向量的函数 17 | val mean: DataFrame = vertexInfoDF.agg("_1" -> "mean") 18 | val sd: DataFrame = vertexInfoDF.agg("_1" -> "stddev") 19 | // val median: DataFrame = vertexInfoDF.agg("_1" -> "median") 20 | val min: DataFrame = vertexInfoDF.agg("_1" -> "min") 21 | val max: DataFrame = vertexInfoDF.agg("_1" -> "max") 22 | val skew: DataFrame = vertexInfoDF.agg("_1" -> "skewness") 23 | val kurt: DataFrame = vertexInfoDF.agg("_1" -> "kurtosis") 24 | val vari: DataFrame = vertexInfoDF.agg("_1" -> "variance") 25 | 26 | val joinedStats: DataFrame = sd.join(mean).join(min).join(max).join(skew).join(kurt).join(vari) 27 | // .join(median) 28 | println(joinedStats.printSchema()) 29 | println(joinedStats.foreach(row=>println(row.get(0)))) 30 | vertexInfoDF.unpersist(blocking = true) 31 | val sdtestDF = Seq((1.2.toDouble, 1.6.toDouble, 1.8.toDouble, 1.9.toDouble)) 32 | .toDF("numNodes", "numEdges", "maxDeg", "avgDeg") 33 | val df = sdtestDF.join(joinedStats) 34 | println(df.count()) } 35 | 36 | } 37 | 38 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/abstractions/PregelProgram.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.abstractions 2 | 3 | /** 4 | * Created by peter on 2017/4/27. 5 | */ 6 | 7 | import org.apache.spark.graphx._ 8 | 9 | import scala.reflect.ClassTag 10 | 11 | /** 12 | * The [[PregelProgram]] abstraction wraps Spark's Pregel API implementation from the [[GraphOps]] 13 | * class into a model that is easier to write graph algorithms. 14 | * @tparam VertexState is the generic type representing the state of a vertex 15 | */ 16 | abstract class PregelProgram[VertexState: ClassTag, VD: ClassTag, ED: ClassTag] protected() extends Serializable { 17 | 18 | @transient val graph: Graph[VD, ED] 19 | 20 | /** 21 | * The vertex program receives a state update and acts to update its state 22 | * @param id is the [[VertexId]] that this program will perform a state operation for 23 | * @param state is the current state of this [[VertexId]] 24 | * @param message is the state received from another vertex in the graph 25 | * @return a [[VertexState]] resulting from a comparison between current state and incoming state 26 | */ 27 | def vertexProgram(id : VertexId, state : VertexState, message : VertexState) : VertexState 28 | 29 | /** 30 | * The message broker sends and receives messages. It will initially receive one message for 31 | * each vertex in the graph. 32 | * @param triplet An edge triplet is an object containing a pair of connected vertex objects and edge object. 33 | * For example (v1)-[r]->(v2) 34 | * @return The message broker returns a key value list, each containing a VertexId and a new message 35 | */ 36 | def messageBroker(triplet :EdgeTriplet[VertexState, ED]) : Iterator[(VertexId, VertexState)] 37 | 38 | /** 39 | * This method is used to reduce or combine the set of all state outcomes produced by a vertexProgram 40 | * for each vertex in each superstep iteration. 
Each vertex has a list of state updates received from 41 | * other vertices in the graph via the messageBroker method. This method is used to reduce the list 42 | * of state updates into a single state for the next superstep iteration. 43 | * @param a A first [[VertexState]] representing a partial state of a vertex. 44 | * @param b A second [[VertexState]] representing a different partial state of a vertex 45 | * @return a merged [[VertexState]] representation from the two [[VertexState]] parameters 46 | */ 47 | def combinerMessage(a: VertexState, b: VertexState) : VertexState 48 | 49 | } 50 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/apply/buildGraph/NewEdgeArr.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.apply.buildGraph 2 | 3 | /** 4 | * Created by linyanshi on 2017/9/1 0001. 5 | */ 6 | case class NewEdgeArr(srcV: String, dstV: String, var srcType: String, dstType: String, init: Boolean = false) 7 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/apply/model/ApplyInfo.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.apply.model 2 | 3 | import org.apache.commons.lang3.StringUtils 4 | 5 | /** 6 | * Created by ASUS-PC on 2017/4/13. 7 | */ 8 | class ApplyInfo(var order_id: String = "", 9 | var contract_no: String = "", 10 | var business_no: String = "", 11 | var term_id: String = "", 12 | var loan_pan: String = "", 13 | var return_pan: String = "", 14 | var empmobile: String = "", 15 | var datatype: Int = 0 //0,1黑,2百 16 | ) extends BaseEntity with Product { 17 | override def toString = s"ApplyInfo(order_id=$order_id, contract_no=$contract_no, business_no=$business_no, term_id=$term_id, loan_pan=$loan_pan, return_pan=$return_pan, empmobile=$empmobile)" 18 | 19 | override def productElement(idx: Int): Any = idx match { 20 | case 0 => order_id 21 | case 1 => contract_no 22 | case 2 => business_no 23 | case 3 => term_id 24 | case 4 => loan_pan 25 | case 5 => return_pan 26 | case 6 => empmobile 27 | case 7 => datatype 28 | case 8 => inDeg 29 | case 9 => outDeg 30 | } 31 | 32 | override def productArity: Int = 10 33 | 34 | override def canEqual(that: Any): Boolean = that.isInstanceOf[ApplyInfo] 35 | 36 | override def equals(other: Any): Boolean = other match { 37 | case that: ApplyInfo => 38 | (that canEqual this) && 39 | order_id == that.order_id && 40 | contract_no == that.contract_no && 41 | business_no == that.business_no && 42 | term_id == that.term_id && 43 | loan_pan == that.loan_pan && 44 | return_pan == that.return_pan && 45 | empmobile == that.empmobile 46 | case _ => false 47 | } 48 | 49 | override def hashCode(): Int = { 50 | val state = Seq(order_id, contract_no, business_no, term_id, loan_pan, return_pan, empmobile) 51 | state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b) 52 | } 53 | 54 | def getKey: String = { 55 | if (StringUtils.isNotEmpty(order_id)) order_id 56 | else if (StringUtils.isNotEmpty(contract_no)) contract_no 57 | else if (StringUtils.isNotEmpty(business_no)) business_no 58 | else if (StringUtils.isNotEmpty(term_id)) term_id 59 | else if (StringUtils.isNotEmpty(loan_pan)) loan_pan 60 | else if (StringUtils.isNotEmpty(return_pan)) return_pan 61 | else empmobile 62 | } 63 | } 64 | -------------------------------------------------------------------------------- 
/apply/src/main/scala/com/lakala/datacenter/apply/model/BaseEntity.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.apply.model 2 | 3 | /** 4 | * Created by ASUS-PC on 2017/4/17. 5 | */ 6 | trait BaseEntity extends Serializable { 7 | var inDeg: Int = 0; 8 | var outDeg: Int = 0; 9 | } 10 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/apply/model/CallHistoryEntity.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.apply.model 2 | 3 | /** 4 | * Created by ASUS-PC on 2017/4/18. 5 | */ 6 | class CallHistoryEntity(var loginname: Long = 0L, var caller_phone: Long = 0L) extends BaseEntity with Serializable with Product { 7 | override def productElement(idx: Int): Any = idx match { 8 | case 0 => loginname 9 | case 1 => caller_phone 10 | 11 | } 12 | 13 | override def productArity: Int = 2 14 | 15 | override def canEqual(that: Any): Boolean = that.isInstanceOf[CallHistoryEntity] 16 | 17 | override def equals(other: Any): Boolean = other match { 18 | case that: CallHistoryEntity => 19 | (that canEqual this) && 20 | loginname == that.loginname && 21 | caller_phone == that.caller_phone 22 | case _ => false 23 | } 24 | 25 | override def hashCode(): Int = { 26 | val state = Seq(loginname, caller_phone) 27 | state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b) 28 | } 29 | 30 | override def toString = s"CallHistoryEntity($loginname, $caller_phone)" 31 | } 32 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/apply/model/EdgeEntity.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.apply.model 2 | 3 | /** 4 | * Created by ASUS-PC on 2017/4/17. 5 | */ 6 | class EdgeEntity(var scrId: Long, val destId: Long, var attr: String) extends Serializable with Product { 7 | override def productElement(idx: Int): Any = idx match { 8 | case 0 => scrId 9 | case 1 => destId 10 | case 2 => attr 11 | } 12 | 13 | override def productArity: Int = 3 14 | 15 | override def canEqual(that: Any): Boolean = that.isInstanceOf[EdgeEntity] 16 | 17 | override def equals(other: Any): Boolean = other match { 18 | case that: EdgeEntity => 19 | (that canEqual this) && 20 | scrId == that.scrId && 21 | destId == that.destId 22 | case _ => false 23 | } 24 | 25 | override def hashCode(): Int = { 26 | val state = Seq(scrId, destId) 27 | state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/apply/model/NDegreeEntity.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.apply.model 2 | 3 | /** 4 | * Created by ASUS-PC on 2017/4/24. 
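 * Per-vertex state carried during the N-degree traversal jobs: an accumulated attribute
 * string, the type flag of the initial vertex, and the current hop (loop) counter.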
5 | */ 6 | case class NDegreeEntity(var attr: String = "", 7 | var initType: Int = 0, 8 | var loop: Int = 0) 9 | extends Serializable { 10 | } 11 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/faund/DatasetTitanic.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.faund 2 | 3 | import java.util 4 | 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.mllib.linalg.Vectors 7 | import org.apache.spark.mllib.regression.LabeledPoint 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.sql.{DataFrame, SQLContext} 10 | 11 | /** 12 | * Created by Administrator on 2017/7/28 0028. 13 | */ 14 | object DatasetTitanic { 15 | def createDF(sqlContext: SQLContext, inputFile: String): DataFrame = { // options 16 | val options = new util.HashMap[String, String] 17 | options.put("header", "true") 18 | options.put("path", inputFile) 19 | options.put("delimiter", ",") 20 | // create dataframe from input file 21 | val df = sqlContext.load("com.databricks.spark.csv", options) 22 | df.printSchema() 23 | df 24 | } 25 | 26 | // create an RDD of Vectors from a DataFrame 27 | def createLabeledPointsRDD(ctx: SparkContext, sqlContext: SQLContext, inputFile: String): RDD[LabeledPoint] = { 28 | val df = createDF(sqlContext, inputFile) 29 | // convert dataframe to an RDD of Vectors 30 | df.map { row => 31 | val survived = row.getString(1).toInt 32 | val arr = new Array[Double](2) 33 | arr(0) = toDouble(row.getString(5)) 34 | arr(1) = toDouble(row.getString(6)) 35 | new LabeledPoint(survived, Vectors.dense(arr)) 36 | } 37 | } 38 | 39 | def toDouble = (value: String) => { 40 | if (value.length == 0) 0.0 else value.toDouble 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/faund/SparkConfUtil.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.faund 2 | 3 | import org.apache.spark.SparkConf 4 | 5 | /** 6 | * Created by Administrator on 2017/7/28 0028. 7 | */ 8 | object SparkConfUtil { 9 | val isLocal = true; 10 | 11 | def setConf(conf: SparkConf): Unit = { 12 | 13 | if (isLocal) { 14 | conf.setMaster("local") 15 | conf.set("spark.broadcast.compress", "false") 16 | conf.set("spark.shuffle.compress", "false") 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/faund/Titanic.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.faund 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import org.apache.spark.mllib.regression.LabeledPoint 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.spark.sql.{DataFrame, SQLContext} 7 | 8 | /** 9 | * Created by Administrator on 2017/7/28 0028. 
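 * Entry point for the Titanic example: loads the input CSV as LabeledPoints, splits the
 * data 70/30 into training and test sets, and runs the RandomForest classification and
 * regression examples from ScalaRandomForest.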
10 | */ 11 | object Titanic { 12 | def main(args: Array[String]) { 13 | if (args.length < 1) { 14 | System.err.println("Usage: Titanic ") 15 | System.exit(1) 16 | } 17 | 18 | val inputFile: String = args(0) 19 | val sparkConf: SparkConf = new SparkConf().setAppName("Titanic") 20 | SparkConfUtil.setConf(sparkConf) 21 | 22 | val sc: SparkContext = new SparkContext(sparkConf) 23 | val sqlContext: SQLContext = new SQLContext(sc) 24 | val results: DataFrame = DatasetTitanic.createDF(sqlContext, inputFile) 25 | 26 | results.printSchema 27 | 28 | val data: RDD[LabeledPoint] = DatasetTitanic.createLabeledPointsRDD(sc, sqlContext, inputFile) 29 | val splits: Array[RDD[LabeledPoint]] = data.randomSplit(Array[Double](0.7, 0.3)) 30 | val trainingData: RDD[LabeledPoint] = splits(0) 31 | val testData: RDD[LabeledPoint] = splits(1) 32 | 33 | System.out.println("\nRunning example of classification using RandomForest\n") 34 | ScalaRandomForest.testClassification(trainingData, testData) 35 | 36 | System.out.println("\nRunning example of regression using RandomForest\n") 37 | ScalaRandomForest.testRegression(trainingData, testData) 38 | 39 | sc.stop 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/jaccard/Jaccard.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.jaccard 2 | 3 | import org.apache.spark.graphx.{EdgeDirection, Graph} 4 | import org.apache.spark.rdd.RDD 5 | 6 | /** 7 | * Created by linyanshi on 2017/9/20 0020. 8 | */ 9 | object Jaccard { 10 | /** 11 | * Return a RDD of (1-id, 2-id, similarity) where 12 | * 1-id < 2-id to avoid duplications 13 | * 14 | * @param graph 15 | * @return 16 | */ 17 | 18 | def jaccardSimilarityAllMobiles(graph: Graph[Int, Int]): RDD[(Long, Long, Double)] = { 19 | val neighbors = graph.collectNeighborIds(EdgeDirection.Either).map(x => (x._1, x._2)) 20 | val combinations = neighbors.cartesian(neighbors) 21 | val SimilarityAll = combinations.map { x => (x._1._1, x._2._1, jaccard(x._1._2.toSet, x._2._2.toSet)) } 22 | val result = SimilarityAll.map(x => (x._3, (x._1, x._2))).sortByKey(false, 1).map(x => (x._2._1, x._2._2, x._1)) 23 | result 24 | } 25 | 26 | /** 27 | * Helper function 28 | * Jaccard 系数定义为A与B交集的大小与A与B并集的大小的比值 29 | * Given two sets, compute its Jaccard similarity and return its result. 30 | * If the union part is zero, then return 0. 31 | * @param a 32 | * @param b 33 | * @tparam A 34 | * @return 35 | */ 36 | def jaccard[A](a: Set[A], b: Set[A]): Double = { 37 | val union: Double = (a ++ b).size 38 | val intersect: Double = a.intersect(b).size 39 | return (if (union == 0) 0.0 else (intersect / union)) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/jaccard/PowerIterationClustering.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.jaccard 2 | 3 | import org.apache.spark.mllib.clustering.PowerIterationClustering 4 | import org.apache.spark.rdd.RDD 5 | 6 | /** 7 | * Created by linyanshi on 2017/9/20 0020. 
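 * Thin wrapper around MLlib's PowerIterationClustering: clusters the (id, id, similarity)
 * triples into k = 3 groups over at most 100 iterations, prints the size of each cluster,
 * and returns the (id, cluster) assignments.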
8 | */ 9 | object PowerIterationClustering { 10 | 11 | /** 12 | * run PIC using Spark's PowerIterationClustering implementation 13 | * @param similarities All pair similarities in the shape of RDD[(selfmobile, caller, similarity)] 14 | * @return Cluster assignment for each patient in the shape of RDD[(mobile, Cluster)] 15 | */ 16 | def runPIC(similarities: RDD[(Long, Long, Double)]): RDD[(Long, Int)] = { 17 | val sc = similarities.sparkContext 18 | 19 | 20 | /** Remove placeholder code below and run Spark's PIC implementation */ 21 | similarities.cache().count() 22 | val pic = new PowerIterationClustering().setK(3).setMaxIterations(100) 23 | val model=pic.run(similarities) 24 | val result = model.assignments.map(a => (a.id,a.cluster)) 25 | val check = result.map(x=>x.swap).groupByKey().map(x=>(x._1,x._2.size)) 26 | 27 | println("PIC: ") 28 | println(check.foreach(println)) 29 | 30 | result 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/louvain/HDFSLouvainRunner.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.louvain 2 | 3 | /** 4 | * Created by chenqingqing on 2017/4/4. 5 | */ 6 | 7 | 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.graphx._ 10 | import scala.Array.canBuildFrom 11 | 12 | /** 13 | * Execute the louvain algorithim and save the vertices and edges in hdfs at each level. 14 | * Can also save locally if in local mode. 15 | * 16 | * See LouvainHarness for algorithm details 17 | */ 18 | class HDFSLouvainRunner(minProgress: Int, progressCounter: Int, outputdir: String) extends LouvainHarness(minProgress: Int, progressCounter: Int) { 19 | 20 | var qValues = Array[(Int, Double)]() 21 | 22 | override def saveLevel(sc: SparkContext, level: Int, q: Double, graph: Graph[VertexState, Double]) = { 23 | graph.vertices.saveAsTextFile(outputdir + "/level_" + level + "_vertices") 24 | graph.edges.saveAsTextFile(outputdir + "/level_" + level + "_edges") 25 | qValues = qValues :+ ((level, q)) 26 | println(s"qValue: $q") 27 | 28 | // overwrite the q values at each level 29 | sc.parallelize(qValues, 1).saveAsTextFile(outputdir + "/qvalues") 30 | } 31 | 32 | override def finalSave(sc: SparkContext, level: Int, q: Double, graph: Graph[VertexState, Double]) = { 33 | graph.vertices.filter(k=>k._1 != k._2.community).sortBy(k=>k._2.community).map { x => x._1 + "," + x._2 }.repartition(10).saveAsTextFile(outputdir) 34 | //graph.edges.saveAsTextFile(outputdir+"/final_edges") 35 | 36 | println(s"qValue: $q") 37 | } 38 | 39 | 40 | } 41 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/louvain/VertexData.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.louvain 2 | 3 | import scala.collection.mutable.HashSet 4 | 5 | /** 6 | * Created by chenqingqing on 2017/4/4. 
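 * Mutable per-vertex bookkeeping for Louvain community detection: the vertex id, its
 * current community id, the weight and set of vertices folded into it, the vertex degree,
 * and the set of vertices in its community.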
7 | */ 8 | class VertexData(val vId: Long, var cId: Long) extends Serializable { 9 | var innerDegree = 0.0 //weight of internal nodes 10 | var innerVertices = new HashSet[Long]() //internal nodes 11 | var degree = 0.0 //degree of the node 12 | var commVertices = new HashSet[Long]() //nodes in the community 13 | } -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/louvain/VertexState.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.louvain 2 | 3 | /** 4 | * Louvain vertex state 5 | * Contains all information needed for louvain community detection 6 | */ 7 | class VertexState extends Serializable { 8 | 9 | var community = -1L //community ID 10 | var communitySigmaTot = 0D //in-degree 11 | var internalWeight = 0D // self edges 12 | var nodeWeight = 0D; //out degree 13 | var changed = false 14 | var q = 0D //modularity value 15 | 16 | override def toString(): String = { 17 | // "{community:"+community+",communitySigmaTot:"+communitySigmaTot+ 18 | // ",internalWeight:"+internalWeight+",nodeWeight:"+nodeWeight+"}" 19 | // s"community:$community,communitySigmaTot:$communitySigmaTot,internalWeight:$internalWeight,nodeWeight:$nodeWeight" 20 | s"community:$community,q:$q" 21 | // community.toString 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/main/CallHistoryPageRank.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.main 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.graphx.{Edge, Graph} 5 | import org.apache.spark.storage.StorageLevel 6 | import org.apache.spark.{SparkConf, SparkContext} 7 | 8 | /** 9 | * Created by linyanshi on 2017/9/19 0019. 10 | */ 11 | object CallHistoryPageRank { 12 | def main(args: Array[String]): Unit = { 13 | 14 | Logger.getLogger("org.apache.spark").setLevel(Level.ERROR) 15 | Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.ERROR) 16 | 17 | val conf = new SparkConf().setAppName("CallHistoryPageRank") 18 | val sc = new SparkContext(conf) 19 | val edgeRdd = sc.textFile(args(0)).mapPartitions(lines => lines.map { line => 20 | // val arr = line.split("\t") 21 | // Edge(arr(0).toLong, arr(1).toLong, 1) 22 | val arr = line.split(",") 23 | Edge(arr(1).toLong, arr(2).toLong, 1) 24 | // Edge(arr(2).toLong, arr(1).toLong, arr(3).toInt) 25 | }) 26 | // val graph = GraphLoader.edgeListFile(sc, args(0), numEdgePartitions = 4) 27 | 28 | val graph = Graph.fromEdges(edgeRdd, 1, edgeStorageLevel = StorageLevel.MEMORY_AND_DISK_SER, vertexStorageLevel = StorageLevel.MEMORY_AND_DISK_SER) 29 | //run PageRank to convergence with tolerance 0.0001 30 | val pageRankGraph = graph.pageRank(0.0001) 31 | 32 | pageRankGraph.vertices.sortBy(x => x._2).mapPartitions(ls => ls.map(k => s"${k._1},${k._2}")).repartition(1).saveAsTextFile(args(1)) 33 | sc.stop() 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/main/Driver.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.main 2 | 3 | import com.lakala.datacenter.grogress.ExportNDegreeData 4 | 5 | /** 6 | * Created by Administrator on 2017/5/4 0004.
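 * Thin command-line entry point that forwards its arguments to ExportNDegreeData.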
7 | */ 8 | 9 | 10 | object Driver extends App { 11 | override def main(args: Array[String]) = { 12 | val enD = new ExportNDegreeData() 13 | enD.main(args) 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/main/LPAAlgorithm.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.main 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.graphx.{Edge, Graph} 5 | import org.apache.spark.graphx.lib.LabelPropagation 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.{SparkConf, SparkContext} 8 | import ml.sparkling.graph.operators.OperatorsDSL._ 9 | 10 | /** 11 | * Created by linyanshi on 2017/9/14 0014. 12 | */ 13 | object LPAAlgorithm { 14 | def main(args: Array[String]): Unit = { 15 | Logger.getLogger("org.apache.spark").setLevel(Level.ERROR); 16 | Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.ERROR); 17 | 18 | val conf = new SparkConf().setAppName("LPAAlgorithm") 19 | val sc = new SparkContext(conf) 20 | val edgeRdd = sc.textFile(args(0)).mapPartitions(lines => lines.map { line => 21 | val arr = line.split(",") 22 | // Edge(arr(1).toLong,arr(2).toLong,arr(3).toInt) 23 | Edge(arr(1).toLong,arr(2).toLong,1) 24 | // Edge(arr(0).toLong,arr(1).toLong,1) 25 | }) 26 | // val graph = GraphLoader.edgeListFile(sc, args(0), numEdgePartitions = 4) 27 | 28 | val graph = Graph.fromEdges(edgeRdd,1,edgeStorageLevel=StorageLevel.MEMORY_AND_DISK_SER,vertexStorageLevel=StorageLevel.MEMORY_AND_DISK_SER) 29 | //参数:图,迭代次数 30 | val lpaGraph = LabelPropagation.run(graph.reverse, args(2).toInt) 31 | val modularity = lpaGraph.modularity() 32 | println(modularity) 33 | lpaGraph.vertices.sortBy(x => x._2).mapPartitions(ls=>ls.map(k=>s"${k._1},${k._2}")).repartition(1).saveAsTextFile(args(1)) 34 | sc.stop() 35 | } 36 | 37 | 38 | } 39 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/main/LPCoarseAlgorithm.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.main 2 | 3 | import ml.sparkling.graph.operators.OperatorsDSL._ 4 | import org.apache.log4j.{Level, Logger} 5 | import org.apache.spark.graphx.{Edge, Graph} 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.{SparkConf, SparkContext} 8 | 9 | /** 10 | * Created by linyanshi on 2017/9/18 0018. 
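 * Coarsens the call graph with sparkling-graph's LPCoarse (label-propagation based
 * coarsening), treating edges as undirected: reads an edge CSV from args(0) and writes
 * (vertex, community) pairs to args(1).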
11 | */ 12 | object LPCoarseAlgorithm { 13 | def main(args: Array[String]): Unit = { 14 | Logger.getLogger("org.apache.spark").setLevel(Level.ERROR) 15 | Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.ERROR) 16 | 17 | val conf = new SparkConf().setAppName("LPCoarseAlgorithm") 18 | val sc = new SparkContext(conf) 19 | val edgeRdd = sc.textFile(args(0)).mapPartitions(lines => lines.map { line => 20 | // val arr = line.split("\t") 21 | // Edge(arr(0).toLong, arr(1).toLong, 1) 22 | val arr = line.split(",") 23 | Edge(arr(1).toLong, arr(2).toLong, 1) 24 | }) 25 | // val graph = GraphLoader.edgeListFile(sc, args(0), numEdgePartitions = 4) 26 | 27 | val graph = Graph.fromEdges(edgeRdd, 1, edgeStorageLevel = StorageLevel.MEMORY_AND_DISK_SER, vertexStorageLevel = StorageLevel.MEMORY_AND_DISK_SER) 28 | //参数:图,迭代次数 29 | val lpaGraph = graph.LPCoarse(treatAsUndirected = true) 30 | // val modularity = lpaGraph.modularity() 31 | // println(modularity) 32 | lpaGraph.vertices.mapPartitions(kcs => kcs.map(kc => (kc._1, kc._2.sortBy(k => k).head))) 33 | .filter(k => k._1 != k._2).sortBy(x => x._2) 34 | /*.mapPartitions(ls => ls.map(k => s"${k._1},${k._2.mkString(",")}"))*/ .repartition(1).saveAsTextFile(args(1)) 35 | sc.stop() 36 | } 37 | 38 | 39 | 40 | 41 | } 42 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/main/LiveCommunityDetection.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.main 2 | 3 | /** 4 | * Created by linyanshi on 2017/9/25 0025. 5 | */ 6 | object LiveCommunityDetection { 7 | def main(args: Array[String]): Unit = { 8 | if (args.length < 1) { 9 | System.err.println( 10 | "Usage: LiveCommunityDetection \n" + 11 | " --numEPart=\n" + 12 | " The number of partitions for the graph's edge RDD.\n" + 13 | " [--tol=]\n" + 14 | " The tolerance allowed at convergence (smaller => more accurate). Default is " + 15 | "0.001.\n" + 16 | " [--output=]\n" + 17 | " If specified, the file to write the ranks to.\n" + 18 | " [--partStrategy=RandomVertexCut | EdgePartition1D | EdgePartition2D | " + 19 | "CanonicalRandomVertexCut]\n" + 20 | " The way edges are assigned to edge partitions. Default is RandomVertexCut.") 21 | System.exit(-1) 22 | } 23 | //file/data/graphx/input/followers.txt -numEPart=100 -tol=0.001 -output=F:\idea_workspace\SparkLearning\outfile -partStrategy=RandomVertexCut 24 | Analytics.main(args.patch(0, List("pagerank"), 0)) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/main/LouvainDGA.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.main 2 | 3 | /** 4 | * Created by linyanshi on 2017/9/14 0014. 
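 * Driver for distributed Louvain community detection: builds a weighted graph from the
 * CSV edge list in args(0) and runs HDFSLouvainRunner(minProgress = args(2),
 * progressCounter = args(3), outputdir = args(1)).
 * A typical launch might look like this (class name and argument order come from this
 * file; the jar name and parameter values are illustrative only):
 * {{{
 *   spark-submit --class com.lakala.datacenter.main.LouvainDGA graphx-analysis-apply.jar \
 *     input_edges.csv /output/louvain 2000 1
 * }}}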
5 | */ 6 | 7 | import com.lakala.datacenter.louvain.{HDFSLouvainRunner, VertexState} 8 | import org.apache.log4j.{Level, Logger} 9 | import org.apache.spark.graphx.{Edge, Graph} 10 | import org.apache.spark.{SparkConf, SparkContext} 11 | 12 | //totalEdgeWeight: 1.56262281191699E15 13 | //# vertices moved: 61,897,309 14 | //# vertices moved: 13,746,461 15 | //# vertices moved: 5,352,635 16 | //# vertices moved: 130,270 17 | //# vertices moved: 82,426 18 | //# vertices moved: 71,584 19 | //# vertices moved: 71,105 20 | //# vertices moved: 70,030 21 | //# vertices moved: 69,937 22 | // 23 | //Completed in 18 cycles 24 | // 25 | //Starting Louvain level 1 26 | //totalEdgeWeight: 2.237895102976331E15 27 | //# vertices moved: 664,919 28 | //# vertices moved: 191,039 29 | //# vertices moved: 12,426 30 | //# vertices moved: 393 31 | //# vertices moved: 7 32 | //# vertices moved: 0 33 | // 34 | //Completed in 12 cycles 35 | //qValue: 0.9182326588364285 36 | // 总的用户数1232060 总的call_phone yong用户数 101825071 37 | //总的社区 275141 大于两个人的总的社区id 77442 关联黑名单 总的社区 1784 38 | 39 | object LouvainDGA { 40 | def main(args: Array[String]) { 41 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 42 | Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF) 43 | val conf = new SparkConf().setAppName("LouvainDGA") 44 | conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 45 | conf.registerKryoClasses(Array(classOf[VertexState])) 46 | // intputpath iterator 1 outputpath 47 | val sc = new SparkContext(conf) 48 | val data = sc.textFile(args(0)) 49 | val edges = data.map(line => { 50 | val items = line.split(",") 51 | // Edge(items(0).toLong, items(1).toLong, items(2).toDouble) 52 | Edge(items(1).toLong, items(2).toLong, items(3).toDouble) 53 | // Edge(items(1).toLong, items(2).toLong, 1d) 54 | }) 55 | val graph = Graph.fromEdges(edges, 1) 56 | val runner = new HDFSLouvainRunner(args(2).toInt, args(3).toInt, args(1)) 57 | runner.run(sc, graph) 58 | sc.stop() 59 | } 60 | } 61 | 62 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/main/PICCallAlgorithm.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.main 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.graphx.{Edge, Graph} 5 | import org.apache.spark.mllib.clustering.PowerIterationClustering 6 | import org.apache.spark.storage.StorageLevel 7 | import org.apache.spark.{SparkConf, SparkContext} 8 | 9 | /** 10 | * Created by linyanshi on 2017/9/20 0020. 
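 *
 * A minimal sketch of MLlib's PowerIterationClustering, the same estimator used below, here
 * fed a plain similarity RDD rather than a PageRank graph. It assumes an existing
 * SparkContext named sc; the ids and similarity values are made-up placeholders.
 * {{{
 * import org.apache.spark.mllib.clustering.PowerIterationClustering
 *
 * val similarities = sc.parallelize(Seq((1L, 2L, 0.9), (2L, 3L, 0.1), (1L, 3L, 0.2)))
 * val model = new PowerIterationClustering().setK(2).setMaxIterations(20).run(similarities)
 * model.assignments.collect().foreach(a => println(s"${a.id} -> ${a.cluster}"))
 * }}}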
11 | * http://blog.sina.com.cn/s/blog_482da2d20102drpt.html 12 | */ 13 | object PICCallAlgorithm { 14 | def main(args: Array[String]) { 15 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 16 | Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF) 17 | val conf = new SparkConf().setAppName("PICCallAlgorithm") 18 | 19 | val sc = new SparkContext(conf) 20 | val data = sc.textFile(args(0), 200) 21 | val edges = data.map(line => { 22 | val items = line.split(",") 23 | Edge(items(1).toLong, items(2).toLong, 1) 24 | // val items = line.split("\t") 25 | // Edge(items(0).toLong, items(1).toLong, 1) 26 | }) 27 | val graph = Graph.fromEdges(edges, 1, edgeStorageLevel = StorageLevel.MEMORY_AND_DISK_SER, vertexStorageLevel = StorageLevel.MEMORY_AND_DISK_SER) 28 | //参数:图,迭代次数 29 | val pageRankGraph = graph.pageRank(0.0001) 30 | val pic = new PowerIterationClustering().setK(args(2).toInt).setMaxIterations(args(3).toInt).setInitializationMode("degree") 31 | val model = pic.run(pageRankGraph) 32 | val result = model.assignments.map(a => (a.id, a.cluster)) 33 | result.mapPartitions(ves => ves.map(ve => s"${ve._1},${ve._2}")).repartition(1).saveAsTextFile(args(1)) 34 | // val landmarks = sc.textFile("/user/guozhijie/explortoutput/louvainout4") 35 | // .mapPartitions(lines=>lines.map(line=>{val arr =line.split(",") 36 | // arr(1).toLong})).distinct().top(args(2).toInt) 37 | // val landmarks = data.map(line => { 38 | // val items = line.split(",") 39 | // items(1).toLong 40 | // }).distinct().top(args(2).toInt) 41 | // val landmarksBR = sc.broadcast(landmarks) 42 | // val shortPathGraph = ShortestPaths.run(graph, landmarksBR.value) 43 | // graph.unpersist() 44 | // 45 | // implicit def iterebleWithAvg[T: Numeric](data: Iterable[T]) = new { 46 | // def avg = average(data) 47 | // } 48 | // 49 | // def average[T](ts: Iterable[T])(implicit num: Numeric[T]) = { 50 | // num.toDouble(ts.sum) / ts.size 51 | // } 52 | // 53 | // shortPathGraph.vertices.map { 54 | // vx => 55 | // (vx._1, { 56 | // val dx = 1.0 / vx._2.map { 57 | // sx => sx._2 58 | // }.seq.avg 59 | // val d = if (dx.isNaN | dx.isNegInfinity | dx.isPosInfinity) 0.0 else dx 60 | // d 61 | // }) 62 | // }.sortBy({ vx => vx._1 }, ascending = true) 63 | // .mapPartitions(rows => rows.filter(k => k._2 > 0d).map(row => s"${row._1},${row._2}")).repartition(1).saveAsTextFile(args(1)) 64 | // val similarities = Jaccard.jaccardSimilarityAllMobiles(graph) 65 | // val centralityGraph: Graph[(Double,Double),Int] = graph.hits(VertexMeasureConfiguration(treatAsUndirected=true)) 66 | // val picLabels = PowerIterationClustering.runPIC(similarities) 67 | // picLabels.mapPartitions(lca => lca.map(l => s"${l._1},${l._2}")).repartition(1).saveAsTextFile(args(1)) 68 | 69 | // val vertexembeddedness = graph.closenessCentrality(VertexMeasureConfiguration(treatAsUndirected = true)) 70 | // vertexembeddedness.vertices.mapPartitions(ves=>ves.map(ve=>s"${ve._1},${ve._2}")).repartition(1).saveAsTextFile(args(1)) 71 | sc.stop() 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/main/PSCANAlgorithm.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.main 2 | 3 | import ml.sparkling.graph.api.operators.measures.VertexMeasureConfiguration 4 | import ml.sparkling.graph.operators.OperatorsDSL._ 5 | import ml.sparkling.graph.operators.algorithms.community.pscan.PSCAN 6 | import org.apache.log4j.{Level, 
Logger} 7 | import org.apache.spark.graphx.{Edge, Graph} 8 | import org.apache.spark.storage.StorageLevel 9 | import org.apache.spark.{SparkConf, SparkContext} 10 | 11 | /** 12 | * Created by linyanshi on 2017/9/18 0018. 13 | */ 14 | object PSCANAlgorithm { 15 | 16 | def main(args: Array[String]): Unit = { 17 | Logger.getLogger("org.apache.spark").setLevel(Level.ERROR) 18 | Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.ERROR) 19 | 20 | val conf = new SparkConf().setAppName("PSCANAlgorithm") 21 | val sc = new SparkContext(conf) 22 | val edgeRdd = sc.textFile(args(0)).mapPartitions(lines => lines.map { line => 23 | // val arr = line.split("\t") 24 | // Edge(arr(0).toLong, arr(1).toLong, 1) 25 | val arr = line.split(",") 26 | // Edge(arr(1).toLong, arr(2).toLong, 1) 27 | Edge(arr(1).toLong, arr(2).toLong, arr(3).toInt) 28 | }) 29 | // val graph = GraphLoader.edgeListFile(sc, args(0), numEdgePartitions = 4) 30 | 31 | val graph = Graph.fromEdges(edgeRdd, 1, edgeStorageLevel = StorageLevel.MEMORY_AND_DISK_SER, vertexStorageLevel = StorageLevel.MEMORY_AND_DISK_SER) 32 | //参数:图,迭代次数 33 | val pscanGraph = PSCAN.computeConnectedComponents(graph, 0.000001) 34 | // val lpaGraph = PSCAN.computeConnectedComponentsUsing(graph, args(2).toInt) 35 | val modularity = pscanGraph.modularity() 36 | 37 | println(modularity) 38 | 39 | 40 | pscanGraph.vertices.filter(k => k._1 != k._2).sortBy(x => x._2).mapPartitions(ls => ls.map(k => s"${k._1},${k._2}")).repartition(1).saveAsTextFile(args(1)) 41 | sc.stop() 42 | } 43 | 44 | 45 | } 46 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/talk/types/City.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.talk.types 2 | 3 | import org.apache.spark.graphx.VertexId 4 | 5 | case class City(name: String, id: VertexId) { 6 | override def toString() = name + " [" + id + "]" 7 | } 8 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/talk/types/Person.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.talk.types 2 | 3 | case class Person(name: String, age: Int) 4 | -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/talk/types/VertexAttribute.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.talk.types 2 | 3 | case class VertexAttribute(cityName: String, distance: Double, path: List[City]) -------------------------------------------------------------------------------- /apply/src/main/scala/com/lakala/datacenter/utils/UtilsToos.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.utils 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import com.google.common.hash.Hashing 6 | import com.lakala.datacenter.common.utils.DateTimeUtils 7 | 8 | import scala.util.matching.Regex 9 | 10 | /** 11 | * Created by ASUS-PC on 2017/4/18. 
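 *
 * A small usage sketch of the helpers below; the phone number is made up:
 * {{{
 * val vid: Long   = UtilsToos.hashId("13800138000")          // deterministic 64-bit id from an MD5 hash
 * val ok: Boolean = UtilsToos.isMobileOrPhone("13800138000")  // true for well-formed mobile/landline numbers
 * }}}
 * Because hashId always maps the same string to the same Long, it is suitable for deriving
 * GraphX vertex ids from strings such as order numbers or phone numbers.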
12 | */ 13 | object UtilsToos { 14 | /** 15 | * 根据字符串生成唯一的hashcode值 16 | * 17 | * @param str 18 | * @return 19 | */ 20 | def hashId(str: String) = { 21 | Hashing.md5().hashString(str, StandardCharsets.UTF_8).asLong() 22 | } 23 | 24 | /** 25 | * 手机号,电话号码验证 26 | * 27 | * @param num 28 | * @return 验证通过返回true 29 | */ 30 | def isMobileOrPhone(num: String): Boolean = { 31 | val pattern = new Regex("^((17[0-9])(14[0-9])|(13[0-9])|(15[^4,\\D])|(18[0,5-9]))\\d{8}$") 32 | val pattern2 = new Regex("(?:(\\(\\+?86\\))(0[0-9]{2,3}\\-?)?([2-9][0-9]{6,7})+(\\-[0-9]{1,4})?)|(?:(86-?)?(0[0-9]{2,3}\\-?)?([2-9][0-9]{6,7})+(\\-[0-9]{1,4})?)") // 验证带区号的 33 | // val pattern2 = new Regex("^[0][1-9]{2,3}-[0-9]{5,10}$") // 验证带区号的 34 | val pattern3 = new Regex("^[1-9]{1}[0-9]{5,8}$") // 验证没有区号的 35 | num match { 36 | case pattern(_*) => { 37 | true 38 | } 39 | case pattern2(_*) => { 40 | true 41 | } 42 | case pattern3(_*) => { 43 | true 44 | } 45 | case _ => { 46 | false 47 | } 48 | } 49 | } 50 | 51 | def jugeInit(dataDt: String, sdt: String, edt: String): Boolean = { 52 | var init = false 53 | try { 54 | init = if (DateTimeUtils.parseDataString(dataDt).getMillis >= DateTimeUtils.parseDataString(sdt).getMillis 55 | && DateTimeUtils.parseDataString(dataDt).getMillis <= DateTimeUtils.parseDataString(edt).getMillis) true 56 | else false 57 | } catch { 58 | case e: Exception => 59 | } 60 | init 61 | } 62 | 63 | def byDateFileterData(line: String, edt: String): Boolean = { 64 | var init = false 65 | try { 66 | val arr = line.split(",") 67 | val dt = if (arr(5).indexOf(".") > 0) arr(5).substring(0, arr(5).indexOf(".")) else arr(5) 68 | init = if (DateTimeUtils.parseDataString(dt).getMillis <= DateTimeUtils.parseDataString(edt).getMillis) true 69 | else false 70 | } catch { 71 | case e: Exception => 72 | } 73 | init 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /apply/src/main/scala/edu/gatech/cse8803/clustering/PowerIterationClustering.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Ting Pan . 3 | */ 4 | 5 | package edu.gatech.cse8803.clustering 6 | 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.mllib.clustering.{PowerIterationClustering => PIC} 9 | import org.apache.spark.mllib.clustering.PowerIterationClustering 10 | 11 | 12 | /** 13 | * Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by 14 | * [[http://www.icml2010.org/papers/387.pdf Lin and Cohen]]. From the abstract: PIC finds a very 15 | * low-dimensional embedding of a dataset using truncated power iteration on a normalized pair-wise 16 | * similarity matrix of the data. 
17 | * 18 | * @see [[http://en.wikipedia.org/wiki/Spectral_clustering Spectral clustering (Wikipedia)]] 19 | */ 20 | 21 | object PowerIterationClustering { 22 | 23 | /** run PIC using Spark's PowerIterationClustering implementation 24 | * 25 | * @input: All pair similarities in the shape of RDD[(patientID1, patientID2, similarity)] 26 | * @return: Cluster assignment for each patient in the shape of RDD[(PatientID, Cluster)] 27 | * 28 | * */ 29 | 30 | def runPIC(similarities: RDD[(Long, Long, Double)]): RDD[(Long, Int)] = { 31 | val sc = similarities.sparkContext 32 | 33 | 34 | /** Remove placeholder code below and run Spark's PIC implementation */ 35 | similarities.cache().count() 36 | val pic = new PowerIterationClustering().setK(3).setMaxIterations(100) 37 | val model=pic.run(similarities) 38 | val result = model.assignments.map(a => (a.id,a.cluster)) 39 | //val check = result.map(x=>x.swap).groupByKey().map(x=>(x._1,x._2.size)) 40 | 41 | //println("PIC: ") 42 | //println(check.foreach(println)) 43 | 44 | result 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /apply/src/main/scala/edu/gatech/cse8803/ioutils/CSVUtils.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Ting Pan . 3 | */ 4 | package edu.gatech.cse8803.ioutils 5 | 6 | import org.apache.spark.sql.SchemaRDD 7 | import org.apache.spark.sql.SQLContext 8 | import com.databricks.spark.csv.CsvContext 9 | 10 | 11 | object CSVUtils { 12 | def loadCSVAsTable(sqlContext: SQLContext, path: String, tableName: String): SchemaRDD = { 13 | val data = sqlContext.csvFile(path) 14 | data.registerTempTable(tableName) 15 | data 16 | } 17 | 18 | def loadCSVAsTable(sqlContext: SQLContext, path: String): SchemaRDD = { 19 | loadCSVAsTable(sqlContext, path, inferTableNameFromPath(path)) 20 | } 21 | 22 | private val pattern = "(\\w+)(\\.csv)?$".r.unanchored 23 | def inferTableNameFromPath(path: String) = path match { 24 | case pattern(filename, extension) => filename 25 | case _ => path 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /apply/src/main/scala/edu/gatech/cse8803/jaccard/Jaccard.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * * 3 | * @author: Ting Pan 4 | **/ 5 | package edu.gatech.cse8803.jaccard 6 | 7 | import edu.gatech.cse8803.model._ 8 | import edu.gatech.cse8803.model.{EdgeProperty, VertexProperty} 9 | import org.apache.spark.graphx._ 10 | import org.apache.spark.rdd.RDD 11 | 12 | object Jaccard { 13 | 14 | def jaccardSimilarityOneVsAll(graph: Graph[VertexProperty, EdgeProperty], patientID: Long): List[Long] = { 15 | /** 16 | * Given a patient ID, compute the Jaccard similarity w.r.t. to all other patients. 17 | * Return a List of patient IDs ordered by the highest to the lowest similarity. 
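     * (Jaccard similarity of two neighbour sets A and B is |A ∩ B| / |A ∪ B|; for example
     * A = {1, 2, 3} and B = {2, 3, 4} give 2 / 4 = 0.5. The jaccard helper at the bottom of
     * this object implements exactly this, returning 0 when the union is empty.)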
18 | * For ties, random order is okay 19 | */ 20 | 21 | 22 | val neighbors = graph.collectNeighborIds(EdgeDirection.Either).map(x => (x._1, x._2.filter(p => p > 1000))).filter(_._1 <= 1000) 23 | val neighbors_wo_patient = neighbors.filter(_._1 != patientID) 24 | val source = neighbors.filter(_._1 == patientID).map(_._2).collect.flatten.toSet 25 | val SimilarityOneVsAll = neighbors_wo_patient.map { case (vid, nbrs) => (vid, jaccard(source, nbrs.toSet)) } 26 | val result = SimilarityOneVsAll.sortBy(_._2, false).map(_._1).take(10).toList 27 | result 28 | } 29 | 30 | def jaccardSimilarityAllPatients(graph: Graph[VertexProperty, EdgeProperty]): RDD[(Long, Long, Double)] = { 31 | /** 32 | * Given a patient, med, diag, lab graph, calculate pairwise similarity between all 33 | *patients. Return a RDD of (patient-1-id, patient-2-id, similarity) where 34 | * patient-1-id < patient-2-id to avoid duplications 35 | */ 36 | val neighbors = graph.collectNeighborIds(EdgeDirection.Either).map(x => (x._1, x._2.filter(p => p > 1000))).filter(_._1 <= 1000) 37 | val combinations = neighbors.cartesian(neighbors).filter { case (a, b) => a._1 < b._1 } 38 | val SimilarityAll = combinations.map { x => (x._1._1, x._2._1, jaccard(x._1._2.toSet, x._2._2.toSet)) } 39 | val result = SimilarityAll.map(x => (x._3, (x._1, x._2))).sortByKey(false, 1).map(x => (x._2._1, x._2._2, x._1)) 40 | result 41 | } 42 | 43 | def jaccard[A](a: Set[A], b: Set[A]): Double = { 44 | /** 45 | * Helper function 46 | * * 47 | * Given two sets, compute its Jaccard similarity and return its result. 48 | * If the union part is zero, then return 0. 49 | */ 50 | 51 | 52 | val union: Double = (a ++ b).size 53 | val intersect: Double = a.intersect(b).size 54 | return (if (union == 0) 0.0 else (intersect / union)) 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /apply/src/main/scala/edu/gatech/cse8803/main/Main.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Ting Pan . 
3 | */ 4 | 5 | package edu.gatech.cse8803.main 6 | 7 | import java.text.SimpleDateFormat 8 | 9 | import edu.gatech.cse8803.ioutils.CSVUtils 10 | import edu.gatech.cse8803.jaccard.Jaccard 11 | import edu.gatech.cse8803.model._ 12 | import edu.gatech.cse8803.randomwalk.RandomWalk 13 | import edu.gatech.cse8803.clustering.PowerIterationClustering 14 | import org.apache.spark.rdd.RDD 15 | 16 | import org.apache.spark.sql.SQLContext 17 | import org.apache.spark.{SparkConf, SparkContext} 18 | import edu.gatech.cse8803.graphconstruct.GraphLoader 19 | 20 | 21 | object Main { 22 | def main(args: Array[String]) { 23 | import org.apache.log4j.Logger 24 | import org.apache.log4j.Level 25 | 26 | Logger.getLogger("org").setLevel(Level.WARN) 27 | Logger.getLogger("akka").setLevel(Level.WARN) 28 | 29 | val sc = createContext 30 | val sqlContext = new SQLContext(sc) 31 | 32 | /** initialize loading of data */ 33 | val (patient, medication, labResult, diagnostic) = loadRddRawData(sqlContext) 34 | val patientGraph = GraphLoader.load(patient, labResult, medication, diagnostic) 35 | 36 | println(Jaccard.jaccardSimilarityOneVsAll(patientGraph, 9)) 37 | println(RandomWalk.randomWalkOneVsAll(patientGraph, 9)) 38 | 39 | val similarities = Jaccard.jaccardSimilarityAllPatients(patientGraph) 40 | 41 | val PICLabels = PowerIterationClustering.runPIC(similarities) 42 | 43 | sc.stop() 44 | } 45 | 46 | def loadRddRawData(sqlContext: SQLContext): (RDD[PatientProperty], RDD[Medication], RDD[LabResult], RDD[Diagnostic]) = { 47 | 48 | val dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssX") 49 | /** test data, must change back!! */ 50 | List("data/PATIENT.csv", "data/LAB.csv", "data/DIAGNOSTIC.csv", "data/MEDICATION.csv") 51 | .foreach(CSVUtils.loadCSVAsTable(sqlContext, _)) 52 | 53 | val patient = sqlContext.sql( // fix this 54 | """ 55 | |SELECT subject_id, sex, dob, dod 56 | |FROM PATIENT 57 | """.stripMargin) 58 | .map(r => PatientProperty(r(0).toString, r(1).toString, r(2).toString, r(3).toString)) 59 | 60 | val labResult = sqlContext.sql( 61 | """ 62 | |SELECT subject_id, date, lab_name, value 63 | |FROM LAB 64 | |WHERE value IS NOT NULL and value <> '' 65 | """.stripMargin) 66 | .map(r => LabResult(r(0).toString, r(1).toString.toLong, r(2).toString, r(3).toString)) 67 | 68 | val diagnostic = sqlContext.sql( 69 | """ 70 | |SELECT subject_id, date, code, sequence 71 | |FROM DIAGNOSTIC 72 | """.stripMargin) 73 | .map(r => Diagnostic(r(0).toString, r(1).toString.toLong, r(2).toString, r(3).toString.toInt)) 74 | 75 | val medication = sqlContext.sql( 76 | """ 77 | |SELECT subject_id, date, med_name 78 | |FROM MEDICATION 79 | """.stripMargin) 80 | .map(r => Medication(r(0).toString, r(1).toString.toLong, r(2).toString)) 81 | 82 | (patient, medication, labResult, diagnostic) 83 | 84 | } 85 | 86 | 87 | def createContext(appName: String, masterUrl: String): SparkContext = { 88 | val conf = new SparkConf().setAppName(appName).setMaster(masterUrl) 89 | new SparkContext(conf) 90 | } 91 | 92 | def createContext(appName: String): SparkContext = createContext(appName, "local") 93 | 94 | def createContext: SparkContext = createContext("CSE 8803 Homework Three Application", "local") 95 | } 96 | -------------------------------------------------------------------------------- /apply/src/main/scala/edu/gatech/cse8803/model/models.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Ting Pan . 
3 | */ 4 | 5 | package edu.gatech.cse8803.model 6 | 7 | case class LabResult(patientID: String, date: Long, labName: String, value: String) 8 | 9 | case class Diagnostic(patientID: String, date: Long, icd9code: String, sequence: Int) 10 | 11 | case class Medication(patientID: String, date: Long, medicine: String) 12 | 13 | abstract class VertexProperty 14 | 15 | case class PatientProperty(patientID: String, sex: String, dob: String, dod: String) extends VertexProperty 16 | 17 | case class LabResultProperty(testName: String) extends VertexProperty 18 | 19 | case class DiagnosticProperty(icd9code: String) extends VertexProperty 20 | 21 | case class MedicationProperty(medicine: String) extends VertexProperty 22 | 23 | abstract class EdgeProperty 24 | 25 | case class SampleEdgeProperty(name: String = "Sample") extends EdgeProperty 26 | 27 | case class PatientLabEdgeProperty(labResult: LabResult) extends EdgeProperty 28 | 29 | case class PatientDiagnosticEdgeProperty(diagnostic: Diagnostic) extends EdgeProperty 30 | 31 | case class PatientMedicationEdgeProperty(medication: Medication) extends EdgeProperty 32 | 33 | -------------------------------------------------------------------------------- /apply/src/main/scala/edu/gatech/cse8803/randomwalk/randomwalk.scala: -------------------------------------------------------------------------------- 1 | package edu.gatech.cse8803.randomwalk 2 | 3 | import edu.gatech.cse8803.model.{PatientProperty, EdgeProperty, VertexProperty} 4 | import org.apache.spark.graphx._ 5 | 6 | object RandomWalk { 7 | 8 | def randomWalkOneVsAll(graph: Graph[VertexProperty, EdgeProperty], patientID: Long, numIter: Int = 100, alpha: Double = 0.15): List[Long] = { 9 | /** 10 | * Given a patient ID, compute the random walk probability w.r.t. to all other patients. 11 | * Return a List of patient IDs ordered by the highest to the lowest similarity. 12 | * For ties, random order is okay 13 | */ 14 | 15 | val patient = graph.vertices.filter(_._2.isInstanceOf[PatientProperty]) 16 | val patient_count = patient.keys.max() 17 | 18 | val personalized = true 19 | val src: VertexId = patientID 20 | 21 | var rankGraph: Graph[Double, Double] = graph 22 | // Associate the degree with each vertex 23 | .outerJoinVertices(graph.outDegrees) { (vid, vdata, deg) => deg.getOrElse(0) } 24 | // Set the weight on the edges based on the degree 25 | .mapTriplets( e => 1.0 / e.srcAttr, TripletFields.Src ) 26 | // Set the vertex attributes to the initial pagerank values 27 | .mapVertices { (id, attr) => 28 | if (!(id != src && personalized)) alpha else 0.0 29 | } 30 | 31 | def delta(u: VertexId, v: VertexId): Double = { if (u == v) 1.0 else 0.0 } 32 | 33 | var iteration = 0 34 | var prevRankGraph: Graph[Double, Double] = null 35 | while (iteration < numIter) { 36 | rankGraph.cache() 37 | 38 | // Compute the outgoing rank contributions of each vertex, perform local preaggregation, and 39 | // do the final aggregation at the receiving vertices. Requires a shuffle for aggregation. 40 | val rankUpdates = rankGraph.aggregateMessages[Double]( 41 | ctx => ctx.sendToDst(ctx.srcAttr * ctx.attr), _ + _, TripletFields.Src) 42 | 43 | // Apply the final rank updates to get the new ranks, using join to preserve ranks of vertices 44 | // that didn't receive a message. Requires a shuffle for broadcasting updated ranks to the 45 | // edge partitions. 
46 | prevRankGraph = rankGraph 47 | // new update rule 48 | //PR[i] = (1 - alpha) * inNbrs[i].map(j => oldPR[j] / outDeg[j]).sum (if i not start node) 49 | //PR[i] = alpha + (1 - alpha) * inNbrs[i].map(j => oldPR[j] / outDeg[j]).sum (if i is start node) 50 | val rPrb = { 51 | (src: VertexId, id: VertexId) => alpha * delta(src, id) 52 | } 53 | rankGraph = rankGraph.joinVertices(rankUpdates) { 54 | (id, oldRank, msgSum) => rPrb(src, id) + (1.0 - alpha) * msgSum 55 | }.cache() 56 | 57 | rankGraph.edges.foreachPartition(x => {}) // also materializes rankGraph.vertices 58 | // logInfo(s"PageRank finished iteration $iteration.") 59 | prevRankGraph.vertices.unpersist(false) 60 | prevRankGraph.edges.unpersist(false) 61 | 62 | /** println("iteration: "+iteration) 63 | println() 64 | println(rankGraph.vertices.filter(_._1<=1000).filter( _._1!=patientID).sortBy(_._2,false).take(15).foreach(println))*/ 65 | iteration += 1 66 | } 67 | 68 | val result = rankGraph.vertices.filter(_._1<=1000).filter( _._1!=patientID).sortBy(_._2,false).map(_._1).take(10).toList 69 | result 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /apply/src/test/scala/CollectionUtil.scala: -------------------------------------------------------------------------------- 1 | import scala.collection.mutable.ArrayBuffer 2 | import scala.reflect.ClassTag 3 | 4 | /** 5 | * Created by liuchen on 2017/8/10. 6 | * Description: 7 | */ 8 | object CollectionUtil { 9 | 10 | /** 11 | * 对具有Traversable[(K, V)]类型的集合添加reduceByKey相关方法 12 | * 13 | * @param collection 14 | * @param kt 15 | * @param vt 16 | * @tparam K 17 | * @tparam V 18 | */ 19 | implicit class CollectionHelper[K, V](collection: ArrayBuffer[(K, V)])(implicit kt: ClassTag[K], vt: ClassTag[V]) { 20 | def reduceByKeyMy(f: (V, V) => V): Traversable[(K, V)] = { 21 | val group: Map[K, ArrayBuffer[(K, V)]] = collection.groupBy(_._1) 22 | group.map(x => x._2.reduce((a, b) => (a._1, f(a._2, b._2)))) 23 | } 24 | 25 | 26 | /** 27 | * reduceByKey的同时,返回被reduce掉的元素的集合 28 | * 29 | * @param f 30 | * @return 31 | */ 32 | def reduceByKeyWithReduced(f: (V, V) => V)(implicit kt: ClassTag[K], vt: ClassTag[V]): (Traversable[(K, V)], Traversable[(K, V)]) = { 33 | val reduced: ArrayBuffer[(K, V)] = ArrayBuffer() 34 | val newSeq = collection.groupBy(_._1).map { 35 | case (_: K, values: Traversable[(K, V)]) => values.reduce((a, b) => { 36 | val newValue: V = f(a._2, b._2) 37 | val reducedValue: V = if (newValue == a._2) b._2 else a._2 38 | val reducedPair: (K, V) = (a._1, reducedValue) 39 | reduced += reducedPair 40 | (a._1, newValue) 41 | }) 42 | } 43 | (newSeq, reduced.toTraversable) 44 | } 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /apply/src/test/scala/CreateApplyData.scala: -------------------------------------------------------------------------------- 1 | import org.apache.commons.lang3.StringUtils 2 | import org.apache.spark.{SparkConf, SparkContext} 3 | import com.lakala.datacenter.utils.UtilsToos._ 4 | import scala.util.Random 5 | 6 | /** 7 | * Created by ASUS-PC on 2017/4/18. 
8 | */ 9 | object CreateApplyData { 10 | 11 | def main(args: Array[String]): Unit = { 12 | val conf = new SparkConf().setMaster("local[2]").setAppName("CreateApplyData") 13 | val sc = new SparkContext(conf) 14 | val callLine = sc.textFile("file:///F:/lakalaFinance_workspaces/applogs/000000_0") 15 | val applyLine = sc.textFile("file:///F:/lakalaFinance_workspaces/applogs/query_result.csv").filter(line => (!line.startsWith("s_c_loan_apply"))) 16 | val call = callLine.mapPartitions { lines => 17 | lines.map { line => 18 | var arr = line.split("\u0001") 19 | (if (StringUtils.isNotBlank(arr(4)) && isMobileOrPhone(arr(4))) arr(4) else "", if (StringUtils.isNotBlank(arr(6)) && isMobileOrPhone(arr(6))) arr(6) else "") 20 | } 21 | } 22 | val list = call.filter(k => StringUtils.isNotBlank(k._1)).map(k => k._1.toLong).union(call.filter(k => StringUtils.isNotBlank(k._2)).map(k => k._2.toLong)).collect().toSet.toList 23 | println("mobil ************************") 24 | list.sorted.foreach(println) 25 | println("mobil ************************") 26 | val ac = sc.broadcast(list) 27 | applyLine.mapPartitions { 28 | val list: List[Long] = ac.value 29 | val seed: Int = list.size 30 | lines => lines.map { 31 | line => 32 | var arr = line.split(",") 33 | val index = getIndex(seed) 34 | val s = if (StringUtils.isBlank(arr(41)) || "null".equals(arr(41).toLowerCase)) "," + list(index) + "," 35 | else if (StringUtils.isNotBlank(arr(41)) && !isMobileOrPhone(arr(41))) "," + list(index) + "," 36 | else "," + arr(41) + "," 37 | s"${arr.slice(0, 41).mkString(",")}$s${arr.slice(42, arr.length).mkString(",")}" 38 | } 39 | }.repartition(1).saveAsTextFile("file:///F:/lakalaFinance_workspaces/applogs2/query_result.csv") 40 | } 41 | 42 | def getIndex(seed: Int): Int = { 43 | val rand = new Random() 44 | rand.nextInt(seed) 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /apply/src/test/scala/CreateApplyData2.scala: -------------------------------------------------------------------------------- 1 | 2 | import org.apache.commons.lang3.StringUtils 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import com.lakala.datacenter.utils.UtilsToos._ 5 | import scala.util.Random 6 | /** 7 | * Created by ASUS-PC on 2017/4/18. 8 | */ 9 | object CreateApplyData2 { 10 | 11 | def main(args: Array[String]): Unit = { 12 | val conf = new SparkConf().setMaster("local[2]").setAppName("CreateApplyData2") 13 | val sc = new SparkContext(conf) 14 | val callLine = sc.textFile("file:///F:/lakalaFinance_workspaces/applogs/000000_0") 15 | val call = callLine.mapPartitions { lines => 16 | lines.map { line => 17 | var arr = line.split("\u0001") 18 | (s"${if (StringUtils.isNotBlank(arr(4)) && isMobileOrPhone(arr(4))) arr(4) else "0"},${if (StringUtils.isNotBlank(arr(6)) && isMobileOrPhone(arr(6))) arr(6) else "0"}") 19 | } 20 | } 21 | call.distinct().repartition(1).saveAsTextFile("file:///F:/lakalaFinance_workspaces/applogs3/query_result.csv") 22 | } 23 | 24 | def getIndex(seed: Int): Int = { 25 | val rand = new Random() 26 | rand.nextInt(seed) 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /apply/src/test/scala/EdgeTuplesTest.scala: -------------------------------------------------------------------------------- 1 | import org.apache.log4j.{Level, Logger} 2 | import org.apache.spark.graphx._ 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import utils.GraphNdegUtil2 5 | 6 | /** 7 | * Created by ASUS-PC on 2017/4/19. 
8 | */ 9 | object EdgeTuplesTest { 10 | def main(args: Array[String]): Unit = { 11 | val conf = new SparkConf().setMaster("local[2]").setAppName("CreateApplyData") 12 | val sc = new SparkContext(conf) 13 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 14 | Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF) 15 | // val orderMobile = GraphLoader.edgeListFile(sc, "file:///F:/lakalaFinance_workspaces/graphx-analysis/apply/data3/part-00003") 16 | val orderMobile = GraphLoader.edgeListFile(sc, "file:///F:/lakalaFinance_workspaces/graphx-analysis/apply/data/friends.txt") 17 | // val orderMobile = sc.textFile("file:///F:/lakalaFinance_workspaces/graphx-analysis/apply/data3/part-00000") 18 | // val edgeTuple = orderMobile.mapPartitions { lines => 19 | // lines.map { line => 20 | // val arr = line.split(",") 21 | // (arr(0).toLong, arr(1).toLong) 22 | // } 23 | // } 24 | 25 | val validGraph = orderMobile.subgraph(k => k.srcId != 0 && k.dstId != 0) 26 | // val choiceRdd = sc.parallelize(Seq(18028726374L, 18692892122L, 13761981426L)) 27 | val choiceRdd = sc.parallelize(Seq(6L)) 28 | 29 | val rss: VertexRDD[Map[Int, Set[VertexId]]] = GraphNdegUtil2.aggNdegreedVertices(validGraph, choiceRdd, 3) 30 | println("00000++++++0000000") 31 | rss.foreach { k => 32 | println(s"${k._1}${k._2.map(kk => k._2.map(kkk => kkk._2.toArray.mkString(",")))}") 33 | } 34 | 35 | // val applyLine = sc.textFile("file:///F:/lakalaFinance_workspaces/applogs/query_result.csv").filter(line => (!line.startsWith("s_c_loan_apply"))) 36 | // val rs = applyLine.mapPartitions { lines => 37 | // lines.map { line => 38 | // val arr = line.split(",") 39 | // val term_id = if (StringUtils.isNotBlank(arr(7)) && !"null".equals(arr(7).toLowerCase)) arr(7) else "OL" 40 | // val return_pan = if (StringUtils.isNotBlank(arr(16)) && !"null".equals(arr(16).toLowerCase)) arr(16) else "0L" 41 | // val empmobile = if (StringUtils.isNotBlank(arr(41)) && !"null".equals(arr(41).toLowerCase)) arr(41) else "0L" 42 | // ((s"${hashId(arr(1))},${hashId(term_id)}"), (s"${hashId(arr(1))},${hashId(return_pan)}"), (s"${hashId(arr(1))},${empmobile}")) 43 | // } 44 | // } 45 | // val edge1: RDD[String] = rs.map(ve => ve._1).filter(k => !k.endsWith("," + hashId("0L"))) 46 | // val edge2: RDD[String] = rs.map(ve => ve._2).filter(k => !k.endsWith("," + hashId("0L"))) 47 | // val edge3: RDD[String] = rs.map(ve => ve._3).filter(k => !k.endsWith(",0L")) 48 | // edge1.union(edge2).union(edge3).repartition(1).saveAsTextFile("file:///F:/lakalaFinance_workspaces/graphx-analysis/apply/data3") 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /apply/src/test/scala/GraphxBSP3.scala: -------------------------------------------------------------------------------- 1 | /* 2 | import org.apache.commons.lang3.StringUtils 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * Created by Administrator on 2017/6/16 0016. 
7 | */ 8 | object GraphxBSP3 { 9 | def main(args: Array[String]): Unit = { 10 | @transient 11 | val conf = new SparkConf().setAppName("GraphxBSP").setMaster("local[4]") 12 | @transient 13 | val sc = new SparkContext(conf) 14 | //orderId,contractNo,termId,loanPan,returnPan,insertTime,recommend,userId, 15 | // deviceId 16 | //certNo,email,company,mobile,compAddr,compPhone,emergencyContactMobile,contactMobile,ipv4,msgphone,telecode 17 | val edgeRDD = sc.textFile("F:\\graphx-analysis\\apply\\bin\\test.csv").mapPartitions(lines => lines.map { line => 18 | val fields = line.split(",") 19 | val kv = if (StringUtils.isNoneEmpty(fields(2))) { 20 | (fields(2), 1) 21 | } else 22 | ("0", 0) 23 | kv 24 | }).reduceByKey(_ + _).filter(_._2 > 2) 25 | edgeRDD.foreach(kv=>println(kv._1+" === "+kv._2)) 26 | 27 | } 28 | 29 | } 30 | */ 31 | -------------------------------------------------------------------------------- /apply/src/test/scala/Median.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by linyanshi on 2017/8/19 0019. 3 | */ 4 | 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | 7 | object Median { 8 | def main(args: Array[String]) { 9 | val conf = new SparkConf().setAppName("Spark Pi") 10 | val spark = new SparkContext(conf) 11 | val data = spark.textFile("data") 12 | /*将数据逻辑划分为10个桶,这里用户可以自行设置桶数量,统计每个桶中落入的数据量*/ 13 | val mappeddata = data.map(num => { 14 | (num.toInt / 1000, num) 15 | }) 16 | 17 | val count: Array[(Int, String)] = mappeddata.reduceByKey((a, b) => { 18 | a + b 19 | }).collect() 20 | 21 | /*根据总的数据量,逐次根据桶序号由低到高依次累加,判断中位数落在哪个桶中,并获取到中位数在桶中的偏移量*/ 22 | val sum_count = count.map(data => { 23 | data._2.toInt 24 | }).sum 25 | 26 | var temp = 0 27 | var index = 0 28 | var mid = sum_count.toInt / 2 29 | for (i <- 0 to 10) { 30 | temp = temp + count(i)._2.toInt 31 | if (temp >= mid) { 32 | index = i 33 | } 34 | } 35 | /*中位数在桶中的偏移量*/ 36 | val offset = temp - mid 37 | /*获取到中位数所在桶中的偏移量为offset的数,也就是中位数*/ 38 | val result = mappeddata.filter(num => num._1 == index).takeOrdered(offset) 39 | println("Median is " + result(offset)) 40 | spark.stop() 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /apply/src/test/scala/NDegreeResult.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.{SparkConf, SparkContext} 2 | import org.apache.spark.graphx._ 3 | 4 | /** 5 | * Created by lys on 2017/4/23. 
6 | */ 7 | object NDegreeResult { 8 | def main(args: Array[String]): Unit = { 9 | val conf = new SparkConf() 10 | conf.setMaster("local[2]") 11 | conf.setAppName("DTWWW") 12 | val sc = new SparkContext(conf); 13 | val edge = List(//边的信息 14 | (1, 2), (1, 3), (2, 3), (3, 4), (3, 5), (3, 6), 15 | (4, 5), (5, 6), (7, 8), (7, 9), (8, 9),(2,11),(6,11),(2,12),(6,12)) 16 | //构建边的rdd 17 | val edgeRdd = sc.parallelize(edge).map(x => { 18 | Edge(x._1.toLong, x._2.toLong, None) 19 | }) 20 | //构建图 顶点Int类型 21 | val g = Graph.fromEdges(edgeRdd, 0) 22 | //可以了解图中“超级节点”的个数和规模,以及所有节点度的分布曲线。 23 | g.degrees.collect.foreach(println(_)) 24 | //使用两次遍历,首先进行初始化的时候将自己的生命值设为2, 25 | // 第一次遍历向邻居节点传播自身带的ID以及生命值为1(2-1)的消息, 26 | // 第二次遍历的时候收到消息的邻居再转发一次,生命值为0, 27 | // 最终汇总统计的时候 只需要对带有消息为0 ID的进行统计即可得到二跳邻居 28 | 29 | 30 | type VMap = Map[VertexId, Int] 31 | 32 | /** 33 | * 节点数据的更新 就是集合的union 34 | */ 35 | def vprog(vid: VertexId, vdata: VMap, message: VMap): Map[VertexId, Int] = addMaps(vdata, message) 36 | 37 | /** 38 | * 发送消息 39 | */ 40 | def sendMsg(e: EdgeTriplet[VMap, _]) = { 41 | //取两个集合的差集 然后将生命值减1 42 | val srcMap:Map[VertexId, Int] = (e.dstAttr.keySet -- e.srcAttr.keySet).map { k => k -> (e.dstAttr(k) - 1) }.toMap 43 | val dstMap:Map[VertexId, Int] = (e.srcAttr.keySet -- e.dstAttr.keySet).map { k => k -> (e.srcAttr(k) - 1) }.toMap 44 | 45 | if (srcMap.size == 0 && dstMap.size == 0) 46 | Iterator.empty 47 | else 48 | Iterator((e.dstId, dstMap), (e.srcId, srcMap)) 49 | } 50 | 51 | /** 52 | * 消息的合并 53 | */ 54 | def addMaps(spmap1: VMap, spmap2: VMap): VMap = 55 | (spmap1.keySet ++ spmap2.keySet).map { 56 | k => k -> math.min(spmap1.getOrElse(k, Int.MaxValue), spmap2.getOrElse(k, Int.MaxValue)) 57 | }.toMap 58 | 59 | val two = 2 //这里是二跳邻居 所以只需要定义为2即可 60 | val newG = g.mapVertices((vid, _) => Map[VertexId, Int](vid -> two)) 61 | .pregel(Map[VertexId, Int](), two, EdgeDirection.Out)(vprog, sendMsg, addMaps) 62 | 63 | //可以看一下二次遍历之后各个顶点的数据: 64 | newG.vertices.collect().foreach(println(_)) 65 | // (4,Map(5 -> 1, 1 -> 0, 6 -> 0, 2 -> 0, 3 -> 1, 4 -> 2)) 66 | // (6,Map(5 -> 1, 1 -> 0, 6 -> 2, 2 -> 0, 3 -> 1, 4 -> 0)) 67 | // (8,Map(8 -> 2, 7 -> 1, 9 -> 1)) 68 | // (2,Map(5 -> 0, 1 -> 1, 6 -> 0, 2 -> 2, 3 -> 1, 4 -> 0)) 69 | // (1,Map(5 -> 0, 1 -> 2, 6 -> 0, 2 -> 1, 3 -> 1, 4 -> 0)) 70 | // (3,Map(5 -> 1, 1 -> 1, 6 -> 1, 2 -> 1, 3 -> 2, 4 -> 1)) 71 | // (7,Map(7 -> 2, 8 -> 1, 9 -> 1)) 72 | // (9,Map(9 -> 2, 7 -> 1, 8 -> 1)) 73 | // (5,Map(5 -> 2, 1 -> 0, 6 -> 1, 2 -> 0, 3 -> 1, 4 -> 1)) 74 | // Map中的key表示周边的顶点id,其value就是对应顶点id的生命值,所以我们现在对该rdd再做一次mapValues处理即可得到最后的二跳邻居 75 | //过滤得到二跳邻居 就是value=0 的顶点 76 | val twoJumpFirends = newG.vertices 77 | .mapValues(_.filter(_._2 == 0).keys) 78 | 79 | twoJumpFirends.collect().foreach(println(_)) 80 | // (4,Set(1, 6, 2)) 81 | // (6,Set(1, 2, 4)) 82 | // (8,Set()) 83 | // (2,Set(5, 6, 4)) 84 | // (1,Set(5, 6, 4)) 85 | // (3,Set()) 86 | // (7,Set()) 87 | // (9,Set()) 88 | // (5,Set(1, 2)) 89 | 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /apply/src/test/scala/NumOnce.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by linyanshi on 2017/8/19 0019. 
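 *
 * The XOR identity relied on below: x ^ x == 0 and x ^ 0 == x, so XOR-ing a whole list in
 * which every value except one occurs an even number of times leaves exactly that value.
 * A one-line illustration with arbitrary numbers:
 * {{{
 * Seq(3, 5, 3, 7, 7).reduce(_ ^ _)   // == 5
 * }}}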
3 | */ 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | import org.apache.spark.SparkContext._ 6 | 7 | object NumOnce { 8 | //利用异或运算将列表中的所有ID异或,之后得到的值即为所求ID。先将每个分区的数据 9 | //异或,然后将结果进行异或运算。 10 | def computeOneNum(args:Array[String]) { 11 | val conf = new SparkConf().setAppName("NumOnce").setMaster("local[1]") 12 | val spark = new SparkContext(conf) 13 | val data = spark.textFile("data") 14 | /*每个分区分别对数据进行异或运算,最后在reduceByKey阶段,将各分区异或运算的结果再做异或运算合并。 15 | 偶数次出现的数字,异或运算之后为0,奇数次出现的数字,异或后为数字本身*/ 16 | val result = data.mapPartitions(iter => { 17 | var temp = iter.next().toInt 18 | while(iter.hasNext) { 19 | temp = temp^(iter.next()).toInt 20 | } 21 | Seq((1, temp)).iterator 22 | }).reduceByKey(_^_).collect() 23 | println("num appear once is: "+result(0)) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /apply/src/test/scala/ParsesTest.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.{SparkConf, SparkContext} 2 | import org.apache.spark.sql.SQLContext 3 | 4 | /** 5 | * Created by Administrator on 2017/8/4 0004. 6 | */ 7 | object ParsesTest { 8 | // case class Data(index: String, title: String, content: String) 9 | // def main(args: Array[String]): Unit = { 10 | // val conf = new SparkConf().setAppName("WordCount").setMaster("local") 11 | // val sc = new SparkContext(conf) 12 | // val input = sc.textFile("F:\\out\\output") 13 | // //wholeTextFiles读出来是一个RDD(String,String) 14 | // val result = input.map{line=> 15 | // val reader = new CSVReader(new StringReader(line)); 16 | // reader.readAll().map(x => Data(x(0), x(1), x(2))) 17 | // } 18 | // for(res <- result){ 19 | // println(res) 20 | // } 21 | // } 22 | def main(args: Array[String]): Unit = { 23 | val conf = new SparkConf().setAppName("ParsesTest").setMaster("local") 24 | val sc = new SparkContext(conf) 25 | val sqlContext = new SQLContext(sc) 26 | val df = sqlContext.load("com.databricks.spark.csv", Map("path" -> "F:\\out\\output\\*", "header" -> "true")) 27 | df.select("index", "title").foreach(row=>println(row.get(0))) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /apply/src/test/scala/TestCSV.scala: -------------------------------------------------------------------------------- 1 | import com.lakala.datacenter.core.utils.UtilsToos 2 | 3 | import scala.collection.mutable.ArrayBuffer 4 | import scala.util.matching.Regex 5 | 6 | /** 7 | * Created by Administrator on 2017/8/4 0004. 
8 | */ 9 | object TestCSV { 10 | 11 | case class Data(index: String, title: String, content: String) 12 | 13 | val arr = Array(4) 14 | 15 | def main(args: Array[String]) { 16 | val value ="1472100411047" 17 | val pattern = new Regex("[0-9]{1,}") 18 | if(pattern.pattern.matcher(value).matches()) 19 | println(value.toLong) 20 | } 21 | 22 | private def splitSpecificDelimiterData(line: String): String = { 23 | val context = new StringBuffer() 24 | val haveSplitAtt = line.split(",") 25 | 26 | val oneSplitAtt = haveSplitAtt(1).split("\\|") 27 | for (i <- 0 until (oneSplitAtt.length)) { 28 | if (arr(0) == 4) { 29 | val secondSplitAtt = haveSplitAtt(3).split("\\|") 30 | for (j <- 0 until (secondSplitAtt.length)) { 31 | if (j == secondSplitAtt.length - 1) 32 | context.append(s"${haveSplitAtt(0)},${oneSplitAtt(i)},${haveSplitAtt(2)},${secondSplitAtt(j)},${haveSplitAtt(haveSplitAtt.size - 1)}") 33 | else 34 | context.append(s"${haveSplitAtt(0)},${oneSplitAtt(i)},${haveSplitAtt(2)},${secondSplitAtt(j)},${haveSplitAtt(haveSplitAtt.size - 1)}\n") 35 | } 36 | } else { 37 | if (i == oneSplitAtt.length - 1) 38 | context.append(s"${haveSplitAtt(0)},${oneSplitAtt(i)},${haveSplitAtt(haveSplitAtt.size - 1)}") 39 | else 40 | context.append(s"${haveSplitAtt(0)},${oneSplitAtt(i)},${haveSplitAtt(haveSplitAtt.size - 1)}\n") 41 | } 42 | } 43 | context.toString 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /apply/src/test/scala/TestRunGraphx.scala: -------------------------------------------------------------------------------- 1 | import com.lakala.datacenter.core.utils.UtilsToos.hashId 2 | import org.apache.spark.graphx.Edge 3 | 4 | import scala.collection.mutable.ListBuffer 5 | 6 | /** 7 | * Created by ASUS-PC on 2017/4/12. 8 | */ 9 | object TestRunGraphx { 10 | def main(args: Array[String]): Unit = { 11 | // RunLoadApplyGraphx2.main(Array()) 12 | // com.lakala.datacenter.main.Driver.main(args) 13 | // val s ="15397661996->XNW28459058720408576" 14 | // val ss = "13666199888->XNW28459058720408576" 15 | // s.substring(0,s.indexOf("->")) 16 | // println(s.substring(0,s.indexOf("->"))+"##"+ss.substring(0,ss.indexOf("->"))) 17 | val arry= args(0).split(",") 18 | val edge =new EdgeArr("001","4334","7","0") 19 | if(judgSendMsg(arry,edge)) println("=========") 20 | } 21 | def judgSendMsg(sendType: Array[String], edge: EdgeArr): Boolean = { 22 | var flag = false 23 | for (stype <- sendType) if (edge.srcType.equals(stype)) flag = true 24 | flag 25 | } 26 | 27 | // var messages = g.mapReduceTriplets(sendMsg,mergeMsg); 28 | // print("messages:"+messages.take(10).mkString("\n")) 29 | // var activeMessages = messages.count(); 30 | // //LOAD 31 | // var prevG:Graph[VD,ED] = null 32 | // var i = 0; 33 | // while(activeMessages > 0 && i < maxIterations){ 34 | // //③Receive the messages.Vertices that didn‘t get any message do not appear in newVerts. 35 | // //内联操作,返回的结果是VertexRDD,可以参看后面的调试信息 36 | // val newVerts = g.vertices.innerJoin(messages)(vprog).cache(); 37 | // print("newVerts:"+newVerts.take(10).mkString("\n")) 38 | // //④update the graph with the new vertices. 
39 | // prevG = g;//先把旧的graph备份,以利于后面的graph更新和unpersist掉旧的graph 40 | //      //④外联操作,返回整个更新的graph 41 | // g = g.outerJoinVertices(newVerts){(vid,old,newOpt) => newOpt.getOrElse(old)}//getOrElse方法,意味,如果newOpt存在,返回newOpt,不存在返回old 42 | // print(g.vertices.take(10).mkString("\n")) 43 | // g.cache();//新的graph cache起来,下一次迭代使用 44 | // 45 | // val oldMessages = messages;//备份,同prevG = g操作一样 46 | // //Send new messages.Vertices that didn‘t get any message do not appear in newVerts.so 47 | // //don‘t send messages.We must cache messages.so it can be materialized on the next line. 48 | // //allowing us to uncache the previous iteration. 49 | //     //⑤下一次迭代要发送的新的messages,先cache起来 50 | // messages = g.mapReduceTriplets(sendMsg,mergeMsg,Some((newVerts,activeDirection))).cache() 51 | // print("下一次迭代要发送的messages:"+messages.take(10).mkString("\n")) 52 | // activeMessages = messages.count();//⑥ 53 | // print("下一次迭代要发送的messages的个数:"+ activeMessages)//如果activeMessages==0,迭代结束 54 | // logInfo("Pregel finished iteration" + i); 55 | //     //原来,旧的message和graph不可用了,unpersist掉 56 | // oldMessages.unpersist(blocking= false); 57 | // newVerts.unpersist(blocking=false)//unpersist之后,就不可用了 58 | // prevG.unpersistVertices(blocking=false) 59 | // prevG.edges.unpersist(blocking=false) 60 | // i += 1; 61 | // } 62 | // g//返回最后的graph 63 | //} 64 | // 65 | //} 66 | 67 | 68 | 69 | // val conf = if (ctx.isLocals) new Configuration else ctx.getSparkContext.hadoopConfiguration 70 | // val hdfsPath: String = hdfsMasterPath + path 71 | // rdd.saveAsTextFile(hdfsPath) 72 | // hiveCT.sql(s"ALTER TABLE $tableName DROP PARTITION(execute_dt='$date', project_id='$project')") 73 | // hiveCT.sql(s"ALTER TABLE $tableName SET FILEFORMAT TEXTFILE") 74 | // hiveCT.sql(s"LOAD DATA INPATH '$hdfsPath/part-*' OVERWRITE INTO TABLE $tableName PARTITION (execute_dt='$date', project_id='$project')") 75 | // hiveCT.sql(s"ALTER TABLE $tableName SET FILEFORMAT RCFILE") 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | } 86 | -------------------------------------------------------------------------------- /apply/src/test/scala/TrustRank.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.Logging 2 | import org.apache.spark.graphx._ 3 | 4 | import scala.reflect.ClassTag 5 | import scala.util.Random 6 | 7 | /** 8 | * Created by linyanshi on 2017/9/19 0019. 
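 *
 * The per-vertex update performed in each iteration below is
 * {{{
 * rank(i) = (1 - resetProb) * sum over in-neighbours j of ( rank(j) / outDegree(j) ) + resetProb * score(i)
 * }}}
 * i.e. PageRank with the uniform reset term replaced by a per-vertex trust score (generated
 * at random here, as a stand-in for scores propagated from a trusted seed set).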
9 | */ 10 | object TrustRank extends Logging { 11 | 12 | /* 13 | * VD : (double, double) denotes rank and score 14 | * ED : double , not used 15 | */ 16 | def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], numIter: Int): Long = { 17 | val resetProb: Double = 0.15 18 | val resetRank: Double = 0.15 19 | 20 | def resetScore: Double = Random.nextDouble() 21 | 22 | 23 | var rankGraph: Graph[Double, Double] = graph 24 | .outerJoinVertices(graph.outDegrees) { (vid, vd, deg) => deg.getOrElse(0) } 25 | .mapTriplets(e => 1.0 / e.srcAttr, TripletFields.Src) 26 | .mapVertices((id, attr) => resetRank) 27 | 28 | val scoreGraph: Graph[Double, _] = graph.mapVertices((id, attr) => resetScore).cache() 29 | 30 | var iteration = 0 31 | 32 | val start_ms = System.currentTimeMillis() 33 | println("Start time : " + start_ms) 34 | 35 | while (iteration < numIter) { 36 | val rankUpdates = rankGraph.aggregateMessages[Double]( 37 | ctx => ctx.sendToDst(ctx.srcAttr * ctx.attr), 38 | _ + _, 39 | TripletFields.Src 40 | ) 41 | 42 | // update rank and apply 43 | rankGraph = rankGraph.joinVertices(rankUpdates) { 44 | (id, old_vd, msgSum) => (1.0 - resetProb) * msgSum 45 | }.joinVertices(scoreGraph.vertices) { 46 | (id, rank, score) => (rank + resetProb * score) 47 | } 48 | 49 | rankGraph.vertices.count() // materialize rank graph 50 | logInfo(s"TrustRank finished iteration $iteration.") 51 | 52 | iteration += 1 53 | 54 | } 55 | 56 | 57 | var end_ms = System.currentTimeMillis() 58 | println("End time : " + end_ms) 59 | 60 | println("Cost : " + (end_ms - start_ms)) 61 | 62 | end_ms - start_ms 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /apply/src/test/scala/UDF_test.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.sql.hive.HiveContext 2 | import org.apache.spark.{SparkConf, SparkContext} 3 | 4 | /** 5 | * Created by linyanshi on 2017/9/14 0014. 
6 | */ 7 | object UDF_test { 8 | def main(args: Array[String]): Unit = { 9 | 10 | val conf = new SparkConf() 11 | implicit val sc = new SparkContext(conf) 12 | implicit val sqlContext = new HiveContext(sc) 13 | 14 | import sqlContext.implicits._ 15 | 16 | val data = sc.parallelize(Seq(("a", 1), ("bb", 5), ("cccc", 10), ("dddddd", 15))).toDF("a", "b") 17 | data.registerTempTable("data") 18 | 19 | 20 | { 21 | //函数体采用原生类型(非Column类型),使用udf包装函数体,将函数体注册到sqlContext.udf 22 | import org.apache.spark.sql.functions._ 23 | 24 | //函数体 25 | val filter_length_f = (str: String, _length: Int) => { 26 | str.length > _length; 27 | } 28 | 29 | //注册函数体到当前sqlContext,注意,注册到sqlContext的函数体,参数不能为Column 30 | //注册后,可以在以下地方使用:1、df.selectExpr 2、df.filter ,3、将该df注册为temptable,之后在sql中使用 31 | sqlContext.udf.register("filter_length", filter_length_f) 32 | 33 | val filter_length = udf(filter_length_f) //为方便使用Column,我们对函数体进行包装,包装后的输入参数为Column 34 | 35 | data.select($"*", filter_length($"a", lit(2))).show //使用udf包装过的,必须传入Column,注意 lit(2) 36 | data.selectExpr("*", " filter_length(a,2) as ax").show //select 若写表达式调用函数,则需要使用selectExpr 37 | 38 | data.filter(filter_length($"a", lit(2))).show //同select 39 | data.filter("filter_length(a,2)").show //filter调用表达式,可以直接使用df.filter函数, 40 | 41 | sqlContext.sql("select *,filter_length(a,2) from data").show 42 | sqlContext.sql("select *,filter_length(a,2) from data where filter_length(a,2)").show 43 | } 44 | { 45 | //函数体使用Column类型,无法注册到sqlContext.udf 46 | //使用udf包装后,每列都必须输入column,能否我们自己定义呢,比如一个参数是Column,一个是其他类型 47 | import org.apache.spark.sql.Column 48 | import org.apache.spark.sql.functions._ 49 | 50 | val filter_length_f2 = (str: Column, _length: Int) => { 51 | length(str) > _length 52 | } 53 | sqlContext.udf.register("filter_length", filter_length_f2) //todo:不好意思,这里注册不了,注册到sqlContext.udf的函数,入参不支持Column类型 54 | 55 | data.select($"*", filter_length_f2($"a", 2)).show //不用udf包装,我们就可以完全自定义,这时 length 就可以传入整型了 56 | data.selectExpr("*", " filter_length_f2(a,2) as ax").show //todo:不好意思,这里用不了了, 57 | 58 | data.filter(filter_length_f2($"a", 2)).show //同select 59 | data.filter("filter_length(a,2)").show //todo:不好意思,这里用不了了 60 | 61 | } 62 | //最后,我们写一个相对通用的吧 63 | { 64 | //定义两个函数体,入参一个使用column类型,一个使用原生类型,将原生类型函数注册到sqlContext.udf 65 | 66 | import org.apache.spark.sql.Column 67 | import org.apache.spark.sql.functions._ 68 | 69 | //函数体 70 | val filter_length_f = (str: String, _length: Int) => { 71 | str.length > _length; 72 | } 73 | //主函数,下面df.select df.filter 等中使用 74 | val filter_length = (str: Column, _length: Int) => { 75 | length(str) > _length 76 | } 77 | //注册函数体到当前sqlContext,注意,注册到sqlContext的函数体,参数不能为Column 78 | //注册后,可以在以下地方使用:1、df.selectExpr 2、df.filter ,3、将该df注册为temptable,之后在sql中使用 79 | sqlContext.udf.register("filter_length", filter_length_f) 80 | 81 | //这里我们不使用udf了,直接使用自己定义的支持Column的函数 82 | //val filter_length = udf(filter_length_f) //为方便使用Column,我们对函数体进行包装,包装后的输入参数为Column 83 | 84 | data.select($"*", filter_length($"a", 2)).show //使用udf包装过的,必须传入Column,注意 lit(2) 85 | data.selectExpr("*", " filter_length(a,2) as ax").show //select 若写表达式调用函数,则需要使用selectExpr 86 | 87 | data.filter(filter_length($"a", 2)).show //同select 88 | data.filter("filter_length(a,2)").show //filter调用表达式,可以直接使用df.filter函数, 89 | 90 | sqlContext.sql("select *,filter_length(a,2) from data").show 91 | sqlContext.sql("select *,filter_length(a,2) from data where filter_length(a,2)").show 92 | } 93 | 94 | 95 | } 96 | } 97 | -------------------------------------------------------------------------------- 
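A condensed sketch of the point UDF_test.scala (above) demonstrates: a function over plain values can be registered with sqlContext.udf.register and then called from SQL strings, while a Column-based function cannot be registered but composes directly in the DataFrame API. The sketch assumes the same Spark 1.x HiveContext named sqlContext and the DataFrame named data defined in that file; "longer_than" and the column name "a" follow its example.

import org.apache.spark.sql.Column
import org.apache.spark.sql.functions._

// plain-typed body: registrable, usable inside SQL / selectExpr / filter strings
val longerThan = (s: String, n: Int) => s.length > n
sqlContext.udf.register("longer_than", longerThan)
sqlContext.sql("SELECT *, longer_than(a, 2) FROM data").show()

// Column-typed body: not registrable, but usable directly with DataFrame operations
val longerThanCol = (c: Column, n: Int) => length(c) > n
data.filter(longerThanCol(col("a"), 2)).show()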
/apply/src/test/scala/entity/CallEntity.scala: -------------------------------------------------------------------------------- 1 | package entity 2 | 3 | import scala.collection.mutable.ListBuffer 4 | 5 | /** 6 | * Created by ASUS-PC on 2017/4/19. 7 | */ 8 | case class CallEntity(var totalRounds: Int = 0, var propertyList: ListBuffer[String] = ListBuffer()) extends Serializable with Product { 9 | override def productElement(idx: Int): Any = idx match { 10 | case 0 => totalRounds 11 | case 1 => propertyList 12 | } 13 | 14 | override def productArity: Int = 2 15 | 16 | override def canEqual(that: Any): Boolean = that.isInstanceOf[CallEntity] 17 | 18 | override def toString = s"CallEntity($totalRounds, ${propertyList.toArray.mkString(",")})" 19 | } 20 | -------------------------------------------------------------------------------- /apply/src/test/scala/entity/CallVertex.scala: -------------------------------------------------------------------------------- 1 | package entity 2 | 3 | import scala.reflect.ClassTag 4 | 5 | /** 6 | * Created by ASUS-PC on 2017/4/20. 7 | */ 8 | case class CallVertex[VD: ClassTag](var oldAttr: VD = null, 9 | var newAttr: VD = null, 10 | var init: Boolean = false, 11 | var loop: Int = 0) 12 | extends Serializable { 13 | } 14 | -------------------------------------------------------------------------------- /apply/src/test/scala/entity/TwoDegree.scala: -------------------------------------------------------------------------------- 1 | package entity 2 | 3 | /** 4 | * Created by ASUS-PC on 2017/4/24. 5 | */ 6 | case class TwoDegree (var attr:String ="", 7 | var loop: Int = 0) 8 | extends Serializable { 9 | } 10 | -------------------------------------------------------------------------------- /apply/src/test/scala/utils/CollectionUtil.scala: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import scala.collection.mutable.ArrayBuffer 4 | import scala.reflect.ClassTag 5 | 6 | /** 7 | * Created by ASUS-PC on 2017/4/19. 
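 *
 * A tiny usage sketch of the implicit class below; the pairs are arbitrary sample data:
 * {{{
 * import utils.CollectionUtil._
 *
 * val merged = Seq(("a", 1), ("b", 2), ("a", 3)).reduceByKey(_ + _)
 * // yields ("a", 4) and ("b", 2), in some order
 * }}}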
8 | */ 9 | 10 | object CollectionUtil { 11 | 12 | /** 13 | * 对具有Traversable[(K, V)]类型的集合添加reduceByKey相关方法 14 | * 15 | * @param collection 16 | * @param kt 17 | * @param vt 18 | * @tparam K 19 | * @tparam V 20 | */ 21 | implicit class CollectionHelper[K, V](collection: Traversable[(K, V)])(implicit kt: ClassTag[K], vt: ClassTag[V]) { 22 | def reduceByKey(f: (V, V) => V): Traversable[(K, V)] = { 23 | collection.groupBy(_._1).map { case (_: K, values: Traversable[(K, V)]) => values.reduce((a, b) => (a._1, f(a._2, b._2))) }} 24 | 25 | /** 26 | * reduceByKey的同时,返回被reduce掉的元素的集合 27 | * 28 | * @param f 29 | * @return 30 | */ 31 | def reduceByKeyWithReduced(f: (V, V) => V)(implicit kt: ClassTag[K], vt: ClassTag[V]): (Traversable[(K, V)], Traversable[(K, V)]) = { 32 | val reduced: ArrayBuffer[(K, V)] = ArrayBuffer() 33 | val newSeq = collection.groupBy(_._1).map { 34 | case (_: K, values: Traversable[(K, V)]) => values.reduce((a, b) => { 35 | val newValue: V = f(a._2, b._2) 36 | val reducedValue: V = if (newValue == a._2) b._2 else a._2 37 | val reducedPair: (K, V) = (a._1, reducedValue) 38 | reduced += reducedPair 39 | (a._1, newValue) 40 | }) 41 | } 42 | (newSeq, reduced.toTraversable) 43 | } 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /common/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | com.lakala.datacenter 7 | graphx-analysis 8 | 1.0.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | graphx-analysis-common 13 | 14 | -------------------------------------------------------------------------------- /common/src/main/resources/css/style.css: -------------------------------------------------------------------------------- 1 | graph { 2 | fill-color: white; 3 | } 4 | node { 5 | size: 65; 6 | fill-color: #CCCCCC, #AAAAAA; 7 | fill-mode: gradient-radial; 8 | text-offset: 0, 0; 9 | stroke-mode: plain; 10 | stroke-color: #333333; 11 | } 12 | node:clicked { 13 | fill-color: #2277FF, #88AAFF; 14 | fill-mode: gradient-radial; 15 | size: 100; 16 | text-size:18; 17 | text-offset: 0, 0; 18 | } 19 | edge { 20 | text-alignment: along; 21 | } 22 | -------------------------------------------------------------------------------- /common/src/test/data/cities_edges.txt: -------------------------------------------------------------------------------- 1 | 1 2 75 2 | 1 4 140 3 | 1 8 118 4 | 2 3 71 5 | 3 4 151 6 | 4 5 99 7 | 4 6 80 8 | 5 13 211 9 | 6 7 97 10 | 6 12 146 11 | 7 13 101 12 | 7 12 138 13 | 8 9 111 14 | 9 10 70 15 | 10 11 75 16 | 11 12 120 17 | 13 14 90 -------------------------------------------------------------------------------- /common/src/test/data/cities_vertices.txt: -------------------------------------------------------------------------------- 1 | 1 Arad 2 | 2 Zerind 3 | 3 Oradea 4 | 4 Sibiu 5 | 5 Fagaras 6 | 6 RimnicuVilcea 7 | 7 Pitesti 8 | 8 Timisoara 9 | 9 Lugoj 10 | 10 Mehadia 11 | 11 Drobeta 12 | 12 Craiova 13 | 13 Bucharest 14 | 14 Giurgiu -------------------------------------------------------------------------------- /common/src/test/data/likeness_edges.txt: -------------------------------------------------------------------------------- 1 | 1 2 likes 2 | 1 4 follows 3 | 1 6 follows 4 | 1 6 likes 5 | 2 1 follows 6 | 2 5 likes 7 | 2 6 likes 8 | 3 1 follows 9 | 3 4 likes 10 | 4 2 likes 11 | 4 3 follows 12 | 5 3 likes 13 | 6 1 follows 14 | 6 4 likes -------------------------------------------------------------------------------- /common/src/test/data/maxvalue_edges.txt: 
-------------------------------------------------------------------------------- 1 | 1 2 2 | 2 1 3 | 2 4 4 | 3 2 5 | 3 4 6 | 4 3 -------------------------------------------------------------------------------- /common/src/test/data/maxvalue_vertices.txt: -------------------------------------------------------------------------------- 1 | 1 3 2 | 2 6 3 | 3 2 4 | 4 1 -------------------------------------------------------------------------------- /common/src/test/data/papers_edges.txt: -------------------------------------------------------------------------------- 1 | 1 2 2 | 1 4 3 | 1 6 4 | 2 1 5 | 2 6 6 | 3 1 7 | 3 4 8 | 4 2 9 | 4 5 10 | 5 2 11 | 5 3 12 | 6 1 13 | 6 4 -------------------------------------------------------------------------------- /common/src/test/data/people_vertices.txt: -------------------------------------------------------------------------------- 1 | #ID NAME AGE 2 | 1 tom 34 3 | 2 chiara 51 4 | 3 22 5 | 4 marco 28 6 | 5 lucia 40 7 | 6 meria 32 8 | 7 tommy 30 9 | 8 giulio 45 10 | 9 ada 33 -------------------------------------------------------------------------------- /common/src/test/data/relationships_edges.txt: -------------------------------------------------------------------------------- 1 | 1 4 2 | 1 6 3 | 3 4 4 | 3 5 5 | 4 3 6 | 5 2 7 | 5 6 8 | 6 1 9 | 6 4 -------------------------------------------------------------------------------- /common/src/test/data/us_cities_edges.txt: -------------------------------------------------------------------------------- 1 | 1 2 27 2 | 1 3 91 3 | 2 3 35 4 | 2 5 67 5 | 3 4 48 6 | 3 5 14 7 | 5 4 29 8 | 5 6 15 -------------------------------------------------------------------------------- /common/src/test/data/us_cities_vertices.txt: -------------------------------------------------------------------------------- 1 | 1 Washington 2 | 2 Baltimore 3 | 3 Detroit 4 | 4 Chicago 5 | 5 NewYork 6 | 6 Philadelphia -------------------------------------------------------------------------------- /common/src/test/data/users_dense_edges.txt: -------------------------------------------------------------------------------- 1 | 1 2 2 | 1 4 3 | 2 3 4 | 2 4 5 | 2 5 6 | 3 4 7 | 5 1 8 | 5 3 9 | 5 6 10 | 6 1 11 | 6 3 -------------------------------------------------------------------------------- /common/src/test/data/users_disjoint_edges.txt: -------------------------------------------------------------------------------- 1 | 1 2 2 | 2 5 3 | 2 6 4 | 3 4 5 | 4 3 6 | 5 6 7 | 6 1 -------------------------------------------------------------------------------- /common/src/test/data/users_edges.txt: -------------------------------------------------------------------------------- 1 | 1 2 2 | 2 6 3 | 3 1 4 | 3 4 5 | 4 2 6 | 5 3 7 | 6 1 8 | 6 4 -------------------------------------------------------------------------------- /common/src/test/data/users_vertices.txt: -------------------------------------------------------------------------------- 1 | # ID USERNAME AGE 2 | 1 Alice 35 3 | 2 Bob 41 4 | 3 Carol 28 5 | 4 Dave 43 6 | 5 Eve 29 7 | 6 Frank 30 -------------------------------------------------------------------------------- /common/src/test/scala/TestGraphViewer.scala: -------------------------------------------------------------------------------- 1 | import com.lakala.datacenter.common.graphstream.SimpleGraphViewer 2 | 3 | /** 4 | * Created by peter on 2017/4/26. 
5 | */ 6 | object TestGraphViewer { 7 | def main(args: Array[String]): Unit = { 8 | SimpleGraphViewer.main(Array()) 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /core/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | com.lakala.datacenter 7 | graphx-analysis 8 | 1.0.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | graphx-analysis-core 13 | 14 | 15 | 16 | 17 | com.lakala.datacenter 18 | graphx-analysis-common 19 | ${project.version} 20 | 21 | 22 | -------------------------------------------------------------------------------- /core/src/main/java/com/lakala/datacenter/core/messaging/Sender.java: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.core.messaging; 2 | 3 | import com.lakala.datacenter.core.config.ConfigurationLoader; 4 | import com.rabbitmq.client.Channel; 5 | import com.rabbitmq.client.Connection; 6 | import com.rabbitmq.client.ConnectionFactory; 7 | import com.rabbitmq.client.MessageProperties; 8 | 9 | import java.util.concurrent.TimeoutException; 10 | 11 | public class Sender { 12 | private static final String TASK_QUEUE_NAME = "processor"; 13 | 14 | public static void sendMessage(String message) 15 | throws java.io.IOException, 16 | java.lang.InterruptedException, TimeoutException { 17 | 18 | ConnectionFactory factory = new ConnectionFactory(); 19 | factory.setHost(ConfigurationLoader.getInstance().getRabbitmqNodename()); 20 | Connection connection = factory.newConnection(); 21 | Channel channel = connection.createChannel(); 22 | 23 | channel.queueDeclare(TASK_QUEUE_NAME, true, false, false, null); 24 | 25 | channel.basicPublish("", TASK_QUEUE_NAME, 26 | MessageProperties.PERSISTENT_TEXT_PLAIN, 27 | message.getBytes()); 28 | System.out.println(" [x] Sent '" + message + "'"); 29 | 30 | channel.close(); 31 | connection.close(); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /core/src/main/java/com/lakala/datacenter/core/models/PartitionDescription.java: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.core.models; 2 | 3 | public class PartitionDescription { 4 | private Long partitionId; 5 | private String partitionLabel; 6 | private String groupRelationship; 7 | private String targetRelationship; 8 | 9 | public String getPartitionLabel() { 10 | return partitionLabel; 11 | } 12 | 13 | public void setPartitionLabel(String partitionLabel) { 14 | this.partitionLabel = partitionLabel; 15 | } 16 | 17 | public Long getPartitionId() { 18 | return partitionId; 19 | } 20 | 21 | public void setPartitionId(Long partitionId) { 22 | this.partitionId = partitionId; 23 | } 24 | 25 | public String getTargetRelationship() { 26 | return targetRelationship; 27 | } 28 | 29 | public void setTargetRelationship(String targetRelationship) { 30 | this.targetRelationship = targetRelationship; 31 | } 32 | 33 | public String getGroupRelationship() { 34 | return groupRelationship; 35 | } 36 | 37 | public void setGroupRelationship(String groupRelationship) { 38 | this.groupRelationship = groupRelationship; 39 | } 40 | 41 | public PartitionDescription(Long partitionId, String partitionLabel) { 42 | this.partitionId = partitionId; 43 | this.partitionLabel = partitionLabel; 44 | } 45 | } 46 | -------------------------------------------------------------------------------- 
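The messaging and model classes above are typically used together: describe the partition being
analyzed, wrap it in a processor message (defined in the next file), serialize it to JSON, and
publish it on the durable "processor" queue. The following Scala sketch is not part of the
repository; the HDFS path, partition id, analysis key and relationship names are made-up values,
and it assumes the RabbitMQ host resolved by ConfigurationLoader is reachable.

import com.google.gson.Gson
import com.lakala.datacenter.core.messaging.Sender
import com.lakala.datacenter.core.models.{PartitionDescription, ProcessorMessage, ProcessorMode}

object SubmitPartitionedAnalysis {
  def main(args: Array[String]): Unit = {
    // Describe the partition being analyzed (hypothetical label and relationship names)
    val partition = new PartitionDescription(42L, "Community")
    partition.setGroupRelationship("bankcard")
    partition.setTargetRelationship("terminal")

    // Wrap the HDFS edge-list path and analysis key, then attach the partition description
    val message = new ProcessorMessage("hdfs://namenode:9000/neo4j/edgeList.txt", "pagerank", ProcessorMode.Partitioned)
    message.setPartitionDescription(partition)

    // Publish the JSON payload on the "processor" work queue
    Sender.sendMessage(new Gson().toJson(message))
  }
}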
/core/src/main/java/com/lakala/datacenter/core/models/ProcessorMessage.java: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.core.models; 2 | 3 | /** 4 | * The ProcessorMessage class is used to distribute messages between the graph processor and Neo4j. 5 | */ 6 | public class ProcessorMessage { 7 | private String path; 8 | private String analysis; 9 | private ProcessorMode mode; 10 | private PartitionDescription partitionDescription; 11 | 12 | public ProcessorMessage(String path, String analysis, ProcessorMode mode) { 13 | this.path = path; 14 | this.analysis = analysis; 15 | this.mode = mode; 16 | } 17 | 18 | /** 19 | * Get the HDFS path. 20 | * @return The path to the HDFS file for this process. 21 | */ 22 | public String getPath() { 23 | return path; 24 | } 25 | 26 | /** 27 | * Set the HDFS path. 28 | * @param path The path to the HDFS file for this process. 29 | */ 30 | public void setPath(String path) { 31 | this.path = path; 32 | } 33 | 34 | /** 35 | * Get the analysis type. 36 | * @return The key for the analysis type. 37 | */ 38 | public String getAnalysis() { 39 | return analysis; 40 | } 41 | 42 | /** 43 | * Set the analysis type. 44 | * @param analysis The key for the analysis type. 45 | */ 46 | public void setAnalysis(String analysis) { 47 | this.analysis = analysis; 48 | } 49 | 50 | /** 51 | * Get the mode type. 52 | * @return The mode type for the analysis, either partitioned or unpartitioned. 53 | */ 54 | public ProcessorMode getMode() { 55 | return mode; 56 | } 57 | 58 | /** 59 | * Set the mode type. 60 | * @param mode The mode type represents whether the analysis should be partitioned. 61 | */ 62 | public void setMode(ProcessorMode mode) { 63 | this.mode = mode; 64 | } 65 | 66 | /** 67 | * Get the description for the partitioned analysis. 68 | * @return Returns a description for the queried partition. 69 | */ 70 | public PartitionDescription getPartitionDescription() { 71 | return partitionDescription; 72 | } 73 | 74 | /** 75 | * Set the partition description for an analysis. Preserves information related to 76 | * the analysis being performed on the current partition. 77 | * @param partitionDescription A set of fields that describe the partition being analyzed. 78 | */ 79 | public void setPartitionDescription(PartitionDescription partitionDescription) { 80 | this.partitionDescription = partitionDescription; 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /core/src/main/java/com/lakala/datacenter/core/models/ProcessorMode.java: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.core.models; 2 | 3 | public enum ProcessorMode { 4 | Partitioned, 5 | Unpartitioned 6 | } 7 | -------------------------------------------------------------------------------- /core/src/main/scala/com/lakala/datacenter/core/abstractions/PregelProgram.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.core.abstractions 2 | 3 | /** 4 | * Created by peter on 2017/4/26. 5 | */ 6 | import org.apache.spark.graphx._ 7 | import scala.reflect.ClassTag 8 | 9 | /** 10 | * The [[PregelProgram]] abstraction wraps Spark's Pregel API implementation from the [[GraphOps]] 11 | * class into a model that is easier to write graph algorithms. 
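 *
 * Implementations supply the graph plus the three callbacks declared below. A hypothetical
 * sketch, mirroring the concrete programs that follow (not part of the original file):
 * {{{
 *   class MinimumValueProgram(@transient val graph: Graph[Int, Int]) extends PregelProgram[Int, Int, Int] {
 *     override def vertexProgram(id: VertexId, state: Int, message: Int): Int = math.min(state, message)
 *     override def combiner(a: Int, b: Int): Int = math.min(a, b)
 *     override def messageBroker(t: EdgeTriplet[Int, Int]): Iterator[(VertexId, Int)] =
 *       if (t.srcAttr < t.dstAttr) Iterator((t.dstId, t.srcAttr)) else Iterator.empty
 *     def run(initialMsg: Int): Graph[Int, Int] = graph.pregel(initialMsg)(vertexProgram, messageBroker, combiner)
 *   }
 * }}}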
12 | * @tparam VertexState is the generic type representing the state of a vertex 13 | */ 14 | abstract class PregelProgram[VertexState: ClassTag, VD: ClassTag, ED: ClassTag] protected () extends Serializable { 15 | 16 | @transient val graph: Graph[VD, ED] 17 | 18 | /** 19 | * The vertex program receives a state update and acts to update its state 20 | * @param id is the [[VertexId]] that this program will perform a state operation for 21 | * @param state is the current state of this [[VertexId]] 22 | * @param message is the state received from another vertex in the graph 23 | * @return a [[VertexState]] resulting from a comparison between current state and incoming state 24 | */ 25 | def vertexProgram(id : VertexId, state : VertexState, message : VertexState) : VertexState 26 | 27 | /** 28 | * The message broker sends and receives messages. It will initially receive one message for 29 | * each vertex in the graph. 30 | * @param triplet An edge triplet is an object containing a pair of connected vertex objects and edge object. 31 | * For example (v1)-[r]->(v2) 32 | * @return The message broker returns a key value list, each containing a VertexId and a new message 33 | */ 34 | def messageBroker(triplet :EdgeTriplet[VertexState, ED]) : Iterator[(VertexId, VertexState)] 35 | 36 | /** 37 | * This method is used to reduce or combine the set of all state outcomes produced by a vertexProgram 38 | * for each vertex in each superstep iteration. Each vertex has a list of state updates received from 39 | * other vertices in the graph via the messageBroker method. This method is used to reduce the list 40 | * of state updates into a single state for the next superstep iteration. 41 | * @param a A first [[VertexState]] representing a partial state of a vertex. 42 | * @param b A second [[VertexState]] representing a different partial state of a vertex 43 | * @return a merged [[VertexState]] representation from the two [[VertexState]] parameters 44 | */ 45 | def combiner(a: VertexState, b: VertexState) : VertexState 46 | 47 | } -------------------------------------------------------------------------------- /core/src/main/scala/com/lakala/datacenter/core/grograms/EdgeBetweennessProgram.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.core.grograms 2 | 3 | /** 4 | * Created by peter on 2017/4/26. 5 | */ 6 | import org.apache.spark.graphx.{EdgeTriplet, Graph, VertexId} 7 | import com.lakala.datacenter.core.abstractions.PregelProgram 8 | 9 | /** 10 | * The [[EdgeBetweennessProgram]] is an example graph algorithm implemented on the [[PregelProgram]] 11 | * abstraction. 
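 *
 * With this message broker each source vertex accumulates the ids of its direct out-neighbours:
 * a vertex is only notified of destinations it has not yet recorded. A usage sketch (assumes an
 * active SparkContext `sc`; the edges are made up):
 * {{{
 *   val edges  = sc.parallelize(Seq(Edge(1L, 2L, Seq.empty[VertexId]), Edge(2L, 3L, Seq.empty[VertexId])))
 *   val graph  = Graph.fromEdges(edges, Seq.empty[VertexId])   // Graph[Seq[VertexId], Seq[VertexId]]
 *   val result = new EdgeBetweennessProgram(graph).run(Seq.empty)
 *   // result.vertices: (1, Seq(2)), (2, Seq(3)), (3, Seq())
 * }}}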
12 | */ 13 | class EdgeBetweennessProgram(@transient val graph : Graph[Seq[VertexId], Seq[VertexId]]) 14 | extends PregelProgram[Seq[VertexId], Seq[VertexId], Seq[VertexId]] with Serializable { 15 | 16 | protected def this() = this(null) 17 | 18 | /** 19 | * Return the larger of the two vertex attribute values 20 | * @param id is the [[VertexId]] that this program will perform a state operation for 21 | * @param state is the current state of this [[VertexId]] 22 | * @param message is the state received from another vertex in the graph 23 | * @return an [[Int]] resulting from a comparison between current state and incoming state 24 | */ 25 | override def vertexProgram(id: VertexId, state: Seq[VertexId], message: Seq[VertexId]): Seq[VertexId] = { 26 | if(state == null) { 27 | message 28 | } else { 29 | (state ++ message).distinct 30 | } 31 | } 32 | 33 | /** 34 | * Return the larger of the two vertex state results 35 | * @param a A first [[Int]] representing a partial state of a vertex. 36 | * @param b A second [[Int]] representing a different partial state of a vertex 37 | * @return a merged [[Int]] representation from the two [[Int]] parameters 38 | */ 39 | override def combiner(a: Seq[VertexId], b: Seq[VertexId]): Seq[VertexId] = { 40 | (a ++ b).distinct 41 | } 42 | 43 | /** 44 | * If the dstVertex's value is less than the srcVertex's value, send a message to the dstVertex to update 45 | * its state 46 | * @param triplet An edge triplet is an object containing a pair of connected vertex objects and edge object. 47 | * For example (v1)-[r]->(v2) 48 | * @return The message broker returns a key value list, each containing a VertexId and a new message 49 | */ 50 | override def messageBroker(triplet: EdgeTriplet[Seq[VertexId], Seq[VertexId]]): Iterator[(VertexId, Seq[VertexId])] = { 51 | // If the srcAttr is greater than the dstAttr then notify the dstVertex to update its state 52 | 53 | if(!triplet.srcAttr.contains(triplet.dstId)) { 54 | Iterator((triplet.srcId, Seq(triplet.dstId))) 55 | } else { 56 | Iterator() 57 | } 58 | 59 | } 60 | 61 | /** 62 | * This method wraps Spark's Pregel API entry point from the [[org.apache.spark.graphx.GraphOps]] class. This provides 63 | * a simple way to write a suite of graph algorithms by extending the [[PregelProgram]] abstract 64 | * class and implementing vertexProgram, messageBroker, and combiner methods. 65 | * @param initialMsg is the initial message received for all vertices in the graph 66 | */ 67 | def run(initialMsg: Seq[VertexId]): Graph[Seq[VertexId], Seq[VertexId]] = { 68 | graph.pregel(initialMsg)(this.vertexProgram, this.messageBroker, this.combiner) 69 | } 70 | } 71 | 72 | -------------------------------------------------------------------------------- /core/src/main/scala/com/lakala/datacenter/core/grograms/MaximumValueProgram.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.core.grograms 2 | 3 | import com.lakala.datacenter.core.abstractions.PregelProgram 4 | import org.apache.spark.graphx.{EdgeTriplet, Graph, VertexId} 5 | 6 | /** 7 | * The [[MaximumValueProgram]] is an example graph algorithm implemented on the [[PregelProgram]] 8 | * abstraction. 
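 *
 * A usage sketch built from the maxvalue_* sample data under common/src/test/data (assumes an
 * active SparkContext `sc`):
 * {{{
 *   val vertices = sc.parallelize(Seq((1L, 3), (2L, 6), (3L, 2), (4L, 1)))
 *   val edges    = sc.parallelize(Seq(Edge(1L, 2L, 0), Edge(2L, 1L, 0), Edge(2L, 4L, 0),
 *                                     Edge(3L, 2L, 0), Edge(3L, 4L, 0), Edge(4L, 3L, 0)))
 *   val maxed    = new MaximumValueProgram(Graph(vertices, edges)).run(Int.MinValue)
 *   // every vertex converges to the largest value in the graph, 6
 * }}}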
9 | */ 10 | class MaximumValueProgram(@transient val graph : Graph[Int, Int]) 11 | extends PregelProgram[Int, Int, Int] with Serializable { 12 | 13 | protected def this() = this(null) 14 | 15 | /** 16 | * Return the larger of the two vertex attribute values 17 | * @param id is the [[VertexId]] that this program will perform a state operation for 18 | * @param state is the current state of this [[VertexId]] 19 | * @param message is the state received from another vertex in the graph 20 | * @return an [[Int]] resulting from a comparison between current state and incoming state 21 | */ 22 | override def vertexProgram(id: VertexId, state: Int, message: Int): Int = { 23 | if (message > state) { 24 | message 25 | } else { 26 | state 27 | } 28 | } 29 | 30 | /** 31 | * Return the larger of the two vertex state results 32 | * @param a A first [[Int]] representing a partial state of a vertex. 33 | * @param b A second [[Int]] representing a different partial state of a vertex 34 | * @return a merged [[Int]] representation from the two [[Int]] parameters 35 | */ 36 | override def combiner(a: Int, b: Int): Int = { 37 | math.max(a, b) 38 | } 39 | 40 | /** 41 | * If the dstVertex's value is less than the srcVertex's value, send a message to the dstVertex to update 42 | * its state 43 | * @param triplet An edge triplet is an object containing a pair of connected vertex objects and edge object. 44 | * For example (v1)-[r]->(v2) 45 | * @return The message broker returns a key value list, each containing a VertexId and a new message 46 | */ 47 | override def messageBroker(triplet: EdgeTriplet[Int, Int]): Iterator[(VertexId, Int)] = { 48 | // If the srcAttr is greater than the dstAttr then notify the dstVertex to update its state 49 | if (triplet.srcAttr > triplet.dstAttr) { 50 | Iterator((triplet.dstId, triplet.srcAttr)) 51 | } else { 52 | Iterator.empty 53 | } 54 | } 55 | 56 | /** 57 | * This method wraps Spark's Pregel API entry point from the [[org.apache.spark.graphx.GraphOps]] class. This provides 58 | * a simple way to write a suite of graph algorithms by extending the [[PregelProgram]] abstract 59 | * class and implementing vertexProgram, messageBroker, and combiner methods. 60 | * @param initialMsg is the initial message received for all vertices in the graph 61 | */ 62 | def run(initialMsg: Int): Graph[Int, Int] = { 63 | graph.pregel(initialMsg)(this.vertexProgram, this.messageBroker, this.combiner) 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /core/src/main/scala/com/lakala/datacenter/core/utils/UtilsToos.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.core.utils 2 | 3 | import java.nio.charset.StandardCharsets 4 | 5 | import com.google.common.hash.Hashing 6 | import com.lakala.datacenter.common.utils.DateTimeUtils 7 | 8 | import scala.util.matching.Regex 9 | 10 | /** 11 | * Created by ASUS-PC on 2017/4/18. 
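 *
 * Helpers for id generation and phone-number validation. A usage sketch (hypothetical inputs):
 * {{{
 *   UtilsToos.hashId("XNA20170617214709013851193476043")   // stable Long derived from the MD5 of the string
 *   UtilsToos.isMobileOrPhone("13912345678")                // true: mainland mobile number
 *   UtilsToos.isMobileOrPhone("not-a-number")               // false
 * }}}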
12 | */ 13 | object UtilsToos { 14 | /** 15 | * 根据字符串生成唯一的hashcode值 16 | * 17 | * @param str 18 | * @return 19 | */ 20 | def hashId(str: String) = { 21 | Hashing.md5().hashString(str, StandardCharsets.UTF_8).asLong() 22 | } 23 | 24 | /** 25 | * 手机号,电话号码验证 26 | * 27 | * @param num 28 | * @return 验证通过返回true 29 | */ 30 | def isMobileOrPhone(num: String): Boolean = { 31 | val pattern = new Regex("^((17[0-9])(14[0-9])|(13[0-9])|(15[^4,\\D])|(18[0,5-9]))\\d{8}$") 32 | val pattern2 = new Regex("(?:(\\(\\+?86\\))(0[0-9]{2,3}\\-?)?([2-9][0-9]{6,7})+(\\-[0-9]{1,4})?)|(?:(86-?)?(0[0-9]{2,3}\\-?)?([2-9][0-9]{6,7})+(\\-[0-9]{1,4})?)") // 验证带区号的 33 | // val pattern2 = new Regex("^[0][1-9]{2,3}-[0-9]{5,10}$") // 验证带区号的 34 | val pattern3 = new Regex("^[1-9]{1}[0-9]{5,8}$") // 验证没有区号的 35 | num match { 36 | case pattern(_*) => { 37 | true 38 | } 39 | case pattern2(_*) => { 40 | true 41 | } 42 | case pattern3(_*) => { 43 | true 44 | } 45 | case _ => { 46 | false 47 | } 48 | } 49 | } 50 | 51 | 52 | 53 | 54 | } 55 | -------------------------------------------------------------------------------- /core/src/test/java/com/lakala/datacenter/core/hdfs/FileUtilTest.java: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.core.hdfs; 2 | 3 | import com.lakala.datacenter.core.config.ConfigurationLoader; 4 | import com.lakala.datacenter.core.models.ProcessorMessage; 5 | import com.lakala.datacenter.core.models.ProcessorMode; 6 | import com.lakala.datacenter.core.processor.GraphProcessor; 7 | import junit.framework.TestCase; 8 | import org.junit.Test; 9 | 10 | import java.util.ArrayList; 11 | import java.util.Arrays; 12 | 13 | public class FileUtilTest extends TestCase { 14 | 15 | @Test 16 | public void testWritePropertyGraphUpdate() throws Exception { 17 | 18 | ConfigurationLoader.testPropertyAccess=true; 19 | 20 | // Create sample PageRank result 21 | String nodeList = 22 | "0 .001\n" + 23 | "1 .002\n" + 24 | "3 .003"; 25 | 26 | // Create test path 27 | String path = ConfigurationLoader.getInstance().getHadoopHdfsUri() + "/test/propertyNodeList.txt"; 28 | 29 | // Test writing the PageRank result to HDFS path 30 | FileUtil.writePropertyGraphUpdate(new ProcessorMessage(path, GraphProcessor.PAGERANK, ProcessorMode.Partitioned), 31 | new ArrayList<>(Arrays.asList( 32 | "0 .001\n", 33 | "1 .002\n", 34 | "3 .003" 35 | ))); 36 | 37 | // Validate node list 38 | assertEquals(FileUtil.readHdfsFile(path), "# Node Property Value List" + "\n" + nodeList); 39 | } 40 | } -------------------------------------------------------------------------------- /core/src/test/java/com/lakala/datacenter/core/messaging/SenderTest.java: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.core.messaging; 2 | 3 | import com.google.gson.Gson; 4 | import com.lakala.datacenter.core.config.ConfigurationLoader; 5 | import com.lakala.datacenter.core.models.ProcessorMessage; 6 | import com.lakala.datacenter.core.models.ProcessorMode; 7 | import com.lakala.datacenter.core.processor.GraphProcessor; 8 | import junit.framework.TestCase; 9 | 10 | public class SenderTest extends TestCase { 11 | 12 | private static final String EDGE_LIST_RELATIVE_FILE_PATH = "/neo4j/mazerunner/edgeList.txt"; 13 | 14 | public void testSendMessage() throws Exception { 15 | ConfigurationLoader.testPropertyAccess=true; 16 | ProcessorMessage processorMessage = new ProcessorMessage("", "strongly_connected_components", ProcessorMode.Partitioned); 17 | 
processorMessage.setPath(ConfigurationLoader.getInstance().getHadoopHdfsUri() + GraphProcessor.PROPERTY_GRAPH_UPDATE_PATH); 18 | // Serialize the processor message 19 | Gson gson = new Gson(); 20 | String message = gson.toJson(processorMessage); 21 | 22 | // Notify Neo4j that a property update list is available for processing 23 | Sender.sendMessage(message); 24 | } 25 | 26 | 27 | } -------------------------------------------------------------------------------- /core/src/test/scala/com/lakala/datacenter/core/grograms/ShortestPathTests.scala: -------------------------------------------------------------------------------- 1 | /* 2 | package com.lakala.datacenter.core.grograms 3 | 4 | import com.lakala.datacenter.core.algorithms.Algorithms 5 | import com.lakala.datacenter.core.config.ConfigurationLoader 6 | import com.lakala.datacenter.core.processor.GraphProcessor 7 | import org.apache.spark.graphx._ 8 | import org.apache.spark.graphx.lib.ShortestPaths 9 | import org.apache.spark.rdd.RDD 10 | import org.scalatest.FlatSpec 11 | import scala.collection.mutable 12 | 13 | class ShortestPathTests extends FlatSpec { 14 | /** 15 | * To collect the shortest path results for all nodes to a single destination node, 16 | * the following steps must be taken: 17 | * 18 | */ 19 | 20 | ConfigurationLoader.testPropertyAccess = true 21 | 22 | // Create Spark context 23 | val sc = GraphProcessor.initializeSparkContext.sc 24 | 25 | val vertexIds = sc.parallelize(Seq(0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L)).collect().toSeq 26 | 27 | def fixture = 28 | new { 29 | 30 | // Create an RDD for the vertices 31 | val vertices: RDD[(VertexId, ShortestPathState)] = sc.parallelize(Array( 32 | (0L, new ShortestPathState(0L, vertexIds)), 33 | (1L, new ShortestPathState(1L, vertexIds)), 34 | (2L, new ShortestPathState(2L, vertexIds)), 35 | (3L, new ShortestPathState(3L, vertexIds)), 36 | (4L, new ShortestPathState(4L, vertexIds)), 37 | (5L, new ShortestPathState(5L, vertexIds)), 38 | (6L, new ShortestPathState(6L, vertexIds)), 39 | (7L, new ShortestPathState(7L, vertexIds)), 40 | (8L, new ShortestPathState(8L, vertexIds)), 41 | (9L, new ShortestPathState(9L, vertexIds)), 42 | (10L, new ShortestPathState(10L, vertexIds)), 43 | (11L, new ShortestPathState(11L, vertexIds)), 44 | (12L, new ShortestPathState(12L, vertexIds)))) 45 | 46 | // Create an RDD for edges 47 | val edges: RDD[Edge[Int]] = sc.parallelize(Array( 48 | Edge(0L, 1L, 0), 49 | Edge(1L, 4L, 0), 50 | Edge(1L, 2L, 0), 51 | Edge(2L, 3L, 0), 52 | Edge(5L, 6L, 0), 53 | Edge(6L, 7L, 0), 54 | Edge(7L, 8L, 0), 55 | Edge(8L, 9L, 0), 56 | Edge(9L, 10L, 0), 57 | Edge(10L, 11L, 0), 58 | Edge(11L, 12L, 0), 59 | Edge(12L, 3L, 0), 60 | Edge(7L, 3L, 0), 61 | Edge(4L, 3L, 0))) 62 | 63 | // Build the initial Graph 64 | val graph = Graph(vertices, edges, new ShortestPathState(-1L, null)) 65 | } 66 | 67 | "A node's state" should "have a decision tree" in { 68 | val graph = fixture.graph 69 | 70 | val tree = new DecisionTree[VertexId](0L, mutable.HashMap[VertexId, DecisionTree[VertexId]]()) 71 | 72 | graph.edges.collect().foreach(ed => tree.addLeaf(ed.srcId).addLeaf(ed.dstId)) 73 | 74 | val vertexIds = graph.vertices.map(v => v._1).cache().collect() 75 | 76 | val sssp = ShortestPaths.run(graph, graph.vertices.map { vx => vx._1}.collect()).vertices.collect() 77 | 78 | val graphResults = sc.parallelize(vertexIds).map(row => { 79 | println("*** " + row) 80 | (row, vertexIds.map(vt => { 81 | (vt, tree.getNode(row).allShortestPathsTo(vt, sssp)) 82 | })) 83 | 
}).collectAsync().get().toArray 84 | 85 | val result = Algorithms.betweennessCentrality(sc, graphResults) 86 | 87 | val resultStream = result 88 | 89 | for (x <- resultStream) { 90 | println(x) 91 | } 92 | 93 | } 94 | 95 | } 96 | */ 97 | -------------------------------------------------------------------------------- /neo4j/bin/start2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ## usage: sh bin/start.sh -i /logs/device/* -d 2016-01-11 3 | 4 | SPARK_HOME=/home/hadoop/spark-1.6.3-bin-hadoop2.6 5 | HIVE_HOME=/usr/hdp/current/hive-client 6 | PROJECT_HOME="$(cd "`dirname "$0"`"/..; pwd)" 7 | HDP_VERSION=2.4.0.0-169 8 | APP_CACHE_DIR=/tmp/device 9 | 10 | stdate=${1:-`date -d '1 days ago' +"%Y-%m-%d"`} 11 | #inputdir=/logs/device/* 12 | #inputfile=/logs/device/*/2016-01-{1[1-9],2[0-1]} 13 | while getopts "d:i:" opt ; do 14 | case $opt in 15 | d)stdate=$OPTARG ;; 16 | i)inputdir=$OPTARG ;; 17 | ?)echo "==> please input arg: stdate(d), inputdir(i)" && exit 1 ;; 18 | esac 19 | done 20 | 21 | #echo "==> ready for geoip...." 22 | #hadoop fs -mkdir -p $APP_CACHE_DIR/geoip 23 | #hadoop fs -test -e $APP_CACHE_DIR/geoip/GeoLite2-City.mmdb 24 | #if [ $? -ne 0 ]; then 25 | # echo "GeoLite2-City.mmdb not exists!" 26 | # hadoop fs -put $PROJECT_HOME/../tcloud-log-analysis/src/main/bundleApp/coord-common/geoip/GeoLite2-City.mmdb $APP_CACHE_DIR/geoip/ 27 | #fi 28 | 29 | ## https://issues.apache.org/jira/browse/ZEPPELIN-93 30 | ## https://github.com/caskdata/cdap/pull/4106 31 | spark-submit \ 32 | --master spark://datacenter17:7077,datacenter18:7077 \ 33 | --class com.lakala.datacenter.main.Driver \ 34 | --driver-memory 2G \ 35 | --executor-memory 4G \ 36 | --num-executors 3 \ 37 | --executor-cores 3 \ 38 | --conf "spark.rpc.askTimeout=300s" \ 39 | --verbose \ 40 | --files $SPARK_HOME/conf/hive-site.xml \ 41 | --driver-class-path $PROJECT_HOME/target/dependency/mysql-connector-java-5.1.36.jar \ 42 | --jars $PROJECT_HOME/target/dependency/mysql-connector-java-5.1.36.jar,$SPARK_HOME/lib/datanucleus-api-jdo-3.2.6.jar,$SPARK_HOME/lib/datanucleus-core-3.2.10.jar,$PROJECT_HOME/target/dependency/guava-14.0.1.jar,$SPARK_HOME/lib/datanucleus-rdbms-3.2.9.jar \ 43 | $PROJECT_HOME/target/graphx-analysis-apply.jar \ 44 | -i /user/linyanshi/query_result.csv -c /user/linyanshi/part-00003 -o file:////home/hadoop/grogram/analysis/graphx-analysis/apply/bin/output 45 | 46 | ## --packages com.databricks:spark-csv_2.10:1.3.0 \ 47 | ## 2>&1 > output.txt 48 | -------------------------------------------------------------------------------- /neo4j/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | com.lakala.datacenter 7 | graphx-analysis 8 | 1.0.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | graphx-analysis-neo4j 13 | jar 14 | 15 | graphx-analysis-neo4j 16 | http://maven.apache.org 17 | 18 | 19 | UTF-8 20 | 21 | 22 | 23 | 24 | 25 | com.lakala.datacenter 26 | graphx-analysis-core 27 | ${project.version} 28 | 29 | 30 | 31 | 32 | graphx-analysis-neo4j 33 | 34 | 35 | 36 | spark-repo 37 | http://dl.bintray.com/spark-packages/maven/ 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /neo4j/src/main/java/com/lakala/datacenter/enums/DataAttributeType.java: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.enums; 2 | 3 | /** 4 | * Created by Administrator on 2017/6/16 0016. 
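 *
 * Maps each attribute of an application record (orderid, contractno, termid, loanpan, ...) to a
 * stable sequence number; getColorName(int) resolves a sequence number back to its attribute name.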
5 | */ 6 | interface DataInterface{} 7 | 8 | public enum DataAttributeType implements DataInterface{ 9 | ORDERID(1, "orderid"), CONTRACTNO(2, "contractno"), TERMID(3, "termid"), LOANPAN(4, "loanpan"), RETURNPAN(5, "returnpan"), 10 | INSERTTIME(6, "inserttime"), RECOMMEND(7, "recommend"), USERID(8, "userid"), DEVICEID(9, "deviceid"), 11 | CERTNO(10, "certno"), EMAIL(11, "email"), COMPANY(12, "company"), MOBILE(13, "mobile"), COMPADDR(14, "compaddr"), 12 | COMPPHONE(15, "compphone"), EMERGENCYCONTACTMOBILE(16, "emergencycontactmobile"), 13 | CONTACTMOBILE(17, "contactmobile"), IPV4(18, "ipv4"), MSGPHONE(19, "msgphone"), TELECODE(20, "telecode"); 14 | //成员变量 15 | private int sequence; 16 | private String name; 17 | 18 | //构造方法 19 | private DataAttributeType(int sequence, String name) { 20 | this.sequence = sequence; 21 | this.name = name; 22 | } 23 | 24 | //自定义方法 25 | public static String getColorName(int sequence) { 26 | for (DataAttributeType c : DataAttributeType.values()) { 27 | if (c.getSequence() == sequence) 28 | return c.name; 29 | } 30 | return null; 31 | } 32 | 33 | //getter&setter 34 | public int getSequence() { 35 | return sequence; 36 | } 37 | 38 | public void setSequence(int sequence) { 39 | this.sequence = sequence; 40 | } 41 | 42 | public String getName() { 43 | return name; 44 | } 45 | 46 | public void setName(String name) { 47 | this.name = name; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /neo4j/src/main/java/com/lakala/datacenter/enums/GraphEnum.java: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.enums; 2 | 3 | 4 | /** 5 | * Created by Administrator on 2017/7/11 0011. 6 | */ 7 | public enum GraphEnum { 8 | TERMINAL("terminal", RelationshipTypes.terminal), BANKCARD("bankcard", RelationshipTypes.bankcard); 9 | private String relType; 10 | private RelationshipTypes relationshipTypes; 11 | 12 | 13 | private GraphEnum(String relType, RelationshipTypes relationshipTypes) { 14 | this.relType = relType; 15 | this.relationshipTypes = relationshipTypes; 16 | } 17 | 18 | public String getRelType() { 19 | return relType; 20 | } 21 | 22 | public RelationshipTypes getRelationshipTypes(String relType) { 23 | for (GraphEnum ge : GraphEnum.values()) { 24 | if (ge.relType.equals(relType)) return ge.relationshipTypes; 25 | continue; 26 | } 27 | return null; 28 | } 29 | 30 | public RelationshipTypes getRelationshipTypes() { 31 | return relationshipTypes; 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /neo4j/src/main/java/com/lakala/datacenter/enums/Labels.java: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.enums; 2 | 3 | import org.neo4j.graphdb.Label; 4 | 5 | /** 6 | * Created by Administrator on 2017/5/31 0031. 7 | */ 8 | public enum Labels implements Label { 9 | ApplyInfo, Terminal, BankCard, Mobile, Identification, Email, Company, CompanyAddress, CompanyTel, Device, IPV4 10 | } 11 | -------------------------------------------------------------------------------- /neo4j/src/main/java/com/lakala/datacenter/enums/RelationshipTypes.java: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.enums; 2 | 3 | import org.neo4j.graphdb.RelationshipType; 4 | 5 | /** 6 | * Created by Administrator on 2017/5/31 0031. 
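 *
 * Relationship types used to connect an ApplyInfo node to its attribute nodes
 * (terminal, bankcard, mobile, device, ...) in the Neo4j graph.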
7 | */ 8 | public enum RelationshipTypes implements RelationshipType { 9 | terminal, bankcard, loginmobile, ipv4, applymymobile, hometel, recommend, identification, email, company, companyaddress, companytel, emergencymobile,merchantmobile,channelmobile,relativemobile, relativecontact, device; 10 | } 11 | -------------------------------------------------------------------------------- /neo4j/src/main/resources/css/style.css: -------------------------------------------------------------------------------- 1 | graph { 2 | fill-color: white; 3 | } 4 | node { 5 | size: 65; 6 | fill-color: #CCCCCC, #AAAAAA; 7 | fill-mode: gradient-radial; 8 | text-offset: 0, 0; 9 | stroke-mode: plain; 10 | stroke-color: #333333; 11 | } 12 | node:clicked { 13 | fill-color: #2277FF, #88AAFF; 14 | fill-mode: gradient-radial; 15 | size: 100; 16 | text-size:18; 17 | text-offset: 0, 0; 18 | } 19 | edge { 20 | text-alignment: along; 21 | } 22 | -------------------------------------------------------------------------------- /neo4j/src/main/resources/dev/config.properties: -------------------------------------------------------------------------------- 1 | neoIP=bolt://192.168.0.33:7687 2 | user=neo4j 3 | password=123456 4 | #************redis config ********** 5 | redisIp=192.168.0.192:6380,192.168.0.192:6381,192.168.0.192:6382,192.168.0.192:6383,192.168.0.192:6384,192.168.0.192:6385 6 | psubscribe=testsub11 -------------------------------------------------------------------------------- /neo4j/src/main/resources/dev/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | 21 | 22 | 23 | hive.metastore.uris 24 | thrift://192.168.0.212:9083 25 | Thrift uri for the remote metastore. Used by metastore client to connect to remote metastore. 
26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /neo4j/src/main/resources/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /neo4j/src/main/resources/product/config.properties: -------------------------------------------------------------------------------- 1 | neoIP=bolt://10.16.65.15:7688 2 | user=neo4j 3 | password=123456 4 | #************redis config ********** 5 | redisIp=10.0.8.170:6800,10.0.8.170:6801,10.0.8.171:6800,10.0.8.171:6801,10.0.8.172:6800,10.0.8.172:6801 6 | psubscribe=dataPlatform.anti_fraud.order_monitor -------------------------------------------------------------------------------- /neo4j/src/main/resources/test/config.properties: -------------------------------------------------------------------------------- 1 | neoIP=http://192.168.0.33:7474/db/data 2 | user=neo4j 3 | password=123456 4 | #************redis config ********** 5 | redisIp=192.168.0.192:6380,192.168.0.192:6381,192.168.0.192:6382,192.168.0.192:6383,192.168.0.192:6384,192.168.0.192:6385 6 | psubscribe=testsub11 -------------------------------------------------------------------------------- /neo4j/src/main/scala/com/lakala/datacenter/abstractions/DataGenerator.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.abstractions 2 | 3 | import com.lakala.datacenter.utils.Config 4 | 5 | /** 6 | * Created by Administrator on 2017/5/31 0031. 7 | */ 8 | trait DataGenerator { 9 | def generateUsers(config: Config): Unit 10 | } 11 | -------------------------------------------------------------------------------- /neo4j/src/main/scala/com/lakala/datacenter/load/spark/ClusterGraphDatabase.scala: -------------------------------------------------------------------------------- 1 | /* 2 | package com.lakala.datacenter.load.spark 3 | 4 | import com.lakala.datacenter.enums.Labels 5 | import org.neo4j.graphdb.index.IndexHits 6 | import org.neo4j.graphdb.{Node, Relationship} 7 | import org.neo4j.helpers.collection.MapUtil 8 | import org.neo4j.index.impl.lucene.legacy.LuceneIndexImplementation 9 | import org.neo4j.rest.graphdb.index.RestIndex 10 | import org.neo4j.rest.graphdb.query.RestCypherQueryEngine 11 | import org.neo4j.rest.graphdb.{RestAPI, RestAPIFacade} 12 | 13 | /** 14 | * Created by Administrator on 2017/6/19 0019. 15 | */ 16 | object ClusterGraphDatabase { 17 | private var restAPI: RestAPI = null 18 | private val serverBaseUrl = "http://192.168.0.33:7474/db/data" 19 | private val user = "neo4j" 20 | private val password = "123456" 21 | 22 | def main(args: Array[String]): Unit = { 23 | try 24 | setUp 25 | countExistingNodes 26 | tearDown 27 | } 28 | 29 | @throws[Throwable] 30 | def setUp(): Unit = { 31 | restAPI = new RestAPIFacade(serverBaseUrl, user, password) 32 | validateServerIsUp() 33 | val queryEngine = new RestCypherQueryEngine(restAPI) 34 | // graphdb = queryEngine.asInstanceOf[GraphDatabaseService] 35 | } 36 | 37 | @throws[Throwable] 38 | private def validateServerIsUp() = { 39 | try 40 | restAPI.getAllLabelNames 41 | catch { 42 | case e: Throwable => 43 | println(" !!!!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!!!! 
\n" + "this test assumes a Neo4j Server is running in a separate process \n" + "on localhost port 7474. You will need to manually start it before \n" + "running these demo tests.") 44 | throw e 45 | } 46 | } 47 | 48 | def tearDown(): Unit = { 49 | restAPI.close() 50 | } 51 | 52 | 53 | def countExistingNodes(): Unit = { 54 | //472 55 | val node2 = restAPI.getNodeById(293) 56 | println(node2.getLabels.iterator().next().name()) 57 | val indexs = restAPI.createIndex(classOf[Node], "orderno", LuceneIndexImplementation.EXACT_CONFIG) 58 | val relIndex: RestIndex[Relationship] = restAPI.createIndex(classOf[Relationship], "terminal", LuceneIndexImplementation.EXACT_CONFIG) 59 | val terminalIndexs = restAPI.createIndex(classOf[Node], "content", LuceneIndexImplementation.EXACT_CONFIG) 60 | 61 | val hitIndex2: IndexHits[Node] = indexs.get("orderno", "XNA20170617214709013851193476043") 62 | val hitIndex: IndexHits[Node] = terminalIndexs.get("content", "CBC3A110160228103") 63 | println(hitIndex2.size()) 64 | println(hitIndex2.getSingle) 65 | println("#################") 66 | println(hitIndex.size()) 67 | println(hitIndex.getSingle) 68 | val applyNode = restAPI.getOrCreateNode(indexs, "orderno", "XNA20170617214709013851193476043", MapUtil.map("term_id", "CBC3A110160228103")) 69 | applyNode.addLabel(Labels.ApplyInfo) 70 | println(applyNode.getLabels.iterator().next().name()) 71 | applyNode.setProperty("orderno", "XNA20170617214709013851193476043") 72 | // applyNode.setProperty("term_id", "CBC3A110160228103") 73 | applyNode.setProperty("modelname", Labels.ApplyInfo) 74 | 75 | val terminalNode = restAPI.getOrCreateNode(terminalIndexs, "content", "CBC3A110160228103", MapUtil.map()) 76 | terminalNode.addLabel(Labels.Terminal) 77 | terminalNode.setProperty("modelname", Labels.Terminal) 78 | println(terminalNode.getLabels.iterator().next().name()) 79 | val rel = restAPI.getOrCreateRelationship(relIndex, "", "", applyNode, terminalNode, "terminal", MapUtil.map()) 80 | 81 | if (applyNode != null) { 82 | println("====================") 83 | println("apply node " + applyNode.getId() + " terminal node " + terminalNode.getId + " relationship " + rel.getId + " is created.") 84 | } 85 | 86 | } 87 | } 88 | */ 89 | -------------------------------------------------------------------------------- /neo4j/src/main/scala/com/lakala/datacenter/load/spark/LoadHiveData.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.load.spark 2 | 3 | import com.lakala.datacenter.core.utils.UtilsToos 4 | import org.apache.commons.lang3.StringUtils 5 | import org.apache.spark.sql.hive.HiveContext 6 | import org.apache.spark.{SparkConf, SparkContext} 7 | 8 | /** 9 | * Created by Administrator on "2017"/5/"31" 0031. 
10 | */ 11 | object LoadHiveData { 12 | def main(args: Array[String]): Unit = { 13 | val conf = new SparkConf().setAppName("LoadHiveData") 14 | val sc = new SparkContext(conf) 15 | val hc = new HiveContext(sc) 16 | hc.sql("use creditloan") 17 | val sql = 18 | s"""select a.order_id,a.contract_no,a.term_id,a.loan_pan,a.return_pan,a.insert_time,a.recommend,a.user_id,b.cert_no,b.email,b.company,b.mobile,b.comp_addr,b.comp_phone,b.emergency_contact_mobile,b.contact_mobile,c.device_id 19 | |from creditloan.s_c_loan_apply a 20 | | left join creditloan.s_c_apply_user b on a.user_id =b.id and (a.year="2017" and a.month="05" and a.day="31") and (b.year="2017" and b.month="05" and b.day="31") 21 | | left join creditloan.s_c_loan_deviceidauth c on a.order_id =c.order_no and (a.year="2017" and a.month="05" and a.day="31") and (c.year="2017" and c.month="05" and c.day="31") """.stripMargin 22 | 23 | val df = hc.sql(sql) 24 | val lineRDD = df.mapPartitions { rows => 25 | rows.map { row => 26 | val orderId = row.getAs[String]("order_id") 27 | val contractNo = if (StringUtils.isNotBlank(row.getAs[String]("contract_no"))) row.getAs[String]("contract_no") else "" 28 | val termId = if (StringUtils.isNotBlank(row.getAs[String]("term_id"))) row.getAs[String]("term_id") else "" 29 | val loanPan = if (StringUtils.isNotBlank(row.getAs[String]("loan_pan"))) row.getAs[String]("loan_pan") else "" 30 | val returnPan = if (StringUtils.isNotBlank(row.getAs[String]("return_pan"))) row.getAs[String]("return_pan") else "" 31 | val insertTime = if (StringUtils.isNotBlank(row.getAs[String]("insert_time"))) row.getAs[String]("insert_time") else "" 32 | val recommend = if (StringUtils.isNotBlank(row.getAs[String]("recommend")) && UtilsToos.isMobileOrPhone(row.getAs[String]("recommend"))) row.getAs[String]("recommend") else "" 33 | val userId = if (StringUtils.isNotBlank(row.getAs[String]("user_id"))) row.getAs[String]("user_id") else "" 34 | val certNo = if (StringUtils.isNotBlank(row.getAs[String]("cert_no"))) row.getAs[String]("cert_no") else "" 35 | val email = if (StringUtils.isNotBlank(row.getAs[String]("email"))) row.getAs[String]("email") else "" 36 | val company = if (StringUtils.isNotBlank(row.getAs[String]("company"))) row.getAs[String]("company") else "" 37 | val mobile = if (StringUtils.isNotBlank(row.getAs[String]("mobile")) && UtilsToos.isMobileOrPhone(row.getAs[String]("mobile"))) row.getAs[String]("mobile") else "" 38 | val compAddr = if (StringUtils.isNotBlank(row.getAs[String]("comp_addr"))) row.getAs[String]("comp_addr") else "" 39 | val compPhone = if (StringUtils.isNotBlank(row.getAs[String]("comp_phone"))) row.getAs[String]("comp_phone") else "" 40 | val emergencyContactMobile = if (StringUtils.isNotBlank(row.getAs[String]("emergency_contact_mobile"))) row.getAs[String]("emergency_contact_mobile") else "" 41 | val contactMobile = if (StringUtils.isNotBlank(row.getAs[String]("contact_mobile"))) row.getAs[String]("contact_mobile") else "" 42 | val deviceId = if (StringUtils.isNotBlank(row.getAs[String]("device_id"))) row.getAs[String]("device_id") else "" 43 | s"$orderId,$contractNo,$termId,$loanPan,$returnPan,$insertTime,$recommend,$userId,$certNo,$email,$company,$mobile,$compAddr,$compPhone,$emergencyContactMobile,$contactMobile,$deviceId" 44 | } 45 | } 46 | 47 | lineRDD.saveAsTextFile(args(1)) 48 | 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /neo4j/src/main/scala/com/lakala/datacenter/load/spark/Neo4jConfig.scala: 
-------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.load.spark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.neo4j.driver.v1.{AuthTokens, Config, Driver, GraphDatabase} 5 | 6 | /** 7 | * @author lys 8 | * @since 02.03.16 9 | */ 10 | case class Neo4jConfig(val url: String, val user: String = "neo4j", val password: Option[String] = None) { 11 | 12 | def boltConfig() = Config.build.withEncryptionLevel(Config.EncryptionLevel.NONE).toConfig 13 | 14 | def driver(config: Neo4jConfig): Driver = config.password match { 15 | case Some(pwd) => GraphDatabase.driver(config.url, AuthTokens.basic(config.user, pwd), boltConfig()) 16 | case _ => GraphDatabase.driver(config.url, boltConfig()) 17 | } 18 | 19 | def driver(): Driver = driver(this) 20 | 21 | def driver(url: String): Driver = GraphDatabase.driver(url, boltConfig()) 22 | 23 | } 24 | 25 | object Neo4jConfig { 26 | val prefix = "spark.neo4j.bolt." 27 | 28 | def apply(sparkConf: SparkConf): Neo4jConfig = { 29 | val url = sparkConf.get(prefix + "url", "bolt://192.168.0.33:7687") 30 | val user = sparkConf.get(prefix + "user", "neo4j") 31 | val password: Option[String] = Option(sparkConf.get(prefix + "password", "123456")) 32 | Neo4jConfig(url, user, password) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /neo4j/src/main/scala/com/lakala/datacenter/load/spark/Neo4jJavaIntegration.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.load.spark 2 | 3 | import java.util 4 | 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.api.java.JavaRDD 7 | import org.apache.spark.sql.SQLContext 8 | 9 | import scala.collection.JavaConverters._ 10 | 11 | /** 12 | * @author lys 13 | * @since 19.03.16 14 | */ 15 | object Neo4jJavaIntegration { 16 | def rowRDD(sc: SparkContext, query: String, parameters: java.util.Map[String, AnyRef]) = 17 | new Neo4jRowRDD(sc, query, if (parameters == null) Seq.empty else parameters.asScala.toSeq).toJavaRDD() 18 | 19 | def tupleRDD(sc: SparkContext, query: String, parameters: java.util.Map[String, AnyRef]): JavaRDD[util.Map[String, AnyRef]] = { 20 | val params = if (parameters == null) Seq.empty else parameters.asScala.toSeq 21 | Neo4jTupleRDD(sc, query, params) 22 | .map((t) => new util.LinkedHashMap[String, AnyRef](t.toMap.asJava).asInstanceOf[util.Map[String, AnyRef]]) 23 | .toJavaRDD() 24 | } 25 | 26 | def dataFrame(sqlContext: SQLContext, query: String, parameters: java.util.Map[String, AnyRef], schemaInfo: util.Map[String, String]) = { 27 | Neo4jDataFrame(sqlContext, query, parameters.asScala.toSeq, schemaInfo.asScala.toSeq: _*) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /neo4j/src/main/scala/com/lakala/datacenter/load/spark/Neo4jPartition.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.load.spark 2 | 3 | import org.apache.spark.Partition 4 | 5 | /** 6 | * @author lys 7 | * @since 02.03.16 8 | */ 9 | // , val lower: Long = 0, val upper: Long = 0 -> paging for cypher queries with skip / limit 10 | class Neo4jPartition(idx: Long = 0, skip : Long = 0, limit : Long = Long.MaxValue) extends Partition { 11 | override def index: Int = idx.toInt 12 | val window : Map[String,Any] = Map("_limit" -> limit, "_skip" -> skip) 13 | 14 | override def toString: String = s"Neo4jRDD index $index skip $skip 
limit: $limit" 15 | } 16 | -------------------------------------------------------------------------------- /neo4j/src/main/scala/com/lakala/datacenter/load/spark/Neo4jRowRDD.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.load.spark 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.Row 6 | import org.neo4j.driver.v1._ 7 | 8 | import scala.collection.JavaConverters._ 9 | 10 | class Neo4jRowRDD(@transient sc: SparkContext, val query: String, val parameters: Seq[(String, Any)]) 11 | extends RDD[Row](sc, Nil) { 12 | 13 | private val config = Neo4jConfig(sc.getConf) 14 | 15 | override def compute(split: Partition, context: TaskContext): Iterator[Row] = { 16 | val driver = config.driver() 17 | val session = driver.session() 18 | 19 | val result: StatementResult = session.run(query, parameters.toMap.mapValues(_.asInstanceOf[AnyRef]).asJava) 20 | 21 | result.asScala.map((record) => { 22 | val keyCount = record.size() 23 | 24 | val res = if (keyCount == 0) Row.empty 25 | else if (keyCount == 1) Row(record.get(0).asObject()) 26 | else { 27 | val builder = Seq.newBuilder[AnyRef] 28 | var i = 0 29 | while (i < keyCount) { 30 | builder += record.get(i).asObject() 31 | i = i + 1 32 | } 33 | Row.fromSeq(builder.result()) 34 | } 35 | if (!result.hasNext) { 36 | session.close() 37 | driver.close() 38 | } 39 | res 40 | }) 41 | } 42 | 43 | override protected def getPartitions: Array[Partition] = Array(new Neo4jPartition()) 44 | } 45 | 46 | object Neo4jRowRDD { 47 | def apply(sc: SparkContext, query: String, parameters: Seq[(String, Any)] = Seq.empty) = new Neo4jRowRDD(sc, query, parameters) 48 | } 49 | -------------------------------------------------------------------------------- /neo4j/src/main/scala/com/lakala/datacenter/load/spark/Neo4jTupleRDD.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.load.spark 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.rdd.RDD 5 | import org.neo4j.driver.v1.Driver 6 | 7 | import scala.collection.JavaConverters._ 8 | 9 | class Neo4jTupleRDD(@transient sc: SparkContext, val query: String, val parameters: Seq[(String, AnyRef)]) 10 | extends RDD[Seq[(String, AnyRef)]](sc, Nil) { 11 | 12 | private val config = Neo4jConfig(sc.getConf) 13 | 14 | override def compute(split: Partition, context: TaskContext): Iterator[Seq[(String, AnyRef)]] = { 15 | val driver: Driver = config.driver() 16 | val session = driver.session() 17 | 18 | val result = session.run(query, parameters.toMap.asJava) 19 | 20 | result.asScala.map( (record) => { 21 | val res = record.asMap().asScala.toSeq 22 | if (!result.hasNext) { 23 | session.close() 24 | driver.close() 25 | } 26 | res 27 | }) 28 | } 29 | 30 | override protected def getPartitions: Array[Partition] = Array(new Neo4jPartition()) 31 | } 32 | 33 | object Neo4jTupleRDD { 34 | def apply(sc: SparkContext, query: String, parameters: Seq[(String,AnyRef)] = Seq.empty) = new Neo4jTupleRDD(sc, query, parameters) 35 | } 36 | 37 | 38 | -------------------------------------------------------------------------------- /neo4j/src/main/scala/com/lakala/datacenter/main/Main.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.main 2 | 3 | import java.io.File 4 | 5 | import com.lakala.datacenter.enums.RelationshipTypes 6 | import com.lakala.datacenter.grogram.Neo4jDataGenerator 7 | 
import com.lakala.datacenter.utils.Config 8 | import org.joda.time.DateTime 9 | import org.neo4j.graphdb.factory.GraphDatabaseFactory 10 | import org.neo4j.graphdb.{Direction, GraphDatabaseService} 11 | import org.neo4j.io.fs.FileUtils 12 | import org.slf4j.LoggerFactory 13 | 14 | /** 15 | * Created by Administrator on 2017/5/31 0031. 16 | */ 17 | object Main { 18 | private val logger = LoggerFactory.getLogger("Main") 19 | val COUNT = 100000 //数据批量提交 20 | //F:\tmp\applydir F:\tmp\neo4j\tmp01 21 | val FRIENDS_PER_USER = 50 22 | 23 | def main(args: Array[String]): Unit = { 24 | val mainTime = DateTime.now() 25 | println("start generateGraphData time " + DateTime.now()) 26 | // chackArgs(args) 13199050 27 | // val config = ArgsCommon.parseArgs(args) 28 | val config = new Config() 29 | config.input = args(0) 30 | config.output = args(1) 31 | generateGraphData(config) 32 | val endtime = DateTime.now() 33 | println("end generateGraphData time " + endtime + "+run long time " + (endtime.getMillis - mainTime.getMillis) / 36000) 34 | } 35 | 36 | def generateGraphData(config: Config): Unit = { 37 | FileUtils.deleteRecursively(new File(config.output + "/" + config.neo4jDB)) 38 | var graphdb = new GraphDatabaseFactory().newEmbeddedDatabase(new File(config.output + "/" + config.neo4jDB)) 39 | val neo4jDataGenerator = new Neo4jDataGenerator(graphdb) 40 | //生成数据 41 | neo4jDataGenerator.generateUsers(config) 42 | registerShutdownHook(graphdb) 43 | } 44 | 45 | /** 46 | * START SNIPPET: shutdownHook 47 | * @param graph 48 | */ 49 | def registerShutdownHook(graph: GraphDatabaseService): Unit = { 50 | Runtime.getRuntime.addShutdownHook(new Thread() { 51 | override def run(): Unit = { 52 | graph.shutdown() 53 | } 54 | }) 55 | } 56 | 57 | def chackArgs(args: Array[String]): Unit = { 58 | if (args.length < 1) { 59 | println("Usage: class com.lakala.datacenter.grogress.ExportNDegreeData$ [options]\n" + 60 | "[=....]\n " + 61 | "-i | --Input \n applyInput file or path Required.\n " + 62 | "-o | --output \n output path Required\n " + 63 | "-m | --master \n spark master, local[N] or spark://host:port default=local\n " + 64 | "-h | --sparkhome \n SPARK_HOME Required to run on cluster\n " + 65 | "-n | --jobname \n job name\n " + 66 | "-s | --startDate \n use start date load data\n " + 67 | "-t | --endDate \n use end date load data\n " + 68 | "-p | --parallelism \n sets spark.default.parallelism and minSplits on the edge file. default=based on input partitions\n " + 69 | "-x | --minprogress \n Number of vertices that must change communites for the algorithm to consider progress. default=2000\n " + 70 | "-y | --progresscounter \n Number of times the algorithm can fail to make progress before exiting. default=1\n " + 71 | "-d | --edgedelimiter \n specify input file edge delimiter. default=\",\"\n " + 72 | "-j | --jars \n comma seperated list of jars\n " + 73 | "-e | --encrypy \n Set to true to all data convert encrypy need all data use google hash's MD5 generage Long ids. Defaults to false\n " + 74 | "-b | --blacType \n Set to true to exprot black result data, Defaults to false\n " + 75 | " =.... 
") 76 | sys.exit(1) 77 | } 78 | } 79 | 80 | } 81 | -------------------------------------------------------------------------------- /neo4j/src/main/scala/com/lakala/datacenter/main/MessageParam.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.main 2 | 3 | import kafka.consumer.KafkaStream 4 | import org.neo4j.driver.v1.Session 5 | import redis.clients.jedis.JedisCluster 6 | 7 | /** 8 | * Created by Administrator on 2017/8/7 0007. 9 | */ 10 | case class MessageParam(m_stream: KafkaStream[_, _], m_threadNumber: Int, redis: JedisCluster, 11 | session: Session, sessionBak: Session,psubscribe:String) { 12 | 13 | } 14 | -------------------------------------------------------------------------------- /neo4j/src/main/scala/com/lakala/datacenter/main/TrialConsumerKafka.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.main 2 | 3 | import java.util.Properties 4 | import java.util.concurrent.{ExecutorService, Executors, TimeUnit} 5 | 6 | import com.lakala.datacenter.constant.StreamingConstant 7 | import com.lakala.datacenter.utils.RedisUtils 8 | import com.lakala.datacenter.utils.UtilsTools.properties 9 | import kafka.consumer.{ConsumerConfig, ConsumerConnector, KafkaStream} 10 | import kafka.serializer.StringDecoder 11 | import kafka.utils.VerifiableProperties 12 | import org.apache.commons.lang3.StringUtils.trim 13 | import org.neo4j.driver.v1.{AuthTokens, Driver, GraphDatabase} 14 | import redis.clients.jedis.JedisCluster 15 | 16 | import scala.collection.Map 17 | 18 | /** 19 | * Created by Administrator on 2017/8/7 0007. 20 | * 21 | */ 22 | 23 | object TrialConsumerKafka{ 24 | def main(args: Array[String]): Unit = { 25 | val zooKeeper: String = args(0) 26 | val groupId: String = args(1) 27 | val topic: String = args(2) 28 | val threads: Int = args(3).toInt 29 | println("start trial consumer kafaka message .....") 30 | val example:TrialConsumerKafka= new TrialConsumerKafka(zooKeeper, groupId, topic) 31 | example.run(threads) 32 | 33 | try { 34 | Thread.sleep(10000) 35 | } catch { 36 | case ie: InterruptedException => 37 | println("==============") 38 | } 39 | 40 | } 41 | } 42 | 43 | class TrialConsumerKafka { 44 | private var consumer: ConsumerConnector = null 45 | private var topic: String = null 46 | private var executor: ExecutorService = null 47 | private var driver: Driver = null 48 | private var redis: JedisCluster = RedisUtils.jedisCluster() 49 | val properies = properties(StreamingConstant.CONFIG) 50 | def this(a_zookeeper: String, a_groupId: String, a_topic: String) { 51 | this() 52 | this.topic = a_topic 53 | consumer = kafka.consumer.Consumer.create(createConsumerConfig(a_zookeeper, a_groupId)) 54 | driver = GraphDatabase.driver(trim(properies.getProperty(StreamingConstant.NEOIP)), AuthTokens.basic(trim(properies.getProperty(StreamingConstant.USER)), trim(properies.getProperty(StreamingConstant.PASSWORD)))) 55 | } 56 | 57 | def shutdown(): Unit = { 58 | if (consumer != null) consumer.shutdown 59 | if (executor != null) executor.shutdown 60 | try { 61 | if (!executor.awaitTermination(5000, TimeUnit.MILLISECONDS)) System.out.println("Timed out waiting for consumer threads to shut down, exiting uncleanly") 62 | } catch { 63 | case e: InterruptedException => 64 | System.out.println("Interrupted during shutdown, exiting uncleanly") 65 | } 66 | } 67 | 68 | 69 | def run(a_numThreads: Int): Unit = { 70 | val topicCountMap = Map(topic -> a_numThreads) 
71 | // val topicCountMap = Map(topic -> 1) 72 | val keyDecoder = new StringDecoder(new VerifiableProperties) 73 | val valueDecoder = new StringDecoder(new VerifiableProperties) 74 | val consumerMap: Map[String, List[KafkaStream[String, String]]] = consumer.createMessageStreams(topicCountMap, keyDecoder, valueDecoder) 75 | val streams: List[KafkaStream[String, String]] = consumerMap.get(topic).get 76 | 77 | executor = Executors.newFixedThreadPool(a_numThreads) 78 | var threadNumber = 0 79 | streams.foreach { stream => 80 | executor.submit(new HandleTask(MessageParam(stream, threadNumber, redis, 81 | driver.session, driver.session, properies.getProperty(StreamingConstant.PSUBSCRIBE)))) 82 | threadNumber += 1 83 | } 84 | } 85 | 86 | private def createConsumerConfig(a_zookeeper: String, a_groupId: String): ConsumerConfig = { 87 | val props = new Properties() 88 | props.put("zookeeper.connect", a_zookeeper) 89 | props.put("group.id", a_groupId) 90 | props.put("zookeeper.session.timeout.ms", "60000") 91 | props.put("zookeeper.sync.time.ms", "200") 92 | props.put("auto.commit.interval.ms", "1000") 93 | props.put("auto.offset.reset", "smallest") 94 | props.put("rebalance.max.retries", "5") 95 | props.put("rebalance.backoff.ms", "12000") 96 | props.put("serializer.class", "kafka.serializer.StringEncoder") 97 | new ConsumerConfig(props) 98 | } 99 | } -------------------------------------------------------------------------------- /neo4j/src/main/scala/com/lakala/datacenter/realtimeBuildGraphx/MsgOffsetStreamListener.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.realtimeBuildGraphx 2 | 3 | import com.lakala.datacenter.constant.StreamingConstant 4 | import com.lakala.datacenter.utils.Config 5 | import kafka.utils.{ZKGroupTopicDirs, ZkUtils} 6 | import org.I0Itec.zkclient.ZkClient 7 | import org.I0Itec.zkclient.exception.ZkMarshallingError 8 | import org.I0Itec.zkclient.serialize.ZkSerializer 9 | import org.apache.spark.Logging 10 | import org.apache.spark.streaming.Time 11 | import org.apache.spark.streaming.kafka.OffsetRange 12 | import org.apache.spark.streaming.scheduler.{StreamingListener, StreamingListenerBatchCompleted, StreamingListenerReceiverError, StreamingListenerReceiverStopped} 13 | 14 | import scala.collection.mutable 15 | 16 | /** 17 | * Created by Administrator on 2017/6/9 0009. 
18 | */ 19 | class MsgOffsetStreamListener(config: Config, offsetRanges: mutable.Map[Time, Array[OffsetRange]]) extends StreamingListener with Logging { 20 | 21 | var zkClient = getZkClient(config.zkIPs) 22 | // val zkUtils = ZkUtils.apply(zkClient,true) 23 | val topicDirs = new ZKGroupTopicDirs(config.group, config.topic) 24 | 25 | override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = { 26 | //创建一个 ZKGroupTopicDirs 对象,对保存 27 | //查询该路径下是否字节点(默认有字节点为我们自己保存不同 partition 时生成的) 28 | // println(batchCompleted.batchInfo.numRecords) 29 | if (batchCompleted.batchInfo.numRecords > 0) { 30 | val currOffsetRange = offsetRanges.remove(batchCompleted.batchInfo.batchTime).getOrElse(Array[OffsetRange]()) 31 | currOffsetRange.foreach { x => 32 | val zkPath = s"${topicDirs.consumerOffsetDir}/${x.partition}" 33 | //将该 partition 的 offset 保存到 zookeeper 34 | // ZkUtils.apply(zkClient,true).updatePersistentPath(zkPath, s"${x.fromOffset}") 35 | ZkUtils.updatePersistentPath(zkClient, zkPath, s"${x.fromOffset}") 36 | println(s"zkPath:${zkPath} offset:fromOffset ${x.fromOffset} untilOffset ${x.untilOffset}") 37 | // logInfo(s"zkPath:${zkPath} offset:fromOffset ${x.fromOffset} untilOffset ${x.untilOffset}") 38 | } 39 | } 40 | } 41 | 42 | override def onReceiverError(receiverError: StreamingListenerReceiverError): Unit = { 43 | val topicDirs = new ZKGroupTopicDirs(config.group, config.topic) 44 | logError(s"ERROR:${receiverError.receiverInfo.lastError}\n Message:${receiverError.receiverInfo.lastErrorMessage}") 45 | val currOffsetRange = offsetRanges.remove(Time.apply(receiverError.receiverInfo.lastErrorTime)).getOrElse(Array[OffsetRange]()) 46 | currOffsetRange.foreach { x => 47 | val zkPath = s"${topicDirs.consumerOffsetDir}/${x.partition}" 48 | // ZkUtils.apply(zkClient,true).updatePersistentPath(zkPath, s"${x.fromOffset}") 49 | ZkUtils.updatePersistentPath(zkClient, zkPath, s"${x.fromOffset}") 50 | println(s"zkPath:${zkPath} offset:fromOffset ${x.fromOffset} untilOffset ${x.untilOffset}") 51 | // logInfo(s"zkPath:${zkPath} offset:fromOffset ${x.fromOffset} untilOffset ${x.untilOffset}") 52 | } 53 | } 54 | 55 | def getZkClient(zkServers: String, sessionTimeout: Int = 60000, connectionTimeout: Int = 60000): ZkClient = { 56 | val zkClient = new ZkClient(zkServers, sessionTimeout, connectionTimeout, new ZkSerializer { 57 | override def serialize(data: Object): Array[Byte] = { 58 | try { 59 | return data.toString().getBytes(StreamingConstant.CODE) 60 | } catch { 61 | case e: ZkMarshallingError => return null 62 | 63 | } 64 | } 65 | override def deserialize(bytes: Array[Byte]): Object = { 66 | try { 67 | return new String(bytes, StreamingConstant.CODE) 68 | } catch { 69 | case e: ZkMarshallingError => return null 70 | } 71 | } 72 | }) 73 | zkClient 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /neo4j/src/main/scala/com/lakala/datacenter/realtimeBuildGraphx/SendMsg.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.realtimeBuildGraphx 2 | 3 | /** 4 | * Created by Administrator on 2017/8/2 0002. 
5 | */ 6 | case class SendMsg(orderno:String,insert_time:String,cert_no:String) { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /neo4j/src/main/scala/com/lakala/datacenter/utils/RedisUtils.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.utils 2 | 3 | 4 | import java.util 5 | 6 | import com.lakala.datacenter.constant.StreamingConstant 7 | import com.lakala.datacenter.utils.UtilsTools.properties 8 | import redis.clients.jedis.{HostAndPort, JedisCluster} 9 | 10 | import scala.collection.JavaConversions 11 | 12 | /** 13 | * Created by Administrator on 2017/6/29 0029. 14 | */ 15 | object RedisUtils { 16 | private var cluster: JedisCluster = _ 17 | private val properies = properties(StreamingConstant.CONFIG) 18 | 19 | def jedisCluster(): JedisCluster = { 20 | if (cluster == null) { 21 | synchronized { 22 | if (cluster == null) { 23 | val cluseterNodesSet = for (ipAndPort <- properies.getProperty("redisIp").split(",")) yield 24 | new HostAndPort(ipAndPort.split(":")(0).trim, (ipAndPort.split(":")(1).trim).toInt) 25 | cluster = new JedisCluster(JavaConversions.setAsJavaSet[HostAndPort](cluseterNodesSet.toSet)) 26 | } 27 | } 28 | } 29 | cluster 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /neo4j/src/main/scala/com/lakala/datacenter/utils/UtilsTools.scala: -------------------------------------------------------------------------------- 1 | package com.lakala.datacenter.utils 2 | 3 | import java.io.Serializable 4 | import java.util.Properties 5 | 6 | import org.slf4j.LoggerFactory 7 | 8 | /** 9 | * Created by lenovo on 2016/8/10. 10 | */ 11 | object UtilsTools { 12 | private val logger = LoggerFactory.getLogger(this.getClass) 13 | 14 | def properties(propertiesPath: String): Properties = { 15 | var _properties: Option[Properties] = None 16 | _properties match { 17 | case None => { 18 | logger.info("Loading configuration...") 19 | val inputStream = this.getClass.getClassLoader.getResourceAsStream(propertiesPath) 20 | val underlying = new Properties() 21 | underlying.load(inputStream) 22 | _properties = Some(underlying) 23 | underlying 24 | } 25 | case Some(underlying) => { 26 | underlying 27 | } 28 | } 29 | _properties.get 30 | } 31 | 32 | 33 | } 34 | -------------------------------------------------------------------------------- /neo4j/src/test/java/ConsumerKafka.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by Administrator on 2017/8/7 0007. 
3 | */ 4 | 5 | import kafka.consumer.Consumer; 6 | import kafka.consumer.ConsumerConfig; 7 | import kafka.consumer.ConsumerIterator; 8 | import kafka.consumer.KafkaStream; 9 | import kafka.javaapi.consumer.ConsumerConnector; 10 | import kafka.message.MessageAndMetadata; 11 | import kafka.serializer.StringEncoder; 12 | 13 | import java.util.HashMap; 14 | import java.util.List; 15 | import java.util.Map; 16 | import java.util.Properties; 17 | import java.util.concurrent.ExecutorService; 18 | import java.util.concurrent.Executors; 19 | 20 | public class ConsumerKafka { 21 | private ConsumerConfig config; 22 | private String topic; 23 | private int partitionsNum; 24 | private MessageExecutor executor; 25 | private ConsumerConnector connector; 26 | private ExecutorService threadPool; 27 | 28 | public ConsumerKafka(String topic, int partitionsNum, MessageExecutor executor) throws Exception { 29 | Properties prop = new Properties(); 30 | prop.put("auto.offset.reset", "smallest"); // required if old, already-committed messages should be read 31 | prop.put("zookeeper.connect", "192.168.0.208:2181,192.168.0.211:2181,192.168.0.212:2181"); 32 | prop.put("serializer.class", StringEncoder.class.getName()); 33 | prop.put("metadata.broker.list", "192.168.0.211:9092,192.168.0.212:9092"); 34 | prop.put("group.id", "test-consumer-group"); 35 | config = new ConsumerConfig(prop); 36 | this.topic = topic; 37 | this.partitionsNum = partitionsNum; 38 | this.executor = executor; 39 | } 40 | 41 | public void start() throws Exception { 42 | connector = Consumer.createJavaConsumerConnector(config); 43 | Map<String, Integer> topics = new HashMap<String, Integer>(); 44 | topics.put(topic, partitionsNum); 45 | Map<String, List<KafkaStream<byte[], byte[]>>> streams = connector.createMessageStreams(topics); 46 | List<KafkaStream<byte[], byte[]>> partitions = streams.get(topic); 47 | threadPool = Executors.newFixedThreadPool(partitionsNum); 48 | for (KafkaStream<byte[], byte[]> partition : partitions) { 49 | threadPool.execute(new MessageRunner(partition)); 50 | } 51 | } 52 | 53 | 54 | public void close() { 55 | try { 56 | threadPool.shutdownNow(); 57 | } catch (Exception e) { 58 | // 59 | } finally { 60 | connector.shutdown(); 61 | } 62 | 63 | } 64 | 65 | class MessageRunner implements Runnable { 66 | private KafkaStream<byte[], byte[]> partition; 67 | 68 | MessageRunner(KafkaStream<byte[], byte[]> partition) { 69 | this.partition = partition; 70 | } 71 | 72 | public void run() { 73 | ConsumerIterator<byte[], byte[]> it = partition.iterator(); 74 | while (it.hasNext()) { 75 | MessageAndMetadata<byte[], byte[]> item = it.next(); 76 | System.out.println("partition:" + item.partition()); 77 | System.out.println("offset:" + item.offset()); 78 | executor.execute(new String(item.message()));//UTF-8 79 | } 80 | } 81 | } 82 | 83 | interface MessageExecutor { 84 | 85 | public void execute(String message); 86 | } 87 | 88 | /** 89 | * @param args 90 | */ 91 | public static void main(String[] args) { 92 | ConsumerKafka consumer = null; 93 | try { 94 | MessageExecutor executor = new MessageExecutor() { 95 | 96 | public void execute(String message) { 97 | System.out.println(message); 98 | } 99 | }; 100 | consumer = new ConsumerKafka("topic1", 3, executor); 101 | consumer.start(); 102 | } catch (Exception e) { 103 | e.printStackTrace(); 104 | } finally { 105 | if (consumer != null) { 106 | consumer.close(); 107 | } 108 | } 109 | 110 | } 111 | 112 | } 113 | -------------------------------------------------------------------------------- /neo4j/src/test/java/DataAttributeType.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by Administrator on 2017/6/16 0016.
3 | */ 4 | interface DataInterface{} 5 | 6 | public enum DataAttributeType implements DataInterface { 7 | ORDERID(1, "orderid"), CONTRACTNO(2, "contractno"), TERMID(3, "termid"), LOANPAN(4, "loanpan"), RETURNPAN(5, "returnpan"), 8 | INSERTTIME(6, "inserttime"), RECOMMEND(7, "recommend"), USERID(8, "userid"), DEVICEID(9, "deviceid"), 9 | CERTNO(10, "certno"), EMAIL(11, "email"), COMPANY(12, "company"), MOBILE(13, "mobile"), COMPADDR(14, "compaddr"), 10 | COMPPHONE(15, "compphone"), EMERGENCYCONTACTMOBILE(16, "emergencycontactmobile"), 11 | CONTACTMOBILE(17, "contactmobile"), IPV4(18, "ipv4"), MSGPHONE(19, "msgphone"), TELECODE(20, "telecode"); 12 | //成员变量 13 | private int sequence; 14 | private String name; 15 | 16 | //构造方法 17 | private DataAttributeType(int sequence, String name) { 18 | this.sequence = sequence; 19 | this.name = name; 20 | } 21 | 22 | //自定义方法 23 | public static String getColorName(int sequence) { 24 | for (DataAttributeType c : DataAttributeType.values()) { 25 | if (c.getSequence() == sequence) 26 | return c.name; 27 | } 28 | return null; 29 | } 30 | 31 | //getter&setter 32 | public int getSequence() { 33 | return sequence; 34 | } 35 | 36 | public void setSequence(int sequence) { 37 | this.sequence = sequence; 38 | } 39 | 40 | public String getName() { 41 | return name; 42 | } 43 | 44 | public void setName(String name) { 45 | this.name = name; 46 | } 47 | } 48 | 49 | -------------------------------------------------------------------------------- /neo4j/src/test/java/JavaKafkaSimpleConsumerAPITest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by Administrator on 2017/6/21 0021. 3 | *//* 4 | 5 | 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | */ 10 | /** 11 | * Created by gerry on 12/21. 12 | *//* 13 | 14 | public class JavaKafkaSimpleConsumerAPITest { 15 | public static void main(String[] args) { 16 | JavaKafkaSimpleConsumerAPI example = new JavaKafkaSimpleConsumerAPI(); 17 | long maxReads = 300; 18 | String topic = "logCollect_cleanData"; 19 | int partitionID = 2; 20 | 21 | KafkaTopicPartitionInfo topicPartitionInfo = new KafkaTopicPartitionInfo(topic, partitionID); 22 | List seeds = new ArrayList(); 23 | seeds.add(new KafkaBrokerInfo("192.168.0.211", 9092)); 24 | seeds.add(new KafkaBrokerInfo("192.168.0.212", 9092)); 25 | 26 | try { 27 | example.run(maxReads, topicPartitionInfo, seeds); 28 | } catch (Exception e) { 29 | e.printStackTrace(); 30 | } 31 | 32 | // 获取该topic所属的所有分区ID列表 33 | System.out.println(example.fetchTopicPartitionIDs(seeds, topic, 100000, 64 * 1024, "client-id")); 34 | } 35 | } 36 | */ 37 | -------------------------------------------------------------------------------- /neo4j/src/test/java/KafkaBrokerInfo.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Kafka服务器连接参数 3 | * Created by Administrator on 2017/6/21 0021. 
4 | */ 5 | 6 | public class KafkaBrokerInfo { 7 | // broker host name 8 | public final String brokerHost; 9 | // broker port 10 | public final int brokerPort; 11 | 12 | /** 13 | * Constructor 14 | * 15 | * @param brokerHost Kafka broker host name or IP address 16 | * @param brokerPort port number 17 | */ 18 | public KafkaBrokerInfo(String brokerHost, int brokerPort) { 19 | this.brokerHost = brokerHost; 20 | this.brokerPort = brokerPort; 21 | } 22 | 23 | /** 24 | * Constructor using the default port 9092 25 | * 26 | * @param brokerHost 27 | */ 28 | public KafkaBrokerInfo(String brokerHost) { 29 | this(brokerHost, 9092); 30 | } 31 | } -------------------------------------------------------------------------------- /neo4j/src/test/java/KafkaConsumer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by Administrator on 2017/6/8 0008. 3 | */ 4 | 5 | import kafka.consumer.ConsumerConfig; 6 | import kafka.consumer.ConsumerIterator; 7 | import kafka.consumer.KafkaStream; 8 | import kafka.javaapi.consumer.ConsumerConnector; 9 | import kafka.serializer.StringDecoder; 10 | import kafka.utils.VerifiableProperties; 11 | 12 | import java.util.HashMap; 13 | import java.util.List; 14 | import java.util.Map; 15 | import java.util.Properties; 16 | 17 | public class KafkaConsumer { 18 | 19 | private final ConsumerConnector consumer; 20 | // private String TOPIC ="topic_creditloan_orderinfo_wait_score"; 21 | private String TOPIC ="logCollect_cleanData"; 22 | private KafkaConsumer() { 23 | Properties props = new Properties(); 24 | // ZooKeeper connection string 25 | props.put("zookeeper.connect", "192.168.0.208:2181,192.168.0.211:2181,192.168.0.212:2181"); 26 | 27 | // group.id identifies the consumer group 28 | // props.put("group.id", "test-consumer-group125"); 29 | props.put("group.id", "testcheatgraph"); 30 | 31 | // ZooKeeper session timeout 32 | props.put("zookeeper.session.timeout.ms", "60000"); 33 | props.put("zookeeper.sync.time.ms", "200"); 34 | props.put("auto.commit.interval.ms", "1000"); 35 | props.put("auto.offset.reset", "smallest"); 36 | props.put("rebalance.max.retries", "5"); 37 | props.put("rebalance.backoff.ms", "12000"); 38 | // serializer class 39 | props.put("serializer.class", "kafka.serializer.StringEncoder"); 40 | 41 | ConsumerConfig config = new ConsumerConfig(props); 42 | 43 | consumer = kafka.consumer.Consumer.createJavaConsumerConnector(config); 44 | } 45 | 46 | void consume() { 47 | Map<String, Integer> topicCountMap = new HashMap<String, Integer>(); 48 | topicCountMap.put(TOPIC, new Integer(1)); 49 | 50 | StringDecoder keyDecoder = new StringDecoder(new VerifiableProperties()); 51 | StringDecoder valueDecoder = new StringDecoder(new VerifiableProperties()); 52 | 53 | Map<String, List<KafkaStream<String, String>>> consumerMap = 54 | consumer.createMessageStreams(topicCountMap, keyDecoder, valueDecoder); 55 | KafkaStream<String, String> stream = consumerMap.get(TOPIC).get(0); 56 | ConsumerIterator<String, String> it = stream.iterator(); 57 | while (it.hasNext()) 58 | System.out.println(it.next().message()); 59 | } 60 | 61 | public static void main(String[] args) { 62 | new KafkaConsumer().consume(); 63 | } 64 | } -------------------------------------------------------------------------------- /neo4j/src/test/java/KafkaTopicPartitionInfo.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by Administrator on 2017/6/21 0021.
3 | */ 4 | 5 | public class KafkaTopicPartitionInfo { 6 | // 主题名称 7 | public final String topic; 8 | // 分区id 9 | public final int partitionID; 10 | 11 | /** 12 | * 构造函数 13 | * 14 | * @param topic 主题名称 15 | * @param partitionID 分区id 16 | */ 17 | public KafkaTopicPartitionInfo(String topic, int partitionID) { 18 | this.topic = topic; 19 | this.partitionID = partitionID; 20 | } 21 | 22 | @Override 23 | public boolean equals(Object o) { 24 | if (this == o) return true; 25 | if (o == null || getClass() != o.getClass()) return false; 26 | 27 | KafkaTopicPartitionInfo that = (KafkaTopicPartitionInfo) o; 28 | 29 | if (partitionID != that.partitionID) return false; 30 | return topic != null ? topic.equals(that.topic) : that.topic == null; 31 | 32 | } 33 | 34 | @Override 35 | public int hashCode() { 36 | int result = topic != null ? topic.hashCode() : 0; 37 | result = 31 * result + partitionID; 38 | return result; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /neo4j/src/test/java/OperatorKafka.java: -------------------------------------------------------------------------------- 1 | //import com.lakala.datacenter.constant.StreamingConstant; 2 | //import kafka.admin.AdminUtils; 3 | //import org.I0Itec.zkclient.ZkClient; 4 | //import org.I0Itec.zkclient.exception.ZkMarshallingError; 5 | //import org.I0Itec.zkclient.serialize.ZkSerializer; 6 | // 7 | //import java.io.UnsupportedEncodingException; 8 | //import java.util.Iterator; 9 | //import java.util.Map; 10 | //import java.util.Properties; 11 | // 12 | ///** 13 | // * Created by Administrator on 2017/8/2 0002. 14 | // */ 15 | //public class OperatorKafka { 16 | // public static void main(String[] args) { 17 | // createTopic(); 18 | // } 19 | // 20 | // public static void createTopic() { 21 | // ZkClient zkUtils = getZk(); 22 | //// 创建一个单分区单副本名为t1的topic 23 | // AdminUtils.createTopic(zkUtils, "logCollect_cleanData", 3, 1, new Properties()); 24 | // zkUtils.close(); 25 | // } 26 | // 27 | // public static void deleteTopic() { 28 | // ZkClient zkUtils = getZk(); 29 | //// 创建一个单分区单副本名为t1的topic 30 | // AdminUtils.deleteTopic(zkUtils, "logCollect_cleanData"); 31 | // zkUtils.close(); 32 | // } 33 | // 34 | // public static void queryTopic() { 35 | // ZkClient zkUtils = getZk(); 36 | // // 获取topic 'test'的topic属性属性 37 | // Properties props = AdminUtils.fetchTopicConfig(zkUtils, "logCollect_cleanData"); 38 | //// 查询topic-level属性 39 | // Iterator it = props.entrySet().iterator(); 40 | // while (it.hasNext()) { 41 | // Map.Entry entry = (Map.Entry) it.next(); 42 | // Object key = entry.getKey(); 43 | // Object value = entry.getValue(); 44 | // System.out.println(key + " = " + value); 45 | // } 46 | // zkUtils.close(); 47 | // } 48 | // 49 | // 50 | // public static void updateTopic() { 51 | // ZkClient zkUtils = getZk(); 52 | // Properties props = AdminUtils.fetchTopicConfig(zkUtils, "logCollect_cleanData"); 53 | //// 增加topic级别属性 54 | // props.put("min.cleanable.dirty.ratio", "0.3"); 55 | //// 删除topic级别属性 56 | // props.remove("max.message.bytes"); 57 | //// 修改topic 'test'的属性 58 | // AdminUtils.changeTopicConfig(zkUtils, "logCollect_cleanData", props); 59 | // } 60 | // 61 | // public static ZkClient getZk() { 62 | // ZkClient zkUtils = new ZkClient("192.168.0.208:2181,192.168.0.211:2181,192.168.0.212:2181", 60000, 60000, new ZkSerializer() { 63 | // @Override 64 | // public byte[] serialize(Object data) throws ZkMarshallingError { 65 | // try { 66 | // return 
data.toString().getBytes(StreamingConstant.CODE()); 67 | // } catch (UnsupportedEncodingException e) { 68 | // e.printStackTrace(); 69 | // } 70 | // return new byte[0]; 71 | // } 72 | // 73 | // @Override 74 | // public Object deserialize(byte[] bytes) throws ZkMarshallingError { 75 | // try { 76 | // return new String(bytes, StreamingConstant.CODE()); 77 | // } catch (UnsupportedEncodingException e) { 78 | // e.printStackTrace(); 79 | // } 80 | // return new byte[0]; 81 | // } 82 | // }); 83 | // return zkUtils; 84 | // } 85 | //} 86 | -------------------------------------------------------------------------------- /neo4j/src/test/java/TestCypher.java: -------------------------------------------------------------------------------- 1 | import org.neo4j.driver.v1.*; 2 | 3 | import java.util.List; 4 | 5 | import static org.neo4j.driver.v1.Values.parameters; 6 | 7 | /** 8 | * Created by Administrator on 2017/8/2 0002. 9 | */ 10 | public class TestCypher { 11 | Driver driver = GraphDatabase.driver("bolt://localhost", AuthTokens.basic("neo4j", "123456")); 12 | 13 | public int addEmployees(final String companyName) { 14 | try (Session session = driver.session()) { 15 | int employees = 0; 16 | List<Record> persons = session.readTransaction(new TransactionWork<List<Record>>() { 17 | @Override 18 | public List<Record> execute(Transaction tx) { 19 | return matchPersonNodes(tx); 20 | } 21 | }); 22 | for (final Record person : persons) { 23 | employees += session.writeTransaction(new TransactionWork<Integer>() { 24 | @Override 25 | public Integer execute(Transaction tx) { 26 | tx.run("MATCH (emp:Person {name: $person_name}) " + 27 | "MERGE (com:Company {name: $company_name}) " + 28 | "MERGE (emp)-[:WORKS_FOR]->(com)", 29 | parameters("person_name", person.get("name").asString(), "company_name", 30 | companyName)); 31 | return 1; 32 | } 33 | }); 34 | } 35 | return employees; 36 | } 37 | } 38 | 39 | private static List<Record> matchPersonNodes(Transaction tx) { 40 | return tx.run("MATCH (a:Person) RETURN a.name AS name").list(); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /neo4j/src/test/scala/BroadcastAccumulatorStreaming.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by Administrator on 2017/8/10 0010. 3 | */ 4 | 5 | import org.apache.spark.broadcast.Broadcast 6 | import org.apache.spark.streaming.{Duration, StreamingContext} 7 | import org.apache.spark.{Accumulator, SparkConf, SparkContext} 8 | 9 | /** 10 | * Created by lxh on 2016/6/30. 11 | */ 12 | object BroadcastAccumulatorStreaming { 13 | 14 | /** 15 | * Declare a broadcast variable and an accumulator. 16 | */ 17 | private var broadcastList: Broadcast[List[String]] = _ 18 | private var accumulator: Accumulator[Int] = _ 19 | 20 | def main(args: Array[String]) { 21 | 22 | val sparkConf = new SparkConf().setMaster("local[4]").setAppName("broadcasttest") 23 | val sc = new SparkContext(sparkConf) 24 | 25 | /** 26 | * the batch duration is in milliseconds 27 | */ 28 | val ssc = new StreamingContext(sc, Duration(2000)) 29 | // broadcastList = ssc.sparkContext.broadcast(util.Arrays.asList("Hadoop","Spark")) 30 | broadcastList = ssc.sparkContext.broadcast(List("Hadoop", "Spark")) 31 | accumulator = ssc.sparkContext.accumulator(0, "broadcasttest") 32 | 33 | /** 34 | * Fetch the input data. 35 | */ 36 | val lines = ssc.socketTextStream("localhost", 9999) 37 | 38 | /** 39 | * 1. flatMap splits each line into words. 40 | * 2. map turns each word into a tuple (word, 1). 41 | * 3. reduceByKey sums the values per word. 42 | * (4. sortByKey for ranking) 43 | * 4. Filter: keep only words contained in the broadcast list. 44 | * 5. Print the result. 45 | */ 46 | val words = lines.flatMap(line => line.split(" ")) 47 | 48 | val wordpair = words.map(word => (word, 1)) 49 | 50 | wordpair.filter(record => { // note: the result of this filter is never assigned, so it has no effect 51 | broadcastList.value.contains(record._1) 52 | }) 53 | 54 | 55 | val pair = wordpair.reduceByKey(_ + _) 56 | 57 | /** 58 | * pair is a PairDStream. 59 | * Check whether the id is in the blacklist; if it is, increment the accumulator. 60 | */ 61 | /* pair.foreachRDD(rdd => { 62 | rdd.filter(record => { 63 | 64 | if (broadcastList.value.contains(record._1)) { 65 | accumulator.add(1) 66 | return true 67 | } else { 68 | return false 69 | } 70 | 71 | }) 72 | 73 | })*/ 74 | 75 | val filtedpair = pair.filter(record => { 76 | if (broadcastList.value.contains(record._1)) { 77 | accumulator.add(record._2) 78 | true 79 | } else { 80 | false 81 | } 82 | 83 | }).print 84 | 85 | println("accumulator value " + accumulator.value) 86 | 87 | // pair.filter(record => {broadcastList.value.contains(record._1)}) 88 | 89 | val keypair = pair.map(pair => (pair._2,pair._1)) 90 | 91 | /** 92 | * If the DStream itself does not provide an operator, apply it on the underlying RDD via transform. 93 | */ 94 | keypair.transform(rdd => { 95 | rdd.sortByKey(false)//TODO 96 | }) 97 | pair.print() 98 | ssc.start() 99 | ssc.awaitTermination() 100 | 101 | } 102 | 103 | } 104 | -------------------------------------------------------------------------------- /neo4j/src/test/scala/ClientRedisTest.scala: -------------------------------------------------------------------------------- 1 | import com.lakala.datacenter.utils.RedisUtils 2 | import redis.clients.jedis.JedisPubSub 3 | 4 | /** 5 | * Created by Administrator on 2017/6/29 0029. 6 | */ 7 | object ClientRedisTest { 8 | def main(args: Array[String]): Unit = { 9 | val jedis = RedisUtils.jedisCluster() 10 | println(jedis.subscribe(new ApplyPubSubListener(),args(0))) 11 | } 12 | 13 | class ApplyPubSubListener extends JedisPubSub { 14 | 15 | override def onMessage(channel: String, message: String): Unit = { 16 | System.out.println(channel + " onMessage=" + message) 17 | super.onMessage(channel, message) 18 | } 19 | // called when a subscription is set up 20 | override def onSubscribe(channel: String, subscribedChannels: Int) { 21 | System.out.println(channel + " onSubscribe=" + subscribedChannels); 22 | } 23 | 24 | // called when a subscription is cancelled 25 | override def onUnsubscribe(channel: String, subscribedChannels: Int) { 26 | System.out.println(channel + "onUnsubscribe=" + subscribedChannels); 27 | } 28 | 29 | // called when a pattern subscription is set up 30 | override def onPSubscribe(pattern: String, subscribedChannels: Int) { 31 | System.out.println(pattern + "onPSubscribe=" + subscribedChannels); 32 | } 33 | 34 | // called when a pattern subscription is cancelled 35 | override def onPUnsubscribe(pattern: String, subscribedChannels: Int) { 36 | System.out.println(pattern + "onPUnsubscribe=" + subscribedChannels); 37 | } 38 | 39 | // called when a message arrives on a pattern subscription 40 | override def onPMessage(pattern: String, channel: String, message:String ) { 41 | System.out.println(pattern + "onPMessage=" + channel + "=" + message); 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /neo4j/src/test/scala/CollectionUtil.scala: -------------------------------------------------------------------------------- 1 | import scala.collection.mutable.ArrayBuffer 2 | import scala.reflect.ClassTag 3 | 4 | /** 5 | * Created by Administrator on 2017/8/15 0015.
6 | */ 7 | object CollectionUtil { 8 | /** 9 | * 对具有Traversable[(K, V)]类型的集合添加reduceByKey相关方法 10 | * 11 | * @param collection 12 | * @param kt 13 | * @param vt 14 | * @tparam K 15 | * @tparam V 16 | */ 17 | implicit class CollectionHelper[K, V](collection: Traversable[(K, V)])(implicit kt: ClassTag[K], vt: ClassTag[V]) { 18 | def reduceByKey(f: (V, V) => V): Traversable[(K, V)] = collection.groupBy(_._1).map { case (_: K, values: Traversable[(K, V)]) => values.reduce((a, b) => (a._1, f(a._2, b._2))) } 19 | 20 | /** 21 | * reduceByKey的同时,返回被reduce掉的元素的集合 22 | * 23 | * @param f 24 | * @return 25 | */ 26 | def reduceByKeyWithReduced(f: (V, V) => V)(implicit kt: ClassTag[K], vt: ClassTag[V]): (Traversable[(K, V)], Traversable[(K, V)]) = { 27 | val reduced: ArrayBuffer[(K, V)] = ArrayBuffer() 28 | val newSeq = collection.groupBy(_._1).map { 29 | case (_: K, values: Traversable[(K, V)]) => values.reduce((a, b) => { 30 | val newValue: V = f(a._2, b._2) 31 | val reducedValue: V = if (newValue == a._2) b._2 else a._2 32 | val reducedPair: (K, V) = (a._1, reducedValue) 33 | reduced += reducedPair 34 | (a._1, newValue) 35 | }) 36 | } 37 | (newSeq, reduced.toTraversable) 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /neo4j/src/test/scala/ConsumerGroupExample.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by Administrator on 2017/8/7 0007. 3 | */ 4 | 5 | 6 | import com.lakala.datacenter.main.TrialConsumerKafka 7 | 8 | object ConsumerGroupExample { 9 | def main(args: Array[String]): Unit = { 10 | TrialConsumerKafka.main(Array("192.168.0.208:2181,192.168.0.211:2181,192.168.0.212:2181", "test-consumer-group", 11 | "logCollect_cleanData", "3")) 12 | } 13 | } 14 | 15 | -------------------------------------------------------------------------------- /neo4j/src/test/scala/Main.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by Administrator on 2017/8/1 0001. 3 | */ 4 | import org.neo4j.driver.v1.GraphDatabase 5 | import org.neo4j.driver.v1.AuthTokens 6 | import com.lakala.datacenter.cypher.NeoData._ 7 | 8 | object Main { 9 | 10 | def main(args: Array[String]): Unit = { 11 | 12 | val driver = GraphDatabase.driver("bolt://localhost", AuthTokens.basic("neo4j", "123456")) 13 | 14 | val session = driver.session(); 15 | 16 | val nodes = allNodes(session) 17 | 18 | println(nodes.mkString("\n")) 19 | } 20 | } -------------------------------------------------------------------------------- /neo4j/src/test/scala/TestApiNeo4j.scala: -------------------------------------------------------------------------------- 1 | //import java.util 2 | //import java.util.Map 3 | // 4 | //import com.lakala.datacenter.constant.StreamingConstant 5 | //import com.lakala.datacenter.utils.UtilsTools.properties 6 | //import org.apache.commons.lang3.StringUtils.trim 7 | //import org.neo4j.rest.graphdb.RestAPIFacade 8 | //import org.neo4j.rest.graphdb.batch.CypherResult 9 | //import org.neo4j.rest.graphdb.query.RestCypherQueryEngine 10 | //import org.neo4j.rest.graphdb.util.QueryResult 11 | // 12 | ///** 13 | // * Created by Administrator on 2017/7/12 0012. 
14 | // */ 15 | //object TestApiNeo4j { 16 | // def main(args: Array[String]): Unit = { 17 | //// val properies = properties(StreamingConstant.CONFIG) 18 | //// val restAPI = new RestAPIFacade(trim(properies.getProperty(StreamingConstant.NEOIP)), trim(properies.getProperty(StreamingConstant.USER)), trim(properies.getProperty(StreamingConstant.PASSWORD))) 19 | // 20 | // import scala.collection.JavaConversions._ 21 | // // 22 | // // // 23 | // // val orderno ="AX20160722090751068917" 24 | // // val centro = "500227198611307710" 25 | // // val applyNodeIndexs = restAPI.getNodesByLabelAndProperty("" + Labels.ApplyInfo, StreamingConstant.ORDERNO, orderno) 26 | // // 27 | // // val apply = applyNodeIndexs.toList 28 | // // if (apply.size == 0) { 29 | // // val applyNode = restAPI.createNode(MapUtil.map(StreamingConstant.ORDERNO, orderno.toUpperCase,StreamingConstant.MODELNAME, Labels.ApplyInfo)) 30 | // // applyNode.addLabel(Labels.ApplyInfo) 31 | // // applyNode.setProperty(StreamingConstant.ORDERNO,orderno) 32 | // // 33 | // // val contentIndexs = restAPI.getNodesByLabelAndProperty("Identification", StreamingConstant.CONTENT, centro) 34 | // // val list = contentIndexs.toList 35 | // // println(list.size) 36 | // // var otherNode: RestNode = if (list.size == 0) { 37 | // // val otherNode2 = restAPI.createNode(MapUtil.map(StreamingConstant.MODELNAME, "Identification", StreamingConstant.CONTENT, centro)) 38 | // // otherNode2.setProperty(StreamingConstant.CONTENT, centro) 39 | // // otherNode2.addLabel(Labels.Identification) 40 | // // otherNode2 41 | // // } else { 42 | // // applyNode.setProperty("cert_no", centro) 43 | // // list.get(0) 44 | // // } 45 | // // 46 | // // applyNode.createRelationshipTo(otherNode, RelationshipTypes.identification) 47 | // // println(otherNode.getId) 48 | // // println(applyNode.getId) 49 | // // } 50 | // val restAPI = new RestAPIFacade(trim("http://192.168.0.33:7474/db/data"), trim("neo4j"), trim("123456")) 51 | // val result = restAPI.query("MATCH (:Person {name:'Keanu'})-[:ACTED_IN]->(:Movie {title:'Matrix'}) RETURN count(*) as c" ,null) 52 | // val it = result.getData 53 | // it.flatten.toList.get(0) 54 | // println(it.flatten.toList.get(0)) 55 | // } 56 | //} 57 | -------------------------------------------------------------------------------- /neo4j/src/test/scala/TestCypher.scala: -------------------------------------------------------------------------------- 1 | import com.lakala.datacenter.common.utils.DateTimeUtils 2 | import com.lakala.datacenter.constant.StreamingConstant 3 | import org.apache.commons.lang3.StringUtils 4 | import org.joda.time.DateTime 5 | import org.neo4j.driver.v1._ 6 | 7 | /** 8 | * Created by Administrator on 2017/8/2 0002. 
9 | */ 10 | object TestCypher2 { 11 | val driver: Driver = GraphDatabase.driver("bolt://localhost:7687", AuthTokens.basic("neo4j", "123456")) 12 | 13 | def main(args: Array[String]): Unit = { 14 | var map: java.util.HashMap[String, String] = new java.util.HashMap[String, String]() 15 | var paramMap: java.util.HashMap[String, String] = new java.util.HashMap[String, String]() 16 | map.put("orderno", "TNA20170623102711010234032084429") 17 | map.put("_DeviceId", "A000005966DFEA") 18 | map.put("mobile", "18961922790") 19 | 20 | runCypherApply(driver.session(), map) 21 | driver.close() 22 | } 23 | 24 | private def runCypherApply(session: Session, map: java.util.HashMap[String, String]): Unit = { 25 | val applyStatementTemplate = new StringBuffer("MERGE (apply:ApplyInfo {orderno:$orderno})") 26 | applyStatementTemplate.append(" ON MATCH SET apply.modelname='ApplyInfo',apply.insertTime=$insertTime,apply.user_id=$user_id") 27 | val otherStatementTemplate = new StringBuffer() 28 | val relStatementTemplate = new StringBuffer() 29 | 30 | var paramMap: java.util.HashMap[String, Object] = new java.util.HashMap[String, Object]() 31 | paramMap.put("orderno", map.getOrDefault(StreamingConstant.ORDERNO, "")) 32 | paramMap.put(StreamingConstant.INSERTTIME, DateTimeUtils.formatter.print(DateTime.now())) 33 | paramMap.put(StreamingConstant.USER_ID, map.getOrDefault(StreamingConstant.USERID, "")) 34 | 35 | for (key <- StreamingConstant.fieldMap.keySet) { 36 | val fieldRelation = StreamingConstant.fieldMap.get(key).get.split(",") 37 | if (StringUtils.isNoneEmpty(map.get(key))) { 38 | val modelname = "" + StreamingConstant.labelMap.get(key).get 39 | val rel = "" + StreamingConstant.relationShipMap.get(key).get 40 | otherStatementTemplate.append(" MERGE (" + key + ":" + modelname + "{modelname:'" + modelname + "',content:$" + fieldRelation(0) + "})") 41 | otherStatementTemplate.append(" MERGE (apply)-[:" + rel + "]->(" + key + ")") 42 | applyStatementTemplate.append(",apply." + fieldRelation(0) + "=$" + fieldRelation(0)) 43 | paramMap.put(fieldRelation(0), map.get(key)) 44 | } 45 | } 46 | 47 | val statementStr = applyStatementTemplate.append(otherStatementTemplate).toString 48 | println(statementStr) 49 | session.writeTransaction(new TransactionWork[Integer]() { 50 | override def execute(tx: Transaction): Integer = { 51 | tx.run(statementStr, paramMap) 52 | 1 53 | } 54 | }) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /neo4j/src/test/scala/TestKafka.scala: -------------------------------------------------------------------------------- 1 | import com.lakala.datacenter.constant.StreamingConstant 2 | import kafka.utils.ZkUtils 3 | import org.I0Itec.zkclient.ZkClient 4 | import org.I0Itec.zkclient.exception.ZkMarshallingError 5 | import org.I0Itec.zkclient.serialize.ZkSerializer 6 | /** 7 | * Created by Administrator on 2017/6/12 0012. 
8 | */ 9 | object TestKafka { 10 | def main(args: Array[String]): Unit = { 11 | val topic = "logCollect_cleanData" 12 | val zkConnect = "192.168.0.211:2181,192.168.0.212:2181" 13 | var zkClient: ZkClient = null 14 | try { 15 | zkClient = new ZkClient(zkConnect, 30000, 30000, new ZkSerializer { 16 | override def serialize(data: Object): Array[Byte] = { 17 | try { 18 | return data.toString().getBytes(StreamingConstant.CODE) 19 | } catch { 20 | case e: ZkMarshallingError => return null 21 | 22 | } 23 | } 24 | 25 | override def deserialize(bytes: Array[Byte]): Object = { 26 | try { 27 | return new String(bytes, StreamingConstant.CODE) 28 | } catch { 29 | case e: ZkMarshallingError => return null 30 | } 31 | } 32 | }) 33 | zkClient.deleteRecursive(ZkUtils.getTopicPath(topic)) //其实最终还是通过删除zk里面对应的路径来实现删除topic的功能 34 | println("deletion succeeded!") 35 | } 36 | catch { 37 | case e: Throwable => 38 | println("delection failed because of " + e.getMessage) 39 | // println(Utils.stackTrace(e)) 40 | } 41 | finally { 42 | if (zkClient != null) 43 | zkClient.close() 44 | } 45 | 46 | 47 | // import org.I0Itec.zkclient.ZkClient 48 | // val arrys = new Array[String](6) 49 | // arrys(0) = "--replication-factor" 50 | // arrys(1) = "1" 51 | // arrys(2) = "--partitions" 52 | // arrys(3) = "3" 53 | // arrys(4) = "--topic" 54 | // arrys(5) = "logCollect_cleanData" 55 | // val client = new ZkClient("192.168.0.211:2181,192.168.0.212:2181", 30000, 30000, ZKStringSerializer) 56 | // client.setZkSerializer(ZKStringSerializer) //一定要加上ZkSerializer 57 | // 58 | // 59 | // val opts = new TopicCommand.TopicCommandOptions(arrys) 60 | // TopicCommand.createTopic(client, opts) 61 | 62 | // import kafka.admin.AdminUtils 63 | // val client = new ZkClient("192.168.0.211:2181,192.168.0.212:2181", 30000, 30000) 64 | // 创建一个单分区单副本名为t1的topic 65 | // val props: Properties = new Properties 66 | //此处配置的是kafka的端口 67 | // props.put("metadata.broker.list", "192.168.0.211:9092,192.168.0.212:9092") 68 | //配置value的序列化类 69 | // props.put("serializer.class", "kafka.serializer.StringEncoder") 70 | //配置key的序列化类 71 | // props.put("key.serializer.class", "kafka.serializer.StringEncoder") 72 | //request.required.acks 73 | // props.put("request.required.acks", "-1") 74 | // AdminUtils.createTopic(client, "logCollect_cleanData", 3, 1, props) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /neo4j/src/test/scala/TestRedis.scala: -------------------------------------------------------------------------------- 1 | import java.util 2 | 3 | import com.alibaba.fastjson.{JSON, JSONObject} 4 | import com.lakala.datacenter.common.utils.DateTimeUtils 5 | import com.lakala.datacenter.constant.StreamingConstant 6 | import com.lakala.datacenter.utils.RedisUtils 7 | import org.joda.time.DateTime 8 | import redis.clients.jedis.JedisPubSub 9 | 10 | 11 | 12 | /** 13 | * Created by Administrator on 2017/6/29 0029. 
14 | */ 15 | object TestRedis { 16 | def main(args: Array[String]): Unit = { 17 | 18 | // 19 | val jedis = RedisUtils.jedisCluster() 20 | try { 21 | val orderno = args(0) 22 | val insertTime=Map(StreamingConstant.INSERTTIME->"2017-06-30 12:01:10").getOrElse(StreamingConstant.INSERTTIME, DateTimeUtils.formatter.print(DateTime.now())) 23 | val s= "{\""+StreamingConstant.ORDERNO+"\":\""+orderno+"\",\""+StreamingConstant.INSERT_TIME+"\":\""+insertTime+"\"}" 24 | jedis.publish("testsub12", s) 25 | println(s) 26 | println(JSON.parseObject(s).getString(StreamingConstant.INSERT_TIME)) 27 | } catch { 28 | case e: Exception => println("AAAAAAAAA"+e.getMessage) 29 | } 30 | 31 | } 32 | 33 | 34 | } 35 | -------------------------------------------------------------------------------- /neo4j/src/test/scala/org/neo4j/spark/ExplortApplyDataTest.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.spark 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | import scala.collection.mutable.ArrayBuffer 6 | 7 | /** 8 | * Created by Administrator on 2017/5/9 0009. 9 | * 10 | */ 11 | object ExplortApplyDataTest { 12 | def main(args: Array[String]): Unit = { 13 | // ExplortApplyData2.main(Array("192.168.0.33","file:///F:/output/out","BankCard,Device,Mobile",Email")) 14 | val conf = new SparkConf().setMaster("local[1]").setAppName("test") 15 | val sc = new SparkContext(conf) 16 | printSql(sc) 17 | println(System.getProperty("java.io.tmpdir")) 18 | } 19 | def printSql(sc:SparkContext)={ 20 | val map = Map("applymymobile" -> "Mobile","loanapply" -> "Mobile","emergencymobile" -> "Mobile", "device" -> "Device", "bankcard" -> "BankCard", "identification" -> "Identification", "email" -> "Email") 21 | val modelRdd = sc.parallelize(List("BankCard", "Device", "Mobile", "Email")) 22 | 23 | val broadcastVar2 = sc.broadcast(map) 24 | modelRdd.foreachPartition { models => 25 | models.foreach { model => 26 | runQueryApplyByApplyLevel1(broadcastVar2.value, model) 27 | } 28 | } 29 | } 30 | def runQueryApplyByApplyLevel1(map: Map[String, String],modelname: String):Unit = { 31 | 32 | val list = new ArrayBuffer[String]() 33 | for (k <- map.keySet) { 34 | for (k2 <- map.keySet) { 35 | if (k2.equals("applymymobile") || k2.equals("loanapply") || k2.equals("emergencymobile")) { 36 | list += s"match (n:$modelname {type:'1'})-[r1:${k}] -(p:ApplyInfo)-[r2:${k2}]-(m:${map.get(k2).get})-[r3:applymymobile]-(q:ApplyInfo) return n.content,p.orderno,m.content,q.orderno@@$k==$k2==applymymobile" 37 | list += s"match (n:$modelname {type:'1'})-[r1:${k}] -(p:ApplyInfo)-[r2:${k2}]-(m:${map.get(k2).get})-[r3:loanapply]-(q:ApplyInfo) return n.content,p.orderno,m.content,q.orderno@@$k==$k2==loanapply" 38 | list += s"match (n:$modelname {type:'1'})-[r1:${k}] -(p:ApplyInfo)-[r2:${k2}]-(m:${map.get(k2).get})-[r3:emergencymobile]-(q:ApplyInfo) return n.content,p.orderno,m.content,q.orderno@@$k==$k2=emergencymobile" 39 | } else { 40 | list += s"match (n:$modelname {type:'1'})-[r1:${k}] -(p:ApplyInfo)-[r2:${k2}]-(m:${map.get(k2).get})-[r3:${k2}]-(q:ApplyInfo) return n.content,p.orderno,m.content,q.orderno@@$k==$k2==$k2" 41 | } 42 | } 43 | } 44 | list.map { sql => 45 | val arr = sql.split("@@") 46 | println(arr(0)) 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /neo4j/src/test/scala/org/neo4j/spark/MainTest.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.spark 2 | 3 | import 
com.lakala.datacenter.main.Main 4 | 5 | /** 6 | * Created by Administrator on 2017/6/2 0002. 7 | */ 8 | object MainTest { 9 | def main(args: Array[String]): Unit = { 10 | //-i F:\tmp\applydir 11 | Main.main(args) 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /neo4j/src/test/scala/org/neo4j/spark/Neo4jContstanTest.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.spark 2 | 3 | /** 4 | * Created by Administrator on 2017/7/14 0014. 5 | */ 6 | object Neo4jContstanTest { 7 | val SERVER_BOLTURI ="bolt://192.168.0.33:7687" 8 | val RESTNEO4JURL ="http://192.168.0.33:7474/db/data" 9 | } 10 | -------------------------------------------------------------------------------- /neo4j/src/test/scala/org/neo4j/spark/Neo4jDataFrameScalaTest.scala: -------------------------------------------------------------------------------- 1 | //package org.neo4j.spark 2 | // 3 | //import java.io.File 4 | // 5 | //import com.lakala.datacenter.load.spark.{Neo4jDataFrame, Neo4jGraph} 6 | //import org.apache.commons.lang3.StringUtils.trim 7 | //import org.apache.spark.api.java.JavaSparkContext 8 | //import org.apache.spark.graphx.{Edge, Graph} 9 | //import org.apache.spark.rdd.RDD 10 | //import org.apache.spark.sql.types.{DataTypes, StructField, StructType} 11 | //import org.apache.spark.sql.{Row, SQLContext} 12 | //import org.apache.spark.{SparkConf, SparkContext} 13 | //import org.junit.Assert._ 14 | //import org.junit._ 15 | //import org.neo4j.harness.{ServerControls, TestServerBuilders} 16 | //import org.neo4j.rest.graphdb.RestAPIFacade 17 | //import org.neo4j.rest.graphdb.batch.CypherResult 18 | // 19 | // 20 | ///** 21 | // * @author lys 22 | // * @since 17.07.16 23 | // */ 24 | //class Neo4jDataFrameScalaTest { 25 | // val FIXTURE: String = "CREATE (:A)-[:REL {foo:'bar'}]->(:B)" 26 | // private var conf: SparkConf = null 27 | // private var sc: JavaSparkContext = null 28 | // private var server: ServerControls = null 29 | // private val path:String ="F:\\tmp\\neo4j\\tmp02" 30 | // private var restAPI:RestAPIFacade = null 31 | // @Before 32 | // @throws[Exception] 33 | // def setUp { 34 | //// server = TestServerBuilders.newInProcessBuilder(new File(path)).withConfig("dbms.security.auth_enabled", "false").withFixture(FIXTURE).newServer 35 | // restAPI = new RestAPIFacade(trim(Neo4jContstanTest.RESTNEO4JURL), trim("neo4j"), trim("123456")) 36 | // 37 | // conf = new SparkConf().setAppName("neoTest").setMaster("local[*]").set("spark.driver.allowMultipleContexts", "true").set("spark.neo4j.bolt.url", Neo4jContstanTest.SERVER_BOLTURI) 38 | // sc = SparkContext.getOrCreate(conf) 39 | // } 40 | // 41 | // @After def tearDown { 42 | //// server.close 43 | // sc.close 44 | // } 45 | // 46 | // @Test def mergeEdgeList { 47 | // val rows = sc.makeRDD(Seq(Row("Keanu", "Matrix"))) 48 | // val schema = StructType(Seq(StructField("name", DataTypes.StringType), StructField("title", DataTypes.StringType))) 49 | // val sqlContext = new SQLContext(sc) 50 | // val df = sqlContext.createDataFrame(rows, schema) 51 | // Neo4jDataFrame.mergeEdgeList(sc, df, ("Person", Seq("name")), ("ACTED_IN", Seq.empty), ("Movie", Seq("title"))) 52 | // val edges: RDD[Edge[Long]] = sc.makeRDD(Seq(Edge(0, 1, 42L))) 53 | // val graph = Graph.fromEdges(edges, -1) 54 | // assertEquals(2, graph.vertices.count) 55 | // assertEquals(1, graph.edges.count) 56 | // Neo4jGraph.saveGraph(sc, graph, null, "test") 57 | // 58 | //// val it: 
ResourceIterator[Long] = server.graph().execute("MATCH (:Person {name:'Keanu'})-[:ACTED_IN]->(:Movie {title:'Matrix'}) RETURN count(*) as c").columnAs("c") 59 | // val result: CypherResult = restAPI.query("MATCH (:Person {name:'Keanu'})-[:ACTED_IN]->(:Movie {title:'Matrix'}) RETURN count(*) as c" ,null) 60 | // import scala.collection.JavaConversions._ 61 | // assertEquals(1L, result.getData.flatten.toList.get(0).toString.toLong) 62 | // restAPI.close() 63 | // } 64 | //} 65 | // 66 | -------------------------------------------------------------------------------- /neo4j/src/test/scala/org/neo4j/spark/Neo4jGraphScalaTest.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.spark 2 | 3 | import com.lakala.datacenter.load.spark.{Executor, Neo4jGraph} 4 | import org.apache.spark.api.java.JavaSparkContext 5 | import org.apache.spark.graphx.{Edge, Graph} 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.{SparkConf, SparkContext} 8 | import org.junit.Assert._ 9 | import org.junit._ 10 | 11 | import scala.collection.JavaConverters._ 12 | 13 | 14 | /** 15 | * @author lys 16 | * @since 17.07.16 17 | */ 18 | class Neo4jGraphScalaTest { 19 | val FIXTURE: String = "CREATE (:A)-[:REL {foo:'bar'}]->(:B)" 20 | private var conf: SparkConf = null 21 | private var sc: JavaSparkContext = null 22 | // private var server: ServerControls = null 23 | 24 | @Before 25 | @throws[Exception] 26 | def setUp { 27 | // server = TestServerBuilders.newInProcessBuilder.withConfig("dbms.security.auth_enabled", "false").withFixture(FIXTURE).newServer 28 | conf = new SparkConf().setAppName("neoTest").setMaster("local[*]").set("spark.driver.allowMultipleContexts", "true") 29 | .set("spark.neo4j.bolt.url", Neo4jContstanTest.SERVER_BOLTURI) 30 | sc = SparkContext.getOrCreate(conf) 31 | } 32 | 33 | @After def tearDown { 34 | // server.close() 35 | sc.close 36 | } 37 | 38 | @Test def runCypherQueryWithParams { 39 | val data = List(Map("id" -> 3, "name" -> "Test3").asJava, Map("id" -> 2, "name" -> "Test2").asJava).asJava 40 | Executor.execute(sc.sc, "UNWIND {data} as row MERGE (n:Test {id:row.id}) SET n.name = row.name", Map(("data", data))) 41 | } 42 | 43 | @Test def runMatrixQuery { 44 | val graph = Neo4jGraph.loadGraph(sc.sc, "A", Seq.empty, "B") 45 | assertEquals(2, graph.vertices.count) 46 | assertEquals(1, graph.edges.count) 47 | } 48 | 49 | @Test def saveGraph { 50 | val edges: RDD[Edge[Long]] = sc.makeRDD(Seq(Edge(0, 1, 42L))) 51 | val graph = Graph.fromEdges(edges, -1) 52 | assertEquals(2, graph.vertices.count) 53 | assertEquals(1, graph.edges.count) 54 | Neo4jGraph.saveGraph(sc, graph, null, "test") 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /neo4j/src/test/scala/org/neo4j/spark/Neo4jRestSparkTest.scala: -------------------------------------------------------------------------------- 1 | package org.neo4j.spark 2 | 3 | import com.lakala.datacenter.load.spark.Neo4j 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | 6 | /** 7 | * Created by Administrator on 2017/5/11 0011. 
8 | */ 9 | object Neo4jRestSparkTest { 10 | def main(args: Array[String]): Unit = { 11 | val conf = new SparkConf().setAppName("neoTest").setMaster("local[2]") 12 | /*.set("spark.neo4j.bolt.url","jdbc:neo4j:bolt:192.168.0.33:7687")*//*.set("spark.driver.allowMultipleContexts", "true").set("spark.neo4j.bolt.url", server.boltURI.toString)*/ 13 | val sc = new SparkContext(conf) 14 | runCypherRelQueryWithPartition(sc) 15 | } 16 | 17 | def runCypherRelQueryWithPartition(sc: SparkContext) { 18 | val neo4j: Neo4j = Neo4j(sc).cypher("match (n:Mobile {type:'1'})-[r1:loanapply] -(p:ApplyInfo)-[r2:loanapply]-(m:Mobile)-[r3:loanapply]-(q:ApplyInfo) return n.content as content1 ,type(r1) as value1,p.orderno as orderno1,type(r2) as value2,m.content as content2,type(r3) as value3,q.orderno as orderno2 ").partitions(7).batch(200) 19 | val knows: Long = neo4j.loadRowRdd.count() 20 | println(knows) 21 | } 22 | } 23 | --------------------------------------------------------------------------------
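
The sources above (TrialConsumerKafka, HandleTask, MessageParam, TestCypher2, TestRedis) describe one pipeline: a Kafka or Redis message carrying an order number is parsed and written into Neo4j over bolt as MERGE statements. Below is a minimal, self-contained sketch of that write path. It is not the project's HandleTask implementation: the bolt URL, credentials, sample JSON literal and the object name SendMsgToNeo4jSketch are placeholders, and the Cypher reuses the ApplyInfo/Identification labels and the identification relationship type only as they appear elsewhere in this repository.

import java.util.{HashMap => JHashMap}

import com.alibaba.fastjson.JSON
import org.neo4j.driver.v1.{AuthTokens, GraphDatabase, Transaction, TransactionWork}

object SendMsgToNeo4jSketch {
  def main(args: Array[String]): Unit = {
    val driver = GraphDatabase.driver("bolt://localhost:7687", AuthTokens.basic("neo4j", "123456"))
    val session = driver.session()

    // A message in the shape of SendMsg(orderno, insert_time, cert_no).
    val raw = """{"orderno":"TNA201706231027","insert_time":"2017-08-02 12:00:00","cert_no":"500227198611307710"}"""
    val msg = JSON.parseObject(raw)

    val params = new JHashMap[String, Object]()
    params.put("orderno", msg.getString("orderno"))
    params.put("insertTime", msg.getString("insert_time"))
    params.put("certno", msg.getString("cert_no"))

    // MERGE the apply node and its Identification node, then link them,
    // in the same style as the Cypher built by TestCypher2.
    val cypher =
      "MERGE (apply:ApplyInfo {orderno:$orderno}) " +
        "SET apply.insertTime = $insertTime " +
        "MERGE (id:Identification {modelname:'Identification', content:$certno}) " +
        "MERGE (apply)-[:identification]->(id)"

    session.writeTransaction(new TransactionWork[Integer] {
      override def execute(tx: Transaction): Integer = {
        tx.run(cypher, params)
        Integer.valueOf(1)
      }
    })

    session.close()
    driver.close()
  }
}

Because every statement is a MERGE, replaying the same message is idempotent, which is why the streaming consumers above can safely reprocess records after a restart.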
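MsgOffsetStreamListener above saves each partition's fromOffset under ZKGroupTopicDirs(group, topic).consumerOffsetDir after every non-empty batch. The following is a small sketch for reading those offsets back when checking what the listener wrote; the ZooKeeper address, group and topic are the placeholder values used in the test files, the "UTF-8" literal stands in for StreamingConstant.CODE, and ZkOffsetCheckSketch is an illustrative name.

import kafka.utils.ZKGroupTopicDirs
import org.I0Itec.zkclient.ZkClient
import org.I0Itec.zkclient.serialize.ZkSerializer

import scala.collection.JavaConversions._

object ZkOffsetCheckSketch {
  def main(args: Array[String]): Unit = {
    val zkServers = "192.168.0.208:2181,192.168.0.211:2181,192.168.0.212:2181"
    val group = "testcheatgraph"
    val topic = "logCollect_cleanData"

    // Same string-based serializer style as MsgOffsetStreamListener/TestKafka.
    val zkClient = new ZkClient(zkServers, 30000, 30000, new ZkSerializer {
      override def serialize(data: Object): Array[Byte] = data.toString.getBytes("UTF-8")
      override def deserialize(bytes: Array[Byte]): Object = new String(bytes, "UTF-8")
    })

    val offsetDir = new ZKGroupTopicDirs(group, topic).consumerOffsetDir
    if (zkClient.exists(offsetDir)) {
      // One child znode per partition, written by MsgOffsetStreamListener.onBatchCompleted.
      for (partition <- zkClient.getChildren(offsetDir)) {
        val offset = zkClient.readData[String](s"$offsetDir/$partition", true)
        println(s"partition $partition -> offset $offset")
      }
    }
    zkClient.close()
  }
}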
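CollectionUtil.scala (shown earlier in the test sources) adds a local reduceByKey to any Traversable[(K, V)] via an implicit class, giving Spark-like aggregation semantics without an RDD. A short usage sketch, assuming it lives in the same default package as the other test files; the object name and sample data are illustrative.

import CollectionUtil._

object CollectionUtilUsageSketch {
  def main(args: Array[String]): Unit = {
    val pairs = Seq(("a", 1), ("b", 2), ("a", 3), ("b", 4), ("c", 5))

    // Plain, non-Spark reduceByKey provided by the implicit CollectionHelper.
    val summed = pairs.reduceByKey(_ + _)
    println(summed.toMap) // Map(a -> 4, b -> 6, c -> 5), order may vary

    // Also return the (key, value) pairs that were folded away during the reduce.
    val (kept, reduced) = pairs.reduceByKeyWithReduced((a, b) => math.max(a, b))
    println(kept.toMap)     // per-key maximum
    println(reduced.toList) // the losing pairs, e.g. (a,1) and (b,2)
  }
}

reduceByKeyWithReduced is convenient when the discarded values still need to be inspected after the aggregation.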