├── apply
├── bin
│ ├── batchRun.sh
│ ├── deploy.sh
│ ├── lzo.sh
│ └── start.sh
├── dependency-reduced-pom.xml
├── pom.xml
└── src
│ ├── main
│ ├── java
│ │ └── com
│ │ │ └── lakala
│ │ │ └── audit
│ │ │ └── rabbitmqMsg
│ │ │ ├── consumer
│ │ │ └── Receiver.java
│ │ │ ├── entityV
│ │ │ └── RequestMessageV.java
│ │ │ └── produce
│ │ │ └── Sender.java
│ ├── resources
│ │ ├── dev
│ │ │ └── config.properties
│ │ ├── extract_data_hql
│ │ ├── log4j.xml
│ │ ├── product
│ │ │ ├── config.properties
│ │ │ ├── hdfs-site.xml
│ │ │ └── hive-site.xml
│ │ └── test
│ │ │ └── config.properties
│ └── scala
│ │ ├── ApplyPageRank.scala
│ │ ├── CastToInt.scala
│ │ ├── ExploreLPAData.scala
│ │ ├── GraphOneDegreeApplyPerDiem.scala
│ │ ├── GraphxBSP.scala
│ │ ├── JudgeIsMobile.scala
│ │ ├── LoadCallhistoryData.scala
│ │ ├── LoadHiveData.scala
│ │ ├── LoadHiveData2.scala
│ │ ├── RunGraphx.scala
│ │ ├── RunLoadApplyGraphx.scala
│ │ ├── RunLoadApplyGraphx2.scala
│ │ ├── RunLoadApplyGraphx3.scala
│ │ ├── TestSql.scala
│ │ ├── com
│ │ └── lakala
│ │ │ └── datacenter
│ │ │ ├── abstractions
│ │ │ └── PregelProgram.scala
│ │ │ ├── apply
│ │ │ ├── buildGraph
│ │ │ │ ├── BuildGraphData.scala
│ │ │ │ ├── GraphOperators.scala
│ │ │ │ └── NewEdgeArr.scala
│ │ │ └── model
│ │ │ │ ├── ApplyInfo.scala
│ │ │ │ ├── BaseEntity.scala
│ │ │ │ ├── CallHistoryEntity.scala
│ │ │ │ ├── EdgeEntity.scala
│ │ │ │ └── NDegreeEntity.scala
│ │ │ ├── faund
│ │ │ ├── ApplyRandomForest.scala
│ │ │ ├── DatasetTitanic.scala
│ │ │ ├── ScalaRandomForest.scala
│ │ │ ├── SparkConfUtil.scala
│ │ │ └── Titanic.scala
│ │ │ ├── grograms
│ │ │ └── ApplyDegreeCentralityProgram.scala
│ │ │ ├── grogress
│ │ │ └── ExportNDegreeData.scala
│ │ │ ├── jaccard
│ │ │ ├── Jaccard.scala
│ │ │ └── PowerIterationClustering.scala
│ │ │ ├── louvain
│ │ │ ├── HDFSLouvainRunner.scala
│ │ │ ├── LouvainCore.scala
│ │ │ ├── LouvainHarness.scala
│ │ │ ├── VertexData.scala
│ │ │ └── VertexState.scala
│ │ │ ├── main
│ │ │ ├── Analytics.scala
│ │ │ ├── CallHistoryPageRank.scala
│ │ │ ├── Driver.scala
│ │ │ ├── LPAAlgorithm.scala
│ │ │ ├── LPCoarseAlgorithm.scala
│ │ │ ├── LiveCommunityDetection.scala
│ │ │ ├── LouvainDGA.scala
│ │ │ ├── PICCallAlgorithm.scala
│ │ │ ├── PSCANAlgorithm.scala
│ │ │ └── SemiSupervisedLabelPropagation.scala
│ │ │ ├── talk
│ │ │ ├── builtin
│ │ │ │ └── ShortestPathSample.scala
│ │ │ └── types
│ │ │ │ ├── City.scala
│ │ │ │ ├── Person.scala
│ │ │ │ └── VertexAttribute.scala
│ │ │ └── utils
│ │ │ ├── SparkCommon.scala
│ │ │ └── UtilsToos.scala
│ │ └── edu
│ │ └── gatech
│ │ └── cse8803
│ │ ├── clustering
│ │ └── PowerIterationClustering.scala
│ │ ├── graphconstruct
│ │ └── GraphLoader.scala
│ │ ├── ioutils
│ │ └── CSVUtils.scala
│ │ ├── jaccard
│ │ └── Jaccard.scala
│ │ ├── main
│ │ └── Main.scala
│ │ ├── model
│ │ └── models.scala
│ │ └── randomwalk
│ │ └── randomwalk.scala
│ └── test
│ └── scala
│ ├── CollectionUtil.scala
│ ├── CreateApplyData.scala
│ ├── CreateApplyData2.scala
│ ├── Driver.scala
│ ├── EdgeTuplesTest.scala
│ ├── GraphNdegUtil.scala
│ ├── GraphXExample.scala
│ ├── GraphxBSP.scala
│ ├── GraphxBSP2.scala
│ ├── GraphxBSP3.scala
│ ├── Median.scala
│ ├── NDegreeResult.scala
│ ├── NNTest.scala
│ ├── NumOnce.scala
│ ├── ParsesTest.scala
│ ├── TestCSV.scala
│ ├── TestRunGraphx.scala
│ ├── TrustRank.scala
│ ├── UDF_test.scala
│ ├── apply
│ ├── NDegreeCallMiddlePath.scala
│ └── NDegreeMiddlePathResult.scala
│ ├── entity
│ ├── CallEntity.scala
│ ├── CallVertex.scala
│ └── TwoDegree.scala
│ └── utils
│ ├── CollectionUtil.scala
│ ├── GraphNdegUtil.scala
│ └── GraphNdegUtil2.scala
├── common
├── pom.xml
└── src
│ ├── main
│ ├── resources
│ │ └── css
│ │ │ └── style.css
│ └── scala
│ │ └── com
│ │ └── lakala
│ │ └── datacenter
│ │ └── common
│ │ ├── graphstream
│ │ └── SimpleGraphViewer.scala
│ │ └── utils
│ │ └── DateTimeUtils.scala
│ └── test
│ ├── data
│ ├── cities_edges.txt
│ ├── cities_vertices.txt
│ ├── likeness_edges.txt
│ ├── maxvalue_edges.txt
│ ├── maxvalue_vertices.txt
│ ├── papers_edges.txt
│ ├── people_vertices.txt
│ ├── relationships_edges.txt
│ ├── us_cities_edges.txt
│ ├── us_cities_vertices.txt
│ ├── users_dense_edges.txt
│ ├── users_disjoint_edges.txt
│ ├── users_edges.txt
│ └── users_vertices.txt
│ └── scala
│ └── TestGraphViewer.scala
├── core
├── pom.xml
└── src
│ ├── main
│ ├── java
│ │ └── com
│ │ │ └── lakala
│ │ │ └── datacenter
│ │ │ └── core
│ │ │ ├── config
│ │ │ └── ConfigurationLoader.java
│ │ │ ├── hdfs
│ │ │ └── FileUtil.java
│ │ │ ├── messaging
│ │ │ ├── Sender.java
│ │ │ └── Worker.java
│ │ │ ├── models
│ │ │ ├── PartitionDescription.java
│ │ │ ├── ProcessorMessage.java
│ │ │ └── ProcessorMode.java
│ │ │ └── processor
│ │ │ └── GraphProcessor.java
│ └── scala
│ │ └── com
│ │ └── lakala
│ │ └── datacenter
│ │ └── core
│ │ ├── abstractions
│ │ └── PregelProgram.scala
│ │ ├── algorithms
│ │ └── Algorithms.scala
│ │ ├── grograms
│ │ ├── BetweennessCentralityProgram.scala
│ │ ├── EdgeBetweennessProgram.scala
│ │ ├── MaximumValueProgram.scala
│ │ └── ShortestPathProgram.scala
│ │ └── utils
│ │ └── UtilsToos.scala
│ └── test
│ ├── java
│ └── com
│ │ └── lakala
│ │ └── datacenter
│ │ └── core
│ │ ├── hdfs
│ │ └── FileUtilTest.java
│ │ ├── messaging
│ │ └── SenderTest.java
│ │ └── processor
│ │ └── GraphProcessorTest.java
│ └── scala
│ └── com
│ └── lakala
│ └── datacenter
│ └── core
│ └── grograms
│ ├── GraphProcessorTest.scala
│ ├── ShortestPathProgramTests.scala
│ └── ShortestPathTests.scala
├── neo4j
├── bin
│ └── start2.sh
├── pom.xml
└── src
│ ├── main
│ ├── java
│ │ └── com
│ │ │ └── lakala
│ │ │ └── datacenter
│ │ │ └── enums
│ │ │ ├── DataAttributeType.java
│ │ │ ├── GraphEnum.java
│ │ │ ├── Labels.java
│ │ │ └── RelationshipTypes.java
│ ├── resources
│ │ ├── css
│ │ │ └── style.css
│ │ ├── dev
│ │ │ ├── config.properties
│ │ │ └── hive-site.xml
│ │ ├── log4j.xml
│ │ ├── product
│ │ │ └── config.properties
│ │ └── test
│ │ │ └── config.properties
│ └── scala
│ │ └── com
│ │ └── lakala
│ │ └── datacenter
│ │ ├── abstractions
│ │ └── DataGenerator.scala
│ │ ├── constant
│ │ └── StreamingConstant.scala
│ │ ├── cypher
│ │ └── NeoData.scala
│ │ ├── grogram
│ │ └── Neo4jDataGenerator.scala
│ │ ├── load
│ │ └── spark
│ │ │ ├── ClusterGraphDatabase.scala
│ │ │ ├── ExplortApplyData.scala
│ │ │ ├── ExplortApplyData2.scala
│ │ │ ├── LoadHiveData.scala
│ │ │ ├── Neo4j.scala
│ │ │ ├── Neo4jConfig.scala
│ │ │ ├── Neo4jDataFrame.scala
│ │ │ ├── Neo4jGraph.scala
│ │ │ ├── Neo4jJavaIntegration.scala
│ │ │ ├── Neo4jPartition.scala
│ │ │ ├── Neo4jRowRDD.scala
│ │ │ └── Neo4jTupleRDD.scala
│ │ ├── main
│ │ ├── HandleTask.scala
│ │ ├── Main.scala
│ │ ├── MessageParam.scala
│ │ └── TrialConsumerKafka.scala
│ │ ├── realtimeBuildGraphx
│ │ ├── MsgOffsetStreamListener.scala
│ │ ├── SendMsg.scala
│ │ └── SparkStreamingOnKafkaDirect.scala
│ │ └── utils
│ │ ├── ArgsCommon.scala
│ │ ├── RedisUtils.scala
│ │ └── UtilsTools.scala
│ └── test
│ ├── java
│ ├── ApplyInfoConsumer.java
│ ├── ConsumerKafka.java
│ ├── DataAttributeType.java
│ ├── JavaKafkaSimpleConsumerAPI.java
│ ├── JavaKafkaSimpleConsumerAPITest.java
│ ├── KafkaBrokerInfo.java
│ ├── KafkaConsumer.java
│ ├── KafkaProducer.java
│ ├── KafkaProducer2.java
│ ├── KafkaTopicPartitionInfo.java
│ ├── LogSession.java
│ ├── OperatorKafka.java
│ ├── SendKafkaMsgTest.java
│ └── TestCypher.java
│ └── scala
│ ├── BroadcastAccumulatorStreaming.scala
│ ├── ClientRedisTest.scala
│ ├── CollectionUtil.scala
│ ├── ConsumerGroupExample.scala
│ ├── GraphNdegUtil.scala
│ ├── Main.scala
│ ├── StreamingFromKafka.scala
│ ├── TestApiNeo4j.scala
│ ├── TestCypher.scala
│ ├── TestKafka.scala
│ ├── TestRedis.scala
│ └── org
│ └── neo4j
│ └── spark
│ ├── ExplortApplyDataTest.scala
│ ├── MainTest.scala
│ ├── Neo4jContstanTest.scala
│ ├── Neo4jDataFrameScalaTest.scala
│ ├── Neo4jGraphScalaTest.scala
│ ├── Neo4jRestSparkTest.scala
│ └── Neo4jSparkTest.scala
└── pom.xml
/apply/bin/batchRun.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [ $# -lt 1 ] ; then
4 | echo "USAGE: $0 begin_date [end_date]"
5 | exit 1;
6 | fi
7 |
8 | beginDate=$1
9 | yesterday=$(date --date="1 days ago" '+%Y-%m-%d')
10 | endDate=$yesterday
11 | if [ $# -gt 1 ] ; then
12 | endDate=$2
13 | fi
14 |
15 | beginTime=`date -d $beginDate '+%s'`
16 | yesterdayTime=`date -d $yesterday '+%s'`
17 | endTime=`date -d $endDate '+%s'`
18 | if [ $beginTime -gt $yesterdayTime ] ; then
19 | echo "begin_date can only be yesterday[$endDate] at the latest"
20 | exit 1;
21 | fi
22 | if [ $endTime -gt $yesterdayTime ] ; then
23 | echo "end_date can only be yesterday[$yesterday] at the latest"
24 | exit 1;
25 | fi
26 | if [ $beginTime -gt $endTime ] ; then
27 | echo "begin_date can only be end_date[$endDate] at the latest"
28 | exit 1;
29 | fi
30 |
31 | #echo $beginDate
32 | #echo $endDate
33 | currentDate=$beginDate
34 | currentTime=$beginTime
35 |
36 | cd "`dirname "$0"`"
37 |
38 | while [ $currentTime -le $endTime ]
39 | do
40 | #echo $currentDate
41 | sh start.sh $currentDate
42 | currentDate=`date -d "$currentDate +1 day" '+%Y-%m-%d'`
43 | currentTime=`date -d $currentDate '+%s'`
44 | done
45 |
--------------------------------------------------------------------------------
/apply/bin/deploy.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | if [ $# != 1 ] ; then
4 | echo "USAGE: $0 ENV(dev|test|product)"
5 | exit 1;
6 | fi
7 |
8 | cd "$(cd "`dirname "$0"`"/../..; pwd)"
9 | mvn -U clean package dependency:copy-dependencies -DskipTests -P$1 -Papply
10 |
--------------------------------------------------------------------------------
/apply/bin/lzo.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ####################################
3 | ## lzo hadoop textfile
4 | ## usage:sh lzo.sh source_dir
5 | ## example:sh lzo.sh /user/flume
6 | ####################################
7 | startTime=`date +%s`
8 | echo "the script begin at $(date +%H:%M:%S)"
9 | source_dir=$1
10 | cd /tmp
11 | hadoop fs -get ${source_dir} /tmp
12 | filepaths=()
13 | function getfilePath(){
14 | for file in ` ls $1 `
15 | do
16 | if [ -d $1"/"$file ]
17 | then
18 | getfilePath $1"/"$file
19 | else
20 | filepaths[${#filepaths[@]}]=$1"/"$file
21 | fi
22 | done
23 | }
24 | path=/tmp/${source_dir##*/}
25 | getfilePath $path
26 | #echo ${filepaths[*]}
27 | for filepath in ${filepaths[@]}
28 | do
29 | lzop ${filepath}
30 | rm -rf ${filepath}
31 | done
32 | hadoop fs -mv ${source_dir} ${source_dir}.bak
33 | hadoop fs -put $path ${source_dir%/*}
34 | for filepath in ${filepaths[@]}
35 | do
36 | hadoop jar /usr/hdp/2.2.6.0-2800/hadoop/lib/hadoop-lzo-0.6.0.2.2.6.0-2800.jar com.hadoop.compression.lzo.LzoIndexer ${source_dir%/*}/${filepath#*/tmp/}.lzo
37 | #2>&1 > /data/hdfs_logs/${source_dir##*/}.log
38 | done
39 | rm -rf $path
40 | endTime=`date +%s`
41 | echo "the script end at $(date +%H:%M:%S)"
42 | echo "total second is" $(($endTime-$startTime))
--------------------------------------------------------------------------------
/apply/bin/start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ## usage: sh bin/start.sh -i /logs/device/* -d 2016-01-11
3 |
4 | SPARK_HOME=/usr/hdp/current/spark-client
5 | HIVE_HOME=/usr/hdp/current/hive-client
6 | PROJECT_HOME="$(cd "`dirname "$0"`"/..; pwd)"
7 | HDP_VERSION=2.4.0.0-169
8 | APP_CACHE_DIR=/tmp/device
9 |
10 | stdate=${1:-`date -d '1 days ago' +"%Y-%m-%d"`}
11 | #inputdir=/logs/device/*
12 | #inputfile=/logs/device/*/2016-01-{1[1-9],2[0-1]}
13 | while getopts "d:i:" opt ; do
14 | case $opt in
15 | d)stdate=$OPTARG ;;
16 | i)inputdir=$OPTARG ;;
17 | ?)echo "==> please input arg: stdate(d), inputdir(i)" && exit 1 ;;
18 | esac
19 | done
20 |
21 | #echo "==> ready for geoip...."
22 | #hadoop fs -mkdir -p $APP_CACHE_DIR/geoip
23 | #hadoop fs -test -e $APP_CACHE_DIR/geoip/GeoLite2-City.mmdb
24 | #if [ $? -ne 0 ]; then
25 | # echo "GeoLite2-City.mmdb not exists!"
26 | # hadoop fs -put $PROJECT_HOME/../tcloud-log-analysis/src/main/bundleApp/coord-common/geoip/GeoLite2-City.mmdb $APP_CACHE_DIR/geoip/
27 | #fi
28 |
29 | ## https://issues.apache.org/jira/browse/ZEPPELIN-93
30 | ## https://github.com/caskdata/cdap/pull/4106
31 | spark-submit \
32 | --class RunLoadApplyGraphx3 \
33 | --master yarn \
34 | --deploy-mode cluster \
35 | --queue dc \
36 | --driver-memory 2G \
37 | --executor-memory 8G \
38 | --num-executors 4 \
39 | --executor-cores 3 \
40 | --conf "spark.rpc.askTimeout=300s" \
41 | --driver-java-options "-XX:-UseGCOverheadLimit -Xms2G -Xmx2G -XX:MaxPermSize=2G -Dhdp.version=$HDP_VERSION -Dspark.yarn.am.extraJavaOptions=-Dhdp.version=$HDP_VERSION" \
42 | --verbose \
43 | --files $PROJECT_HOME/target/classes/hive-site.xml \
44 | --driver-class-path $PROJECT_HOME/target/dependency/mysql-connector-java-5.1.36.jar \
45 | --jars $PROJECT_HOME/target/dependency/mysql-connector-java-5.1.36.jar,$SPARK_HOME/lib/datanucleus-api-jdo-3.2.6.jar,$SPARK_HOME/lib/datanucleus-core-3.2.10.jar,$SPARK_HOME/lib/datanucleus-rdbms-3.2.9.jar \
46 | $PROJECT_HOME/target/data-analysis-sdk.jar \
47 | $stdate
48 |
49 | ## --packages com.databricks:spark-csv_2.10:1.3.0 \
50 | ## 2>&1 > output.txt
--------------------------------------------------------------------------------
/apply/dependency-reduced-pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <parent>
    <artifactId>graphx-analysis</artifactId>
    <groupId>com.lakala.datacenter</groupId>
    <version>1.0.0-SNAPSHOT</version>
  </parent>
  <modelVersion>4.0.0</modelVersion>
  <artifactId>graphx-analysis-apply</artifactId>
  <name>graphx-analysis-apply</name>
  <url>http://maven.apache.org</url>
  <build>
    <finalName>graphx-analysis-apply</finalName>
  </build>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
</project>
--------------------------------------------------------------------------------
/apply/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <parent>
    <groupId>com.lakala.datacenter</groupId>
    <artifactId>graphx-analysis</artifactId>
    <version>1.0.0-SNAPSHOT</version>
  </parent>
  <modelVersion>4.0.0</modelVersion>

  <artifactId>graphx-analysis-apply</artifactId>
  <packaging>jar</packaging>

  <name>graphx-analysis-apply</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <dependency>
      <groupId>com.lakala.datacenter</groupId>
      <artifactId>graphx-analysis-core</artifactId>
      <version>${project.version}</version>
    </dependency>
  </dependencies>

  <build>
    <finalName>graphx-analysis-apply</finalName>
  </build>
</project>
--------------------------------------------------------------------------------
/apply/src/main/java/com/lakala/audit/rabbitmqMsg/consumer/Receiver.java:
--------------------------------------------------------------------------------
1 | package com.lakala.audit.rabbitmqMsg.consumer;
2 |
3 | import com.google.gson.Gson;
4 | import com.lakala.audit.rabbitmqMsg.entityV.RequestMessageV;
5 | import com.rabbitmq.client.Channel;
6 | import com.rabbitmq.client.Connection;
7 | import com.rabbitmq.client.ConnectionFactory;
8 | import com.rabbitmq.client.QueueingConsumer;
9 |
10 | import java.io.IOException;
11 | import java.util.concurrent.TimeoutException;
12 |
13 | /**
14 | * Created by Administrator on 2017/8/1 0001.
15 | */
16 | public class Receiver {
17 | private final static String AUDIT_QUEUE_NAME = "audit_mq";
18 | // private final static String USERNAME = "lys";
19 | // private final static String PASSWORD = "123456";
20 | // private final static String VIRTUALHOST = "/";
21 | // private final static String HOST = "localhost";
22 |
23 | private final static String HOST = "192.168.0.182";
24 | private final static String USERNAME = "antifraud";
25 | private final static String PASSWORD = "antifraud";
26 | private final static String VIRTUALHOST = "antifraud";
27 | private final static int PORTNUMBER = 5672;
28 |
29 | public static void main(String[] args) {
30 | try {
31 | work();
32 | } catch (IOException e) {
33 | e.printStackTrace();
34 | } catch (InterruptedException e) {
35 | e.printStackTrace();
36 | } catch (TimeoutException e) {
37 | e.printStackTrace();
38 | }
39 |
40 | }
41 |
42 | public static void work() throws java.io.IOException,
43 | java.lang.InterruptedException, TimeoutException {
44 | ConnectionFactory factory = new ConnectionFactory();
45 | // factory.setHost("192.168.0.182");
46 | factory.setHost(HOST);
47 | factory.setPort(PORTNUMBER);
48 | factory.setUsername(USERNAME);
49 | factory.setPassword(PASSWORD);
50 | factory.setVirtualHost(VIRTUALHOST);
51 | Connection connection = factory.newConnection();
52 | Channel channel = connection.createChannel();
53 |
54 | channel.queueDeclare(AUDIT_QUEUE_NAME, false, false, false, null);
55 | channel.basicQos(20);
56 |
57 | QueueingConsumer consumer = new QueueingConsumer(channel);
58 | channel.basicConsume(AUDIT_QUEUE_NAME, false, consumer);
59 |
60 | System.out.println(" [*] Waiting for messages. To exit press CTRL+C");
61 |
62 | while (true) {
63 | QueueingConsumer.Delivery delivery = consumer.nextDelivery();
64 | String message = new String(delivery.getBody());
65 |
66 | System.out.println(" [x] Received '" + message + "'");
67 |
68 | Gson gson = new Gson();
69 | RequestMessageV requestMessageV = gson.fromJson(message, RequestMessageV.class);
70 | //TODO parse the message and store it in Redis
71 |
72 | System.out.println(requestMessageV.getOrderno());
73 | System.out.println(" [x] Done '" + message + "'");
74 | channel.basicAck(delivery.getEnvelope().getDeliveryTag(), false);
75 | }
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/apply/src/main/java/com/lakala/audit/rabbitmqMsg/entityV/RequestMessageV.java:
--------------------------------------------------------------------------------
1 | package com.lakala.audit.rabbitmqMsg.entityV;
2 |
3 | /**
4 | * Created by Administrator on 2017/8/1 0001.
5 | */
6 | public class RequestMessageV {
7 | public RequestMessageV() {
8 | }
9 |
10 | public RequestMessageV(String orderno, String statue) {
11 | this.orderno = orderno;
12 | this.statue = statue;
13 | }
14 |
15 | String orderno;
16 | String statue;
17 |
18 | public String getOrderno() {
19 | return orderno;
20 | }
21 |
22 | public void setOrderno(String orderno) {
23 | this.orderno = orderno;
24 | }
25 |
26 | public String getStatue() {
27 | return statue;
28 | }
29 |
30 | public void setStatue(String statue) {
31 | this.statue = statue;
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/apply/src/main/java/com/lakala/audit/rabbitmqMsg/produce/Sender.java:
--------------------------------------------------------------------------------
1 | package com.lakala.audit.rabbitmqMsg.produce;
2 |
3 | import com.google.gson.Gson;
4 | import com.lakala.audit.rabbitmqMsg.entityV.RequestMessageV;
5 | import com.rabbitmq.client.Channel;
6 | import com.rabbitmq.client.Connection;
7 | import com.rabbitmq.client.ConnectionFactory;
8 |
9 | import java.io.IOException;
10 | import java.util.concurrent.TimeoutException;
11 |
12 |
13 | /**
14 | * Created by Administrator on 2017/8/1 0001.
15 | */
16 | public class Sender {
17 | private final static String AUDIT_QUEUE_NAME = "audit_mq";
18 | // private final static String USERNAME = "lys";
19 | // private final static String PASSWORD = "123456";
20 | // private final static String VIRTUALHOST = "/";
21 | // private final static String HOST = "localhost";
22 |
23 | private final static String HOST = "192.168.0.182";
24 | private final static String USERNAME = "antifraud";
25 | private final static String PASSWORD = "antifraud";
26 | private final static String VIRTUALHOST = "antifraud";
27 | private final static int PORTNUMBER = 5672;
28 |
29 | public static void main(String[] args) {
30 | Gson gson = new Gson();
31 | RequestMessageV requestMessageV = new RequestMessageV("XNA20170505131153011496369566130", "Q");
32 | String message = gson.toJson(requestMessageV);
33 | System.out.println(message);
34 | //message={"orderno":"XNA20170505131153011496369566130","statue":"Q"}
35 | try {
36 | send(message);
37 | } catch (IOException e) {
38 | e.printStackTrace();
39 | } catch (InterruptedException e) {
40 | e.printStackTrace();
41 | } catch (TimeoutException e) {
42 | e.printStackTrace();
43 | }
44 |
45 | }
46 |
47 | public static void send(String message) throws java.io.IOException,
48 | java.lang.InterruptedException, TimeoutException {
49 |
50 | ConnectionFactory factory = new ConnectionFactory();
51 | factory.setHost(HOST);
52 | factory.setPort(PORTNUMBER);
53 | factory.setUsername(USERNAME);
54 | factory.setPassword(PASSWORD);
55 | factory.setVirtualHost(VIRTUALHOST);
56 | Connection connection = factory.newConnection();
57 | Channel channel = connection.createChannel();
58 | channel.queueDeclare(AUDIT_QUEUE_NAME, false, false, false, null);
59 | channel.basicPublish("", AUDIT_QUEUE_NAME, null, message.getBytes("UTF-8"));
60 | System.out.println("已经发送消息....." + message);
61 | channel.close();
62 | connection.close();
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/apply/src/main/resources/dev/config.properties:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luhm2017/graphx-analysis/9e3a96ec0df5da8208655face4ff0b0e6b3ed497/apply/src/main/resources/dev/config.properties
--------------------------------------------------------------------------------
/apply/src/main/resources/extract_data_hql:
--------------------------------------------------------------------------------
1 | use lkl_card_score;
2 | set mapreduce.job.queuename=szoffline;
3 | -- extract the overdue data for each person
4 | create table fraud_mobile_performance AS SELECT if(a.label=0,1,0) AS good,if(a.label=1,1,0) AS bad,if(a.label=2,1,0) AS unknown,a.apply_time,b.history_due_day,b.current_due_day,b.mobile from fqz_order_performance_data_new a inner join creditloan.s_c_apply_user b ON a.cert_no= b.cert_no AND a.year='2017' AND a.month ='09' AND a.day='19' AND b.year='2017' AND b.month='09' AND b.day='19';
5 | -- community blacklist
6 | create table fraud_community_mobile_black as select a.community_mobile,a.mobile,if(b.mobile is not null,0,1) as lable from louvain_result2 a left outer join creditloan.s_c_loan_blacklist b on a.community_mobile = b.mobile and b.year='2017' and b.month='09' and b.day='19';
7 | -- overdue status of each person per community id, grouped by community id and blacklist flag
8 | create table fraud_community_mobile_black_performance as SELECT a.community_mobile,a.lable,sum(if(b.good>=0,b.good,0)) goods,sum(if(b.bad>=0,b.bad,0)) bads,sum(if(b.unknown>=0,b.unknown,0)) unknowns,sum(if(b.history_due_day>=0,b.history_due_day,0)) history_due_days,sum(if(b.current_due_day>=0,b.current_due_day,0)) current_due_days from fraud_community_mobile_black AS a LEFT JOIN fraud_mobile_performance AS b on a.community_mobile=b.mobile GROUP BY a.community_mobile,a.lable;
9 |
--------------------------------------------------------------------------------
/apply/src/main/resources/log4j.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/apply/src/main/resources/product/config.properties:
--------------------------------------------------------------------------------
1 | hdfs_root_path=hdfs://ns1/
--------------------------------------------------------------------------------
/apply/src/main/resources/product/hdfs-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
  <property>
    <name>dfs.nameservices</name>
    <value>ns1</value>
  </property>
  <property>
    <name>dfs.client.failover.proxy.provider.ns1</name>
    <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
  </property>
  <property>
    <name>dfs.ha.automatic-failover.enabled.ns1</name>
    <value>true</value>
  </property>
  <property>
    <name>ha.zookeeper.quorum</name>
    <value>datanode4.lakala.com:2181,datanode5.lakala.com:2181,datanode6.lakala.com:2181,datanode7.lakala.com:2181,datanode8.lakala.com:2181</value>
  </property>
  <property>
    <name>dfs.ha.namenodes.ns1</name>
    <value>namenode114,namenode148</value>
  </property>
  <property>
    <name>dfs.namenode.rpc-address.ns1.namenode114</name>
    <value>namenode.lakala.com:8020</value>
  </property>
  <property>
    <name>dfs.namenode.servicerpc-address.ns1.namenode114</name>
    <value>namenode.lakala.com:8022</value>
  </property>
  <property>
    <name>dfs.namenode.http-address.ns1.namenode114</name>
    <value>namenode.lakala.com:50070</value>
  </property>
  <property>
    <name>dfs.namenode.https-address.ns1.namenode114</name>
    <value>namenode.lakala.com:50470</value>
  </property>
  <property>
    <name>dfs.namenode.rpc-address.ns1.namenode148</name>
    <value>namenodestandby.lakala.com:8020</value>
  </property>
  <property>
    <name>dfs.namenode.servicerpc-address.ns1.namenode148</name>
    <value>namenodestandby.lakala.com:8022</value>
  </property>
  <property>
    <name>dfs.namenode.http-address.ns1.namenode148</name>
    <value>namenodestandby.lakala.com:50070</value>
  </property>
  <property>
    <name>dfs.namenode.https-address.ns1.namenode148</name>
    <value>namenodestandby.lakala.com:50470</value>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>3</value>
  </property>
  <property>
    <name>dfs.blocksize</name>
    <value>134217728</value>
  </property>
  <property>
    <name>dfs.client.use.datanode.hostname</name>
    <value>false</value>
  </property>
  <property>
    <name>fs.permissions.umask-mode</name>
    <value>022</value>
  </property>
  <property>
    <name>dfs.namenode.acls.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>dfs.client.use.legacy.blockreader</name>
    <value>false</value>
  </property>
  <property>
    <name>dfs.client.read.shortcircuit</name>
    <value>false</value>
  </property>
  <property>
    <name>dfs.domain.socket.path</name>
    <value>/var/run/hdfs-sockets/dn</value>
  </property>
  <property>
    <name>dfs.client.read.shortcircuit.skip.checksum</name>
    <value>false</value>
  </property>
  <property>
    <name>dfs.client.domain.socket.data.traffic</name>
    <value>false</value>
  </property>
  <property>
    <name>dfs.datanode.hdfs-blocks-metadata.enabled</name>
    <value>true</value>
  </property>
</configuration>
--------------------------------------------------------------------------------
/apply/src/main/resources/test/config.properties:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luhm2017/graphx-analysis/9e3a96ec0df5da8208655face4ff0b0e6b3ed497/apply/src/main/resources/test/config.properties
--------------------------------------------------------------------------------
/apply/src/main/scala/CastToInt.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.sql.api.java.UDF1
2 |
3 | import scala.util.matching.Regex
4 |
5 | /**
6 | * Created by linyanshi on 2017/9/14 0014.
7 | */
8 | class CastToInt extends UDF1[String, Long] {
9 | val pattern = new Regex("[0-9]{1,}")
10 |
11 | override def call(value: String): Long = {
12 | if (pattern.pattern.matcher(value).matches() && value.toLong < 86400l) value.trim.toLong
13 | else if (pattern.pattern.matcher(value).matches() && value.toLong >= 86400l) 86400l
14 | else 0L
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/apply/src/main/scala/ExploreLPAData.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.graphx.Edge
2 | import org.apache.spark.{SparkConf, SparkContext}
3 |
4 | /**
5 | * Created by linyanshi on 2017/9/14 0014.
6 | */
7 | object ExploreLPAData {
8 | def main(args: Array[String]): Unit = {
9 | val conf = new SparkConf().setAppName("ExploreLPAData").set("spark.eventLog.enabled", "true")
10 | val sc = new SparkContext(conf)
11 | val rdd = sc.textFile(args(0), 100).mapPartitions(lines => lines.map { line =>
12 | val arr = line.split(",")
13 | Edge(arr(1).toLong,arr(2).toLong)
14 | })
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/apply/src/main/scala/JudgeIsMobile.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.sql.api.java.UDF1
2 |
3 | import scala.util.matching.Regex
4 |
5 | /**
6 | * Created by linyanshi on 2017/9/14 0014.
7 | */
8 | class JudgeIsMobile extends UDF1[String,Boolean]{
9 | val pattern = new Regex("^((17[0-9])|(14[0-9])|(13[0-9])|(15[^4,\\D])|(18[0,5-9]))\\d{8}$")
10 | override def call(value: String): Boolean = {
11 | pattern.pattern.matcher(value).matches()
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/apply/src/main/scala/LoadCallhistoryData.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.sql.hive.HiveContext
2 | import org.apache.spark.sql.types.DataTypes
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | /**
6 | * Created by linyanshi on 2017/9/14 0014.
7 | */
8 | object LoadCallhistoryData {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setAppName("LoadCallhistoryData")
11 | val sc = new SparkContext(conf)
12 | val hc = new HiveContext(sc)
13 | val date = args(0).split("-")
14 | val year = date(0)
15 | val month = date(1)
16 | val day = date(2)
17 | hc.sql("use datacenter")
18 | hc.udf.register("isMobile", new JudgeIsMobile(), DataTypes.BooleanType)
19 | hc.udf.register("castInt", new CastToInt(), DataTypes.LongType)
20 | val hql =
21 | s"""SELECT a.deviceid,a.loginname,a.caller_phone,sum(castInt(a.duration)) AS duration,max(a.date) AS date,max(a.collecttime) AS collecttime
22 | |FROM r_callhistory_week a WHERE a.year='${year}' AND a.month='${month}' AND a.day='${day}'
23 | | AND a.loginname is not null AND a.caller_phone is not null AND isMobile(a.loginname)
24 | | AND isMobile(a.caller_phone) AND a.duration is not null AND a.collecttime <>'null'
25 | | group by a.deviceid,a.loginname,a.caller_phone
26 | """.stripMargin
27 | hc.sql(hql).repartition(100).mapPartitions(rows => rows.map { row => s"${row.getAs("deviceid")},${row.getAs("loginname")},${row.getAs("caller_phone")},${row.getAs("duration")},${row.getAs("date")},${row.getAs("collecttime")}" })
28 | .saveAsTextFile(args(1))
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/apply/src/main/scala/TestSql.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.sql.{DataFrame, SQLContext}
2 | import org.apache.spark.storage.StorageLevel
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | /**
6 | * Created by Administrator on 2017/7/27 0027.
7 | */
8 | object TestSql {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("test")
11 | val sc = new SparkContext(conf)
12 | val sqlContext = new SQLContext(sc)
13 | val list = List("1","2","3","3","5")
14 | import sqlContext.implicits._
15 | val vertexInfoDF = sc.parallelize(list).toDF().persist(StorageLevel.MEMORY_AND_DISK_SER)
16 | // functions that build feature vectors from the aggregated vertex information
17 | val mean: DataFrame = vertexInfoDF.agg("_1" -> "mean")
18 | val sd: DataFrame = vertexInfoDF.agg("_1" -> "stddev")
19 | // val median: DataFrame = vertexInfoDF.agg("_1" -> "median")
20 | val min: DataFrame = vertexInfoDF.agg("_1" -> "min")
21 | val max: DataFrame = vertexInfoDF.agg("_1" -> "max")
22 | val skew: DataFrame = vertexInfoDF.agg("_1" -> "skewness")
23 | val kurt: DataFrame = vertexInfoDF.agg("_1" -> "kurtosis")
24 | val vari: DataFrame = vertexInfoDF.agg("_1" -> "variance")
25 |
26 | val joinedStats: DataFrame = sd.join(mean).join(min).join(max).join(skew).join(kurt).join(vari)
27 | // .join(median)
28 | println(joinedStats.printSchema())
29 | println(joinedStats.foreach(row=>println(row.get(0))))
30 | vertexInfoDF.unpersist(blocking = true)
31 | val sdtestDF = Seq((1.2.toDouble, 1.6.toDouble, 1.8.toDouble, 1.9.toDouble))
32 | .toDF("numNodes", "numEdges", "maxDeg", "avgDeg")
33 | val df = sdtestDF.join(joinedStats)
34 | println(df.count()) }
35 |
36 | }
37 |
38 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/abstractions/PregelProgram.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.abstractions
2 |
3 | /**
4 | * Created by peter on 2017/4/27.
5 | */
6 |
7 | import org.apache.spark.graphx._
8 |
9 | import scala.reflect.ClassTag
10 |
11 | /**
12 | * The [[PregelProgram]] abstraction wraps Spark's Pregel API implementation from the [[GraphOps]]
13 | * class into a model that makes it easier to write graph algorithms.
14 | * @tparam VertexState is the generic type representing the state of a vertex
15 | */
16 | abstract class PregelProgram[VertexState: ClassTag, VD: ClassTag, ED: ClassTag] protected() extends Serializable {
17 |
18 | @transient val graph: Graph[VD, ED]
19 |
20 | /**
21 | * The vertex program receives a state update and acts to update its state
22 | * @param id is the [[VertexId]] that this program will perform a state operation for
23 | * @param state is the current state of this [[VertexId]]
24 | * @param message is the state received from another vertex in the graph
25 | * @return a [[VertexState]] resulting from a comparison between current state and incoming state
26 | */
27 | def vertexProgram(id : VertexId, state : VertexState, message : VertexState) : VertexState
28 |
29 | /**
30 | * The message broker sends and receives messages. It will initially receive one message for
31 | * each vertex in the graph.
32 | * @param triplet An edge triplet is an object containing a pair of connected vertex objects and edge object.
33 | * For example (v1)-[r]->(v2)
34 | * @return The message broker returns a key value list, each containing a VertexId and a new message
35 | */
36 | def messageBroker(triplet :EdgeTriplet[VertexState, ED]) : Iterator[(VertexId, VertexState)]
37 |
38 | /**
39 | * This method is used to reduce or combine the set of all state outcomes produced by a vertexProgram
40 | * for each vertex in each superstep iteration. Each vertex has a list of state updates received from
41 | * other vertices in the graph via the messageBroker method. This method is used to reduce the list
42 | * of state updates into a single state for the next superstep iteration.
43 | * @param a A first [[VertexState]] representing a partial state of a vertex.
44 | * @param b A second [[VertexState]] representing a different partial state of a vertex
45 | * @return a merged [[VertexState]] representation from the two [[VertexState]] parameters
46 | */
47 | def combinerMessage(a: VertexState, b: VertexState) : VertexState
48 |
49 | }
50 |
--------------------------------------------------------------------------------
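A minimal sketch (not part of this repository) of how a concrete program could extend PregelProgram and hand its three callbacks to Spark's Pregel API. The MaxValueProgram name, the Long vertex state and the run() helper are illustrative assumptions, not code from this project.

import com.lakala.datacenter.abstractions.PregelProgram
import org.apache.spark.graphx._

// Hypothetical example: propagate the maximum vertex value through the graph.
class MaxValueProgram(@transient val graph: Graph[Long, Int])
  extends PregelProgram[Long, Long, Int] {

  // Keep the larger of the current state and the incoming message.
  override def vertexProgram(id: VertexId, state: Long, message: Long): Long =
    math.max(state, message)

  // Send a vertex's value to a neighbour only when it would raise that neighbour's value.
  override def messageBroker(triplet: EdgeTriplet[Long, Int]): Iterator[(VertexId, Long)] =
    if (triplet.srcAttr > triplet.dstAttr) Iterator((triplet.dstId, triplet.srcAttr))
    else Iterator.empty

  // Merge concurrent messages by taking their maximum.
  override def combinerMessage(a: Long, b: Long): Long = math.max(a, b)

  // Drive the three callbacks with Spark's Pregel implementation.
  def run(initialMessage: Long = Long.MinValue): Graph[Long, Int] =
    Pregel(graph, initialMessage)(vertexProgram, messageBroker, combinerMessage)
}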
/apply/src/main/scala/com/lakala/datacenter/apply/buildGraph/NewEdgeArr.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.apply.buildGraph
2 |
3 | /**
4 | * Created by linyanshi on 2017/9/1 0001.
5 | */
6 | case class NewEdgeArr(srcV: String, dstV: String, var srcType: String, dstType: String, init: Boolean = false)
7 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/apply/model/ApplyInfo.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.apply.model
2 |
3 | import org.apache.commons.lang3.StringUtils
4 |
5 | /**
6 | * Created by ASUS-PC on 2017/4/13.
7 | */
8 | class ApplyInfo(var order_id: String = "",
9 | var contract_no: String = "",
10 | var business_no: String = "",
11 | var term_id: String = "",
12 | var loan_pan: String = "",
13 | var return_pan: String = "",
14 | var empmobile: String = "",
15 | var datatype: Int = 0 //0, 1 = black, 2 = white
16 | ) extends BaseEntity with Product {
17 | override def toString = s"ApplyInfo(order_id=$order_id, contract_no=$contract_no, business_no=$business_no, term_id=$term_id, loan_pan=$loan_pan, return_pan=$return_pan, empmobile=$empmobile)"
18 |
19 | override def productElement(idx: Int): Any = idx match {
20 | case 0 => order_id
21 | case 1 => contract_no
22 | case 2 => business_no
23 | case 3 => term_id
24 | case 4 => loan_pan
25 | case 5 => return_pan
26 | case 6 => empmobile
27 | case 7 => datatype
28 | case 8 => inDeg
29 | case 9 => outDeg
30 | }
31 |
32 | override def productArity: Int = 10
33 |
34 | override def canEqual(that: Any): Boolean = that.isInstanceOf[ApplyInfo]
35 |
36 | override def equals(other: Any): Boolean = other match {
37 | case that: ApplyInfo =>
38 | (that canEqual this) &&
39 | order_id == that.order_id &&
40 | contract_no == that.contract_no &&
41 | business_no == that.business_no &&
42 | term_id == that.term_id &&
43 | loan_pan == that.loan_pan &&
44 | return_pan == that.return_pan &&
45 | empmobile == that.empmobile
46 | case _ => false
47 | }
48 |
49 | override def hashCode(): Int = {
50 | val state = Seq(order_id, contract_no, business_no, term_id, loan_pan, return_pan, empmobile)
51 | state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
52 | }
53 |
54 | def getKey: String = {
55 | if (StringUtils.isNotEmpty(order_id)) order_id
56 | else if (StringUtils.isNotEmpty(contract_no)) contract_no
57 | else if (StringUtils.isNotEmpty(business_no)) business_no
58 | else if (StringUtils.isNotEmpty(term_id)) term_id
59 | else if (StringUtils.isNotEmpty(loan_pan)) loan_pan
60 | else if (StringUtils.isNotEmpty(return_pan)) return_pan
61 | else empmobile
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/apply/model/BaseEntity.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.apply.model
2 |
3 | /**
4 | * Created by ASUS-PC on 2017/4/17.
5 | */
6 | trait BaseEntity extends Serializable {
7 | var inDeg: Int = 0;
8 | var outDeg: Int = 0;
9 | }
10 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/apply/model/CallHistoryEntity.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.apply.model
2 |
3 | /**
4 | * Created by ASUS-PC on 2017/4/18.
5 | */
6 | class CallHistoryEntity(var loginname: Long = 0L, var caller_phone: Long = 0L) extends BaseEntity with Serializable with Product {
7 | override def productElement(idx: Int): Any = idx match {
8 | case 0 => loginname
9 | case 1 => caller_phone
10 |
11 | }
12 |
13 | override def productArity: Int = 2
14 |
15 | override def canEqual(that: Any): Boolean = that.isInstanceOf[CallHistoryEntity]
16 |
17 | override def equals(other: Any): Boolean = other match {
18 | case that: CallHistoryEntity =>
19 | (that canEqual this) &&
20 | loginname == that.loginname &&
21 | caller_phone == that.caller_phone
22 | case _ => false
23 | }
24 |
25 | override def hashCode(): Int = {
26 | val state = Seq(loginname, caller_phone)
27 | state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
28 | }
29 |
30 | override def toString = s"CallHistoryEntity($loginname, $caller_phone)"
31 | }
32 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/apply/model/EdgeEntity.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.apply.model
2 |
3 | /**
4 | * Created by ASUS-PC on 2017/4/17.
5 | */
6 | class EdgeEntity(var scrId: Long, val destId: Long, var attr: String) extends Serializable with Product {
7 | override def productElement(idx: Int): Any = idx match {
8 | case 0 => scrId
9 | case 1 => destId
10 | case 2 => attr
11 | }
12 |
13 | override def productArity: Int = 3
14 |
15 | override def canEqual(that: Any): Boolean = that.isInstanceOf[EdgeEntity]
16 |
17 | override def equals(other: Any): Boolean = other match {
18 | case that: EdgeEntity =>
19 | (that canEqual this) &&
20 | scrId == that.scrId &&
21 | destId == that.destId
22 | case _ => false
23 | }
24 |
25 | override def hashCode(): Int = {
26 | val state = Seq(scrId, destId)
27 | state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/apply/model/NDegreeEntity.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.apply.model
2 |
3 | /**
4 | * Created by ASUS-PC on 2017/4/24.
5 | */
6 | case class NDegreeEntity(var attr: String = "",
7 | var initType: Int = 0,
8 | var loop: Int = 0)
9 | extends Serializable {
10 | }
11 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/faund/DatasetTitanic.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.faund
2 |
3 | import java.util
4 |
5 | import org.apache.spark.SparkContext
6 | import org.apache.spark.mllib.linalg.Vectors
7 | import org.apache.spark.mllib.regression.LabeledPoint
8 | import org.apache.spark.rdd.RDD
9 | import org.apache.spark.sql.{DataFrame, SQLContext}
10 |
11 | /**
12 | * Created by Administrator on 2017/7/28 0028.
13 | */
14 | object DatasetTitanic {
15 | def createDF(sqlContext: SQLContext, inputFile: String): DataFrame = { // options
16 | val options = new util.HashMap[String, String]
17 | options.put("header", "true")
18 | options.put("path", inputFile)
19 | options.put("delimiter", ",")
20 | // create dataframe from input file
21 | val df = sqlContext.load("com.databricks.spark.csv", options)
22 | df.printSchema()
23 | df
24 | }
25 |
26 | // create an RDD of Vectors from a DataFrame
27 | def createLabeledPointsRDD(ctx: SparkContext, sqlContext: SQLContext, inputFile: String): RDD[LabeledPoint] = {
28 | val df = createDF(sqlContext, inputFile)
29 | // convert dataframe to an RDD of Vectors
30 | df.map { row =>
31 | val survived = row.getString(1).toInt
32 | val arr = new Array[Double](2)
33 | arr(0) = toDouble(row.getString(5))
34 | arr(1) = toDouble(row.getString(6))
35 | new LabeledPoint(survived, Vectors.dense(arr))
36 | }
37 | }
38 |
39 | def toDouble = (value: String) => {
40 | if (value.length == 0) 0.0 else value.toDouble
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/faund/SparkConfUtil.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.faund
2 |
3 | import org.apache.spark.SparkConf
4 |
5 | /**
6 | * Created by Administrator on 2017/7/28 0028.
7 | */
8 | object SparkConfUtil {
9 | val isLocal = true;
10 |
11 | def setConf(conf: SparkConf): Unit = {
12 |
13 | if (isLocal) {
14 | conf.setMaster("local")
15 | conf.set("spark.broadcast.compress", "false")
16 | conf.set("spark.shuffle.compress", "false")
17 | }
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/faund/Titanic.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.faund
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 | import org.apache.spark.mllib.regression.LabeledPoint
5 | import org.apache.spark.rdd.RDD
6 | import org.apache.spark.sql.{DataFrame, SQLContext}
7 |
8 | /**
9 | * Created by Administrator on 2017/7/28 0028.
10 | */
11 | object Titanic {
12 | def main(args: Array[String]) {
13 | if (args.length < 1) {
14 | System.err.println("Usage: Titanic <input_file>")
15 | System.exit(1)
16 | }
17 |
18 | val inputFile: String = args(0)
19 | val sparkConf: SparkConf = new SparkConf().setAppName("Titanic")
20 | SparkConfUtil.setConf(sparkConf)
21 |
22 | val sc: SparkContext = new SparkContext(sparkConf)
23 | val sqlContext: SQLContext = new SQLContext(sc)
24 | val results: DataFrame = DatasetTitanic.createDF(sqlContext, inputFile)
25 |
26 | results.printSchema
27 |
28 | val data: RDD[LabeledPoint] = DatasetTitanic.createLabeledPointsRDD(sc, sqlContext, inputFile)
29 | val splits: Array[RDD[LabeledPoint]] = data.randomSplit(Array[Double](0.7, 0.3))
30 | val trainingData: RDD[LabeledPoint] = splits(0)
31 | val testData: RDD[LabeledPoint] = splits(1)
32 |
33 | System.out.println("\nRunning example of classification using RandomForest\n")
34 | ScalaRandomForest.testClassification(trainingData, testData)
35 |
36 | System.out.println("\nRunning example of regression using RandomForest\n")
37 | ScalaRandomForest.testRegression(trainingData, testData)
38 |
39 | sc.stop
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/jaccard/Jaccard.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.jaccard
2 |
3 | import org.apache.spark.graphx.{EdgeDirection, Graph}
4 | import org.apache.spark.rdd.RDD
5 |
6 | /**
7 | * Created by linyanshi on 2017/9/20 0020.
8 | */
9 | object Jaccard {
10 | /**
11 | * Return a RDD of (1-id, 2-id, similarity) where
12 | * 1-id < 2-id to avoid duplications
13 | *
14 | * @param graph
15 | * @return
16 | */
17 |
18 | def jaccardSimilarityAllMobiles(graph: Graph[Int, Int]): RDD[(Long, Long, Double)] = {
19 | val neighbors = graph.collectNeighborIds(EdgeDirection.Either).map(x => (x._1, x._2))
20 | val combinations = neighbors.cartesian(neighbors)
21 | val SimilarityAll = combinations.map { x => (x._1._1, x._2._1, jaccard(x._1._2.toSet, x._2._2.toSet)) }
22 | val result = SimilarityAll.map(x => (x._3, (x._1, x._2))).sortByKey(false, 1).map(x => (x._2._1, x._2._2, x._1))
23 | result
24 | }
25 |
26 | /**
27 | * Helper function
28 | * The Jaccard coefficient is defined as the ratio of the size of the intersection of A and B to the size of their union.
29 | * Given two sets, compute their Jaccard similarity and return the result.
30 | * If the union part is zero, then return 0.
31 | * @param a
32 | * @param b
33 | * @tparam A
34 | * @return
35 | */
36 | def jaccard[A](a: Set[A], b: Set[A]): Double = {
37 | val union: Double = (a ++ b).size
38 | val intersect: Double = a.intersect(b).size
39 | return (if (union == 0) 0.0 else (intersect / union))
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
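A hypothetical usage sketch (not in the repository): build a small Graph[Int, Int] and rank the vertex pairs by Jaccard similarity with jaccardSimilarityAllMobiles. The JaccardDemo object and the hard-coded edges are illustrative assumptions.

import com.lakala.datacenter.jaccard.Jaccard
import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.{SparkConf, SparkContext}

// Hypothetical demo, assuming the graphx-analysis classes are on the classpath.
object JaccardDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("JaccardDemo"))
    // Four vertices sharing some neighbours.
    val edges = sc.parallelize(Seq(Edge(1L, 2L, 1), Edge(1L, 3L, 1), Edge(2L, 3L, 1), Edge(3L, 4L, 1)))
    val graph: Graph[Int, Int] = Graph.fromEdges(edges, defaultValue = 0)
    // Prints (id1, id2, similarity) triples, highest similarity first.
    Jaccard.jaccardSimilarityAllMobiles(graph).take(10).foreach(println)
    sc.stop()
  }
}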
/apply/src/main/scala/com/lakala/datacenter/jaccard/PowerIterationClustering.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.jaccard
2 |
3 | import org.apache.spark.mllib.clustering.PowerIterationClustering
4 | import org.apache.spark.rdd.RDD
5 |
6 | /**
7 | * Created by linyanshi on 2017/9/20 0020.
8 | */
9 | object PowerIterationClustering {
10 |
11 | /**
12 | * run PIC using Spark's PowerIterationClustering implementation
13 | * @param similarities All pair similarities in the shape of RDD[(selfmobile, caller, similarity)]
14 | * @return Cluster assignment for each mobile in the shape of RDD[(mobile, Cluster)]
15 | */
16 | def runPIC(similarities: RDD[(Long, Long, Double)]): RDD[(Long, Int)] = {
17 | val sc = similarities.sparkContext
18 |
19 |
20 | /** Remove placeholder code below and run Spark's PIC implementation */
21 | similarities.cache().count()
22 | val pic = new PowerIterationClustering().setK(3).setMaxIterations(100)
23 | val model=pic.run(similarities)
24 | val result = model.assignments.map(a => (a.id,a.cluster))
25 | val check = result.map(x=>x.swap).groupByKey().map(x=>(x._1,x._2.size))
26 |
27 | println("PIC: ")
28 | println(check.foreach(println))
29 |
30 | result
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
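A hypothetical wiring sketch (assumed, not from the repository) showing how the Jaccard output can be fed straight into runPIC, since runPIC expects exactly the RDD[(Long, Long, Double)] shape that jaccardSimilarityAllMobiles returns. The JaccardPICPipeline name is illustrative.

import com.lakala.datacenter.jaccard.{Jaccard, PowerIterationClustering}
import org.apache.spark.graphx.Graph
import org.apache.spark.rdd.RDD

// Hypothetical glue object: similarity computation followed by clustering.
object JaccardPICPipeline {
  def cluster(graph: Graph[Int, Int]): RDD[(Long, Int)] = {
    // All-pairs (id1, id2, similarity) triples.
    val similarities = Jaccard.jaccardSimilarityAllMobiles(graph)
    // (vertexId, clusterId) assignments; k = 3 is hard-coded inside runPIC.
    PowerIterationClustering.runPIC(similarities)
  }
}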
/apply/src/main/scala/com/lakala/datacenter/louvain/HDFSLouvainRunner.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.louvain
2 |
3 | /**
4 | * Created by chenqingqing on 2017/4/4.
5 | */
6 |
7 |
8 | import org.apache.spark.SparkContext
9 | import org.apache.spark.graphx._
10 | import scala.Array.canBuildFrom
11 |
12 | /**
13 | * Execute the Louvain algorithm and save the vertices and edges in hdfs at each level.
14 | * Can also save locally if in local mode.
15 | *
16 | * See LouvainHarness for algorithm details
17 | */
18 | class HDFSLouvainRunner(minProgress: Int, progressCounter: Int, outputdir: String) extends LouvainHarness(minProgress: Int, progressCounter: Int) {
19 |
20 | var qValues = Array[(Int, Double)]()
21 |
22 | override def saveLevel(sc: SparkContext, level: Int, q: Double, graph: Graph[VertexState, Double]) = {
23 | graph.vertices.saveAsTextFile(outputdir + "/level_" + level + "_vertices")
24 | graph.edges.saveAsTextFile(outputdir + "/level_" + level + "_edges")
25 | qValues = qValues :+ ((level, q))
26 | println(s"qValue: $q")
27 |
28 | // overwrite the q values at each level
29 | sc.parallelize(qValues, 1).saveAsTextFile(outputdir + "/qvalues")
30 | }
31 |
32 | override def finalSave(sc: SparkContext, level: Int, q: Double, graph: Graph[VertexState, Double]) = {
33 | graph.vertices.filter(k=>k._1 != k._2.community).sortBy(k=>k._2.community).map { x => x._1 + "," + x._2 }.repartition(10).saveAsTextFile(outputdir)
34 | //graph.edges.saveAsTextFile(outputdir+"/final_edges")
35 |
36 | println(s"qValue: $q")
37 | }
38 |
39 |
40 | }
41 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/louvain/VertexData.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.louvain
2 |
3 | import scala.collection.mutable.HashSet
4 |
5 | /**
6 | * Created by chenqingqing on 2017/4/4.
7 | */
8 | class VertexData(val vId: Long, var cId: Long) extends Serializable {
9 | var innerDegree = 0.0 //weight of the internal nodes
10 | var innerVertices = new HashSet[Long]() //internal nodes
11 | var degree = 0.0 //degree of the node
12 | var commVertices = new HashSet[Long]() //nodes in the community
13 | }
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/louvain/VertexState.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.louvain
2 |
3 | /**
4 | * Louvain vertex state
5 | * Contains all information needed for louvain community detection
6 | */
7 | class VertexState extends Serializable {
8 |
9 | var community = -1L //community ID
10 | var communitySigmaTot = 0D //in-degree
11 | var internalWeight = 0D // self edges
12 | var nodeWeight = 0D //out degree
13 | var changed = false
14 | var q = 0D //modularity value
15 |
16 | override def toString(): String = {
17 | // "{community:"+community+",communitySigmaTot:"+communitySigmaTot+
18 | // ",internalWeight:"+internalWeight+",nodeWeight:"+nodeWeight+"}"
19 | // s"community:$community,communitySigmaTot:$communitySigmaTot,internalWeight:$internalWeight,nodeWeight:$nodeWeight"
20 | s"community:$community,q:$q"
21 | // community.toString
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/main/CallHistoryPageRank.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.main
2 |
3 | import org.apache.log4j.{Level, Logger}
4 | import org.apache.spark.graphx.{Edge, Graph}
5 | import org.apache.spark.storage.StorageLevel
6 | import org.apache.spark.{SparkConf, SparkContext}
7 |
8 | /**
9 | * Created by linyanshi on 2017/9/19 0019.
10 | */
11 | object CallHistoryPageRank {
12 | def main(args: Array[String]): Unit = {
13 |
14 | Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
15 | Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.ERROR)
16 |
17 | val conf = new SparkConf().setAppName("CallHistoryPageRank")
18 | val sc = new SparkContext(conf)
19 | val edgeRdd = sc.textFile(args(0)).mapPartitions(lines => lines.map { line =>
20 | // val arr = line.split("\t")
21 | // Edge(arr(0).toLong, arr(1).toLong, 1)
22 | val arr = line.split(",")
23 | Edge(arr(1).toLong, arr(2).toLong, 1)
24 | // Edge(arr(2).toLong, arr(1).toLong, arr(3).toInt)
25 | })
26 | // val graph = GraphLoader.edgeListFile(sc, args(0), numEdgePartitions = 4)
27 |
28 | val graph = Graph.fromEdges(edgeRdd, 1, edgeStorageLevel = StorageLevel.MEMORY_AND_DISK_SER, vertexStorageLevel = StorageLevel.MEMORY_AND_DISK_SER)
29 | //parameters: graph, convergence tolerance
30 | val pageRankGraph = graph.pageRank(0.0001)
31 |
32 | pageRankGraph.vertices.sortBy(x => x._2).mapPartitions(ls => ls.map(k => s"${k._1},${k._2}")).repartition(1).saveAsTextFile(args(1))
33 | sc.stop()
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/main/Driver.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.main
2 |
3 | import com.lakala.datacenter.grogress.ExportNDegreeData
4 |
5 | /**
6 | * Created by Administrator on 2017/5/4 0004.
7 | */
8 |
9 |
10 | object Driver extends App {
11 | override def main(args: Array[String]) = {
12 | val enD = new ExportNDegreeData()
13 | enD.main(args)
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/main/LPAAlgorithm.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.main
2 |
3 | import org.apache.log4j.{Level, Logger}
4 | import org.apache.spark.graphx.{Edge, Graph}
5 | import org.apache.spark.graphx.lib.LabelPropagation
6 | import org.apache.spark.storage.StorageLevel
7 | import org.apache.spark.{SparkConf, SparkContext}
8 | import ml.sparkling.graph.operators.OperatorsDSL._
9 |
10 | /**
11 | * Created by linyanshi on 2017/9/14 0014.
12 | */
13 | object LPAAlgorithm {
14 | def main(args: Array[String]): Unit = {
15 | Logger.getLogger("org.apache.spark").setLevel(Level.ERROR);
16 | Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.ERROR);
17 |
18 | val conf = new SparkConf().setAppName("LPAAlgorithm")
19 | val sc = new SparkContext(conf)
20 | val edgeRdd = sc.textFile(args(0)).mapPartitions(lines => lines.map { line =>
21 | val arr = line.split(",")
22 | // Edge(arr(1).toLong,arr(2).toLong,arr(3).toInt)
23 | Edge(arr(1).toLong,arr(2).toLong,1)
24 | // Edge(arr(0).toLong,arr(1).toLong,1)
25 | })
26 | // val graph = GraphLoader.edgeListFile(sc, args(0), numEdgePartitions = 4)
27 |
28 | val graph = Graph.fromEdges(edgeRdd,1,edgeStorageLevel=StorageLevel.MEMORY_AND_DISK_SER,vertexStorageLevel=StorageLevel.MEMORY_AND_DISK_SER)
29 | //parameters: graph, number of iterations
30 | val lpaGraph = LabelPropagation.run(graph.reverse, args(2).toInt)
31 | val modularity = lpaGraph.modularity()
32 | println(modularity)
33 | lpaGraph.vertices.sortBy(x => x._2).mapPartitions(ls=>ls.map(k=>s"${k._1},${k._2}")).repartition(1).saveAsTextFile(args(1))
34 | sc.stop()
35 | }
36 |
37 |
38 | }
39 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/main/LPCoarseAlgorithm.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.main
2 |
3 | import ml.sparkling.graph.operators.OperatorsDSL._
4 | import org.apache.log4j.{Level, Logger}
5 | import org.apache.spark.graphx.{Edge, Graph}
6 | import org.apache.spark.storage.StorageLevel
7 | import org.apache.spark.{SparkConf, SparkContext}
8 |
9 | /**
10 | * Created by linyanshi on 2017/9/18 0018.
11 | */
12 | object LPCoarseAlgorithm {
13 | def main(args: Array[String]): Unit = {
14 | Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
15 | Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.ERROR)
16 |
17 | val conf = new SparkConf().setAppName("LPCoarseAlgorithm")
18 | val sc = new SparkContext(conf)
19 | val edgeRdd = sc.textFile(args(0)).mapPartitions(lines => lines.map { line =>
20 | // val arr = line.split("\t")
21 | // Edge(arr(0).toLong, arr(1).toLong, 1)
22 | val arr = line.split(",")
23 | Edge(arr(1).toLong, arr(2).toLong, 1)
24 | })
25 | // val graph = GraphLoader.edgeListFile(sc, args(0), numEdgePartitions = 4)
26 |
27 | val graph = Graph.fromEdges(edgeRdd, 1, edgeStorageLevel = StorageLevel.MEMORY_AND_DISK_SER, vertexStorageLevel = StorageLevel.MEMORY_AND_DISK_SER)
28 | //parameter: graph (treated as undirected)
29 | val lpaGraph = graph.LPCoarse(treatAsUndirected = true)
30 | // val modularity = lpaGraph.modularity()
31 | // println(modularity)
32 | lpaGraph.vertices.mapPartitions(kcs => kcs.map(kc => (kc._1, kc._2.sortBy(k => k).head)))
33 | .filter(k => k._1 != k._2).sortBy(x => x._2)
34 | /*.mapPartitions(ls => ls.map(k => s"${k._1},${k._2.mkString(",")}"))*/ .repartition(1).saveAsTextFile(args(1))
35 | sc.stop()
36 | }
37 |
38 |
39 |
40 |
41 | }
42 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/main/LiveCommunityDetection.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.main
2 |
3 | /**
4 | * Created by linyanshi on 2017/9/25 0025.
5 | */
6 | object LiveCommunityDetection {
7 | def main(args: Array[String]): Unit = {
8 | if (args.length < 1) {
9 | System.err.println(
10 | "Usage: LiveCommunityDetection \n" +
11 | " --numEPart=\n" +
12 | " The number of partitions for the graph's edge RDD.\n" +
13 | " [--tol=]\n" +
14 | " The tolerance allowed at convergence (smaller => more accurate). Default is " +
15 | "0.001.\n" +
16 | " [--output=]\n" +
17 | " If specified, the file to write the ranks to.\n" +
18 | " [--partStrategy=RandomVertexCut | EdgePartition1D | EdgePartition2D | " +
19 | "CanonicalRandomVertexCut]\n" +
20 | " The way edges are assigned to edge partitions. Default is RandomVertexCut.")
21 | System.exit(-1)
22 | }
23 | //file/data/graphx/input/followers.txt -numEPart=100 -tol=0.001 -output=F:\idea_workspace\SparkLearning\outfile -partStrategy=RandomVertexCut
24 | Analytics.main(args.patch(0, List("pagerank"), 0))
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/main/LouvainDGA.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.main
2 |
3 | /**
4 | * Created by linyanshi on 2017/9/14 0014.
5 | */
6 |
7 | import com.lakala.datacenter.louvain.{HDFSLouvainRunner, VertexState}
8 | import org.apache.log4j.{Level, Logger}
9 | import org.apache.spark.graphx.{Edge, Graph}
10 | import org.apache.spark.{SparkConf, SparkContext}
11 |
12 | //totalEdgeWeight: 1.56262281191699E15
13 | //# vertices moved: 61,897,309
14 | //# vertices moved: 13,746,461
15 | //# vertices moved: 5,352,635
16 | //# vertices moved: 130,270
17 | //# vertices moved: 82,426
18 | //# vertices moved: 71,584
19 | //# vertices moved: 71,105
20 | //# vertices moved: 70,030
21 | //# vertices moved: 69,937
22 | //
23 | //Completed in 18 cycles
24 | //
25 | //Starting Louvain level 1
26 | //totalEdgeWeight: 2.237895102976331E15
27 | //# vertices moved: 664,919
28 | //# vertices moved: 191,039
29 | //# vertices moved: 12,426
30 | //# vertices moved: 393
31 | //# vertices moved: 7
32 | //# vertices moved: 0
33 | //
34 | //Completed in 12 cycles
35 | //qValue: 0.9182326588364285
36 | // Total users: 1,232,060; total call_phone users: 101,825,071
37 | // Total communities: 275,141; community ids with more than two members: 77,442; communities linked to the blacklist: 1,784
38 |
39 | object LouvainDGA {
40 | def main(args: Array[String]) {
41 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
42 | Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
43 | val conf = new SparkConf().setAppName("LouvainDGA")
44 | conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
45 | conf.registerKryoClasses(Array(classOf[VertexState]))
46 | // args: 0 = input path, 1 = output path, 2 = minProgress, 3 = progressCounter
47 | val sc = new SparkContext(conf)
48 | val data = sc.textFile(args(0))
49 | val edges = data.map(line => {
50 | val items = line.split(",")
51 | // Edge(items(0).toLong, items(1).toLong, items(2).toDouble)
52 | Edge(items(1).toLong, items(2).toLong, items(3).toDouble)
53 | // Edge(items(1).toLong, items(2).toLong, 1d)
54 | })
55 | val graph = Graph.fromEdges(edges, 1)
56 | val runner = new HDFSLouvainRunner(args(2).toInt, args(3).toInt, args(1))
57 | runner.run(sc, graph)
58 | sc.stop()
59 | }
60 | }
61 |
62 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/main/PICCallAlgorithm.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.main
2 |
3 | import org.apache.log4j.{Level, Logger}
4 | import org.apache.spark.graphx.{Edge, Graph}
5 | import org.apache.spark.mllib.clustering.PowerIterationClustering
6 | import org.apache.spark.storage.StorageLevel
7 | import org.apache.spark.{SparkConf, SparkContext}
8 |
9 | /**
10 | * Created by linyanshi on 2017/9/20 0020.
11 | * http://blog.sina.com.cn/s/blog_482da2d20102drpt.html
12 | */
13 | object PICCallAlgorithm {
14 | def main(args: Array[String]) {
15 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
16 | Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
17 | val conf = new SparkConf().setAppName("PICCallAlgorithm")
18 |
19 | val sc = new SparkContext(conf)
20 | val data = sc.textFile(args(0), 200)
21 | val edges = data.map(line => {
22 | val items = line.split(",")
23 | Edge(items(1).toLong, items(2).toLong, 1)
24 | // val items = line.split("\t")
25 | // Edge(items(0).toLong, items(1).toLong, 1)
26 | })
27 | val graph = Graph.fromEdges(edges, 1, edgeStorageLevel = StorageLevel.MEMORY_AND_DISK_SER, vertexStorageLevel = StorageLevel.MEMORY_AND_DISK_SER)
28 | // Run PageRank (tolerance 0.0001) on the graph, then cluster the ranked vertices with PIC
29 | val pageRankGraph = graph.pageRank(0.0001)
30 | val pic = new PowerIterationClustering().setK(args(2).toInt).setMaxIterations(args(3).toInt).setInitializationMode("degree")
31 | val model = pic.run(pageRankGraph)
32 | val result = model.assignments.map(a => (a.id, a.cluster))
33 | result.mapPartitions(ves => ves.map(ve => s"${ve._1},${ve._2}")).repartition(1).saveAsTextFile(args(1))
34 | // val landmarks = sc.textFile("/user/guozhijie/explortoutput/louvainout4")
35 | // .mapPartitions(lines=>lines.map(line=>{val arr =line.split(",")
36 | // arr(1).toLong})).distinct().top(args(2).toInt)
37 | // val landmarks = data.map(line => {
38 | // val items = line.split(",")
39 | // items(1).toLong
40 | // }).distinct().top(args(2).toInt)
41 | // val landmarksBR = sc.broadcast(landmarks)
42 | // val shortPathGraph = ShortestPaths.run(graph, landmarksBR.value)
43 | // graph.unpersist()
44 | //
45 | // implicit def iterebleWithAvg[T: Numeric](data: Iterable[T]) = new {
46 | // def avg = average(data)
47 | // }
48 | //
49 | // def average[T](ts: Iterable[T])(implicit num: Numeric[T]) = {
50 | // num.toDouble(ts.sum) / ts.size
51 | // }
52 | //
53 | // shortPathGraph.vertices.map {
54 | // vx =>
55 | // (vx._1, {
56 | // val dx = 1.0 / vx._2.map {
57 | // sx => sx._2
58 | // }.seq.avg
59 | // val d = if (dx.isNaN | dx.isNegInfinity | dx.isPosInfinity) 0.0 else dx
60 | // d
61 | // })
62 | // }.sortBy({ vx => vx._1 }, ascending = true)
63 | // .mapPartitions(rows => rows.filter(k => k._2 > 0d).map(row => s"${row._1},${row._2}")).repartition(1).saveAsTextFile(args(1))
64 | // val similarities = Jaccard.jaccardSimilarityAllMobiles(graph)
65 | // val centralityGraph: Graph[(Double,Double),Int] = graph.hits(VertexMeasureConfiguration(treatAsUndirected=true))
66 | // val picLabels = PowerIterationClustering.runPIC(similarities)
67 | // picLabels.mapPartitions(lca => lca.map(l => s"${l._1},${l._2}")).repartition(1).saveAsTextFile(args(1))
68 |
69 | // val vertexembeddedness = graph.closenessCentrality(VertexMeasureConfiguration(treatAsUndirected = true))
70 | // vertexembeddedness.vertices.mapPartitions(ves=>ves.map(ve=>s"${ve._1},${ve._2}")).repartition(1).saveAsTextFile(args(1))
71 | sc.stop()
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/main/PSCANAlgorithm.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.main
2 |
3 | import ml.sparkling.graph.api.operators.measures.VertexMeasureConfiguration
4 | import ml.sparkling.graph.operators.OperatorsDSL._
5 | import ml.sparkling.graph.operators.algorithms.community.pscan.PSCAN
6 | import org.apache.log4j.{Level, Logger}
7 | import org.apache.spark.graphx.{Edge, Graph}
8 | import org.apache.spark.storage.StorageLevel
9 | import org.apache.spark.{SparkConf, SparkContext}
10 |
11 | /**
12 | * Created by linyanshi on 2017/9/18 0018.
13 | */
14 | object PSCANAlgorithm {
15 |
16 | def main(args: Array[String]): Unit = {
17 | Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
18 | Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.ERROR)
19 |
20 | val conf = new SparkConf().setAppName("PSCANAlgorithm")
21 | val sc = new SparkContext(conf)
22 | val edgeRdd = sc.textFile(args(0)).mapPartitions(lines => lines.map { line =>
23 | // val arr = line.split("\t")
24 | // Edge(arr(0).toLong, arr(1).toLong, 1)
25 | val arr = line.split(",")
26 | // Edge(arr(1).toLong, arr(2).toLong, 1)
27 | Edge(arr(1).toLong, arr(2).toLong, arr(3).toInt)
28 | })
29 | // val graph = GraphLoader.edgeListFile(sc, args(0), numEdgePartitions = 4)
30 |
31 | val graph = Graph.fromEdges(edgeRdd, 1, edgeStorageLevel = StorageLevel.MEMORY_AND_DISK_SER, vertexStorageLevel = StorageLevel.MEMORY_AND_DISK_SER)
32 | // Parameters: graph, epsilon threshold for PSCAN
33 | val pscanGraph = PSCAN.computeConnectedComponents(graph, 0.000001)
34 | // val lpaGraph = PSCAN.computeConnectedComponentsUsing(graph, args(2).toInt)
35 | val modularity = pscanGraph.modularity()
36 |
37 | println(modularity)
38 |
39 |
40 | pscanGraph.vertices.filter(k => k._1 != k._2).sortBy(x => x._2).mapPartitions(ls => ls.map(k => s"${k._1},${k._2}")).repartition(1).saveAsTextFile(args(1))
41 | sc.stop()
42 | }
43 |
44 |
45 | }
46 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/talk/types/City.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.talk.types
2 |
3 | import org.apache.spark.graphx.VertexId
4 |
5 | case class City(name: String, id: VertexId) {
6 | override def toString() = name + " [" + id + "]"
7 | }
8 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/talk/types/Person.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.talk.types
2 |
3 | case class Person(name: String, age: Int)
4 |
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/talk/types/VertexAttribute.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.talk.types
2 |
3 | case class VertexAttribute(cityName: String, distance: Double, path: List[City])
--------------------------------------------------------------------------------
/apply/src/main/scala/com/lakala/datacenter/utils/UtilsToos.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.utils
2 |
3 | import java.nio.charset.StandardCharsets
4 |
5 | import com.google.common.hash.Hashing
6 | import com.lakala.datacenter.common.utils.DateTimeUtils
7 |
8 | import scala.util.matching.Regex
9 |
10 | /**
11 | * Created by ASUS-PC on 2017/4/18.
12 | */
13 | object UtilsToos {
14 | /**
15 | * Generate a unique, deterministic long hash code from a string
16 | *
17 | * @param str
18 | * @return
19 | */
20 | def hashId(str: String) = {
21 | Hashing.md5().hashString(str, StandardCharsets.UTF_8).asLong()
22 | }
23 |
24 | /**
25 | * Validate mobile and landline phone numbers.
26 | *
27 | * @param num
28 | * @return true if validation passes
29 | */
30 | def isMobileOrPhone(num: String): Boolean = {
31 | val pattern = new Regex("^((17[0-9])|(14[0-9])|(13[0-9])|(15[^4,\\D])|(18[0,5-9]))\\d{8}$")
32 | val pattern2 = new Regex("(?:(\\(\\+?86\\))(0[0-9]{2,3}\\-?)?([2-9][0-9]{6,7})+(\\-[0-9]{1,4})?)|(?:(86-?)?(0[0-9]{2,3}\\-?)?([2-9][0-9]{6,7})+(\\-[0-9]{1,4})?)") // landline numbers with an area code
33 | // val pattern2 = new Regex("^[0][1-9]{2,3}-[0-9]{5,10}$") // landline numbers with an area code
34 | val pattern3 = new Regex("^[1-9]{1}[0-9]{5,8}$") // landline numbers without an area code
35 | num match {
36 | case pattern(_*) => {
37 | true
38 | }
39 | case pattern2(_*) => {
40 | true
41 | }
42 | case pattern3(_*) => {
43 | true
44 | }
45 | case _ => {
46 | false
47 | }
48 | }
49 | }
50 |
51 | def jugeInit(dataDt: String, sdt: String, edt: String): Boolean = {
52 | var init = false
53 | try {
54 | init = if (DateTimeUtils.parseDataString(dataDt).getMillis >= DateTimeUtils.parseDataString(sdt).getMillis
55 | && DateTimeUtils.parseDataString(dataDt).getMillis <= DateTimeUtils.parseDataString(edt).getMillis) true
56 | else false
57 | } catch {
58 | case e: Exception =>
59 | }
60 | init
61 | }
62 |
63 | def byDateFileterData(line: String, edt: String): Boolean = {
64 | var init = false
65 | try {
66 | val arr = line.split(",")
67 | val dt = if (arr(5).indexOf(".") > 0) arr(5).substring(0, arr(5).indexOf(".")) else arr(5)
68 | init = if (DateTimeUtils.parseDataString(dt).getMillis <= DateTimeUtils.parseDataString(edt).getMillis) true
69 | else false
70 | } catch {
71 | case e: Exception =>
72 | }
73 | init
74 | }
75 | }
76 |
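An illustrative usage sketch for the two helpers above (the sample strings are made up, and the sketch object is not part of the project): hashId yields a stable Long that can serve as a GraphX vertex id, and isMobileOrPhone screens the number formats covered by the three regular expressions.

import com.lakala.datacenter.utils.UtilsToos._

object UtilsToosSketch extends App {
  // hashId: deterministic MD5-based Long for an arbitrary string, usable as a vertex id.
  println(hashId("XNW28459058720408576"))

  // isMobileOrPhone: true for a well-formed mobile number, false for anything else.
  println(isMobileOrPhone("13912345678"))  // expected: true
  println(isMobileOrPhone("not-a-number")) // expected: false
}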
--------------------------------------------------------------------------------
/apply/src/main/scala/edu/gatech/cse8803/clustering/PowerIterationClustering.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * @author Ting Pan .
3 | */
4 |
5 | package edu.gatech.cse8803.clustering
6 |
7 | import org.apache.spark.rdd.RDD
8 | import org.apache.spark.mllib.clustering.{PowerIterationClustering => PIC}
9 | import org.apache.spark.mllib.clustering.PowerIterationClustering
10 |
11 |
12 | /**
13 | * Power Iteration Clustering (PIC), a scalable graph clustering algorithm developed by
14 | * [[http://www.icml2010.org/papers/387.pdf Lin and Cohen]]. From the abstract: PIC finds a very
15 | * low-dimensional embedding of a dataset using truncated power iteration on a normalized pair-wise
16 | * similarity matrix of the data.
17 | *
18 | * @see [[http://en.wikipedia.org/wiki/Spectral_clustering Spectral clustering (Wikipedia)]]
19 | */
20 |
21 | object PowerIterationClustering {
22 |
23 | /** run PIC using Spark's PowerIterationClustering implementation
24 | *
25 | * @param similarities all pairwise similarities, in the shape of RDD[(patientID1, patientID2, similarity)]
26 | * @return cluster assignment for each patient, in the shape of RDD[(patientID, cluster)]
27 | *
28 | * */
29 |
30 | def runPIC(similarities: RDD[(Long, Long, Double)]): RDD[(Long, Int)] = {
31 | val sc = similarities.sparkContext
32 |
33 |
34 | /** Run Spark's PIC implementation on the cached similarities */
35 | similarities.cache().count()
36 | val pic = new PowerIterationClustering().setK(3).setMaxIterations(100)
37 | val model=pic.run(similarities)
38 | val result = model.assignments.map(a => (a.id,a.cluster))
39 | //val check = result.map(x=>x.swap).groupByKey().map(x=>(x._1,x._2.size))
40 |
41 | //println("PIC: ")
42 | //println(check.foreach(println))
43 |
44 | result
45 | }
46 | }
47 |
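A minimal driver sketch for runPIC above; the similarity triples and the local master setting are invented purely to show the documented input and output shapes.

import edu.gatech.cse8803.clustering.PowerIterationClustering
import org.apache.spark.{SparkConf, SparkContext}

object RunPICSketch extends App {
  val sc = new SparkContext(new SparkConf().setAppName("RunPICSketch").setMaster("local[2]"))
  // (patientID1, patientID2, similarity) triples, the shape runPIC expects.
  val similarities = sc.parallelize(Seq(
    (1L, 2L, 0.9), (2L, 3L, 0.8), (1L, 3L, 0.7),
    (4L, 5L, 0.9), (5L, 6L, 0.8), (4L, 6L, 0.7)))
  // Returns RDD[(patientID, cluster)]; with k fixed to 3 inside runPIC, the assignments land in three clusters.
  PowerIterationClustering.runPIC(similarities).collect().foreach(println)
  sc.stop()
}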
--------------------------------------------------------------------------------
/apply/src/main/scala/edu/gatech/cse8803/ioutils/CSVUtils.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * @author Ting Pan .
3 | */
4 | package edu.gatech.cse8803.ioutils
5 |
6 | import org.apache.spark.sql.SchemaRDD
7 | import org.apache.spark.sql.SQLContext
8 | import com.databricks.spark.csv.CsvContext
9 |
10 |
11 | object CSVUtils {
12 | def loadCSVAsTable(sqlContext: SQLContext, path: String, tableName: String): SchemaRDD = {
13 | val data = sqlContext.csvFile(path)
14 | data.registerTempTable(tableName)
15 | data
16 | }
17 |
18 | def loadCSVAsTable(sqlContext: SQLContext, path: String): SchemaRDD = {
19 | loadCSVAsTable(sqlContext, path, inferTableNameFromPath(path))
20 | }
21 |
22 | private val pattern = "(\\w+)(\\.csv)?$".r.unanchored
23 | def inferTableNameFromPath(path: String) = path match {
24 | case pattern(filename, extension) => filename
25 | case _ => path
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/apply/src/main/scala/edu/gatech/cse8803/jaccard/Jaccard.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * *
3 | * @author: Ting Pan
4 | **/
5 | package edu.gatech.cse8803.jaccard
6 |
7 | import edu.gatech.cse8803.model._
8 | import edu.gatech.cse8803.model.{EdgeProperty, VertexProperty}
9 | import org.apache.spark.graphx._
10 | import org.apache.spark.rdd.RDD
11 |
12 | object Jaccard {
13 |
14 | def jaccardSimilarityOneVsAll(graph: Graph[VertexProperty, EdgeProperty], patientID: Long): List[Long] = {
15 | /**
16 | * Given a patient ID, compute the Jaccard similarity w.r.t. all other patients.
17 | * Return a List of patient IDs ordered by the highest to the lowest similarity.
18 | * For ties, random order is okay
19 | */
20 |
21 |
22 | val neighbors = graph.collectNeighborIds(EdgeDirection.Either).map(x => (x._1, x._2.filter(p => p > 1000))).filter(_._1 <= 1000)
23 | val neighbors_wo_patient = neighbors.filter(_._1 != patientID)
24 | val source = neighbors.filter(_._1 == patientID).map(_._2).collect.flatten.toSet
25 | val SimilarityOneVsAll = neighbors_wo_patient.map { case (vid, nbrs) => (vid, jaccard(source, nbrs.toSet)) }
26 | val result = SimilarityOneVsAll.sortBy(_._2, false).map(_._1).take(10).toList
27 | result
28 | }
29 |
30 | def jaccardSimilarityAllPatients(graph: Graph[VertexProperty, EdgeProperty]): RDD[(Long, Long, Double)] = {
31 | /**
32 | * Given a patient, med, diag, lab graph, calculate pairwise similarity between all
33 | * patients. Return an RDD of (patient-1-id, patient-2-id, similarity) where
34 | * patient-1-id < patient-2-id to avoid duplications
35 | */
36 | val neighbors = graph.collectNeighborIds(EdgeDirection.Either).map(x => (x._1, x._2.filter(p => p > 1000))).filter(_._1 <= 1000)
37 | val combinations = neighbors.cartesian(neighbors).filter { case (a, b) => a._1 < b._1 }
38 | val SimilarityAll = combinations.map { x => (x._1._1, x._2._1, jaccard(x._1._2.toSet, x._2._2.toSet)) }
39 | val result = SimilarityAll.map(x => (x._3, (x._1, x._2))).sortByKey(false, 1).map(x => (x._2._1, x._2._2, x._1))
40 | result
41 | }
42 |
43 | def jaccard[A](a: Set[A], b: Set[A]): Double = {
44 | /**
45 | * Helper function
46 | * *
47 | * Given two sets, compute their Jaccard similarity and return the result.
48 | * If the union part is zero, then return 0.
49 | */
50 |
51 |
52 | val union: Double = (a ++ b).size
53 | val intersect: Double = a.intersect(b).size
54 | return (if (union == 0) 0.0 else (intersect / union))
55 | }
56 |
57 | }
58 |
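A worked example of the jaccard helper above, with invented sets: a = {1, 2, 3} and b = {2, 3, 4} share 2 elements out of a union of 4, so the similarity is 2/4 = 0.5.

object JaccardSketch extends App {
  val a = Set(1L, 2L, 3L)
  val b = Set(2L, 3L, 4L)
  // intersection size 2, union size 4 => 2.0 / 4.0 == 0.5
  println(edu.gatech.cse8803.jaccard.Jaccard.jaccard(a, b)) // 0.5
}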
--------------------------------------------------------------------------------
/apply/src/main/scala/edu/gatech/cse8803/main/Main.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * @author Ting Pan .
3 | */
4 |
5 | package edu.gatech.cse8803.main
6 |
7 | import java.text.SimpleDateFormat
8 |
9 | import edu.gatech.cse8803.ioutils.CSVUtils
10 | import edu.gatech.cse8803.jaccard.Jaccard
11 | import edu.gatech.cse8803.model._
12 | import edu.gatech.cse8803.randomwalk.RandomWalk
13 | import edu.gatech.cse8803.clustering.PowerIterationClustering
14 | import org.apache.spark.rdd.RDD
15 |
16 | import org.apache.spark.sql.SQLContext
17 | import org.apache.spark.{SparkConf, SparkContext}
18 | import edu.gatech.cse8803.graphconstruct.GraphLoader
19 |
20 |
21 | object Main {
22 | def main(args: Array[String]) {
23 | import org.apache.log4j.Logger
24 | import org.apache.log4j.Level
25 |
26 | Logger.getLogger("org").setLevel(Level.WARN)
27 | Logger.getLogger("akka").setLevel(Level.WARN)
28 |
29 | val sc = createContext
30 | val sqlContext = new SQLContext(sc)
31 |
32 | /** initialize loading of data */
33 | val (patient, medication, labResult, diagnostic) = loadRddRawData(sqlContext)
34 | val patientGraph = GraphLoader.load(patient, labResult, medication, diagnostic)
35 |
36 | println(Jaccard.jaccardSimilarityOneVsAll(patientGraph, 9))
37 | println(RandomWalk.randomWalkOneVsAll(patientGraph, 9))
38 |
39 | val similarities = Jaccard.jaccardSimilarityAllPatients(patientGraph)
40 |
41 | val PICLabels = PowerIterationClustering.runPIC(similarities)
42 |
43 | sc.stop()
44 | }
45 |
46 | def loadRddRawData(sqlContext: SQLContext): (RDD[PatientProperty], RDD[Medication], RDD[LabResult], RDD[Diagnostic]) = {
47 |
48 | val dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssX")
49 | /** test data, must change back!! */
50 | List("data/PATIENT.csv", "data/LAB.csv", "data/DIAGNOSTIC.csv", "data/MEDICATION.csv")
51 | .foreach(CSVUtils.loadCSVAsTable(sqlContext, _))
52 |
53 | val patient = sqlContext.sql( // fix this
54 | """
55 | |SELECT subject_id, sex, dob, dod
56 | |FROM PATIENT
57 | """.stripMargin)
58 | .map(r => PatientProperty(r(0).toString, r(1).toString, r(2).toString, r(3).toString))
59 |
60 | val labResult = sqlContext.sql(
61 | """
62 | |SELECT subject_id, date, lab_name, value
63 | |FROM LAB
64 | |WHERE value IS NOT NULL and value <> ''
65 | """.stripMargin)
66 | .map(r => LabResult(r(0).toString, r(1).toString.toLong, r(2).toString, r(3).toString))
67 |
68 | val diagnostic = sqlContext.sql(
69 | """
70 | |SELECT subject_id, date, code, sequence
71 | |FROM DIAGNOSTIC
72 | """.stripMargin)
73 | .map(r => Diagnostic(r(0).toString, r(1).toString.toLong, r(2).toString, r(3).toString.toInt))
74 |
75 | val medication = sqlContext.sql(
76 | """
77 | |SELECT subject_id, date, med_name
78 | |FROM MEDICATION
79 | """.stripMargin)
80 | .map(r => Medication(r(0).toString, r(1).toString.toLong, r(2).toString))
81 |
82 | (patient, medication, labResult, diagnostic)
83 |
84 | }
85 |
86 |
87 | def createContext(appName: String, masterUrl: String): SparkContext = {
88 | val conf = new SparkConf().setAppName(appName).setMaster(masterUrl)
89 | new SparkContext(conf)
90 | }
91 |
92 | def createContext(appName: String): SparkContext = createContext(appName, "local")
93 |
94 | def createContext: SparkContext = createContext("CSE 8803 Homework Three Application", "local")
95 | }
96 |
--------------------------------------------------------------------------------
/apply/src/main/scala/edu/gatech/cse8803/model/models.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * @author Ting Pan .
3 | */
4 |
5 | package edu.gatech.cse8803.model
6 |
7 | case class LabResult(patientID: String, date: Long, labName: String, value: String)
8 |
9 | case class Diagnostic(patientID: String, date: Long, icd9code: String, sequence: Int)
10 |
11 | case class Medication(patientID: String, date: Long, medicine: String)
12 |
13 | abstract class VertexProperty
14 |
15 | case class PatientProperty(patientID: String, sex: String, dob: String, dod: String) extends VertexProperty
16 |
17 | case class LabResultProperty(testName: String) extends VertexProperty
18 |
19 | case class DiagnosticProperty(icd9code: String) extends VertexProperty
20 |
21 | case class MedicationProperty(medicine: String) extends VertexProperty
22 |
23 | abstract class EdgeProperty
24 |
25 | case class SampleEdgeProperty(name: String = "Sample") extends EdgeProperty
26 |
27 | case class PatientLabEdgeProperty(labResult: LabResult) extends EdgeProperty
28 |
29 | case class PatientDiagnosticEdgeProperty(diagnostic: Diagnostic) extends EdgeProperty
30 |
31 | case class PatientMedicationEdgeProperty(medication: Medication) extends EdgeProperty
32 |
33 |
--------------------------------------------------------------------------------
/apply/src/main/scala/edu/gatech/cse8803/randomwalk/randomwalk.scala:
--------------------------------------------------------------------------------
1 | package edu.gatech.cse8803.randomwalk
2 |
3 | import edu.gatech.cse8803.model.{PatientProperty, EdgeProperty, VertexProperty}
4 | import org.apache.spark.graphx._
5 |
6 | object RandomWalk {
7 |
8 | def randomWalkOneVsAll(graph: Graph[VertexProperty, EdgeProperty], patientID: Long, numIter: Int = 100, alpha: Double = 0.15): List[Long] = {
9 | /**
10 | * Given a patient ID, compute the random walk probability w.r.t. all other patients.
11 | * Return a List of patient IDs ordered by the highest to the lowest similarity.
12 | * For ties, random order is okay
13 | */
14 |
15 | val patient = graph.vertices.filter(_._2.isInstanceOf[PatientProperty])
16 | val patient_count = patient.keys.max()
17 |
18 | val personalized = true
19 | val src: VertexId = patientID
20 |
21 | var rankGraph: Graph[Double, Double] = graph
22 | // Associate the degree with each vertex
23 | .outerJoinVertices(graph.outDegrees) { (vid, vdata, deg) => deg.getOrElse(0) }
24 | // Set the weight on the edges based on the degree
25 | .mapTriplets( e => 1.0 / e.srcAttr, TripletFields.Src )
26 | // Set the vertex attributes to the initial pagerank values
27 | .mapVertices { (id, attr) =>
28 | if (!(id != src && personalized)) alpha else 0.0
29 | }
30 |
31 | def delta(u: VertexId, v: VertexId): Double = { if (u == v) 1.0 else 0.0 }
32 |
33 | var iteration = 0
34 | var prevRankGraph: Graph[Double, Double] = null
35 | while (iteration < numIter) {
36 | rankGraph.cache()
37 |
38 | // Compute the outgoing rank contributions of each vertex, perform local preaggregation, and
39 | // do the final aggregation at the receiving vertices. Requires a shuffle for aggregation.
40 | val rankUpdates = rankGraph.aggregateMessages[Double](
41 | ctx => ctx.sendToDst(ctx.srcAttr * ctx.attr), _ + _, TripletFields.Src)
42 |
43 | // Apply the final rank updates to get the new ranks, using join to preserve ranks of vertices
44 | // that didn't receive a message. Requires a shuffle for broadcasting updated ranks to the
45 | // edge partitions.
46 | prevRankGraph = rankGraph
47 | // new update rule
48 | //PR[i] = (1 - alpha) * inNbrs[i].map(j => oldPR[j] / outDeg[j]).sum (if i not start node)
49 | //PR[i] = alpha + (1 - alpha) * inNbrs[i].map(j => oldPR[j] / outDeg[j]).sum (if i is start node)
50 | val rPrb = {
51 | (src: VertexId, id: VertexId) => alpha * delta(src, id)
52 | }
53 | rankGraph = rankGraph.joinVertices(rankUpdates) {
54 | (id, oldRank, msgSum) => rPrb(src, id) + (1.0 - alpha) * msgSum
55 | }.cache()
56 |
57 | rankGraph.edges.foreachPartition(x => {}) // also materializes rankGraph.vertices
58 | // logInfo(s"PageRank finished iteration $iteration.")
59 | prevRankGraph.vertices.unpersist(false)
60 | prevRankGraph.edges.unpersist(false)
61 |
62 | /** println("iteration: "+iteration)
63 | println()
64 | println(rankGraph.vertices.filter(_._1<=1000).filter( _._1!=patientID).sortBy(_._2,false).take(15).foreach(println))*/
65 | iteration += 1
66 | }
67 |
68 | val result = rankGraph.vertices.filter(_._1<=1000).filter( _._1!=patientID).sortBy(_._2,false).map(_._1).take(10).toList
69 | result
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/apply/src/test/scala/CollectionUtil.scala:
--------------------------------------------------------------------------------
1 | import scala.collection.mutable.ArrayBuffer
2 | import scala.reflect.ClassTag
3 |
4 | /**
5 | * Created by liuchen on 2017/8/10.
6 | * Description:
7 | */
8 | object CollectionUtil {
9 |
10 | /**
11 | * Adds reduceByKey-style methods to a collection of (K, V) pairs
12 | *
13 | * @param collection
14 | * @param kt
15 | * @param vt
16 | * @tparam K
17 | * @tparam V
18 | */
19 | implicit class CollectionHelper[K, V](collection: ArrayBuffer[(K, V)])(implicit kt: ClassTag[K], vt: ClassTag[V]) {
20 | def reduceByKeyMy(f: (V, V) => V): Traversable[(K, V)] = {
21 | val group: Map[K, ArrayBuffer[(K, V)]] = collection.groupBy(_._1)
22 | group.map(x => x._2.reduce((a, b) => (a._1, f(a._2, b._2))))
23 | }
24 |
25 |
26 | /**
27 | * Performs reduceByKey and also returns the collection of elements that were reduced away
28 | *
29 | * @param f
30 | * @return
31 | */
32 | def reduceByKeyWithReduced(f: (V, V) => V)(implicit kt: ClassTag[K], vt: ClassTag[V]): (Traversable[(K, V)], Traversable[(K, V)]) = {
33 | val reduced: ArrayBuffer[(K, V)] = ArrayBuffer()
34 | val newSeq = collection.groupBy(_._1).map {
35 | case (_: K, values: Traversable[(K, V)]) => values.reduce((a, b) => {
36 | val newValue: V = f(a._2, b._2)
37 | val reducedValue: V = if (newValue == a._2) b._2 else a._2
38 | val reducedPair: (K, V) = (a._1, reducedValue)
39 | reduced += reducedPair
40 | (a._1, newValue)
41 | })
42 | }
43 | (newSeq, reduced.toTraversable)
44 | }
45 | }
46 |
47 | }
48 |
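A small usage sketch for the implicit helper above (the sample pairs are invented): reduceByKeyMy folds values per key on a plain ArrayBuffer without a SparkContext, and reduceByKeyWithReduced additionally returns the elements that were folded away.

import scala.collection.mutable.ArrayBuffer
import CollectionUtil._

object CollectionUtilSketch extends App {
  val pairs = ArrayBuffer(("a", 1), ("a", 2), ("b", 3))
  // Sum per key: the result contains ("a", 3) and ("b", 3) (iteration order is not guaranteed).
  println(pairs.reduceByKeyMy(_ + _))
  // kept = the reduced pairs; reduced = the values folded away, here ("a", 1).
  val (kept, reduced) = pairs.reduceByKeyWithReduced(_ + _)
  println(kept)
  println(reduced)
}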
--------------------------------------------------------------------------------
/apply/src/test/scala/CreateApplyData.scala:
--------------------------------------------------------------------------------
1 | import org.apache.commons.lang3.StringUtils
2 | import org.apache.spark.{SparkConf, SparkContext}
3 | import com.lakala.datacenter.utils.UtilsToos._
4 | import scala.util.Random
5 |
6 | /**
7 | * Created by ASUS-PC on 2017/4/18.
8 | */
9 | object CreateApplyData {
10 |
11 | def main(args: Array[String]): Unit = {
12 | val conf = new SparkConf().setMaster("local[2]").setAppName("CreateApplyData")
13 | val sc = new SparkContext(conf)
14 | val callLine = sc.textFile("file:///F:/lakalaFinance_workspaces/applogs/000000_0")
15 | val applyLine = sc.textFile("file:///F:/lakalaFinance_workspaces/applogs/query_result.csv").filter(line => (!line.startsWith("s_c_loan_apply")))
16 | val call = callLine.mapPartitions { lines =>
17 | lines.map { line =>
18 | var arr = line.split("\u0001")
19 | (if (StringUtils.isNotBlank(arr(4)) && isMobileOrPhone(arr(4))) arr(4) else "", if (StringUtils.isNotBlank(arr(6)) && isMobileOrPhone(arr(6))) arr(6) else "")
20 | }
21 | }
22 | val list = call.filter(k => StringUtils.isNotBlank(k._1)).map(k => k._1.toLong).union(call.filter(k => StringUtils.isNotBlank(k._2)).map(k => k._2.toLong)).collect().toSet.toList
23 | println("mobile ************************")
24 | list.sorted.foreach(println)
25 | println("mobile ************************")
26 | val ac = sc.broadcast(list)
27 | applyLine.mapPartitions {
28 | val list: List[Long] = ac.value
29 | val seed: Int = list.size
30 | lines => lines.map {
31 | line =>
32 | var arr = line.split(",")
33 | val index = getIndex(seed)
34 | val s = if (StringUtils.isBlank(arr(41)) || "null".equals(arr(41).toLowerCase)) "," + list(index) + ","
35 | else if (StringUtils.isNotBlank(arr(41)) && !isMobileOrPhone(arr(41))) "," + list(index) + ","
36 | else "," + arr(41) + ","
37 | s"${arr.slice(0, 41).mkString(",")}$s${arr.slice(42, arr.length).mkString(",")}"
38 | }
39 | }.repartition(1).saveAsTextFile("file:///F:/lakalaFinance_workspaces/applogs2/query_result.csv")
40 | }
41 |
42 | def getIndex(seed: Int): Int = {
43 | val rand = new Random()
44 | rand.nextInt(seed)
45 | }
46 |
47 | }
48 |
--------------------------------------------------------------------------------
/apply/src/test/scala/CreateApplyData2.scala:
--------------------------------------------------------------------------------
1 |
2 | import org.apache.commons.lang3.StringUtils
3 | import org.apache.spark.{SparkConf, SparkContext}
4 | import com.lakala.datacenter.utils.UtilsToos._
5 | import scala.util.Random
6 | /**
7 | * Created by ASUS-PC on 2017/4/18.
8 | */
9 | object CreateApplyData2 {
10 |
11 | def main(args: Array[String]): Unit = {
12 | val conf = new SparkConf().setMaster("local[2]").setAppName("CreateApplyData2")
13 | val sc = new SparkContext(conf)
14 | val callLine = sc.textFile("file:///F:/lakalaFinance_workspaces/applogs/000000_0")
15 | val call = callLine.mapPartitions { lines =>
16 | lines.map { line =>
17 | var arr = line.split("\u0001")
18 | (s"${if (StringUtils.isNotBlank(arr(4)) && isMobileOrPhone(arr(4))) arr(4) else "0"},${if (StringUtils.isNotBlank(arr(6)) && isMobileOrPhone(arr(6))) arr(6) else "0"}")
19 | }
20 | }
21 | call.distinct().repartition(1).saveAsTextFile("file:///F:/lakalaFinance_workspaces/applogs3/query_result.csv")
22 | }
23 |
24 | def getIndex(seed: Int): Int = {
25 | val rand = new Random()
26 | rand.nextInt(seed)
27 | }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/apply/src/test/scala/EdgeTuplesTest.scala:
--------------------------------------------------------------------------------
1 | import org.apache.log4j.{Level, Logger}
2 | import org.apache.spark.graphx._
3 | import org.apache.spark.{SparkConf, SparkContext}
4 | import utils.GraphNdegUtil2
5 |
6 | /**
7 | * Created by ASUS-PC on 2017/4/19.
8 | */
9 | object EdgeTuplesTest {
10 | def main(args: Array[String]): Unit = {
11 | val conf = new SparkConf().setMaster("local[2]").setAppName("CreateApplyData")
12 | val sc = new SparkContext(conf)
13 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
14 | Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
15 | // val orderMobile = GraphLoader.edgeListFile(sc, "file:///F:/lakalaFinance_workspaces/graphx-analysis/apply/data3/part-00003")
16 | val orderMobile = GraphLoader.edgeListFile(sc, "file:///F:/lakalaFinance_workspaces/graphx-analysis/apply/data/friends.txt")
17 | // val orderMobile = sc.textFile("file:///F:/lakalaFinance_workspaces/graphx-analysis/apply/data3/part-00000")
18 | // val edgeTuple = orderMobile.mapPartitions { lines =>
19 | // lines.map { line =>
20 | // val arr = line.split(",")
21 | // (arr(0).toLong, arr(1).toLong)
22 | // }
23 | // }
24 |
25 | val validGraph = orderMobile.subgraph(k => k.srcId != 0 && k.dstId != 0)
26 | // val choiceRdd = sc.parallelize(Seq(18028726374L, 18692892122L, 13761981426L))
27 | val choiceRdd = sc.parallelize(Seq(6L))
28 |
29 | val rss: VertexRDD[Map[Int, Set[VertexId]]] = GraphNdegUtil2.aggNdegreedVertices(validGraph, choiceRdd, 3)
30 | println("00000++++++0000000")
31 | rss.foreach { k =>
32 | println(s"${k._1} ${k._2.map(kk => s"${kk._1}->${kk._2.mkString(",")}").mkString("; ")}")
33 | }
34 |
35 | // val applyLine = sc.textFile("file:///F:/lakalaFinance_workspaces/applogs/query_result.csv").filter(line => (!line.startsWith("s_c_loan_apply")))
36 | // val rs = applyLine.mapPartitions { lines =>
37 | // lines.map { line =>
38 | // val arr = line.split(",")
39 | // val term_id = if (StringUtils.isNotBlank(arr(7)) && !"null".equals(arr(7).toLowerCase)) arr(7) else "OL"
40 | // val return_pan = if (StringUtils.isNotBlank(arr(16)) && !"null".equals(arr(16).toLowerCase)) arr(16) else "0L"
41 | // val empmobile = if (StringUtils.isNotBlank(arr(41)) && !"null".equals(arr(41).toLowerCase)) arr(41) else "0L"
42 | // ((s"${hashId(arr(1))},${hashId(term_id)}"), (s"${hashId(arr(1))},${hashId(return_pan)}"), (s"${hashId(arr(1))},${empmobile}"))
43 | // }
44 | // }
45 | // val edge1: RDD[String] = rs.map(ve => ve._1).filter(k => !k.endsWith("," + hashId("0L")))
46 | // val edge2: RDD[String] = rs.map(ve => ve._2).filter(k => !k.endsWith("," + hashId("0L")))
47 | // val edge3: RDD[String] = rs.map(ve => ve._3).filter(k => !k.endsWith(",0L"))
48 | // edge1.union(edge2).union(edge3).repartition(1).saveAsTextFile("file:///F:/lakalaFinance_workspaces/graphx-analysis/apply/data3")
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/apply/src/test/scala/GraphxBSP3.scala:
--------------------------------------------------------------------------------
1 | /*
2 | import org.apache.commons.lang3.StringUtils
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | /**
6 | * Created by Administrator on 2017/6/16 0016.
7 | */
8 | object GraphxBSP3 {
9 | def main(args: Array[String]): Unit = {
10 | @transient
11 | val conf = new SparkConf().setAppName("GraphxBSP").setMaster("local[4]")
12 | @transient
13 | val sc = new SparkContext(conf)
14 | //orderId,contractNo,termId,loanPan,returnPan,insertTime,recommend,userId,
15 | // deviceId
16 | //certNo,email,company,mobile,compAddr,compPhone,emergencyContactMobile,contactMobile,ipv4,msgphone,telecode
17 | val edgeRDD = sc.textFile("F:\\graphx-analysis\\apply\\bin\\test.csv").mapPartitions(lines => lines.map { line =>
18 | val fields = line.split(",")
19 | val kv = if (StringUtils.isNoneEmpty(fields(2))) {
20 | (fields(2), 1)
21 | } else
22 | ("0", 0)
23 | kv
24 | }).reduceByKey(_ + _).filter(_._2 > 2)
25 | edgeRDD.foreach(kv=>println(kv._1+" === "+kv._2))
26 |
27 | }
28 |
29 | }
30 | */
31 |
--------------------------------------------------------------------------------
/apply/src/test/scala/Median.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Created by linyanshi on 2017/8/19 0019.
3 | */
4 |
5 | import org.apache.spark.{SparkConf, SparkContext}
6 |
7 | object Median {
8 | def main(args: Array[String]) {
9 | val conf = new SparkConf().setAppName("Median")
10 | val spark = new SparkContext(conf)
11 | val data = spark.textFile("data")
12 | /* Logically split the data into buckets of width 1000 (the bucket width/count can be tuned) and count how many values fall into each bucket. */
13 | val mappeddata = data.map(num => (num.toInt / 1000, num.toInt))
14 |
15 | val counts: Array[(Int, Int)] = mappeddata
16 | .map(kv => (kv._1, 1))
17 | .reduceByKey(_ + _)
18 | .collect()
19 | .sortBy(_._1)
20 |
21 | /* Accumulate the bucket counts from the lowest bucket upwards to find which bucket the median falls into, and the median's offset inside that bucket. */
22 | val total = counts.map(_._2).sum
23 | val mid = total / 2
24 |
25 | var cumulative = 0
26 | var index = 0
27 | var found = false
28 | for (i <- counts.indices if !found) {
29 | cumulative += counts(i)._2
30 | if (cumulative > mid) {
31 | index = i
32 | found = true
33 | }
34 | }
35 | /* Zero-based offset of the median inside its bucket. */
36 | val offset = mid - (cumulative - counts(index)._2)
37 | /* Take the (offset + 1) smallest values in the median's bucket; the last of them is the median. */
38 | val result = mappeddata.filter(_._1 == counts(index)._1).map(_._2).takeOrdered(offset + 1)
39 | println("Median is " + result.last)
40 | spark.stop()
41 | }
42 | }
43 |
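A worked check of the bucket arithmetic used above, with invented counts: for bucket counts 300, 250 and 450 the total is 1000, mid = 500, the cumulative sums 300, 550, 1000 first exceed 500 at bucket 1, and the median sits at zero-based offset 500 - 300 = 200 inside that bucket.

object MedianBucketSketch extends App {
  val counts = Array((0, 300), (1, 250), (2, 450))          // (bucket, count), sorted by bucket
  val mid = counts.map(_._2).sum / 2                        // 500
  val cumulative = counts.scanLeft(0)(_ + _._2).tail        // Array(300, 550, 1000)
  val index = cumulative.indexWhere(_ > mid)                // 1 -> the bucket holding the median
  val offset = mid - (cumulative(index) - counts(index)._2) // 200 -> zero-based rank inside that bucket
  println(s"median bucket=${counts(index)._1}, offset=$offset")
}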
--------------------------------------------------------------------------------
/apply/src/test/scala/NDegreeResult.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.{SparkConf, SparkContext}
2 | import org.apache.spark.graphx._
3 |
4 | /**
5 | * Created by lys on 2017/4/23.
6 | */
7 | object NDegreeResult {
8 | def main(args: Array[String]): Unit = {
9 | val conf = new SparkConf()
10 | conf.setMaster("local[2]")
11 | conf.setAppName("DTWWW")
12 | val sc = new SparkContext(conf);
13 | val edge = List( // edge data
14 | (1, 2), (1, 3), (2, 3), (3, 4), (3, 5), (3, 6),
15 | (4, 5), (5, 6), (7, 8), (7, 9), (8, 9),(2,11),(6,11),(2,12),(6,12))
16 | // build the edge RDD
17 | val edgeRdd = sc.parallelize(edge).map(x => {
18 | Edge(x._1.toLong, x._2.toLong, None)
19 | })
20 | // build the graph; vertex attributes are Int
21 | val g = Graph.fromEdges(edgeRdd, 0)
22 | // gives an idea of how many "super nodes" the graph has, how large they are, and the overall degree distribution
23 | g.degrees.collect.foreach(println(_))
24 | // Two passes are used: at initialization every vertex sets its own time-to-live to 2;
25 | // in the first pass it propagates its own ID with a time-to-live of 1 (2-1) to its neighbours;
26 | // in the second pass the neighbours that received the message forward it once more with a time-to-live of 0;
27 | // in the final aggregation, counting only the IDs carried with time-to-live 0 yields the two-hop neighbours
28 |
29 |
30 | type VMap = Map[VertexId, Int]
31 |
32 | /**
33 | * Vertex update: simply the union of the two maps
34 | */
35 | def vprog(vid: VertexId, vdata: VMap, message: VMap): Map[VertexId, Int] = addMaps(vdata, message)
36 |
37 | /**
38 | * Send messages
39 | */
40 | def sendMsg(e: EdgeTriplet[VMap, _]) = {
41 | // take the difference of the two key sets, then decrement the time-to-live by 1
42 | val srcMap:Map[VertexId, Int] = (e.dstAttr.keySet -- e.srcAttr.keySet).map { k => k -> (e.dstAttr(k) - 1) }.toMap
43 | val dstMap:Map[VertexId, Int] = (e.srcAttr.keySet -- e.dstAttr.keySet).map { k => k -> (e.srcAttr(k) - 1) }.toMap
44 |
45 | if (srcMap.size == 0 && dstMap.size == 0)
46 | Iterator.empty
47 | else
48 | Iterator((e.dstId, dstMap), (e.srcId, srcMap))
49 | }
50 |
51 | /**
52 | * Merge messages
53 | */
54 | def addMaps(spmap1: VMap, spmap2: VMap): VMap =
55 | (spmap1.keySet ++ spmap2.keySet).map {
56 | k => k -> math.min(spmap1.getOrElse(k, Int.MaxValue), spmap2.getOrElse(k, Int.MaxValue))
57 | }.toMap
58 |
59 | val two = 2 // two-hop neighbours, so a value of 2 is enough
60 | val newG = g.mapVertices((vid, _) => Map[VertexId, Int](vid -> two))
61 | .pregel(Map[VertexId, Int](), two, EdgeDirection.Out)(vprog, sendMsg, addMaps)
62 |
63 | // the per-vertex data after the two passes:
64 | newG.vertices.collect().foreach(println(_))
65 | // (4,Map(5 -> 1, 1 -> 0, 6 -> 0, 2 -> 0, 3 -> 1, 4 -> 2))
66 | // (6,Map(5 -> 1, 1 -> 0, 6 -> 2, 2 -> 0, 3 -> 1, 4 -> 0))
67 | // (8,Map(8 -> 2, 7 -> 1, 9 -> 1))
68 | // (2,Map(5 -> 0, 1 -> 1, 6 -> 0, 2 -> 2, 3 -> 1, 4 -> 0))
69 | // (1,Map(5 -> 0, 1 -> 2, 6 -> 0, 2 -> 1, 3 -> 1, 4 -> 0))
70 | // (3,Map(5 -> 1, 1 -> 1, 6 -> 1, 2 -> 1, 3 -> 2, 4 -> 1))
71 | // (7,Map(7 -> 2, 8 -> 1, 9 -> 1))
72 | // (9,Map(9 -> 2, 7 -> 1, 8 -> 1))
73 | // (5,Map(5 -> 2, 1 -> 0, 6 -> 1, 2 -> 0, 3 -> 1, 4 -> 1))
74 | // In the Map, each key is a nearby vertex id and its value is that vertex's remaining time-to-live, so one more mapValues over this RDD gives the final two-hop neighbours
75 | // filter for the two-hop neighbours, i.e. the entries whose value == 0
76 | val twoJumpFirends = newG.vertices
77 | .mapValues(_.filter(_._2 == 0).keys)
78 |
79 | twoJumpFirends.collect().foreach(println(_))
80 | // (4,Set(1, 6, 2))
81 | // (6,Set(1, 2, 4))
82 | // (8,Set())
83 | // (2,Set(5, 6, 4))
84 | // (1,Set(5, 6, 4))
85 | // (3,Set())
86 | // (7,Set())
87 | // (9,Set())
88 | // (5,Set(1, 2))
89 |
90 | }
91 | }
92 |
--------------------------------------------------------------------------------
/apply/src/test/scala/NumOnce.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Created by linyanshi on 2017/8/19 0019.
3 | */
4 | import org.apache.spark.{SparkConf, SparkContext}
5 | import org.apache.spark.SparkContext._
6 |
7 | object NumOnce {
8 | // XOR all the IDs in the list together; the value that remains is the ID we are looking for. First XOR the data
9 | // inside each partition, then XOR the per-partition results together.
10 | def computeOneNum(args:Array[String]) {
11 | val conf = new SparkConf().setAppName("NumOnce").setMaster("local[1]")
12 | val spark = new SparkContext(conf)
13 | val data = spark.textFile("data")
14 | /* Each partition XORs its own data; in the final reduceByKey stage the per-partition XOR results are merged with another XOR.
15 | Numbers that appear an even number of times XOR to 0; a number that appears an odd number of times is left as itself. */
16 | val result = data.mapPartitions(iter => {
17 | var temp = iter.next().toInt
18 | while(iter.hasNext) {
19 | temp = temp^(iter.next()).toInt
20 | }
21 | Seq((1, temp)).iterator
22 | }).reduceByKey(_^_).collect()
23 | println("The number appearing once is: " + result(0)._2)
24 | }
25 | }
26 |
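A minimal local illustration of the XOR identity the comments rely on (the numbers are invented): values occurring an even number of times cancel to 0, so the fold leaves only the value that occurs once.

object NumOnceSketch extends App {
  val data = Seq(3, 5, 3, 7, 5, 7, 9)
  // 3^3 = 0, 5^5 = 0, 7^7 = 0, so only 9 survives the fold.
  println(data.reduce(_ ^ _)) // 9
}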
--------------------------------------------------------------------------------
/apply/src/test/scala/ParsesTest.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.{SparkConf, SparkContext}
2 | import org.apache.spark.sql.SQLContext
3 |
4 | /**
5 | * Created by Administrator on 2017/8/4 0004.
6 | */
7 | object ParsesTest {
8 | // case class Data(index: String, title: String, content: String)
9 | // def main(args: Array[String]): Unit = {
10 | // val conf = new SparkConf().setAppName("WordCount").setMaster("local")
11 | // val sc = new SparkContext(conf)
12 | // val input = sc.textFile("F:\\out\\output")
13 | // // wholeTextFiles returns an RDD[(String, String)]
14 | // val result = input.map{line=>
15 | // val reader = new CSVReader(new StringReader(line));
16 | // reader.readAll().map(x => Data(x(0), x(1), x(2)))
17 | // }
18 | // for(res <- result){
19 | // println(res)
20 | // }
21 | // }
22 | def main(args: Array[String]): Unit = {
23 | val conf = new SparkConf().setAppName("ParsesTest").setMaster("local")
24 | val sc = new SparkContext(conf)
25 | val sqlContext = new SQLContext(sc)
26 | val df = sqlContext.load("com.databricks.spark.csv", Map("path" -> "F:\\out\\output\\*", "header" -> "true"))
27 | df.select("index", "title").foreach(row=>println(row.get(0)))
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/apply/src/test/scala/TestCSV.scala:
--------------------------------------------------------------------------------
1 | import com.lakala.datacenter.core.utils.UtilsToos
2 |
3 | import scala.collection.mutable.ArrayBuffer
4 | import scala.util.matching.Regex
5 |
6 | /**
7 | * Created by Administrator on 2017/8/4 0004.
8 | */
9 | object TestCSV {
10 |
11 | case class Data(index: String, title: String, content: String)
12 |
13 | val arr = Array(4)
14 |
15 | def main(args: Array[String]) {
16 | val value ="1472100411047"
17 | val pattern = new Regex("[0-9]{1,}")
18 | if(pattern.pattern.matcher(value).matches())
19 | println(value.toLong)
20 | }
21 |
22 | private def splitSpecificDelimiterData(line: String): String = {
23 | val context = new StringBuffer()
24 | val haveSplitAtt = line.split(",")
25 |
26 | val oneSplitAtt = haveSplitAtt(1).split("\\|")
27 | for (i <- 0 until (oneSplitAtt.length)) {
28 | if (arr(0) == 4) {
29 | val secondSplitAtt = haveSplitAtt(3).split("\\|")
30 | for (j <- 0 until (secondSplitAtt.length)) {
31 | if (j == secondSplitAtt.length - 1)
32 | context.append(s"${haveSplitAtt(0)},${oneSplitAtt(i)},${haveSplitAtt(2)},${secondSplitAtt(j)},${haveSplitAtt(haveSplitAtt.size - 1)}")
33 | else
34 | context.append(s"${haveSplitAtt(0)},${oneSplitAtt(i)},${haveSplitAtt(2)},${secondSplitAtt(j)},${haveSplitAtt(haveSplitAtt.size - 1)}\n")
35 | }
36 | } else {
37 | if (i == oneSplitAtt.length - 1)
38 | context.append(s"${haveSplitAtt(0)},${oneSplitAtt(i)},${haveSplitAtt(haveSplitAtt.size - 1)}")
39 | else
40 | context.append(s"${haveSplitAtt(0)},${oneSplitAtt(i)},${haveSplitAtt(haveSplitAtt.size - 1)}\n")
41 | }
42 | }
43 | context.toString
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/apply/src/test/scala/TestRunGraphx.scala:
--------------------------------------------------------------------------------
1 | import com.lakala.datacenter.core.utils.UtilsToos.hashId
2 | import org.apache.spark.graphx.Edge
3 |
4 | import scala.collection.mutable.ListBuffer
5 |
6 | /**
7 | * Created by ASUS-PC on 2017/4/12.
8 | */
9 | object TestRunGraphx {
10 | def main(args: Array[String]): Unit = {
11 | // RunLoadApplyGraphx2.main(Array())
12 | // com.lakala.datacenter.main.Driver.main(args)
13 | // val s ="15397661996->XNW28459058720408576"
14 | // val ss = "13666199888->XNW28459058720408576"
15 | // s.substring(0,s.indexOf("->"))
16 | // println(s.substring(0,s.indexOf("->"))+"##"+ss.substring(0,ss.indexOf("->")))
17 | val arry= args(0).split(",")
18 | val edge =new EdgeArr("001","4334","7","0")
19 | if(judgSendMsg(arry,edge)) println("=========")
20 | }
21 | def judgSendMsg(sendType: Array[String], edge: EdgeArr): Boolean = {
22 | var flag = false
23 | for (stype <- sendType) if (edge.srcType.equals(stype)) flag = true
24 | flag
25 | }
26 |
27 | // var messages = g.mapReduceTriplets(sendMsg,mergeMsg);
28 | // print("messages:"+messages.take(10).mkString("\n"))
29 | // var activeMessages = messages.count();
30 | // //LOAD
31 | // var prevG:Graph[VD,ED] = null
32 | // var i = 0;
33 | // while(activeMessages > 0 && i < maxIterations){
34 | // // (3) Receive the messages. Vertices that didn't get any message do not appear in newVerts.
35 | // // inner join; the result is a VertexRDD (see the debug output below)
36 | // val newVerts = g.vertices.innerJoin(messages)(vprog).cache();
37 | // print("newVerts:"+newVerts.take(10).mkString("\n"))
38 | // // (4) update the graph with the new vertices.
39 | // prevG = g; // back up the old graph first, so the graph can be updated and the old one unpersisted later
40 | // // (4) outer join; returns the fully updated graph
41 | // g = g.outerJoinVertices(newVerts){(vid,old,newOpt) => newOpt.getOrElse(old)} // getOrElse: if newOpt is present return it, otherwise keep old
42 | // print(g.vertices.take(10).mkString("\n"))
43 | // g.cache(); // cache the new graph for the next iteration
44 | //
45 | // val oldMessages = messages; // back up, same idea as prevG = g
46 | // // Send new messages. Vertices that didn't get any message do not appear in newVerts, so
47 | // // don't send messages. We must cache messages so it can be materialized on the next line,
48 | // // allowing us to uncache the previous iteration.
49 | // // (5) the new messages to send in the next iteration; cache them first
50 | // messages = g.mapReduceTriplets(sendMsg,mergeMsg,Some((newVerts,activeDirection))).cache()
51 | // print("messages to send in the next iteration:"+messages.take(10).mkString("\n"))
52 | // activeMessages = messages.count(); // (6)
53 | // print("number of messages to send in the next iteration:"+ activeMessages) // if activeMessages == 0, the iteration ends
54 | // logInfo("Pregel finished iteration" + i);
55 | // // the old messages and graph are no longer needed; unpersist them
56 | // oldMessages.unpersist(blocking= false);
57 | // newVerts.unpersist(blocking=false) // after unpersist they can no longer be used
58 | // prevG.unpersistVertices(blocking=false)
59 | // prevG.edges.unpersist(blocking=false)
60 | // i += 1;
61 | // }
62 | // g // return the final graph
63 | //}
64 | //
65 | //}
66 |
67 |
68 |
69 | // val conf = if (ctx.isLocals) new Configuration else ctx.getSparkContext.hadoopConfiguration
70 | // val hdfsPath: String = hdfsMasterPath + path
71 | // rdd.saveAsTextFile(hdfsPath)
72 | // hiveCT.sql(s"ALTER TABLE $tableName DROP PARTITION(execute_dt='$date', project_id='$project')")
73 | // hiveCT.sql(s"ALTER TABLE $tableName SET FILEFORMAT TEXTFILE")
74 | // hiveCT.sql(s"LOAD DATA INPATH '$hdfsPath/part-*' OVERWRITE INTO TABLE $tableName PARTITION (execute_dt='$date', project_id='$project')")
75 | // hiveCT.sql(s"ALTER TABLE $tableName SET FILEFORMAT RCFILE")
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 | }
86 |
--------------------------------------------------------------------------------
/apply/src/test/scala/TrustRank.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.Logging
2 | import org.apache.spark.graphx._
3 |
4 | import scala.reflect.ClassTag
5 | import scala.util.Random
6 |
7 | /**
8 | * Created by linyanshi on 2017/9/19 0019.
9 | */
10 | object TrustRank extends Logging {
11 |
12 | /*
13 | * VD : (double, double) denotes rank and score
14 | * ED : double , not used
15 | */
16 | def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], numIter: Int): Long = {
17 | val resetProb: Double = 0.15
18 | val resetRank: Double = 0.15
19 |
20 | def resetScore: Double = Random.nextDouble()
21 |
22 |
23 | var rankGraph: Graph[Double, Double] = graph
24 | .outerJoinVertices(graph.outDegrees) { (vid, vd, deg) => deg.getOrElse(0) }
25 | .mapTriplets(e => 1.0 / e.srcAttr, TripletFields.Src)
26 | .mapVertices((id, attr) => resetRank)
27 |
28 | val scoreGraph: Graph[Double, _] = graph.mapVertices((id, attr) => resetScore).cache()
29 |
30 | var iteration = 0
31 |
32 | val start_ms = System.currentTimeMillis()
33 | println("Start time : " + start_ms)
34 |
35 | while (iteration < numIter) {
36 | val rankUpdates = rankGraph.aggregateMessages[Double](
37 | ctx => ctx.sendToDst(ctx.srcAttr * ctx.attr),
38 | _ + _,
39 | TripletFields.Src
40 | )
41 |
42 | // update rank and apply
43 | rankGraph = rankGraph.joinVertices(rankUpdates) {
44 | (id, old_vd, msgSum) => (1.0 - resetProb) * msgSum
45 | }.joinVertices(scoreGraph.vertices) {
46 | (id, rank, score) => (rank + resetProb * score)
47 | }
48 |
49 | rankGraph.vertices.count() // materialize rank graph
50 | logInfo(s"TrustRank finished iteration $iteration.")
51 |
52 | iteration += 1
53 |
54 | }
55 |
56 |
57 | var end_ms = System.currentTimeMillis()
58 | println("End time : " + end_ms)
59 |
60 | println("Cost : " + (end_ms - start_ms))
61 |
62 | end_ms - start_ms
63 | }
64 |
65 | }
66 |
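A hedged driver sketch for TrustRank.run above; the triangle graph and iteration count are invented, and the point is only to show the wiring and that the return value is the elapsed wall-clock time in milliseconds.

import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.{SparkConf, SparkContext}

object TrustRankSketch extends App {
  val sc = new SparkContext(new SparkConf().setAppName("TrustRankSketch").setMaster("local[2]"))
  val edges = sc.parallelize(Seq(Edge(1L, 2L, 1.0), Edge(2L, 3L, 1.0), Edge(3L, 1L, 1.0)))
  val graph = Graph.fromEdges(edges, defaultValue = 1.0)
  val elapsedMs = TrustRank.run(graph, numIter = 10) // returns the elapsed time in milliseconds
  println(s"TrustRank took $elapsedMs ms")
  sc.stop()
}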
--------------------------------------------------------------------------------
/apply/src/test/scala/UDF_test.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.sql.hive.HiveContext
2 | import org.apache.spark.{SparkConf, SparkContext}
3 |
4 | /**
5 | * Created by linyanshi on 2017/9/14 0014.
6 | */
7 | object UDF_test {
8 | def main(args: Array[String]): Unit = {
9 |
10 | val conf = new SparkConf()
11 | implicit val sc = new SparkContext(conf)
12 | implicit val sqlContext = new HiveContext(sc)
13 |
14 | import sqlContext.implicits._
15 |
16 | val data = sc.parallelize(Seq(("a", 1), ("bb", 5), ("cccc", 10), ("dddddd", 15))).toDF("a", "b")
17 | data.registerTempTable("data")
18 |
19 |
20 | {
21 | // The function body uses native (non-Column) types; wrap it with udf and register it on sqlContext.udf.
22 | import org.apache.spark.sql.functions._
23 |
24 | // function body
25 | val filter_length_f = (str: String, _length: Int) => {
26 | str.length > _length;
27 | }
28 |
29 | // Register the function with the current sqlContext. Note: functions registered on sqlContext must not take Column parameters.
30 | // Once registered, it can be used in: 1. df.selectExpr; 2. df.filter; 3. SQL, after registering the DataFrame as a temp table.
31 | sqlContext.udf.register("filter_length", filter_length_f)
32 |
33 | val filter_length = udf(filter_length_f) // wrap the function body with udf for Column usage; the wrapped version takes Column arguments
34 |
35 | data.select($"*", filter_length($"a", lit(2))).show // the udf-wrapped version must be given Columns, hence lit(2)
36 | data.selectExpr("*", " filter_length(a,2) as ax").show // to call the function inside a select expression, use selectExpr
37 |
38 | data.filter(filter_length($"a", lit(2))).show // same as select
39 | data.filter("filter_length(a,2)").show // a filter expression string can be passed directly to df.filter
40 |
41 | sqlContext.sql("select *,filter_length(a,2) from data").show
42 | sqlContext.sql("select *,filter_length(a,2) from data where filter_length(a,2)").show
43 | }
44 | {
45 | // A function body that takes Column types cannot be registered on sqlContext.udf.
46 | // After wrapping with udf every argument must be a Column; can we define our own version instead, e.g. one Column parameter and one of another type?
47 | import org.apache.spark.sql.Column
48 | import org.apache.spark.sql.functions._
49 |
50 | val filter_length_f2 = (str: Column, _length: Int) => {
51 | length(str) > _length
52 | }
53 | sqlContext.udf.register("filter_length", filter_length_f2) // todo: unfortunately this cannot be registered; functions registered on sqlContext.udf do not accept Column parameters
54 |
55 | data.select($"*", filter_length_f2($"a", 2)).show // without the udf wrapper we control the signature ourselves, so the length can be a plain Int
56 | data.selectExpr("*", " filter_length_f2(a,2) as ax").show // todo: unfortunately this no longer works
57 |
58 | data.filter(filter_length_f2($"a", 2)).show // same as select
59 | data.filter("filter_length(a,2)").show // todo: unfortunately this no longer works
60 |
61 | }
62 | // Finally, a relatively general-purpose version.
63 | {
64 | // Define two function bodies, one taking Column types and one taking native types; register the native-typed one on sqlContext.udf.
65 |
66 | import org.apache.spark.sql.Column
67 | import org.apache.spark.sql.functions._
68 |
69 | // function body
70 | val filter_length_f = (str: String, _length: Int) => {
71 | str.length > _length;
72 | }
73 | // the main function, used below in df.select, df.filter, etc.
74 | val filter_length = (str: Column, _length: Int) => {
75 | length(str) > _length
76 | }
77 | // Register the function with the current sqlContext. Note: functions registered on sqlContext must not take Column parameters.
78 | // Once registered, it can be used in: 1. df.selectExpr; 2. df.filter; 3. SQL, after registering the DataFrame as a temp table.
79 | sqlContext.udf.register("filter_length", filter_length_f)
80 |
81 | // here we skip the udf wrapper and use our own Column-aware function directly
82 | //val filter_length = udf(filter_length_f) // wrap the function body with udf for Column usage; the wrapped version takes Column arguments
83 |
84 | data.select($"*", filter_length($"a", 2)).show // the Column-based version accepts a plain Int for the length, no lit() needed
85 | data.selectExpr("*", " filter_length(a,2) as ax").show // to call the function inside a select expression, use selectExpr
86 |
87 | data.filter(filter_length($"a", 2)).show // same as select
88 | data.filter("filter_length(a,2)").show // a filter expression string can be passed directly to df.filter
89 |
90 | sqlContext.sql("select *,filter_length(a,2) from data").show
91 | sqlContext.sql("select *,filter_length(a,2) from data where filter_length(a,2)").show
92 | }
93 |
94 |
95 | }
96 | }
97 |
--------------------------------------------------------------------------------
/apply/src/test/scala/entity/CallEntity.scala:
--------------------------------------------------------------------------------
1 | package entity
2 |
3 | import scala.collection.mutable.ListBuffer
4 |
5 | /**
6 | * Created by ASUS-PC on 2017/4/19.
7 | */
8 | case class CallEntity(var totalRounds: Int = 0, var propertyList: ListBuffer[String] = ListBuffer()) extends Serializable with Product {
9 | override def productElement(idx: Int): Any = idx match {
10 | case 0 => totalRounds
11 | case 1 => propertyList
12 | }
13 |
14 | override def productArity: Int = 2
15 |
16 | override def canEqual(that: Any): Boolean = that.isInstanceOf[CallEntity]
17 |
18 | override def toString = s"CallEntity($totalRounds, ${propertyList.toArray.mkString(",")})"
19 | }
20 |
--------------------------------------------------------------------------------
/apply/src/test/scala/entity/CallVertex.scala:
--------------------------------------------------------------------------------
1 | package entity
2 |
3 | import scala.reflect.ClassTag
4 |
5 | /**
6 | * Created by ASUS-PC on 2017/4/20.
7 | */
8 | case class CallVertex[VD: ClassTag](var oldAttr: VD = null,
9 | var newAttr: VD = null,
10 | var init: Boolean = false,
11 | var loop: Int = 0)
12 | extends Serializable {
13 | }
14 |
--------------------------------------------------------------------------------
/apply/src/test/scala/entity/TwoDegree.scala:
--------------------------------------------------------------------------------
1 | package entity
2 |
3 | /**
4 | * Created by ASUS-PC on 2017/4/24.
5 | */
6 | case class TwoDegree (var attr:String ="",
7 | var loop: Int = 0)
8 | extends Serializable {
9 | }
10 |
--------------------------------------------------------------------------------
/apply/src/test/scala/utils/CollectionUtil.scala:
--------------------------------------------------------------------------------
1 | package utils
2 |
3 | import scala.collection.mutable.ArrayBuffer
4 | import scala.reflect.ClassTag
5 |
6 | /**
7 | * Created by ASUS-PC on 2017/4/19.
8 | */
9 |
10 | object CollectionUtil {
11 |
12 | /**
13 | * Adds reduceByKey-style methods to a Traversable[(K, V)] collection
14 | *
15 | * @param collection
16 | * @param kt
17 | * @param vt
18 | * @tparam K
19 | * @tparam V
20 | */
21 | implicit class CollectionHelper[K, V](collection: Traversable[(K, V)])(implicit kt: ClassTag[K], vt: ClassTag[V]) {
22 | def reduceByKey(f: (V, V) => V): Traversable[(K, V)] = {
23 | collection.groupBy(_._1).map { case (_: K, values: Traversable[(K, V)]) => values.reduce((a, b) => (a._1, f(a._2, b._2))) }}
24 |
25 | /**
26 | * Performs reduceByKey and also returns the collection of elements that were reduced away
27 | *
28 | * @param f
29 | * @return
30 | */
31 | def reduceByKeyWithReduced(f: (V, V) => V)(implicit kt: ClassTag[K], vt: ClassTag[V]): (Traversable[(K, V)], Traversable[(K, V)]) = {
32 | val reduced: ArrayBuffer[(K, V)] = ArrayBuffer()
33 | val newSeq = collection.groupBy(_._1).map {
34 | case (_: K, values: Traversable[(K, V)]) => values.reduce((a, b) => {
35 | val newValue: V = f(a._2, b._2)
36 | val reducedValue: V = if (newValue == a._2) b._2 else a._2
37 | val reducedPair: (K, V) = (a._1, reducedValue)
38 | reduced += reducedPair
39 | (a._1, newValue)
40 | })
41 | }
42 | (newSeq, reduced.toTraversable)
43 | }
44 | }
45 |
46 | }
47 |
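A minimal usage sketch for the implicit class above (not part of the repository; the demo object name is made up). It shows what reduceByKey and reduceByKeyWithReduced return for a small in-memory collection:

import utils.CollectionUtil.CollectionHelper

object CollectionUtilDemo {
  def main(args: Array[String]): Unit = {
    val pairs = Seq(("a", 1), ("b", 2), ("a", 3))

    // One pair per key, values merged with the supplied function: ("a", 4), ("b", 2)
    println(pairs.reduceByKey(_ + _))

    // Also returns the elements folded away during the reduce, here ("a", 1)
    val (merged, dropped) = pairs.reduceByKeyWithReduced(_ + _)
    println(merged)
    println(dropped)
  }
}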
--------------------------------------------------------------------------------
/common/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <parent>
6 |         <groupId>com.lakala.datacenter</groupId>
7 |         <artifactId>graphx-analysis</artifactId>
8 |         <version>1.0.0-SNAPSHOT</version>
9 |     </parent>
10 |     <modelVersion>4.0.0</modelVersion>
11 |
12 |     <artifactId>graphx-analysis-common</artifactId>
13 |
14 | </project>
--------------------------------------------------------------------------------
/common/src/main/resources/css/style.css:
--------------------------------------------------------------------------------
1 | graph {
2 | fill-color: white;
3 | }
4 | node {
5 | size: 65;
6 | fill-color: #CCCCCC, #AAAAAA;
7 | fill-mode: gradient-radial;
8 | text-offset: 0, 0;
9 | stroke-mode: plain;
10 | stroke-color: #333333;
11 | }
12 | node:clicked {
13 | fill-color: #2277FF, #88AAFF;
14 | fill-mode: gradient-radial;
15 | size: 100;
16 | text-size:18;
17 | text-offset: 0, 0;
18 | }
19 | edge {
20 | text-alignment: along;
21 | }
22 |
--------------------------------------------------------------------------------
/common/src/test/data/cities_edges.txt:
--------------------------------------------------------------------------------
1 | 1 2 75
2 | 1 4 140
3 | 1 8 118
4 | 2 3 71
5 | 3 4 151
6 | 4 5 99
7 | 4 6 80
8 | 5 13 211
9 | 6 7 97
10 | 6 12 146
11 | 7 13 101
12 | 7 12 138
13 | 8 9 111
14 | 9 10 70
15 | 10 11 75
16 | 11 12 120
17 | 13 14 90
--------------------------------------------------------------------------------
/common/src/test/data/cities_vertices.txt:
--------------------------------------------------------------------------------
1 | 1 Arad
2 | 2 Zerind
3 | 3 Oradea
4 | 4 Sibiu
5 | 5 Fagaras
6 | 6 RimnicuVilcea
7 | 7 Pitesti
8 | 8 Timisoara
9 | 9 Lugoj
10 | 10 Mehadia
11 | 11 Drobeta
12 | 12 Craiova
13 | 13 Bucharest
14 | 14 Giurgiu
--------------------------------------------------------------------------------
/common/src/test/data/likeness_edges.txt:
--------------------------------------------------------------------------------
1 | 1 2 likes
2 | 1 4 follows
3 | 1 6 follows
4 | 1 6 likes
5 | 2 1 follows
6 | 2 5 likes
7 | 2 6 likes
8 | 3 1 follows
9 | 3 4 likes
10 | 4 2 likes
11 | 4 3 follows
12 | 5 3 likes
13 | 6 1 follows
14 | 6 4 likes
--------------------------------------------------------------------------------
/common/src/test/data/maxvalue_edges.txt:
--------------------------------------------------------------------------------
1 | 1 2
2 | 2 1
3 | 2 4
4 | 3 2
5 | 3 4
6 | 4 3
--------------------------------------------------------------------------------
/common/src/test/data/maxvalue_vertices.txt:
--------------------------------------------------------------------------------
1 | 1 3
2 | 2 6
3 | 3 2
4 | 4 1
--------------------------------------------------------------------------------
/common/src/test/data/papers_edges.txt:
--------------------------------------------------------------------------------
1 | 1 2
2 | 1 4
3 | 1 6
4 | 2 1
5 | 2 6
6 | 3 1
7 | 3 4
8 | 4 2
9 | 4 5
10 | 5 2
11 | 5 3
12 | 6 1
13 | 6 4
--------------------------------------------------------------------------------
/common/src/test/data/people_vertices.txt:
--------------------------------------------------------------------------------
1 | #ID NAME AGE
2 | 1 tom 34
3 | 2 chiara 51
4 | 3 22
5 | 4 marco 28
6 | 5 lucia 40
7 | 6 meria 32
8 | 7 tommy 30
9 | 8 giulio 45
10 | 9 ada 33
--------------------------------------------------------------------------------
/common/src/test/data/relationships_edges.txt:
--------------------------------------------------------------------------------
1 | 1 4
2 | 1 6
3 | 3 4
4 | 3 5
5 | 4 3
6 | 5 2
7 | 5 6
8 | 6 1
9 | 6 4
--------------------------------------------------------------------------------
/common/src/test/data/us_cities_edges.txt:
--------------------------------------------------------------------------------
1 | 1 2 27
2 | 1 3 91
3 | 2 3 35
4 | 2 5 67
5 | 3 4 48
6 | 3 5 14
7 | 5 4 29
8 | 5 6 15
--------------------------------------------------------------------------------
/common/src/test/data/us_cities_vertices.txt:
--------------------------------------------------------------------------------
1 | 1 Washington
2 | 2 Baltimore
3 | 3 Detroit
4 | 4 Chicago
5 | 5 NewYork
6 | 6 Philadelphia
--------------------------------------------------------------------------------
/common/src/test/data/users_dense_edges.txt:
--------------------------------------------------------------------------------
1 | 1 2
2 | 1 4
3 | 2 3
4 | 2 4
5 | 2 5
6 | 3 4
7 | 5 1
8 | 5 3
9 | 5 6
10 | 6 1
11 | 6 3
--------------------------------------------------------------------------------
/common/src/test/data/users_disjoint_edges.txt:
--------------------------------------------------------------------------------
1 | 1 2
2 | 2 5
3 | 2 6
4 | 3 4
5 | 4 3
6 | 5 6
7 | 6 1
--------------------------------------------------------------------------------
/common/src/test/data/users_edges.txt:
--------------------------------------------------------------------------------
1 | 1 2
2 | 2 6
3 | 3 1
4 | 3 4
5 | 4 2
6 | 5 3
7 | 6 1
8 | 6 4
--------------------------------------------------------------------------------
/common/src/test/data/users_vertices.txt:
--------------------------------------------------------------------------------
1 | # ID USERNAME AGE
2 | 1 Alice 35
3 | 2 Bob 41
4 | 3 Carol 28
5 | 4 Dave 43
6 | 5 Eve 29
7 | 6 Frank 30
--------------------------------------------------------------------------------
/common/src/test/scala/TestGraphViewer.scala:
--------------------------------------------------------------------------------
1 | import com.lakala.datacenter.common.graphstream.SimpleGraphViewer
2 |
3 | /**
4 | * Created by peter on 2017/4/26.
5 | */
6 | object TestGraphViewer {
7 | def main(args: Array[String]): Unit = {
8 | SimpleGraphViewer.main(Array())
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
/core/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <parent>
6 |         <groupId>com.lakala.datacenter</groupId>
7 |         <artifactId>graphx-analysis</artifactId>
8 |         <version>1.0.0-SNAPSHOT</version>
9 |     </parent>
10 |     <modelVersion>4.0.0</modelVersion>
11 |
12 |     <artifactId>graphx-analysis-core</artifactId>
13 |
14 |     <dependencies>
15 |         <dependency>
16 |             <groupId>com.lakala.datacenter</groupId>
17 |             <artifactId>graphx-analysis-common</artifactId>
18 |             <version>${project.version}</version>
19 |         </dependency>
20 |     </dependencies>
21 | </project>
--------------------------------------------------------------------------------
/core/src/main/java/com/lakala/datacenter/core/messaging/Sender.java:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.core.messaging;
2 |
3 | import com.lakala.datacenter.core.config.ConfigurationLoader;
4 | import com.rabbitmq.client.Channel;
5 | import com.rabbitmq.client.Connection;
6 | import com.rabbitmq.client.ConnectionFactory;
7 | import com.rabbitmq.client.MessageProperties;
8 |
9 | import java.util.concurrent.TimeoutException;
10 |
11 | public class Sender {
12 | private static final String TASK_QUEUE_NAME = "processor";
13 |
14 | public static void sendMessage(String message)
15 | throws java.io.IOException,
16 | java.lang.InterruptedException, TimeoutException {
17 |
18 | ConnectionFactory factory = new ConnectionFactory();
19 | factory.setHost(ConfigurationLoader.getInstance().getRabbitmqNodename());
20 | Connection connection = factory.newConnection();
21 | Channel channel = connection.createChannel();
22 |
23 | channel.queueDeclare(TASK_QUEUE_NAME, true, false, false, null);
24 |
25 | channel.basicPublish("", TASK_QUEUE_NAME,
26 | MessageProperties.PERSISTENT_TEXT_PLAIN,
27 | message.getBytes());
28 | System.out.println(" [x] Sent '" + message + "'");
29 |
30 | channel.close();
31 | connection.close();
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/core/src/main/java/com/lakala/datacenter/core/models/PartitionDescription.java:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.core.models;
2 |
3 | public class PartitionDescription {
4 | private Long partitionId;
5 | private String partitionLabel;
6 | private String groupRelationship;
7 | private String targetRelationship;
8 |
9 | public String getPartitionLabel() {
10 | return partitionLabel;
11 | }
12 |
13 | public void setPartitionLabel(String partitionLabel) {
14 | this.partitionLabel = partitionLabel;
15 | }
16 |
17 | public Long getPartitionId() {
18 | return partitionId;
19 | }
20 |
21 | public void setPartitionId(Long partitionId) {
22 | this.partitionId = partitionId;
23 | }
24 |
25 | public String getTargetRelationship() {
26 | return targetRelationship;
27 | }
28 |
29 | public void setTargetRelationship(String targetRelationship) {
30 | this.targetRelationship = targetRelationship;
31 | }
32 |
33 | public String getGroupRelationship() {
34 | return groupRelationship;
35 | }
36 |
37 | public void setGroupRelationship(String groupRelationship) {
38 | this.groupRelationship = groupRelationship;
39 | }
40 |
41 | public PartitionDescription(Long partitionId, String partitionLabel) {
42 | this.partitionId = partitionId;
43 | this.partitionLabel = partitionLabel;
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/core/src/main/java/com/lakala/datacenter/core/models/ProcessorMessage.java:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.core.models;
2 |
3 | /**
4 | * The ProcessorMessage class is used to distribute messages between the graph processor and Neo4j.
5 | */
6 | public class ProcessorMessage {
7 | private String path;
8 | private String analysis;
9 | private ProcessorMode mode;
10 | private PartitionDescription partitionDescription;
11 |
12 | public ProcessorMessage(String path, String analysis, ProcessorMode mode) {
13 | this.path = path;
14 | this.analysis = analysis;
15 | this.mode = mode;
16 | }
17 |
18 | /**
19 | * Get the HDFS path.
20 | * @return The path to the HDFS file for this process.
21 | */
22 | public String getPath() {
23 | return path;
24 | }
25 |
26 | /**
27 | * Set the HDFS path.
28 | * @param path The path to the HDFS file for this process.
29 | */
30 | public void setPath(String path) {
31 | this.path = path;
32 | }
33 |
34 | /**
35 | * Get the analysis type.
36 | * @return The key for the analysis type.
37 | */
38 | public String getAnalysis() {
39 | return analysis;
40 | }
41 |
42 | /**
43 | * Set the analysis type.
44 | * @param analysis The key for the analysis type.
45 | */
46 | public void setAnalysis(String analysis) {
47 | this.analysis = analysis;
48 | }
49 |
50 | /**
51 | * Get the mode type.
52 | * @return The mode type for the analysis, either partitioned or unpartitioned.
53 | */
54 | public ProcessorMode getMode() {
55 | return mode;
56 | }
57 |
58 | /**
59 | * Set the mode type.
60 | * @param mode The mode type represents whether the analysis should be partitioned.
61 | */
62 | public void setMode(ProcessorMode mode) {
63 | this.mode = mode;
64 | }
65 |
66 | /**
67 | * Get the description for the partitioned analysis.
68 | * @return Returns a description for the queried partition.
69 | */
70 | public PartitionDescription getPartitionDescription() {
71 | return partitionDescription;
72 | }
73 |
74 | /**
75 | * Set the partition description for an analysis. Preserves information related to
76 | * the analysis being performed on the current partition.
77 | * @param partitionDescription A set of fields that describe the partition being analyzed.
78 | */
79 | public void setPartitionDescription(PartitionDescription partitionDescription) {
80 | this.partitionDescription = partitionDescription;
81 | }
82 | }
83 |
--------------------------------------------------------------------------------
/core/src/main/java/com/lakala/datacenter/core/models/ProcessorMode.java:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.core.models;
2 |
3 | public enum ProcessorMode {
4 | Partitioned,
5 | Unpartitioned
6 | }
7 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/lakala/datacenter/core/abstractions/PregelProgram.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.core.abstractions
2 |
3 | /**
4 | * Created by peter on 2017/4/26.
5 | */
6 | import org.apache.spark.graphx._
7 | import scala.reflect.ClassTag
8 |
9 | /**
10 | * The [[PregelProgram]] abstraction wraps Spark's Pregel API implementation from the [[GraphOps]]
11 |  * class into a model that makes graph algorithms easier to write.
12 | * @tparam VertexState is the generic type representing the state of a vertex
13 | */
14 | abstract class PregelProgram[VertexState: ClassTag, VD: ClassTag, ED: ClassTag] protected () extends Serializable {
15 |
16 | @transient val graph: Graph[VD, ED]
17 |
18 | /**
19 | * The vertex program receives a state update and acts to update its state
20 | * @param id is the [[VertexId]] that this program will perform a state operation for
21 | * @param state is the current state of this [[VertexId]]
22 | * @param message is the state received from another vertex in the graph
23 | * @return a [[VertexState]] resulting from a comparison between current state and incoming state
24 | */
25 | def vertexProgram(id : VertexId, state : VertexState, message : VertexState) : VertexState
26 |
27 | /**
28 | * The message broker sends and receives messages. It will initially receive one message for
29 | * each vertex in the graph.
30 | * @param triplet An edge triplet is an object containing a pair of connected vertex objects and edge object.
31 | * For example (v1)-[r]->(v2)
32 | * @return The message broker returns a key value list, each containing a VertexId and a new message
33 | */
34 | def messageBroker(triplet :EdgeTriplet[VertexState, ED]) : Iterator[(VertexId, VertexState)]
35 |
36 | /**
37 | * This method is used to reduce or combine the set of all state outcomes produced by a vertexProgram
38 | * for each vertex in each superstep iteration. Each vertex has a list of state updates received from
39 | * other vertices in the graph via the messageBroker method. This method is used to reduce the list
40 | * of state updates into a single state for the next superstep iteration.
41 | * @param a A first [[VertexState]] representing a partial state of a vertex.
42 | * @param b A second [[VertexState]] representing a different partial state of a vertex
43 | * @return a merged [[VertexState]] representation from the two [[VertexState]] parameters
44 | */
45 | def combiner(a: VertexState, b: VertexState) : VertexState
46 |
47 | }
--------------------------------------------------------------------------------
/core/src/main/scala/com/lakala/datacenter/core/grograms/EdgeBetweennessProgram.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.core.grograms
2 |
3 | /**
4 | * Created by peter on 2017/4/26.
5 | */
6 | import org.apache.spark.graphx.{EdgeTriplet, Graph, VertexId}
7 | import com.lakala.datacenter.core.abstractions.PregelProgram
8 |
9 | /**
10 | * The [[EdgeBetweennessProgram]] is an example graph algorithm implemented on the [[PregelProgram]]
11 | * abstraction.
12 | */
13 | class EdgeBetweennessProgram(@transient val graph : Graph[Seq[VertexId], Seq[VertexId]])
14 | extends PregelProgram[Seq[VertexId], Seq[VertexId], Seq[VertexId]] with Serializable {
15 |
16 | protected def this() = this(null)
17 |
18 | /**
19 |  * Merge the set of vertex ids received in a message into this vertex's current state
20 |  * @param id is the [[VertexId]] that this program will perform a state operation for
21 |  * @param state is the current state of this [[VertexId]]
22 |  * @param message is the state received from another vertex in the graph
23 |  * @return the distinct union of the current state and the incoming message
24 | */
25 | override def vertexProgram(id: VertexId, state: Seq[VertexId], message: Seq[VertexId]): Seq[VertexId] = {
26 | if(state == null) {
27 | message
28 | } else {
29 | (state ++ message).distinct
30 | }
31 | }
32 |
33 | /**
34 |  * Combine two partial vertex states into one
35 |  * @param a A first [[Seq]] of [[VertexId]]s representing a partial state of a vertex.
36 |  * @param b A second [[Seq]] of [[VertexId]]s representing a different partial state of a vertex
37 |  * @return the distinct union of the two partial states
38 | */
39 | override def combiner(a: Seq[VertexId], b: Seq[VertexId]): Seq[VertexId] = {
40 | (a ++ b).distinct
41 | }
42 |
43 | /**
44 |  * If the source vertex has not yet recorded the destination vertex id, send the destination id to
45 |  * the source vertex so it can update its state
46 | * @param triplet An edge triplet is an object containing a pair of connected vertex objects and edge object.
47 | * For example (v1)-[r]->(v2)
48 | * @return The message broker returns a key value list, each containing a VertexId and a new message
49 | */
50 | override def messageBroker(triplet: EdgeTriplet[Seq[VertexId], Seq[VertexId]]): Iterator[(VertexId, Seq[VertexId])] = {
51 | // If the srcAttr does not yet contain the dstId, notify the srcVertex to add it
52 |
53 | if(!triplet.srcAttr.contains(triplet.dstId)) {
54 | Iterator((triplet.srcId, Seq(triplet.dstId)))
55 | } else {
56 | Iterator()
57 | }
58 |
59 | }
60 |
61 | /**
62 | * This method wraps Spark's Pregel API entry point from the [[org.apache.spark.graphx.GraphOps]] class. This provides
63 | * a simple way to write a suite of graph algorithms by extending the [[PregelProgram]] abstract
64 | * class and implementing vertexProgram, messageBroker, and combiner methods.
65 | * @param initialMsg is the initial message received for all vertices in the graph
66 | */
67 | def run(initialMsg: Seq[VertexId]): Graph[Seq[VertexId], Seq[VertexId]] = {
68 | graph.pregel(initialMsg)(this.vertexProgram, this.messageBroker, this.combiner)
69 | }
70 | }
71 |
72 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/lakala/datacenter/core/grograms/MaximumValueProgram.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.core.grograms
2 |
3 | import com.lakala.datacenter.core.abstractions.PregelProgram
4 | import org.apache.spark.graphx.{EdgeTriplet, Graph, VertexId}
5 |
6 | /**
7 | * The [[MaximumValueProgram]] is an example graph algorithm implemented on the [[PregelProgram]]
8 | * abstraction.
9 | */
10 | class MaximumValueProgram(@transient val graph : Graph[Int, Int])
11 | extends PregelProgram[Int, Int, Int] with Serializable {
12 |
13 | protected def this() = this(null)
14 |
15 | /**
16 | * Return the larger of the two vertex attribute values
17 | * @param id is the [[VertexId]] that this program will perform a state operation for
18 | * @param state is the current state of this [[VertexId]]
19 | * @param message is the state received from another vertex in the graph
20 | * @return an [[Int]] resulting from a comparison between current state and incoming state
21 | */
22 | override def vertexProgram(id: VertexId, state: Int, message: Int): Int = {
23 | if (message > state) {
24 | message
25 | } else {
26 | state
27 | }
28 | }
29 |
30 | /**
31 | * Return the larger of the two vertex state results
32 | * @param a A first [[Int]] representing a partial state of a vertex.
33 | * @param b A second [[Int]] representing a different partial state of a vertex
34 | * @return a merged [[Int]] representation from the two [[Int]] parameters
35 | */
36 | override def combiner(a: Int, b: Int): Int = {
37 | math.max(a, b)
38 | }
39 |
40 | /**
41 | * If the dstVertex's value is less than the srcVertex's value, send a message to the dstVertex to update
42 | * its state
43 | * @param triplet An edge triplet is an object containing a pair of connected vertex objects and edge object.
44 | * For example (v1)-[r]->(v2)
45 | * @return The message broker returns a key value list, each containing a VertexId and a new message
46 | */
47 | override def messageBroker(triplet: EdgeTriplet[Int, Int]): Iterator[(VertexId, Int)] = {
48 | // If the srcAttr is greater than the dstAttr then notify the dstVertex to update its state
49 | if (triplet.srcAttr > triplet.dstAttr) {
50 | Iterator((triplet.dstId, triplet.srcAttr))
51 | } else {
52 | Iterator.empty
53 | }
54 | }
55 |
56 | /**
57 | * This method wraps Spark's Pregel API entry point from the [[org.apache.spark.graphx.GraphOps]] class. This provides
58 | * a simple way to write a suite of graph algorithms by extending the [[PregelProgram]] abstract
59 | * class and implementing vertexProgram, messageBroker, and combiner methods.
60 | * @param initialMsg is the initial message received for all vertices in the graph
61 | */
62 | def run(initialMsg: Int): Graph[Int, Int] = {
63 | graph.pregel(initialMsg)(this.vertexProgram, this.messageBroker, this.combiner)
64 | }
65 | }
66 |
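A small driver sketch (not in the repository) that runs the program above; the object name and the local master setting are assumptions, and the vertex/edge values mirror common/src/test/data/maxvalue_vertices.txt and maxvalue_edges.txt.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import com.lakala.datacenter.core.grograms.MaximumValueProgram

object MaximumValueDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("MaximumValueDemo").setMaster("local[2]"))

    // Values from maxvalue_vertices.txt, edges from maxvalue_edges.txt
    val vertices: RDD[(VertexId, Int)] =
      sc.parallelize(Seq((1L, 3), (2L, 6), (3L, 2), (4L, 1)))
    val edges: RDD[Edge[Int]] =
      sc.parallelize(Seq(Edge(1L, 2L, 0), Edge(2L, 1L, 0), Edge(2L, 4L, 0),
                         Edge(3L, 2L, 0), Edge(3L, 4L, 0), Edge(4L, 3L, 0)))
    val graph = Graph(vertices, edges)

    // Each vertex converges to the largest value that can reach it along the edges;
    // with this data every vertex ends up with 6.
    val result = new MaximumValueProgram(graph).run(Int.MinValue)
    result.vertices.collect().foreach(println)

    sc.stop()
  }
}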
--------------------------------------------------------------------------------
/core/src/main/scala/com/lakala/datacenter/core/utils/UtilsToos.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.core.utils
2 |
3 | import java.nio.charset.StandardCharsets
4 |
5 | import com.google.common.hash.Hashing
6 | import com.lakala.datacenter.common.utils.DateTimeUtils
7 |
8 | import scala.util.matching.Regex
9 |
10 | /**
11 | * Created by ASUS-PC on 2017/4/18.
12 | */
13 | object UtilsToos {
14 | /**
15 |  * Generates a unique hash id (an MD5-based Long) from a string
16 |  *
17 |  * @param str the string to hash
18 |  * @return the 64-bit hash of the string
19 | */
20 | def hashId(str: String) = {
21 | Hashing.md5().hashString(str, StandardCharsets.UTF_8).asLong()
22 | }
23 |
24 | /**
25 |  * Validates mobile and landline phone numbers
26 |  *
27 |  * @param num the number to validate
28 |  * @return true if the number passes validation
29 |  */
30 | def isMobileOrPhone(num: String): Boolean = {
31 |   val pattern = new Regex("^((17[0-9])|(14[0-9])|(13[0-9])|(15[^4,\\D])|(18[0,5-9]))\\d{8}$") // mobile numbers
32 |   val pattern2 = new Regex("(?:(\\(\\+?86\\))(0[0-9]{2,3}\\-?)?([2-9][0-9]{6,7})+(\\-[0-9]{1,4})?)|(?:(86-?)?(0[0-9]{2,3}\\-?)?([2-9][0-9]{6,7})+(\\-[0-9]{1,4})?)") // landline numbers with an area code
33 |   // val pattern2 = new Regex("^[0][1-9]{2,3}-[0-9]{5,10}$") // landline numbers with an area code
34 |   val pattern3 = new Regex("^[1-9]{1}[0-9]{5,8}$") // landline numbers without an area code
35 | num match {
36 | case pattern(_*) => {
37 | true
38 | }
39 | case pattern2(_*) => {
40 | true
41 | }
42 | case pattern3(_*) => {
43 | true
44 | }
45 | case _ => {
46 | false
47 | }
48 | }
49 | }
50 |
51 |
52 |
53 |
54 | }
55 |
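A quick sketch (not part of the file) of the two helpers above; the demo object name and the sample strings are illustrative only.

import com.lakala.datacenter.core.utils.UtilsToos

object UtilsToosDemo {
  def main(args: Array[String]): Unit = {
    // Deterministic Long id derived from the MD5 hash of the string
    println(UtilsToos.hashId("XNA20170617214709013851193476043"))

    // true for a well-formed mainland-China mobile number, false otherwise
    println(UtilsToos.isMobileOrPhone("13800138000")) // true
    println(UtilsToos.isMobileOrPhone("abc123"))      // false
  }
}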
--------------------------------------------------------------------------------
/core/src/test/java/com/lakala/datacenter/core/hdfs/FileUtilTest.java:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.core.hdfs;
2 |
3 | import com.lakala.datacenter.core.config.ConfigurationLoader;
4 | import com.lakala.datacenter.core.models.ProcessorMessage;
5 | import com.lakala.datacenter.core.models.ProcessorMode;
6 | import com.lakala.datacenter.core.processor.GraphProcessor;
7 | import junit.framework.TestCase;
8 | import org.junit.Test;
9 |
10 | import java.util.ArrayList;
11 | import java.util.Arrays;
12 |
13 | public class FileUtilTest extends TestCase {
14 |
15 | @Test
16 | public void testWritePropertyGraphUpdate() throws Exception {
17 |
18 | ConfigurationLoader.testPropertyAccess=true;
19 |
20 | // Create sample PageRank result
21 | String nodeList =
22 | "0 .001\n" +
23 | "1 .002\n" +
24 | "3 .003";
25 |
26 | // Create test path
27 | String path = ConfigurationLoader.getInstance().getHadoopHdfsUri() + "/test/propertyNodeList.txt";
28 |
29 | // Test writing the PageRank result to HDFS path
30 | FileUtil.writePropertyGraphUpdate(new ProcessorMessage(path, GraphProcessor.PAGERANK, ProcessorMode.Partitioned),
31 | new ArrayList<>(Arrays.asList(
32 | "0 .001\n",
33 | "1 .002\n",
34 | "3 .003"
35 | )));
36 |
37 | // Validate node list
38 | assertEquals(FileUtil.readHdfsFile(path), "# Node Property Value List" + "\n" + nodeList);
39 | }
40 | }
--------------------------------------------------------------------------------
/core/src/test/java/com/lakala/datacenter/core/messaging/SenderTest.java:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.core.messaging;
2 |
3 | import com.google.gson.Gson;
4 | import com.lakala.datacenter.core.config.ConfigurationLoader;
5 | import com.lakala.datacenter.core.models.ProcessorMessage;
6 | import com.lakala.datacenter.core.models.ProcessorMode;
7 | import com.lakala.datacenter.core.processor.GraphProcessor;
8 | import junit.framework.TestCase;
9 |
10 | public class SenderTest extends TestCase {
11 |
12 | private static final String EDGE_LIST_RELATIVE_FILE_PATH = "/neo4j/mazerunner/edgeList.txt";
13 |
14 | public void testSendMessage() throws Exception {
15 | ConfigurationLoader.testPropertyAccess=true;
16 | ProcessorMessage processorMessage = new ProcessorMessage("", "strongly_connected_components", ProcessorMode.Partitioned);
17 | processorMessage.setPath(ConfigurationLoader.getInstance().getHadoopHdfsUri() + GraphProcessor.PROPERTY_GRAPH_UPDATE_PATH);
18 | // Serialize the processor message
19 | Gson gson = new Gson();
20 | String message = gson.toJson(processorMessage);
21 |
22 | // Notify Neo4j that a property update list is available for processing
23 | Sender.sendMessage(message);
24 | }
25 |
26 |
27 | }
--------------------------------------------------------------------------------
/core/src/test/scala/com/lakala/datacenter/core/grograms/ShortestPathTests.scala:
--------------------------------------------------------------------------------
1 | /*
2 | package com.lakala.datacenter.core.grograms
3 |
4 | import com.lakala.datacenter.core.algorithms.Algorithms
5 | import com.lakala.datacenter.core.config.ConfigurationLoader
6 | import com.lakala.datacenter.core.processor.GraphProcessor
7 | import org.apache.spark.graphx._
8 | import org.apache.spark.graphx.lib.ShortestPaths
9 | import org.apache.spark.rdd.RDD
10 | import org.scalatest.FlatSpec
11 | import scala.collection.mutable
12 |
13 | class ShortestPathTests extends FlatSpec {
14 | /**
15 | * To collect the shortest path results for all nodes to a single destination node,
16 | * the following steps must be taken:
17 | *
18 | */
19 |
20 | ConfigurationLoader.testPropertyAccess = true
21 |
22 | // Create Spark context
23 | val sc = GraphProcessor.initializeSparkContext.sc
24 |
25 | val vertexIds = sc.parallelize(Seq(0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L)).collect().toSeq
26 |
27 | def fixture =
28 | new {
29 |
30 | // Create an RDD for the vertices
31 | val vertices: RDD[(VertexId, ShortestPathState)] = sc.parallelize(Array(
32 | (0L, new ShortestPathState(0L, vertexIds)),
33 | (1L, new ShortestPathState(1L, vertexIds)),
34 | (2L, new ShortestPathState(2L, vertexIds)),
35 | (3L, new ShortestPathState(3L, vertexIds)),
36 | (4L, new ShortestPathState(4L, vertexIds)),
37 | (5L, new ShortestPathState(5L, vertexIds)),
38 | (6L, new ShortestPathState(6L, vertexIds)),
39 | (7L, new ShortestPathState(7L, vertexIds)),
40 | (8L, new ShortestPathState(8L, vertexIds)),
41 | (9L, new ShortestPathState(9L, vertexIds)),
42 | (10L, new ShortestPathState(10L, vertexIds)),
43 | (11L, new ShortestPathState(11L, vertexIds)),
44 | (12L, new ShortestPathState(12L, vertexIds))))
45 |
46 | // Create an RDD for edges
47 | val edges: RDD[Edge[Int]] = sc.parallelize(Array(
48 | Edge(0L, 1L, 0),
49 | Edge(1L, 4L, 0),
50 | Edge(1L, 2L, 0),
51 | Edge(2L, 3L, 0),
52 | Edge(5L, 6L, 0),
53 | Edge(6L, 7L, 0),
54 | Edge(7L, 8L, 0),
55 | Edge(8L, 9L, 0),
56 | Edge(9L, 10L, 0),
57 | Edge(10L, 11L, 0),
58 | Edge(11L, 12L, 0),
59 | Edge(12L, 3L, 0),
60 | Edge(7L, 3L, 0),
61 | Edge(4L, 3L, 0)))
62 |
63 | // Build the initial Graph
64 | val graph = Graph(vertices, edges, new ShortestPathState(-1L, null))
65 | }
66 |
67 | "A node's state" should "have a decision tree" in {
68 | val graph = fixture.graph
69 |
70 | val tree = new DecisionTree[VertexId](0L, mutable.HashMap[VertexId, DecisionTree[VertexId]]())
71 |
72 | graph.edges.collect().foreach(ed => tree.addLeaf(ed.srcId).addLeaf(ed.dstId))
73 |
74 | val vertexIds = graph.vertices.map(v => v._1).cache().collect()
75 |
76 | val sssp = ShortestPaths.run(graph, graph.vertices.map { vx => vx._1}.collect()).vertices.collect()
77 |
78 | val graphResults = sc.parallelize(vertexIds).map(row => {
79 | println("*** " + row)
80 | (row, vertexIds.map(vt => {
81 | (vt, tree.getNode(row).allShortestPathsTo(vt, sssp))
82 | }))
83 | }).collectAsync().get().toArray
84 |
85 | val result = Algorithms.betweennessCentrality(sc, graphResults)
86 |
87 | val resultStream = result
88 |
89 | for (x <- resultStream) {
90 | println(x)
91 | }
92 |
93 | }
94 |
95 | }
96 | */
97 |
--------------------------------------------------------------------------------
/neo4j/bin/start2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ## usage: sh bin/start2.sh -i /logs/device/* -d 2016-01-11
3 |
4 | SPARK_HOME=/home/hadoop/spark-1.6.3-bin-hadoop2.6
5 | HIVE_HOME=/usr/hdp/current/hive-client
6 | PROJECT_HOME="$(cd "`dirname "$0"`"/..; pwd)"
7 | HDP_VERSION=2.4.0.0-169
8 | APP_CACHE_DIR=/tmp/device
9 |
10 | stdate=${1:-`date -d '1 days ago' +"%Y-%m-%d"`}
11 | #inputdir=/logs/device/*
12 | #inputfile=/logs/device/*/2016-01-{1[1-9],2[0-1]}
13 | while getopts "d:i:" opt ; do
14 | case $opt in
15 | d)stdate=$OPTARG ;;
16 | i)inputdir=$OPTARG ;;
17 | ?)echo "==> please input arg: stdate(d), inputdir(i)" && exit 1 ;;
18 | esac
19 | done
20 |
21 | #echo "==> ready for geoip...."
22 | #hadoop fs -mkdir -p $APP_CACHE_DIR/geoip
23 | #hadoop fs -test -e $APP_CACHE_DIR/geoip/GeoLite2-City.mmdb
24 | #if [ $? -ne 0 ]; then
25 | # echo "GeoLite2-City.mmdb not exists!"
26 | # hadoop fs -put $PROJECT_HOME/../tcloud-log-analysis/src/main/bundleApp/coord-common/geoip/GeoLite2-City.mmdb $APP_CACHE_DIR/geoip/
27 | #fi
28 |
29 | ## https://issues.apache.org/jira/browse/ZEPPELIN-93
30 | ## https://github.com/caskdata/cdap/pull/4106
31 | spark-submit \
32 | --master spark://datacenter17:7077,datacenter18:7077 \
33 | --class com.lakala.datacenter.main.Driver \
34 | --driver-memory 2G \
35 | --executor-memory 4G \
36 | --num-executors 3 \
37 | --executor-cores 3 \
38 | --conf "spark.rpc.askTimeout=300s" \
39 | --verbose \
40 | --files $SPARK_HOME/conf/hive-site.xml \
41 | --driver-class-path $PROJECT_HOME/target/dependency/mysql-connector-java-5.1.36.jar \
42 | --jars $PROJECT_HOME/target/dependency/mysql-connector-java-5.1.36.jar,$SPARK_HOME/lib/datanucleus-api-jdo-3.2.6.jar,$SPARK_HOME/lib/datanucleus-core-3.2.10.jar,$PROJECT_HOME/target/dependency/guava-14.0.1.jar,$SPARK_HOME/lib/datanucleus-rdbms-3.2.9.jar \
43 | $PROJECT_HOME/target/graphx-analysis-apply.jar \
44 | -i /user/linyanshi/query_result.csv -c /user/linyanshi/part-00003 -o file:////home/hadoop/grogram/analysis/graphx-analysis/apply/bin/output
45 |
46 | ## --packages com.databricks:spark-csv_2.10:1.3.0 \
47 | ## 2>&1 > output.txt
48 |
--------------------------------------------------------------------------------
/neo4j/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <parent>
6 |         <groupId>com.lakala.datacenter</groupId>
7 |         <artifactId>graphx-analysis</artifactId>
8 |         <version>1.0.0-SNAPSHOT</version>
9 |     </parent>
10 |     <modelVersion>4.0.0</modelVersion>
11 |
12 |     <artifactId>graphx-analysis-neo4j</artifactId>
13 |     <packaging>jar</packaging>
14 |
15 |     <name>graphx-analysis-neo4j</name>
16 |     <url>http://maven.apache.org</url>
17 |
18 |     <properties>
19 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
20 |     </properties>
21 |
22 |     <dependencies>
23 |         <dependency>
24 |             <groupId>com.lakala.datacenter</groupId>
25 |             <artifactId>graphx-analysis-core</artifactId>
26 |             <version>${project.version}</version>
27 |         </dependency>
28 |     </dependencies>
29 |
30 |     <build>
31 |         <finalName>graphx-analysis-neo4j</finalName>
32 |     </build>
33 |
34 |     <repositories>
35 |         <repository>
36 |             <id>spark-repo</id>
37 |             <url>http://dl.bintray.com/spark-packages/maven/</url>
38 |         </repository>
39 |     </repositories>
40 | </project>
--------------------------------------------------------------------------------
/neo4j/src/main/java/com/lakala/datacenter/enums/DataAttributeType.java:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.enums;
2 |
3 | /**
4 | * Created by Administrator on 2017/6/16 0016.
5 | */
6 | interface DataInterface{}
7 |
8 | public enum DataAttributeType implements DataInterface{
9 | ORDERID(1, "orderid"), CONTRACTNO(2, "contractno"), TERMID(3, "termid"), LOANPAN(4, "loanpan"), RETURNPAN(5, "returnpan"),
10 | INSERTTIME(6, "inserttime"), RECOMMEND(7, "recommend"), USERID(8, "userid"), DEVICEID(9, "deviceid"),
11 | CERTNO(10, "certno"), EMAIL(11, "email"), COMPANY(12, "company"), MOBILE(13, "mobile"), COMPADDR(14, "compaddr"),
12 | COMPPHONE(15, "compphone"), EMERGENCYCONTACTMOBILE(16, "emergencycontactmobile"),
13 | CONTACTMOBILE(17, "contactmobile"), IPV4(18, "ipv4"), MSGPHONE(19, "msgphone"), TELECODE(20, "telecode");
14 | // fields
15 | private int sequence;
16 | private String name;
17 |
18 | // constructor
19 | private DataAttributeType(int sequence, String name) {
20 | this.sequence = sequence;
21 | this.name = name;
22 | }
23 |
24 | // look up the attribute name by its sequence number
25 | public static String getColorName(int sequence) {
26 | for (DataAttributeType c : DataAttributeType.values()) {
27 | if (c.getSequence() == sequence)
28 | return c.name;
29 | }
30 | return null;
31 | }
32 |
33 | // getters & setters
34 | public int getSequence() {
35 | return sequence;
36 | }
37 |
38 | public void setSequence(int sequence) {
39 | this.sequence = sequence;
40 | }
41 |
42 | public String getName() {
43 | return name;
44 | }
45 |
46 | public void setName(String name) {
47 | this.name = name;
48 | }
49 | }
50 |
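For reference, a tiny sketch (not in the repository) of the sequence-to-name lookup defined above; the demo object name is made up and the printed values follow directly from the enum constants.

import com.lakala.datacenter.enums.DataAttributeType

object DataAttributeTypeDemo {
  def main(args: Array[String]): Unit = {
    println(DataAttributeType.MOBILE.getSequence) // 13
    println(DataAttributeType.getColorName(13))   // "mobile"
    println(DataAttributeType.getColorName(99))   // null when no sequence matches
  }
}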
--------------------------------------------------------------------------------
/neo4j/src/main/java/com/lakala/datacenter/enums/GraphEnum.java:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.enums;
2 |
3 |
4 | /**
5 | * Created by Administrator on 2017/7/11 0011.
6 | */
7 | public enum GraphEnum {
8 | TERMINAL("terminal", RelationshipTypes.terminal), BANKCARD("bankcard", RelationshipTypes.bankcard);
9 | private String relType;
10 | private RelationshipTypes relationshipTypes;
11 |
12 |
13 | private GraphEnum(String relType, RelationshipTypes relationshipTypes) {
14 | this.relType = relType;
15 | this.relationshipTypes = relationshipTypes;
16 | }
17 |
18 | public String getRelType() {
19 | return relType;
20 | }
21 |
22 | public RelationshipTypes getRelationshipTypes(String relType) {
23 | for (GraphEnum ge : GraphEnum.values()) {
24 | if (ge.relType.equals(relType)) return ge.relationshipTypes;
25 | continue;
26 | }
27 | return null;
28 | }
29 |
30 | public RelationshipTypes getRelationshipTypes() {
31 | return relationshipTypes;
32 | }
33 |
34 | }
35 |
--------------------------------------------------------------------------------
/neo4j/src/main/java/com/lakala/datacenter/enums/Labels.java:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.enums;
2 |
3 | import org.neo4j.graphdb.Label;
4 |
5 | /**
6 | * Created by Administrator on 2017/5/31 0031.
7 | */
8 | public enum Labels implements Label {
9 | ApplyInfo, Terminal, BankCard, Mobile, Identification, Email, Company, CompanyAddress, CompanyTel, Device, IPV4
10 | }
11 |
--------------------------------------------------------------------------------
/neo4j/src/main/java/com/lakala/datacenter/enums/RelationshipTypes.java:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.enums;
2 |
3 | import org.neo4j.graphdb.RelationshipType;
4 |
5 | /**
6 | * Created by Administrator on 2017/5/31 0031.
7 | */
8 | public enum RelationshipTypes implements RelationshipType {
9 | terminal, bankcard, loginmobile, ipv4, applymymobile, hometel, recommend, identification, email, company, companyaddress, companytel, emergencymobile,merchantmobile,channelmobile,relativemobile, relativecontact, device;
10 | }
11 |
--------------------------------------------------------------------------------
/neo4j/src/main/resources/css/style.css:
--------------------------------------------------------------------------------
1 | graph {
2 | fill-color: white;
3 | }
4 | node {
5 | size: 65;
6 | fill-color: #CCCCCC, #AAAAAA;
7 | fill-mode: gradient-radial;
8 | text-offset: 0, 0;
9 | stroke-mode: plain;
10 | stroke-color: #333333;
11 | }
12 | node:clicked {
13 | fill-color: #2277FF, #88AAFF;
14 | fill-mode: gradient-radial;
15 | size: 100;
16 | text-size:18;
17 | text-offset: 0, 0;
18 | }
19 | edge {
20 | text-alignment: along;
21 | }
22 |
--------------------------------------------------------------------------------
/neo4j/src/main/resources/dev/config.properties:
--------------------------------------------------------------------------------
1 | neoIP=bolt://192.168.0.33:7687
2 | user=neo4j
3 | password=123456
4 | #************redis config **********
5 | redisIp=192.168.0.192:6380,192.168.0.192:6381,192.168.0.192:6382,192.168.0.192:6383,192.168.0.192:6384,192.168.0.192:6385
6 | psubscribe=testsub11
--------------------------------------------------------------------------------
/neo4j/src/main/resources/dev/hive-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 |     <property>
4 |         <name>hive.metastore.uris</name>
5 |         <value>thrift://192.168.0.212:9083</value>
6 |         <description>Thrift uri for the remote metastore. Used by metastore client to connect to remote metastore.</description>
7 |     </property>
8 | </configuration>
--------------------------------------------------------------------------------
/neo4j/src/main/resources/log4j.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/neo4j/src/main/resources/product/config.properties:
--------------------------------------------------------------------------------
1 | neoIP=bolt://10.16.65.15:7688
2 | user=neo4j
3 | password=123456
4 | #************redis config **********
5 | redisIp=10.0.8.170:6800,10.0.8.170:6801,10.0.8.171:6800,10.0.8.171:6801,10.0.8.172:6800,10.0.8.172:6801
6 | psubscribe=dataPlatform.anti_fraud.order_monitor
--------------------------------------------------------------------------------
/neo4j/src/main/resources/test/config.properties:
--------------------------------------------------------------------------------
1 | neoIP=http://192.168.0.33:7474/db/data
2 | user=neo4j
3 | password=123456
4 | #************redis config **********
5 | redisIp=192.168.0.192:6380,192.168.0.192:6381,192.168.0.192:6382,192.168.0.192:6383,192.168.0.192:6384,192.168.0.192:6385
6 | psubscribe=testsub11
--------------------------------------------------------------------------------
/neo4j/src/main/scala/com/lakala/datacenter/abstractions/DataGenerator.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.abstractions
2 |
3 | import com.lakala.datacenter.utils.Config
4 |
5 | /**
6 | * Created by Administrator on 2017/5/31 0031.
7 | */
8 | trait DataGenerator {
9 | def generateUsers(config: Config): Unit
10 | }
11 |
--------------------------------------------------------------------------------
/neo4j/src/main/scala/com/lakala/datacenter/load/spark/ClusterGraphDatabase.scala:
--------------------------------------------------------------------------------
1 | /*
2 | package com.lakala.datacenter.load.spark
3 |
4 | import com.lakala.datacenter.enums.Labels
5 | import org.neo4j.graphdb.index.IndexHits
6 | import org.neo4j.graphdb.{Node, Relationship}
7 | import org.neo4j.helpers.collection.MapUtil
8 | import org.neo4j.index.impl.lucene.legacy.LuceneIndexImplementation
9 | import org.neo4j.rest.graphdb.index.RestIndex
10 | import org.neo4j.rest.graphdb.query.RestCypherQueryEngine
11 | import org.neo4j.rest.graphdb.{RestAPI, RestAPIFacade}
12 |
13 | /**
14 | * Created by Administrator on 2017/6/19 0019.
15 | */
16 | object ClusterGraphDatabase {
17 | private var restAPI: RestAPI = null
18 | private val serverBaseUrl = "http://192.168.0.33:7474/db/data"
19 | private val user = "neo4j"
20 | private val password = "123456"
21 |
22 | def main(args: Array[String]): Unit = {
23 | try
24 | setUp
25 | countExistingNodes
26 | tearDown
27 | }
28 |
29 | @throws[Throwable]
30 | def setUp(): Unit = {
31 | restAPI = new RestAPIFacade(serverBaseUrl, user, password)
32 | validateServerIsUp()
33 | val queryEngine = new RestCypherQueryEngine(restAPI)
34 | // graphdb = queryEngine.asInstanceOf[GraphDatabaseService]
35 | }
36 |
37 | @throws[Throwable]
38 | private def validateServerIsUp() = {
39 | try
40 | restAPI.getAllLabelNames
41 | catch {
42 | case e: Throwable =>
43 | println(" !!!!!!!!!!!!!!!! NOTE !!!!!!!!!!!!!!!!!!!!!!!! \n" + "this test assumes a Neo4j Server is running in a separate process \n" + "on localhost port 7474. You will need to manually start it before \n" + "running these demo tests.")
44 | throw e
45 | }
46 | }
47 |
48 | def tearDown(): Unit = {
49 | restAPI.close()
50 | }
51 |
52 |
53 | def countExistingNodes(): Unit = {
54 | //472
55 | val node2 = restAPI.getNodeById(293)
56 | println(node2.getLabels.iterator().next().name())
57 | val indexs = restAPI.createIndex(classOf[Node], "orderno", LuceneIndexImplementation.EXACT_CONFIG)
58 | val relIndex: RestIndex[Relationship] = restAPI.createIndex(classOf[Relationship], "terminal", LuceneIndexImplementation.EXACT_CONFIG)
59 | val terminalIndexs = restAPI.createIndex(classOf[Node], "content", LuceneIndexImplementation.EXACT_CONFIG)
60 |
61 | val hitIndex2: IndexHits[Node] = indexs.get("orderno", "XNA20170617214709013851193476043")
62 | val hitIndex: IndexHits[Node] = terminalIndexs.get("content", "CBC3A110160228103")
63 | println(hitIndex2.size())
64 | println(hitIndex2.getSingle)
65 | println("#################")
66 | println(hitIndex.size())
67 | println(hitIndex.getSingle)
68 | val applyNode = restAPI.getOrCreateNode(indexs, "orderno", "XNA20170617214709013851193476043", MapUtil.map("term_id", "CBC3A110160228103"))
69 | applyNode.addLabel(Labels.ApplyInfo)
70 | println(applyNode.getLabels.iterator().next().name())
71 | applyNode.setProperty("orderno", "XNA20170617214709013851193476043")
72 | // applyNode.setProperty("term_id", "CBC3A110160228103")
73 | applyNode.setProperty("modelname", Labels.ApplyInfo)
74 |
75 | val terminalNode = restAPI.getOrCreateNode(terminalIndexs, "content", "CBC3A110160228103", MapUtil.map())
76 | terminalNode.addLabel(Labels.Terminal)
77 | terminalNode.setProperty("modelname", Labels.Terminal)
78 | println(terminalNode.getLabels.iterator().next().name())
79 | val rel = restAPI.getOrCreateRelationship(relIndex, "", "", applyNode, terminalNode, "terminal", MapUtil.map())
80 |
81 | if (applyNode != null) {
82 | println("====================")
83 | println("apply node " + applyNode.getId() + " terminal node " + terminalNode.getId + " relationship " + rel.getId + " is created.")
84 | }
85 |
86 | }
87 | }
88 | */
89 |
--------------------------------------------------------------------------------
/neo4j/src/main/scala/com/lakala/datacenter/load/spark/LoadHiveData.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.load.spark
2 |
3 | import com.lakala.datacenter.core.utils.UtilsToos
4 | import org.apache.commons.lang3.StringUtils
5 | import org.apache.spark.sql.hive.HiveContext
6 | import org.apache.spark.{SparkConf, SparkContext}
7 |
8 | /**
9 |  * Created by Administrator on 2017/5/31 0031.
10 | */
11 | object LoadHiveData {
12 | def main(args: Array[String]): Unit = {
13 | val conf = new SparkConf().setAppName("LoadHiveData")
14 | val sc = new SparkContext(conf)
15 | val hc = new HiveContext(sc)
16 | hc.sql("use creditloan")
17 | val sql =
18 | s"""select a.order_id,a.contract_no,a.term_id,a.loan_pan,a.return_pan,a.insert_time,a.recommend,a.user_id,b.cert_no,b.email,b.company,b.mobile,b.comp_addr,b.comp_phone,b.emergency_contact_mobile,b.contact_mobile,c.device_id
19 | |from creditloan.s_c_loan_apply a
20 | | left join creditloan.s_c_apply_user b on a.user_id =b.id and (a.year="2017" and a.month="05" and a.day="31") and (b.year="2017" and b.month="05" and b.day="31")
21 | | left join creditloan.s_c_loan_deviceidauth c on a.order_id =c.order_no and (a.year="2017" and a.month="05" and a.day="31") and (c.year="2017" and c.month="05" and c.day="31") """.stripMargin
22 |
23 | val df = hc.sql(sql)
24 | val lineRDD = df.mapPartitions { rows =>
25 | rows.map { row =>
26 | val orderId = row.getAs[String]("order_id")
27 | val contractNo = if (StringUtils.isNotBlank(row.getAs[String]("contract_no"))) row.getAs[String]("contract_no") else ""
28 | val termId = if (StringUtils.isNotBlank(row.getAs[String]("term_id"))) row.getAs[String]("term_id") else ""
29 | val loanPan = if (StringUtils.isNotBlank(row.getAs[String]("loan_pan"))) row.getAs[String]("loan_pan") else ""
30 | val returnPan = if (StringUtils.isNotBlank(row.getAs[String]("return_pan"))) row.getAs[String]("return_pan") else ""
31 | val insertTime = if (StringUtils.isNotBlank(row.getAs[String]("insert_time"))) row.getAs[String]("insert_time") else ""
32 | val recommend = if (StringUtils.isNotBlank(row.getAs[String]("recommend")) && UtilsToos.isMobileOrPhone(row.getAs[String]("recommend"))) row.getAs[String]("recommend") else ""
33 | val userId = if (StringUtils.isNotBlank(row.getAs[String]("user_id"))) row.getAs[String]("user_id") else ""
34 | val certNo = if (StringUtils.isNotBlank(row.getAs[String]("cert_no"))) row.getAs[String]("cert_no") else ""
35 | val email = if (StringUtils.isNotBlank(row.getAs[String]("email"))) row.getAs[String]("email") else ""
36 | val company = if (StringUtils.isNotBlank(row.getAs[String]("company"))) row.getAs[String]("company") else ""
37 | val mobile = if (StringUtils.isNotBlank(row.getAs[String]("mobile")) && UtilsToos.isMobileOrPhone(row.getAs[String]("mobile"))) row.getAs[String]("mobile") else ""
38 | val compAddr = if (StringUtils.isNotBlank(row.getAs[String]("comp_addr"))) row.getAs[String]("comp_addr") else ""
39 | val compPhone = if (StringUtils.isNotBlank(row.getAs[String]("comp_phone"))) row.getAs[String]("comp_phone") else ""
40 | val emergencyContactMobile = if (StringUtils.isNotBlank(row.getAs[String]("emergency_contact_mobile"))) row.getAs[String]("emergency_contact_mobile") else ""
41 | val contactMobile = if (StringUtils.isNotBlank(row.getAs[String]("contact_mobile"))) row.getAs[String]("contact_mobile") else ""
42 | val deviceId = if (StringUtils.isNotBlank(row.getAs[String]("device_id"))) row.getAs[String]("device_id") else ""
43 | s"$orderId,$contractNo,$termId,$loanPan,$returnPan,$insertTime,$recommend,$userId,$certNo,$email,$company,$mobile,$compAddr,$compPhone,$emergencyContactMobile,$contactMobile,$deviceId"
44 | }
45 | }
46 |
47 | lineRDD.saveAsTextFile(args(1))
48 |
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/neo4j/src/main/scala/com/lakala/datacenter/load/spark/Neo4jConfig.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.load.spark
2 |
3 | import org.apache.spark.SparkConf
4 | import org.neo4j.driver.v1.{AuthTokens, Config, Driver, GraphDatabase}
5 |
6 | /**
7 | * @author lys
8 | * @since 02.03.16
9 | */
10 | case class Neo4jConfig(val url: String, val user: String = "neo4j", val password: Option[String] = None) {
11 |
12 | def boltConfig() = Config.build.withEncryptionLevel(Config.EncryptionLevel.NONE).toConfig
13 |
14 | def driver(config: Neo4jConfig): Driver = config.password match {
15 | case Some(pwd) => GraphDatabase.driver(config.url, AuthTokens.basic(config.user, pwd), boltConfig())
16 | case _ => GraphDatabase.driver(config.url, boltConfig())
17 | }
18 |
19 | def driver(): Driver = driver(this)
20 |
21 | def driver(url: String): Driver = GraphDatabase.driver(url, boltConfig())
22 |
23 | }
24 |
25 | object Neo4jConfig {
26 | val prefix = "spark.neo4j.bolt."
27 |
28 | def apply(sparkConf: SparkConf): Neo4jConfig = {
29 | val url = sparkConf.get(prefix + "url", "bolt://192.168.0.33:7687")
30 | val user = sparkConf.get(prefix + "user", "neo4j")
31 | val password: Option[String] = Option(sparkConf.get(prefix + "password", "123456"))
32 | Neo4jConfig(url, user, password)
33 | }
34 | }
35 |
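A short sketch (not part of the file) showing how the spark.neo4j.bolt.* settings feed this class; the URL and credentials below are just the same defaults used above and would normally come from the deployment environment.

import org.apache.spark.SparkConf
import com.lakala.datacenter.load.spark.Neo4jConfig

object Neo4jConfigDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .set("spark.neo4j.bolt.url", "bolt://192.168.0.33:7687")
      .set("spark.neo4j.bolt.user", "neo4j")
      .set("spark.neo4j.bolt.password", "123456")

    // Neo4jConfig(conf) falls back to the same defaults if the keys are absent
    val driver = Neo4jConfig(conf).driver()
    val session = driver.session()
    try {
      println(session.run("MATCH (n) RETURN count(n) AS c").single().get("c").asLong())
    } finally {
      session.close()
      driver.close()
    }
  }
}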
--------------------------------------------------------------------------------
/neo4j/src/main/scala/com/lakala/datacenter/load/spark/Neo4jJavaIntegration.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.load.spark
2 |
3 | import java.util
4 |
5 | import org.apache.spark.SparkContext
6 | import org.apache.spark.api.java.JavaRDD
7 | import org.apache.spark.sql.SQLContext
8 |
9 | import scala.collection.JavaConverters._
10 |
11 | /**
12 | * @author lys
13 | * @since 19.03.16
14 | */
15 | object Neo4jJavaIntegration {
16 | def rowRDD(sc: SparkContext, query: String, parameters: java.util.Map[String, AnyRef]) =
17 | new Neo4jRowRDD(sc, query, if (parameters == null) Seq.empty else parameters.asScala.toSeq).toJavaRDD()
18 |
19 | def tupleRDD(sc: SparkContext, query: String, parameters: java.util.Map[String, AnyRef]): JavaRDD[util.Map[String, AnyRef]] = {
20 | val params = if (parameters == null) Seq.empty else parameters.asScala.toSeq
21 | Neo4jTupleRDD(sc, query, params)
22 | .map((t) => new util.LinkedHashMap[String, AnyRef](t.toMap.asJava).asInstanceOf[util.Map[String, AnyRef]])
23 | .toJavaRDD()
24 | }
25 |
26 | def dataFrame(sqlContext: SQLContext, query: String, parameters: java.util.Map[String, AnyRef], schemaInfo: util.Map[String, String]) = {
27 | Neo4jDataFrame(sqlContext, query, parameters.asScala.toSeq, schemaInfo.asScala.toSeq: _*)
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/neo4j/src/main/scala/com/lakala/datacenter/load/spark/Neo4jPartition.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.load.spark
2 |
3 | import org.apache.spark.Partition
4 |
5 | /**
6 | * @author lys
7 | * @since 02.03.16
8 | */
9 | // , val lower: Long = 0, val upper: Long = 0 -> paging for cypher queries with skip / limit
10 | class Neo4jPartition(idx: Long = 0, skip : Long = 0, limit : Long = Long.MaxValue) extends Partition {
11 | override def index: Int = idx.toInt
12 | val window : Map[String,Any] = Map("_limit" -> limit, "_skip" -> skip)
13 |
14 | override def toString: String = s"Neo4jRDD index $index skip $skip limit: $limit"
15 | }
16 |
--------------------------------------------------------------------------------
/neo4j/src/main/scala/com/lakala/datacenter/load/spark/Neo4jRowRDD.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.load.spark
2 |
3 | import org.apache.spark._
4 | import org.apache.spark.rdd.RDD
5 | import org.apache.spark.sql.Row
6 | import org.neo4j.driver.v1._
7 |
8 | import scala.collection.JavaConverters._
9 |
10 | class Neo4jRowRDD(@transient sc: SparkContext, val query: String, val parameters: Seq[(String, Any)])
11 | extends RDD[Row](sc, Nil) {
12 |
13 | private val config = Neo4jConfig(sc.getConf)
14 |
15 | override def compute(split: Partition, context: TaskContext): Iterator[Row] = {
16 | val driver = config.driver()
17 | val session = driver.session()
18 |
19 | val result: StatementResult = session.run(query, parameters.toMap.mapValues(_.asInstanceOf[AnyRef]).asJava)
20 |
21 | result.asScala.map((record) => {
22 | val keyCount = record.size()
23 |
24 | val res = if (keyCount == 0) Row.empty
25 | else if (keyCount == 1) Row(record.get(0).asObject())
26 | else {
27 | val builder = Seq.newBuilder[AnyRef]
28 | var i = 0
29 | while (i < keyCount) {
30 | builder += record.get(i).asObject()
31 | i = i + 1
32 | }
33 | Row.fromSeq(builder.result())
34 | }
35 | if (!result.hasNext) {
36 | session.close()
37 | driver.close()
38 | }
39 | res
40 | })
41 | }
42 |
43 | override protected def getPartitions: Array[Partition] = Array(new Neo4jPartition())
44 | }
45 |
46 | object Neo4jRowRDD {
47 | def apply(sc: SparkContext, query: String, parameters: Seq[(String, Any)] = Seq.empty) = new Neo4jRowRDD(sc, query, parameters)
48 | }
49 |
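A minimal driver sketch (not in the repository) for the RDD above; the Cypher query, its {limit} parameter, and the local master setting are illustrative, assuming an ApplyInfo label with an orderno property as used elsewhere in this module.

import org.apache.spark.{SparkConf, SparkContext}
import com.lakala.datacenter.load.spark.Neo4jRowRDD

object Neo4jRowRDDDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("Neo4jRowRDDDemo")
      .setMaster("local[2]")
      .set("spark.neo4j.bolt.url", "bolt://192.168.0.33:7687")
    val sc = new SparkContext(conf)

    // Each Cypher result record becomes one Row
    val rows = Neo4jRowRDD(sc, "MATCH (a:ApplyInfo) RETURN a.orderno LIMIT {limit}", Seq("limit" -> 10))
    rows.collect().foreach(println)

    sc.stop()
  }
}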
--------------------------------------------------------------------------------
/neo4j/src/main/scala/com/lakala/datacenter/load/spark/Neo4jTupleRDD.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.load.spark
2 |
3 | import org.apache.spark._
4 | import org.apache.spark.rdd.RDD
5 | import org.neo4j.driver.v1.Driver
6 |
7 | import scala.collection.JavaConverters._
8 |
9 | class Neo4jTupleRDD(@transient sc: SparkContext, val query: String, val parameters: Seq[(String, AnyRef)])
10 | extends RDD[Seq[(String, AnyRef)]](sc, Nil) {
11 |
12 | private val config = Neo4jConfig(sc.getConf)
13 |
14 | override def compute(split: Partition, context: TaskContext): Iterator[Seq[(String, AnyRef)]] = {
15 | val driver: Driver = config.driver()
16 | val session = driver.session()
17 |
18 | val result = session.run(query, parameters.toMap.asJava)
19 |
20 | result.asScala.map( (record) => {
21 | val res = record.asMap().asScala.toSeq
22 | if (!result.hasNext) {
23 | session.close()
24 | driver.close()
25 | }
26 | res
27 | })
28 | }
29 |
30 | override protected def getPartitions: Array[Partition] = Array(new Neo4jPartition())
31 | }
32 |
33 | object Neo4jTupleRDD {
34 | def apply(sc: SparkContext, query: String, parameters: Seq[(String,AnyRef)] = Seq.empty) = new Neo4jTupleRDD(sc, query, parameters)
35 | }
36 |
37 |
38 |
--------------------------------------------------------------------------------
/neo4j/src/main/scala/com/lakala/datacenter/main/Main.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.main
2 |
3 | import java.io.File
4 |
5 | import com.lakala.datacenter.enums.RelationshipTypes
6 | import com.lakala.datacenter.grogram.Neo4jDataGenerator
7 | import com.lakala.datacenter.utils.Config
8 | import org.joda.time.DateTime
9 | import org.neo4j.graphdb.factory.GraphDatabaseFactory
10 | import org.neo4j.graphdb.{Direction, GraphDatabaseService}
11 | import org.neo4j.io.fs.FileUtils
12 | import org.slf4j.LoggerFactory
13 |
14 | /**
15 | * Created by Administrator on 2017/5/31 0031.
16 | */
17 | object Main {
18 | private val logger = LoggerFactory.getLogger("Main")
19 | val COUNT = 100000 // batch size for bulk commits
20 | //F:\tmp\applydir F:\tmp\neo4j\tmp01
21 | val FRIENDS_PER_USER = 50
22 |
23 | def main(args: Array[String]): Unit = {
24 | val mainTime = DateTime.now()
25 | println("start generateGraphData time " + DateTime.now())
26 | // checkArgs(args) 13199050
27 | // val config = ArgsCommon.parseArgs(args)
28 | val config = new Config()
29 | config.input = args(0)
30 | config.output = args(1)
31 | generateGraphData(config)
32 | val endtime = DateTime.now()
33 | println("end generateGraphData time " + endtime + ", elapsed " + (endtime.getMillis - mainTime.getMillis) / 1000 + "s")
34 | }
35 |
36 | def generateGraphData(config: Config): Unit = {
37 | FileUtils.deleteRecursively(new File(config.output + "/" + config.neo4jDB))
38 | var graphdb = new GraphDatabaseFactory().newEmbeddedDatabase(new File(config.output + "/" + config.neo4jDB))
39 | val neo4jDataGenerator = new Neo4jDataGenerator(graphdb)
40 | // generate the graph data
41 | neo4jDataGenerator.generateUsers(config)
42 | registerShutdownHook(graphdb)
43 | }
44 |
45 | /**
46 | * START SNIPPET: shutdownHook
47 | * @param graph
48 | */
49 | def registerShutdownHook(graph: GraphDatabaseService): Unit = {
50 | Runtime.getRuntime.addShutdownHook(new Thread() {
51 | override def run(): Unit = {
52 | graph.shutdown()
53 | }
54 | })
55 | }
56 |
57 | def checkArgs(args: Array[String]): Unit = {
58 | if (args.length < 1) {
59 | println("Usage: class com.lakala.datacenter.grogress.ExportNDegreeData$ [options]\n" +
60 | "[=....]\n " +
61 | "-i | --Input \n applyInput file or path Required.\n " +
62 | "-o | --output \n output path Required\n " +
63 | "-m | --master \n spark master, local[N] or spark://host:port default=local\n " +
64 | "-h | --sparkhome \n SPARK_HOME Required to run on cluster\n " +
65 | "-n | --jobname \n job name\n " +
66 | "-s | --startDate \n use start date load data\n " +
67 | "-t | --endDate \n use end date load data\n " +
68 | "-p | --parallelism \n sets spark.default.parallelism and minSplits on the edge file. default=based on input partitions\n " +
69 | "-x | --minprogress \n Number of vertices that must change communites for the algorithm to consider progress. default=2000\n " +
70 | "-y | --progresscounter \n Number of times the algorithm can fail to make progress before exiting. default=1\n " +
71 | "-d | --edgedelimiter \n specify input file edge delimiter. default=\",\"\n " +
72 | "-j | --jars \n comma seperated list of jars\n " +
73 | "-e | --encrypy \n Set to true to all data convert encrypy need all data use google hash's MD5 generage Long ids. Defaults to false\n " +
74 | "-b | --blacType \n Set to true to exprot black result data, Defaults to false\n " +
75 | " =.... ")
76 | sys.exit(1)
77 | }
78 | }
79 |
80 | }
81 |
--------------------------------------------------------------------------------
/neo4j/src/main/scala/com/lakala/datacenter/main/MessageParam.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.main
2 |
3 | import kafka.consumer.KafkaStream
4 | import org.neo4j.driver.v1.Session
5 | import redis.clients.jedis.JedisCluster
6 |
7 | /**
8 | * Created by Administrator on 2017/8/7 0007.
9 | */
10 | case class MessageParam(m_stream: KafkaStream[_, _], m_threadNumber: Int, redis: JedisCluster,
11 | session: Session, sessionBak: Session, psubscribe: String) {
12 |
13 | }
14 |
--------------------------------------------------------------------------------
/neo4j/src/main/scala/com/lakala/datacenter/main/TrialConsumerKafka.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.main
2 |
3 | import java.util.Properties
4 | import java.util.concurrent.{ExecutorService, Executors, TimeUnit}
5 |
6 | import com.lakala.datacenter.constant.StreamingConstant
7 | import com.lakala.datacenter.utils.RedisUtils
8 | import com.lakala.datacenter.utils.UtilsTools.properties
9 | import kafka.consumer.{ConsumerConfig, ConsumerConnector, KafkaStream}
10 | import kafka.serializer.StringDecoder
11 | import kafka.utils.VerifiableProperties
12 | import org.apache.commons.lang3.StringUtils.trim
13 | import org.neo4j.driver.v1.{AuthTokens, Driver, GraphDatabase}
14 | import redis.clients.jedis.JedisCluster
15 |
16 | import scala.collection.Map
17 |
18 | /**
19 | * Created by Administrator on 2017/8/7 0007.
20 | *
21 | */
22 |
23 | object TrialConsumerKafka{
24 | def main(args: Array[String]): Unit = {
25 | val zooKeeper: String = args(0)
26 | val groupId: String = args(1)
27 | val topic: String = args(2)
28 | val threads: Int = args(3).toInt
29 | println("start trial consumer kafaka message .....")
30 | val example:TrialConsumerKafka= new TrialConsumerKafka(zooKeeper, groupId, topic)
31 | example.run(threads)
32 |
33 | try {
34 | Thread.sleep(10000)
35 | } catch {
36 | case ie: InterruptedException =>
37 | println("==============")
38 | }
39 |
40 | }
41 | }
42 |
43 | class TrialConsumerKafka {
44 | private var consumer: ConsumerConnector = null
45 | private var topic: String = null
46 | private var executor: ExecutorService = null
47 | private var driver: Driver = null
48 | private var redis: JedisCluster = RedisUtils.jedisCluster()
49 | val properies = properties(StreamingConstant.CONFIG)
50 | def this(a_zookeeper: String, a_groupId: String, a_topic: String) {
51 | this()
52 | this.topic = a_topic
53 | consumer = kafka.consumer.Consumer.create(createConsumerConfig(a_zookeeper, a_groupId))
54 | driver = GraphDatabase.driver(trim(properies.getProperty(StreamingConstant.NEOIP)), AuthTokens.basic(trim(properies.getProperty(StreamingConstant.USER)), trim(properies.getProperty(StreamingConstant.PASSWORD))))
55 | }
56 |
57 | def shutdown(): Unit = {
58 | if (consumer != null) consumer.shutdown()
59 | if (executor != null) {
60 | executor.shutdown()
61 | try {
62 | if (!executor.awaitTermination(5000, TimeUnit.MILLISECONDS)) System.out.println("Timed out waiting for consumer threads to shut down, exiting uncleanly")
63 | } catch {
64 | case e: InterruptedException => System.out.println("Interrupted during shutdown, exiting uncleanly")
65 | }
66 | }
67 | }
68 |
69 | def run(a_numThreads: Int): Unit = {
70 | val topicCountMap = Map(topic -> a_numThreads)
71 | // val topicCountMap = Map(topic -> 1)
72 | val keyDecoder = new StringDecoder(new VerifiableProperties)
73 | val valueDecoder = new StringDecoder(new VerifiableProperties)
74 | val consumerMap: Map[String, List[KafkaStream[String, String]]] = consumer.createMessageStreams(topicCountMap, keyDecoder, valueDecoder)
75 | val streams: List[KafkaStream[String, String]] = consumerMap.get(topic).get
76 |
77 | executor = Executors.newFixedThreadPool(a_numThreads)
78 | var threadNumber = 0
79 | streams.foreach { stream =>
80 | executor.submit(new HandleTask(MessageParam(stream, threadNumber, redis,
81 | driver.session, driver.session, properies.getProperty(StreamingConstant.PSUBSCRIBE))))
82 | threadNumber += 1
83 | }
84 | }
85 |
86 | private def createConsumerConfig(a_zookeeper: String, a_groupId: String): ConsumerConfig = {
87 | val props = new Properties()
88 | props.put("zookeeper.connect", a_zookeeper)
89 | props.put("group.id", a_groupId)
90 | props.put("zookeeper.session.timeout.ms", "60000")
91 | props.put("zookeeper.sync.time.ms", "200")
92 | props.put("auto.commit.interval.ms", "1000")
93 | props.put("auto.offset.reset", "smallest")
94 | props.put("rebalance.max.retries", "5")
95 | props.put("rebalance.backoff.ms", "12000")
96 | props.put("serializer.class", "kafka.serializer.StringEncoder")
97 | new ConsumerConfig(props)
98 | }
99 | }
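HandleTask is defined elsewhere in this module and is not shown here. As a hedged illustration of what such a per-stream worker usually looks like with the old high-level consumer API, a hypothetical ExampleHandleTask could be:

    import kafka.consumer.KafkaStream

    // Illustration only: not the project's HandleTask. It drains one KafkaStream
    // on its own thread and prints every decoded message.
    class ExampleHandleTask(stream: KafkaStream[String, String], threadNumber: Int) extends Runnable {
      override def run(): Unit = {
        val it = stream.iterator()
        while (it.hasNext()) {
          val msg = it.next() // MessageAndMetadata[String, String]
          println(s"thread $threadNumber partition ${msg.partition} offset ${msg.offset}: ${msg.message()}")
        }
        println(s"stream finished, shutting down thread $threadNumber")
      }
    }

run(a_numThreads) above submits one such task per stream, so a_numThreads should match the number of streams requested in topicCountMap.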
--------------------------------------------------------------------------------
/neo4j/src/main/scala/com/lakala/datacenter/realtimeBuildGraphx/MsgOffsetStreamListener.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.realtimeBuildGraphx
2 |
3 | import com.lakala.datacenter.constant.StreamingConstant
4 | import com.lakala.datacenter.utils.Config
5 | import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
6 | import org.I0Itec.zkclient.ZkClient
7 | import org.I0Itec.zkclient.exception.ZkMarshallingError
8 | import org.I0Itec.zkclient.serialize.ZkSerializer
9 | import org.apache.spark.Logging
10 | import org.apache.spark.streaming.Time
11 | import org.apache.spark.streaming.kafka.OffsetRange
12 | import org.apache.spark.streaming.scheduler.{StreamingListener, StreamingListenerBatchCompleted, StreamingListenerReceiverError, StreamingListenerReceiverStopped}
13 |
14 | import scala.collection.mutable
15 |
16 | /**
17 | * Created by Administrator on 2017/6/9 0009.
18 | */
19 | class MsgOffsetStreamListener(config: Config, offsetRanges: mutable.Map[Time, Array[OffsetRange]]) extends StreamingListener with Logging {
20 |
21 | var zkClient = getZkClient(config.zkIPs)
22 | // val zkUtils = ZkUtils.apply(zkClient,true)
23 | val topicDirs = new ZKGroupTopicDirs(config.group, config.topic)
24 |
25 | override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = {
26 | // a ZKGroupTopicDirs object was created above to hold the consumer offset path
27 | // check the child nodes under that path (by default they are the ones we created when saving offsets for the individual partitions)
28 | // println(batchCompleted.batchInfo.numRecords)
29 | if (batchCompleted.batchInfo.numRecords > 0) {
30 | val currOffsetRange = offsetRanges.remove(batchCompleted.batchInfo.batchTime).getOrElse(Array[OffsetRange]())
31 | currOffsetRange.foreach { x =>
32 | val zkPath = s"${topicDirs.consumerOffsetDir}/${x.partition}"
33 | // save this partition's offset to ZooKeeper
34 | // ZkUtils.apply(zkClient,true).updatePersistentPath(zkPath, s"${x.fromOffset}")
35 | ZkUtils.updatePersistentPath(zkClient, zkPath, s"${x.fromOffset}")
36 | println(s"zkPath:${zkPath} offset:fromOffset ${x.fromOffset} untilOffset ${x.untilOffset}")
37 | // logInfo(s"zkPath:${zkPath} offset:fromOffset ${x.fromOffset} untilOffset ${x.untilOffset}")
38 | }
39 | }
40 | }
41 |
42 | override def onReceiverError(receiverError: StreamingListenerReceiverError): Unit = {
43 | val topicDirs = new ZKGroupTopicDirs(config.group, config.topic)
44 | logError(s"ERROR:${receiverError.receiverInfo.lastError}\n Message:${receiverError.receiverInfo.lastErrorMessage}")
45 | val currOffsetRange = offsetRanges.remove(Time.apply(receiverError.receiverInfo.lastErrorTime)).getOrElse(Array[OffsetRange]())
46 | currOffsetRange.foreach { x =>
47 | val zkPath = s"${topicDirs.consumerOffsetDir}/${x.partition}"
48 | // ZkUtils.apply(zkClient,true).updatePersistentPath(zkPath, s"${x.fromOffset}")
49 | ZkUtils.updatePersistentPath(zkClient, zkPath, s"${x.fromOffset}")
50 | println(s"zkPath:${zkPath} offset:fromOffset ${x.fromOffset} untilOffset ${x.untilOffset}")
51 | // logInfo(s"zkPath:${zkPath} offset:fromOffset ${x.fromOffset} untilOffset ${x.untilOffset}")
52 | }
53 | }
54 |
55 | def getZkClient(zkServers: String, sessionTimeout: Int = 60000, connectionTimeout: Int = 60000): ZkClient = {
56 | val zkClient = new ZkClient(zkServers, sessionTimeout, connectionTimeout, new ZkSerializer {
57 | override def serialize(data: Object): Array[Byte] = {
58 | try {
59 | return data.toString().getBytes(StreamingConstant.CODE)
60 | } catch {
61 | case e: ZkMarshallingError => return null
62 |
63 | }
64 | }
65 | override def deserialize(bytes: Array[Byte]): Object = {
66 | try {
67 | return new String(bytes, StreamingConstant.CODE)
68 | } catch {
69 | case e: ZkMarshallingError => return null
70 | }
71 | }
72 | })
73 | zkClient
74 | }
75 |
76 | }
77 |
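A wiring sketch for this listener, assuming a Spark 1.x direct Kafka stream (KafkaUtils.createDirectStream from the kafka-0.8 connector); the Config instance is assumed to be populated with zkIPs/group/topic by the project's argument parsing, and the broker list is a placeholder:

    import com.lakala.datacenter.realtimeBuildGraphx.MsgOffsetStreamListener
    import com.lakala.datacenter.utils.Config
    import kafka.serializer.StringDecoder
    import org.apache.spark.SparkConf
    import org.apache.spark.rdd.RDD
    import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
    import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

    import scala.collection.mutable

    object OffsetListenerWiringSketch {
      def main(args: Array[String]): Unit = {
        val ssc = new StreamingContext(
          new SparkConf().setAppName("offsetListenerSketch").setMaster("local[2]"), Seconds(10))

        val config = new Config() // zkIPs, group and topic assumed to be filled in elsewhere
        val offsetRanges = mutable.Map[Time, Array[OffsetRange]]()
        ssc.addStreamingListener(new MsgOffsetStreamListener(config, offsetRanges))

        val kafkaParams = Map("metadata.broker.list" -> "192.168.0.211:9092,192.168.0.212:9092")
        val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
          ssc, kafkaParams, Set(config.topic))

        // remember each batch's offset ranges so the listener can persist them on batch completion
        stream.transform { (rdd: RDD[(String, String)], time: Time) =>
          offsetRanges += time -> rdd.asInstanceOf[HasOffsetRanges].offsetRanges
          rdd
        }.map(_._2).print()

        ssc.start()
        ssc.awaitTermination()
      }
    }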
--------------------------------------------------------------------------------
/neo4j/src/main/scala/com/lakala/datacenter/realtimeBuildGraphx/SendMsg.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.realtimeBuildGraphx
2 |
3 | /**
4 | * Created by Administrator on 2017/8/2 0002.
5 | */
6 | case class SendMsg(orderno: String, insert_time: String, cert_no: String) {
7 |
8 | }
9 |
--------------------------------------------------------------------------------
/neo4j/src/main/scala/com/lakala/datacenter/utils/RedisUtils.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.utils
2 |
3 |
4 | import java.util
5 |
6 | import com.lakala.datacenter.constant.StreamingConstant
7 | import com.lakala.datacenter.utils.UtilsTools.properties
8 | import redis.clients.jedis.{HostAndPort, JedisCluster}
9 |
10 | import scala.collection.JavaConversions
11 |
12 | /**
13 | * Created by Administrator on 2017/6/29 0029.
14 | */
15 | object RedisUtils {
16 | private var cluster: JedisCluster = _
17 | private val properies = properties(StreamingConstant.CONFIG)
18 |
19 | def jedisCluster(): JedisCluster = {
20 | if (cluster == null) {
21 | synchronized {
22 | if (cluster == null) {
23 | val cluseterNodesSet = for (ipAndPort <- properies.getProperty("redisIp").split(",")) yield
24 | new HostAndPort(ipAndPort.split(":")(0).trim, (ipAndPort.split(":")(1).trim).toInt)
25 | cluster = new JedisCluster(JavaConversions.setAsJavaSet[HostAndPort](cluseterNodesSet.toSet))
26 | }
27 | }
28 | }
29 | cluster
30 | }
31 | }
32 |
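A minimal usage sketch; the key name and payload are placeholders (the channel "testsub12" also appears in TestRedis further down), and a reachable Redis cluster configured via the redisIp property is assumed:

    import com.lakala.datacenter.utils.RedisUtils

    object RedisUtilsExample {
      def main(args: Array[String]): Unit = {
        val cluster = RedisUtils.jedisCluster() // lazily created once, then reused across calls
        cluster.set("demo:key", "demo-value")
        println(cluster.get("demo:key"))
        cluster.publish("testsub12", """{"orderno":"demo","insert_time":"2017-06-30 12:01:10"}""")
      }
    }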
--------------------------------------------------------------------------------
/neo4j/src/main/scala/com/lakala/datacenter/utils/UtilsTools.scala:
--------------------------------------------------------------------------------
1 | package com.lakala.datacenter.utils
2 |
3 | import java.io.Serializable
4 | import java.util.Properties
5 |
6 | import org.slf4j.LoggerFactory
7 |
8 | /**
9 | * Created by lenovo on 2016/8/10.
10 | */
11 | object UtilsTools {
12 | private val logger = LoggerFactory.getLogger(this.getClass)
13 |
14 | def properties(propertiesPath: String): Properties = {
15 | var _properties: Option[Properties] = None
16 | _properties match {
17 | case None => {
18 | logger.info("Loading configuration...")
19 | val inputStream = this.getClass.getClassLoader.getResourceAsStream(propertiesPath)
20 | val underlying = new Properties()
21 | underlying.load(inputStream)
22 | _properties = Some(underlying)
23 | underlying
24 | }
25 | case Some(underlying) => {
26 | underlying
27 | }
28 | }
29 | _properties.get
30 | }
31 |
32 |
33 | }
34 |
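Because the local `_properties` Option is recreated on every call, each invocation re-reads the file from the classpath; callers such as RedisUtils therefore cache the returned Properties in a val. A small usage sketch (the file name stands in for whatever StreamingConstant.CONFIG resolves to):

    import com.lakala.datacenter.utils.UtilsTools

    object UtilsToolsExample {
      def main(args: Array[String]): Unit = {
        val props = UtilsTools.properties("config.properties") // must be on the classpath
        println(props.getProperty("redisIp"))
      }
    }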
--------------------------------------------------------------------------------
/neo4j/src/test/java/ConsumerKafka.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Created by Administrator on 2017/8/7 0007.
3 | */
4 |
5 | import kafka.consumer.Consumer;
6 | import kafka.consumer.ConsumerConfig;
7 | import kafka.consumer.ConsumerIterator;
8 | import kafka.consumer.KafkaStream;
9 | import kafka.javaapi.consumer.ConsumerConnector;
10 | import kafka.message.MessageAndMetadata;
11 | import kafka.serializer.StringEncoder;
12 |
13 | import java.util.HashMap;
14 | import java.util.List;
15 | import java.util.Map;
16 | import java.util.Properties;
17 | import java.util.concurrent.ExecutorService;
18 | import java.util.concurrent.Executors;
19 |
20 | public class ConsumerKafka {
21 | private ConsumerConfig config;
22 | private String topic;
23 | private int partitionsNum;
24 | private MessageExecutor executor;
25 | private ConsumerConnector connector;
26 | private ExecutorService threadPool;
27 |
28 | public ConsumerKafka(String topic, int partitionsNum, MessageExecutor executor) throws Exception {
29 | Properties prop = new Properties();
30 | prop.put("auto.offset.reset", "smallest"); //必须要加,如果要读旧数据
31 | prop.put("zookeeper.connect", "192.168.0.208:2181,192.168.0.211:2181,192.168.0.212:2181");
32 | prop.put("serializer.class", StringEncoder.class.getName());
33 | prop.put("metadata.broker.list", "192.168.0.211:9092,192.168.0.212:9092");
34 | prop.put("group.id", "test-consumer-group");
35 | config = new ConsumerConfig(prop);
36 | this.topic = topic;
37 | this.partitionsNum = partitionsNum;
38 | this.executor = executor;
39 | }
40 |
41 | public void start() throws Exception {
42 | connector = Consumer.createJavaConsumerConnector(config);
43 | Map<String, Integer> topics = new HashMap<String, Integer>();
44 | topics.put(topic, partitionsNum);
45 | Map<String, List<KafkaStream<byte[], byte[]>>> streams = connector.createMessageStreams(topics);
46 | List<KafkaStream<byte[], byte[]>> partitions = streams.get(topic);
47 | threadPool = Executors.newFixedThreadPool(partitionsNum);
48 | for (KafkaStream<byte[], byte[]> partition : partitions) {
49 | threadPool.execute(new MessageRunner(partition));
50 | }
51 | }
52 |
53 |
54 | public void close() {
55 | try {
56 | threadPool.shutdownNow();
57 | } catch (Exception e) {
58 | //
59 | } finally {
60 | connector.shutdown();
61 | }
62 |
63 | }
64 |
65 | class MessageRunner implements Runnable {
66 | private KafkaStream<byte[], byte[]> partition;
67 |
68 | MessageRunner(KafkaStream<byte[], byte[]> partition) {
69 | this.partition = partition;
70 | }
71 |
72 | public void run() {
73 | ConsumerIterator<byte[], byte[]> it = partition.iterator();
74 | while (it.hasNext()) {
75 | MessageAndMetadata<byte[], byte[]> item = it.next();
76 | System.out.println("partiton:" + item.partition());
77 | System.out.println("offset:" + item.offset());
78 | executor.execute(new String(item.message()));//UTF-8
79 | }
80 | }
81 | }
82 |
83 | interface MessageExecutor {
84 |
85 | public void execute(String message);
86 | }
87 |
88 | /**
89 | * @param args
90 | */
91 | public static void main(String[] args) {
92 | ConsumerKafka consumer = null;
93 | try {
94 | MessageExecutor executor = new MessageExecutor() {
95 |
96 | public void execute(String message) {
97 | System.out.println(message);
98 | }
99 | };
100 | consumer = new ConsumerKafka("topic1", 3, executor);
101 | consumer.start();
102 | } catch (Exception e) {
103 | e.printStackTrace();
104 | } finally {
105 | if (consumer != null) {
106 | consumer.close();
107 | }
108 | }
109 |
110 | }
111 |
112 | }
113 |
--------------------------------------------------------------------------------
/neo4j/src/test/java/DataAttributeType.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Created by Administrator on 2017/6/16 0016.
3 | */
4 | interface DataInterface{}
5 |
6 | public enum DataAttributeType implements DataInterface {
7 | ORDERID(1, "orderid"), CONTRACTNO(2, "contractno"), TERMID(3, "termid"), LOANPAN(4, "loanpan"), RETURNPAN(5, "returnpan"),
8 | INSERTTIME(6, "inserttime"), RECOMMEND(7, "recommend"), USERID(8, "userid"), DEVICEID(9, "deviceid"),
9 | CERTNO(10, "certno"), EMAIL(11, "email"), COMPANY(12, "company"), MOBILE(13, "mobile"), COMPADDR(14, "compaddr"),
10 | COMPPHONE(15, "compphone"), EMERGENCYCONTACTMOBILE(16, "emergencycontactmobile"),
11 | CONTACTMOBILE(17, "contactmobile"), IPV4(18, "ipv4"), MSGPHONE(19, "msgphone"), TELECODE(20, "telecode");
12 | // fields
13 | private int sequence;
14 | private String name;
15 |
16 | // constructor
17 | private DataAttributeType(int sequence, String name) {
18 | this.sequence = sequence;
19 | this.name = name;
20 | }
21 |
22 | // custom lookup: return the attribute name for a given sequence
23 | public static String getColorName(int sequence) {
24 | for (DataAttributeType c : DataAttributeType.values()) {
25 | if (c.getSequence() == sequence)
26 | return c.name;
27 | }
28 | return null;
29 | }
30 |
31 | // getters & setters
32 | public int getSequence() {
33 | return sequence;
34 | }
35 |
36 | public void setSequence(int sequence) {
37 | this.sequence = sequence;
38 | }
39 |
40 | public String getName() {
41 | return name;
42 | }
43 |
44 | public void setName(String name) {
45 | this.name = name;
46 | }
47 | }
48 |
49 |
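Despite its copy-pasted name, getColorName simply returns the attribute name registered for a sequence number. A small usage sketch from Scala (both classes sit in the default package of the test sources):

    object DataAttributeTypeExample {
      def main(args: Array[String]): Unit = {
        println(DataAttributeType.MOBILE.getSequence) // 13
        println(DataAttributeType.getColorName(13))   // "mobile"
      }
    }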
--------------------------------------------------------------------------------
/neo4j/src/test/java/JavaKafkaSimpleConsumerAPITest.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Created by Administrator on 2017/6/21 0021.
3 | *//*
4 |
5 |
6 | import java.util.ArrayList;
7 | import java.util.List;
8 |
9 | */
10 | /**
11 | * Created by gerry on 12/21.
12 | *//*
13 |
14 | public class JavaKafkaSimpleConsumerAPITest {
15 | public static void main(String[] args) {
16 | JavaKafkaSimpleConsumerAPI example = new JavaKafkaSimpleConsumerAPI();
17 | long maxReads = 300;
18 | String topic = "logCollect_cleanData";
19 | int partitionID = 2;
20 |
21 | KafkaTopicPartitionInfo topicPartitionInfo = new KafkaTopicPartitionInfo(topic, partitionID);
22 | List<KafkaBrokerInfo> seeds = new ArrayList<KafkaBrokerInfo>();
23 | seeds.add(new KafkaBrokerInfo("192.168.0.211", 9092));
24 | seeds.add(new KafkaBrokerInfo("192.168.0.212", 9092));
25 |
26 | try {
27 | example.run(maxReads, topicPartitionInfo, seeds);
28 | } catch (Exception e) {
29 | e.printStackTrace();
30 | }
31 |
32 | // fetch the list of all partition IDs belonging to this topic
33 | System.out.println(example.fetchTopicPartitionIDs(seeds, topic, 100000, 64 * 1024, "client-id"));
34 | }
35 | }
36 | */
37 |
--------------------------------------------------------------------------------
/neo4j/src/test/java/KafkaBrokerInfo.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Kafka broker connection parameters
3 | * Created by Administrator on 2017/6/21 0021.
4 | */
5 |
6 | public class KafkaBrokerInfo {
7 | // host name
8 | public final String brokerHost;
9 | // port number
10 | public final int brokerPort;
11 |
12 | /**
13 | * Constructor
14 | *
15 | * @param brokerHost Kafka broker host name or IP address
16 | * @param brokerPort port number
17 | */
18 | public KafkaBrokerInfo(String brokerHost, int brokerPort) {
19 | this.brokerHost = brokerHost;
20 | this.brokerPort = brokerPort;
21 | }
22 |
23 | /**
24 | * 构造方法, 使用默认端口号9092进行构造
25 | *
26 | * @param brokerHost
27 | */
28 | public KafkaBrokerInfo(String brokerHost) {
29 | this(brokerHost, 9092);
30 | }
31 | }
--------------------------------------------------------------------------------
/neo4j/src/test/java/KafkaConsumer.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Created by Administrator on 2017/6/8 0008.
3 | */
4 |
5 | import kafka.consumer.ConsumerConfig;
6 | import kafka.consumer.ConsumerIterator;
7 | import kafka.consumer.KafkaStream;
8 | import kafka.javaapi.consumer.ConsumerConnector;
9 | import kafka.serializer.StringDecoder;
10 | import kafka.utils.VerifiableProperties;
11 |
12 | import java.util.HashMap;
13 | import java.util.List;
14 | import java.util.Map;
15 | import java.util.Properties;
16 |
17 | public class KafkaConsumer {
18 |
19 | private final ConsumerConnector consumer;
20 | // private String TOPIC ="topic_creditloan_orderinfo_wait_score";
21 | private String TOPIC ="logCollect_cleanData";
22 | private KafkaConsumer() {
23 | Properties props = new Properties();
24 | // ZooKeeper configuration
25 | props.put("zookeeper.connect", "192.168.0.208:2181,192.168.0.211:2181,192.168.0.212:2181");
26 |
27 | // group.id identifies a consumer group
28 | // props.put("group.id", "test-consumer-group125");
29 | props.put("group.id", "testcheatgraph");
30 |
31 | // ZooKeeper connection timeouts
32 | props.put("zookeeper.session.timeout.ms", "60000");
33 | props.put("zookeeper.sync.time.ms", "200");
34 | props.put("auto.commit.interval.ms", "1000");
35 | props.put("auto.offset.reset", "smallest");
36 | props.put("rebalance.max.retries", "5");
37 | props.put("rebalance.backoff.ms", "12000");
38 | // serializer class
39 | props.put("serializer.class", "kafka.serializer.StringEncoder");
40 |
41 | ConsumerConfig config = new ConsumerConfig(props);
42 |
43 | consumer = kafka.consumer.Consumer.createJavaConsumerConnector(config);
44 | }
45 |
46 | void consume() {
47 | Map<String, Integer> topicCountMap = new HashMap<String, Integer>();
48 | topicCountMap.put(TOPIC, new Integer(1));
49 |
50 | StringDecoder keyDecoder = new StringDecoder(new VerifiableProperties());
51 | StringDecoder valueDecoder = new StringDecoder(new VerifiableProperties());
52 |
53 | Map<String, List<KafkaStream<String, String>>> consumerMap =
54 | consumer.createMessageStreams(topicCountMap, keyDecoder, valueDecoder);
55 | KafkaStream<String, String> stream = consumerMap.get(TOPIC).get(0);
56 | ConsumerIterator<String, String> it = stream.iterator();
57 | while (it.hasNext())
58 | System.out.println(it.next().message());
59 | }
60 |
61 | public static void main(String[] args) {
62 | new KafkaConsumer().consume();
63 | }
64 | }
--------------------------------------------------------------------------------
/neo4j/src/test/java/KafkaTopicPartitionInfo.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Created by Administrator on 2017/6/21 0021.
3 | */
4 |
5 | public class KafkaTopicPartitionInfo {
6 | // topic name
7 | public final String topic;
8 | // partition id
9 | public final int partitionID;
10 |
11 | /**
12 | * Constructor
13 | *
14 | * @param topic       topic name
15 | * @param partitionID partition id
16 | */
17 | public KafkaTopicPartitionInfo(String topic, int partitionID) {
18 | this.topic = topic;
19 | this.partitionID = partitionID;
20 | }
21 |
22 | @Override
23 | public boolean equals(Object o) {
24 | if (this == o) return true;
25 | if (o == null || getClass() != o.getClass()) return false;
26 |
27 | KafkaTopicPartitionInfo that = (KafkaTopicPartitionInfo) o;
28 |
29 | if (partitionID != that.partitionID) return false;
30 | return topic != null ? topic.equals(that.topic) : that.topic == null;
31 |
32 | }
33 |
34 | @Override
35 | public int hashCode() {
36 | int result = topic != null ? topic.hashCode() : 0;
37 | result = 31 * result + partitionID;
38 | return result;
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/neo4j/src/test/java/OperatorKafka.java:
--------------------------------------------------------------------------------
1 | //import com.lakala.datacenter.constant.StreamingConstant;
2 | //import kafka.admin.AdminUtils;
3 | //import org.I0Itec.zkclient.ZkClient;
4 | //import org.I0Itec.zkclient.exception.ZkMarshallingError;
5 | //import org.I0Itec.zkclient.serialize.ZkSerializer;
6 | //
7 | //import java.io.UnsupportedEncodingException;
8 | //import java.util.Iterator;
9 | //import java.util.Map;
10 | //import java.util.Properties;
11 | //
12 | ///**
13 | // * Created by Administrator on 2017/8/2 0002.
14 | // */
15 | //public class OperatorKafka {
16 | // public static void main(String[] args) {
17 | // createTopic();
18 | // }
19 | //
20 | // public static void createTopic() {
21 | // ZkClient zkUtils = getZk();
22 | //// create the topic (here: 3 partitions, 1 replica)
23 | // AdminUtils.createTopic(zkUtils, "logCollect_cleanData", 3, 1, new Properties());
24 | // zkUtils.close();
25 | // }
26 | //
27 | // public static void deleteTopic() {
28 | // ZkClient zkUtils = getZk();
29 | //// delete the topic
30 | // AdminUtils.deleteTopic(zkUtils, "logCollect_cleanData");
31 | // zkUtils.close();
32 | // }
33 | //
34 | // public static void queryTopic() {
35 | // ZkClient zkUtils = getZk();
36 | // // fetch the topic's config properties
37 | // Properties props = AdminUtils.fetchTopicConfig(zkUtils, "logCollect_cleanData");
38 | //// iterate over the topic-level properties
39 | // Iterator it = props.entrySet().iterator();
40 | // while (it.hasNext()) {
41 | // Map.Entry entry = (Map.Entry) it.next();
42 | // Object key = entry.getKey();
43 | // Object value = entry.getValue();
44 | // System.out.println(key + " = " + value);
45 | // }
46 | // zkUtils.close();
47 | // }
48 | //
49 | //
50 | // public static void updateTopic() {
51 | // ZkClient zkUtils = getZk();
52 | // Properties props = AdminUtils.fetchTopicConfig(zkUtils, "logCollect_cleanData");
53 | //// add a topic-level property
54 | // props.put("min.cleanable.dirty.ratio", "0.3");
55 | //// remove a topic-level property
56 | // props.remove("max.message.bytes");
57 | //// update the topic's config
58 | // AdminUtils.changeTopicConfig(zkUtils, "logCollect_cleanData", props);
59 | // }
60 | //
61 | // public static ZkClient getZk() {
62 | // ZkClient zkUtils = new ZkClient("192.168.0.208:2181,192.168.0.211:2181,192.168.0.212:2181", 60000, 60000, new ZkSerializer() {
63 | // @Override
64 | // public byte[] serialize(Object data) throws ZkMarshallingError {
65 | // try {
66 | // return data.toString().getBytes(StreamingConstant.CODE());
67 | // } catch (UnsupportedEncodingException e) {
68 | // e.printStackTrace();
69 | // }
70 | // return new byte[0];
71 | // }
72 | //
73 | // @Override
74 | // public Object deserialize(byte[] bytes) throws ZkMarshallingError {
75 | // try {
76 | // return new String(bytes, StreamingConstant.CODE());
77 | // } catch (UnsupportedEncodingException e) {
78 | // e.printStackTrace();
79 | // }
80 | // return new byte[0];
81 | // }
82 | // });
83 | // return zkUtils;
84 | // }
85 | //}
86 |
--------------------------------------------------------------------------------
/neo4j/src/test/java/TestCypher.java:
--------------------------------------------------------------------------------
1 | import org.neo4j.driver.v1.*;
2 |
3 | import java.util.List;
4 |
5 | import static org.neo4j.driver.v1.Values.parameters;
6 |
7 | /**
8 | * Created by Administrator on 2017/8/2 0002.
9 | */
10 | public class TestCypher {
11 | Driver driver = GraphDatabase.driver("bolt://localhost", AuthTokens.basic("neo4j", "123456"));
12 |
13 | public int addEmployees(final String companyName) {
14 | try (Session session = driver.session()) {
15 | int employees = 0;
16 | List<Record> persons = session.readTransaction(new TransactionWork<List<Record>>() {
17 | @Override
18 | public List<Record> execute(Transaction tx) {
19 | return matchPersonNodes(tx);
20 | }
21 | });
22 | for (final Record person : persons) {
23 | employees += session.writeTransaction(new TransactionWork<Integer>() {
24 | @Override
25 | public Integer execute(Transaction tx) {
26 | tx.run("MATCH (emp:Person {name: $person_name}) " +
27 | "MERGE (com:Company {name: $company_name}) " +
28 | "MERGE (emp)-[:WORKS_FOR]->(com)",
29 | parameters("person_name", person.get("name").asString(), "company_name",
30 | companyName));
31 | return 1;
32 | }
33 | });
34 | }
35 | return employees;
36 | }
37 | }
38 |
39 | private static List<Record> matchPersonNodes(Transaction tx) {
40 | return tx.run("MATCH (a:Person) RETURN a.name AS name").list();
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/neo4j/src/test/scala/BroadcastAccumulatorStreaming.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Created by Administrator on 2017/8/10 0010.
3 | */
4 |
5 | import org.apache.spark.broadcast.Broadcast
6 | import org.apache.spark.streaming.{Duration, StreamingContext}
7 | import org.apache.spark.{Accumulator, SparkConf, SparkContext}
8 |
9 | /**
10 | * Created by lxh on 2016/6/30.
11 | */
12 | object BroadcastAccumulatorStreaming {
13 |
14 | /**
15 | * Declare a broadcast variable and an accumulator.
16 | */
17 | private var broadcastList: Broadcast[List[String]] = _
18 | private var accumulator: Accumulator[Int] = _
19 |
20 | def main(args: Array[String]) {
21 |
22 | val sparkConf = new SparkConf().setMaster("local[4]").setAppName("broadcasttest")
23 | val sc = new SparkContext(sparkConf)
24 |
25 | /**
26 | * The batch duration is in milliseconds.
27 | */
28 | val ssc = new StreamingContext(sc, Duration(2000))
29 | // broadcastList = ssc.sparkContext.broadcast(util.Arrays.asList("Hadoop","Spark"))
30 | broadcastList = ssc.sparkContext.broadcast(List("Hadoop", "Spark"))
31 | accumulator = ssc.sparkContext.accumulator(0, "broadcasttest")
32 |
33 | /**
34 | * Get the input data.
35 | */
36 | val lines = ssc.socketTextStream("localhost", 9999)
37 |
38 | /**
39 | * 1. flatMap splits each line into words.
40 | * 2. map turns each word into a tuple (word, 1).
41 | * 3. reduceByKey sums the values.
42 | * (4. sortByKey for ranking)
43 | * 4. Filter on whether the word is in the broadcast list.
44 | * 5. Print the result.
45 | */
46 | val words = lines.flatMap(line => line.split(" "))
47 |
48 | val wordpair = words.map(word => (word, 1))
49 |
50 | wordpair.filter(record => {
51 | broadcastList.value.contains(record._1)
52 | })
53 |
54 |
55 | val pair = wordpair.reduceByKey(_ + _)
56 |
57 | /**
58 | * This pair is a PairDStream.
59 | * Check whether the key is in the blacklist; if it is, increment the accumulator.
60 | */
61 | /* pair.foreachRDD(rdd => {
62 | rdd.filter(record => {
63 |
64 | if (broadcastList.value.contains(record._1)) {
65 | accumulator.add(1)
66 | return true
67 | } else {
68 | return false
69 | }
70 |
71 | })
72 |
73 | })*/
74 |
75 | val filtedpair = pair.filter(record => {
76 | if (broadcastList.value.contains(record._1)) {
77 | accumulator.add(record._2)
78 | true
79 | } else {
80 | false
81 | }
82 |
83 | }).print
84 |
85 | println("累加器的值" + accumulator.value)
86 |
87 | // pair.filter(record => {broadcastList.value.contains(record._1)})
88 |
89 | val keypair = pair.map(pair => (pair._2,pair._1))
90 |
91 | /**
92 | * If DStream itself lacks an operator, use transform to work on the underlying RDD.
93 | */
94 | keypair.transform(rdd => {
95 | rdd.sortByKey(false)//TODO
96 | })
97 | pair.print()
98 | ssc.start()
99 | ssc.awaitTermination()
100 |
101 | }
102 |
103 | }
104 |
--------------------------------------------------------------------------------
/neo4j/src/test/scala/ClientRedisTest.scala:
--------------------------------------------------------------------------------
1 | import com.lakala.datacenter.utils.RedisUtils
2 | import redis.clients.jedis.JedisPubSub
3 |
4 | /**
5 | * Created by Administrator on 2017/6/29 0029.
6 | */
7 | object ClientRedisTest {
8 | def main(args: Array[String]): Unit = {
9 | val jedis = RedisUtils.jedisCluster()
10 | println(jedis.subscribe(new ApplyPubSubListener(),args(0)))
11 | }
12 |
13 | class ApplyPubSubListener extends JedisPubSub {
14 |
15 | override def onMessage(channel: String, message: String): Unit = {
16 | System.out.println(channel + " onMessage=" + message)
17 | super.onMessage(channel, message)
18 | }
19 | // handler for when a subscription is initialized
20 | override def onSubscribe(channel: String, subscribedChannels: Int) {
21 | System.out.println(channel + " onSubscribe=" + subscribedChannels);
22 | }
23 |
24 | // handler for when a subscription is cancelled
25 | override def onUnsubscribe(channel: String, subscribedChannels: Int) {
26 | System.out.println(channel + "onUnsubscribe=" + subscribedChannels);
27 | }
28 |
29 | // handler for when a pattern subscription is initialized
30 | override def onPSubscribe(pattern: String, subscribedChannels: Int) {
31 | System.out.println(pattern + "onPSubscribe=" + subscribedChannels);
32 | }
33 |
34 | // handler for when a pattern subscription is cancelled
35 | override def onPUnsubscribe(pattern: String, subscribedChannels: Int) {
36 | System.out.println(pattern + "onPUnsubscribe=" + subscribedChannels);
37 | }
38 |
39 | // handler for messages received via a pattern subscription
40 | override def onPMessage(pattern: String, channel: String, message:String ) {
41 | System.out.println(pattern + "onPMessage=" + channel + "=" + message);
42 | }
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/neo4j/src/test/scala/CollectionUtil.scala:
--------------------------------------------------------------------------------
1 | import scala.collection.mutable.ArrayBuffer
2 | import scala.reflect.ClassTag
3 |
4 | /**
5 | * Created by Administrator on 2017/8/15 0015.
6 | */
7 | object CollectionUtil {
8 | /**
9 | * Adds reduceByKey-style operations to any collection of type Traversable[(K, V)].
10 | *
11 | * @param collection
12 | * @param kt
13 | * @param vt
14 | * @tparam K
15 | * @tparam V
16 | */
17 | implicit class CollectionHelper[K, V](collection: Traversable[(K, V)])(implicit kt: ClassTag[K], vt: ClassTag[V]) {
18 | def reduceByKey(f: (V, V) => V): Traversable[(K, V)] = collection.groupBy(_._1).map { case (_: K, values: Traversable[(K, V)]) => values.reduce((a, b) => (a._1, f(a._2, b._2))) }
19 |
20 | /**
21 | * reduceByKey的同时,返回被reduce掉的元素的集合
22 | *
23 | * @param f
24 | * @return
25 | */
26 | def reduceByKeyWithReduced(f: (V, V) => V)(implicit kt: ClassTag[K], vt: ClassTag[V]): (Traversable[(K, V)], Traversable[(K, V)]) = {
27 | val reduced: ArrayBuffer[(K, V)] = ArrayBuffer()
28 | val newSeq = collection.groupBy(_._1).map {
29 | case (_: K, values: Traversable[(K, V)]) => values.reduce((a, b) => {
30 | val newValue: V = f(a._2, b._2)
31 | val reducedValue: V = if (newValue == a._2) b._2 else a._2
32 | val reducedPair: (K, V) = (a._1, reducedValue)
33 | reduced += reducedPair
34 | (a._1, newValue)
35 | })
36 | }
37 | (newSeq, reduced.toTraversable)
38 | }
39 | }
40 | }
41 |
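A usage sketch of the implicit helper on an in-memory collection (the output shapes shown in the comments are indicative):

    object CollectionUtilExample {
      def main(args: Array[String]): Unit = {
        import CollectionUtil._

        val pairs = Seq("a" -> 1, "b" -> 2, "a" -> 3)
        println(pairs.reduceByKey(_ + _)) // e.g. Map(a -> 4, b -> 2)

        val (merged, reducedAway) = pairs.reduceByKeyWithReduced(_ + _)
        println(merged)      // one merged pair per key
        println(reducedAway) // the pairs folded away, e.g. ArrayBuffer((a,1))
      }
    }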
--------------------------------------------------------------------------------
/neo4j/src/test/scala/ConsumerGroupExample.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Created by Administrator on 2017/8/7 0007.
3 | */
4 |
5 |
6 | import com.lakala.datacenter.main.TrialConsumerKafka
7 |
8 | object ConsumerGroupExample {
9 | def main(args: Array[String]): Unit = {
10 | TrialConsumerKafka.main(Array("192.168.0.208:2181,192.168.0.211:2181,192.168.0.212:2181", "test-consumer-group",
11 | "logCollect_cleanData", "3"))
12 | }
13 | }
14 |
15 |
--------------------------------------------------------------------------------
/neo4j/src/test/scala/Main.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Created by Administrator on 2017/8/1 0001.
3 | */
4 | import org.neo4j.driver.v1.GraphDatabase
5 | import org.neo4j.driver.v1.AuthTokens
6 | import com.lakala.datacenter.cypher.NeoData._
7 |
8 | object Main {
9 |
10 | def main(args: Array[String]): Unit = {
11 |
12 | val driver = GraphDatabase.driver("bolt://localhost", AuthTokens.basic("neo4j", "123456"))
13 |
14 | val session = driver.session();
15 |
16 | val nodes = allNodes(session)
17 |
18 | println(nodes.mkString("\n"))
19 | }
20 | }
--------------------------------------------------------------------------------
/neo4j/src/test/scala/TestApiNeo4j.scala:
--------------------------------------------------------------------------------
1 | //import java.util
2 | //import java.util.Map
3 | //
4 | //import com.lakala.datacenter.constant.StreamingConstant
5 | //import com.lakala.datacenter.utils.UtilsTools.properties
6 | //import org.apache.commons.lang3.StringUtils.trim
7 | //import org.neo4j.rest.graphdb.RestAPIFacade
8 | //import org.neo4j.rest.graphdb.batch.CypherResult
9 | //import org.neo4j.rest.graphdb.query.RestCypherQueryEngine
10 | //import org.neo4j.rest.graphdb.util.QueryResult
11 | //
12 | ///**
13 | // * Created by Administrator on 2017/7/12 0012.
14 | // */
15 | //object TestApiNeo4j {
16 | // def main(args: Array[String]): Unit = {
17 | //// val properies = properties(StreamingConstant.CONFIG)
18 | //// val restAPI = new RestAPIFacade(trim(properies.getProperty(StreamingConstant.NEOIP)), trim(properies.getProperty(StreamingConstant.USER)), trim(properies.getProperty(StreamingConstant.PASSWORD)))
19 | //
20 | // import scala.collection.JavaConversions._
21 | // //
22 | // // //
23 | // // val orderno ="AX20160722090751068917"
24 | // // val centro = "500227198611307710"
25 | // // val applyNodeIndexs = restAPI.getNodesByLabelAndProperty("" + Labels.ApplyInfo, StreamingConstant.ORDERNO, orderno)
26 | // //
27 | // // val apply = applyNodeIndexs.toList
28 | // // if (apply.size == 0) {
29 | // // val applyNode = restAPI.createNode(MapUtil.map(StreamingConstant.ORDERNO, orderno.toUpperCase,StreamingConstant.MODELNAME, Labels.ApplyInfo))
30 | // // applyNode.addLabel(Labels.ApplyInfo)
31 | // // applyNode.setProperty(StreamingConstant.ORDERNO,orderno)
32 | // //
33 | // // val contentIndexs = restAPI.getNodesByLabelAndProperty("Identification", StreamingConstant.CONTENT, centro)
34 | // // val list = contentIndexs.toList
35 | // // println(list.size)
36 | // // var otherNode: RestNode = if (list.size == 0) {
37 | // // val otherNode2 = restAPI.createNode(MapUtil.map(StreamingConstant.MODELNAME, "Identification", StreamingConstant.CONTENT, centro))
38 | // // otherNode2.setProperty(StreamingConstant.CONTENT, centro)
39 | // // otherNode2.addLabel(Labels.Identification)
40 | // // otherNode2
41 | // // } else {
42 | // // applyNode.setProperty("cert_no", centro)
43 | // // list.get(0)
44 | // // }
45 | // //
46 | // // applyNode.createRelationshipTo(otherNode, RelationshipTypes.identification)
47 | // // println(otherNode.getId)
48 | // // println(applyNode.getId)
49 | // // }
50 | // val restAPI = new RestAPIFacade(trim("http://192.168.0.33:7474/db/data"), trim("neo4j"), trim("123456"))
51 | // val result = restAPI.query("MATCH (:Person {name:'Keanu'})-[:ACTED_IN]->(:Movie {title:'Matrix'}) RETURN count(*) as c" ,null)
52 | // val it = result.getData
53 | // it.flatten.toList.get(0)
54 | // println(it.flatten.toList.get(0))
55 | // }
56 | //}
57 |
--------------------------------------------------------------------------------
/neo4j/src/test/scala/TestCypher.scala:
--------------------------------------------------------------------------------
1 | import com.lakala.datacenter.common.utils.DateTimeUtils
2 | import com.lakala.datacenter.constant.StreamingConstant
3 | import org.apache.commons.lang3.StringUtils
4 | import org.joda.time.DateTime
5 | import org.neo4j.driver.v1._
6 |
7 | /**
8 | * Created by Administrator on 2017/8/2 0002.
9 | */
10 | object TestCypher2 {
11 | val driver: Driver = GraphDatabase.driver("bolt://localhost:7687", AuthTokens.basic("neo4j", "123456"))
12 |
13 | def main(args: Array[String]): Unit = {
14 | var map: java.util.HashMap[String, String] = new java.util.HashMap[String, String]()
15 | var paramMap: java.util.HashMap[String, String] = new java.util.HashMap[String, String]()
16 | map.put("orderno", "TNA20170623102711010234032084429")
17 | map.put("_DeviceId", "A000005966DFEA")
18 | map.put("mobile", "18961922790")
19 |
20 | runCypherApply(driver.session(), map)
21 | driver.close()
22 | }
23 |
24 | private def runCypherApply(session: Session, map: java.util.HashMap[String, String]): Unit = {
25 | val applyStatementTemplate = new StringBuffer("MERGE (apply:ApplyInfo {orderno:$orderno})")
26 | applyStatementTemplate.append(" ON MATCH SET apply.modelname='ApplyInfo',apply.insertTime=$insertTime,apply.user_id=$user_id")
27 | val otherStatementTemplate = new StringBuffer()
28 | val relStatementTemplate = new StringBuffer()
29 |
30 | var paramMap: java.util.HashMap[String, Object] = new java.util.HashMap[String, Object]()
31 | paramMap.put("orderno", map.getOrDefault(StreamingConstant.ORDERNO, ""))
32 | paramMap.put(StreamingConstant.INSERTTIME, DateTimeUtils.formatter.print(DateTime.now()))
33 | paramMap.put(StreamingConstant.USER_ID, map.getOrDefault(StreamingConstant.USERID, ""))
34 |
35 | for (key <- StreamingConstant.fieldMap.keySet) {
36 | val fieldRelation = StreamingConstant.fieldMap.get(key).get.split(",")
37 | if (StringUtils.isNoneEmpty(map.get(key))) {
38 | val modelname = "" + StreamingConstant.labelMap.get(key).get
39 | val rel = "" + StreamingConstant.relationShipMap.get(key).get
40 | otherStatementTemplate.append(" MERGE (" + key + ":" + modelname + "{modelname:'" + modelname + "',content:$" + fieldRelation(0) + "})")
41 | otherStatementTemplate.append(" MERGE (apply)-[:" + rel + "]->(" + key + ")")
42 | applyStatementTemplate.append(",apply." + fieldRelation(0) + "=$" + fieldRelation(0))
43 | paramMap.put(fieldRelation(0), map.get(key))
44 | }
45 | }
46 |
47 | val statementStr = applyStatementTemplate.append(otherStatementTemplate).toString
48 | println(statementStr)
49 | session.writeTransaction(new TransactionWork[Integer]() {
50 | override def execute(tx: Transaction): Integer = {
51 | tx.run(statementStr, paramMap)
52 | 1
53 | }
54 | })
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/neo4j/src/test/scala/TestKafka.scala:
--------------------------------------------------------------------------------
1 | import com.lakala.datacenter.constant.StreamingConstant
2 | import kafka.utils.ZkUtils
3 | import org.I0Itec.zkclient.ZkClient
4 | import org.I0Itec.zkclient.exception.ZkMarshallingError
5 | import org.I0Itec.zkclient.serialize.ZkSerializer
6 | /**
7 | * Created by Administrator on 2017/6/12 0012.
8 | */
9 | object TestKafka {
10 | def main(args: Array[String]): Unit = {
11 | val topic = "logCollect_cleanData"
12 | val zkConnect = "192.168.0.211:2181,192.168.0.212:2181"
13 | var zkClient: ZkClient = null
14 | try {
15 | zkClient = new ZkClient(zkConnect, 30000, 30000, new ZkSerializer {
16 | override def serialize(data: Object): Array[Byte] = {
17 | try {
18 | return data.toString().getBytes(StreamingConstant.CODE)
19 | } catch {
20 | case e: ZkMarshallingError => return null
21 |
22 | }
23 | }
24 |
25 | override def deserialize(bytes: Array[Byte]): Object = {
26 | try {
27 | return new String(bytes, StreamingConstant.CODE)
28 | } catch {
29 | case e: ZkMarshallingError => return null
30 | }
31 | }
32 | })
33 | zkClient.deleteRecursive(ZkUtils.getTopicPath(topic)) // in the end the topic is deleted by removing its corresponding path in ZooKeeper
34 | println("deletion succeeded!")
35 | }
36 | catch {
37 | case e: Throwable =>
38 | println("delection failed because of " + e.getMessage)
39 | // println(Utils.stackTrace(e))
40 | }
41 | finally {
42 | if (zkClient != null)
43 | zkClient.close()
44 | }
45 |
46 |
47 | // import org.I0Itec.zkclient.ZkClient
48 | // val arrys = new Array[String](6)
49 | // arrys(0) = "--replication-factor"
50 | // arrys(1) = "1"
51 | // arrys(2) = "--partitions"
52 | // arrys(3) = "3"
53 | // arrys(4) = "--topic"
54 | // arrys(5) = "logCollect_cleanData"
55 | // val client = new ZkClient("192.168.0.211:2181,192.168.0.212:2181", 30000, 30000, ZKStringSerializer)
56 | // client.setZkSerializer(ZKStringSerializer) //一定要加上ZkSerializer
57 | //
58 | //
59 | // val opts = new TopicCommand.TopicCommandOptions(arrys)
60 | // TopicCommand.createTopic(client, opts)
61 |
62 | // import kafka.admin.AdminUtils
63 | // val client = new ZkClient("192.168.0.211:2181,192.168.0.212:2181", 30000, 30000)
64 | // create a single-partition, single-replica topic
65 | // val props: Properties = new Properties
66 | // the Kafka broker list (host:port)
67 | // props.put("metadata.broker.list", "192.168.0.211:9092,192.168.0.212:9092")
68 | // serializer class for values
69 | // props.put("serializer.class", "kafka.serializer.StringEncoder")
70 | // serializer class for keys
71 | // props.put("key.serializer.class", "kafka.serializer.StringEncoder")
72 | //request.required.acks
73 | // props.put("request.required.acks", "-1")
74 | // AdminUtils.createTopic(client, "logCollect_cleanData", 3, 1, props)
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/neo4j/src/test/scala/TestRedis.scala:
--------------------------------------------------------------------------------
1 | import java.util
2 |
3 | import com.alibaba.fastjson.{JSON, JSONObject}
4 | import com.lakala.datacenter.common.utils.DateTimeUtils
5 | import com.lakala.datacenter.constant.StreamingConstant
6 | import com.lakala.datacenter.utils.RedisUtils
7 | import org.joda.time.DateTime
8 | import redis.clients.jedis.JedisPubSub
9 |
10 |
11 |
12 | /**
13 | * Created by Administrator on 2017/6/29 0029.
14 | */
15 | object TestRedis {
16 | def main(args: Array[String]): Unit = {
17 |
18 | //
19 | val jedis = RedisUtils.jedisCluster()
20 | try {
21 | val orderno = args(0)
22 | val insertTime=Map(StreamingConstant.INSERTTIME->"2017-06-30 12:01:10").getOrElse(StreamingConstant.INSERTTIME, DateTimeUtils.formatter.print(DateTime.now()))
23 | val s= "{\""+StreamingConstant.ORDERNO+"\":\""+orderno+"\",\""+StreamingConstant.INSERT_TIME+"\":\""+insertTime+"\"}"
24 | jedis.publish("testsub12", s)
25 | println(s)
26 | println(JSON.parseObject(s).getString(StreamingConstant.INSERT_TIME))
27 | } catch {
28 | case e: Exception => println("AAAAAAAAA"+e.getMessage)
29 | }
30 |
31 | }
32 |
33 |
34 | }
35 |
--------------------------------------------------------------------------------
/neo4j/src/test/scala/org/neo4j/spark/ExplortApplyDataTest.scala:
--------------------------------------------------------------------------------
1 | package org.neo4j.spark
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | import scala.collection.mutable.ArrayBuffer
6 |
7 | /**
8 | * Created by Administrator on 2017/5/9 0009.
9 | *
10 | */
11 | object ExplortApplyDataTest {
12 | def main(args: Array[String]): Unit = {
13 | // ExplortApplyData2.main(Array("192.168.0.33","file:///F:/output/out","BankCard,Device,Mobile,Email"))
14 | val conf = new SparkConf().setMaster("local[1]").setAppName("test")
15 | val sc = new SparkContext(conf)
16 | printSql(sc)
17 | println(System.getProperty("java.io.tmpdir"))
18 | }
19 | def printSql(sc:SparkContext)={
20 | val map = Map("applymymobile" -> "Mobile","loanapply" -> "Mobile","emergencymobile" -> "Mobile", "device" -> "Device", "bankcard" -> "BankCard", "identification" -> "Identification", "email" -> "Email")
21 | val modelRdd = sc.parallelize(List("BankCard", "Device", "Mobile", "Email"))
22 |
23 | val broadcastVar2 = sc.broadcast(map)
24 | modelRdd.foreachPartition { models =>
25 | models.foreach { model =>
26 | runQueryApplyByApplyLevel1(broadcastVar2.value, model)
27 | }
28 | }
29 | }
30 | def runQueryApplyByApplyLevel1(map: Map[String, String],modelname: String):Unit = {
31 |
32 | val list = new ArrayBuffer[String]()
33 | for (k <- map.keySet) {
34 | for (k2 <- map.keySet) {
35 | if (k2.equals("applymymobile") || k2.equals("loanapply") || k2.equals("emergencymobile")) {
36 | list += s"match (n:$modelname {type:'1'})-[r1:${k}] -(p:ApplyInfo)-[r2:${k2}]-(m:${map.get(k2).get})-[r3:applymymobile]-(q:ApplyInfo) return n.content,p.orderno,m.content,q.orderno@@$k==$k2==applymymobile"
37 | list += s"match (n:$modelname {type:'1'})-[r1:${k}] -(p:ApplyInfo)-[r2:${k2}]-(m:${map.get(k2).get})-[r3:loanapply]-(q:ApplyInfo) return n.content,p.orderno,m.content,q.orderno@@$k==$k2==loanapply"
38 | list += s"match (n:$modelname {type:'1'})-[r1:${k}] -(p:ApplyInfo)-[r2:${k2}]-(m:${map.get(k2).get})-[r3:emergencymobile]-(q:ApplyInfo) return n.content,p.orderno,m.content,q.orderno@@$k==$k2=emergencymobile"
39 | } else {
40 | list += s"match (n:$modelname {type:'1'})-[r1:${k}] -(p:ApplyInfo)-[r2:${k2}]-(m:${map.get(k2).get})-[r3:${k2}]-(q:ApplyInfo) return n.content,p.orderno,m.content,q.orderno@@$k==$k2==$k2"
41 | }
42 | }
43 | }
44 | list.map { sql =>
45 | val arr = sql.split("@@")
46 | println(arr(0))
47 | }
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/neo4j/src/test/scala/org/neo4j/spark/MainTest.scala:
--------------------------------------------------------------------------------
1 | package org.neo4j.spark
2 |
3 | import com.lakala.datacenter.main.Main
4 |
5 | /**
6 | * Created by Administrator on 2017/6/2 0002.
7 | */
8 | object MainTest {
9 | def main(args: Array[String]): Unit = {
10 | //-i F:\tmp\applydir
11 | Main.main(args)
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/neo4j/src/test/scala/org/neo4j/spark/Neo4jContstanTest.scala:
--------------------------------------------------------------------------------
1 | package org.neo4j.spark
2 |
3 | /**
4 | * Created by Administrator on 2017/7/14 0014.
5 | */
6 | object Neo4jContstanTest {
7 | val SERVER_BOLTURI ="bolt://192.168.0.33:7687"
8 | val RESTNEO4JURL ="http://192.168.0.33:7474/db/data"
9 | }
10 |
--------------------------------------------------------------------------------
/neo4j/src/test/scala/org/neo4j/spark/Neo4jDataFrameScalaTest.scala:
--------------------------------------------------------------------------------
1 | //package org.neo4j.spark
2 | //
3 | //import java.io.File
4 | //
5 | //import com.lakala.datacenter.load.spark.{Neo4jDataFrame, Neo4jGraph}
6 | //import org.apache.commons.lang3.StringUtils.trim
7 | //import org.apache.spark.api.java.JavaSparkContext
8 | //import org.apache.spark.graphx.{Edge, Graph}
9 | //import org.apache.spark.rdd.RDD
10 | //import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
11 | //import org.apache.spark.sql.{Row, SQLContext}
12 | //import org.apache.spark.{SparkConf, SparkContext}
13 | //import org.junit.Assert._
14 | //import org.junit._
15 | //import org.neo4j.harness.{ServerControls, TestServerBuilders}
16 | //import org.neo4j.rest.graphdb.RestAPIFacade
17 | //import org.neo4j.rest.graphdb.batch.CypherResult
18 | //
19 | //
20 | ///**
21 | // * @author lys
22 | // * @since 17.07.16
23 | // */
24 | //class Neo4jDataFrameScalaTest {
25 | // val FIXTURE: String = "CREATE (:A)-[:REL {foo:'bar'}]->(:B)"
26 | // private var conf: SparkConf = null
27 | // private var sc: JavaSparkContext = null
28 | // private var server: ServerControls = null
29 | // private val path:String ="F:\\tmp\\neo4j\\tmp02"
30 | // private var restAPI:RestAPIFacade = null
31 | // @Before
32 | // @throws[Exception]
33 | // def setUp {
34 | //// server = TestServerBuilders.newInProcessBuilder(new File(path)).withConfig("dbms.security.auth_enabled", "false").withFixture(FIXTURE).newServer
35 | // restAPI = new RestAPIFacade(trim(Neo4jContstanTest.RESTNEO4JURL), trim("neo4j"), trim("123456"))
36 | //
37 | // conf = new SparkConf().setAppName("neoTest").setMaster("local[*]").set("spark.driver.allowMultipleContexts", "true").set("spark.neo4j.bolt.url", Neo4jContstanTest.SERVER_BOLTURI)
38 | // sc = SparkContext.getOrCreate(conf)
39 | // }
40 | //
41 | // @After def tearDown {
42 | //// server.close
43 | // sc.close
44 | // }
45 | //
46 | // @Test def mergeEdgeList {
47 | // val rows = sc.makeRDD(Seq(Row("Keanu", "Matrix")))
48 | // val schema = StructType(Seq(StructField("name", DataTypes.StringType), StructField("title", DataTypes.StringType)))
49 | // val sqlContext = new SQLContext(sc)
50 | // val df = sqlContext.createDataFrame(rows, schema)
51 | // Neo4jDataFrame.mergeEdgeList(sc, df, ("Person", Seq("name")), ("ACTED_IN", Seq.empty), ("Movie", Seq("title")))
52 | // val edges: RDD[Edge[Long]] = sc.makeRDD(Seq(Edge(0, 1, 42L)))
53 | // val graph = Graph.fromEdges(edges, -1)
54 | // assertEquals(2, graph.vertices.count)
55 | // assertEquals(1, graph.edges.count)
56 | // Neo4jGraph.saveGraph(sc, graph, null, "test")
57 | //
58 | //// val it: ResourceIterator[Long] = server.graph().execute("MATCH (:Person {name:'Keanu'})-[:ACTED_IN]->(:Movie {title:'Matrix'}) RETURN count(*) as c").columnAs("c")
59 | // val result: CypherResult = restAPI.query("MATCH (:Person {name:'Keanu'})-[:ACTED_IN]->(:Movie {title:'Matrix'}) RETURN count(*) as c" ,null)
60 | // import scala.collection.JavaConversions._
61 | // assertEquals(1L, result.getData.flatten.toList.get(0).toString.toLong)
62 | // restAPI.close()
63 | // }
64 | //}
65 | //
66 |
--------------------------------------------------------------------------------
/neo4j/src/test/scala/org/neo4j/spark/Neo4jGraphScalaTest.scala:
--------------------------------------------------------------------------------
1 | package org.neo4j.spark
2 |
3 | import com.lakala.datacenter.load.spark.{Executor, Neo4jGraph}
4 | import org.apache.spark.api.java.JavaSparkContext
5 | import org.apache.spark.graphx.{Edge, Graph}
6 | import org.apache.spark.rdd.RDD
7 | import org.apache.spark.{SparkConf, SparkContext}
8 | import org.junit.Assert._
9 | import org.junit._
10 |
11 | import scala.collection.JavaConverters._
12 |
13 |
14 | /**
15 | * @author lys
16 | * @since 17.07.16
17 | */
18 | class Neo4jGraphScalaTest {
19 | val FIXTURE: String = "CREATE (:A)-[:REL {foo:'bar'}]->(:B)"
20 | private var conf: SparkConf = null
21 | private var sc: JavaSparkContext = null
22 | // private var server: ServerControls = null
23 |
24 | @Before
25 | @throws[Exception]
26 | def setUp {
27 | // server = TestServerBuilders.newInProcessBuilder.withConfig("dbms.security.auth_enabled", "false").withFixture(FIXTURE).newServer
28 | conf = new SparkConf().setAppName("neoTest").setMaster("local[*]").set("spark.driver.allowMultipleContexts", "true")
29 | .set("spark.neo4j.bolt.url", Neo4jContstanTest.SERVER_BOLTURI)
30 | sc = SparkContext.getOrCreate(conf)
31 | }
32 |
33 | @After def tearDown {
34 | // server.close()
35 | sc.close
36 | }
37 |
38 | @Test def runCypherQueryWithParams {
39 | val data = List(Map("id" -> 3, "name" -> "Test3").asJava, Map("id" -> 2, "name" -> "Test2").asJava).asJava
40 | Executor.execute(sc.sc, "UNWIND {data} as row MERGE (n:Test {id:row.id}) SET n.name = row.name", Map(("data", data)))
41 | }
42 |
43 | @Test def runMatrixQuery {
44 | val graph = Neo4jGraph.loadGraph(sc.sc, "A", Seq.empty, "B")
45 | assertEquals(2, graph.vertices.count)
46 | assertEquals(1, graph.edges.count)
47 | }
48 |
49 | @Test def saveGraph {
50 | val edges: RDD[Edge[Long]] = sc.makeRDD(Seq(Edge(0, 1, 42L)))
51 | val graph = Graph.fromEdges(edges, -1)
52 | assertEquals(2, graph.vertices.count)
53 | assertEquals(1, graph.edges.count)
54 | Neo4jGraph.saveGraph(sc, graph, null, "test")
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/neo4j/src/test/scala/org/neo4j/spark/Neo4jRestSparkTest.scala:
--------------------------------------------------------------------------------
1 | package org.neo4j.spark
2 |
3 | import com.lakala.datacenter.load.spark.Neo4j
4 | import org.apache.spark.{SparkConf, SparkContext}
5 |
6 | /**
7 | * Created by Administrator on 2017/5/11 0011.
8 | */
9 | object Neo4jRestSparkTest {
10 | def main(args: Array[String]): Unit = {
11 | val conf = new SparkConf().setAppName("neoTest").setMaster("local[2]")
12 | /*.set("spark.neo4j.bolt.url","jdbc:neo4j:bolt:192.168.0.33:7687")*//*.set("spark.driver.allowMultipleContexts", "true").set("spark.neo4j.bolt.url", server.boltURI.toString)*/
13 | val sc = new SparkContext(conf)
14 | runCypherRelQueryWithPartition(sc)
15 | }
16 |
17 | def runCypherRelQueryWithPartition(sc: SparkContext) {
18 | val neo4j: Neo4j = Neo4j(sc).cypher("match (n:Mobile {type:'1'})-[r1:loanapply] -(p:ApplyInfo)-[r2:loanapply]-(m:Mobile)-[r3:loanapply]-(q:ApplyInfo) return n.content as content1 ,type(r1) as value1,p.orderno as orderno1,type(r2) as value2,m.content as content2,type(r3) as value3,q.orderno as orderno2 ").partitions(7).batch(200)
19 | val knows: Long = neo4j.loadRowRdd.count()
20 | println(knows)
21 | }
22 | }
23 |
--------------------------------------------------------------------------------