├── .gitignore
├── README.md
├── dataImport
│   ├── pom.xml
│   └── src
│       └── main
│           ├── resources
│           │   ├── clean.groovy
│           │   ├── clean.sh
│           │   ├── conf
│           │   │   ├── gremlin-server
│           │   │   │   └── janusgraph-hbase-es-server-new.properties
│           │   │   └── hadoop-graph
│           │   │       ├── hadoop-vertex-script-yarn.properties
│           │   │       └── hadoop-vertex-script.properties
│           │   ├── convert.sh
│           │   ├── data
│           │   │   └── lolth-schema.groovy
│           │   ├── data_export.txt
│           │   ├── gremlin_run.groovy
│           │   ├── gremlin_run.sh
│           │   ├── hive
│           │   │   └── migrate_csv_to_hive_table.sql
│           │   ├── scripts
│           │   │   └── script_mobile.groovy
│           │   └── test.data
│           │       ├── call_edge.csv
│           │       ├── device.csv
│           │       ├── has.csv
│           │       └── mobile.csv
│           └── scala
│               └── com
│                   └── qihoo
│                       └── finance
│                           └── tap
│                               ├── Helper.java
│                               ├── ImportCommon.scala
│                               ├── JanusGraphProvider.java
│                               ├── ScalaHelper.scala
│                               ├── data
│                               │   └── convert
│                               │       ├── CallEdgeConvertToCsv.scala
│                               │       ├── DeviceConvertToCsv.scala
│                               │       ├── MergeNodesAndEdges.scala
│                               │       ├── MobileConvertToCsv.scala
│                               │       └── OtherEdgeConvertToCsv.scala
│                               ├── direct
│                               │   ├── EdgeImport.scala
│                               │   └── VertexImport.scala
│                               └── increment
│                                   ├── EdgeImportIncrement.scala
│                                   └── VertexImportIncrement.scala
├── janusgraph_yarn.md
├── optimize.md
└── pom.xml

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# maven ignore
target/
*.jar
*.war
*.zip
*.tar

# eclipse ignore
.settings/
.project
.classpath

# idea ignore
.idea/*
py_tag_tool/.idea/*
*.ipr
*.iml
*.iws

# temp ignore
logs/
*.doc
*.log
*.cache
*.diff
*.patch
*.tmp
*.versionsBackup

# system ignore
.DS_Store
Thumbs.db

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Janusgraph-data-importer

## Overview
This project contains the code I wrote for a concrete data-migration project; adapt it to your own environment before using it.
Contact: zhoupengblack@qq.com

## Project layout

### resources
* conf holds the configuration files used for the import, with comments on the relevant settings
* data holds the schema used to create the graph
* test.data shows the vertex and edge formats exported from our AgensGraph database; all of the later code parses and operates on this format
* hive contains the Hive script that assigns a unique id to the converted data
* scripts contains the parse script JanusGraph uses to interpret each line during bulk import
* the sh and groovy scripts under resources are small helpers that make the import easier to run

### Code
* data.convert converts the exported data into Hive tables, after which the Hive script above assigns the unique ids.
  The core is MergeNodesAndEdges, which uses Spark's cogroup to produce the import format JanusGraph accepts.
  This step takes a long time and uses a lot of memory; set spark.network.timeout=600 or higher.

* direct connects to JanusGraph Server and inserts the data directly. Usable when the data volume is small.

* increment imports incremental data. Once the historical data has been loaded, incremental loads must check whether each vertex and edge already exists.

## Common problems
java.lang.OutOfMemoryError: unable to create new native thread
* the machine's ulimit -u is too low; 102400 is a reasonable value

Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded
* Spark is running out of memory. Either increase spark.executor.memory or decrease spark.executor.cores,
  keeping spark.executor.memory / spark.executor.cores at roughly 6-7 GB.

KryoSerializer Failed to find one of the right cookies
* the Spark KryoSerializer settings are wrong; see the hadoop-vertex-script.properties configuration file

* Make sure the data behind the unique index really is unique and that ids are unique, otherwise the import will run into problems.

## Notes
* `provided` in the pom means the dependency is not bundled into the jar,
  because the cluster already ships these jars. When debugging locally, comment that line out,
  and also uncomment // conf.setMaster("local") in the code.
```
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive_${scala.version}</artifactId>
    <version>${spark.version}</version>
    <scope>provided</scope>
</dependency>
```

--------------------------------------------------------------------------------
/dataImport/pom.xml:
-------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | com.qihoo.finance 7 | janusgraph 8 | 1.0.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | dataImport 13 | 14 | 15 | 16 | org.janusgraph 17 | janusgraph-core 18 | 0.3.1 19 | 20 | 21 | org.janusgraph 22 | janusgraph-hbase 23 | 0.3.1 24 | 25 | 26 | org.apache.hbase 27 | hbase-shaded-client 28 | 1.2.6 29 | 30 | 31 | org.apache.hbase 32 | hbase-shaded-server 33 | 1.2.6 34 | 35 | 36 | 37 | org.janusgraph 38 | janusgraph-es 39 | 0.3.1 40 | 41 | 42 | org.apache.tinkerpop 43 | gremlin-driver 44 | 3.3.3 45 | 46 | 47 | com.google.guava 48 | guava 49 | 16.0 50 | 51 | 52 | 53 | com.alibaba 54 | fastjson 55 | 1.2.58 56 | 57 | 58 | commons-codec 59 | commons-codec 60 | ${commons.codec.version} 61 | 62 | 63 | 64 | org.apache.spark 65 | spark-core_${scala.version} 66 | ${spark.version} 67 | provided 68 | 69 | 70 | org.apache.spark 71 | spark-streaming_${scala.version} 72 | ${spark.version} 73 | provided 74 | 75 | 76 | org.apache.spark 77 | spark-sql_${scala.version} 78 | ${spark.version} 79 | provided 80 | 81 | 82 | org.apache.spark 83 | spark-hive_${scala.version} 84 | ${spark.version} 85 | provided 86 | 87 | 88 | 89 | 90 | org.apache.hadoop 91 | hadoop-client 92 | ${hadoop.version} 93 | provided 94 | 95 | 96 | 97 | org.apache.hadoop 98 | hadoop-hdfs 99 | ${hadoop.version} 100 | provided 101 | 102 | 103 | org.apache.hadoop 104 | hadoop-common 105 | ${hadoop.version} 106 | provided 107 | 108 | 109 | org.apache.hadoop 110 | hadoop-streaming 111 | ${hadoop.version} 112 | provided 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | org.scala-tools 122 | maven-scala-plugin 123 | 2.15.2 124 | 125 | 126 | compile 127 | 128 | compile 129 | 130 | compile 131 | 132 | 133 | test-compile 134 | 135 | testCompile 136 | 137 | test-compile 138 | 139 | 140 | process-resources 141 | 142 | compile 143 | 144 | 145 | 146 | 147 | 148 | org.apache.maven.plugins 149 | maven-source-plugin 150 | 3.0.1 151 | 152 | 153 | package 154 | 155 | jar-no-fork 156 | 157 | 158 | 159 | 160 | 161 | 162 | org.apache.maven.plugins 163 | maven-compiler-plugin 164 | 3.6.0 165 | 166 | ${java.version} 167 | ${java.version} 168 | true 169 | 170 | 171 | 172 | 173 | 174 | org.apache.maven.plugins 175 | maven-jar-plugin 176 | 2.4 177 | 178 | 179 | 180 | com.qihoo.finance.tap.Main 181 | 182 | 183 | 184 | 185 | 186 | 187 | org.apache.maven.plugins 188 | maven-shade-plugin 189 | 2.3 190 | 191 | false 192 | ${project.build.directory}/${project.artifactId}-libs.jar 193 | 194 | 195 | *:* 196 | 197 | META-INF/*.SF 198 | META-INF/*.DSA 199 | META-INF/*.RSA 200 | 201 | 202 | 203 | 204 | 205 | 206 | package 207 | 208 | shade 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | -------------------------------------------------------------------------------- /dataImport/src/main/resources/clean.groovy: -------------------------------------------------------------------------------- 1 | graph = JanusGraphFactory.open('conf/gremlin-server/janusgraph-hbase-es-server-new.properties') 2 | graph.close(); org.janusgraph.core.util.JanusGraphCleanup.clear(graph) 3 | 4 | :load data/lolth-schema-pass.groovy 5 | graph = JanusGraphFactory.open('conf/gremlin-server/janusgraph-hbase-es-server-new.properties') 6 | 7 | defineLolthSchema(graph) 8 | graph.close() -------------------------------------------------------------------------------- /dataImport/src/main/resources/clean.sh: -------------------------------------------------------------------------------- 1 | bin/gremlin.sh -e ./clean.groovy 
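A quick way to sanity-check what clean.groovy left behind before starting a bulk load is to reopen the graph and list the schema elements that defineLolthSchema created. The following is a minimal sketch, assuming it is pasted into a bin/gremlin.sh session started from the JanusGraph install directory (the same assumption clean.sh makes):

```
// verification sketch (assumption: run inside bin/gremlin.sh, where JanusGraphFactory is already imported)
graph = JanusGraphFactory.open('conf/gremlin-server/janusgraph-hbase-es-server-new.properties')
mgmt = graph.openManagement()
// vertex labels declared by defineLolthSchema: DEVICE, MOBILE, WIFI
mgmt.getVertexLabels().each { println it }
// the unique composite index on name and the bulk-loader id index
println mgmt.getGraphIndex('name')
println mgmt.getGraphIndex('byBulkLoaderVertexId')
mgmt.rollback()
graph.close()
```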
-------------------------------------------------------------------------------- /dataImport/src/main/resources/conf/gremlin-server/janusgraph-hbase-es-server-new.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/360digitech/janusgraph-data-importer/e4a09b32984961884a5994e4cdff80b211318e43/dataImport/src/main/resources/conf/gremlin-server/janusgraph-hbase-es-server-new.properties -------------------------------------------------------------------------------- /dataImport/src/main/resources/conf/hadoop-graph/hadoop-vertex-script-yarn.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/360digitech/janusgraph-data-importer/e4a09b32984961884a5994e4cdff80b211318e43/dataImport/src/main/resources/conf/hadoop-graph/hadoop-vertex-script-yarn.properties -------------------------------------------------------------------------------- /dataImport/src/main/resources/conf/hadoop-graph/hadoop-vertex-script.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/360digitech/janusgraph-data-importer/e4a09b32984961884a5994e4cdff80b211318e43/dataImport/src/main/resources/conf/hadoop-graph/hadoop-vertex-script.properties -------------------------------------------------------------------------------- /dataImport/src/main/resources/convert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | spark-submit --class com.qihoo.finance.tap.data.convert.MobileConvertToCsv --master yarn --conf spark.network.timeout=300 --deploy-mode client --queue root.graph ./dataImport-libs.jar hdfs://360jinrongbdp/user/finloan/janusgraph_new/mobile_split/part-* 3 | 4 | 5 | spark-submit --class com.qihoo.finance.tap.data.convert.MergeNodesAndEdges --master yarn --conf spark.network.timeout=600 --deploy-mode client --queue root.graph ./dataImport-libs.jar --outputFile hdfs://360jinrongbdp/user/finloan/janusgraph_new/merge_all_relation.txt 6 | -------------------------------------------------------------------------------- /dataImport/src/main/resources/data/lolth-schema.groovy: -------------------------------------------------------------------------------- 1 | /* lolth-schema.groovy 2 | * 3 | * Helper functions for declaring JanusGraph schema elements 4 | * (vertex labels, edge labels, property keys) to accommodate 5 | * TP3 sample data. 
 *
 * Sample usage in a gremlin.sh session:
 * bin/gremlin.sh
 * :load data/lolth-schema.groovy
 * t = JanusGraphFactory.open('conf/gremlin-server/janusgraph-hbase-es-server.properties')
 * defineLolthSchema(t)
 * t.close()
 * gremlin>
 */

def defineLolthSchema(janusGraph) {
    mgmt = janusGraph.openManagement()
    name = mgmt.makePropertyKey("name").dataType(String.class).make()
    is_register = mgmt.makePropertyKey("is_register").dataType(String.class).make()
    is_risk = mgmt.makePropertyKey("is_risk").dataType(String.class).make()
    is_internal = mgmt.makePropertyKey("is_internal").dataType(String.class).make()
    is_service = mgmt.makePropertyKey("is_service").dataType(String.class).make()
    merchant_name = mgmt.makePropertyKey("merchant_name").dataType(String.class).make()
    is_exception = mgmt.makePropertyKey("is_exception").dataType(String.class).make()
    is_white = mgmt.makePropertyKey("is_white").dataType(String.class).make()

    // encrypted variants of the name property
    // nm_pass = mgmt.makePropertyKey("nm_pass").dataType(String.class).make()
    // nm_sha1 = mgmt.makePropertyKey("nm_sha1").dataType(String.class).make()

    status = mgmt.makePropertyKey("status").dataType(Integer.class).make()
    suspect_risk = mgmt.makePropertyKey("suspect_risk").dataType(Integer.class).make()
    overdue_status = mgmt.makePropertyKey("overdue_status").dataType(Integer.class).make()
    mgm = mgmt.makePropertyKey("mgm").dataType(Integer.class).make()

    blid = mgmt.makePropertyKey("bulkLoader.vertex.id").dataType(Long.class).make()
    mgmt.buildIndex("byBulkLoaderVertexId", Vertex.class).addKey(blid).buildCompositeIndex()

    // Note: JanusGraph label names are case-sensitive, whereas AgensGraph does not distinguish case,
    // so upper case is used throughout
    mgmt.makeVertexLabel("DEVICE").make()
    mgmt.makeVertexLabel("MOBILE").make()
    mgmt.makeVertexLabel("WIFI").make()

    mgmt.makeEdgeLabel("CALL").multiplicity(Multiplicity.SIMPLE).make()
    mgmt.makeEdgeLabel("HAS").multiplicity(Multiplicity.SIMPLE).make()
    mgmt.makeEdgeLabel("USE").multiplicity(Multiplicity.SIMPLE).make()
    mgmt.makeEdgeLabel("USE_WIFI").multiplicity(Multiplicity.SIMPLE).make()

    mgmt.buildIndex("name", Vertex.class).addKey(name).unique().buildCompositeIndex()
    // mgmt.buildIndex("nm_sha1", Vertex.class).addKey(nm_sha1).unique().buildCompositeIndex()
    mgmt.commit()
}

--------------------------------------------------------------------------------
/dataImport/src/main/resources/data_export.txt:
--------------------------------------------------------------------------------
# AgensGraph data export
# vertices
COPY (
    match (m:mobile) return m
) TO '/tmp/mobile.csv';


COPY (
    match (m:device) return m
) TO '/tmp/device.csv';


COPY (
    match (m:wifi) return m
) TO '/tmp/wifi.csv';


COPY (
    match (m1:MOBILE)-[r:CALL]->(m2:MOBILE) return m1.name, 'CALL', m2.name, r.mgm
) TO '/tmp/call.csv' DELIMITER ',';


COPY (
    match (m1:MOBILE)-[r:USE]->(m2:DEVICE) return m1.name, 'USE', m2.name
) TO '/tmp/use.csv' DELIMITER ',';

COPY (
    match (m1:DEVICE)-[r:HAS]->(m2:MOBILE) return m1.name, 'HAS', m2.name
) TO '/tmp/has.csv' DELIMITER ',';

COPY (
    match (m1:MOBILE)-[r:USE_WIFI]->(m2:WIFI) return m1.name, 'USE_WIFI', m2.name
) TO '/tmp/use_wifi.csv' DELIMITER ',';
--------------------------------------------------------------------------------
/dataImport/src/main/resources/gremlin_run.groovy: -------------------------------------------------------------------------------- 1 | 2 | graph = GraphFactory.open("conf/hadoop-graph/hadoop-vertex-script.properties") 3 | blvp = BulkLoaderVertexProgram.build().bulkLoader(OneTimeBulkLoader).writeGraph("conf/gremlin-server/janusgraph-hbase-es-server-new.properties").create(graph) 4 | graph.compute(SparkGraphComputer).program(blvp).submit().get() -------------------------------------------------------------------------------- /dataImport/src/main/resources/gremlin_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 配置 hadoop 环境读取 hdfs文件 3 | export HADOOP_CONF_DIR=/etc/hadoop/conf 4 | export CLASSPATH=$HADOOP_CONF_DIR 5 | nohup bin/gremlin.sh -e ./gremlin_run.groovy & -------------------------------------------------------------------------------- /dataImport/src/main/resources/hive/migrate_csv_to_hive_table.sql: -------------------------------------------------------------------------------- 1 | 2 | # 2425 重复值在 wifi中全部有 3 | select count(*) from migrate_id_repe_check_result_tmp a 4 | join migrate_use_wifi_tmp b 5 | on a.name = b.end_name; 6 | 7 | 8 | insert overwrite table migrate_wifi_tmp 9 | select a.* from migrate_wifi_tmp a 10 | left join migrate_id_repe_check_result_tmp b 11 | on a.name = b.name 12 | where b.name is null; 13 | 14 | 15 | 16 | select * from migrate_use_wifi_tmp where end_name = 'FP2368037167772741632'; 17 | 18 | 19 | select row_number() over () as rowid, name from migrate_wifi_tmp limit 100; 20 | 21 | 22 | # 合并生成唯一 id 23 | create table migrate_id_mat_tmp as 24 | select row_number() over () as id, name from ( 25 | select name from migrate_device_tmp 26 | union 27 | select name from migrate_wifi_tmp 28 | union 29 | select name from migrate_mobile_tmp 30 | ) as abc; 31 | 32 | 33 | insert overwrite table migrate_id_repe_check_tmp 34 | select name, type from ( 35 | select name, 'd' as type from migrate_device_tmp 36 | union 37 | select name, 'w' as type from migrate_wifi_tmp 38 | union 39 | select name, 'm' as type from migrate_mobile_tmp 40 | ) as abc; 41 | 42 | 43 | create table migrate_id_repe_check_result_tmp as 44 | select name, count(*) as count 45 | from migrate_id_repe_check_tmp 46 | group by name 47 | having count(*) > 1; 48 | 49 | 50 | 51 | create table migrate_mobile_id_tmp as 52 | select b.id, a.* from migrate_mobile_tmp a 53 | join migrate_id_mat_tmp b 54 | on a.name = b.name; 55 | 56 | 57 | create table migrate_device_id_tmp as 58 | select b.id, a.* from migrate_device_tmp a 59 | join migrate_id_mat_tmp b 60 | on a.name = b.name; 61 | 62 | create table migrate_wifi_id_tmp as 63 | select b.id, a.* from migrate_wifi_tmp a 64 | join migrate_id_mat_tmp b 65 | on a.name = b.name; 66 | 67 | 68 | 69 | DROP TABLE IF EXISTS migrate_call_id_tmp; 70 | create table migrate_call_id_tmp as 71 | select b.id as start_id, c.id as end_id, a.mgm from migrate_call_tmp a 72 | join migrate_mobile_id_tmp b 73 | on a.start_name = b.name 74 | join migrate_mobile_id_tmp c 75 | on a.end_name = c.name; 76 | 77 | 78 | DROP TABLE IF EXISTS migrate_use_id_tmp; 79 | create table migrate_use_id_tmp as 80 | select b.id as start_id, c.id as end_id from migrate_use_tmp a 81 | join migrate_mobile_id_tmp b 82 | on a.start_name = b.name 83 | join migrate_device_id_tmp c 84 | on a.end_name = c.name; 85 | 86 | 87 | 88 | DROP TABLE IF EXISTS migrate_has_id_tmp; 89 | create table migrate_has_id_tmp as 90 | select b.id as start_id, c.id as end_id 
from migrate_has_tmp a 91 | join migrate_device_id_tmp b 92 | on a.start_name = b.name 93 | join migrate_mobile_id_tmp c 94 | on a.end_name = c.name; 95 | 96 | 97 | DROP TABLE IF EXISTS migrate_use_wifi_id_tmp; 98 | create table migrate_use_wifi_id_tmp as 99 | select b.id as start_id, c.id as end_id from migrate_use_wifi_tmp a 100 | join migrate_mobile_id_tmp b 101 | on a.start_name = b.name 102 | join migrate_wifi_id_tmp c 103 | on a.end_name = c.name; -------------------------------------------------------------------------------- /dataImport/src/main/resources/scripts/script_mobile.groovy: -------------------------------------------------------------------------------- 1 | def parse(line) { 2 | def (vertex, inEdges, outEdges) = line.split(/\t/, 3) 3 | def (v1id, v1label, v1props) = vertex.split(/,/, 3) 4 | def v1 = graph.addVertex(T.id, v1id.toLong(), T.label, v1label) 5 | switch (v1label) { 6 | case "MOBILE": 7 | def (name, nm_pass, nm_sha1, is_register, is_risk, is_internal, is_service, merchant_name, status, suspect_risk, overdue_status) = v1props.split(/,/, 11) 8 | v1.property("name", name) 9 | v1.property("nm_pass", nm_pass) 10 | v1.property("nm_sha1", nm_sha1) 11 | 12 | if (is_register?.trim()) { 13 | v1.property("is_register", is_register) 14 | } 15 | if (is_risk?.trim()) { 16 | v1.property("is_risk", is_risk) 17 | } 18 | if (is_internal?.trim()) { 19 | v1.property("is_internal", is_internal) 20 | } 21 | if (is_service?.trim()) { 22 | v1.property("is_service", is_service) 23 | } 24 | if (merchant_name?.trim()) { 25 | v1.property("merchant_name", merchant_name) 26 | } 27 | if (status?.trim()) { 28 | v1.property("status", status.toInteger()) 29 | } 30 | if (suspect_risk?.trim()) { 31 | v1.property("suspect_risk", suspect_risk.toInteger()) 32 | } 33 | if (overdue_status?.trim()) { 34 | v1.property("overdue_status", overdue_status.toInteger()) 35 | } 36 | break 37 | case "DEVICE": 38 | case "WIFI": 39 | def (name, is_exception, is_white) = v1props.split(/,/, 3) 40 | v1.property("name", name) 41 | if (is_exception?.trim()) { 42 | v1.property("is_exception", is_exception) 43 | } 44 | if (is_white?.trim()) { 45 | v1.property("is_white", is_white) 46 | } 47 | break 48 | default: 49 | throw new Exception("Unexpected vertex label: ${v1label}") 50 | } 51 | [[outEdges, true], [inEdges, false]].each { def edges, def out -> 52 | edges.split(/\|/).grep().each { def edge -> 53 | def parts = edge.split(/,/) 54 | def otherV, eLabel, mgm = null 55 | if (parts.size() == 2) { 56 | (eLabel, otherV) = parts 57 | } else { 58 | (eLabel, otherV, mgm) = parts 59 | } 60 | def v2 = graph.addVertex(T.id, otherV.toLong()) 61 | def e = out ? 
v1.addOutEdge(eLabel, v2) : v1.addInEdge(eLabel, v2) 62 | 63 | if (mgm?.trim()) e.property("mgm", mgm.toInteger()) 64 | } 65 | } 66 | return v1 67 | } -------------------------------------------------------------------------------- /dataImport/src/main/resources/test.data/call_edge.csv: -------------------------------------------------------------------------------- 1 | "187027xx013","CALL","187xxx63006",1 2 | "187027xx006","CALL","187xxx61013",\N -------------------------------------------------------------------------------- /dataImport/src/main/resources/test.data/device.csv: -------------------------------------------------------------------------------- 1 | device[10.2]{"name": "FP1627486073238818816"} 2 | device[10.3]{"name": "FP1418992331331342021"} 3 | device[10.4]{"name": "FP1418992331331342005"} 4 | device[10.6]{"name": "FP2659928169380958208"} -------------------------------------------------------------------------------- /dataImport/src/main/resources/test.data/has.csv: -------------------------------------------------------------------------------- 1 | "FP1418992331331342001","HAS","18xxx761005" 2 | "FP1418992331331342001","HAS","18xxx761004" -------------------------------------------------------------------------------- /dataImport/src/main/resources/test.data/mobile.csv: -------------------------------------------------------------------------------- 1 | mobile[11.46]{"name": "1870xxx1013", "is_service": "true", "suspect_risk": 0, "status": 1} 2 | mobile[11.17]{"name": "1870xxx3006", "status": 4, "is_register": "true"} -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/Helper.java: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap; 2 | 3 | import com.alibaba.fastjson.JSON; 4 | import com.alibaba.fastjson.JSONObject; 5 | import org.apache.commons.codec.digest.DigestUtils; 6 | 7 | /** 8 | * @author zhoupeng 9 | * @date 2019/5/9 10 | */ 11 | public class Helper { 12 | public static String buildVertexProperty(String label, String jsonString) { 13 | JSONObject jsonObject = Helper.getVertexProperty(jsonString); 14 | return buildPropertyString(label, jsonObject); 15 | } 16 | 17 | private static String buildPropertyString(String label, JSONObject jsonObject) { 18 | StringBuilder builder = new StringBuilder(); 19 | jsonObject.forEach((key, value) -> { 20 | if (value instanceof String) { 21 | builder.append(".property('").append(key).append("', '").append(value).append("')"); 22 | } else { 23 | builder.append(".property('").append(key).append("', ").append(value).append(")"); 24 | } 25 | 26 | // 手机号才需要加密 27 | if ("MOBILE".equals(label) && "name".equals(key)) { 28 | String encrypt = value.toString(); 29 | String sha1Hex = DigestUtils.sha1Hex(value.toString()); 30 | builder.append(".property('").append("nm_pass").append("', '").append(encrypt).append("')"); 31 | builder.append(".property('").append("nm_sha1").append("', '").append(sha1Hex).append("')"); 32 | } 33 | }); 34 | return builder.toString(); 35 | } 36 | 37 | public static String buildIncrementOtherPropertyString(String label, JSONObject jsonObject) { 38 | StringBuilder builder = new StringBuilder(); 39 | jsonObject.forEach((key, value) -> { 40 | 41 | if ("name".equals(key)) { 42 | // 手机号 才需要加密信息 43 | if ("MOBILE".equals(label)) { 44 | String encrypt =value.toString(); 45 | String sha1Hex = DigestUtils.sha1Hex(value.toString()); 46 | 47 | 
builder.append(".property('").append("nm_pass").append("', '").append(encrypt).append("')"); 48 | builder.append(".property('").append("nm_sha1").append("', '").append(sha1Hex).append("')"); 49 | } 50 | 51 | } else if ("status".equals(key)) { 52 | // do nothing 53 | } else { 54 | if (value instanceof String) { 55 | builder.append(".property('").append(key).append("', '").append(value).append("')"); 56 | } else { 57 | builder.append(".property('").append(key).append("', ").append(value).append(")"); 58 | } 59 | } 60 | 61 | }); 62 | return builder.toString(); 63 | } 64 | 65 | public static JSONObject getVertexProperty(String jsonString) { 66 | return JSON.parseObject(jsonString); 67 | } 68 | } -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/ImportCommon.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap 2 | 3 | import java.util 4 | import java.util.concurrent.TimeUnit 5 | import java.util.function.Consumer 6 | 7 | import com.alibaba.fastjson.JSONObject 8 | import org.apache.log4j.{LogManager, Logger} 9 | import org.apache.tinkerpop.gremlin.driver.{Client, Result} 10 | 11 | import scala.util.control.Breaks 12 | 13 | 14 | object ImportCommon { 15 | val logger: Logger = LogManager.getLogger("ImportCommon") 16 | 17 | type OptionMap = Map[Symbol, Any] 18 | 19 | def getJanusGraph(hosts: String, port: Int, poolSize: Int): JanusGraphProvider = { 20 | new JanusGraphProvider(hosts, port, poolSize) 21 | } 22 | 23 | def isEmpty(x: String) = Option(x).forall(_.isEmpty) 24 | 25 | 26 | def submitWithRetry(client: Client, runCql: String) = { 27 | val loop = new Breaks 28 | loop.breakable { 29 | for (a <- 1 to 100) { 30 | try { 31 | client.submit(runCql).stream().forEach(new Consumer[Result] { 32 | override def accept(t: Result): Unit = 33 | logger.info(t.getLong) 34 | }) 35 | loop.break() 36 | } catch { 37 | case ex: Exception => 38 | logger.warn(runCql) 39 | logger.warn(ex.getMessage, ex) 40 | TimeUnit.MILLISECONDS.sleep(1000 * a) 41 | } 42 | 43 | } 44 | } 45 | } 46 | 47 | 48 | def getResultWithRetry(client: Client, runCql: String): util.List[Result] = { 49 | var results: util.List[Result] = null 50 | val loop = new Breaks 51 | loop.breakable { 52 | for (a <- 1 to 100) { 53 | try { 54 | results = client.submit(runCql).all.get 55 | loop.break() 56 | } catch { 57 | case ex: Exception => 58 | logger.warn(ex.getMessage, ex) 59 | TimeUnit.MILLISECONDS.sleep(1000 * a) 60 | } 61 | } 62 | } 63 | 64 | results 65 | } 66 | 67 | 68 | def handleVertexList(recordList: List[(String, String)], client: Client): Unit = { 69 | var runCql = "g = graph.traversal();g" 70 | 71 | recordList.foreach { case (label, attrString) => 72 | runCql += ".addV('" + label + "')" 73 | runCql += Helper.buildVertexProperty(label, attrString); 74 | } 75 | 76 | if (recordList.nonEmpty) { 77 | runCql += ".count()" 78 | 79 | ImportCommon.submitWithRetry(client, runCql) 80 | } 81 | } 82 | 83 | // 顶点增量插入 84 | def handleVertexIncrementList(recordList: List[(String, String)], client: Client): Unit = { 85 | var runCql = "g = graph.traversal();g" 86 | 87 | recordList.foreach { case (label, attrString) => 88 | val attrJson = Helper.getVertexProperty(attrString) 89 | val name = attrJson.getString("name") 90 | runCql += ".V().has('name', '" + name + "').hasLabel('" + label + "').as('m').fold().coalesce(unfold(), addV('" + label + "').property('name', '" + name + "'))" 91 | 92 | runCql += 
Helper.buildIncrementOtherPropertyString(label, attrJson) 93 | // status 属性单独进行操作 94 | val status = attrJson.getIntValue("status") 95 | if (status > 0) { 96 | runCql += ".V().has('name', '" + name + "').as('m').where(or(select('m').values('status').is(lt(" + status + ")), select('m').hasNot('status'))).property('status', " + status + ")" 97 | } 98 | } 99 | 100 | if (recordList.nonEmpty) { 101 | runCql += ".count()" 102 | ImportCommon.submitWithRetry(client, runCql) 103 | } 104 | } 105 | 106 | def nextOption(map: OptionMap, list: List[String]): OptionMap = { 107 | def isSwitch(s: String) = s(0) == '-' 108 | 109 | list match { 110 | case Nil => map 111 | case "--janusgraph-hosts" :: value :: tail => 112 | ImportCommon.nextOption(map ++ Map('janusgraphHosts -> value.toString), tail) 113 | case "--janusgraph-port" :: value :: tail => 114 | ImportCommon.nextOption(map ++ Map('janusgraphPort -> value.toInt), tail) 115 | case "--batch-size" :: value :: tail => 116 | ImportCommon.nextOption(map ++ Map('batchSize -> value.toInt), tail) 117 | case "--pool-size" :: value :: tail => 118 | ImportCommon.nextOption(map ++ Map('poolSize -> value.toInt), tail) 119 | case "--storage-hostname" :: value :: tail => 120 | ImportCommon.nextOption(map ++ Map('storageHostname -> value.toString), tail) 121 | case "--label" :: value :: tail => 122 | ImportCommon.nextOption(map ++ Map('label -> value.toString), tail) 123 | case "--deviceType" :: value :: tail => 124 | ImportCommon.nextOption(map ++ Map('deviceType -> value.toString), tail) 125 | case "--edgeType" :: value :: tail => 126 | ImportCommon.nextOption(map ++ Map('edgeType -> value.toString), tail) 127 | case "--fromLabel" :: value :: tail => 128 | ImportCommon.nextOption(map ++ Map('fromLabel -> value.toString), tail) 129 | case "--toLabel" :: value :: tail => 130 | ImportCommon.nextOption(map ++ Map('toLabel -> value.toString), tail) 131 | case "--outputFile" :: value :: tail => 132 | ImportCommon.nextOption(map ++ Map('outputFile -> value.toString), tail) 133 | case string :: opt2 :: tail if isSwitch(opt2) => 134 | ImportCommon.nextOption(map ++ Map('importFile -> string.toString), list.tail) 135 | case string :: Nil => ImportCommon.nextOption(map ++ Map('importFile -> string.toString), list.tail) 136 | case option :: tail => println("Unknown option " + option) 137 | Map() 138 | } 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/JanusGraphProvider.java: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap; 2 | 3 | import org.apache.commons.configuration.Configuration; 4 | import org.apache.commons.configuration.PropertiesConfiguration; 5 | import org.apache.log4j.LogManager; 6 | import org.apache.log4j.Logger; 7 | import org.apache.tinkerpop.gremlin.driver.Client; 8 | import org.apache.tinkerpop.gremlin.driver.Cluster; 9 | 10 | import java.util.Objects; 11 | 12 | /** 13 | * @author zhoupeng 14 | * @date 2019/1/31 15 | */ 16 | public class JanusGraphProvider { 17 | private static final Logger logger = LogManager.getLogger(JanusGraphProvider.class); 18 | public Cluster cluster; 19 | 20 | 21 | public JanusGraphProvider(String hosts, int port, int poolSize) { 22 | Configuration clusterConfig = new PropertiesConfiguration(); 23 | clusterConfig.setProperty("hosts", hosts); 24 | clusterConfig.setProperty("port", port); 25 | clusterConfig.setProperty("connectionPool.minSize", poolSize); 26 | 
clusterConfig.setProperty("connectionPool.maxSize", poolSize); 27 | clusterConfig.setProperty("connectionPool.maxInProcessPerConnection", poolSize); 28 | clusterConfig.setProperty("connectionPool.maxSimultaneousUsagePerConnection", poolSize); 29 | clusterConfig.setProperty("connectionPool.maxContentLength", 65536000); 30 | clusterConfig.setProperty("serializer.className", "org.apache.tinkerpop.gremlin.driver.ser.GryoMessageSerializerV3d0"); 31 | // 此处很蛋疼,需要返回列表,只能加逗号分隔才行,生成两个类 32 | clusterConfig.setProperty("serializer.config.ioRegistries", 33 | "org.janusgraph.graphdb.tinkerpop.JanusGraphIoRegistry,org.janusgraph.graphdb.tinkerpop.JanusGraphIoRegistry"); 34 | 35 | cluster = Cluster.open(clusterConfig); 36 | } 37 | 38 | public Client getClient() { 39 | return this.cluster.connect(); 40 | } 41 | 42 | 43 | public void close() throws Exception { 44 | try { 45 | if (cluster != null) { 46 | // the cluster closes all of its clients 47 | cluster.close(); 48 | } 49 | } finally { 50 | cluster = null; 51 | } 52 | } 53 | 54 | public void submit(String cql) { 55 | Client client = this.getClient(); 56 | try { 57 | client.submit(cql).stream(); 58 | } finally { 59 | if (!Objects.isNull(client)) { 60 | client.close(); 61 | } 62 | } 63 | } 64 | } -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/ScalaHelper.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap 2 | 3 | import java.lang 4 | 5 | import com.alibaba.fastjson.JSONObject 6 | import org.apache.spark.sql.DataFrame 7 | 8 | object ScalaHelper { 9 | def convertHeader(label: String, headerMap: Map[String, String], headerList: Array[String]): Map[String, String] = { 10 | var headResult = Map[String, String]() 11 | for (field <- headerList) { 12 | var result: String = null 13 | 14 | if ("name".equals(field)) { 15 | result = "%s:ID(%s)".format(field, label) 16 | } else { 17 | if (headerMap.contains(field)) { 18 | result = field + ":" + headerMap(field) 19 | } else { 20 | result = field 21 | } 22 | } 23 | 24 | headResult = headResult + (field -> result) 25 | } 26 | 27 | headResult 28 | } 29 | 30 | 31 | def saveAsCSV(outputFile: String, df: DataFrame) = { 32 | df.repartition(1) 33 | .write 34 | .mode("overwrite") 35 | .format("com.databricks.spark.csv") 36 | .option("header", "true") 37 | .option("treatEmptyValuesAsNulls", "false") 38 | .save(outputFile) 39 | } 40 | 41 | 42 | def parseVertexLineGetIdAndAttr(line: String) = { 43 | val labelLast = line.indexOf("[") 44 | val idLast = line.indexOf("]") 45 | val attrStart = line.indexOf("{") 46 | val attrStr = line.substring(attrStart, line.length) 47 | 48 | val jsonObject = Helper.getVertexProperty(attrStr) 49 | jsonObject 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/data/convert/CallEdgeConvertToCsv.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap.data.convert 2 | 3 | import com.qihoo.finance.tap.ImportCommon 4 | import org.apache.log4j.{LogManager, Logger} 5 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 6 | import org.apache.spark.sql.{RowFactory, SQLContext} 7 | import org.apache.spark.{SparkConf, SparkContext} 8 | import org.apache.tinkerpop.gremlin.driver.Client 9 | 10 | object CallEdgeConvertToCsv { 11 | val logger: Logger = 
LogManager.getLogger("CallEdgeConvertToCsv") 12 | 13 | val usage = 14 | """ 15 | Usage: CallEdgeConvertToCsv [--janusgraph-hosts 10.94.90.121] [--janusgraph-port 8182] E:\360_doc\lolth\call_edge.csv 16 | """ 17 | 18 | def main(args: Array[String]) { 19 | if (args.length == 0) { 20 | println(usage) 21 | System.exit(0) 22 | } 23 | 24 | val argList = args.toList 25 | val options = ImportCommon.nextOption(Map(), argList) 26 | 27 | val conf = new SparkConf().setAppName("CallEdgeConvertToCsv") 28 | //setMaster("local") 本机的spark就用local,远端的就写ip 29 | //如果是打成jar包运行则需要去掉 setMaster("local")因为在参数中会指定。 30 | // conf.setMaster("local") 31 | 32 | val sc = new SparkContext(conf) 33 | val sqlContext = new SQLContext(sc) 34 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String]) 35 | val outputFile = options.getOrElse('outputFile, "").asInstanceOf[String] 36 | 37 | val dataRdd = txtFile.map { 38 | line => 39 | val fields = line.replace("\"", "").split(",") 40 | // "1870276152746","CALL","18602761525746" 41 | // "13512340050","CALL","15607804358",1 42 | // CALL 边有 mgm 属性 43 | 44 | if (!"\\N".equals(fields(3))) { 45 | val mgmInt: java.lang.Integer = Integer.parseInt(fields(3)) 46 | RowFactory.create(fields(0), fields(2), mgmInt) 47 | } else { 48 | RowFactory.create(fields(0), fields(2), null) 49 | } 50 | } 51 | 52 | val structType = new StructType() 53 | .add(StructField("start_name", StringType, nullable = true)) 54 | .add(StructField("end_name", StringType, nullable = true)) 55 | .add(StructField("mgm", IntegerType, nullable = true)) 56 | 57 | val df = sqlContext.createDataFrame(dataRdd, structType) 58 | 59 | df.createOrReplaceTempView("csv_df") 60 | sqlContext.sql("create table migrate_call_tmp as select * from csv_df") 61 | 62 | // df.show() 63 | // ScalaHelper.saveAsCSV(outputFile, df) 64 | 65 | println("***********************stoped***********************") 66 | sc.stop() 67 | } 68 | 69 | private def handleEdgeList(cqlList: List[String], client: Client): Unit = { 70 | var runCql = "g = graph.traversal();g" 71 | 72 | cqlList.foreach(cql => runCql += cql) 73 | if (cqlList.nonEmpty) { 74 | runCql += ".count()" 75 | ImportCommon.submitWithRetry(client, runCql) 76 | } 77 | } 78 | 79 | } 80 | -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/data/convert/DeviceConvertToCsv.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap.data.convert 2 | 3 | import com.alibaba.fastjson.JSONObject 4 | import com.qihoo.finance.tap.{ImportCommon, ScalaHelper} 5 | import org.apache.log4j.{LogManager, Logger} 6 | import org.apache.spark.sql.types.{StringType, StructType} 7 | import org.apache.spark.sql.{RowFactory, SQLContext} 8 | import org.apache.spark.{SparkConf, SparkContext} 9 | 10 | object DeviceConvertToCsv { 11 | 12 | val logger: Logger = LogManager.getLogger("DeviceConvertToCsv") 13 | 14 | val usage = 15 | """ 16 | Usage: DeviceConvertToCsv [--label] [--outputFile] E:\360_doc\lolth\mobile.csv 17 | """ 18 | 19 | type OptionMap = Map[Symbol, Any] 20 | 21 | 22 | def main(args: Array[String]) { 23 | if (args.length == 0) { 24 | println(usage) 25 | System.exit(0) 26 | } 27 | 28 | val argList = args.toList 29 | val options = ImportCommon.nextOption(Map(), argList) 30 | 31 | 32 | val conf = new SparkConf().setAppName("DeviceConvertToCsv") 33 | //setMaster("local") 本机的spark就用local,远端的就写ip 34 | //如果是打成jar包运行则需要去掉 setMaster("local")因为在参数中会指定。 35 | // 
conf.setMaster("local") 36 | 37 | val sc = new SparkContext(conf) 38 | val sqlContext = new SQLContext(sc) 39 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String]) 40 | val outputFile = options.getOrElse('outputFile, "").asInstanceOf[String] 41 | val deviceType = options.getOrElse('deviceType, "").asInstanceOf[String] 42 | if (deviceType == null) { 43 | println("--deviceType 不能为空 device|wifi") 44 | System.exit(0) 45 | } 46 | 47 | val headerList = Array("name", "is_exception", "is_white") 48 | 49 | // name:ID(human) age:Int 50 | val dataRdd = txtFile.map { 51 | line => 52 | val jsonObject: JSONObject = ScalaHelper.parseVertexLineGetIdAndAttr(line) 53 | RowFactory.create(jsonObject.getString("name"), 54 | jsonObject.getString("is_exception"), 55 | jsonObject.getString("is_white") 56 | ) 57 | } 58 | var structType = new StructType() 59 | 60 | for ((elem, i) <- headerList.view.zipWithIndex) { 61 | structType = structType.add(headerList(i), StringType, nullable = true) 62 | } 63 | 64 | val df = sqlContext.createDataFrame(dataRdd, structType) 65 | 66 | df.createOrReplaceTempView("device_csv_df") 67 | 68 | sqlContext.sql("DROP TABLE IF EXISTS migrate_" + deviceType + "_tmp") 69 | sqlContext.sql("create table migrate_" + deviceType + "_tmp as select * from device_csv_df") 70 | 71 | // df.show() 72 | // ScalaHelper.saveAsCSV(outputFile, df) 73 | 74 | println("***********************stoped***********************") 75 | sc.stop() 76 | } 77 | 78 | 79 | } 80 | -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/data/convert/MergeNodesAndEdges.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap.data.convert 2 | 3 | import com.qihoo.finance.tap.ImportCommon 4 | import org.apache.log4j.{LogManager, Logger} 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.{DataFrame, Row, SQLContext} 7 | import org.apache.spark.{SparkConf, SparkContext} 8 | 9 | object MergeNodesAndEdges { 10 | 11 | val logger: Logger = LogManager.getLogger("MergeNodesAndEdges") 12 | 13 | val usage = 14 | """ 15 | 将顶点和边合并为一行,做批量导入 16 | Usage: MergeNodesAndEdges --outputFile 17 | """ 18 | 19 | type OptionMap = Map[Symbol, Any] 20 | 21 | 22 | def main(args: Array[String]) { 23 | // if (args.length == 0) { 24 | // println(usage) 25 | // System.exit(0) 26 | // } 27 | 28 | val argList = args.toList 29 | val options = ImportCommon.nextOption(Map(), argList) 30 | 31 | val conf = new SparkConf().setAppName("MergeNodesAndEdges") 32 | //setMaster("local") 本机的spark就用local,远端的就写ip 33 | //如果是打成jar包运行则需要去掉 setMaster("local")因为在参数中会指定。 34 | // conf.setMaster("local") 35 | 36 | val sc = new SparkContext(conf) 37 | val sqlContext = new SQLContext(sc) 38 | val outputFile = options.getOrElse('outputFile, "").asInstanceOf[String] 39 | 40 | val (mobile_df: DataFrame, device_df: DataFrame, wifi_df: DataFrame, call_df: DataFrame, has_df: DataFrame, use_df: DataFrame, use_wifi_df: DataFrame) = 41 | generateTestDataDF(sc, sqlContext) 42 | 43 | // val mobile_df = sqlContext.sql("select * from migrate_mobile_id_tmp") 44 | // val device_df = sqlContext.sql("select * from migrate_device_id_tmp") 45 | // val wifi_df = sqlContext.sql("select * from migrate_wifi_id_tmp") 46 | // 47 | // val call_df = sqlContext.sql("select * from migrate_call_id_tmp") 48 | // val has_df = sqlContext.sql("select * from migrate_has_id_tmp") 49 | // val use_df = sqlContext.sql("select * from 
migrate_use_id_tmp") 50 | // val use_wifi_df = sqlContext.sql("select * from migrate_use_wifi_id_tmp") 51 | 52 | 53 | val mobile_kv = mobile_df.rdd.keyBy(_ (0)).mapValues(fields => { 54 | // label, other props ... 55 | List("MOBILE", 56 | replaceNullEmpty(fields(1)), 57 | replaceNullEmpty(fields(2)), 58 | replaceNullEmpty(fields(3)), 59 | replaceNullEmpty(fields(4)), 60 | replaceNullEmpty(fields(5)), 61 | replaceNullEmpty(fields(6)), 62 | replaceNullEmpty(fields(7)), 63 | replaceNullEmpty(fields(8)), 64 | replaceNullEmpty(fields(9)), 65 | replaceNullEmpty(fields(10)), 66 | replaceNullEmpty(fields(11)) 67 | ).mkString(",") 68 | }) 69 | 70 | val device_kv = device_df.rdd.keyBy(_ (0)).mapValues(fields => { 71 | // label, other props ... 72 | ("DEVICE", fields(1), replaceNullEmpty(fields(2)), replaceNullEmpty(fields(3))).productIterator.mkString(",") 73 | }) 74 | 75 | val wifi_kv = wifi_df.rdd.keyBy(_ (0)).mapValues(fields => { 76 | // label, other props ... 77 | ("WIFI", fields(1), replaceNullEmpty(fields(2)), replaceNullEmpty(fields(3))).productIterator.mkString(",") 78 | }) 79 | 80 | val call_in_kv = call_df.rdd.keyBy(_ (1)).mapValues(fields => (fields(0), replaceNullEmpty(fields(2)))) 81 | val call_out_kv = call_df.rdd.keyBy(_ (0)).mapValues(fields => (fields(1), replaceNullEmpty(fields(2)))) 82 | 83 | val has_in_kv = has_df.rdd.keyBy(_ (1)).mapValues(fields => fields(0)) 84 | val has_out_kv = has_df.rdd.keyBy(_ (0)).mapValues(fields => fields(1)) 85 | 86 | val use_out_kv = use_df.rdd.keyBy(_ (0)).mapValues(fields => fields(1)) 87 | val use_in_kv = use_df.rdd.keyBy(_ (1)).mapValues(fields => fields(0)) 88 | 89 | val use_wifi_out_kv = use_wifi_df.rdd.keyBy(_ (0)).mapValues(fields => fields(1)) 90 | val use_wifi_in_kv = use_wifi_df.rdd.keyBy(_ (1)).mapValues(fields => fields(0)) 91 | 92 | val mobile_result_rdd = mobile_kv.cogroup(call_in_kv).map(v => { 93 | val callIn = v._2._2.toList.map(v => { 94 | "CALL," + v._1 + "," + v._2 95 | }).mkString("|") 96 | val edge = forceJoinBeforeAndNow(v._2._1.toList.head, callIn, "\t") 97 | (v._1, edge) 98 | }).cogroup(has_in_kv).map(v => { 99 | val hasIn = v._2._2.toList.map(v => { 100 | "HAS," + v 101 | }).mkString("|") 102 | val edge = joinBeforeAndNowWithCheck(v._2._1.toList.head, hasIn, "\t", "|") 103 | (v._1, edge) 104 | }).cogroup(call_out_kv).map(v => { 105 | val callOut = v._2._2.toList.map(v => { 106 | "CALL," + v._1 + "," + v._2 107 | }).mkString("|") 108 | val edge = forceJoinBeforeAndNow(v._2._1.toList.head, callOut, "\t") 109 | (v._1, edge) 110 | }).cogroup(use_out_kv).map(v => { 111 | val useOut = v._2._2.toList.map(v => { 112 | "USE," + v 113 | }).mkString("|") 114 | val edge = joinBeforeAndNowWithCheck(v._2._1.toList.head, useOut, "\t", "|") 115 | (v._1, edge) 116 | }).cogroup(use_wifi_out_kv).map(v => { 117 | val useOut = v._2._2.toList.map(v => { 118 | "USE_WIFI," + v 119 | }).mkString("|") 120 | val edge = joinBeforeAndNowWithCheck(v._2._1.toList.head, useOut, "\t", "|") 121 | (v._1, edge) 122 | }).map(v => v._1 + "," + v._2) 123 | 124 | val device_result_rdd = device_kv.cogroup(use_in_kv).map(v => { 125 | val useIn = v._2._2.toList.map(v => { 126 | "USE," + v 127 | }).mkString("|") 128 | val edge = forceJoinBeforeAndNow(v._2._1.toList.head, useIn, "\t") 129 | (v._1, edge) 130 | }).cogroup(has_out_kv).map(v => { 131 | val hasOut = v._2._2.toList.map(v => { 132 | "HAS," + v 133 | }).mkString("|") 134 | val edge = forceJoinBeforeAndNow(v._2._1.toList.head, hasOut, "\t") 135 | (v._1, edge) 136 | }).map(v => v._1 + "," + v._2) 137 | 138 | 
val wifi_result_rdd = wifi_kv.cogroup(use_wifi_in_kv).map(v => { 139 | val useIn = v._2._2.toList.map(v => { 140 | "USE_WIFI," + v 141 | }).mkString("|") 142 | val edge = forceJoinBeforeAndNow(v._2._1.toList.head, useIn, "\t") 143 | // USE_WIFI has't outEdge so add \t 144 | (v._1, edge + "\t") 145 | }).map(v => v._1 + "," + v._2) 146 | 147 | val total_result = mobile_result_rdd ++ device_result_rdd ++ wifi_result_rdd 148 | 149 | total_result.saveAsTextFile(outputFile) 150 | 151 | // total_result.collect().foreach(println) 152 | 153 | println("***********************stoped***********************") 154 | sc.stop() 155 | } 156 | 157 | private def generateTestDataDF(sc: SparkContext, sqlContext: SQLContext) = { 158 | val mobile_rdd = sc.parallelize(Seq( 159 | Row(1L, "13908125867", "3|TVOiyN2mC/ihdQuMBaw+0A==", "12dd2479ed75af60968d012fa139ff1cffac3683", "true", "", "", "", "", 1, null, null), 160 | Row(2L, "13908125868", "3|TVOiyN2mC/ihdQuMBaw+0A==", "12dd2479ed75af60968d012fa139ff1cffac3683", "true", "", "", "", "", 1, null, 0), 161 | Row(3L, "13908125869", "3|TVOiyN2mC/ihdQuMBaw+0A==", "12dd2479ed75af60968d012fa139ff1cffac3683", "true", "", "", "", "", 3, null, 1) 162 | )) 163 | val device_rdd = sc.parallelize(Seq( 164 | Row(11L, "FP13682956455", null, "false"), 165 | Row(12L, "FP13682956456", "true", null), 166 | Row(13L, "FP13682956457", "true", "false") 167 | )) 168 | val wifi_rdd = sc.parallelize(Seq( 169 | Row(21L, "bssid13682956455", null, "false"), 170 | Row(22L, "bssid13682956456", "true", null), 171 | Row(23L, "bssid13682956457", "true", "false") 172 | )) 173 | 174 | val call_rdd = sc.parallelize(Seq( 175 | Row(1L, 2L, null), 176 | // Row(3L, 2L, 1), 177 | Row(2L, 1L, 1) 178 | // Row(2L, 3L, 1) 179 | )) 180 | val has_rdd = sc.parallelize(Seq( 181 | Row(11L, 1L), 182 | Row(11L, 2L), 183 | Row(11L, 3L), 184 | Row(12L, 1L), 185 | Row(12L, 2L) 186 | )) 187 | val use_rdd = sc.parallelize(Seq( 188 | // Row(1L, 11L), 189 | // Row(1L, 12L), 190 | // Row(2L, 13L), 191 | Row(2L, 11L) 192 | // Row(3L, 12L) 193 | )) 194 | val use_wifi_rdd = sc.parallelize(Seq( 195 | Row(1L, 21L), 196 | Row(1L, 22L), 197 | Row(2L, 23L), 198 | Row(2L, 21L), 199 | Row(3L, 22L) 200 | )) 201 | 202 | val mobile_schema = StructType(List( 203 | StructField("id", LongType, nullable = false), 204 | StructField("name", StringType, nullable = true), 205 | StructField("nm_pass", StringType, nullable = true), 206 | StructField("nm_sha1", StringType, nullable = true), 207 | StructField("is_register", StringType, nullable = true), 208 | StructField("is_risk", StringType, nullable = true), 209 | StructField("is_internal", StringType, nullable = true), 210 | StructField("is_service", StringType, nullable = true), 211 | StructField("merchant_name", StringType, nullable = true), 212 | StructField("status", IntegerType, nullable = true), 213 | StructField("suspect_risk", IntegerType, nullable = true), 214 | StructField("overdue_status", IntegerType, nullable = true) 215 | )) 216 | 217 | val call_schema = StructType(List( 218 | StructField("start_id", LongType, nullable = false), 219 | StructField("end_id", LongType, nullable = true), 220 | StructField("mgm", IntegerType, nullable = true) 221 | )) 222 | val edge_schema = StructType(List( 223 | StructField("start_id", LongType, nullable = false), 224 | StructField("end_id", LongType, nullable = true) 225 | )) 226 | val device_schema = StructType(List( 227 | StructField("id", LongType, nullable = false), 228 | StructField("name", StringType, nullable = true), 229 | 
StructField("is_exception", StringType, nullable = true), 230 | StructField("is_white", StringType, nullable = true) 231 | )) 232 | 233 | val mobile_df = sqlContext.createDataFrame(mobile_rdd, mobile_schema) 234 | val device_df = sqlContext.createDataFrame(device_rdd, device_schema) 235 | val wifi_df = sqlContext.createDataFrame(wifi_rdd, device_schema) 236 | 237 | val call_df = sqlContext.createDataFrame(call_rdd, call_schema) 238 | val has_df = sqlContext.createDataFrame(has_rdd, edge_schema) 239 | val use_df = sqlContext.createDataFrame(use_rdd, edge_schema) 240 | val use_wifi_df = sqlContext.createDataFrame(use_wifi_rdd, edge_schema) 241 | (mobile_df, device_df, wifi_df, call_df, has_df, use_df, use_wifi_df) 242 | } 243 | 244 | def joinBeforeAndNowWithCheck(before: String, now: String, beforSep: String, separate: String): String = { 245 | var edge: String = null 246 | if (now.isEmpty) { 247 | edge = before 248 | } else if (before.endsWith(beforSep)) { 249 | edge = List(before, now).mkString("") 250 | } else { 251 | edge = List(before, now).mkString(separate) 252 | } 253 | edge 254 | } 255 | 256 | def forceJoinBeforeAndNow(before: String, now: String, separate: String): String = { 257 | val edge = List(before, now).mkString(separate) 258 | edge 259 | } 260 | 261 | def replaceNullEmpty(field: Any): Any = { 262 | var value = field 263 | if (value == null) { 264 | value = "" 265 | } 266 | value 267 | } 268 | 269 | 270 | } 271 | -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/data/convert/MobileConvertToCsv.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap.data.convert 2 | 3 | import com.alibaba.fastjson.JSONObject 4 | import com.qihoo.finance.tap.{ImportCommon, ScalaHelper} 5 | import org.apache.commons.codec.digest.DigestUtils 6 | import org.apache.log4j.{LogManager, Logger} 7 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType} 8 | import org.apache.spark.sql.{RowFactory, SQLContext} 9 | import org.apache.spark.{SparkConf, SparkContext} 10 | 11 | object MobileConvertToCsv { 12 | 13 | val logger: Logger = LogManager.getLogger("MobileConvertToCsv") 14 | 15 | val usage = 16 | """ 17 | Usage: MobileConvertToCsv [--outputFile] E:\360_doc\lolth\mobile.csv 18 | """ 19 | 20 | type OptionMap = Map[Symbol, Any] 21 | 22 | 23 | def main(args: Array[String]) { 24 | if (args.length == 0) { 25 | println(usage) 26 | System.exit(0) 27 | } 28 | 29 | val argList = args.toList 30 | val options = ImportCommon.nextOption(Map(), argList) 31 | 32 | val conf = new SparkConf().setAppName("MobileConvertToCsv") 33 | //setMaster("local") 本机的spark就用local,远端的就写ip 34 | //如果是打成jar包运行则需要去掉 setMaster("local")因为在参数中会指定。 35 | // conf.setMaster("local") 36 | 37 | val sc = new SparkContext(conf) 38 | val sqlContext = new SQLContext(sc) 39 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String]) 40 | val outputFile = options.getOrElse('outputFile, "").asInstanceOf[String] 41 | 42 | val headerList = Array( "name", "nm_pass", "nm_sha1", "is_register", "is_risk", "is_internal", "is_service", "merchant_name", "status", "suspect_risk", "overdue_status") 43 | 44 | val dataRdd = txtFile.map { 45 | line => 46 | val jsonObject: JSONObject = ScalaHelper.parseVertexLineGetIdAndAttr(line) 47 | 48 | val nameValue = jsonObject.getString("name") 49 | // 加密信息 50 | val encrypt = nameValue 51 | val sha1Hex = DigestUtils.sha1Hex(nameValue) 52 | 
53 | RowFactory.create(nameValue, encrypt, sha1Hex, 54 | jsonObject.getString("is_register"), 55 | jsonObject.getString("is_risk"), 56 | jsonObject.getString("is_internal"), 57 | jsonObject.getString("is_service"), 58 | jsonObject.getString("merchant_name"), 59 | jsonObject.getInteger("status"), 60 | jsonObject.getInteger("suspect_risk"), 61 | jsonObject.getInteger("overdue_status") 62 | ) 63 | } 64 | var structType = new StructType() 65 | 66 | for ((elem, i) <- headerList.view.zipWithIndex) { 67 | if (List("status", "suspect_risk", "overdue_status").contains(elem)) { 68 | structType = structType.add(headerList(i), IntegerType, nullable = true) 69 | } else { 70 | structType = structType.add(headerList(i), StringType, nullable = true) 71 | } 72 | } 73 | 74 | val df = sqlContext.createDataFrame(dataRdd, structType) 75 | 76 | df.createOrReplaceTempView("mobile_csv_df") 77 | 78 | sqlContext.sql("DROP TABLE IF EXISTS migrate_mobile_tmp") 79 | sqlContext.sql("create table migrate_mobile_tmp as select * from mobile_csv_df") 80 | // df.show() 81 | // ScalaHelper.saveAsCSV(outputFile, df) 82 | 83 | println("***********************stoped***********************") 84 | sc.stop() 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/data/convert/OtherEdgeConvertToCsv.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap.data.convert 2 | 3 | import com.qihoo.finance.tap.ImportCommon 4 | import org.apache.log4j.{LogManager, Logger} 5 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 6 | import org.apache.spark.sql.{RowFactory, SQLContext} 7 | import org.apache.spark.{SparkConf, SparkContext} 8 | import org.apache.tinkerpop.gremlin.driver.Client 9 | 10 | object OtherEdgeConvertToCsv { 11 | val logger: Logger = LogManager.getLogger("OtherEdgeConvertToCsv") 12 | 13 | val usage = 14 | """ 15 | Usage: OtherEdgeConvertToCsv [--janusgraph-hosts 10.94.90.121] [--janusgraph-port 8182] E:\360_doc\lolth\call_edge.csv 16 | """ 17 | 18 | def main(args: Array[String]) { 19 | if (args.length == 0) { 20 | println(usage) 21 | System.exit(0) 22 | } 23 | 24 | val argList = args.toList 25 | val options = ImportCommon.nextOption(Map(), argList) 26 | 27 | val conf = new SparkConf().setAppName("OtherEdgeConvertToCsv") 28 | //setMaster("local") 本机的spark就用local,远端的就写ip 29 | //如果是打成jar包运行则需要去掉 setMaster("local")因为在参数中会指定。 30 | // conf.setMaster("local") 31 | 32 | val sc = new SparkContext(conf) 33 | val sqlContext = new SQLContext(sc) 34 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String]) 35 | 36 | val fromLabel = options.getOrElse('fromLabel, null).asInstanceOf[String] 37 | val toLabel = options.getOrElse('toLabel, null).asInstanceOf[String] 38 | val edgeType = options.getOrElse('edgeType, null).asInstanceOf[String] 39 | 40 | if (fromLabel == null || toLabel == null) { 41 | println("必须添加参数 --from-label DEVICE|WIFI|MOBILE --to-label DEVICE|WIFI|MOBILE") 42 | System.exit(0) 43 | } 44 | 45 | // :START_ID(god) :END_ID(titan) 46 | // jupiter saturn 47 | 48 | val dataRdd = txtFile.map { 49 | line => 50 | val fields = line.replace("\"", "").split(",") 51 | // "1870276152746","CALL","18602761525746" 52 | // "13512340050","CALL","15607804358",1 53 | // CALL 边有 mgm 属性 54 | RowFactory.create(fields(0), fields(2)) 55 | } 56 | 57 | val structType = new StructType() 58 | .add(StructField("start_name", StringType, nullable 
= true)) 59 | .add(StructField("end_name", StringType, nullable = true)) 60 | 61 | val df = sqlContext.createDataFrame(dataRdd, structType) 62 | // df.show() 63 | 64 | df.createOrReplaceTempView("edge_csv_df") 65 | 66 | sqlContext.sql("DROP TABLE IF EXISTS migrate_" + edgeType + "_tmp") 67 | sqlContext.sql("create table migrate_" + edgeType + "_tmp as select * from edge_csv_df") 68 | 69 | 70 | println("***********************stoped***********************") 71 | sc.stop() 72 | } 73 | 74 | private def handleEdgeList(cqlList: List[String], client: Client): Unit = { 75 | var runCql = "g = graph.traversal();g" 76 | 77 | cqlList.foreach(cql => runCql += cql) 78 | if (cqlList.nonEmpty) { 79 | runCql += ".count()" 80 | ImportCommon.submitWithRetry(client, runCql) 81 | } 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/direct/EdgeImport.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap.direct 2 | 3 | import com.qihoo.finance.tap.ImportCommon 4 | import org.apache.log4j.{LogManager, Logger} 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | import org.apache.tinkerpop.gremlin.driver.Client 7 | 8 | object EdgeImport { 9 | val logger: Logger = LogManager.getLogger("EdgeImport") 10 | 11 | val usage = 12 | """ 13 | Usage: EdgeImport [--janusgraph-hosts 10.94.90.121] [--janusgraph-port 8182] E:\360_doc\lolth\call_edge.csv 14 | """ 15 | 16 | def main(args: Array[String]) { 17 | if (args.length == 0) { 18 | println(usage) 19 | System.exit(0) 20 | } 21 | 22 | val argList = args.toList 23 | val options = ImportCommon.nextOption(Map(), argList) 24 | 25 | val conf = new SparkConf().setAppName("EdgeImport") 26 | //setMaster("local") 本机的spark就用local,远端的就写ip 27 | //如果是打成jar包运行则需要去掉 setMaster("local")因为在参数中会指定。 28 | // conf.setMaster("local") 29 | 30 | val sc = new SparkContext(conf) 31 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String]) 32 | val hosts = options.getOrElse('janusgraphHosts, "").asInstanceOf[String] 33 | val port = options.getOrElse('janusgraphPort, 0).asInstanceOf[Int] 34 | val batchSize = options.getOrElse('batchSize, 50).asInstanceOf[Int] 35 | val poolSize = options.getOrElse('poolSize, 16).asInstanceOf[Int] 36 | 37 | txtFile.map { 38 | line => 39 | val fields = line.replace("\"", "").split(",") 40 | // "1870276152746","CALL","18602761525746" 41 | // "13512340050","CALL","15607804358",1 42 | // CALL 边有 mgm 属性 43 | if (fields.length == 4 && "CALL".equals(fields(1)) && !"\\N".equals(fields(3))) { 44 | (fields(0), fields(1), fields(2), Some(fields(3))) 45 | } else { 46 | (fields(0), fields(1), fields(2), None) 47 | } 48 | 49 | }.foreachPartition(partitionOfRecords => { 50 | val provider = ImportCommon.getJanusGraph(hosts, port, poolSize) 51 | val client = provider.getClient 52 | 53 | var cqlList: List[String] = List() 54 | partitionOfRecords.foreach(record => { 55 | var edgeCql = "" 56 | if (record._2 == "CALL" && record._4.nonEmpty) { 57 | edgeCql = ".V().has('name','" + record._1 + "').as('a').V().has('name','" + record._3 + "').addE('" + record._2 + "').from('a').property('mgm'," + record._4.get + ")" 58 | } else { 59 | edgeCql = ".V().has('name','" + record._1 + "').as('a').V().has('name','" + record._3 + "').addE('" + record._2 + "').from('a')" 60 | } 61 | cqlList = edgeCql :: cqlList 62 | if (cqlList.size >= batchSize) { 63 | handleEdgeList(cqlList, client) 64 | cqlList = List() 
65 | } 66 | }) 67 | 68 | handleEdgeList(cqlList, client) 69 | client.close() 70 | provider.close() 71 | }) 72 | println("***********************stoped***********************") 73 | sc.stop() 74 | } 75 | 76 | private def handleEdgeList(cqlList: List[String], client: Client): Unit = { 77 | var runCql = "g = graph.traversal();g" 78 | 79 | cqlList.foreach(cql => runCql += cql) 80 | if (cqlList.nonEmpty) { 81 | runCql += ".count()" 82 | ImportCommon.submitWithRetry(client, runCql) 83 | } 84 | } 85 | 86 | } 87 | -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/direct/VertexImport.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap.direct 2 | 3 | import com.qihoo.finance.tap.ImportCommon 4 | import org.apache.log4j.{LogManager, Logger} 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | 7 | object VertexImport { 8 | 9 | val logger: Logger = LogManager.getLogger("VertexImport") 10 | 11 | val usage = 12 | """ 13 | Usage: VertexImport [--janusgraph-hosts 10.94.90.121] [--janusgraph-port 8182] [--batch-size 20] E:\360_doc\lolth\mobile.csv 14 | """ 15 | 16 | type OptionMap = Map[Symbol, Any] 17 | 18 | def main(args: Array[String]) { 19 | if (args.length == 0) { 20 | println(usage) 21 | System.exit(0) 22 | } 23 | 24 | val argList = args.toList 25 | val options = ImportCommon.nextOption(Map(), argList) 26 | 27 | val conf = new SparkConf().setAppName("VertexImport") 28 | //setMaster("local") 本机的spark就用local,远端的就写ip 29 | //如果是打成jar包运行则需要去掉 setMaster("local")因为在参数中会指定。 30 | // conf.setMaster("local") 31 | 32 | val sc = new SparkContext(conf) 33 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String]) 34 | val hosts = options.getOrElse('janusgraphHosts, "").asInstanceOf[String] 35 | val port = options.getOrElse('janusgraphPort, 8182).asInstanceOf[Int] 36 | val batchSize = options.getOrElse('batchSize, 50).asInstanceOf[Int] 37 | val poolSize = options.getOrElse('poolSize, 16).asInstanceOf[Int] 38 | 39 | 40 | txtFile.map { 41 | line => 42 | val labelLast = line.indexOf("[") 43 | val attrStart = line.indexOf("{") 44 | val label = line.substring(0, labelLast) 45 | val attrStr = line.substring(attrStart, line.length) 46 | (label.toUpperCase(), attrStr) 47 | }.foreachPartition(partitionOfRecords => { 48 | val provider = ImportCommon.getJanusGraph(hosts, port, poolSize) 49 | val client = provider.getClient 50 | 51 | var recordList: List[(String, String)] = List() 52 | partitionOfRecords.foreach(record => { 53 | if (!ImportCommon.isEmpty(record._1)) { 54 | recordList = (record._1, record._2) :: recordList 55 | if (recordList.size >= batchSize) { 56 | ImportCommon.handleVertexList(recordList, client) 57 | recordList = List() 58 | } 59 | } 60 | }) 61 | 62 | ImportCommon.handleVertexList(recordList, client) 63 | client.close() 64 | provider.close() 65 | }) 66 | 67 | println("***********************stoped***********************") 68 | sc.stop() 69 | } 70 | 71 | 72 | } 73 | -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/increment/EdgeImportIncrement.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap.increment 2 | 3 | import com.qihoo.finance.tap.ImportCommon 4 | import org.apache.log4j.{LogManager, Logger} 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | import 
org.apache.tinkerpop.gremlin.driver.Client 7 | 8 | object EdgeImportIncrement { 9 | val logger: Logger = LogManager.getLogger("EdgeImportIncrement") 10 | 11 | val usage = 12 | """ 13 | 边的增量导入 14 | Usage: EdgeImportIncrement [--janusgraph-hosts 10.94.90.121] [--janusgraph-port 8182] E:\360_doc\lolth\call_edge.csv 15 | """ 16 | 17 | def main(args: Array[String]) { 18 | if (args.length == 0) { 19 | println(usage) 20 | System.exit(0) 21 | } 22 | 23 | val argList = args.toList 24 | val options = ImportCommon.nextOption(Map(), argList) 25 | 26 | val conf = new SparkConf().setAppName("EdgeImportIncrement") 27 | //setMaster("local") 本机的spark就用local,远端的就写ip 28 | //如果是打成jar包运行则需要去掉 setMaster("local")因为在参数中会指定。 29 | // conf.setMaster("local") 30 | 31 | val sc = new SparkContext(conf) 32 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String]) 33 | val hosts = options.getOrElse('janusgraphHosts, "").asInstanceOf[String] 34 | val port = options.getOrElse('janusgraphPort, 8182).asInstanceOf[Int] 35 | val batchSize = options.getOrElse('batchSize, 50).asInstanceOf[Int] 36 | val poolSize = options.getOrElse('poolSize, 16).asInstanceOf[Int] 37 | 38 | txtFile.map { 39 | line => 40 | val fields = line.replace("\"", "").split(",") 41 | // "1870276152746","CALL","18602761525746" 42 | // "13512340050","CALL","15607804358",1 43 | // CALL 边有 mgm 属性 44 | if (fields.length == 4 && "CALL".equals(fields(1)) && !"\\N".equals(fields(3))) { 45 | (fields(0), fields(1), fields(2), Some(fields(3))) 46 | } else { 47 | (fields(0), fields(1), fields(2), None) 48 | } 49 | 50 | }.foreachPartition(partitionOfRecords => { 51 | val provider = ImportCommon.getJanusGraph(hosts, port, poolSize) 52 | val client = provider.getClient 53 | 54 | var cqlList: List[String] = List() 55 | partitionOfRecords.foreach(record => { 56 | var edgeCql = ".V().has('name','" + record._1 + "').as('a').V().has('name','" + record._3 + "')" + 57 | ".coalesce(inE('" + record._2 + "').where(outV().as('a')), addE('" + record._2 + "').from('a'))" 58 | 59 | if (record._2 == "CALL" && record._4.nonEmpty) { 60 | edgeCql += ".property('mgm'," + record._4.get + ")" 61 | } 62 | 63 | cqlList = edgeCql :: cqlList 64 | if (cqlList.size >= batchSize) { 65 | handleEdgeList(cqlList, client) 66 | cqlList = List() 67 | } 68 | }) 69 | 70 | handleEdgeList(cqlList, client) 71 | client.close() 72 | provider.close() 73 | }) 74 | println("***********************stoped***********************") 75 | sc.stop() 76 | } 77 | 78 | private def handleEdgeList(cqlList: List[String], client: Client): Unit = { 79 | var runCql = "g = graph.traversal();g" 80 | 81 | cqlList.foreach(cql => runCql += cql) 82 | if (cqlList.nonEmpty) { 83 | runCql += ".count()" 84 | ImportCommon.submitWithRetry(client, runCql) 85 | } 86 | } 87 | 88 | } 89 | -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/increment/VertexImportIncrement.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap.increment 2 | 3 | import java.util 4 | 5 | import com.qihoo.finance.tap.{Helper, ImportCommon} 6 | import org.apache.log4j.{LogManager, Logger} 7 | import org.apache.spark.{SparkConf, SparkContext} 8 | import org.apache.tinkerpop.gremlin.driver.{Client, Result} 9 | 10 | object VertexImportIncrement { 11 | 12 | val logger: Logger = LogManager.getLogger("VertexImportIncrement") 13 | 14 | val usage = 15 | """ 16 | 顶点增量导入,判断属性 17 | Usage: VertexImportIncrement 
[--janusgraph-hosts 10.94.90.121] [--janusgraph-port 8182] [--batch-size 20] E:\360_doc\lolth\mobile.csv 18 | """ 19 | 20 | type OptionMap = Map[Symbol, Any] 21 | 22 | def main(args: Array[String]) { 23 | if (args.length == 0) { 24 | println(usage) 25 | System.exit(0) 26 | } 27 | 28 | val argList = args.toList 29 | val options = ImportCommon.nextOption(Map(), argList) 30 | 31 | val conf = new SparkConf().setAppName("VertexImportIncrement") 32 | //setMaster("local") 本机的spark就用local,远端的就写ip 33 | //如果是打成jar包运行则需要去掉 setMaster("local")因为在参数中会指定。 34 | // conf.setMaster("local") 35 | 36 | val sc = new SparkContext(conf) 37 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String]) 38 | val hosts = options.getOrElse('janusgraphHosts, "").asInstanceOf[String] 39 | val port = options.getOrElse('janusgraphPort, 8182).asInstanceOf[Int] 40 | val batchSize = options.getOrElse('batchSize, 50).asInstanceOf[Int] 41 | val poolSize = options.getOrElse('poolSize, 16).asInstanceOf[Int] 42 | 43 | 44 | txtFile.map { 45 | line => 46 | val labelLast = line.indexOf("[") 47 | val attrStart = line.indexOf("{") 48 | val label = line.substring(0, labelLast) 49 | val attrStr = line.substring(attrStart, line.length) 50 | (label.toUpperCase(), attrStr) 51 | }.foreachPartition(partitionOfRecords => { 52 | val provider = ImportCommon.getJanusGraph(hosts, port, poolSize) 53 | val client = provider.getClient 54 | var recordList: List[(String, String)] = List() 55 | 56 | partitionOfRecords.foreach(record => { 57 | if (!ImportCommon.isEmpty(record._1)) { 58 | 59 | recordList = (record._1, record._2) :: recordList 60 | if (recordList.size >= batchSize) { 61 | ImportCommon.handleVertexIncrementList(recordList, client) 62 | recordList = List() 63 | } 64 | 65 | } 66 | }) 67 | 68 | ImportCommon.handleVertexIncrementList(recordList, client) 69 | client.close() 70 | provider.close() 71 | }) 72 | 73 | println("***********************stoped***********************") 74 | sc.stop() 75 | } 76 | 77 | def isVertexExist(record: (String, String), client: Client): Boolean = { 78 | val jsonObject = Helper.getVertexProperty(record._2) 79 | val name = jsonObject.getString("name") 80 | val cql = "g.V().has('name','" + name + "').count()" 81 | 82 | val results: util.List[Result] = ImportCommon.getResultWithRetry(client, cql) 83 | // val results: util.List[Result] = client.submit(cql).all.get 84 | 85 | if (results != null && results.size() > 0 && results.get(0).getInt > 0) { 86 | return true 87 | } 88 | false 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /janusgraph_yarn.md: -------------------------------------------------------------------------------- 1 | # Janusgraph Yarn Configuration 2 | 3 | 此文档旨在说明 JanusGraph 如何集成 yarn 4 | 5 | ## 环境路径 6 | CDH的安装目录 /opt/cloudera/parcels/CDH/ 7 | CDH的配置文件目录 /etc/hadoop 8 | 9 | ## 下载 10 | spark-2.2.1-bin-hadoop2.7 11 | janusgraph-0.3.2-hadoop2 12 | 13 | 14 | ## Jar 包冲突解决 15 | spark-2.2.1-bin-hadoop2.7 依赖的 guava-14.0.1.jar 与 16 | janusgraph 依赖的 guava-18.0.jar 存着冲突。使用 guava-18.0.jar 17 | 18 | rm -f spark-2.2.1-bin-hadoop2.7/jars/guava-*.jar 19 | 20 | cp janusgraph/lib/guava-18.0.jar spark/jars/ 21 | 22 | ## 修改 bin/gremlin.sh 23 | ```bash 24 | export CLASSPATH="$CLASSPATH:/etc/hadoop/conf/*:/opt/cloudera/parcels/CDH/lib/hadoop-yarn/*:/home/q/spark/jars/*" 25 | ``` 26 | 27 | ## 文件配置 28 | gremlin_yan.sh 29 | ```bash 30 | #!/bin/bash 31 | export HADOOP_CONF_DIR=/etc/hadoop/conf 32 | 33 | export CLASSPATH=$CLASSPATH:$HADOOP_CONF_DIR 34 | # 
关键,会从此目录加载依赖的 spark和yarn jar包 janusgraph 提供的spark jar包不全 35 | export SPARK_HOME=/home/q/spark 36 | 37 | export PATH=$PATH:$SPARK_HOME/bin 38 | bin/gremlin.sh 39 | ``` -------------------------------------------------------------------------------- /optimize.md: -------------------------------------------------------------------------------- 1 | # JanusGraph 查询优化 2 | 3 | JanusGraph 的查询有比较多的优化点,在此做些说明 4 | 5 | ## 属性 _multiPreFetch 优化 6 | 这是个人认为最重要的优化,0.4.0 版本才提供的功能,没有这个功能 JanusGraph 在大数据量的生产环境基本不可用 7 | 8 | ```bash 9 | g.V().has('name', P.within('186xxxx6666')).both('CALL').or(has('is_register','true'), has('is_risk','true')).as('m2').profile() 10 | ``` 11 | 类似上面的语句,没有这个优化,Jansugraph 会找到对端的顶点然后每个顶点单独去获取属性再做过滤条件 12 | 在生产获取的顶点数很多的时候基本不可用 13 | 耗时特别长 14 | 触发了这个优化的话它会批量获取顶点的属性然后做过滤 15 | ```bash 16 | gremlin> g.V(6554048).outE('aggregation').otherV().has('name', neq('bob')).count().profile() 17 | ==>Traversal Metrics 18 | Step Count Traversers Time (ms) % Dur 19 | ============================================================================================================= 20 | GraphStep(vertex,[6554048]) 1 1 35.538 0.15 21 | JanusGraphVertexStep(OUT,[aggregation],vertex) 30159 30159 2220.394 9.28 22 | \_condition=(PROPERTY AND visibility:normal) 23 | \_orders=[] 24 | \_isFitted=true 25 | \_isOrdered=true 26 | \_query=org.janusgraph.diskstorage.keycolumnvalue.SliceQuery@8019d62e 27 | \_multi=true 28 | \_vertices=20000 29 | \_multiPreFetch=true 30 | optimization 82.480 31 | backend-query 30159 275.560 32 | \_query=org.janusgraph.diskstorage.keycolumnvalue.SliceQuery@81bebe6b 33 | optimization 0.712 34 | backend-query 257398 1491.029 35 | \_query=org.janusgraph.diskstorage.keycolumnvalue.SliceQuery@8019d62e 36 | HasStep([name.neq(bob)]) 28054 28054 21612.923 90.33 37 | CountGlobalStep 1 1 56.938 0.24 38 | >TOTAL - - 23925.795 - 39 | ``` 40 | 在profile `_multiPreFetch=true` 表示触发了这个优化 41 | 这个优化触发的条件有点苛刻 42 | 首先需要在配置文件中配置 `query.batch-property-prefetch=true` 43 | 其次需要利用`has`进行属性过滤 44 | 再者查询出来的数据行数不能超过 配置文件中 `cache.tx-cache-size` 设置的值(默认值为2W) 45 | 意思是如果查询出点超过设置值就不会触发这个优化 46 | 更加详细的信息可以参考如下[链接](https://github.com/JanusGraph/janusgraph/issues/984) 47 | 48 | ## 返回结果优化 49 | ```bash 50 | g.V().has("MOBILE", "name", P.within('186xxxx6666')).as("m1").both("CALL").as('m2') \ 51 | .select("m1", "m2") \ 52 | .by(valueMap("name")) \ 53 | .by(valueMap("name", "is_risk", "status", "is_service", "overdue_status")) 54 | ``` 55 | 上面的查询返回的结果是 56 | ```bash 57 | {m1={name=[18658606666]}, m2={name=[13064767986]}} 58 | {m1={name=[18658606666]}, m2={name=[13291676581]}} 59 | {m1={name=[18658606666]}, m2={name=[13566665915]}} 60 | {m1={name=[18658606666]}, m2={name=[15072770149]}} 61 | {m1={name=[18658606666]}, m2={name=[15268898802]}} 62 | {m1={name=[18658606666]}, m2={name=[18657617779], status=[3]}} 63 | ``` 64 | 这样的查询返回结果看似没啥问题,我们之前也是这样写的,这样查询语法比较简洁。 65 | 这个查询有两个问题,我们生产的数据量比较大,并且涉及大量的查询。在生产环境应用的内存很快被打满 66 | 这个查询有两个问题,第一个问题是 name返回的是list,在`Java`中 `ArrayList`的默认值是 10,意思是即使你属性只有一个值 67 | 也会创建10个对象 68 | 第二个问题是,返回的Map 过多,m1, m2 这两个Map虽然只有一个key,但是在`Java` 中`HashMap`的默认值 16, 69 | 同上面的问题会导致大量的内存浪费 70 | ```java 71 | public class ArrayList extends AbstractList 72 | implements List, RandomAccess, Cloneable, java.io.Serializable 73 | { 74 | private static final long serialVersionUID = 8683452581122892189L; 75 | 76 | /** 77 | * Default initial capacity. 78 | */ 79 | private static final int DEFAULT_CAPACITY = 10; 80 | ... 
81 | } 82 | 83 | 84 | public class HashMap extends AbstractMap 85 | implements Map, Cloneable, Serializable { 86 | 87 | private static final long serialVersionUID = 362498820763181265L; 88 | 89 | /** 90 | * The default initial capacity - MUST be a power of two. 91 | */ 92 | static final int DEFAULT_INITIAL_CAPACITY = 1 << 4; // aka 16 93 | ... 94 | } 95 | ``` 96 | 正确的查询语句应该是像下面这样的 97 | ```bash 98 | g.V().has("MOBILE", "name", P.within('186xxxx6666')).as("m1").both("CALL").as('m2') \ 99 | .select("m1", "m2") \ 100 | .project("cName", "mobile", "isRisk", "isService") \ 101 | .by(select("m1").by(coalesce(values("name"), constant("null"))) ) \ 102 | .by(select("m2").by(coalesce(values("name"), constant("null"))) ) \ 103 | .by(select("m2").by(coalesce(values("is_risk"), constant("null"))) ) \ 104 | .by(select("m2").by(coalesce(values("is_service"), constant("null"))) ) \ 105 | # 返回结果为一个 map 并且结果不为 list 106 | {cName=186xxxx6666, mobile=186xxxx6666, isRisk=true, status=0} 107 | ``` 108 | 如果确实需要使用`valueMap` 并且不希望返回 List结果可以用下面的语法 109 | ```bash 110 | valueMap().by(unfold()) 111 | ``` 112 | ## 插入顶点和边的重复检查 113 | ```bash 114 | g.V().has("name", nodeName).fold().coalesce(unfold(), addV("MOBILE").property("name", nodeName)).next(); 115 | ``` 116 | 上面的语句类似merge功能,如果顶点不存在添加顶点 117 | 118 | ```bash 119 | g.V(fromNode).as("a").V(toNode).coalesce(inE(relationLabel).where(outV().as("a")), addE(relationLabel).from("a")) 120 | ``` 121 | 上面的语句会检查两个顶点之间的边是否存着,如果不存在添加对应的边 122 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.qihoo.finance 8 | janusgraph 9 | pom 10 | 1.0.0-SNAPSHOT 11 | 12 | 13 | dataImport 14 | 15 | 16 | 17 | UTF-8 18 | 1.8 19 | false 20 | 21 | 1.9 22 | 1.8.3 23 | 2.2.1 24 | 2.11 25 | 2.6.5 26 | 1.2.6 27 | 28 | 29 | 30 | 31 | 32 | 33 | org.apache.maven.plugins 34 | maven-compiler-plugin 35 | 3.3 36 | 37 | ${java.version} 38 | ${java.version} 39 | true 40 | 41 | 42 | 43 | 44 | org.apache.maven.plugins 45 | maven-deploy-plugin 46 | 47 | ${skip_maven_deploy} 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | --------------------------------------------------------------------------------
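Note on running the Spark jobs above: every importer keeps `conf.setMaster("local")` commented out so that the master is supplied at launch time. A minimal `spark-submit` sketch for `VertexImport` on YARN is shown below; the jar path, the resource sizes and the HDFS input location are placeholders to adapt to your own environment, while `--janusgraph-hosts`, `--janusgraph-port` and `--batch-size` are the options accepted by the job's usage string.

```bash
spark-submit \
  --class com.qihoo.finance.tap.direct.VertexImport \
  --master yarn \
  --deploy-mode client \
  --num-executors 10 \
  --executor-memory 6g \
  --executor-cores 1 \
  /path/to/dataImport-libs.jar \
  --janusgraph-hosts 10.94.90.121 \
  --janusgraph-port 8182 \
  --batch-size 20 \
  hdfs:///tmp/lolth/mobile.csv
```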