├── .gitignore
├── README.md
├── dataImport
│   ├── pom.xml
│   └── src
│       └── main
│           ├── resources
│           │   ├── clean.groovy
│           │   ├── clean.sh
│           │   ├── conf
│           │   │   ├── gremlin-server
│           │   │   │   └── janusgraph-hbase-es-server-new.properties
│           │   │   └── hadoop-graph
│           │   │       ├── hadoop-vertex-script-yarn.properties
│           │   │       └── hadoop-vertex-script.properties
│           │   ├── convert.sh
│           │   ├── data
│           │   │   └── lolth-schema.groovy
│           │   ├── data_export.txt
│           │   ├── gremlin_run.groovy
│           │   ├── gremlin_run.sh
│           │   ├── hive
│           │   │   └── migrate_csv_to_hive_table.sql
│           │   ├── scripts
│           │   │   └── script_mobile.groovy
│           │   └── test.data
│           │       ├── call_edge.csv
│           │       ├── device.csv
│           │       ├── has.csv
│           │       └── mobile.csv
│           └── scala
│               └── com
│                   └── qihoo
│                       └── finance
│                           └── tap
│                               ├── Helper.java
│                               ├── ImportCommon.scala
│                               ├── JanusGraphProvider.java
│                               ├── ScalaHelper.scala
│                               ├── data
│                               │   └── convert
│                               │       ├── CallEdgeConvertToCsv.scala
│                               │       ├── DeviceConvertToCsv.scala
│                               │       ├── MergeNodesAndEdges.scala
│                               │       ├── MobileConvertToCsv.scala
│                               │       └── OtherEdgeConvertToCsv.scala
│                               ├── direct
│                               │   ├── EdgeImport.scala
│                               │   └── VertexImport.scala
│                               └── increment
│                                   ├── EdgeImportIncrement.scala
│                                   └── VertexImportIncrement.scala
├── janusgraph_yarn.md
├── optimize.md
└── pom.xml
/.gitignore:
--------------------------------------------------------------------------------
1 | # maven ignore
2 | target/
3 | *.jar
4 | *.war
5 | *.zip
6 | *.tar
7 |
8 | # eclipse ignore
9 | .settings/
10 | .project
11 | .classpath
12 |
13 | # idea ignore
14 | .idea/*
15 | py_tag_tool/.idea/*
16 | *.ipr
17 | *.iml
18 | *.iws
19 |
20 | # temp ignore
21 | logs/
22 | *.doc
23 | *.log
24 | *.cache
25 | *.diff
26 | *.patch
27 | *.tmp
28 | *.versionsBackup
29 |
30 | # system ignore
31 | .DS_Store
32 | Thumbs.db
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Janusgraph-data-importer
2 |
3 | ## About
4 | This project contains the code I wrote during a data migration for a specific project of mine; you will need to adapt it to your own situation.
5 | Contact email: zhoupengblack@qq.com
6 |
7 | ## Project description
8 |
9 | ### resources files
10 | * conf holds the configuration files used for the data import, with comments explaining the relevant settings
11 | * data contains the schema used to create the graph
12 | * test.data shows the vertex and edge formats we exported from the AgensGraph database; all of the code below parses and operates on this format
13 | * hive adds a unique id to the converted data using Hive
14 | * scripts holds the parsing script that JanusGraph needs during the import
15 | * the sh and groovy scripts under resources are simple helpers written to make the import easier
16 |
17 |
18 | ### Code layout
19 | * data.convert converts the exported data into Hive tables, and the Hive script mentioned above then adds a unique id.
20 | The core is MergeNodesAndEdges, which uses Spark's cogroup to produce the import format JanusGraph accepts (see the example line below).
21 | This step takes quite a while and is memory-hungry; set spark.network.timeout=600 or larger.
22 |
23 | * direct contains code that connects directly to the JanusGraph server and inserts data. It is suitable when the data volume is small.
24 |
25 | * increment imports incremental data. Once the historical data has been loaded, increments must be imported, which requires checking whether each vertex and edge already exists.
26 |
27 |
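MergeNodesAndEdges writes one line per vertex, which scripts/script_mobile.groovy later parses. Each line holds three tab-separated parts: the vertex fields (`id,LABEL,prop1,...`), its in-edges, and its out-edges; each edge is `LABEL,otherVertexId[,mgm]`, and multiple edges are joined with `|`. A sketch of the layout for a MOBILE vertex, with made-up values (`<TAB>` stands for a real tab character):

```
1,MOBILE,13908125867,<nm_pass>,<nm_sha1>,true,,,,,1,,<TAB>CALL,2,1|HAS,11<TAB>CALL,2,|USE_WIFI,21
```
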
28 | ## Common problems
29 | java.lang.OutOfMemoryError: unable to create new native thread
30 | * The machine's ulimit -u is set too low; raising it to 102400 works.
31 |
32 | Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded
33 | * Spark is short of memory. Two options: increase spark.executor.memory or reduce spark.executor.cores,
34 | keeping spark.executor.memory / spark.executor.cores at roughly 6-7 GB (see the example submit command after this list).
35 |
36 | KryoSerializer Failed to find one of the right cookies
37 | * The Spark serializer settings for KryoSerializer are wrong; refer to the hadoop-vertex-script.properties config file.
38 |
39 | * Make sure the data on which the unique index is built really is unique and that ids are unique; otherwise the import will run into problems.
40 |
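As a rough illustration of the memory guidance above (the paths, queue and sizes are placeholders to be tuned for your own cluster), a submit command in the style of convert.sh could look like this:

```
spark-submit --class com.qihoo.finance.tap.data.convert.MergeNodesAndEdges \
  --master yarn --deploy-mode client --queue root.graph \
  --conf spark.network.timeout=600 \
  --conf spark.executor.cores=4 \
  --conf spark.executor.memory=24g \
  ./dataImport-libs.jar --outputFile hdfs://<namenode>/path/merge_all_relation.txt
```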
41 |
42 | ## Additional notes
43 | * provided in the pom file means the dependency is not packed into the jar;
44 | the cluster environment already has these packages. When debugging locally you need to comment that line out,
45 | and also uncomment this line in the code: // conf.setMaster("local")
46 | ```
47 | <dependency>
48 |     <groupId>org.apache.spark</groupId>
49 |     <artifactId>spark-hive_${scala.version}</artifactId>
50 |     <version>${spark.version}</version>
51 |     <scope>provided</scope>
52 | </dependency>
53 | ```
54 |
--------------------------------------------------------------------------------
/dataImport/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 | com.qihoo.finance
7 | janusgraph
8 | 1.0.0-SNAPSHOT
9 |
10 | 4.0.0
11 |
12 | dataImport
13 |
14 |
15 |
16 | org.janusgraph
17 | janusgraph-core
18 | 0.3.1
19 |
20 |
21 | org.janusgraph
22 | janusgraph-hbase
23 | 0.3.1
24 |
25 |
26 | org.apache.hbase
27 | hbase-shaded-client
28 | 1.2.6
29 |
30 |
31 | org.apache.hbase
32 | hbase-shaded-server
33 | 1.2.6
34 |
35 |
36 |
37 | org.janusgraph
38 | janusgraph-es
39 | 0.3.1
40 |
41 |
42 | org.apache.tinkerpop
43 | gremlin-driver
44 | 3.3.3
45 |
46 |
47 | com.google.guava
48 | guava
49 | 16.0
50 |
51 |
52 |
53 | com.alibaba
54 | fastjson
55 | 1.2.58
56 |
57 |
58 | commons-codec
59 | commons-codec
60 | ${commons.codec.version}
61 |
62 |
63 |
64 | org.apache.spark
65 | spark-core_${scala.version}
66 | ${spark.version}
67 | provided
68 |
69 |
70 | org.apache.spark
71 | spark-streaming_${scala.version}
72 | ${spark.version}
73 | provided
74 |
75 |
76 | org.apache.spark
77 | spark-sql_${scala.version}
78 | ${spark.version}
79 | provided
80 |
81 |
82 | org.apache.spark
83 | spark-hive_${scala.version}
84 | ${spark.version}
85 | provided
86 |
87 |
88 |
89 |
90 | org.apache.hadoop
91 | hadoop-client
92 | ${hadoop.version}
93 | provided
94 |
95 |
96 |
97 | org.apache.hadoop
98 | hadoop-hdfs
99 | ${hadoop.version}
100 | provided
101 |
102 |
103 | org.apache.hadoop
104 | hadoop-common
105 | ${hadoop.version}
106 | provided
107 |
108 |
109 | org.apache.hadoop
110 | hadoop-streaming
111 | ${hadoop.version}
112 | provided
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 | org.scala-tools
122 | maven-scala-plugin
123 | 2.15.2
124 |
125 |
126 | compile
127 |
128 | compile
129 |
130 | compile
131 |
132 |
133 | test-compile
134 |
135 | testCompile
136 |
137 | test-compile
138 |
139 |
140 | process-resources
141 |
142 | compile
143 |
144 |
145 |
146 |
147 |
148 | org.apache.maven.plugins
149 | maven-source-plugin
150 | 3.0.1
151 |
152 |
153 | package
154 |
155 | jar-no-fork
156 |
157 |
158 |
159 |
160 |
161 |
162 | org.apache.maven.plugins
163 | maven-compiler-plugin
164 | 3.6.0
165 |
166 | ${java.version}
167 | ${java.version}
168 | true
169 |
170 |
171 |
172 |
173 |
174 | org.apache.maven.plugins
175 | maven-jar-plugin
176 | 2.4
177 |
178 |
179 |
180 | com.qihoo.finance.tap.Main
181 |
182 |
183 |
184 |
185 |
186 |
187 | org.apache.maven.plugins
188 | maven-shade-plugin
189 | 2.3
190 |
191 | false
192 | ${project.build.directory}/${project.artifactId}-libs.jar
193 |
194 |
195 | *:*
196 |
197 | META-INF/*.SF
198 | META-INF/*.DSA
199 | META-INF/*.RSA
200 |
201 |
202 |
203 |
204 |
205 |
206 | package
207 |
208 | shade
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
--------------------------------------------------------------------------------
/dataImport/src/main/resources/clean.groovy:
--------------------------------------------------------------------------------
1 | graph = JanusGraphFactory.open('conf/gremlin-server/janusgraph-hbase-es-server-new.properties')
2 | graph.close(); org.janusgraph.core.util.JanusGraphCleanup.clear(graph)
3 |
4 | :load data/lolth-schema-pass.groovy
5 | graph = JanusGraphFactory.open('conf/gremlin-server/janusgraph-hbase-es-server-new.properties')
6 |
7 | defineLolthSchema(graph)
8 | graph.close()
--------------------------------------------------------------------------------
/dataImport/src/main/resources/clean.sh:
--------------------------------------------------------------------------------
1 | bin/gremlin.sh -e ./clean.groovy
--------------------------------------------------------------------------------
/dataImport/src/main/resources/conf/gremlin-server/janusgraph-hbase-es-server-new.properties:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/360digitech/janusgraph-data-importer/e4a09b32984961884a5994e4cdff80b211318e43/dataImport/src/main/resources/conf/gremlin-server/janusgraph-hbase-es-server-new.properties
--------------------------------------------------------------------------------
/dataImport/src/main/resources/conf/hadoop-graph/hadoop-vertex-script-yarn.properties:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/360digitech/janusgraph-data-importer/e4a09b32984961884a5994e4cdff80b211318e43/dataImport/src/main/resources/conf/hadoop-graph/hadoop-vertex-script-yarn.properties
--------------------------------------------------------------------------------
/dataImport/src/main/resources/conf/hadoop-graph/hadoop-vertex-script.properties:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/360digitech/janusgraph-data-importer/e4a09b32984961884a5994e4cdff80b211318e43/dataImport/src/main/resources/conf/hadoop-graph/hadoop-vertex-script.properties
--------------------------------------------------------------------------------
/dataImport/src/main/resources/convert.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | spark-submit --class com.qihoo.finance.tap.data.convert.MobileConvertToCsv --master yarn --conf spark.network.timeout=300 --deploy-mode client --queue root.graph ./dataImport-libs.jar hdfs://360jinrongbdp/user/finloan/janusgraph_new/mobile_split/part-*
3 |
4 |
5 | spark-submit --class com.qihoo.finance.tap.data.convert.MergeNodesAndEdges --master yarn --conf spark.network.timeout=600 --deploy-mode client --queue root.graph ./dataImport-libs.jar --outputFile hdfs://360jinrongbdp/user/finloan/janusgraph_new/merge_all_relation.txt
6 |
--------------------------------------------------------------------------------
/dataImport/src/main/resources/data/lolth-schema.groovy:
--------------------------------------------------------------------------------
1 | /* lolth-schema.groovy
2 | *
3 | * Helper functions for declaring JanusGraph schema elements
4 | * (vertex labels, edge labels, property keys) to accommodate
5 | * TP3 sample data.
6 | *
7 | * Sample usage in a gremlin.sh session:
8 | * bin/gremlin.sh
9 | * :load data/lolth-schema.groovy
10 | * t = JanusGraphFactory.open('conf/gremlin-server/janusgraph-hbase-es-server.properties')
11 | * defineLolthSchema(t)
12 | * t.close()
13 | * gremlin>
14 | */
15 |
16 | def defineLolthSchema(janusGraph) {
17 | mgmt = janusGraph.openManagement()
18 | name = mgmt.makePropertyKey("name").dataType(String.class).make()
19 | is_register = mgmt.makePropertyKey("is_register").dataType(String.class).make()
20 | is_risk = mgmt.makePropertyKey("is_risk").dataType(String.class).make()
21 | is_internal = mgmt.makePropertyKey("is_internal").dataType(String.class).make()
22 | is_service = mgmt.makePropertyKey("is_service").dataType(String.class).make()
23 | merchant_name = mgmt.makePropertyKey("merchant_name").dataType(String.class).make()
24 | is_exception = mgmt.makePropertyKey("is_exception").dataType(String.class).make()
25 | is_white = mgmt.makePropertyKey("is_white").dataType(String.class).make()
26 |
27 | // encrypted variants of the name property
28 | // nm_pass = mgmt.makePropertyKey("nm_pass").dataType(String.class).make()
29 | // nm_sha1 = mgmt.makePropertyKey("nm_sha1").dataType(String.class).make()
30 |
31 | status = mgmt.makePropertyKey("status").dataType(Integer.class).make()
32 | suspect_risk = mgmt.makePropertyKey("suspect_risk").dataType(Integer.class).make()
33 | overdue_status = mgmt.makePropertyKey("overdue_status").dataType(Integer.class).make()
34 | mgm = mgmt.makePropertyKey("mgm").dataType(Integer.class).make()
35 |
36 | blid = mgmt.makePropertyKey("bulkLoader.vertex.id").dataType(Long.class).make()
37 | mgmt.buildIndex("byBulkLoaderVertexId", Vertex.class).addKey(blid).buildCompositeIndex()
38 |
39 | // Note: JanusGraph label names are case-sensitive, while AgensGraph does not distinguish case,
40 | // so all labels here use upper case
41 | mgmt.makeVertexLabel("DEVICE").make()
42 | mgmt.makeVertexLabel("MOBILE").make()
43 | mgmt.makeVertexLabel("WIFI").make()
44 |
45 | mgmt.makeEdgeLabel("CALL").multiplicity(Multiplicity.SIMPLE).make()
46 | mgmt.makeEdgeLabel("HAS").multiplicity(Multiplicity.SIMPLE).make()
47 | mgmt.makeEdgeLabel("USE").multiplicity(Multiplicity.SIMPLE).make()
48 | mgmt.makeEdgeLabel("USE_WIFI").multiplicity(Multiplicity.SIMPLE).make()
49 |
50 | mgmt.buildIndex("name", Vertex.class).addKey(name).unique().buildCompositeIndex()
51 | // mgmt.buildIndex("nm_sha1", Vertex.class).addKey(nm_sha1).unique().buildCompositeIndex()
52 | mgmt.commit()
53 | }
--------------------------------------------------------------------------------
/dataImport/src/main/resources/data_export.txt:
--------------------------------------------------------------------------------
1 | # AgensGraph data export
2 | # Vertices
3 | COPY (
4 | match (m:mobile) return m
5 | ) TO '/tmp/mobile.csv';
6 |
7 |
8 | COPY (
9 | match (m:device) return m
10 | ) TO '/tmp/device.csv';
11 |
12 |
13 | COPY (
14 | match (m:wifi) return m
15 | ) TO '/tmp/wifi.csv';
16 |
17 |
18 | COPY (
19 | match (m1:MOBILE)-[r:CALL]->(m2:MOBILE) return m1.name, 'CALL', m2.name, r.mgm
20 | ) TO '/tmp/call.csv' DELIMITER ',';
21 |
22 |
23 | COPY (
24 | match (m1:MOBILE)-[r:USE]->(m2:DEVICE) return m1.name, 'USE', m2.name
25 | ) TO '/tmp/use.csv' DELIMITER ',';
26 |
27 | COPY (
28 | match (m1:DEVICE)-[r:HAS]->(m2:MOBILE) return m1.name, 'HAS', m2.name
29 | ) TO '/tmp/has.csv' DELIMITER ',';
30 |
31 | COPY (
32 | match (m1:MOBILE)-[r:USE_WIFI]->(m2:WIFI) return m1.name, 'USE_WIFI', m2.name
33 | ) TO '/tmp/use_wifi.csv' DELIMITER ',';
34 |
--------------------------------------------------------------------------------
/dataImport/src/main/resources/gremlin_run.groovy:
--------------------------------------------------------------------------------
1 |
2 | graph = GraphFactory.open("conf/hadoop-graph/hadoop-vertex-script.properties")
3 | blvp = BulkLoaderVertexProgram.build().bulkLoader(OneTimeBulkLoader).writeGraph("conf/gremlin-server/janusgraph-hbase-es-server-new.properties").create(graph)
4 | graph.compute(SparkGraphComputer).program(blvp).submit().get()
--------------------------------------------------------------------------------
/dataImport/src/main/resources/gremlin_run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Set up the Hadoop environment so HDFS files can be read
3 | export HADOOP_CONF_DIR=/etc/hadoop/conf
4 | export CLASSPATH=$HADOOP_CONF_DIR
5 | nohup bin/gremlin.sh -e ./gremlin_run.groovy &
--------------------------------------------------------------------------------
/dataImport/src/main/resources/hive/migrate_csv_to_hive_table.sql:
--------------------------------------------------------------------------------
1 |
2 | -- all 2425 duplicate names also appear in wifi
3 | select count(*) from migrate_id_repe_check_result_tmp a
4 | join migrate_use_wifi_tmp b
5 | on a.name = b.end_name;
6 |
7 |
8 | insert overwrite table migrate_wifi_tmp
9 | select a.* from migrate_wifi_tmp a
10 | left join migrate_id_repe_check_result_tmp b
11 | on a.name = b.name
12 | where b.name is null;
13 |
14 |
15 |
16 | select * from migrate_use_wifi_tmp where end_name = 'FP2368037167772741632';
17 |
18 |
19 | select row_number() over () as rowid, name from migrate_wifi_tmp limit 100;
20 |
21 |
22 | -- merge all names and generate a unique id
23 | create table migrate_id_mat_tmp as
24 | select row_number() over () as id, name from (
25 | select name from migrate_device_tmp
26 | union
27 | select name from migrate_wifi_tmp
28 | union
29 | select name from migrate_mobile_tmp
30 | ) as abc;
31 |
32 |
33 | insert overwrite table migrate_id_repe_check_tmp
34 | select name, type from (
35 | select name, 'd' as type from migrate_device_tmp
36 | union
37 | select name, 'w' as type from migrate_wifi_tmp
38 | union
39 | select name, 'm' as type from migrate_mobile_tmp
40 | ) as abc;
41 |
42 |
43 | create table migrate_id_repe_check_result_tmp as
44 | select name, count(*) as count
45 | from migrate_id_repe_check_tmp
46 | group by name
47 | having count(*) > 1;
48 |
49 |
50 |
51 | create table migrate_mobile_id_tmp as
52 | select b.id, a.* from migrate_mobile_tmp a
53 | join migrate_id_mat_tmp b
54 | on a.name = b.name;
55 |
56 |
57 | create table migrate_device_id_tmp as
58 | select b.id, a.* from migrate_device_tmp a
59 | join migrate_id_mat_tmp b
60 | on a.name = b.name;
61 |
62 | create table migrate_wifi_id_tmp as
63 | select b.id, a.* from migrate_wifi_tmp a
64 | join migrate_id_mat_tmp b
65 | on a.name = b.name;
66 |
67 |
68 |
69 | DROP TABLE IF EXISTS migrate_call_id_tmp;
70 | create table migrate_call_id_tmp as
71 | select b.id as start_id, c.id as end_id, a.mgm from migrate_call_tmp a
72 | join migrate_mobile_id_tmp b
73 | on a.start_name = b.name
74 | join migrate_mobile_id_tmp c
75 | on a.end_name = c.name;
76 |
77 |
78 | DROP TABLE IF EXISTS migrate_use_id_tmp;
79 | create table migrate_use_id_tmp as
80 | select b.id as start_id, c.id as end_id from migrate_use_tmp a
81 | join migrate_mobile_id_tmp b
82 | on a.start_name = b.name
83 | join migrate_device_id_tmp c
84 | on a.end_name = c.name;
85 |
86 |
87 |
88 | DROP TABLE IF EXISTS migrate_has_id_tmp;
89 | create table migrate_has_id_tmp as
90 | select b.id as start_id, c.id as end_id from migrate_has_tmp a
91 | join migrate_device_id_tmp b
92 | on a.start_name = b.name
93 | join migrate_mobile_id_tmp c
94 | on a.end_name = c.name;
95 |
96 |
97 | DROP TABLE IF EXISTS migrate_use_wifi_id_tmp;
98 | create table migrate_use_wifi_id_tmp as
99 | select b.id as start_id, c.id as end_id from migrate_use_wifi_tmp a
100 | join migrate_mobile_id_tmp b
101 | on a.start_name = b.name
102 | join migrate_wifi_id_tmp c
103 | on a.end_name = c.name;
--------------------------------------------------------------------------------
/dataImport/src/main/resources/scripts/script_mobile.groovy:
--------------------------------------------------------------------------------
1 | def parse(line) {
2 | def (vertex, inEdges, outEdges) = line.split(/\t/, 3)
3 | def (v1id, v1label, v1props) = vertex.split(/,/, 3)
4 | def v1 = graph.addVertex(T.id, v1id.toLong(), T.label, v1label)
5 | switch (v1label) {
6 | case "MOBILE":
7 | def (name, nm_pass, nm_sha1, is_register, is_risk, is_internal, is_service, merchant_name, status, suspect_risk, overdue_status) = v1props.split(/,/, 11)
8 | v1.property("name", name)
9 | v1.property("nm_pass", nm_pass)
10 | v1.property("nm_sha1", nm_sha1)
11 |
12 | if (is_register?.trim()) {
13 | v1.property("is_register", is_register)
14 | }
15 | if (is_risk?.trim()) {
16 | v1.property("is_risk", is_risk)
17 | }
18 | if (is_internal?.trim()) {
19 | v1.property("is_internal", is_internal)
20 | }
21 | if (is_service?.trim()) {
22 | v1.property("is_service", is_service)
23 | }
24 | if (merchant_name?.trim()) {
25 | v1.property("merchant_name", merchant_name)
26 | }
27 | if (status?.trim()) {
28 | v1.property("status", status.toInteger())
29 | }
30 | if (suspect_risk?.trim()) {
31 | v1.property("suspect_risk", suspect_risk.toInteger())
32 | }
33 | if (overdue_status?.trim()) {
34 | v1.property("overdue_status", overdue_status.toInteger())
35 | }
36 | break
37 | case "DEVICE":
38 | case "WIFI":
39 | def (name, is_exception, is_white) = v1props.split(/,/, 3)
40 | v1.property("name", name)
41 | if (is_exception?.trim()) {
42 | v1.property("is_exception", is_exception)
43 | }
44 | if (is_white?.trim()) {
45 | v1.property("is_white", is_white)
46 | }
47 | break
48 | default:
49 | throw new Exception("Unexpected vertex label: ${v1label}")
50 | }
51 | [[outEdges, true], [inEdges, false]].each { def edges, def out ->
52 | edges.split(/\|/).grep().each { def edge ->
53 | def parts = edge.split(/,/)
54 | def otherV, eLabel, mgm = null
55 | if (parts.size() == 2) {
56 | (eLabel, otherV) = parts
57 | } else {
58 | (eLabel, otherV, mgm) = parts
59 | }
60 | def v2 = graph.addVertex(T.id, otherV.toLong())
61 | def e = out ? v1.addOutEdge(eLabel, v2) : v1.addInEdge(eLabel, v2)
62 |
63 | if (mgm?.trim()) e.property("mgm", mgm.toInteger())
64 | }
65 | }
66 | return v1
67 | }
--------------------------------------------------------------------------------
/dataImport/src/main/resources/test.data/call_edge.csv:
--------------------------------------------------------------------------------
1 | "187027xx013","CALL","187xxx63006",1
2 | "187027xx006","CALL","187xxx61013",\N
--------------------------------------------------------------------------------
/dataImport/src/main/resources/test.data/device.csv:
--------------------------------------------------------------------------------
1 | device[10.2]{"name": "FP1627486073238818816"}
2 | device[10.3]{"name": "FP1418992331331342021"}
3 | device[10.4]{"name": "FP1418992331331342005"}
4 | device[10.6]{"name": "FP2659928169380958208"}
--------------------------------------------------------------------------------
/dataImport/src/main/resources/test.data/has.csv:
--------------------------------------------------------------------------------
1 | "FP1418992331331342001","HAS","18xxx761005"
2 | "FP1418992331331342001","HAS","18xxx761004"
--------------------------------------------------------------------------------
/dataImport/src/main/resources/test.data/mobile.csv:
--------------------------------------------------------------------------------
1 | mobile[11.46]{"name": "1870xxx1013", "is_service": "true", "suspect_risk": 0, "status": 1}
2 | mobile[11.17]{"name": "1870xxx3006", "status": 4, "is_register": "true"}
--------------------------------------------------------------------------------
/dataImport/src/main/scala/com/qihoo/finance/tap/Helper.java:
--------------------------------------------------------------------------------
1 | package com.qihoo.finance.tap;
2 |
3 | import com.alibaba.fastjson.JSON;
4 | import com.alibaba.fastjson.JSONObject;
5 | import org.apache.commons.codec.digest.DigestUtils;
6 |
7 | /**
8 | * @author zhoupeng
9 | * @date 2019/5/9
10 | */
11 | public class Helper {
12 | public static String buildVertexProperty(String label, String jsonString) {
13 | JSONObject jsonObject = Helper.getVertexProperty(jsonString);
14 | return buildPropertyString(label, jsonObject);
15 | }
16 |
17 | private static String buildPropertyString(String label, JSONObject jsonObject) {
18 | StringBuilder builder = new StringBuilder();
19 | jsonObject.forEach((key, value) -> {
20 | if (value instanceof String) {
21 | builder.append(".property('").append(key).append("', '").append(value).append("')");
22 | } else {
23 | builder.append(".property('").append(key).append("', ").append(value).append(")");
24 | }
25 |
26 | // only mobile numbers need the encrypted variants
27 | if ("MOBILE".equals(label) && "name".equals(key)) {
28 | String encrypt = value.toString();
29 | String sha1Hex = DigestUtils.sha1Hex(value.toString());
30 | builder.append(".property('").append("nm_pass").append("', '").append(encrypt).append("')");
31 | builder.append(".property('").append("nm_sha1").append("', '").append(sha1Hex).append("')");
32 | }
33 | });
34 | return builder.toString();
35 | }
36 |
37 | public static String buildIncrementOtherPropertyString(String label, JSONObject jsonObject) {
38 | StringBuilder builder = new StringBuilder();
39 | jsonObject.forEach((key, value) -> {
40 |
41 | if ("name".equals(key)) {
42 | // only mobile numbers need the encrypted variants
43 | if ("MOBILE".equals(label)) {
44 | String encrypt = value.toString();
45 | String sha1Hex = DigestUtils.sha1Hex(value.toString());
46 |
47 | builder.append(".property('").append("nm_pass").append("', '").append(encrypt).append("')");
48 | builder.append(".property('").append("nm_sha1").append("', '").append(sha1Hex).append("')");
49 | }
50 |
51 | } else if ("status".equals(key)) {
52 | // do nothing
53 | } else {
54 | if (value instanceof String) {
55 | builder.append(".property('").append(key).append("', '").append(value).append("')");
56 | } else {
57 | builder.append(".property('").append(key).append("', ").append(value).append(")");
58 | }
59 | }
60 |
61 | });
62 | return builder.toString();
63 | }
64 |
65 | public static JSONObject getVertexProperty(String jsonString) {
66 | return JSON.parseObject(jsonString);
67 | }
68 | }
--------------------------------------------------------------------------------
/dataImport/src/main/scala/com/qihoo/finance/tap/ImportCommon.scala:
--------------------------------------------------------------------------------
1 | package com.qihoo.finance.tap
2 |
3 | import java.util
4 | import java.util.concurrent.TimeUnit
5 | import java.util.function.Consumer
6 |
7 | import com.alibaba.fastjson.JSONObject
8 | import org.apache.log4j.{LogManager, Logger}
9 | import org.apache.tinkerpop.gremlin.driver.{Client, Result}
10 |
11 | import scala.util.control.Breaks
12 |
13 |
14 | object ImportCommon {
15 | val logger: Logger = LogManager.getLogger("ImportCommon")
16 |
17 | type OptionMap = Map[Symbol, Any]
18 |
19 | def getJanusGraph(hosts: String, port: Int, poolSize: Int): JanusGraphProvider = {
20 | new JanusGraphProvider(hosts, port, poolSize)
21 | }
22 |
23 | def isEmpty(x: String) = Option(x).forall(_.isEmpty)
24 |
25 |
26 | def submitWithRetry(client: Client, runCql: String) = {
27 | val loop = new Breaks
28 | loop.breakable {
29 | for (a <- 1 to 100) {
30 | try {
31 | client.submit(runCql).stream().forEach(new Consumer[Result] {
32 | override def accept(t: Result): Unit =
33 | logger.info(t.getLong)
34 | })
35 | loop.break()
36 | } catch {
37 | case ex: Exception =>
38 | logger.warn(runCql)
39 | logger.warn(ex.getMessage, ex)
40 | TimeUnit.MILLISECONDS.sleep(1000 * a)
41 | }
42 |
43 | }
44 | }
45 | }
46 |
47 |
48 | def getResultWithRetry(client: Client, runCql: String): util.List[Result] = {
49 | var results: util.List[Result] = null
50 | val loop = new Breaks
51 | loop.breakable {
52 | for (a <- 1 to 100) {
53 | try {
54 | results = client.submit(runCql).all.get
55 | loop.break()
56 | } catch {
57 | case ex: Exception =>
58 | logger.warn(ex.getMessage, ex)
59 | TimeUnit.MILLISECONDS.sleep(1000 * a)
60 | }
61 | }
62 | }
63 |
64 | results
65 | }
66 |
67 |
68 | def handleVertexList(recordList: List[(String, String)], client: Client): Unit = {
69 | var runCql = "g = graph.traversal();g"
70 |
71 | recordList.foreach { case (label, attrString) =>
72 | runCql += ".addV('" + label + "')"
73 | runCql += Helper.buildVertexProperty(label, attrString);
74 | }
75 |
76 | if (recordList.nonEmpty) {
77 | runCql += ".count()"
78 |
79 | ImportCommon.submitWithRetry(client, runCql)
80 | }
81 | }
82 |
83 | // incremental vertex upsert
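  // Illustrative shape of the Gremlin this method builds for one record (names and values assumed):
  //   g = graph.traversal();g
  //     .V().has('name', '139...').hasLabel('MOBILE').as('m')
  //     .fold().coalesce(unfold(), addV('MOBILE').property('name', '139...'))
  //     .property('is_register', 'true')   // remaining props from buildIncrementOtherPropertyString
  //     .V().has('name', '139...').as('m')
  //     .where(or(select('m').values('status').is(lt(1)), select('m').hasNot('status')))
  //     .property('status', 1)
  //     .count()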
84 | def handleVertexIncrementList(recordList: List[(String, String)], client: Client): Unit = {
85 | var runCql = "g = graph.traversal();g"
86 |
87 | recordList.foreach { case (label, attrString) =>
88 | val attrJson = Helper.getVertexProperty(attrString)
89 | val name = attrJson.getString("name")
90 | runCql += ".V().has('name', '" + name + "').hasLabel('" + label + "').as('m').fold().coalesce(unfold(), addV('" + label + "').property('name', '" + name + "'))"
91 |
92 | runCql += Helper.buildIncrementOtherPropertyString(label, attrJson)
93 | // the status property is handled separately
94 | val status = attrJson.getIntValue("status")
95 | if (status > 0) {
96 | runCql += ".V().has('name', '" + name + "').as('m').where(or(select('m').values('status').is(lt(" + status + ")), select('m').hasNot('status'))).property('status', " + status + ")"
97 | }
98 | }
99 |
100 | if (recordList.nonEmpty) {
101 | runCql += ".count()"
102 | ImportCommon.submitWithRetry(client, runCql)
103 | }
104 | }
105 |
106 | def nextOption(map: OptionMap, list: List[String]): OptionMap = {
107 | def isSwitch(s: String) = s(0) == '-'
108 |
109 | list match {
110 | case Nil => map
111 | case "--janusgraph-hosts" :: value :: tail =>
112 | ImportCommon.nextOption(map ++ Map('janusgraphHosts -> value.toString), tail)
113 | case "--janusgraph-port" :: value :: tail =>
114 | ImportCommon.nextOption(map ++ Map('janusgraphPort -> value.toInt), tail)
115 | case "--batch-size" :: value :: tail =>
116 | ImportCommon.nextOption(map ++ Map('batchSize -> value.toInt), tail)
117 | case "--pool-size" :: value :: tail =>
118 | ImportCommon.nextOption(map ++ Map('poolSize -> value.toInt), tail)
119 | case "--storage-hostname" :: value :: tail =>
120 | ImportCommon.nextOption(map ++ Map('storageHostname -> value.toString), tail)
121 | case "--label" :: value :: tail =>
122 | ImportCommon.nextOption(map ++ Map('label -> value.toString), tail)
123 | case "--deviceType" :: value :: tail =>
124 | ImportCommon.nextOption(map ++ Map('deviceType -> value.toString), tail)
125 | case "--edgeType" :: value :: tail =>
126 | ImportCommon.nextOption(map ++ Map('edgeType -> value.toString), tail)
127 | case "--fromLabel" :: value :: tail =>
128 | ImportCommon.nextOption(map ++ Map('fromLabel -> value.toString), tail)
129 | case "--toLabel" :: value :: tail =>
130 | ImportCommon.nextOption(map ++ Map('toLabel -> value.toString), tail)
131 | case "--outputFile" :: value :: tail =>
132 | ImportCommon.nextOption(map ++ Map('outputFile -> value.toString), tail)
133 | case string :: opt2 :: tail if isSwitch(opt2) =>
134 | ImportCommon.nextOption(map ++ Map('importFile -> string.toString), list.tail)
135 | case string :: Nil => ImportCommon.nextOption(map ++ Map('importFile -> string.toString), list.tail)
136 | case option :: tail => println("Unknown option " + option)
137 | Map()
138 | }
139 | }
140 | }
141 |
--------------------------------------------------------------------------------
/dataImport/src/main/scala/com/qihoo/finance/tap/JanusGraphProvider.java:
--------------------------------------------------------------------------------
1 | package com.qihoo.finance.tap;
2 |
3 | import org.apache.commons.configuration.Configuration;
4 | import org.apache.commons.configuration.PropertiesConfiguration;
5 | import org.apache.log4j.LogManager;
6 | import org.apache.log4j.Logger;
7 | import org.apache.tinkerpop.gremlin.driver.Client;
8 | import org.apache.tinkerpop.gremlin.driver.Cluster;
9 |
10 | import java.util.Objects;
11 |
12 | /**
13 | * @author zhoupeng
14 | * @date 2019/1/31
15 | */
16 | public class JanusGraphProvider {
17 | private static final Logger logger = LogManager.getLogger(JanusGraphProvider.class);
18 | public Cluster cluster;
19 |
20 |
21 | public JanusGraphProvider(String hosts, int port, int poolSize) {
22 | Configuration clusterConfig = new PropertiesConfiguration();
23 | clusterConfig.setProperty("hosts", hosts);
24 | clusterConfig.setProperty("port", port);
25 | clusterConfig.setProperty("connectionPool.minSize", poolSize);
26 | clusterConfig.setProperty("connectionPool.maxSize", poolSize);
27 | clusterConfig.setProperty("connectionPool.maxInProcessPerConnection", poolSize);
28 | clusterConfig.setProperty("connectionPool.maxSimultaneousUsagePerConnection", poolSize);
29 | clusterConfig.setProperty("connectionPool.maxContentLength", 65536000);
30 | clusterConfig.setProperty("serializer.className", "org.apache.tinkerpop.gremlin.driver.ser.GryoMessageSerializerV3d0");
31 | // Awkward: this setting expects a list, so the value has to be comma-separated, which ends up registering the class twice
32 | clusterConfig.setProperty("serializer.config.ioRegistries",
33 | "org.janusgraph.graphdb.tinkerpop.JanusGraphIoRegistry,org.janusgraph.graphdb.tinkerpop.JanusGraphIoRegistry");
34 |
35 | cluster = Cluster.open(clusterConfig);
36 | }
37 |
38 | public Client getClient() {
39 | return this.cluster.connect();
40 | }
41 |
42 |
43 | public void close() throws Exception {
44 | try {
45 | if (cluster != null) {
46 | // the cluster closes all of its clients
47 | cluster.close();
48 | }
49 | } finally {
50 | cluster = null;
51 | }
52 | }
53 |
54 | public void submit(String cql) {
55 | Client client = this.getClient();
56 | try {
57 | client.submit(cql).stream();
58 | } finally {
59 | if (!Objects.isNull(client)) {
60 | client.close();
61 | }
62 | }
63 | }
64 | }
--------------------------------------------------------------------------------
/dataImport/src/main/scala/com/qihoo/finance/tap/ScalaHelper.scala:
--------------------------------------------------------------------------------
1 | package com.qihoo.finance.tap
2 |
3 | import java.lang
4 |
5 | import com.alibaba.fastjson.JSONObject
6 | import org.apache.spark.sql.DataFrame
7 |
8 | object ScalaHelper {
9 | def convertHeader(label: String, headerMap: Map[String, String], headerList: Array[String]): Map[String, String] = {
10 | var headResult = Map[String, String]()
11 | for (field <- headerList) {
12 | var result: String = null
13 |
14 | if ("name".equals(field)) {
15 | result = "%s:ID(%s)".format(field, label)
16 | } else {
17 | if (headerMap.contains(field)) {
18 | result = field + ":" + headerMap(field)
19 | } else {
20 | result = field
21 | }
22 | }
23 |
24 | headResult = headResult + (field -> result)
25 | }
26 |
27 | headResult
28 | }
29 |
30 |
31 | def saveAsCSV(outputFile: String, df: DataFrame) = {
32 | df.repartition(1)
33 | .write
34 | .mode("overwrite")
35 | .format("com.databricks.spark.csv")
36 | .option("header", "true")
37 | .option("treatEmptyValuesAsNulls", "false")
38 | .save(outputFile)
39 | }
40 |
41 |
42 | def parseVertexLineGetIdAndAttr(line: String) = {
43 | val labelLast = line.indexOf("[")
44 | val idLast = line.indexOf("]")
45 | val attrStart = line.indexOf("{")
46 | val attrStr = line.substring(attrStart, line.length)
47 |
48 | val jsonObject = Helper.getVertexProperty(attrStr)
49 | jsonObject
50 | }
51 |
52 | }
53 |
--------------------------------------------------------------------------------
/dataImport/src/main/scala/com/qihoo/finance/tap/data/convert/CallEdgeConvertToCsv.scala:
--------------------------------------------------------------------------------
1 | package com.qihoo.finance.tap.data.convert
2 |
3 | import com.qihoo.finance.tap.ImportCommon
4 | import org.apache.log4j.{LogManager, Logger}
5 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
6 | import org.apache.spark.sql.{RowFactory, SQLContext}
7 | import org.apache.spark.{SparkConf, SparkContext}
8 | import org.apache.tinkerpop.gremlin.driver.Client
9 |
10 | object CallEdgeConvertToCsv {
11 | val logger: Logger = LogManager.getLogger("CallEdgeConvertToCsv")
12 |
13 | val usage =
14 | """
15 | Usage: CallEdgeConvertToCsv [--janusgraph-hosts 10.94.90.121] [--janusgraph-port 8182] E:\360_doc\lolth\call_edge.csv
16 | """
17 |
18 | def main(args: Array[String]) {
19 | if (args.length == 0) {
20 | println(usage)
21 | System.exit(0)
22 | }
23 |
24 | val argList = args.toList
25 | val options = ImportCommon.nextOption(Map(), argList)
26 |
27 | val conf = new SparkConf().setAppName("CallEdgeConvertToCsv")
28 | // setMaster("local") for a local Spark; use the cluster address for a remote one.
29 | // When running as a packaged jar, remove setMaster("local"); the master is specified in the submit arguments.
30 | // conf.setMaster("local")
31 |
32 | val sc = new SparkContext(conf)
33 | val sqlContext = new SQLContext(sc)
34 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String])
35 | val outputFile = options.getOrElse('outputFile, "").asInstanceOf[String]
36 |
37 | val dataRdd = txtFile.map {
38 | line =>
39 | val fields = line.replace("\"", "").split(",")
40 | // "1870276152746","CALL","18602761525746"
41 | // "13512340050","CALL","15607804358",1
42 | // CALL edges carry an mgm property
43 |
44 | if (!"\\N".equals(fields(3))) {
45 | val mgmInt: java.lang.Integer = Integer.parseInt(fields(3))
46 | RowFactory.create(fields(0), fields(2), mgmInt)
47 | } else {
48 | RowFactory.create(fields(0), fields(2), null)
49 | }
50 | }
51 |
52 | val structType = new StructType()
53 | .add(StructField("start_name", StringType, nullable = true))
54 | .add(StructField("end_name", StringType, nullable = true))
55 | .add(StructField("mgm", IntegerType, nullable = true))
56 |
57 | val df = sqlContext.createDataFrame(dataRdd, structType)
58 |
59 | df.createOrReplaceTempView("csv_df")
60 | sqlContext.sql("create table migrate_call_tmp as select * from csv_df")
61 |
62 | // df.show()
63 | // ScalaHelper.saveAsCSV(outputFile, df)
64 |
65 | println("***********************stopped***********************")
66 | sc.stop()
67 | }
68 |
69 | private def handleEdgeList(cqlList: List[String], client: Client): Unit = {
70 | var runCql = "g = graph.traversal();g"
71 |
72 | cqlList.foreach(cql => runCql += cql)
73 | if (cqlList.nonEmpty) {
74 | runCql += ".count()"
75 | ImportCommon.submitWithRetry(client, runCql)
76 | }
77 | }
78 |
79 | }
80 |
--------------------------------------------------------------------------------
/dataImport/src/main/scala/com/qihoo/finance/tap/data/convert/DeviceConvertToCsv.scala:
--------------------------------------------------------------------------------
1 | package com.qihoo.finance.tap.data.convert
2 |
3 | import com.alibaba.fastjson.JSONObject
4 | import com.qihoo.finance.tap.{ImportCommon, ScalaHelper}
5 | import org.apache.log4j.{LogManager, Logger}
6 | import org.apache.spark.sql.types.{StringType, StructType}
7 | import org.apache.spark.sql.{RowFactory, SQLContext}
8 | import org.apache.spark.{SparkConf, SparkContext}
9 |
10 | object DeviceConvertToCsv {
11 |
12 | val logger: Logger = LogManager.getLogger("DeviceConvertToCsv")
13 |
14 | val usage =
15 | """
16 | Usage: DeviceConvertToCsv [--label] [--outputFile] E:\360_doc\lolth\mobile.csv
17 | """
18 |
19 | type OptionMap = Map[Symbol, Any]
20 |
21 |
22 | def main(args: Array[String]) {
23 | if (args.length == 0) {
24 | println(usage)
25 | System.exit(0)
26 | }
27 |
28 | val argList = args.toList
29 | val options = ImportCommon.nextOption(Map(), argList)
30 |
31 |
32 | val conf = new SparkConf().setAppName("DeviceConvertToCsv")
33 | // setMaster("local") for a local Spark; use the cluster address for a remote one.
34 | // When running as a packaged jar, remove setMaster("local"); the master is specified in the submit arguments.
35 | // conf.setMaster("local")
36 |
37 | val sc = new SparkContext(conf)
38 | val sqlContext = new SQLContext(sc)
39 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String])
40 | val outputFile = options.getOrElse('outputFile, "").asInstanceOf[String]
41 | val deviceType = options.getOrElse('deviceType, "").asInstanceOf[String]
42 | if (deviceType == null || deviceType.isEmpty) {
43 | println("--deviceType must not be empty: device|wifi")
44 | System.exit(0)
45 | }
46 |
47 | val headerList = Array("name", "is_exception", "is_white")
48 |
49 | // name:ID(human) age:Int
50 | val dataRdd = txtFile.map {
51 | line =>
52 | val jsonObject: JSONObject = ScalaHelper.parseVertexLineGetIdAndAttr(line)
53 | RowFactory.create(jsonObject.getString("name"),
54 | jsonObject.getString("is_exception"),
55 | jsonObject.getString("is_white")
56 | )
57 | }
58 | var structType = new StructType()
59 |
60 | for ((elem, i) <- headerList.view.zipWithIndex) {
61 | structType = structType.add(headerList(i), StringType, nullable = true)
62 | }
63 |
64 | val df = sqlContext.createDataFrame(dataRdd, structType)
65 |
66 | df.createOrReplaceTempView("device_csv_df")
67 |
68 | sqlContext.sql("DROP TABLE IF EXISTS migrate_" + deviceType + "_tmp")
69 | sqlContext.sql("create table migrate_" + deviceType + "_tmp as select * from device_csv_df")
70 |
71 | // df.show()
72 | // ScalaHelper.saveAsCSV(outputFile, df)
73 |
74 | println("***********************stopped***********************")
75 | sc.stop()
76 | }
77 |
78 |
79 | }
80 |
--------------------------------------------------------------------------------
/dataImport/src/main/scala/com/qihoo/finance/tap/data/convert/MergeNodesAndEdges.scala:
--------------------------------------------------------------------------------
1 | package com.qihoo.finance.tap.data.convert
2 |
3 | import com.qihoo.finance.tap.ImportCommon
4 | import org.apache.log4j.{LogManager, Logger}
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.{DataFrame, Row, SQLContext}
7 | import org.apache.spark.{SparkConf, SparkContext}
8 |
9 | object MergeNodesAndEdges {
10 |
11 | val logger: Logger = LogManager.getLogger("MergeNodesAndEdges")
12 |
13 | val usage =
14 | """
15 | Merge each vertex and its edges into a single line for bulk import
16 | Usage: MergeNodesAndEdges --outputFile
17 | """
18 |
19 | type OptionMap = Map[Symbol, Any]
20 |
21 |
22 | def main(args: Array[String]) {
23 | // if (args.length == 0) {
24 | // println(usage)
25 | // System.exit(0)
26 | // }
27 |
28 | val argList = args.toList
29 | val options = ImportCommon.nextOption(Map(), argList)
30 |
31 | val conf = new SparkConf().setAppName("MergeNodesAndEdges")
32 | // setMaster("local") for a local Spark; use the cluster address for a remote one.
33 | // When running as a packaged jar, remove setMaster("local"); the master is specified in the submit arguments.
34 | // conf.setMaster("local")
35 |
36 | val sc = new SparkContext(conf)
37 | val sqlContext = new SQLContext(sc)
38 | val outputFile = options.getOrElse('outputFile, "").asInstanceOf[String]
39 |
40 | val (mobile_df: DataFrame, device_df: DataFrame, wifi_df: DataFrame, call_df: DataFrame, has_df: DataFrame, use_df: DataFrame, use_wifi_df: DataFrame) =
41 | generateTestDataDF(sc, sqlContext)
42 |
43 | // val mobile_df = sqlContext.sql("select * from migrate_mobile_id_tmp")
44 | // val device_df = sqlContext.sql("select * from migrate_device_id_tmp")
45 | // val wifi_df = sqlContext.sql("select * from migrate_wifi_id_tmp")
46 | //
47 | // val call_df = sqlContext.sql("select * from migrate_call_id_tmp")
48 | // val has_df = sqlContext.sql("select * from migrate_has_id_tmp")
49 | // val use_df = sqlContext.sql("select * from migrate_use_id_tmp")
50 | // val use_wifi_df = sqlContext.sql("select * from migrate_use_wifi_id_tmp")
51 |
52 |
53 | val mobile_kv = mobile_df.rdd.keyBy(_ (0)).mapValues(fields => {
54 | // label, other props ...
55 | List("MOBILE",
56 | replaceNullEmpty(fields(1)),
57 | replaceNullEmpty(fields(2)),
58 | replaceNullEmpty(fields(3)),
59 | replaceNullEmpty(fields(4)),
60 | replaceNullEmpty(fields(5)),
61 | replaceNullEmpty(fields(6)),
62 | replaceNullEmpty(fields(7)),
63 | replaceNullEmpty(fields(8)),
64 | replaceNullEmpty(fields(9)),
65 | replaceNullEmpty(fields(10)),
66 | replaceNullEmpty(fields(11))
67 | ).mkString(",")
68 | })
69 |
70 | val device_kv = device_df.rdd.keyBy(_ (0)).mapValues(fields => {
71 | // label, other props ...
72 | ("DEVICE", fields(1), replaceNullEmpty(fields(2)), replaceNullEmpty(fields(3))).productIterator.mkString(",")
73 | })
74 |
75 | val wifi_kv = wifi_df.rdd.keyBy(_ (0)).mapValues(fields => {
76 | // label, other props ...
77 | ("WIFI", fields(1), replaceNullEmpty(fields(2)), replaceNullEmpty(fields(3))).productIterator.mkString(",")
78 | })
79 |
80 | val call_in_kv = call_df.rdd.keyBy(_ (1)).mapValues(fields => (fields(0), replaceNullEmpty(fields(2))))
81 | val call_out_kv = call_df.rdd.keyBy(_ (0)).mapValues(fields => (fields(1), replaceNullEmpty(fields(2))))
82 |
83 | val has_in_kv = has_df.rdd.keyBy(_ (1)).mapValues(fields => fields(0))
84 | val has_out_kv = has_df.rdd.keyBy(_ (0)).mapValues(fields => fields(1))
85 |
86 | val use_out_kv = use_df.rdd.keyBy(_ (0)).mapValues(fields => fields(1))
87 | val use_in_kv = use_df.rdd.keyBy(_ (1)).mapValues(fields => fields(0))
88 |
89 | val use_wifi_out_kv = use_wifi_df.rdd.keyBy(_ (0)).mapValues(fields => fields(1))
90 | val use_wifi_in_kv = use_wifi_df.rdd.keyBy(_ (1)).mapValues(fields => fields(0))
91 |
92 | val mobile_result_rdd = mobile_kv.cogroup(call_in_kv).map(v => {
93 | val callIn = v._2._2.toList.map(v => {
94 | "CALL," + v._1 + "," + v._2
95 | }).mkString("|")
96 | val edge = forceJoinBeforeAndNow(v._2._1.toList.head, callIn, "\t")
97 | (v._1, edge)
98 | }).cogroup(has_in_kv).map(v => {
99 | val hasIn = v._2._2.toList.map(v => {
100 | "HAS," + v
101 | }).mkString("|")
102 | val edge = joinBeforeAndNowWithCheck(v._2._1.toList.head, hasIn, "\t", "|")
103 | (v._1, edge)
104 | }).cogroup(call_out_kv).map(v => {
105 | val callOut = v._2._2.toList.map(v => {
106 | "CALL," + v._1 + "," + v._2
107 | }).mkString("|")
108 | val edge = forceJoinBeforeAndNow(v._2._1.toList.head, callOut, "\t")
109 | (v._1, edge)
110 | }).cogroup(use_out_kv).map(v => {
111 | val useOut = v._2._2.toList.map(v => {
112 | "USE," + v
113 | }).mkString("|")
114 | val edge = joinBeforeAndNowWithCheck(v._2._1.toList.head, useOut, "\t", "|")
115 | (v._1, edge)
116 | }).cogroup(use_wifi_out_kv).map(v => {
117 | val useOut = v._2._2.toList.map(v => {
118 | "USE_WIFI," + v
119 | }).mkString("|")
120 | val edge = joinBeforeAndNowWithCheck(v._2._1.toList.head, useOut, "\t", "|")
121 | (v._1, edge)
122 | }).map(v => v._1 + "," + v._2)
123 |
124 | val device_result_rdd = device_kv.cogroup(use_in_kv).map(v => {
125 | val useIn = v._2._2.toList.map(v => {
126 | "USE," + v
127 | }).mkString("|")
128 | val edge = forceJoinBeforeAndNow(v._2._1.toList.head, useIn, "\t")
129 | (v._1, edge)
130 | }).cogroup(has_out_kv).map(v => {
131 | val hasOut = v._2._2.toList.map(v => {
132 | "HAS," + v
133 | }).mkString("|")
134 | val edge = forceJoinBeforeAndNow(v._2._1.toList.head, hasOut, "\t")
135 | (v._1, edge)
136 | }).map(v => v._1 + "," + v._2)
137 |
138 | val wifi_result_rdd = wifi_kv.cogroup(use_wifi_in_kv).map(v => {
139 | val useIn = v._2._2.toList.map(v => {
140 | "USE_WIFI," + v
141 | }).mkString("|")
142 | val edge = forceJoinBeforeAndNow(v._2._1.toList.head, useIn, "\t")
143 | // WIFI vertices have no out-edges, so append a trailing \t
144 | (v._1, edge + "\t")
145 | }).map(v => v._1 + "," + v._2)
146 |
147 | val total_result = mobile_result_rdd ++ device_result_rdd ++ wifi_result_rdd
148 |
149 | total_result.saveAsTextFile(outputFile)
150 |
151 | // total_result.collect().foreach(println)
152 |
153 | println("***********************stopped***********************")
154 | sc.stop()
155 | }
156 |
157 | private def generateTestDataDF(sc: SparkContext, sqlContext: SQLContext) = {
158 | val mobile_rdd = sc.parallelize(Seq(
159 | Row(1L, "13908125867", "3|TVOiyN2mC/ihdQuMBaw+0A==", "12dd2479ed75af60968d012fa139ff1cffac3683", "true", "", "", "", "", 1, null, null),
160 | Row(2L, "13908125868", "3|TVOiyN2mC/ihdQuMBaw+0A==", "12dd2479ed75af60968d012fa139ff1cffac3683", "true", "", "", "", "", 1, null, 0),
161 | Row(3L, "13908125869", "3|TVOiyN2mC/ihdQuMBaw+0A==", "12dd2479ed75af60968d012fa139ff1cffac3683", "true", "", "", "", "", 3, null, 1)
162 | ))
163 | val device_rdd = sc.parallelize(Seq(
164 | Row(11L, "FP13682956455", null, "false"),
165 | Row(12L, "FP13682956456", "true", null),
166 | Row(13L, "FP13682956457", "true", "false")
167 | ))
168 | val wifi_rdd = sc.parallelize(Seq(
169 | Row(21L, "bssid13682956455", null, "false"),
170 | Row(22L, "bssid13682956456", "true", null),
171 | Row(23L, "bssid13682956457", "true", "false")
172 | ))
173 |
174 | val call_rdd = sc.parallelize(Seq(
175 | Row(1L, 2L, null),
176 | // Row(3L, 2L, 1),
177 | Row(2L, 1L, 1)
178 | // Row(2L, 3L, 1)
179 | ))
180 | val has_rdd = sc.parallelize(Seq(
181 | Row(11L, 1L),
182 | Row(11L, 2L),
183 | Row(11L, 3L),
184 | Row(12L, 1L),
185 | Row(12L, 2L)
186 | ))
187 | val use_rdd = sc.parallelize(Seq(
188 | // Row(1L, 11L),
189 | // Row(1L, 12L),
190 | // Row(2L, 13L),
191 | Row(2L, 11L)
192 | // Row(3L, 12L)
193 | ))
194 | val use_wifi_rdd = sc.parallelize(Seq(
195 | Row(1L, 21L),
196 | Row(1L, 22L),
197 | Row(2L, 23L),
198 | Row(2L, 21L),
199 | Row(3L, 22L)
200 | ))
201 |
202 | val mobile_schema = StructType(List(
203 | StructField("id", LongType, nullable = false),
204 | StructField("name", StringType, nullable = true),
205 | StructField("nm_pass", StringType, nullable = true),
206 | StructField("nm_sha1", StringType, nullable = true),
207 | StructField("is_register", StringType, nullable = true),
208 | StructField("is_risk", StringType, nullable = true),
209 | StructField("is_internal", StringType, nullable = true),
210 | StructField("is_service", StringType, nullable = true),
211 | StructField("merchant_name", StringType, nullable = true),
212 | StructField("status", IntegerType, nullable = true),
213 | StructField("suspect_risk", IntegerType, nullable = true),
214 | StructField("overdue_status", IntegerType, nullable = true)
215 | ))
216 |
217 | val call_schema = StructType(List(
218 | StructField("start_id", LongType, nullable = false),
219 | StructField("end_id", LongType, nullable = true),
220 | StructField("mgm", IntegerType, nullable = true)
221 | ))
222 | val edge_schema = StructType(List(
223 | StructField("start_id", LongType, nullable = false),
224 | StructField("end_id", LongType, nullable = true)
225 | ))
226 | val device_schema = StructType(List(
227 | StructField("id", LongType, nullable = false),
228 | StructField("name", StringType, nullable = true),
229 | StructField("is_exception", StringType, nullable = true),
230 | StructField("is_white", StringType, nullable = true)
231 | ))
232 |
233 | val mobile_df = sqlContext.createDataFrame(mobile_rdd, mobile_schema)
234 | val device_df = sqlContext.createDataFrame(device_rdd, device_schema)
235 | val wifi_df = sqlContext.createDataFrame(wifi_rdd, device_schema)
236 |
237 | val call_df = sqlContext.createDataFrame(call_rdd, call_schema)
238 | val has_df = sqlContext.createDataFrame(has_rdd, edge_schema)
239 | val use_df = sqlContext.createDataFrame(use_rdd, edge_schema)
240 | val use_wifi_df = sqlContext.createDataFrame(use_wifi_rdd, edge_schema)
241 | (mobile_df, device_df, wifi_df, call_df, has_df, use_df, use_wifi_df)
242 | }
243 |
244 | def joinBeforeAndNowWithCheck(before: String, now: String, beforSep: String, separate: String): String = {
245 | var edge: String = null
246 | if (now.isEmpty) {
247 | edge = before
248 | } else if (before.endsWith(beforSep)) {
249 | edge = List(before, now).mkString("")
250 | } else {
251 | edge = List(before, now).mkString(separate)
252 | }
253 | edge
254 | }
255 |
256 | def forceJoinBeforeAndNow(before: String, now: String, separate: String): String = {
257 | val edge = List(before, now).mkString(separate)
258 | edge
259 | }
260 |
261 | def replaceNullEmpty(field: Any): Any = {
262 | var value = field
263 | if (value == null) {
264 | value = ""
265 | }
266 | value
267 | }
268 |
269 |
270 | }
271 |
--------------------------------------------------------------------------------
/dataImport/src/main/scala/com/qihoo/finance/tap/data/convert/MobileConvertToCsv.scala:
--------------------------------------------------------------------------------
1 | package com.qihoo.finance.tap.data.convert
2 |
3 | import com.alibaba.fastjson.JSONObject
4 | import com.qihoo.finance.tap.{ImportCommon, ScalaHelper}
5 | import org.apache.commons.codec.digest.DigestUtils
6 | import org.apache.log4j.{LogManager, Logger}
7 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
8 | import org.apache.spark.sql.{RowFactory, SQLContext}
9 | import org.apache.spark.{SparkConf, SparkContext}
10 |
11 | object MobileConvertToCsv {
12 |
13 | val logger: Logger = LogManager.getLogger("MobileConvertToCsv")
14 |
15 | val usage =
16 | """
17 | Usage: MobileConvertToCsv [--outputFile] E:\360_doc\lolth\mobile.csv
18 | """
19 |
20 | type OptionMap = Map[Symbol, Any]
21 |
22 |
23 | def main(args: Array[String]) {
24 | if (args.length == 0) {
25 | println(usage)
26 | System.exit(0)
27 | }
28 |
29 | val argList = args.toList
30 | val options = ImportCommon.nextOption(Map(), argList)
31 |
32 | val conf = new SparkConf().setAppName("MobileConvertToCsv")
33 | // setMaster("local") for a local Spark; use the cluster address for a remote one.
34 | // When running as a packaged jar, remove setMaster("local"); the master is specified in the submit arguments.
35 | // conf.setMaster("local")
36 |
37 | val sc = new SparkContext(conf)
38 | val sqlContext = new SQLContext(sc)
39 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String])
40 | val outputFile = options.getOrElse('outputFile, "").asInstanceOf[String]
41 |
42 | val headerList = Array( "name", "nm_pass", "nm_sha1", "is_register", "is_risk", "is_internal", "is_service", "merchant_name", "status", "suspect_risk", "overdue_status")
43 |
44 | val dataRdd = txtFile.map {
45 | line =>
46 | val jsonObject: JSONObject = ScalaHelper.parseVertexLineGetIdAndAttr(line)
47 |
48 | val nameValue = jsonObject.getString("name")
49 | // encrypted name fields (nm_pass, nm_sha1)
50 | val encrypt = nameValue
51 | val sha1Hex = DigestUtils.sha1Hex(nameValue)
52 |
53 | RowFactory.create(nameValue, encrypt, sha1Hex,
54 | jsonObject.getString("is_register"),
55 | jsonObject.getString("is_risk"),
56 | jsonObject.getString("is_internal"),
57 | jsonObject.getString("is_service"),
58 | jsonObject.getString("merchant_name"),
59 | jsonObject.getInteger("status"),
60 | jsonObject.getInteger("suspect_risk"),
61 | jsonObject.getInteger("overdue_status")
62 | )
63 | }
64 | var structType = new StructType()
65 |
66 | for ((elem, i) <- headerList.view.zipWithIndex) {
67 | if (List("status", "suspect_risk", "overdue_status").contains(elem)) {
68 | structType = structType.add(headerList(i), IntegerType, nullable = true)
69 | } else {
70 | structType = structType.add(headerList(i), StringType, nullable = true)
71 | }
72 | }
73 |
74 | val df = sqlContext.createDataFrame(dataRdd, structType)
75 |
76 | df.createOrReplaceTempView("mobile_csv_df")
77 |
78 | sqlContext.sql("DROP TABLE IF EXISTS migrate_mobile_tmp")
79 | sqlContext.sql("create table migrate_mobile_tmp as select * from mobile_csv_df")
80 | // df.show()
81 | // ScalaHelper.saveAsCSV(outputFile, df)
82 |
83 | println("***********************stopped***********************")
84 | sc.stop()
85 | }
86 |
87 | }
88 |
--------------------------------------------------------------------------------
/dataImport/src/main/scala/com/qihoo/finance/tap/data/convert/OtherEdgeConvertToCsv.scala:
--------------------------------------------------------------------------------
1 | package com.qihoo.finance.tap.data.convert
2 |
3 | import com.qihoo.finance.tap.ImportCommon
4 | import org.apache.log4j.{LogManager, Logger}
5 | import org.apache.spark.sql.types.{StringType, StructField, StructType}
6 | import org.apache.spark.sql.{RowFactory, SQLContext}
7 | import org.apache.spark.{SparkConf, SparkContext}
8 | import org.apache.tinkerpop.gremlin.driver.Client
9 |
10 | object OtherEdgeConvertToCsv {
11 | val logger: Logger = LogManager.getLogger("OtherEdgeConvertToCsv")
12 |
13 | val usage =
14 | """
15 | Usage: OtherEdgeConvertToCsv [--janusgraph-hosts 10.94.90.121] [--janusgraph-port 8182] E:\360_doc\lolth\call_edge.csv
16 | """
17 |
18 | def main(args: Array[String]) {
19 | if (args.length == 0) {
20 | println(usage)
21 | System.exit(0)
22 | }
23 |
24 | val argList = args.toList
25 | val options = ImportCommon.nextOption(Map(), argList)
26 |
27 | val conf = new SparkConf().setAppName("OtherEdgeConvertToCsv")
28 | // setMaster("local") for a local Spark; use the cluster address for a remote one.
29 | // When running as a packaged jar, remove setMaster("local"); the master is specified in the submit arguments.
30 | // conf.setMaster("local")
31 |
32 | val sc = new SparkContext(conf)
33 | val sqlContext = new SQLContext(sc)
34 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String])
35 |
36 | val fromLabel = options.getOrElse('fromLabel, null).asInstanceOf[String]
37 | val toLabel = options.getOrElse('toLabel, null).asInstanceOf[String]
38 | val edgeType = options.getOrElse('edgeType, null).asInstanceOf[String]
39 |
40 | if (fromLabel == null || toLabel == null) {
41 | println("Required arguments: --fromLabel DEVICE|WIFI|MOBILE --toLabel DEVICE|WIFI|MOBILE")
42 | System.exit(0)
43 | }
44 |
45 | // :START_ID(god) :END_ID(titan)
46 | // jupiter saturn
47 |
48 | val dataRdd = txtFile.map {
49 | line =>
50 | val fields = line.replace("\"", "").split(",")
51 | // "1870276152746","CALL","18602761525746"
52 | // "13512340050","CALL","15607804358",1
53 | // CALL edges carry an mgm property
54 | RowFactory.create(fields(0), fields(2))
55 | }
56 |
57 | val structType = new StructType()
58 | .add(StructField("start_name", StringType, nullable = true))
59 | .add(StructField("end_name", StringType, nullable = true))
60 |
61 | val df = sqlContext.createDataFrame(dataRdd, structType)
62 | // df.show()
63 |
64 | df.createOrReplaceTempView("edge_csv_df")
65 |
66 | sqlContext.sql("DROP TABLE IF EXISTS migrate_" + edgeType + "_tmp")
67 | sqlContext.sql("create table migrate_" + edgeType + "_tmp as select * from edge_csv_df")
68 |
69 |
70 | println("***********************stopped***********************")
71 | sc.stop()
72 | }
73 |
74 | private def handleEdgeList(cqlList: List[String], client: Client): Unit = {
75 | var runCql = "g = graph.traversal();g"
76 |
77 | cqlList.foreach(cql => runCql += cql)
78 | if (cqlList.nonEmpty) {
79 | runCql += ".count()"
80 | ImportCommon.submitWithRetry(client, runCql)
81 | }
82 | }
83 |
84 | }
85 |
--------------------------------------------------------------------------------
/dataImport/src/main/scala/com/qihoo/finance/tap/direct/EdgeImport.scala:
--------------------------------------------------------------------------------
1 | package com.qihoo.finance.tap.direct
2 |
3 | import com.qihoo.finance.tap.ImportCommon
4 | import org.apache.log4j.{LogManager, Logger}
5 | import org.apache.spark.{SparkConf, SparkContext}
6 | import org.apache.tinkerpop.gremlin.driver.Client
7 |
8 | object EdgeImport {
9 | val logger: Logger = LogManager.getLogger("EdgeImport")
10 |
11 | val usage =
12 | """
13 | Usage: EdgeImport [--janusgraph-hosts 10.94.90.121] [--janusgraph-port 8182] E:\360_doc\lolth\call_edge.csv
14 | """
15 |
16 | def main(args: Array[String]) {
17 | if (args.length == 0) {
18 | println(usage)
19 | System.exit(0)
20 | }
21 |
22 | val argList = args.toList
23 | val options = ImportCommon.nextOption(Map(), argList)
24 |
25 | val conf = new SparkConf().setAppName("EdgeImport")
26 | // setMaster("local") for a local Spark; use the cluster address for a remote one.
27 | // When running as a packaged jar, remove setMaster("local"); the master is specified in the submit arguments.
28 | // conf.setMaster("local")
29 |
30 | val sc = new SparkContext(conf)
31 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String])
32 | val hosts = options.getOrElse('janusgraphHosts, "").asInstanceOf[String]
33 |     val port = options.getOrElse('janusgraphPort, 8182).asInstanceOf[Int]
34 | val batchSize = options.getOrElse('batchSize, 50).asInstanceOf[Int]
35 | val poolSize = options.getOrElse('poolSize, 16).asInstanceOf[Int]
36 |
37 | txtFile.map {
38 | line =>
39 | val fields = line.replace("\"", "").split(",")
40 | // "1870276152746","CALL","18602761525746"
41 | // "13512340050","CALL","15607804358",1
42 |           // CALL edges carry an extra mgm flag in the fourth column
43 | if (fields.length == 4 && "CALL".equals(fields(1)) && !"\\N".equals(fields(3))) {
44 | (fields(0), fields(1), fields(2), Some(fields(3)))
45 | } else {
46 | (fields(0), fields(1), fields(2), None)
47 | }
48 |
49 | }.foreachPartition(partitionOfRecords => {
50 | val provider = ImportCommon.getJanusGraph(hosts, port, poolSize)
51 | val client = provider.getClient
52 |
53 | var cqlList: List[String] = List()
54 | partitionOfRecords.foreach(record => {
55 | var edgeCql = ""
56 | if (record._2 == "CALL" && record._4.nonEmpty) {
57 | edgeCql = ".V().has('name','" + record._1 + "').as('a').V().has('name','" + record._3 + "').addE('" + record._2 + "').from('a').property('mgm'," + record._4.get + ")"
58 | } else {
59 | edgeCql = ".V().has('name','" + record._1 + "').as('a').V().has('name','" + record._3 + "').addE('" + record._2 + "').from('a')"
60 | }
61 | cqlList = edgeCql :: cqlList
62 | if (cqlList.size >= batchSize) {
63 | handleEdgeList(cqlList, client)
64 | cqlList = List()
65 | }
66 | })
67 |
68 | handleEdgeList(cqlList, client)
69 | client.close()
70 | provider.close()
71 | })
72 |     println("***********************stopped***********************")
73 | sc.stop()
74 | }
75 |
76 | private def handleEdgeList(cqlList: List[String], client: Client): Unit = {
77 |     var runCql = "g = graph.traversal();g" // concatenate the batched statements into a single traversal
78 |
79 | cqlList.foreach(cql => runCql += cql)
80 | if (cqlList.nonEmpty) {
81 | runCql += ".count()"
82 | ImportCommon.submitWithRetry(client, runCql)
83 | }
84 | }
85 |
86 | }
87 |
--------------------------------------------------------------------------------
/dataImport/src/main/scala/com/qihoo/finance/tap/direct/VertexImport.scala:
--------------------------------------------------------------------------------
1 | package com.qihoo.finance.tap.direct
2 |
3 | import com.qihoo.finance.tap.ImportCommon
4 | import org.apache.log4j.{LogManager, Logger}
5 | import org.apache.spark.{SparkConf, SparkContext}
6 |
7 | object VertexImport {
8 |
9 | val logger: Logger = LogManager.getLogger("VertexImport")
10 |
11 | val usage =
12 | """
13 | Usage: VertexImport [--janusgraph-hosts 10.94.90.121] [--janusgraph-port 8182] [--batch-size 20] E:\360_doc\lolth\mobile.csv
14 | """
15 |
16 | type OptionMap = Map[Symbol, Any]
17 |
18 | def main(args: Array[String]) {
19 | if (args.length == 0) {
20 | println(usage)
21 | System.exit(0)
22 | }
23 |
24 | val argList = args.toList
25 | val options = ImportCommon.nextOption(Map(), argList)
26 |
27 | val conf = new SparkConf().setAppName("VertexImport")
28 |     // Use setMaster("local") when running Spark locally; for a remote cluster pass the master URL instead.
29 |     // When running from a packaged jar, keep setMaster commented out because the master is supplied as a submit parameter.
30 | // conf.setMaster("local")
31 |
32 | val sc = new SparkContext(conf)
33 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String])
34 | val hosts = options.getOrElse('janusgraphHosts, "").asInstanceOf[String]
35 | val port = options.getOrElse('janusgraphPort, 8182).asInstanceOf[Int]
36 | val batchSize = options.getOrElse('batchSize, 50).asInstanceOf[Int]
37 | val poolSize = options.getOrElse('poolSize, 16).asInstanceOf[Int]
38 |
39 |
40 | txtFile.map {
41 | line =>
42 | val labelLast = line.indexOf("[")
43 | val attrStart = line.indexOf("{")
44 | val label = line.substring(0, labelLast)
45 | val attrStr = line.substring(attrStart, line.length)
46 | (label.toUpperCase(), attrStr)
47 | }.foreachPartition(partitionOfRecords => {
48 | val provider = ImportCommon.getJanusGraph(hosts, port, poolSize)
49 | val client = provider.getClient
50 |
51 | var recordList: List[(String, String)] = List()
52 | partitionOfRecords.foreach(record => {
53 | if (!ImportCommon.isEmpty(record._1)) {
54 | recordList = (record._1, record._2) :: recordList
55 | if (recordList.size >= batchSize) {
56 | ImportCommon.handleVertexList(recordList, client)
57 | recordList = List()
58 | }
59 | }
60 | })
61 |
62 | ImportCommon.handleVertexList(recordList, client)
63 | client.close()
64 | provider.close()
65 | })
66 |
67 |     println("***********************stopped***********************")
68 | sc.stop()
69 | }
70 |
71 |
72 | }
73 |
--------------------------------------------------------------------------------
/dataImport/src/main/scala/com/qihoo/finance/tap/increment/EdgeImportIncrement.scala:
--------------------------------------------------------------------------------
1 | package com.qihoo.finance.tap.increment
2 |
3 | import com.qihoo.finance.tap.ImportCommon
4 | import org.apache.log4j.{LogManager, Logger}
5 | import org.apache.spark.{SparkConf, SparkContext}
6 | import org.apache.tinkerpop.gremlin.driver.Client
7 |
8 | object EdgeImportIncrement {
9 | val logger: Logger = LogManager.getLogger("EdgeImportIncrement")
10 |
11 | val usage =
12 | """
13 |       Incremental import of edges
14 | Usage: EdgeImportIncrement [--janusgraph-hosts 10.94.90.121] [--janusgraph-port 8182] E:\360_doc\lolth\call_edge.csv
15 | """
16 |
17 | def main(args: Array[String]) {
18 | if (args.length == 0) {
19 | println(usage)
20 | System.exit(0)
21 | }
22 |
23 | val argList = args.toList
24 | val options = ImportCommon.nextOption(Map(), argList)
25 |
26 | val conf = new SparkConf().setAppName("EdgeImportIncrement")
27 |     // Use setMaster("local") when running Spark locally; for a remote cluster pass the master URL instead.
28 |     // When running from a packaged jar, keep setMaster commented out because the master is supplied as a submit parameter.
29 | // conf.setMaster("local")
30 |
31 | val sc = new SparkContext(conf)
32 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String])
33 | val hosts = options.getOrElse('janusgraphHosts, "").asInstanceOf[String]
34 | val port = options.getOrElse('janusgraphPort, 8182).asInstanceOf[Int]
35 | val batchSize = options.getOrElse('batchSize, 50).asInstanceOf[Int]
36 | val poolSize = options.getOrElse('poolSize, 16).asInstanceOf[Int]
37 |
38 | txtFile.map {
39 | line =>
40 | val fields = line.replace("\"", "").split(",")
41 | // "1870276152746","CALL","18602761525746"
42 | // "13512340050","CALL","15607804358",1
43 |           // CALL edges carry an extra mgm flag in the fourth column
44 | if (fields.length == 4 && "CALL".equals(fields(1)) && !"\\N".equals(fields(3))) {
45 | (fields(0), fields(1), fields(2), Some(fields(3)))
46 | } else {
47 | (fields(0), fields(1), fields(2), None)
48 | }
49 |
50 | }.foreachPartition(partitionOfRecords => {
51 | val provider = ImportCommon.getJanusGraph(hosts, port, poolSize)
52 | val client = provider.getClient
53 |
54 | var cqlList: List[String] = List()
55 | partitionOfRecords.foreach(record => {
56 | var edgeCql = ".V().has('name','" + record._1 + "').as('a').V().has('name','" + record._3 + "')" +
57 | ".coalesce(inE('" + record._2 + "').where(outV().as('a')), addE('" + record._2 + "').from('a'))"
58 |
59 | if (record._2 == "CALL" && record._4.nonEmpty) {
60 | edgeCql += ".property('mgm'," + record._4.get + ")"
61 | }
62 |
63 | cqlList = edgeCql :: cqlList
64 | if (cqlList.size >= batchSize) {
65 | handleEdgeList(cqlList, client)
66 | cqlList = List()
67 | }
68 | })
69 |
70 | handleEdgeList(cqlList, client)
71 | client.close()
72 | provider.close()
73 | })
74 |     println("***********************stopped***********************")
75 | sc.stop()
76 | }
77 |
78 | private def handleEdgeList(cqlList: List[String], client: Client): Unit = {
79 |     var runCql = "g = graph.traversal();g" // concatenate the batched statements into a single traversal
80 |
81 | cqlList.foreach(cql => runCql += cql)
82 | if (cqlList.nonEmpty) {
83 | runCql += ".count()"
84 | ImportCommon.submitWithRetry(client, runCql)
85 | }
86 | }
87 |
88 | }
89 |
--------------------------------------------------------------------------------
/dataImport/src/main/scala/com/qihoo/finance/tap/increment/VertexImportIncrement.scala:
--------------------------------------------------------------------------------
1 | package com.qihoo.finance.tap.increment
2 |
3 | import java.util
4 |
5 | import com.qihoo.finance.tap.{Helper, ImportCommon}
6 | import org.apache.log4j.{LogManager, Logger}
7 | import org.apache.spark.{SparkConf, SparkContext}
8 | import org.apache.tinkerpop.gremlin.driver.{Client, Result}
9 |
10 | object VertexImportIncrement {
11 |
12 | val logger: Logger = LogManager.getLogger("VertexImportIncrement")
13 |
14 | val usage =
15 | """
16 |       Incremental import of vertices, checking whether each vertex already exists by its properties
17 | Usage: VertexImportIncrement [--janusgraph-hosts 10.94.90.121] [--janusgraph-port 8182] [--batch-size 20] E:\360_doc\lolth\mobile.csv
18 | """
19 |
20 | type OptionMap = Map[Symbol, Any]
21 |
22 | def main(args: Array[String]) {
23 | if (args.length == 0) {
24 | println(usage)
25 | System.exit(0)
26 | }
27 |
28 | val argList = args.toList
29 | val options = ImportCommon.nextOption(Map(), argList)
30 |
31 | val conf = new SparkConf().setAppName("VertexImportIncrement")
32 |     // Use setMaster("local") when running Spark locally; for a remote cluster pass the master URL instead.
33 |     // When running from a packaged jar, keep setMaster commented out because the master is supplied as a submit parameter.
34 | // conf.setMaster("local")
35 |
36 | val sc = new SparkContext(conf)
37 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String])
38 | val hosts = options.getOrElse('janusgraphHosts, "").asInstanceOf[String]
39 | val port = options.getOrElse('janusgraphPort, 8182).asInstanceOf[Int]
40 | val batchSize = options.getOrElse('batchSize, 50).asInstanceOf[Int]
41 | val poolSize = options.getOrElse('poolSize, 16).asInstanceOf[Int]
42 |
43 |
44 | txtFile.map {
45 | line =>
46 | val labelLast = line.indexOf("[")
47 | val attrStart = line.indexOf("{")
48 | val label = line.substring(0, labelLast)
49 | val attrStr = line.substring(attrStart, line.length)
50 | (label.toUpperCase(), attrStr)
51 | }.foreachPartition(partitionOfRecords => {
52 | val provider = ImportCommon.getJanusGraph(hosts, port, poolSize)
53 | val client = provider.getClient
54 | var recordList: List[(String, String)] = List()
55 |
56 | partitionOfRecords.foreach(record => {
57 | if (!ImportCommon.isEmpty(record._1)) {
58 |
59 | recordList = (record._1, record._2) :: recordList
60 | if (recordList.size >= batchSize) {
61 | ImportCommon.handleVertexIncrementList(recordList, client)
62 | recordList = List()
63 | }
64 |
65 | }
66 | })
67 |
68 | ImportCommon.handleVertexIncrementList(recordList, client)
69 | client.close()
70 | provider.close()
71 | })
72 |
73 |     println("***********************stopped***********************")
74 | sc.stop()
75 | }
76 |
77 | def isVertexExist(record: (String, String), client: Client): Boolean = {
78 | val jsonObject = Helper.getVertexProperty(record._2)
79 | val name = jsonObject.getString("name")
80 |     val cql = "g.V().has('name','" + name + "').count()" // count vertices that already carry this name
81 |
82 | val results: util.List[Result] = ImportCommon.getResultWithRetry(client, cql)
83 | // val results: util.List[Result] = client.submit(cql).all.get
84 |
85 | if (results != null && results.size() > 0 && results.get(0).getInt > 0) {
86 | return true
87 | }
88 | false
89 | }
90 | }
91 |
--------------------------------------------------------------------------------
/janusgraph_yarn.md:
--------------------------------------------------------------------------------
1 | # Janusgraph Yarn Configuration
2 |
3 | This document explains how to integrate JanusGraph with YARN.
4 |
5 | ## Environment paths
6 | CDH installation directory: /opt/cloudera/parcels/CDH/
7 | CDH configuration directory: /etc/hadoop
8 |
9 | ## Downloads
10 | spark-2.2.1-bin-hadoop2.7
11 | janusgraph-0.3.2-hadoop2
12 |
13 |
14 | ## Resolving jar conflicts
15 | The guava-14.0.1.jar that spark-2.2.1-bin-hadoop2.7 depends on conflicts with
16 | the guava-18.0.jar that janusgraph depends on. Keep guava-18.0.jar:
17 |
18 | rm -f spark-2.2.1-bin-hadoop2.7/jars/guava-*.jar
19 |
20 | cp janusgraph/lib/guava-18.0.jar spark/jars/
21 |
22 | ## Modify bin/gremlin.sh
23 | ```bash
24 | export CLASSPATH="$CLASSPATH:/etc/hadoop/conf/*:/opt/cloudera/parcels/CDH/lib/hadoop-yarn/*:/home/q/spark/jars/*"
25 | ```
26 |
27 | ## File configuration
28 | gremlin_yarn.sh
29 | ```bash
30 | #!/bin/bash
31 | export HADOOP_CONF_DIR=/etc/hadoop/conf
32 |
33 | export CLASSPATH=$CLASSPATH:$HADOOP_CONF_DIR
34 | # Key setting: the dependent Spark and YARN jars are loaded from this directory, because the Spark jars shipped with janusgraph are incomplete
35 | export SPARK_HOME=/home/q/spark
36 |
37 | export PATH=$PATH:$SPARK_HOME/bin
38 | bin/gremlin.sh
39 | ```
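40 | 
41 | ## Example hadoop-graph properties for YARN
42 | The snippet below is only an illustrative sketch of what a SparkGraphComputer-on-YARN properties file can look like; the HDFS paths and resource sizes are placeholders, and the hadoop-vertex-script-yarn.properties shipped in this repository remains the reference.
43 | ```properties
44 | gremlin.graph=org.apache.tinkerpop.gremlin.hadoop.structure.HadoopGraph
45 | gremlin.hadoop.graphReader=org.apache.tinkerpop.gremlin.hadoop.structure.io.script.ScriptInputFormat
46 | gremlin.hadoop.inputLocation=/user/janusgraph/import/vertex.txt
47 | gremlin.hadoop.scriptInputFormat.script=/user/janusgraph/import/script_mobile.groovy
48 | gremlin.hadoop.outputLocation=output
49 | 
50 | # run the OLAP job on YARN instead of a standalone or local master
51 | spark.master=yarn
52 | spark.submit.deployMode=client
53 | spark.executor.memory=8g
54 | spark.executor.cores=1
55 | 
56 | # Kryo serialization settings (assumed; align them with the properties files in this repository)
57 | spark.serializer=org.apache.spark.serializer.KryoSerializer
58 | spark.kryo.registrator=org.janusgraph.hadoop.serialize.JanusGraphKryoRegistrator
59 | ```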
--------------------------------------------------------------------------------
/optimize.md:
--------------------------------------------------------------------------------
1 | # JanusGraph query optimization
2 | 
3 | JanusGraph queries leave quite a bit of room for tuning; this document collects the points that mattered most for us.
4 | 
5 | ## The _multiPreFetch property optimization
6 | In my view this is the single most important optimization. It only became available in version 0.4.0; without it, JanusGraph is close to unusable in a production environment with large data volumes.
7 | 
8 | ```bash
9 | g.V().has('name', P.within('186xxxx6666')).both('CALL').or(has('is_register','true'), has('is_risk','true')).as('m2').profile()
10 | ```
11 | For a statement like the one above, JanusGraph without this optimization first finds the opposite vertices and then fetches each vertex's properties one by one before applying the filter.
12 | When the traversal reaches many vertices in production this is essentially unusable:
13 | the latency becomes extremely long.
14 | When the optimization is triggered, the properties of all those vertices are fetched in a single batch and then filtered.
15 | ```bash
16 | gremlin> g.V(6554048).outE('aggregation').otherV().has('name', neq('bob')).count().profile()
17 | ==>Traversal Metrics
18 | Step Count Traversers Time (ms) % Dur
19 | =============================================================================================================
20 | GraphStep(vertex,[6554048]) 1 1 35.538 0.15
21 | JanusGraphVertexStep(OUT,[aggregation],vertex) 30159 30159 2220.394 9.28
22 | \_condition=(PROPERTY AND visibility:normal)
23 | \_orders=[]
24 | \_isFitted=true
25 | \_isOrdered=true
26 | \_query=org.janusgraph.diskstorage.keycolumnvalue.SliceQuery@8019d62e
27 | \_multi=true
28 | \_vertices=20000
29 | \_multiPreFetch=true
30 | optimization 82.480
31 | backend-query 30159 275.560
32 | \_query=org.janusgraph.diskstorage.keycolumnvalue.SliceQuery@81bebe6b
33 | optimization 0.712
34 | backend-query 257398 1491.029
35 | \_query=org.janusgraph.diskstorage.keycolumnvalue.SliceQuery@8019d62e
36 | HasStep([name.neq(bob)]) 28054 28054 21612.923 90.33
37 | CountGlobalStep 1 1 56.938 0.24
38 | >TOTAL - - 23925.795 -
39 | ```
40 | `_multiPreFetch=true` in the profile output shows that the optimization was triggered.
41 | The conditions for triggering it are fairly strict:
42 | first, `query.batch-property-prefetch=true` must be set in the configuration file;
43 | second, the property filtering has to be expressed with `has`;
44 | third, the number of vertices the step returns must not exceed the configured `cache.tx-cache-size` (default 20000),
45 | meaning the optimization is skipped as soon as more vertices than that are retrieved.
46 | More details can be found in this [issue](https://github.com/JanusGraph/janusgraph/issues/984); a configuration sketch follows.
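47 | 
48 | As a configuration sketch, the two settings involved look as follows; the cache size below is only an illustrative value and should be tuned to the fan-out of your traversals:
49 | ```properties
50 | # enable batched property prefetch for has() filters that follow a vertex step
51 | query.batch-property-prefetch=true
52 | # transaction-level vertex cache; the prefetch is skipped once a step returns more vertices than this (default 20000)
53 | cache.tx-cache-size=50000
54 | ```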
47 |
48 | ## Optimizing the returned results
49 | ```bash
50 | g.V().has("MOBILE", "name", P.within('186xxxx6666')).as("m1").both("CALL").as('m2') \
51 | .select("m1", "m2") \
52 | .by(valueMap("name")) \
53 | .by(valueMap("name", "is_risk", "status", "is_service", "overdue_status"))
54 | ```
55 | The query above returns results like the following:
56 | ```bash
57 | {m1={name=[18658606666]}, m2={name=[13064767986]}}
58 | {m1={name=[18658606666]}, m2={name=[13291676581]}}
59 | {m1={name=[18658606666]}, m2={name=[13566665915]}}
60 | {m1={name=[18658606666]}, m2={name=[15072770149]}}
61 | {m1={name=[18658606666]}, m2={name=[15268898802]}}
62 | {m1={name=[18658606666]}, m2={name=[18657617779], status=[3]}}
63 | ```
64 | A query written like this looks harmless, and it is how we wrote it at first; the syntax is certainly concise.
65 | It has two problems, though: our production data volume is large, the query is issued very frequently, and the application memory filled up quickly.
66 | The first problem is that `name` comes back as a list: in `Java` an `ArrayList` has a default capacity of 10, so even a property with a single value
67 | reserves room for 10 entries.
68 | The second problem is the number of maps returned: m1 and m2 each hold only one key, yet a `Java` `HashMap` starts with a default capacity of 16,
69 | which again wastes a large amount of memory.
70 | ```java
71 | public class ArrayList<E> extends AbstractList<E>
72 |         implements List<E>, RandomAccess, Cloneable, java.io.Serializable
73 | {
74 | private static final long serialVersionUID = 8683452581122892189L;
75 |
76 | /**
77 | * Default initial capacity.
78 | */
79 | private static final int DEFAULT_CAPACITY = 10;
80 | ...
81 | }
82 |
83 |
84 | public class HashMap<K,V> extends AbstractMap<K,V>
85 |     implements Map<K,V>, Cloneable, Serializable {
86 |
87 | private static final long serialVersionUID = 362498820763181265L;
88 |
89 | /**
90 | * The default initial capacity - MUST be a power of two.
91 | */
92 | static final int DEFAULT_INITIAL_CAPACITY = 1 << 4; // aka 16
93 | ...
94 | }
95 | ```
96 | A better way to write the query is the following:
97 | ```bash
98 | g.V().has("MOBILE", "name", P.within('186xxxx6666')).as("m1").both("CALL").as('m2') \
99 | .select("m1", "m2") \
100 | .project("cName", "mobile", "isRisk", "isService") \
101 | .by(select("m1").by(coalesce(values("name"), constant("null"))) ) \
102 | .by(select("m2").by(coalesce(values("name"), constant("null"))) ) \
103 | .by(select("m2").by(coalesce(values("is_risk"), constant("null"))) ) \
104 | .by(select("m2").by(coalesce(values("is_service"), constant("null"))) )
105 | # the result is a single flat map and none of the values are lists
106 | {cName=186xxxx6666, mobile=186xxxx6666, isRisk=true, isService=0}
107 | ```
108 | If you do need `valueMap` and do not want list-valued results, the following syntax can be used:
109 | ```bash
110 | valueMap().by(unfold())
111 | ```
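112 | For example (a sketch that reuses the property keys from this document), the earlier lookup can return scalar values instead of single-element lists:
113 | ```bash
114 | g.V().has("MOBILE", "name", P.within('186xxxx6666')).valueMap("name", "is_risk").by(unfold())
115 | ```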
112 | ## Duplicate checks when inserting vertices and edges
113 | ```bash
114 | g.V().has("name", nodeName).fold().coalesce(unfold(), addV("MOBILE").property("name", nodeName)).next();
115 | ```
116 | The statement above works like a merge: the vertex is added only if it does not already exist.
117 |
118 | ```bash
119 | g.V(fromNode).as("a").V(toNode).coalesce(inE(relationLabel).where(outV().as("a")), addE(relationLabel).from("a"))
120 | ```
121 | The statement above checks whether an edge of that label already exists between the two vertices and adds it only when it is missing; a driver-side sketch of both patterns follows.
122 |
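123 | As a minimal Scala sketch, the snippet below submits the two merge statements through the TinkerPop driver, in the same spirit as the increment jobs in this repository. It assumes a gremlin-server that binds the traversal source as g; the endpoint and the sample numbers are placeholders, not project settings.
124 | ```scala
125 | import org.apache.tinkerpop.gremlin.driver.{Client, Cluster}
126 | 
127 | object MergeSketch {
128 |   def main(args: Array[String]): Unit = {
129 |     // Placeholder endpoint; point this at your own gremlin-server
130 |     val cluster = Cluster.build().addContactPoint("127.0.0.1").port(8182).create()
131 |     val client: Client = cluster.connect()
132 |     try {
133 |       // Add the vertex only if no vertex with this name exists yet
134 |       val vertexCql =
135 |         "g.V().has('name','13512340050').fold()" +
136 |           ".coalesce(unfold(), addV('MOBILE').property('name','13512340050')).next()"
137 |       client.submit(vertexCql).all().get()
138 | 
139 |       // Add the CALL edge only if the two vertices are not already connected by one
140 |       val edgeCql =
141 |         "g.V().has('name','13512340050').as('a').V().has('name','15607804358')" +
142 |           ".coalesce(inE('CALL').where(outV().as('a')), addE('CALL').from('a'))"
143 |       client.submit(edgeCql).all().get()
144 |     } finally {
145 |       client.close()
146 |       cluster.close()
147 |     }
148 |   }
149 | }
150 | ```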
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <modelVersion>4.0.0</modelVersion>
6 | 
7 |     <groupId>com.qihoo.finance</groupId>
8 |     <artifactId>janusgraph</artifactId>
9 |     <packaging>pom</packaging>
10 |     <version>1.0.0-SNAPSHOT</version>
11 | 
12 |     <modules>
13 |         <module>dataImport</module>
14 |     </modules>
15 | 
16 |     <properties>
17 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
18 |         <java.version>1.8</java.version>
19 |         <skip_maven_deploy>false</skip_maven_deploy>
20 | 
21 |         <!-- other dependency versions: 1.9, 1.8.3 -->
22 |         <spark.version>2.2.1</spark.version>
23 |         <scala.version>2.11</scala.version>
24 |         <!-- other dependency versions: 2.6.5, 1.2.6 -->
25 |     </properties>
26 | 
27 |     <build>
28 |         <plugins>
29 |             <plugin>
30 |                 <groupId>org.apache.maven.plugins</groupId>
31 |                 <artifactId>maven-compiler-plugin</artifactId>
32 |                 <version>3.3</version>
33 |                 <configuration>
34 |                     <source>${java.version}</source>
35 |                     <target>${java.version}</target>
36 |                     <!-- true -->
37 |                 </configuration>
38 |             </plugin>
39 | 
40 |             <plugin>
41 |                 <groupId>org.apache.maven.plugins</groupId>
42 |                 <artifactId>maven-deploy-plugin</artifactId>
43 |                 <configuration>
44 |                     <skip>${skip_maven_deploy}</skip>
45 |                 </configuration>
46 |             </plugin>
47 |         </plugins>
48 |     </build>
49 | </project>
--------------------------------------------------------------------------------