├── .gitignore
├── README.md
├── dataImport
│   ├── pom.xml
│   └── src
│       └── main
│           ├── resources
│           │   ├── clean.groovy
│           │   ├── clean.sh
│           │   ├── conf
│           │   │   ├── gremlin-server
│           │   │   │   └── janusgraph-hbase-es-server-new.properties
│           │   │   └── hadoop-graph
│           │   │       ├── hadoop-vertex-script-yarn.properties
│           │   │       └── hadoop-vertex-script.properties
│           │   ├── convert.sh
│           │   ├── data
│           │   │   └── lolth-schema.groovy
│           │   ├── data_export.txt
│           │   ├── gremlin_run.groovy
│           │   ├── gremlin_run.sh
│           │   ├── hive
│           │   │   └── migrate_csv_to_hive_table.sql
│           │   ├── scripts
│           │   │   └── script_mobile.groovy
│           │   └── test.data
│           │       ├── call_edge.csv
│           │       ├── device.csv
│           │       ├── has.csv
│           │       └── mobile.csv
│           └── scala
│               └── com
│                   └── qihoo
│                       └── finance
│                           └── tap
│                               ├── Helper.java
│                               ├── ImportCommon.scala
│                               ├── JanusGraphProvider.java
│                               ├── ScalaHelper.scala
│                               ├── data
│                               │   └── convert
│                               │       ├── CallEdgeConvertToCsv.scala
│                               │       ├── DeviceConvertToCsv.scala
│                               │       ├── MergeNodesAndEdges.scala
│                               │       ├── MobileConvertToCsv.scala
│                               │       └── OtherEdgeConvertToCsv.scala
│                               ├── direct
│                               │   ├── EdgeImport.scala
│                               │   └── VertexImport.scala
│                               └── increment
│                                   ├── EdgeImportIncrement.scala
│                                   └── VertexImportIncrement.scala
├── janusgraph_yarn.md
├── optimize.md
└── pom.xml

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# maven ignore
target/
*.jar
*.war
*.zip
*.tar

# eclipse ignore
.settings/
.project
.classpath

# idea ignore
.idea/*
py_tag_tool/.idea/*
*.ipr
*.iml
*.iws

# temp ignore
logs/
*.doc
*.log
*.cache
*.diff
*.patch
*.tmp
*.versionsBackup

# system ignore
.DS_Store
Thumbs.db

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Janusgraph-data-importer

## Overview
This project contains the code I wrote for a concrete data-migration project; adapt it to your own environment before using it.
Contact: zhoupengblack@qq.com

## Project layout

### resources
* conf holds the configuration files used for the import, with comments on the relevant settings
* data holds the schema used to create the graph
* test.data shows the vertex and edge formats exported from our AgensGraph database; all of the later code parses and operates on this format
* hive contains the Hive script that assigns a unique id to the converted data
* scripts contains the parse script JanusGraph uses to interpret each line during bulk import
* the sh and groovy scripts under resources are small helpers that make the import easier to run

### Code
* data.convert converts the exported data into Hive tables, after which the Hive script above assigns the unique ids.
  The core is MergeNodesAndEdges, which uses Spark's cogroup to produce the import format JanusGraph accepts.
  This step takes a long time and uses a lot of memory; set spark.network.timeout=600 or higher.

* direct connects to JanusGraph Server and inserts the data directly. Usable when the data volume is small.

* increment imports incremental data. Once the historical data has been loaded, incremental loads must check whether each vertex and edge already exists.

## Common problems
java.lang.OutOfMemoryError: unable to create new native thread
* the machine's ulimit -u is too low; 102400 is a reasonable value

Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded
* Spark is running out of memory. Either increase spark.executor.memory or decrease spark.executor.cores,
  keeping spark.executor.memory / spark.executor.cores at roughly 6-7 GB.

KryoSerializer Failed to find one of the right cookies
* the Spark KryoSerializer settings are wrong; see the hadoop-vertex-script.properties configuration file

* Make sure the data behind the unique index really is unique and that ids are unique, otherwise the import will run into problems.

## Notes
* `provided` in the pom means the dependency is not bundled into the jar,
  because the cluster already ships these jars. When debugging locally, comment that line out,
  and also uncomment // conf.setMaster("local") in the code.
```
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive_${scala.version}</artifactId>
    <version>${spark.version}</version>
    <scope>provided</scope>
</dependency>
```

--------------------------------------------------------------------------------
/dataImport/pom.xml:
-------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | com.qihoo.finance 7 | janusgraph 8 | 1.0.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | dataImport 13 | 14 | 15 | 16 | org.janusgraph 17 | janusgraph-core 18 | 0.3.1 19 | 20 | 21 | org.janusgraph 22 | janusgraph-hbase 23 | 0.3.1 24 | 25 | 26 | org.apache.hbase 27 | hbase-shaded-client 28 | 1.2.6 29 | 30 | 31 | org.apache.hbase 32 | hbase-shaded-server 33 | 1.2.6 34 | 35 | 36 | 37 | org.janusgraph 38 | janusgraph-es 39 | 0.3.1 40 | 41 | 42 | org.apache.tinkerpop 43 | gremlin-driver 44 | 3.3.3 45 | 46 | 47 | com.google.guava 48 | guava 49 | 16.0 50 | 51 | 52 | 53 | com.alibaba 54 | fastjson 55 | 1.2.58 56 | 57 | 58 | commons-codec 59 | commons-codec 60 | ${commons.codec.version} 61 | 62 | 63 | 64 | org.apache.spark 65 | spark-core_${scala.version} 66 | ${spark.version} 67 | provided 68 | 69 | 70 | org.apache.spark 71 | spark-streaming_${scala.version} 72 | ${spark.version} 73 | provided 74 | 75 | 76 | org.apache.spark 77 | spark-sql_${scala.version} 78 | ${spark.version} 79 | provided 80 | 81 | 82 | org.apache.spark 83 | spark-hive_${scala.version} 84 | ${spark.version} 85 | provided 86 | 87 | 88 | 89 | 90 | org.apache.hadoop 91 | hadoop-client 92 | ${hadoop.version} 93 | provided 94 | 95 | 96 | 97 | org.apache.hadoop 98 | hadoop-hdfs 99 | ${hadoop.version} 100 | provided 101 | 102 | 103 | org.apache.hadoop 104 | hadoop-common 105 | ${hadoop.version} 106 | provided 107 | 108 | 109 | org.apache.hadoop 110 | hadoop-streaming 111 | ${hadoop.version} 112 | provided 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | org.scala-tools 122 | maven-scala-plugin 123 | 2.15.2 124 | 125 | 126 | compile 127 | 128 | compile 129 | 130 | compile 131 | 132 | 133 | test-compile 134 | 135 | testCompile 136 | 137 | test-compile 138 | 139 | 140 | process-resources 141 | 142 | compile 143 | 144 | 145 | 146 | 147 | 148 | org.apache.maven.plugins 149 | maven-source-plugin 150 | 3.0.1 151 | 152 | 153 | package 154 | 155 | jar-no-fork 156 | 157 | 158 | 159 | 160 | 161 | 162 | org.apache.maven.plugins 163 | maven-compiler-plugin 164 | 3.6.0 165 | 166 | ${java.version} 167 | ${java.version} 168 | true 169 | 170 | 171 | 172 | 173 | 174 | org.apache.maven.plugins 175 | maven-jar-plugin 176 | 2.4 177 | 178 | 179 | 180 | com.qihoo.finance.tap.Main 181 | 182 | 183 | 184 | 185 | 186 | 187 | org.apache.maven.plugins 188 | maven-shade-plugin 189 | 2.3 190 | 191 | false 192 | ${project.build.directory}/${project.artifactId}-libs.jar 193 | 194 | 195 | *:* 196 | 197 | META-INF/*.SF 198 | META-INF/*.DSA 199 | META-INF/*.RSA 200 | 201 | 202 | 203 | 204 | 205 | 206 | package 207 | 208 | shade 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | -------------------------------------------------------------------------------- /dataImport/src/main/resources/clean.groovy: -------------------------------------------------------------------------------- 1 | graph = JanusGraphFactory.open('conf/gremlin-server/janusgraph-hbase-es-server-new.properties') 2 | graph.close(); org.janusgraph.core.util.JanusGraphCleanup.clear(graph) 3 | 4 | :load data/lolth-schema-pass.groovy 5 | graph = JanusGraphFactory.open('conf/gremlin-server/janusgraph-hbase-es-server-new.properties') 6 | 7 | defineLolthSchema(graph) 8 | graph.close() -------------------------------------------------------------------------------- /dataImport/src/main/resources/clean.sh: -------------------------------------------------------------------------------- 1 | bin/gremlin.sh -e ./clean.groovy 
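A quick way to sanity-check what clean.groovy left behind before starting a bulk load is to reopen the graph and list the schema elements that defineLolthSchema created. The following is a minimal sketch, assuming it is pasted into a bin/gremlin.sh session started from the JanusGraph install directory (the same assumption clean.sh makes):

```
// verification sketch (assumption: run inside bin/gremlin.sh, where JanusGraphFactory is already imported)
graph = JanusGraphFactory.open('conf/gremlin-server/janusgraph-hbase-es-server-new.properties')
mgmt = graph.openManagement()
// vertex labels declared by defineLolthSchema: DEVICE, MOBILE, WIFI
mgmt.getVertexLabels().each { println it }
// the unique composite index on name and the bulk-loader id index
println mgmt.getGraphIndex('name')
println mgmt.getGraphIndex('byBulkLoaderVertexId')
mgmt.rollback()
graph.close()
```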
-------------------------------------------------------------------------------- /dataImport/src/main/resources/conf/gremlin-server/janusgraph-hbase-es-server-new.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/360digitech/janusgraph-data-importer/e4a09b32984961884a5994e4cdff80b211318e43/dataImport/src/main/resources/conf/gremlin-server/janusgraph-hbase-es-server-new.properties -------------------------------------------------------------------------------- /dataImport/src/main/resources/conf/hadoop-graph/hadoop-vertex-script-yarn.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/360digitech/janusgraph-data-importer/e4a09b32984961884a5994e4cdff80b211318e43/dataImport/src/main/resources/conf/hadoop-graph/hadoop-vertex-script-yarn.properties -------------------------------------------------------------------------------- /dataImport/src/main/resources/conf/hadoop-graph/hadoop-vertex-script.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/360digitech/janusgraph-data-importer/e4a09b32984961884a5994e4cdff80b211318e43/dataImport/src/main/resources/conf/hadoop-graph/hadoop-vertex-script.properties -------------------------------------------------------------------------------- /dataImport/src/main/resources/convert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | spark-submit --class com.qihoo.finance.tap.data.convert.MobileConvertToCsv --master yarn --conf spark.network.timeout=300 --deploy-mode client --queue root.graph ./dataImport-libs.jar hdfs://360jinrongbdp/user/finloan/janusgraph_new/mobile_split/part-* 3 | 4 | 5 | spark-submit --class com.qihoo.finance.tap.data.convert.MergeNodesAndEdges --master yarn --conf spark.network.timeout=600 --deploy-mode client --queue root.graph ./dataImport-libs.jar --outputFile hdfs://360jinrongbdp/user/finloan/janusgraph_new/merge_all_relation.txt 6 | -------------------------------------------------------------------------------- /dataImport/src/main/resources/data/lolth-schema.groovy: -------------------------------------------------------------------------------- 1 | /* lolth-schema.groovy 2 | * 3 | * Helper functions for declaring JanusGraph schema elements 4 | * (vertex labels, edge labels, property keys) to accommodate 5 | * TP3 sample data. 
 *
 * Sample usage in a gremlin.sh session:
 * bin/gremlin.sh
 * :load data/lolth-schema.groovy
 * t = JanusGraphFactory.open('conf/gremlin-server/janusgraph-hbase-es-server.properties')
 * defineLolthSchema(t)
 * t.close()
 * gremlin>
 */

def defineLolthSchema(janusGraph) {
    mgmt = janusGraph.openManagement()
    name = mgmt.makePropertyKey("name").dataType(String.class).make()
    is_register = mgmt.makePropertyKey("is_register").dataType(String.class).make()
    is_risk = mgmt.makePropertyKey("is_risk").dataType(String.class).make()
    is_internal = mgmt.makePropertyKey("is_internal").dataType(String.class).make()
    is_service = mgmt.makePropertyKey("is_service").dataType(String.class).make()
    merchant_name = mgmt.makePropertyKey("merchant_name").dataType(String.class).make()
    is_exception = mgmt.makePropertyKey("is_exception").dataType(String.class).make()
    is_white = mgmt.makePropertyKey("is_white").dataType(String.class).make()

    // encrypted variants of the name property
    // nm_pass = mgmt.makePropertyKey("nm_pass").dataType(String.class).make()
    // nm_sha1 = mgmt.makePropertyKey("nm_sha1").dataType(String.class).make()

    status = mgmt.makePropertyKey("status").dataType(Integer.class).make()
    suspect_risk = mgmt.makePropertyKey("suspect_risk").dataType(Integer.class).make()
    overdue_status = mgmt.makePropertyKey("overdue_status").dataType(Integer.class).make()
    mgm = mgmt.makePropertyKey("mgm").dataType(Integer.class).make()

    blid = mgmt.makePropertyKey("bulkLoader.vertex.id").dataType(Long.class).make()
    mgmt.buildIndex("byBulkLoaderVertexId", Vertex.class).addKey(blid).buildCompositeIndex()

    // Note: JanusGraph label names are case-sensitive, whereas AgensGraph does not distinguish case,
    // so upper case is used throughout
    mgmt.makeVertexLabel("DEVICE").make()
    mgmt.makeVertexLabel("MOBILE").make()
    mgmt.makeVertexLabel("WIFI").make()

    mgmt.makeEdgeLabel("CALL").multiplicity(Multiplicity.SIMPLE).make()
    mgmt.makeEdgeLabel("HAS").multiplicity(Multiplicity.SIMPLE).make()
    mgmt.makeEdgeLabel("USE").multiplicity(Multiplicity.SIMPLE).make()
    mgmt.makeEdgeLabel("USE_WIFI").multiplicity(Multiplicity.SIMPLE).make()

    mgmt.buildIndex("name", Vertex.class).addKey(name).unique().buildCompositeIndex()
    // mgmt.buildIndex("nm_sha1", Vertex.class).addKey(nm_sha1).unique().buildCompositeIndex()
    mgmt.commit()
}

--------------------------------------------------------------------------------
/dataImport/src/main/resources/data_export.txt:
--------------------------------------------------------------------------------
# AgensGraph data export
# vertices
COPY (
    match (m:mobile) return m
) TO '/tmp/mobile.csv';


COPY (
    match (m:device) return m
) TO '/tmp/device.csv';


COPY (
    match (m:wifi) return m
) TO '/tmp/wifi.csv';


COPY (
    match (m1:MOBILE)-[r:CALL]->(m2:MOBILE) return m1.name, 'CALL', m2.name, r.mgm
) TO '/tmp/call.csv' DELIMITER ',';


COPY (
    match (m1:MOBILE)-[r:USE]->(m2:DEVICE) return m1.name, 'USE', m2.name
) TO '/tmp/use.csv' DELIMITER ',';

COPY (
    match (m1:DEVICE)-[r:HAS]->(m2:MOBILE) return m1.name, 'HAS', m2.name
) TO '/tmp/has.csv' DELIMITER ',';

COPY (
    match (m1:MOBILE)-[r:USE_WIFI]->(m2:WIFI) return m1.name, 'USE_WIFI', m2.name
) TO '/tmp/use_wifi.csv' DELIMITER ',';
--------------------------------------------------------------------------------
/dataImport/src/main/resources/gremlin_run.groovy: -------------------------------------------------------------------------------- 1 | 2 | graph = GraphFactory.open("conf/hadoop-graph/hadoop-vertex-script.properties") 3 | blvp = BulkLoaderVertexProgram.build().bulkLoader(OneTimeBulkLoader).writeGraph("conf/gremlin-server/janusgraph-hbase-es-server-new.properties").create(graph) 4 | graph.compute(SparkGraphComputer).program(blvp).submit().get() -------------------------------------------------------------------------------- /dataImport/src/main/resources/gremlin_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 配置 hadoop 环境读取 hdfs文件 3 | export HADOOP_CONF_DIR=/etc/hadoop/conf 4 | export CLASSPATH=$HADOOP_CONF_DIR 5 | nohup bin/gremlin.sh -e ./gremlin_run.groovy & -------------------------------------------------------------------------------- /dataImport/src/main/resources/hive/migrate_csv_to_hive_table.sql: -------------------------------------------------------------------------------- 1 | 2 | # 2425 重复值在 wifi中全部有 3 | select count(*) from migrate_id_repe_check_result_tmp a 4 | join migrate_use_wifi_tmp b 5 | on a.name = b.end_name; 6 | 7 | 8 | insert overwrite table migrate_wifi_tmp 9 | select a.* from migrate_wifi_tmp a 10 | left join migrate_id_repe_check_result_tmp b 11 | on a.name = b.name 12 | where b.name is null; 13 | 14 | 15 | 16 | select * from migrate_use_wifi_tmp where end_name = 'FP2368037167772741632'; 17 | 18 | 19 | select row_number() over () as rowid, name from migrate_wifi_tmp limit 100; 20 | 21 | 22 | # 合并生成唯一 id 23 | create table migrate_id_mat_tmp as 24 | select row_number() over () as id, name from ( 25 | select name from migrate_device_tmp 26 | union 27 | select name from migrate_wifi_tmp 28 | union 29 | select name from migrate_mobile_tmp 30 | ) as abc; 31 | 32 | 33 | insert overwrite table migrate_id_repe_check_tmp 34 | select name, type from ( 35 | select name, 'd' as type from migrate_device_tmp 36 | union 37 | select name, 'w' as type from migrate_wifi_tmp 38 | union 39 | select name, 'm' as type from migrate_mobile_tmp 40 | ) as abc; 41 | 42 | 43 | create table migrate_id_repe_check_result_tmp as 44 | select name, count(*) as count 45 | from migrate_id_repe_check_tmp 46 | group by name 47 | having count(*) > 1; 48 | 49 | 50 | 51 | create table migrate_mobile_id_tmp as 52 | select b.id, a.* from migrate_mobile_tmp a 53 | join migrate_id_mat_tmp b 54 | on a.name = b.name; 55 | 56 | 57 | create table migrate_device_id_tmp as 58 | select b.id, a.* from migrate_device_tmp a 59 | join migrate_id_mat_tmp b 60 | on a.name = b.name; 61 | 62 | create table migrate_wifi_id_tmp as 63 | select b.id, a.* from migrate_wifi_tmp a 64 | join migrate_id_mat_tmp b 65 | on a.name = b.name; 66 | 67 | 68 | 69 | DROP TABLE IF EXISTS migrate_call_id_tmp; 70 | create table migrate_call_id_tmp as 71 | select b.id as start_id, c.id as end_id, a.mgm from migrate_call_tmp a 72 | join migrate_mobile_id_tmp b 73 | on a.start_name = b.name 74 | join migrate_mobile_id_tmp c 75 | on a.end_name = c.name; 76 | 77 | 78 | DROP TABLE IF EXISTS migrate_use_id_tmp; 79 | create table migrate_use_id_tmp as 80 | select b.id as start_id, c.id as end_id from migrate_use_tmp a 81 | join migrate_mobile_id_tmp b 82 | on a.start_name = b.name 83 | join migrate_device_id_tmp c 84 | on a.end_name = c.name; 85 | 86 | 87 | 88 | DROP TABLE IF EXISTS migrate_has_id_tmp; 89 | create table migrate_has_id_tmp as 90 | select b.id as start_id, c.id as end_id 
from migrate_has_tmp a 91 | join migrate_device_id_tmp b 92 | on a.start_name = b.name 93 | join migrate_mobile_id_tmp c 94 | on a.end_name = c.name; 95 | 96 | 97 | DROP TABLE IF EXISTS migrate_use_wifi_id_tmp; 98 | create table migrate_use_wifi_id_tmp as 99 | select b.id as start_id, c.id as end_id from migrate_use_wifi_tmp a 100 | join migrate_mobile_id_tmp b 101 | on a.start_name = b.name 102 | join migrate_wifi_id_tmp c 103 | on a.end_name = c.name; -------------------------------------------------------------------------------- /dataImport/src/main/resources/scripts/script_mobile.groovy: -------------------------------------------------------------------------------- 1 | def parse(line) { 2 | def (vertex, inEdges, outEdges) = line.split(/\t/, 3) 3 | def (v1id, v1label, v1props) = vertex.split(/,/, 3) 4 | def v1 = graph.addVertex(T.id, v1id.toLong(), T.label, v1label) 5 | switch (v1label) { 6 | case "MOBILE": 7 | def (name, nm_pass, nm_sha1, is_register, is_risk, is_internal, is_service, merchant_name, status, suspect_risk, overdue_status) = v1props.split(/,/, 11) 8 | v1.property("name", name) 9 | v1.property("nm_pass", nm_pass) 10 | v1.property("nm_sha1", nm_sha1) 11 | 12 | if (is_register?.trim()) { 13 | v1.property("is_register", is_register) 14 | } 15 | if (is_risk?.trim()) { 16 | v1.property("is_risk", is_risk) 17 | } 18 | if (is_internal?.trim()) { 19 | v1.property("is_internal", is_internal) 20 | } 21 | if (is_service?.trim()) { 22 | v1.property("is_service", is_service) 23 | } 24 | if (merchant_name?.trim()) { 25 | v1.property("merchant_name", merchant_name) 26 | } 27 | if (status?.trim()) { 28 | v1.property("status", status.toInteger()) 29 | } 30 | if (suspect_risk?.trim()) { 31 | v1.property("suspect_risk", suspect_risk.toInteger()) 32 | } 33 | if (overdue_status?.trim()) { 34 | v1.property("overdue_status", overdue_status.toInteger()) 35 | } 36 | break 37 | case "DEVICE": 38 | case "WIFI": 39 | def (name, is_exception, is_white) = v1props.split(/,/, 3) 40 | v1.property("name", name) 41 | if (is_exception?.trim()) { 42 | v1.property("is_exception", is_exception) 43 | } 44 | if (is_white?.trim()) { 45 | v1.property("is_white", is_white) 46 | } 47 | break 48 | default: 49 | throw new Exception("Unexpected vertex label: ${v1label}") 50 | } 51 | [[outEdges, true], [inEdges, false]].each { def edges, def out -> 52 | edges.split(/\|/).grep().each { def edge -> 53 | def parts = edge.split(/,/) 54 | def otherV, eLabel, mgm = null 55 | if (parts.size() == 2) { 56 | (eLabel, otherV) = parts 57 | } else { 58 | (eLabel, otherV, mgm) = parts 59 | } 60 | def v2 = graph.addVertex(T.id, otherV.toLong()) 61 | def e = out ? 
v1.addOutEdge(eLabel, v2) : v1.addInEdge(eLabel, v2) 62 | 63 | if (mgm?.trim()) e.property("mgm", mgm.toInteger()) 64 | } 65 | } 66 | return v1 67 | } -------------------------------------------------------------------------------- /dataImport/src/main/resources/test.data/call_edge.csv: -------------------------------------------------------------------------------- 1 | "187027xx013","CALL","187xxx63006",1 2 | "187027xx006","CALL","187xxx61013",\N -------------------------------------------------------------------------------- /dataImport/src/main/resources/test.data/device.csv: -------------------------------------------------------------------------------- 1 | device[10.2]{"name": "FP1627486073238818816"} 2 | device[10.3]{"name": "FP1418992331331342021"} 3 | device[10.4]{"name": "FP1418992331331342005"} 4 | device[10.6]{"name": "FP2659928169380958208"} -------------------------------------------------------------------------------- /dataImport/src/main/resources/test.data/has.csv: -------------------------------------------------------------------------------- 1 | "FP1418992331331342001","HAS","18xxx761005" 2 | "FP1418992331331342001","HAS","18xxx761004" -------------------------------------------------------------------------------- /dataImport/src/main/resources/test.data/mobile.csv: -------------------------------------------------------------------------------- 1 | mobile[11.46]{"name": "1870xxx1013", "is_service": "true", "suspect_risk": 0, "status": 1} 2 | mobile[11.17]{"name": "1870xxx3006", "status": 4, "is_register": "true"} -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/Helper.java: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap; 2 | 3 | import com.alibaba.fastjson.JSON; 4 | import com.alibaba.fastjson.JSONObject; 5 | import org.apache.commons.codec.digest.DigestUtils; 6 | 7 | /** 8 | * @author zhoupeng 9 | * @date 2019/5/9 10 | */ 11 | public class Helper { 12 | public static String buildVertexProperty(String label, String jsonString) { 13 | JSONObject jsonObject = Helper.getVertexProperty(jsonString); 14 | return buildPropertyString(label, jsonObject); 15 | } 16 | 17 | private static String buildPropertyString(String label, JSONObject jsonObject) { 18 | StringBuilder builder = new StringBuilder(); 19 | jsonObject.forEach((key, value) -> { 20 | if (value instanceof String) { 21 | builder.append(".property('").append(key).append("', '").append(value).append("')"); 22 | } else { 23 | builder.append(".property('").append(key).append("', ").append(value).append(")"); 24 | } 25 | 26 | // 手机号才需要加密 27 | if ("MOBILE".equals(label) && "name".equals(key)) { 28 | String encrypt = value.toString(); 29 | String sha1Hex = DigestUtils.sha1Hex(value.toString()); 30 | builder.append(".property('").append("nm_pass").append("', '").append(encrypt).append("')"); 31 | builder.append(".property('").append("nm_sha1").append("', '").append(sha1Hex).append("')"); 32 | } 33 | }); 34 | return builder.toString(); 35 | } 36 | 37 | public static String buildIncrementOtherPropertyString(String label, JSONObject jsonObject) { 38 | StringBuilder builder = new StringBuilder(); 39 | jsonObject.forEach((key, value) -> { 40 | 41 | if ("name".equals(key)) { 42 | // 手机号 才需要加密信息 43 | if ("MOBILE".equals(label)) { 44 | String encrypt =value.toString(); 45 | String sha1Hex = DigestUtils.sha1Hex(value.toString()); 46 | 47 | 
builder.append(".property('").append("nm_pass").append("', '").append(encrypt).append("')"); 48 | builder.append(".property('").append("nm_sha1").append("', '").append(sha1Hex).append("')"); 49 | } 50 | 51 | } else if ("status".equals(key)) { 52 | // do nothing 53 | } else { 54 | if (value instanceof String) { 55 | builder.append(".property('").append(key).append("', '").append(value).append("')"); 56 | } else { 57 | builder.append(".property('").append(key).append("', ").append(value).append(")"); 58 | } 59 | } 60 | 61 | }); 62 | return builder.toString(); 63 | } 64 | 65 | public static JSONObject getVertexProperty(String jsonString) { 66 | return JSON.parseObject(jsonString); 67 | } 68 | } -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/ImportCommon.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap 2 | 3 | import java.util 4 | import java.util.concurrent.TimeUnit 5 | import java.util.function.Consumer 6 | 7 | import com.alibaba.fastjson.JSONObject 8 | import org.apache.log4j.{LogManager, Logger} 9 | import org.apache.tinkerpop.gremlin.driver.{Client, Result} 10 | 11 | import scala.util.control.Breaks 12 | 13 | 14 | object ImportCommon { 15 | val logger: Logger = LogManager.getLogger("ImportCommon") 16 | 17 | type OptionMap = Map[Symbol, Any] 18 | 19 | def getJanusGraph(hosts: String, port: Int, poolSize: Int): JanusGraphProvider = { 20 | new JanusGraphProvider(hosts, port, poolSize) 21 | } 22 | 23 | def isEmpty(x: String) = Option(x).forall(_.isEmpty) 24 | 25 | 26 | def submitWithRetry(client: Client, runCql: String) = { 27 | val loop = new Breaks 28 | loop.breakable { 29 | for (a <- 1 to 100) { 30 | try { 31 | client.submit(runCql).stream().forEach(new Consumer[Result] { 32 | override def accept(t: Result): Unit = 33 | logger.info(t.getLong) 34 | }) 35 | loop.break() 36 | } catch { 37 | case ex: Exception => 38 | logger.warn(runCql) 39 | logger.warn(ex.getMessage, ex) 40 | TimeUnit.MILLISECONDS.sleep(1000 * a) 41 | } 42 | 43 | } 44 | } 45 | } 46 | 47 | 48 | def getResultWithRetry(client: Client, runCql: String): util.List[Result] = { 49 | var results: util.List[Result] = null 50 | val loop = new Breaks 51 | loop.breakable { 52 | for (a <- 1 to 100) { 53 | try { 54 | results = client.submit(runCql).all.get 55 | loop.break() 56 | } catch { 57 | case ex: Exception => 58 | logger.warn(ex.getMessage, ex) 59 | TimeUnit.MILLISECONDS.sleep(1000 * a) 60 | } 61 | } 62 | } 63 | 64 | results 65 | } 66 | 67 | 68 | def handleVertexList(recordList: List[(String, String)], client: Client): Unit = { 69 | var runCql = "g = graph.traversal();g" 70 | 71 | recordList.foreach { case (label, attrString) => 72 | runCql += ".addV('" + label + "')" 73 | runCql += Helper.buildVertexProperty(label, attrString); 74 | } 75 | 76 | if (recordList.nonEmpty) { 77 | runCql += ".count()" 78 | 79 | ImportCommon.submitWithRetry(client, runCql) 80 | } 81 | } 82 | 83 | // 顶点增量插入 84 | def handleVertexIncrementList(recordList: List[(String, String)], client: Client): Unit = { 85 | var runCql = "g = graph.traversal();g" 86 | 87 | recordList.foreach { case (label, attrString) => 88 | val attrJson = Helper.getVertexProperty(attrString) 89 | val name = attrJson.getString("name") 90 | runCql += ".V().has('name', '" + name + "').hasLabel('" + label + "').as('m').fold().coalesce(unfold(), addV('" + label + "').property('name', '" + name + "'))" 91 | 92 | runCql += 
Helper.buildIncrementOtherPropertyString(label, attrJson) 93 | // status 属性单独进行操作 94 | val status = attrJson.getIntValue("status") 95 | if (status > 0) { 96 | runCql += ".V().has('name', '" + name + "').as('m').where(or(select('m').values('status').is(lt(" + status + ")), select('m').hasNot('status'))).property('status', " + status + ")" 97 | } 98 | } 99 | 100 | if (recordList.nonEmpty) { 101 | runCql += ".count()" 102 | ImportCommon.submitWithRetry(client, runCql) 103 | } 104 | } 105 | 106 | def nextOption(map: OptionMap, list: List[String]): OptionMap = { 107 | def isSwitch(s: String) = s(0) == '-' 108 | 109 | list match { 110 | case Nil => map 111 | case "--janusgraph-hosts" :: value :: tail => 112 | ImportCommon.nextOption(map ++ Map('janusgraphHosts -> value.toString), tail) 113 | case "--janusgraph-port" :: value :: tail => 114 | ImportCommon.nextOption(map ++ Map('janusgraphPort -> value.toInt), tail) 115 | case "--batch-size" :: value :: tail => 116 | ImportCommon.nextOption(map ++ Map('batchSize -> value.toInt), tail) 117 | case "--pool-size" :: value :: tail => 118 | ImportCommon.nextOption(map ++ Map('poolSize -> value.toInt), tail) 119 | case "--storage-hostname" :: value :: tail => 120 | ImportCommon.nextOption(map ++ Map('storageHostname -> value.toString), tail) 121 | case "--label" :: value :: tail => 122 | ImportCommon.nextOption(map ++ Map('label -> value.toString), tail) 123 | case "--deviceType" :: value :: tail => 124 | ImportCommon.nextOption(map ++ Map('deviceType -> value.toString), tail) 125 | case "--edgeType" :: value :: tail => 126 | ImportCommon.nextOption(map ++ Map('edgeType -> value.toString), tail) 127 | case "--fromLabel" :: value :: tail => 128 | ImportCommon.nextOption(map ++ Map('fromLabel -> value.toString), tail) 129 | case "--toLabel" :: value :: tail => 130 | ImportCommon.nextOption(map ++ Map('toLabel -> value.toString), tail) 131 | case "--outputFile" :: value :: tail => 132 | ImportCommon.nextOption(map ++ Map('outputFile -> value.toString), tail) 133 | case string :: opt2 :: tail if isSwitch(opt2) => 134 | ImportCommon.nextOption(map ++ Map('importFile -> string.toString), list.tail) 135 | case string :: Nil => ImportCommon.nextOption(map ++ Map('importFile -> string.toString), list.tail) 136 | case option :: tail => println("Unknown option " + option) 137 | Map() 138 | } 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/JanusGraphProvider.java: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap; 2 | 3 | import org.apache.commons.configuration.Configuration; 4 | import org.apache.commons.configuration.PropertiesConfiguration; 5 | import org.apache.log4j.LogManager; 6 | import org.apache.log4j.Logger; 7 | import org.apache.tinkerpop.gremlin.driver.Client; 8 | import org.apache.tinkerpop.gremlin.driver.Cluster; 9 | 10 | import java.util.Objects; 11 | 12 | /** 13 | * @author zhoupeng 14 | * @date 2019/1/31 15 | */ 16 | public class JanusGraphProvider { 17 | private static final Logger logger = LogManager.getLogger(JanusGraphProvider.class); 18 | public Cluster cluster; 19 | 20 | 21 | public JanusGraphProvider(String hosts, int port, int poolSize) { 22 | Configuration clusterConfig = new PropertiesConfiguration(); 23 | clusterConfig.setProperty("hosts", hosts); 24 | clusterConfig.setProperty("port", port); 25 | clusterConfig.setProperty("connectionPool.minSize", poolSize); 26 | 
clusterConfig.setProperty("connectionPool.maxSize", poolSize); 27 | clusterConfig.setProperty("connectionPool.maxInProcessPerConnection", poolSize); 28 | clusterConfig.setProperty("connectionPool.maxSimultaneousUsagePerConnection", poolSize); 29 | clusterConfig.setProperty("connectionPool.maxContentLength", 65536000); 30 | clusterConfig.setProperty("serializer.className", "org.apache.tinkerpop.gremlin.driver.ser.GryoMessageSerializerV3d0"); 31 | // 此处很蛋疼,需要返回列表,只能加逗号分隔才行,生成两个类 32 | clusterConfig.setProperty("serializer.config.ioRegistries", 33 | "org.janusgraph.graphdb.tinkerpop.JanusGraphIoRegistry,org.janusgraph.graphdb.tinkerpop.JanusGraphIoRegistry"); 34 | 35 | cluster = Cluster.open(clusterConfig); 36 | } 37 | 38 | public Client getClient() { 39 | return this.cluster.connect(); 40 | } 41 | 42 | 43 | public void close() throws Exception { 44 | try { 45 | if (cluster != null) { 46 | // the cluster closes all of its clients 47 | cluster.close(); 48 | } 49 | } finally { 50 | cluster = null; 51 | } 52 | } 53 | 54 | public void submit(String cql) { 55 | Client client = this.getClient(); 56 | try { 57 | client.submit(cql).stream(); 58 | } finally { 59 | if (!Objects.isNull(client)) { 60 | client.close(); 61 | } 62 | } 63 | } 64 | } -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/ScalaHelper.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap 2 | 3 | import java.lang 4 | 5 | import com.alibaba.fastjson.JSONObject 6 | import org.apache.spark.sql.DataFrame 7 | 8 | object ScalaHelper { 9 | def convertHeader(label: String, headerMap: Map[String, String], headerList: Array[String]): Map[String, String] = { 10 | var headResult = Map[String, String]() 11 | for (field <- headerList) { 12 | var result: String = null 13 | 14 | if ("name".equals(field)) { 15 | result = "%s:ID(%s)".format(field, label) 16 | } else { 17 | if (headerMap.contains(field)) { 18 | result = field + ":" + headerMap(field) 19 | } else { 20 | result = field 21 | } 22 | } 23 | 24 | headResult = headResult + (field -> result) 25 | } 26 | 27 | headResult 28 | } 29 | 30 | 31 | def saveAsCSV(outputFile: String, df: DataFrame) = { 32 | df.repartition(1) 33 | .write 34 | .mode("overwrite") 35 | .format("com.databricks.spark.csv") 36 | .option("header", "true") 37 | .option("treatEmptyValuesAsNulls", "false") 38 | .save(outputFile) 39 | } 40 | 41 | 42 | def parseVertexLineGetIdAndAttr(line: String) = { 43 | val labelLast = line.indexOf("[") 44 | val idLast = line.indexOf("]") 45 | val attrStart = line.indexOf("{") 46 | val attrStr = line.substring(attrStart, line.length) 47 | 48 | val jsonObject = Helper.getVertexProperty(attrStr) 49 | jsonObject 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/data/convert/CallEdgeConvertToCsv.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap.data.convert 2 | 3 | import com.qihoo.finance.tap.ImportCommon 4 | import org.apache.log4j.{LogManager, Logger} 5 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 6 | import org.apache.spark.sql.{RowFactory, SQLContext} 7 | import org.apache.spark.{SparkConf, SparkContext} 8 | import org.apache.tinkerpop.gremlin.driver.Client 9 | 10 | object CallEdgeConvertToCsv { 11 | val logger: Logger = 
LogManager.getLogger("CallEdgeConvertToCsv") 12 | 13 | val usage = 14 | """ 15 | Usage: CallEdgeConvertToCsv [--janusgraph-hosts 10.94.90.121] [--janusgraph-port 8182] E:\360_doc\lolth\call_edge.csv 16 | """ 17 | 18 | def main(args: Array[String]) { 19 | if (args.length == 0) { 20 | println(usage) 21 | System.exit(0) 22 | } 23 | 24 | val argList = args.toList 25 | val options = ImportCommon.nextOption(Map(), argList) 26 | 27 | val conf = new SparkConf().setAppName("CallEdgeConvertToCsv") 28 | //setMaster("local") 本机的spark就用local,远端的就写ip 29 | //如果是打成jar包运行则需要去掉 setMaster("local")因为在参数中会指定。 30 | // conf.setMaster("local") 31 | 32 | val sc = new SparkContext(conf) 33 | val sqlContext = new SQLContext(sc) 34 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String]) 35 | val outputFile = options.getOrElse('outputFile, "").asInstanceOf[String] 36 | 37 | val dataRdd = txtFile.map { 38 | line => 39 | val fields = line.replace("\"", "").split(",") 40 | // "1870276152746","CALL","18602761525746" 41 | // "13512340050","CALL","15607804358",1 42 | // CALL 边有 mgm 属性 43 | 44 | if (!"\\N".equals(fields(3))) { 45 | val mgmInt: java.lang.Integer = Integer.parseInt(fields(3)) 46 | RowFactory.create(fields(0), fields(2), mgmInt) 47 | } else { 48 | RowFactory.create(fields(0), fields(2), null) 49 | } 50 | } 51 | 52 | val structType = new StructType() 53 | .add(StructField("start_name", StringType, nullable = true)) 54 | .add(StructField("end_name", StringType, nullable = true)) 55 | .add(StructField("mgm", IntegerType, nullable = true)) 56 | 57 | val df = sqlContext.createDataFrame(dataRdd, structType) 58 | 59 | df.createOrReplaceTempView("csv_df") 60 | sqlContext.sql("create table migrate_call_tmp as select * from csv_df") 61 | 62 | // df.show() 63 | // ScalaHelper.saveAsCSV(outputFile, df) 64 | 65 | println("***********************stoped***********************") 66 | sc.stop() 67 | } 68 | 69 | private def handleEdgeList(cqlList: List[String], client: Client): Unit = { 70 | var runCql = "g = graph.traversal();g" 71 | 72 | cqlList.foreach(cql => runCql += cql) 73 | if (cqlList.nonEmpty) { 74 | runCql += ".count()" 75 | ImportCommon.submitWithRetry(client, runCql) 76 | } 77 | } 78 | 79 | } 80 | -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/data/convert/DeviceConvertToCsv.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap.data.convert 2 | 3 | import com.alibaba.fastjson.JSONObject 4 | import com.qihoo.finance.tap.{ImportCommon, ScalaHelper} 5 | import org.apache.log4j.{LogManager, Logger} 6 | import org.apache.spark.sql.types.{StringType, StructType} 7 | import org.apache.spark.sql.{RowFactory, SQLContext} 8 | import org.apache.spark.{SparkConf, SparkContext} 9 | 10 | object DeviceConvertToCsv { 11 | 12 | val logger: Logger = LogManager.getLogger("DeviceConvertToCsv") 13 | 14 | val usage = 15 | """ 16 | Usage: DeviceConvertToCsv [--label] [--outputFile] E:\360_doc\lolth\mobile.csv 17 | """ 18 | 19 | type OptionMap = Map[Symbol, Any] 20 | 21 | 22 | def main(args: Array[String]) { 23 | if (args.length == 0) { 24 | println(usage) 25 | System.exit(0) 26 | } 27 | 28 | val argList = args.toList 29 | val options = ImportCommon.nextOption(Map(), argList) 30 | 31 | 32 | val conf = new SparkConf().setAppName("DeviceConvertToCsv") 33 | //setMaster("local") 本机的spark就用local,远端的就写ip 34 | //如果是打成jar包运行则需要去掉 setMaster("local")因为在参数中会指定。 35 | // 
conf.setMaster("local") 36 | 37 | val sc = new SparkContext(conf) 38 | val sqlContext = new SQLContext(sc) 39 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String]) 40 | val outputFile = options.getOrElse('outputFile, "").asInstanceOf[String] 41 | val deviceType = options.getOrElse('deviceType, "").asInstanceOf[String] 42 | if (deviceType == null) { 43 | println("--deviceType 不能为空 device|wifi") 44 | System.exit(0) 45 | } 46 | 47 | val headerList = Array("name", "is_exception", "is_white") 48 | 49 | // name:ID(human) age:Int 50 | val dataRdd = txtFile.map { 51 | line => 52 | val jsonObject: JSONObject = ScalaHelper.parseVertexLineGetIdAndAttr(line) 53 | RowFactory.create(jsonObject.getString("name"), 54 | jsonObject.getString("is_exception"), 55 | jsonObject.getString("is_white") 56 | ) 57 | } 58 | var structType = new StructType() 59 | 60 | for ((elem, i) <- headerList.view.zipWithIndex) { 61 | structType = structType.add(headerList(i), StringType, nullable = true) 62 | } 63 | 64 | val df = sqlContext.createDataFrame(dataRdd, structType) 65 | 66 | df.createOrReplaceTempView("device_csv_df") 67 | 68 | sqlContext.sql("DROP TABLE IF EXISTS migrate_" + deviceType + "_tmp") 69 | sqlContext.sql("create table migrate_" + deviceType + "_tmp as select * from device_csv_df") 70 | 71 | // df.show() 72 | // ScalaHelper.saveAsCSV(outputFile, df) 73 | 74 | println("***********************stoped***********************") 75 | sc.stop() 76 | } 77 | 78 | 79 | } 80 | -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/data/convert/MergeNodesAndEdges.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap.data.convert 2 | 3 | import com.qihoo.finance.tap.ImportCommon 4 | import org.apache.log4j.{LogManager, Logger} 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.{DataFrame, Row, SQLContext} 7 | import org.apache.spark.{SparkConf, SparkContext} 8 | 9 | object MergeNodesAndEdges { 10 | 11 | val logger: Logger = LogManager.getLogger("MergeNodesAndEdges") 12 | 13 | val usage = 14 | """ 15 | 将顶点和边合并为一行,做批量导入 16 | Usage: MergeNodesAndEdges --outputFile 17 | """ 18 | 19 | type OptionMap = Map[Symbol, Any] 20 | 21 | 22 | def main(args: Array[String]) { 23 | // if (args.length == 0) { 24 | // println(usage) 25 | // System.exit(0) 26 | // } 27 | 28 | val argList = args.toList 29 | val options = ImportCommon.nextOption(Map(), argList) 30 | 31 | val conf = new SparkConf().setAppName("MergeNodesAndEdges") 32 | //setMaster("local") 本机的spark就用local,远端的就写ip 33 | //如果是打成jar包运行则需要去掉 setMaster("local")因为在参数中会指定。 34 | // conf.setMaster("local") 35 | 36 | val sc = new SparkContext(conf) 37 | val sqlContext = new SQLContext(sc) 38 | val outputFile = options.getOrElse('outputFile, "").asInstanceOf[String] 39 | 40 | val (mobile_df: DataFrame, device_df: DataFrame, wifi_df: DataFrame, call_df: DataFrame, has_df: DataFrame, use_df: DataFrame, use_wifi_df: DataFrame) = 41 | generateTestDataDF(sc, sqlContext) 42 | 43 | // val mobile_df = sqlContext.sql("select * from migrate_mobile_id_tmp") 44 | // val device_df = sqlContext.sql("select * from migrate_device_id_tmp") 45 | // val wifi_df = sqlContext.sql("select * from migrate_wifi_id_tmp") 46 | // 47 | // val call_df = sqlContext.sql("select * from migrate_call_id_tmp") 48 | // val has_df = sqlContext.sql("select * from migrate_has_id_tmp") 49 | // val use_df = sqlContext.sql("select * from 
migrate_use_id_tmp") 50 | // val use_wifi_df = sqlContext.sql("select * from migrate_use_wifi_id_tmp") 51 | 52 | 53 | val mobile_kv = mobile_df.rdd.keyBy(_ (0)).mapValues(fields => { 54 | // label, other props ... 55 | List("MOBILE", 56 | replaceNullEmpty(fields(1)), 57 | replaceNullEmpty(fields(2)), 58 | replaceNullEmpty(fields(3)), 59 | replaceNullEmpty(fields(4)), 60 | replaceNullEmpty(fields(5)), 61 | replaceNullEmpty(fields(6)), 62 | replaceNullEmpty(fields(7)), 63 | replaceNullEmpty(fields(8)), 64 | replaceNullEmpty(fields(9)), 65 | replaceNullEmpty(fields(10)), 66 | replaceNullEmpty(fields(11)) 67 | ).mkString(",") 68 | }) 69 | 70 | val device_kv = device_df.rdd.keyBy(_ (0)).mapValues(fields => { 71 | // label, other props ... 72 | ("DEVICE", fields(1), replaceNullEmpty(fields(2)), replaceNullEmpty(fields(3))).productIterator.mkString(",") 73 | }) 74 | 75 | val wifi_kv = wifi_df.rdd.keyBy(_ (0)).mapValues(fields => { 76 | // label, other props ... 77 | ("WIFI", fields(1), replaceNullEmpty(fields(2)), replaceNullEmpty(fields(3))).productIterator.mkString(",") 78 | }) 79 | 80 | val call_in_kv = call_df.rdd.keyBy(_ (1)).mapValues(fields => (fields(0), replaceNullEmpty(fields(2)))) 81 | val call_out_kv = call_df.rdd.keyBy(_ (0)).mapValues(fields => (fields(1), replaceNullEmpty(fields(2)))) 82 | 83 | val has_in_kv = has_df.rdd.keyBy(_ (1)).mapValues(fields => fields(0)) 84 | val has_out_kv = has_df.rdd.keyBy(_ (0)).mapValues(fields => fields(1)) 85 | 86 | val use_out_kv = use_df.rdd.keyBy(_ (0)).mapValues(fields => fields(1)) 87 | val use_in_kv = use_df.rdd.keyBy(_ (1)).mapValues(fields => fields(0)) 88 | 89 | val use_wifi_out_kv = use_wifi_df.rdd.keyBy(_ (0)).mapValues(fields => fields(1)) 90 | val use_wifi_in_kv = use_wifi_df.rdd.keyBy(_ (1)).mapValues(fields => fields(0)) 91 | 92 | val mobile_result_rdd = mobile_kv.cogroup(call_in_kv).map(v => { 93 | val callIn = v._2._2.toList.map(v => { 94 | "CALL," + v._1 + "," + v._2 95 | }).mkString("|") 96 | val edge = forceJoinBeforeAndNow(v._2._1.toList.head, callIn, "\t") 97 | (v._1, edge) 98 | }).cogroup(has_in_kv).map(v => { 99 | val hasIn = v._2._2.toList.map(v => { 100 | "HAS," + v 101 | }).mkString("|") 102 | val edge = joinBeforeAndNowWithCheck(v._2._1.toList.head, hasIn, "\t", "|") 103 | (v._1, edge) 104 | }).cogroup(call_out_kv).map(v => { 105 | val callOut = v._2._2.toList.map(v => { 106 | "CALL," + v._1 + "," + v._2 107 | }).mkString("|") 108 | val edge = forceJoinBeforeAndNow(v._2._1.toList.head, callOut, "\t") 109 | (v._1, edge) 110 | }).cogroup(use_out_kv).map(v => { 111 | val useOut = v._2._2.toList.map(v => { 112 | "USE," + v 113 | }).mkString("|") 114 | val edge = joinBeforeAndNowWithCheck(v._2._1.toList.head, useOut, "\t", "|") 115 | (v._1, edge) 116 | }).cogroup(use_wifi_out_kv).map(v => { 117 | val useOut = v._2._2.toList.map(v => { 118 | "USE_WIFI," + v 119 | }).mkString("|") 120 | val edge = joinBeforeAndNowWithCheck(v._2._1.toList.head, useOut, "\t", "|") 121 | (v._1, edge) 122 | }).map(v => v._1 + "," + v._2) 123 | 124 | val device_result_rdd = device_kv.cogroup(use_in_kv).map(v => { 125 | val useIn = v._2._2.toList.map(v => { 126 | "USE," + v 127 | }).mkString("|") 128 | val edge = forceJoinBeforeAndNow(v._2._1.toList.head, useIn, "\t") 129 | (v._1, edge) 130 | }).cogroup(has_out_kv).map(v => { 131 | val hasOut = v._2._2.toList.map(v => { 132 | "HAS," + v 133 | }).mkString("|") 134 | val edge = forceJoinBeforeAndNow(v._2._1.toList.head, hasOut, "\t") 135 | (v._1, edge) 136 | }).map(v => v._1 + "," + v._2) 137 | 138 | 
val wifi_result_rdd = wifi_kv.cogroup(use_wifi_in_kv).map(v => { 139 | val useIn = v._2._2.toList.map(v => { 140 | "USE_WIFI," + v 141 | }).mkString("|") 142 | val edge = forceJoinBeforeAndNow(v._2._1.toList.head, useIn, "\t") 143 | // USE_WIFI has't outEdge so add \t 144 | (v._1, edge + "\t") 145 | }).map(v => v._1 + "," + v._2) 146 | 147 | val total_result = mobile_result_rdd ++ device_result_rdd ++ wifi_result_rdd 148 | 149 | total_result.saveAsTextFile(outputFile) 150 | 151 | // total_result.collect().foreach(println) 152 | 153 | println("***********************stoped***********************") 154 | sc.stop() 155 | } 156 | 157 | private def generateTestDataDF(sc: SparkContext, sqlContext: SQLContext) = { 158 | val mobile_rdd = sc.parallelize(Seq( 159 | Row(1L, "13908125867", "3|TVOiyN2mC/ihdQuMBaw+0A==", "12dd2479ed75af60968d012fa139ff1cffac3683", "true", "", "", "", "", 1, null, null), 160 | Row(2L, "13908125868", "3|TVOiyN2mC/ihdQuMBaw+0A==", "12dd2479ed75af60968d012fa139ff1cffac3683", "true", "", "", "", "", 1, null, 0), 161 | Row(3L, "13908125869", "3|TVOiyN2mC/ihdQuMBaw+0A==", "12dd2479ed75af60968d012fa139ff1cffac3683", "true", "", "", "", "", 3, null, 1) 162 | )) 163 | val device_rdd = sc.parallelize(Seq( 164 | Row(11L, "FP13682956455", null, "false"), 165 | Row(12L, "FP13682956456", "true", null), 166 | Row(13L, "FP13682956457", "true", "false") 167 | )) 168 | val wifi_rdd = sc.parallelize(Seq( 169 | Row(21L, "bssid13682956455", null, "false"), 170 | Row(22L, "bssid13682956456", "true", null), 171 | Row(23L, "bssid13682956457", "true", "false") 172 | )) 173 | 174 | val call_rdd = sc.parallelize(Seq( 175 | Row(1L, 2L, null), 176 | // Row(3L, 2L, 1), 177 | Row(2L, 1L, 1) 178 | // Row(2L, 3L, 1) 179 | )) 180 | val has_rdd = sc.parallelize(Seq( 181 | Row(11L, 1L), 182 | Row(11L, 2L), 183 | Row(11L, 3L), 184 | Row(12L, 1L), 185 | Row(12L, 2L) 186 | )) 187 | val use_rdd = sc.parallelize(Seq( 188 | // Row(1L, 11L), 189 | // Row(1L, 12L), 190 | // Row(2L, 13L), 191 | Row(2L, 11L) 192 | // Row(3L, 12L) 193 | )) 194 | val use_wifi_rdd = sc.parallelize(Seq( 195 | Row(1L, 21L), 196 | Row(1L, 22L), 197 | Row(2L, 23L), 198 | Row(2L, 21L), 199 | Row(3L, 22L) 200 | )) 201 | 202 | val mobile_schema = StructType(List( 203 | StructField("id", LongType, nullable = false), 204 | StructField("name", StringType, nullable = true), 205 | StructField("nm_pass", StringType, nullable = true), 206 | StructField("nm_sha1", StringType, nullable = true), 207 | StructField("is_register", StringType, nullable = true), 208 | StructField("is_risk", StringType, nullable = true), 209 | StructField("is_internal", StringType, nullable = true), 210 | StructField("is_service", StringType, nullable = true), 211 | StructField("merchant_name", StringType, nullable = true), 212 | StructField("status", IntegerType, nullable = true), 213 | StructField("suspect_risk", IntegerType, nullable = true), 214 | StructField("overdue_status", IntegerType, nullable = true) 215 | )) 216 | 217 | val call_schema = StructType(List( 218 | StructField("start_id", LongType, nullable = false), 219 | StructField("end_id", LongType, nullable = true), 220 | StructField("mgm", IntegerType, nullable = true) 221 | )) 222 | val edge_schema = StructType(List( 223 | StructField("start_id", LongType, nullable = false), 224 | StructField("end_id", LongType, nullable = true) 225 | )) 226 | val device_schema = StructType(List( 227 | StructField("id", LongType, nullable = false), 228 | StructField("name", StringType, nullable = true), 229 | 
StructField("is_exception", StringType, nullable = true), 230 | StructField("is_white", StringType, nullable = true) 231 | )) 232 | 233 | val mobile_df = sqlContext.createDataFrame(mobile_rdd, mobile_schema) 234 | val device_df = sqlContext.createDataFrame(device_rdd, device_schema) 235 | val wifi_df = sqlContext.createDataFrame(wifi_rdd, device_schema) 236 | 237 | val call_df = sqlContext.createDataFrame(call_rdd, call_schema) 238 | val has_df = sqlContext.createDataFrame(has_rdd, edge_schema) 239 | val use_df = sqlContext.createDataFrame(use_rdd, edge_schema) 240 | val use_wifi_df = sqlContext.createDataFrame(use_wifi_rdd, edge_schema) 241 | (mobile_df, device_df, wifi_df, call_df, has_df, use_df, use_wifi_df) 242 | } 243 | 244 | def joinBeforeAndNowWithCheck(before: String, now: String, beforSep: String, separate: String): String = { 245 | var edge: String = null 246 | if (now.isEmpty) { 247 | edge = before 248 | } else if (before.endsWith(beforSep)) { 249 | edge = List(before, now).mkString("") 250 | } else { 251 | edge = List(before, now).mkString(separate) 252 | } 253 | edge 254 | } 255 | 256 | def forceJoinBeforeAndNow(before: String, now: String, separate: String): String = { 257 | val edge = List(before, now).mkString(separate) 258 | edge 259 | } 260 | 261 | def replaceNullEmpty(field: Any): Any = { 262 | var value = field 263 | if (value == null) { 264 | value = "" 265 | } 266 | value 267 | } 268 | 269 | 270 | } 271 | -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/data/convert/MobileConvertToCsv.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap.data.convert 2 | 3 | import com.alibaba.fastjson.JSONObject 4 | import com.qihoo.finance.tap.{ImportCommon, ScalaHelper} 5 | import org.apache.commons.codec.digest.DigestUtils 6 | import org.apache.log4j.{LogManager, Logger} 7 | import org.apache.spark.sql.types.{IntegerType, StringType, StructType} 8 | import org.apache.spark.sql.{RowFactory, SQLContext} 9 | import org.apache.spark.{SparkConf, SparkContext} 10 | 11 | object MobileConvertToCsv { 12 | 13 | val logger: Logger = LogManager.getLogger("MobileConvertToCsv") 14 | 15 | val usage = 16 | """ 17 | Usage: MobileConvertToCsv [--outputFile] E:\360_doc\lolth\mobile.csv 18 | """ 19 | 20 | type OptionMap = Map[Symbol, Any] 21 | 22 | 23 | def main(args: Array[String]) { 24 | if (args.length == 0) { 25 | println(usage) 26 | System.exit(0) 27 | } 28 | 29 | val argList = args.toList 30 | val options = ImportCommon.nextOption(Map(), argList) 31 | 32 | val conf = new SparkConf().setAppName("MobileConvertToCsv") 33 | //setMaster("local") 本机的spark就用local,远端的就写ip 34 | //如果是打成jar包运行则需要去掉 setMaster("local")因为在参数中会指定。 35 | // conf.setMaster("local") 36 | 37 | val sc = new SparkContext(conf) 38 | val sqlContext = new SQLContext(sc) 39 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String]) 40 | val outputFile = options.getOrElse('outputFile, "").asInstanceOf[String] 41 | 42 | val headerList = Array( "name", "nm_pass", "nm_sha1", "is_register", "is_risk", "is_internal", "is_service", "merchant_name", "status", "suspect_risk", "overdue_status") 43 | 44 | val dataRdd = txtFile.map { 45 | line => 46 | val jsonObject: JSONObject = ScalaHelper.parseVertexLineGetIdAndAttr(line) 47 | 48 | val nameValue = jsonObject.getString("name") 49 | // 加密信息 50 | val encrypt = nameValue 51 | val sha1Hex = DigestUtils.sha1Hex(nameValue) 52 | 
53 | RowFactory.create(nameValue, encrypt, sha1Hex, 54 | jsonObject.getString("is_register"), 55 | jsonObject.getString("is_risk"), 56 | jsonObject.getString("is_internal"), 57 | jsonObject.getString("is_service"), 58 | jsonObject.getString("merchant_name"), 59 | jsonObject.getInteger("status"), 60 | jsonObject.getInteger("suspect_risk"), 61 | jsonObject.getInteger("overdue_status") 62 | ) 63 | } 64 | var structType = new StructType() 65 | 66 | for ((elem, i) <- headerList.view.zipWithIndex) { 67 | if (List("status", "suspect_risk", "overdue_status").contains(elem)) { 68 | structType = structType.add(headerList(i), IntegerType, nullable = true) 69 | } else { 70 | structType = structType.add(headerList(i), StringType, nullable = true) 71 | } 72 | } 73 | 74 | val df = sqlContext.createDataFrame(dataRdd, structType) 75 | 76 | df.createOrReplaceTempView("mobile_csv_df") 77 | 78 | sqlContext.sql("DROP TABLE IF EXISTS migrate_mobile_tmp") 79 | sqlContext.sql("create table migrate_mobile_tmp as select * from mobile_csv_df") 80 | // df.show() 81 | // ScalaHelper.saveAsCSV(outputFile, df) 82 | 83 | println("***********************stoped***********************") 84 | sc.stop() 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/data/convert/OtherEdgeConvertToCsv.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap.data.convert 2 | 3 | import com.qihoo.finance.tap.ImportCommon 4 | import org.apache.log4j.{LogManager, Logger} 5 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 6 | import org.apache.spark.sql.{RowFactory, SQLContext} 7 | import org.apache.spark.{SparkConf, SparkContext} 8 | import org.apache.tinkerpop.gremlin.driver.Client 9 | 10 | object OtherEdgeConvertToCsv { 11 | val logger: Logger = LogManager.getLogger("OtherEdgeConvertToCsv") 12 | 13 | val usage = 14 | """ 15 | Usage: OtherEdgeConvertToCsv [--janusgraph-hosts 10.94.90.121] [--janusgraph-port 8182] E:\360_doc\lolth\call_edge.csv 16 | """ 17 | 18 | def main(args: Array[String]) { 19 | if (args.length == 0) { 20 | println(usage) 21 | System.exit(0) 22 | } 23 | 24 | val argList = args.toList 25 | val options = ImportCommon.nextOption(Map(), argList) 26 | 27 | val conf = new SparkConf().setAppName("OtherEdgeConvertToCsv") 28 | //setMaster("local") 本机的spark就用local,远端的就写ip 29 | //如果是打成jar包运行则需要去掉 setMaster("local")因为在参数中会指定。 30 | // conf.setMaster("local") 31 | 32 | val sc = new SparkContext(conf) 33 | val sqlContext = new SQLContext(sc) 34 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String]) 35 | 36 | val fromLabel = options.getOrElse('fromLabel, null).asInstanceOf[String] 37 | val toLabel = options.getOrElse('toLabel, null).asInstanceOf[String] 38 | val edgeType = options.getOrElse('edgeType, null).asInstanceOf[String] 39 | 40 | if (fromLabel == null || toLabel == null) { 41 | println("必须添加参数 --from-label DEVICE|WIFI|MOBILE --to-label DEVICE|WIFI|MOBILE") 42 | System.exit(0) 43 | } 44 | 45 | // :START_ID(god) :END_ID(titan) 46 | // jupiter saturn 47 | 48 | val dataRdd = txtFile.map { 49 | line => 50 | val fields = line.replace("\"", "").split(",") 51 | // "1870276152746","CALL","18602761525746" 52 | // "13512340050","CALL","15607804358",1 53 | // CALL 边有 mgm 属性 54 | RowFactory.create(fields(0), fields(2)) 55 | } 56 | 57 | val structType = new StructType() 58 | .add(StructField("start_name", StringType, nullable 
= true)) 59 | .add(StructField("end_name", StringType, nullable = true)) 60 | 61 | val df = sqlContext.createDataFrame(dataRdd, structType) 62 | // df.show() 63 | 64 | df.createOrReplaceTempView("edge_csv_df") 65 | 66 | sqlContext.sql("DROP TABLE IF EXISTS migrate_" + edgeType + "_tmp") 67 | sqlContext.sql("create table migrate_" + edgeType + "_tmp as select * from edge_csv_df") 68 | 69 | 70 | println("***********************stoped***********************") 71 | sc.stop() 72 | } 73 | 74 | private def handleEdgeList(cqlList: List[String], client: Client): Unit = { 75 | var runCql = "g = graph.traversal();g" 76 | 77 | cqlList.foreach(cql => runCql += cql) 78 | if (cqlList.nonEmpty) { 79 | runCql += ".count()" 80 | ImportCommon.submitWithRetry(client, runCql) 81 | } 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/direct/EdgeImport.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap.direct 2 | 3 | import com.qihoo.finance.tap.ImportCommon 4 | import org.apache.log4j.{LogManager, Logger} 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | import org.apache.tinkerpop.gremlin.driver.Client 7 | 8 | object EdgeImport { 9 | val logger: Logger = LogManager.getLogger("EdgeImport") 10 | 11 | val usage = 12 | """ 13 | Usage: EdgeImport [--janusgraph-hosts 10.94.90.121] [--janusgraph-port 8182] E:\360_doc\lolth\call_edge.csv 14 | """ 15 | 16 | def main(args: Array[String]) { 17 | if (args.length == 0) { 18 | println(usage) 19 | System.exit(0) 20 | } 21 | 22 | val argList = args.toList 23 | val options = ImportCommon.nextOption(Map(), argList) 24 | 25 | val conf = new SparkConf().setAppName("EdgeImport") 26 | //setMaster("local") 本机的spark就用local,远端的就写ip 27 | //如果是打成jar包运行则需要去掉 setMaster("local")因为在参数中会指定。 28 | // conf.setMaster("local") 29 | 30 | val sc = new SparkContext(conf) 31 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String]) 32 | val hosts = options.getOrElse('janusgraphHosts, "").asInstanceOf[String] 33 | val port = options.getOrElse('janusgraphPort, 0).asInstanceOf[Int] 34 | val batchSize = options.getOrElse('batchSize, 50).asInstanceOf[Int] 35 | val poolSize = options.getOrElse('poolSize, 16).asInstanceOf[Int] 36 | 37 | txtFile.map { 38 | line => 39 | val fields = line.replace("\"", "").split(",") 40 | // "1870276152746","CALL","18602761525746" 41 | // "13512340050","CALL","15607804358",1 42 | // CALL 边有 mgm 属性 43 | if (fields.length == 4 && "CALL".equals(fields(1)) && !"\\N".equals(fields(3))) { 44 | (fields(0), fields(1), fields(2), Some(fields(3))) 45 | } else { 46 | (fields(0), fields(1), fields(2), None) 47 | } 48 | 49 | }.foreachPartition(partitionOfRecords => { 50 | val provider = ImportCommon.getJanusGraph(hosts, port, poolSize) 51 | val client = provider.getClient 52 | 53 | var cqlList: List[String] = List() 54 | partitionOfRecords.foreach(record => { 55 | var edgeCql = "" 56 | if (record._2 == "CALL" && record._4.nonEmpty) { 57 | edgeCql = ".V().has('name','" + record._1 + "').as('a').V().has('name','" + record._3 + "').addE('" + record._2 + "').from('a').property('mgm'," + record._4.get + ")" 58 | } else { 59 | edgeCql = ".V().has('name','" + record._1 + "').as('a').V().has('name','" + record._3 + "').addE('" + record._2 + "').from('a')" 60 | } 61 | cqlList = edgeCql :: cqlList 62 | if (cqlList.size >= batchSize) { 63 | handleEdgeList(cqlList, client) 64 | cqlList = List() 
65 | } 66 | }) 67 | 68 | handleEdgeList(cqlList, client) 69 | client.close() 70 | provider.close() 71 | }) 72 | println("***********************stoped***********************") 73 | sc.stop() 74 | } 75 | 76 | private def handleEdgeList(cqlList: List[String], client: Client): Unit = { 77 | var runCql = "g = graph.traversal();g" 78 | 79 | cqlList.foreach(cql => runCql += cql) 80 | if (cqlList.nonEmpty) { 81 | runCql += ".count()" 82 | ImportCommon.submitWithRetry(client, runCql) 83 | } 84 | } 85 | 86 | } 87 | -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/direct/VertexImport.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap.direct 2 | 3 | import com.qihoo.finance.tap.ImportCommon 4 | import org.apache.log4j.{LogManager, Logger} 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | 7 | object VertexImport { 8 | 9 | val logger: Logger = LogManager.getLogger("VertexImport") 10 | 11 | val usage = 12 | """ 13 | Usage: VertexImport [--janusgraph-hosts 10.94.90.121] [--janusgraph-port 8182] [--batch-size 20] E:\360_doc\lolth\mobile.csv 14 | """ 15 | 16 | type OptionMap = Map[Symbol, Any] 17 | 18 | def main(args: Array[String]) { 19 | if (args.length == 0) { 20 | println(usage) 21 | System.exit(0) 22 | } 23 | 24 | val argList = args.toList 25 | val options = ImportCommon.nextOption(Map(), argList) 26 | 27 | val conf = new SparkConf().setAppName("VertexImport") 28 | //setMaster("local") 本机的spark就用local,远端的就写ip 29 | //如果是打成jar包运行则需要去掉 setMaster("local")因为在参数中会指定。 30 | // conf.setMaster("local") 31 | 32 | val sc = new SparkContext(conf) 33 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String]) 34 | val hosts = options.getOrElse('janusgraphHosts, "").asInstanceOf[String] 35 | val port = options.getOrElse('janusgraphPort, 8182).asInstanceOf[Int] 36 | val batchSize = options.getOrElse('batchSize, 50).asInstanceOf[Int] 37 | val poolSize = options.getOrElse('poolSize, 16).asInstanceOf[Int] 38 | 39 | 40 | txtFile.map { 41 | line => 42 | val labelLast = line.indexOf("[") 43 | val attrStart = line.indexOf("{") 44 | val label = line.substring(0, labelLast) 45 | val attrStr = line.substring(attrStart, line.length) 46 | (label.toUpperCase(), attrStr) 47 | }.foreachPartition(partitionOfRecords => { 48 | val provider = ImportCommon.getJanusGraph(hosts, port, poolSize) 49 | val client = provider.getClient 50 | 51 | var recordList: List[(String, String)] = List() 52 | partitionOfRecords.foreach(record => { 53 | if (!ImportCommon.isEmpty(record._1)) { 54 | recordList = (record._1, record._2) :: recordList 55 | if (recordList.size >= batchSize) { 56 | ImportCommon.handleVertexList(recordList, client) 57 | recordList = List() 58 | } 59 | } 60 | }) 61 | 62 | ImportCommon.handleVertexList(recordList, client) 63 | client.close() 64 | provider.close() 65 | }) 66 | 67 | println("***********************stoped***********************") 68 | sc.stop() 69 | } 70 | 71 | 72 | } 73 | -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/increment/EdgeImportIncrement.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap.increment 2 | 3 | import com.qihoo.finance.tap.ImportCommon 4 | import org.apache.log4j.{LogManager, Logger} 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | import 
org.apache.tinkerpop.gremlin.driver.Client 7 | 8 | object EdgeImportIncrement { 9 | val logger: Logger = LogManager.getLogger("EdgeImportIncrement") 10 | 11 | val usage = 12 | """ 13 | 边的增量导入 14 | Usage: EdgeImportIncrement [--janusgraph-hosts 10.94.90.121] [--janusgraph-port 8182] E:\360_doc\lolth\call_edge.csv 15 | """ 16 | 17 | def main(args: Array[String]) { 18 | if (args.length == 0) { 19 | println(usage) 20 | System.exit(0) 21 | } 22 | 23 | val argList = args.toList 24 | val options = ImportCommon.nextOption(Map(), argList) 25 | 26 | val conf = new SparkConf().setAppName("EdgeImportIncrement") 27 | //setMaster("local") 本机的spark就用local,远端的就写ip 28 | //如果是打成jar包运行则需要去掉 setMaster("local")因为在参数中会指定。 29 | // conf.setMaster("local") 30 | 31 | val sc = new SparkContext(conf) 32 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String]) 33 | val hosts = options.getOrElse('janusgraphHosts, "").asInstanceOf[String] 34 | val port = options.getOrElse('janusgraphPort, 8182).asInstanceOf[Int] 35 | val batchSize = options.getOrElse('batchSize, 50).asInstanceOf[Int] 36 | val poolSize = options.getOrElse('poolSize, 16).asInstanceOf[Int] 37 | 38 | txtFile.map { 39 | line => 40 | val fields = line.replace("\"", "").split(",") 41 | // "1870276152746","CALL","18602761525746" 42 | // "13512340050","CALL","15607804358",1 43 | // CALL 边有 mgm 属性 44 | if (fields.length == 4 && "CALL".equals(fields(1)) && !"\\N".equals(fields(3))) { 45 | (fields(0), fields(1), fields(2), Some(fields(3))) 46 | } else { 47 | (fields(0), fields(1), fields(2), None) 48 | } 49 | 50 | }.foreachPartition(partitionOfRecords => { 51 | val provider = ImportCommon.getJanusGraph(hosts, port, poolSize) 52 | val client = provider.getClient 53 | 54 | var cqlList: List[String] = List() 55 | partitionOfRecords.foreach(record => { 56 | var edgeCql = ".V().has('name','" + record._1 + "').as('a').V().has('name','" + record._3 + "')" + 57 | ".coalesce(inE('" + record._2 + "').where(outV().as('a')), addE('" + record._2 + "').from('a'))" 58 | 59 | if (record._2 == "CALL" && record._4.nonEmpty) { 60 | edgeCql += ".property('mgm'," + record._4.get + ")" 61 | } 62 | 63 | cqlList = edgeCql :: cqlList 64 | if (cqlList.size >= batchSize) { 65 | handleEdgeList(cqlList, client) 66 | cqlList = List() 67 | } 68 | }) 69 | 70 | handleEdgeList(cqlList, client) 71 | client.close() 72 | provider.close() 73 | }) 74 | println("***********************stoped***********************") 75 | sc.stop() 76 | } 77 | 78 | private def handleEdgeList(cqlList: List[String], client: Client): Unit = { 79 | var runCql = "g = graph.traversal();g" 80 | 81 | cqlList.foreach(cql => runCql += cql) 82 | if (cqlList.nonEmpty) { 83 | runCql += ".count()" 84 | ImportCommon.submitWithRetry(client, runCql) 85 | } 86 | } 87 | 88 | } 89 | -------------------------------------------------------------------------------- /dataImport/src/main/scala/com/qihoo/finance/tap/increment/VertexImportIncrement.scala: -------------------------------------------------------------------------------- 1 | package com.qihoo.finance.tap.increment 2 | 3 | import java.util 4 | 5 | import com.qihoo.finance.tap.{Helper, ImportCommon} 6 | import org.apache.log4j.{LogManager, Logger} 7 | import org.apache.spark.{SparkConf, SparkContext} 8 | import org.apache.tinkerpop.gremlin.driver.{Client, Result} 9 | 10 | object VertexImportIncrement { 11 | 12 | val logger: Logger = LogManager.getLogger("VertexImportIncrement") 13 | 14 | val usage = 15 | """ 16 | 顶点增量导入,判断属性 17 | Usage: VertexImportIncrement 
[--janusgraph-hosts 10.94.90.121] [--janusgraph-port 8182] [--batch-size 20] E:\360_doc\lolth\mobile.csv 18 | """ 19 | 20 | type OptionMap = Map[Symbol, Any] 21 | 22 | def main(args: Array[String]) { 23 | if (args.length == 0) { 24 | println(usage) 25 | System.exit(0) 26 | } 27 | 28 | val argList = args.toList 29 | val options = ImportCommon.nextOption(Map(), argList) 30 | 31 | val conf = new SparkConf().setAppName("VertexImportIncrement") 32 | //setMaster("local") 本机的spark就用local,远端的就写ip 33 | //如果是打成jar包运行则需要去掉 setMaster("local")因为在参数中会指定。 34 | // conf.setMaster("local") 35 | 36 | val sc = new SparkContext(conf) 37 | val txtFile = sc.textFile(options.getOrElse('importFile, "").asInstanceOf[String]) 38 | val hosts = options.getOrElse('janusgraphHosts, "").asInstanceOf[String] 39 | val port = options.getOrElse('janusgraphPort, 8182).asInstanceOf[Int] 40 | val batchSize = options.getOrElse('batchSize, 50).asInstanceOf[Int] 41 | val poolSize = options.getOrElse('poolSize, 16).asInstanceOf[Int] 42 | 43 | 44 | txtFile.map { 45 | line => 46 | val labelLast = line.indexOf("[") 47 | val attrStart = line.indexOf("{") 48 | val label = line.substring(0, labelLast) 49 | val attrStr = line.substring(attrStart, line.length) 50 | (label.toUpperCase(), attrStr) 51 | }.foreachPartition(partitionOfRecords => { 52 | val provider = ImportCommon.getJanusGraph(hosts, port, poolSize) 53 | val client = provider.getClient 54 | var recordList: List[(String, String)] = List() 55 | 56 | partitionOfRecords.foreach(record => { 57 | if (!ImportCommon.isEmpty(record._1)) { 58 | 59 | recordList = (record._1, record._2) :: recordList 60 | if (recordList.size >= batchSize) { 61 | ImportCommon.handleVertexIncrementList(recordList, client) 62 | recordList = List() 63 | } 64 | 65 | } 66 | }) 67 | 68 | ImportCommon.handleVertexIncrementList(recordList, client) 69 | client.close() 70 | provider.close() 71 | }) 72 | 73 | println("***********************stoped***********************") 74 | sc.stop() 75 | } 76 | 77 | def isVertexExist(record: (String, String), client: Client): Boolean = { 78 | val jsonObject = Helper.getVertexProperty(record._2) 79 | val name = jsonObject.getString("name") 80 | val cql = "g.V().has('name','" + name + "').count()" 81 | 82 | val results: util.List[Result] = ImportCommon.getResultWithRetry(client, cql) 83 | // val results: util.List[Result] = client.submit(cql).all.get 84 | 85 | if (results != null && results.size() > 0 && results.get(0).getInt > 0) { 86 | return true 87 | } 88 | false 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /janusgraph_yarn.md: -------------------------------------------------------------------------------- 1 | # Janusgraph Yarn Configuration 2 | 3 | 此文档旨在说明 JanusGraph 如何集成 yarn 4 | 5 | ## 环境路径 6 | CDH的安装目录 /opt/cloudera/parcels/CDH/ 7 | CDH的配置文件目录 /etc/hadoop 8 | 9 | ## 下载 10 | spark-2.2.1-bin-hadoop2.7 11 | janusgraph-0.3.2-hadoop2 12 | 13 | 14 | ## Jar 包冲突解决 15 | spark-2.2.1-bin-hadoop2.7 依赖的 guava-14.0.1.jar 与 16 | janusgraph 依赖的 guava-18.0.jar 存着冲突。使用 guava-18.0.jar 17 | 18 | rm -f spark-2.2.1-bin-hadoop2.7/jars/guava-*.jar 19 | 20 | cp janusgraph/lib/guava-18.0.jar spark/jars/ 21 | 22 | ## 修改 bin/gremlin.sh 23 | ```bash 24 | export CLASSPATH="$CLASSPATH:/etc/hadoop/conf/*:/opt/cloudera/parcels/CDH/lib/hadoop-yarn/*:/home/q/spark/jars/*" 25 | ``` 26 | 27 | ## 文件配置 28 | gremlin_yan.sh 29 | ```bash 30 | #!/bin/bash 31 | export HADOOP_CONF_DIR=/etc/hadoop/conf 32 | 33 | export CLASSPATH=$CLASSPATH:$HADOOP_CONF_DIR 34 | # 
关键,会从此目录加载依赖的 spark和yarn jar包 janusgraph 提供的spark jar包不全 35 | export SPARK_HOME=/home/q/spark 36 | 37 | export PATH=$PATH:$SPARK_HOME/bin 38 | bin/gremlin.sh 39 | ``` -------------------------------------------------------------------------------- /optimize.md: -------------------------------------------------------------------------------- 1 | # JanusGraph 查询优化 2 | 3 | JanusGraph 的查询有比较多的优化点,在此做些说明 4 | 5 | ## 属性 _multiPreFetch 优化 6 | 这是个人认为最重要的优化,0.4.0 版本才提供的功能,没有这个功能 JanusGraph 在大数据量的生产环境基本不可用 7 | 8 | ```bash 9 | g.V().has('name', P.within('186xxxx6666')).both('CALL').or(has('is_register','true'), has('is_risk','true')).as('m2').profile() 10 | ``` 11 | 类似上面的语句,没有这个优化,Jansugraph 会找到对端的顶点然后每个顶点单独去获取属性再做过滤条件 12 | 在生产获取的顶点数很多的时候基本不可用 13 | 耗时特别长 14 | 触发了这个优化的话它会批量获取顶点的属性然后做过滤 15 | ```bash 16 | gremlin> g.V(6554048).outE('aggregation').otherV().has('name', neq('bob')).count().profile() 17 | ==>Traversal Metrics 18 | Step Count Traversers Time (ms) % Dur 19 | ============================================================================================================= 20 | GraphStep(vertex,[6554048]) 1 1 35.538 0.15 21 | JanusGraphVertexStep(OUT,[aggregation],vertex) 30159 30159 2220.394 9.28 22 | \_condition=(PROPERTY AND visibility:normal) 23 | \_orders=[] 24 | \_isFitted=true 25 | \_isOrdered=true 26 | \_query=org.janusgraph.diskstorage.keycolumnvalue.SliceQuery@8019d62e 27 | \_multi=true 28 | \_vertices=20000 29 | \_multiPreFetch=true 30 | optimization 82.480 31 | backend-query 30159 275.560 32 | \_query=org.janusgraph.diskstorage.keycolumnvalue.SliceQuery@81bebe6b 33 | optimization 0.712 34 | backend-query 257398 1491.029 35 | \_query=org.janusgraph.diskstorage.keycolumnvalue.SliceQuery@8019d62e 36 | HasStep([name.neq(bob)]) 28054 28054 21612.923 90.33 37 | CountGlobalStep 1 1 56.938 0.24 38 | >TOTAL - - 23925.795 - 39 | ``` 40 | 在profile `_multiPreFetch=true` 表示触发了这个优化 41 | 这个优化触发的条件有点苛刻 42 | 首先需要在配置文件中配置 `query.batch-property-prefetch=true` 43 | 其次需要利用`has`进行属性过滤 44 | 再者查询出来的数据行数不能超过 配置文件中 `cache.tx-cache-size` 设置的值(默认值为2W) 45 | 意思是如果查询出点超过设置值就不会触发这个优化 46 | 更加详细的信息可以参考如下[链接](https://github.com/JanusGraph/janusgraph/issues/984) 47 | 48 | ## 返回结果优化 49 | ```bash 50 | g.V().has("MOBILE", "name", P.within('186xxxx6666')).as("m1").both("CALL").as('m2') \ 51 | .select("m1", "m2") \ 52 | .by(valueMap("name")) \ 53 | .by(valueMap("name", "is_risk", "status", "is_service", "overdue_status")) 54 | ``` 55 | 上面的查询返回的结果是 56 | ```bash 57 | {m1={name=[18658606666]}, m2={name=[13064767986]}} 58 | {m1={name=[18658606666]}, m2={name=[13291676581]}} 59 | {m1={name=[18658606666]}, m2={name=[13566665915]}} 60 | {m1={name=[18658606666]}, m2={name=[15072770149]}} 61 | {m1={name=[18658606666]}, m2={name=[15268898802]}} 62 | {m1={name=[18658606666]}, m2={name=[18657617779], status=[3]}} 63 | ``` 64 | 这样的查询返回结果看似没啥问题,我们之前也是这样写的,这样查询语法比较简洁。 65 | 这个查询有两个问题,我们生产的数据量比较大,并且涉及大量的查询。在生产环境应用的内存很快被打满 66 | 这个查询有两个问题,第一个问题是 name返回的是list,在`Java`中 `ArrayList`的默认值是 10,意思是即使你属性只有一个值 67 | 也会创建10个对象 68 | 第二个问题是,返回的Map 过多,m1, m2 这两个Map虽然只有一个key,但是在`Java` 中`HashMap`的默认值 16, 69 | 同上面的问题会导致大量的内存浪费 70 | ```java 71 | public class ArrayList extends AbstractList 72 | implements List, RandomAccess, Cloneable, java.io.Serializable 73 | { 74 | private static final long serialVersionUID = 8683452581122892189L; 75 | 76 | /** 77 | * Default initial capacity. 78 | */ 79 | private static final int DEFAULT_CAPACITY = 10; 80 | ... 
81 | } 82 | 83 | 84 | public class HashMap extends AbstractMap 85 | implements Map, Cloneable, Serializable { 86 | 87 | private static final long serialVersionUID = 362498820763181265L; 88 | 89 | /** 90 | * The default initial capacity - MUST be a power of two. 91 | */ 92 | static final int DEFAULT_INITIAL_CAPACITY = 1 << 4; // aka 16 93 | ... 94 | } 95 | ``` 96 | 正确的查询语句应该是像下面这样的 97 | ```bash 98 | g.V().has("MOBILE", "name", P.within('186xxxx6666')).as("m1").both("CALL").as('m2') \ 99 | .select("m1", "m2") \ 100 | .project("cName", "mobile", "isRisk", "isService") \ 101 | .by(select("m1").by(coalesce(values("name"), constant("null"))) ) \ 102 | .by(select("m2").by(coalesce(values("name"), constant("null"))) ) \ 103 | .by(select("m2").by(coalesce(values("is_risk"), constant("null"))) ) \ 104 | .by(select("m2").by(coalesce(values("is_service"), constant("null"))) ) \ 105 | # 返回结果为一个 map 并且结果不为 list 106 | {cName=186xxxx6666, mobile=186xxxx6666, isRisk=true, status=0} 107 | ``` 108 | 如果确实需要使用`valueMap` 并且不希望返回 List结果可以用下面的语法 109 | ```bash 110 | valueMap().by(unfold()) 111 | ``` 112 | ## 插入顶点和边的重复检查 113 | ```bash 114 | g.V().has("name", nodeName).fold().coalesce(unfold(), addV("MOBILE").property("name", nodeName)).next(); 115 | ``` 116 | 上面的语句类似merge功能,如果顶点不存在添加顶点 117 | 118 | ```bash 119 | g.V(fromNode).as("a").V(toNode).coalesce(inE(relationLabel).where(outV().as("a")), addE(relationLabel).from("a")) 120 | ``` 121 | 上面的语句会检查两个顶点之间的边是否存着,如果不存在添加对应的边 122 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.qihoo.finance 8 | janusgraph 9 | pom 10 | 1.0.0-SNAPSHOT 11 | 12 | 13 | dataImport 14 | 15 | 16 | 17 | UTF-8 18 | 1.8 19 | false 20 | 21 | 1.9 22 | 1.8.3 23 | 2.2.1 24 | 2.11 25 | 2.6.5 26 | 1.2.6 27 | 28 | 29 | 30 | 31 | 32 | 33 | org.apache.maven.plugins 34 | maven-compiler-plugin 35 | 3.3 36 | 37 | ${java.version} 38 | ${java.version} 39 | true 40 | 41 | 42 | 43 | 44 | org.apache.maven.plugins 45 | maven-deploy-plugin 46 | 47 | ${skip_maven_deploy} 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | --------------------------------------------------------------------------------
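Note on running the Spark jobs above: every importer keeps `conf.setMaster("local")` commented out so that the master is supplied at launch time. A minimal `spark-submit` sketch for `VertexImport` on YARN is shown below; the jar path, the resource sizes and the HDFS input location are placeholders to adapt to your own environment, while `--janusgraph-hosts`, `--janusgraph-port` and `--batch-size` are the options accepted by the job's usage string.

```bash
spark-submit \
  --class com.qihoo.finance.tap.direct.VertexImport \
  --master yarn \
  --deploy-mode client \
  --num-executors 10 \
  --executor-memory 6g \
  --executor-cores 1 \
  /path/to/dataImport-libs.jar \
  --janusgraph-hosts 10.94.90.121 \
  --janusgraph-port 8182 \
  --batch-size 20 \
  hdfs:///tmp/lolth/mobile.csv
```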