├── .github ├── CODEOWNERS └── workflows │ └── ci.yml ├── .gitignore ├── .scalafmt.conf ├── LICENSE.txt ├── NOTICE.txt ├── README.md ├── checkstyle.xml ├── code_of_conduct.md ├── contributing.md ├── data-schema.png ├── ldbc-logo.png ├── logs └── .gitignore ├── paramgen ├── parameter_curation.py ├── search_params.py └── time_select.py ├── pom.xml ├── scripts ├── get-spark-to-home.sh ├── run.py ├── run_cluster.sh ├── run_local.sh └── run_paramgen.sh ├── src ├── main │ ├── java │ │ └── ldbc │ │ │ └── finbench │ │ │ └── datagen │ │ │ ├── config │ │ │ ├── ConfigParser.java │ │ │ ├── DatagenConfiguration.java │ │ │ ├── ScaleFactor.java │ │ │ └── ScaleFactors.java │ │ │ ├── entities │ │ │ ├── DynamicActivity.java │ │ │ ├── edges │ │ │ │ ├── CompanyApplyLoan.java │ │ │ │ ├── CompanyGuaranteeCompany.java │ │ │ │ ├── CompanyInvestCompany.java │ │ │ │ ├── CompanyOwnAccount.java │ │ │ │ ├── Deposit.java │ │ │ │ ├── PersonApplyLoan.java │ │ │ │ ├── PersonGuaranteePerson.java │ │ │ │ ├── PersonInvestCompany.java │ │ │ │ ├── PersonOwnAccount.java │ │ │ │ ├── Repay.java │ │ │ │ ├── SignIn.java │ │ │ │ ├── Transfer.java │ │ │ │ └── Withdraw.java │ │ │ ├── nodes │ │ │ │ ├── Account.java │ │ │ │ ├── Company.java │ │ │ │ ├── Loan.java │ │ │ │ ├── Medium.java │ │ │ │ ├── Person.java │ │ │ │ └── PersonOrCompany.java │ │ │ └── place │ │ │ │ └── Place.java │ │ │ ├── generation │ │ │ ├── DatagenContext.java │ │ │ ├── DatagenParams.java │ │ │ ├── dictionary │ │ │ │ ├── CommonTextDictionary.java │ │ │ │ ├── Dictionaries.java │ │ │ │ ├── EmailDictionary.java │ │ │ │ ├── NumbersGenerator.java │ │ │ │ ├── PercentageTextDictionary.java │ │ │ │ ├── PersonNameDictionary.java │ │ │ │ ├── PlaceDictionary.java │ │ │ │ └── PlaceZOrder.java │ │ │ ├── distribution │ │ │ │ ├── AccountDeleteDistribution.java │ │ │ │ ├── Bucket.java │ │ │ │ ├── DegreeDistribution.java │ │ │ │ ├── GeometricDistribution.java │ │ │ │ ├── MultiplicityDistribution.java │ │ │ │ ├── PowerLawActivityDeleteDistribution.java │ │ │ │ 
├── PowerLawBucketsDistribution.java │ │ │ │ ├── PowerLawFormulaDistribution.java │ │ │ │ └── TimeDistribution.java │ │ │ ├── events │ │ │ │ ├── AccountActivitiesEvent.java │ │ │ │ ├── CompanyActivitiesEvent.java │ │ │ │ ├── CompanyInvestEvent.java │ │ │ │ ├── LoanActivitiesEvents.java │ │ │ │ ├── PersonActivitiesEvent.java │ │ │ │ ├── PersonInvestEvent.java │ │ │ │ └── SignInEvent.java │ │ │ └── generators │ │ │ │ ├── AccountGenerator.java │ │ │ │ ├── CompanyGenerator.java │ │ │ │ ├── DateGenerator.java │ │ │ │ ├── LoanGenerator.java │ │ │ │ ├── MediumGenerator.java │ │ │ │ └── PersonGenerator.java │ │ │ └── util │ │ │ ├── DateTimeUtils.java │ │ │ ├── RandomGeneratorFarm.java │ │ │ └── ZOrder.java │ ├── resources │ │ ├── README.md │ │ ├── dictionaries │ │ │ ├── accountLevels.txt │ │ │ ├── accountNicknames.txt │ │ │ ├── accountTypes.txt │ │ │ ├── businessTypes.txt │ │ │ ├── citiesByCountry.txt │ │ │ ├── companyNames.txt │ │ │ ├── dicLocations.txt │ │ │ ├── emails.txt │ │ │ ├── goodsTypes.txt │ │ │ ├── guaranteeRelationships.txt │ │ │ ├── loanOrganizations.txt │ │ │ ├── loanUsages.txt │ │ │ ├── mediumNames.txt │ │ │ ├── payTypes.txt │ │ │ ├── randomText.txt │ │ │ ├── riskLevels.txt │ │ │ ├── surnames.txt │ │ │ └── urls.txt │ │ ├── distributions │ │ │ ├── accountDelete.txt │ │ │ ├── facebookPowerlawBucket.dat │ │ │ ├── hourDistribution.dat │ │ │ ├── inDegreeRegression.txt │ │ │ ├── multiplicityPowerlawRegression.txt │ │ │ ├── outDegreeRegression.txt │ │ │ └── powerLawActivityDeleteDate.txt │ │ ├── log4j.properties │ │ ├── params_default.ini │ │ └── scale_factors.xml │ └── scala │ │ └── ldbc │ │ └── finbench │ │ └── datagen │ │ ├── LdbcDatagen.scala │ │ ├── factors │ │ ├── AccountItemsGenerator.scala │ │ ├── FactorGenerationStage.scala │ │ └── FactorTable.scala │ │ ├── generation │ │ ├── ActivitySimulator.scala │ │ ├── GenerationStage.scala │ │ ├── generators │ │ │ ├── ActivityGenerator.scala │ │ │ ├── SparkAccountGenerator.scala │ │ │ ├── SparkCompanyGenerator.scala 
│ │ │ ├── SparkMediumGenerator.scala │ │ │ └── SparkPersonGenerator.scala │ │ └── serializers │ │ │ └── ActivitySerializer.scala │ │ ├── io │ │ ├── Reader.scala │ │ ├── Writer.scala │ │ ├── dataframes.scala │ │ ├── graphs.scala │ │ └── raw │ │ │ └── package.scala │ │ ├── model │ │ ├── package.scala │ │ └── raw.scala │ │ ├── syntax │ │ ├── FluentSyntax.scala │ │ ├── PathSyntax.scala │ │ ├── SparkSqlSyntax.scala │ │ └── package.scala │ │ ├── transformation │ │ └── TransformationStage.scala │ │ └── util │ │ ├── Logging.scala │ │ ├── SparkApp.scala │ │ ├── SparkEnv.scala │ │ ├── SparkUI.scala │ │ ├── package.scala │ │ └── sql.scala └── test │ ├── java │ └── ldbc │ │ └── finbench │ │ └── datagen │ │ ├── generators │ │ ├── DistributionTest.java │ │ └── GeneratorTest.java │ │ └── util │ │ └── GeneralTest.java │ └── scala │ └── ldbc │ └── finbench │ └── datagen │ └── util │ └── UtilPackageSuite.scala ├── tools ├── README.md ├── check_consistency.py ├── check_deletion.py ├── check_duplicate.py ├── check_transfer.py ├── legacy │ ├── dataprofiler │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── algo.h │ │ ├── compile.sh │ │ ├── de_core.cpp │ │ ├── plot.py │ │ ├── profiler.cpp │ │ ├── result │ │ │ ├── db139 │ │ │ │ ├── edges.txt │ │ │ │ ├── in-out.txt │ │ │ │ ├── in_degree_dist.png │ │ │ │ ├── in_degree_dist.txt │ │ │ │ ├── in_degree_dist_regression.png │ │ │ │ ├── in_degree_dist_regression.txt │ │ │ │ ├── out-in.txt │ │ │ │ ├── out_degree_dist.png │ │ │ │ ├── out_degree_dist.txt │ │ │ │ ├── out_degree_dist_regression.png │ │ │ │ └── out_degree_dist_regression.txt │ │ │ ├── db177 │ │ │ │ ├── in-out.txt │ │ │ │ ├── in_degree_dist.png │ │ │ │ ├── in_degree_dist.txt │ │ │ │ ├── in_degree_dist_regression.png │ │ │ │ ├── in_degree_dist_regression.txt │ │ │ │ ├── out-in.txt │ │ │ │ ├── out_degree_dist.png │ │ │ │ ├── out_degree_dist.txt │ │ │ │ ├── out_degree_dist_regression.png │ │ │ │ └── out_degree_dist_regression.txt │ │ │ ├── db184 │ │ │ │ ├── edges.txt │ │ │ 
│ ├── in-out.txt │ │ │ │ ├── in_degree_dist.png │ │ │ │ ├── in_degree_dist.txt │ │ │ │ ├── in_degree_dist_regression.png │ │ │ │ ├── in_degree_dist_regression.txt │ │ │ │ ├── out-in.txt │ │ │ │ ├── out_degree_dist.png │ │ │ │ ├── out_degree_dist.txt │ │ │ │ ├── out_degree_dist_regression.png │ │ │ │ └── out_degree_dist_regression.txt │ │ │ ├── hubvertex_indeg │ │ │ │ ├── hub_indeg_1.png │ │ │ │ ├── hub_indeg_1.txt │ │ │ │ ├── hub_indeg_1_regression.png │ │ │ │ ├── hub_indeg_1_regression.txt │ │ │ │ ├── hub_indeg_2.png │ │ │ │ ├── hub_indeg_2.txt │ │ │ │ ├── hub_indeg_2_regression.png │ │ │ │ ├── hub_indeg_2_regression.txt │ │ │ │ ├── hub_indeg_3.png │ │ │ │ ├── hub_indeg_3.txt │ │ │ │ ├── hub_indeg_3_regression.png │ │ │ │ ├── hub_indeg_3_regression.txt │ │ │ │ ├── hub_indeg_4.png │ │ │ │ ├── hub_indeg_4.txt │ │ │ │ ├── hub_indeg_4_regression.png │ │ │ │ ├── hub_indeg_4_regression.txt │ │ │ │ ├── hub_indeg_5.png │ │ │ │ ├── hub_indeg_5.txt │ │ │ │ ├── hub_indeg_5_regression.png │ │ │ │ └── hub_indeg_5_regression.txt │ │ │ ├── hubvertex_outdeg │ │ │ │ ├── hub_outdeg_1.png │ │ │ │ ├── hub_outdeg_1.txt │ │ │ │ ├── hub_outdeg_1_regression.png │ │ │ │ ├── hub_outdeg_1_regression.txt │ │ │ │ ├── hub_outdeg_2.png │ │ │ │ ├── hub_outdeg_2.txt │ │ │ │ ├── hub_outdeg_2_regression.png │ │ │ │ ├── hub_outdeg_2_regression.txt │ │ │ │ ├── hub_outdeg_3.png │ │ │ │ ├── hub_outdeg_3.txt │ │ │ │ ├── hub_outdeg_3_regression.png │ │ │ │ ├── hub_outdeg_3_regression.txt │ │ │ │ ├── hub_outdeg_4.png │ │ │ │ ├── hub_outdeg_4.txt │ │ │ │ ├── hub_outdeg_4_regression.png │ │ │ │ ├── hub_outdeg_4_regression.txt │ │ │ │ ├── hub_outdeg_5.png │ │ │ │ ├── hub_outdeg_5.txt │ │ │ │ ├── hub_outdeg_5_regression.png │ │ │ │ └── hub_outdeg_5_regression.txt │ │ │ └── transfer │ │ │ │ ├── in-out.txt │ │ │ │ ├── in_degree_dist.png │ │ │ │ ├── in_degree_dist.txt │ │ │ │ ├── in_degree_dist_regression.png │ │ │ │ ├── in_degree_dist_regression.txt │ │ │ │ ├── out-in.txt │ │ │ │ ├── out_degree_dist.png │ │ │ │ 
├── out_degree_dist.txt │ │ │ │ ├── out_degree_dist_regression.png │ │ │ │ └── out_degree_dist_regression.txt │ │ └── wcc_core.cpp │ ├── factorgen │ │ ├── factor_table.sh │ │ ├── generate_account.py │ │ ├── loan.py │ │ ├── params_gen.properties │ │ ├── params_gen.py │ │ ├── split_amount.py │ │ └── time_split.py │ └── graphgen │ │ ├── Makefile │ │ ├── README.md │ │ └── graph_gen.c ├── merge_cluster_output.py ├── statistic.py └── validate_formula.py └── transformation ├── .gitignore ├── convert_data.py ├── install-dependencies.sh ├── readwrites.sql ├── snapshot.sql ├── transform.sh └── writes.sql /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # * @qishipengqsp -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | pull_request: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | verify: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Set up JDK 1.8 16 | uses: actions/setup-java@v3 17 | with: 18 | java-version: '8' 19 | distribution: 'adopt' 20 | - name: Cache local Maven repository 21 | uses: actions/cache@v3 22 | with: 23 | path: ~/.m2/repository 24 | key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} 25 | restore-keys: | 26 | ${{ runner.os }}-maven- 27 | - name: Build with Maven 28 | run: mvn --batch-mode --update-snapshots verify -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # project env files 2 | .DS_Store 3 | *.iml 4 | *.log 5 | target/ 6 | .idea/ 7 | .vscode/ 8 | .metals/ 9 | .bloop/ 10 | 11 | # local generated data 12 | out*/ 13 | out*.tar 14 | out*.tar.gz 15 | 16 | # tune 17 | tune/ 18 | tune.log 19 | 20 | # scripts 21 | scripts/*.log 22 | scripts/*.png 23 | 
24 | sf*/ 25 | sf*.tar 26 | sf*.tar.gz 27 | 28 | paramgen/__pycache__/ 29 | tools/paramgen/__pycache__/ -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = "3.7.15" 2 | runner.dialect = scala213 -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | Copyright [2020-]2022 Linked Data Benchmark Council 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![logo](ldbc-logo.png) 2 | 3 | # FinBench DataGen 4 | 5 | ![Build status](https://github.com/ldbc/ldbc_finbench_datagen/actions/workflows/ci.yml/badge.svg?branch=main) 6 | 7 | The LDBC FinBench Data Generator (Datagen) produces the datasets for 8 | the [LDBC FinBench's workloads](https://ldbcouncil.org/benchmarks/finbench/). 9 | 10 | This data generator produces labelled directed property graphs based on the simulation of financial activities in 11 | business systems. The key features include generation, factorization and transformation.
A detailed description of the 12 | schema produced by Datagen, as well as the format of the output files, can be found in the latest version of official 13 | LDBC FinBench specification document. 14 | 15 | ## DataGen Design 16 | 17 | ### Data Schema 18 | 19 | ![Schema](./data-schema.png) 20 | 21 | ### Implementation 22 | 23 | - Generation: Generation simulates financial activities in business systems to produce the raw data. 24 | - Factorization: Factorization profiles of the raw data to produce factor tables used for further parameter curation. 25 | - Transformation: Transformation transforms the raw data to the data for SUT and benchmark driver. 26 | 27 | Note: 28 | 29 | - Generation and Factorization are implemented in Scala while transformation is implemented in Python 30 | under `transformation/`. 31 | - SUT stands for System Under Test. 32 | 33 | ## Quick Start 34 | 35 | ### Pre-requisites 36 | 37 | - Java 8 installed. 38 | - Python3 and related packages installed. See each `install-dependencies.sh` for details. 39 | - Scala 2.12, note that it will be integrated when maven builds. 40 | - Spark deployed. Spark 3.2.x is the recommended runtime to use. The rest of the instructions are provided assuming 41 | Spark 3.2.x. 42 | 43 | ### Workflow 44 | 45 | - Use the spark application to generate the factor tables and raw data. 46 | - Use the python scripts to transform the data to snapshot data and write queries. 47 | 48 | ### Generation of Raw Data 49 | 50 | - Deploy Spark 51 | - use `scripts/get-spark-to-home.sh` to download pre-built spark to home directory and then decompress it. 52 | - Set the PATH environment variable to include the Spark binaries. 53 | - Build the project 54 | - run `mvn clean package -DskipTests` to package the artifacts. 55 | - Run locally with scripts 56 | - See `scripts/run_local.sh` for details. It uses spark-submit to run the data generator. Please make sure you have 57 | the pre-requisites installed and the build is successful. 
58 | - Run in cloud: To be supported 59 | - Run in cluster: To be supported 60 | 61 | ### Transformation of Raw Data 62 | 63 | - set the `${FinBench_DATA_ROOT}` variable in `transformation/transform.sh` and run. 64 | 65 | ## TroubleShooting 66 | 67 | N/A yet 68 | 69 | # Related Work 70 | 71 | - FinBench Specification: https://github.com/ldbc/ldbc_finbench_docs 72 | - FinBench Driver: https://github.com/ldbc/ldbc_finbench_driver 73 | - FinBench Reference Implementation: https://github.com/ldbc/ldbc_finbench_transaction_impls 74 | - FinBench ACID Suite: https://github.com/ldbc/finbench-acid 75 | 76 | -------------------------------------------------------------------------------- /code_of_conduct.md: -------------------------------------------------------------------------------- 1 | For our code of conduct, see: https://github.com/ldbc/community/blob/main/code_of_conduct.md 2 | -------------------------------------------------------------------------------- /contributing.md: -------------------------------------------------------------------------------- 1 | For our contributor's guide, see: https://github.com/ldbc/community/blob/main/contributing.md -------------------------------------------------------------------------------- /data-schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/data-schema.png -------------------------------------------------------------------------------- /ldbc-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/ldbc-logo.png -------------------------------------------------------------------------------- /logs/.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | */*.log 3 | *.png 
-------------------------------------------------------------------------------- /scripts/get-spark-to-home.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eu 4 | cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 5 | 6 | curl https://archive.apache.org/dist/spark/spark-3.2.2/spark-3.2.2-bin-hadoop3.2.tgz | tar -xz -C ${HOME}/ 7 | -------------------------------------------------------------------------------- /scripts/run_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LDBC_FINBENCH_DATAGEN_JAR=target/ldbc_finbench_datagen-0.2.0-SNAPSHOT-jar-with-dependencies.jar 4 | OUTPUT_DIR=/tmp/finbench-out/ 5 | 6 | echo "start: " `date` 7 | 8 | # Run Spark Application 9 | 10 | #--conf "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version=2" \ 11 | # --num-executors 2 \ 12 | # --conf "spark.shuffle.service.enabled=true" \ 13 | # --conf "spark.dynamicAllocation.enabled=true" \ 14 | # --conf "spark.dynamicAllocation.minExecutors=1" \ 15 | # --conf "spark.dynamicAllocation.maxExecutors=10" \ 16 | # --conf "spark.yarn.maximizeResourceAllocation=true" \ 17 | # --conf "spark.memory.offHeap.enabled=true" \ 18 | # --conf "spark.memory.offHeap.size=100g" \ 19 | time spark-submit --master spark://finbench-large-00:7077 \ 20 | --class ldbc.finbench.datagen.LdbcDatagen \ 21 | --num-executors 2 \ 22 | --conf "spark.default.parallelism=800" \ 23 | --conf "spark.network.timeout=100000" \ 24 | --conf "spark.shuffle.compress=true" \ 25 | --conf "spark.shuffle.spill.compress=true" \ 26 | --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ 27 | --conf "spark.driver.memory=100g" \ 28 | --conf "spark.driver.maxResultSize=0" \ 29 | --conf "spark.executor.memory=400g" \ 30 | --conf "spark.executor.memoryOverheadFactor=0.5" \ 31 | --conf "spark.executor.extraJavaOptions=-XX:+UseG1GC" \ 32 | ${LDBC_FINBENCH_DATAGEN_JAR} \ 
33 | --scale-factor 100 \ 34 | --output-dir ${OUTPUT_DIR} 35 | 36 | echo "End: " `date` 37 | -------------------------------------------------------------------------------- /scripts/run_local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LDBC_FINBENCH_DATAGEN_JAR=target/ldbc_finbench_datagen-0.2.0-SNAPSHOT-jar-with-dependencies.jar 4 | OUTPUT_DIR=out 5 | 6 | # Note: generate factor tables with --generate-factors 7 | 8 | # run locally with the python script 9 | # time python3 scripts/run.py --jar $LDBC_FINBENCH_DATAGEN_JAR --main-class ldbc.finbench.datagen.LdbcDatagen --memory 500g -- --scale-factor 30 --output-dir ${OUTPUT_DIR} 10 | 11 | # run locally with spark-submit command 12 | # **({'spark.driver.extraJavaOptions': '-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005'}), # Debug 13 | # **({'spark.executor.extraJavaOptions': '-verbose:gc -XX:+UseG1GC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps'}), 14 | # --conf "spark.memory.offHeap.enabled=true" \ 15 | # --conf "spark.memory.offHeap.size=100g" \ 16 | # --conf "spark.storage.memoryFraction=0" \ 17 | # --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ 18 | 19 | time spark-submit --master local[*] \ 20 | --class ldbc.finbench.datagen.LdbcDatagen \ 21 | --driver-memory 480g \ 22 | --conf "spark.default.parallelism=500" \ 23 | --conf "spark.shuffle.compress=true" \ 24 | --conf "spark.shuffle.spill.compress=true" \ 25 | --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ 26 | --conf "spark.memory.offHeap.enabled=true" \ 27 | --conf "spark.memory.offHeap.size=100g" \ 28 | --conf "spark.storage.memoryFraction=0" \ 29 | --conf "spark.driver.maxResultSize=0" \ 30 | --conf "spark.executor.extraJavaOptions=-XX:+UseG1GC" \ 31 | ${LDBC_FINBENCH_DATAGEN_JAR} \ 32 | --scale-factor 10 \ 33 | --output-dir ${OUTPUT_DIR} 34 | 35 | # currently works on SF100 36 | #time spark-submit --master local[*] \ 37 | # 
--class ldbc.finbench.datagen.LdbcDatagen \ 38 | # --driver-memory 400g \ 39 | # --conf "spark.default.parallelism=800" \ 40 | # --conf "spark.shuffle.compress=true" \ 41 | # --conf "spark.shuffle.spill.compress=true" \ 42 | # --conf "spark.kryoserializer.buffer.max=512m" \ 43 | # --conf "spark.driver.maxResultSize=0" \ 44 | # --conf "spark.driver.extraJavaOptions=-Xss512m" \ 45 | # --conf "spark.executor.extraJavaOptions=-Xss512m -XX:+UseG1GC" \ 46 | # --conf "spark.kryo.referenceTracking=false" \ 47 | # ${LDBC_FINBENCH_DATAGEN_JAR} \ 48 | # --scale-factor 100 \ 49 | # --output-dir ${OUTPUT_DIR} 50 | 51 | -------------------------------------------------------------------------------- /scripts/run_paramgen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LDBC_FINBENCH_DATAGEN_JAR=target/ldbc_finbench_datagen-0.2.0-SNAPSHOT-jar-with-dependencies.jar 4 | OUTPUT_DIR=out/ 5 | 6 | # Note: generate factor tables with --generate-factors 7 | 8 | echo "start factor table generation" 9 | 10 | time spark-submit --master local[*] \ 11 | --class ldbc.finbench.datagen.LdbcDatagen \ 12 | --driver-memory 480g \ 13 | ${LDBC_FINBENCH_DATAGEN_JAR} \ 14 | --output-dir ${OUTPUT_DIR} \ 15 | --factor-format csv \ 16 | --generate-factors 17 | 18 | echo "start parameter curation" -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/config/ConfigParser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.config; 18 | 19 | import java.io.FileInputStream; 20 | import java.io.IOException; 21 | import java.io.InputStream; 22 | import java.io.InputStreamReader; 23 | import java.nio.charset.StandardCharsets; 24 | import java.util.HashMap; 25 | import java.util.Map; 26 | import java.util.Properties; 27 | 28 | public class ConfigParser { 29 | 30 | public static Map readConfig(String paramsFile) { 31 | try (FileInputStream fis = new FileInputStream(paramsFile)) { 32 | return readConfig(fis); 33 | } catch (IOException e) { 34 | throw new RuntimeException(e); 35 | } 36 | } 37 | 38 | public static Map readConfig(InputStream paramStream) { 39 | Properties properties = new Properties(); 40 | Map res = new HashMap<>(); 41 | try { 42 | properties.load(new InputStreamReader(paramStream, StandardCharsets.UTF_8)); 43 | for (String s : properties.stringPropertyNames()) { 44 | res.put(s, properties.getProperty(s)); 45 | } 46 | return res; 47 | } catch (IOException e) { 48 | System.err.println(e.getMessage()); 49 | throw new RuntimeException(e); 50 | } 51 | } 52 | 53 | public static Map scaleFactorConf(String scaleFactorXml, String scaleFactorId) { 54 | Map conf = new HashMap<>(); 55 | ScaleFactors scaleFactors = ScaleFactors.INSTANCE; 56 | scaleFactors.initialize(scaleFactorXml); // use default if empty 57 | if (!scaleFactors.value.containsKey(scaleFactorId)) { 58 | throw new IllegalArgumentException("Scale factor " + scaleFactorId + " does not exist"); 59 | } 60 | ScaleFactor scaleFactor = 
scaleFactors.value.get(scaleFactorId); 61 | System.out.println("Applied configuration from " + (scaleFactorXml.isEmpty() ? "default" : scaleFactorXml) 62 | + " of scale factor " + scaleFactorId); 63 | for (Map.Entry e : scaleFactor.properties.entrySet()) { 64 | conf.put(e.getKey(), e.getValue()); 65 | } 66 | return conf; 67 | } 68 | 69 | 70 | } 71 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/config/DatagenConfiguration.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.config; 18 | 19 | import java.io.Serializable; 20 | import java.util.Iterator; 21 | import java.util.Map; 22 | 23 | public class DatagenConfiguration implements Iterable>, Serializable { 24 | public final Map map; 25 | 26 | public DatagenConfiguration(Map map) { 27 | this.map = map; 28 | } 29 | 30 | public String get(String key) { 31 | return map.get(key); 32 | } 33 | 34 | public String get(String key, String defaultValue) { 35 | return map.getOrDefault(key, defaultValue); 36 | } 37 | 38 | public String getTrimmed(String name) { 39 | String value = this.get(name); 40 | return null == value ? 
null : value.trim(); 41 | } 42 | 43 | private String getHexDigits(String value) { 44 | boolean negative = false; 45 | String str = value; 46 | String hexString; 47 | if (value.startsWith("-")) { 48 | negative = true; 49 | str = value.substring(1); 50 | } 51 | 52 | if (!str.startsWith("0x") && !str.startsWith("0X")) { 53 | return null; 54 | } else { 55 | hexString = str.substring(2); 56 | if (negative) { 57 | hexString = "-" + hexString; 58 | } 59 | 60 | return hexString; 61 | } 62 | } 63 | 64 | public int getInt(String name, int defaultValue) { 65 | String valueString = this.getTrimmed(name); 66 | if (valueString == null) { 67 | return defaultValue; 68 | } else { 69 | String hexString = this.getHexDigits(valueString); 70 | return hexString != null ? Integer.parseInt(hexString, 16) : Integer.parseInt(valueString); 71 | } 72 | } 73 | 74 | public double getDouble(String name, double defaultValue) { 75 | String valueString = this.getTrimmed(name); 76 | return valueString == null ? defaultValue : Double.parseDouble(valueString); 77 | } 78 | 79 | @Override 80 | public Iterator> iterator() { 81 | return this.map.entrySet().iterator(); 82 | } 83 | 84 | public String getOutputDir() { 85 | return map.get("generator.outputDir"); 86 | } 87 | 88 | public String getFormat() { 89 | return map.get("generator.format"); 90 | } 91 | 92 | public String getPartition() { 93 | return map.get("spark.partition"); 94 | } 95 | 96 | public void printConfig() { 97 | System.out.println("********* Configuration *********"); 98 | map.forEach((key, value) -> System.out.println(key + ": " + value)); 99 | System.out.println("*********************************"); 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/config/ScaleFactor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * 
Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.config; 18 | 19 | import java.util.TreeMap; 20 | 21 | /** One named scale-factor configuration set: a sorted map of generator property name to value. Instances are created and populated by ScaleFactors.initialize() while parsing scale_factors.xml. */ public class ScaleFactor { 22 | /* Property name -> value. Declared raw in this dump; presumably TreeMap<String, String> upstream (ScaleFactors puts String names/values from the XML) — TODO confirm against the original source. */ public TreeMap properties; 23 | 24 | /* Package-private: construction happens inside ScaleFactors.initialize(); starts empty, entries added per <property> element. */ ScaleFactor() { 25 | properties = new TreeMap<>(); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/config/ScaleFactors.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package ldbc.finbench.datagen.config; 18 | 19 | import java.io.IOException; 20 | import java.io.InputStream; 21 | import java.nio.file.Files; 22 | import java.nio.file.Paths; 23 | import java.util.TreeMap; 24 | import javax.xml.parsers.DocumentBuilder; 25 | import javax.xml.parsers.DocumentBuilderFactory; 26 | import javax.xml.parsers.ParserConfigurationException; 27 | import org.w3c.dom.Document; 28 | import org.w3c.dom.Element; 29 | import org.w3c.dom.Node; 30 | import org.w3c.dom.NodeList; 31 | import org.xml.sax.SAXException; 32 | 33 | public class ScaleFactors { 34 | public TreeMap value; 35 | 36 | public static final ScaleFactors INSTANCE = new ScaleFactors(); 37 | 38 | private ScaleFactors() { 39 | } 40 | 41 | public void initialize(String scaleFactorsXml) { 42 | try { 43 | value = new TreeMap<>(); 44 | DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); 45 | InputStream configFile = scaleFactorsXml.isEmpty() 46 | ? ScaleFactors.class.getResourceAsStream("/scale_factors.xml") 47 | : Files.newInputStream(Paths.get(scaleFactorsXml)); 48 | Document doc = builder.parse(configFile); 49 | doc.getDocumentElement().normalize(); 50 | 51 | System.out.println("Reading scale factors from " + (scaleFactorsXml.isEmpty() ? 
"default" : 52 | scaleFactorsXml) + "..."); 53 | NodeList nodes = doc.getElementsByTagName("scale_factor"); 54 | for (int i = 0; i < nodes.getLength(); i++) { 55 | Node node = nodes.item(i); 56 | if (node.getNodeType() == Node.ELEMENT_NODE) { 57 | Element element = (Element) node; 58 | String scaleFactorName = element.getAttribute("name"); 59 | ScaleFactor scaleFactor = new ScaleFactor(); 60 | NodeList properties = ((Element) node).getElementsByTagName("property"); 61 | for (int j = 0; j < properties.getLength(); ++j) { 62 | Element property = (Element) properties.item(j); 63 | String name = property.getElementsByTagName("name").item(0).getTextContent(); 64 | String value = property.getElementsByTagName("value").item(0).getTextContent(); 65 | scaleFactor.properties.put(name, value); 66 | } 67 | System.out.println("Available scale factor configuration set " + scaleFactorName); 68 | value.put(scaleFactorName, scaleFactor); 69 | } 70 | } 71 | System.out.println("Number of scale factors read " + value.size()); 72 | } catch (ParserConfigurationException | IOException | SAXException e) { 73 | throw new RuntimeException(e); 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/entities/DynamicActivity.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package ldbc.finbench.datagen.entities;

/**
 * Contract for entities whose lifetime is bounded by a creation and a deletion
 * timestamp (epoch milliseconds elsewhere in this generator — confirm unit at call sites).
 */
public interface DynamicActivity {

    /** Timestamp at which the entity comes into existence. */
    long getCreationDate();

    /** Timestamp at which the entity ceases to exist. */
    long getDeletionDate();

    /** True when the deletion was an explicit user action rather than an implicit expiry. */
    boolean isExplicitlyDeleted();

}

--------------------------------------------------------------------------------
/src/main/java/ldbc/finbench/datagen/entities/edges/CompanyOwnAccount.java:
--------------------------------------------------------------------------------
/*
 * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package ldbc.finbench.datagen.entities.edges;

import java.io.Serializable;
import ldbc.finbench.datagen.entities.DynamicActivity;
import ldbc.finbench.datagen.entities.nodes.Account;
import ldbc.finbench.datagen.entities.nodes.Company;
import ldbc.finbench.datagen.entities.nodes.PersonOrCompany;
import ldbc.finbench.datagen.generation.dictionary.Dictionaries;
import ldbc.finbench.datagen.util.RandomGeneratorFarm;

/**
 * Ownership edge from a {@link Company} to an {@link Account}.
 * The edge inherits the account's deletion date and explicit-deletion flag
 * (see {@link #createCompanyOwnAccount}).
 */
public class CompanyOwnAccount implements DynamicActivity, Serializable {
    private final long companyId;
    private final long accountId;
    private final long creationDate;
    private final long deletionDate;
    private final boolean isExplicitlyDeleted;
    private final String comment;
    private final Account account; // TODO: can be removed

    public CompanyOwnAccount(Company company, Account account, long creationDate, long deletionDate,
                             boolean isExplicitlyDeleted, String comment) {
        this.companyId = company.getCompanyId();
        this.accountId = account.getAccountId();
        this.account = account; // TODO: can be removed
        this.creationDate = creationDate;
        this.deletionDate = deletionDate;
        this.isExplicitlyDeleted = isExplicitlyDeleted;
        this.comment = comment;
    }

    /**
     * Creates the ownership edge and registers it on the company.
     * Side effects: sets the account's owner type/owner to this company and
     * appends the new edge to {@code company.getCompanyOwnAccounts()}.
     * The comment is drawn from the COMMON_COMMENT random stream; do not reorder
     * draws from the farm — edges are generated deterministically per stream.
     */
    public static void createCompanyOwnAccount(RandomGeneratorFarm farm, Company company, Account account,
                                               long creationDate) {
        account.setOwnerType(PersonOrCompany.COMPANY);
        account.setCompanyOwner(company);
        String comment =
            Dictionaries.randomTexts.getUniformDistRandomTextForComments(
                farm.get(RandomGeneratorFarm.Aspect.COMMON_COMMENT));
        CompanyOwnAccount companyOwnAccount =
            new CompanyOwnAccount(company, account, creationDate, account.getDeletionDate(),
                                  account.isExplicitlyDeleted(), comment);
        company.getCompanyOwnAccounts().add(companyOwnAccount);
    }

    public long getCompanyId() {
        return companyId;
    }

    public long getAccountId() {
        return accountId;
    }

    @Override
    public long getCreationDate() {
        return creationDate;
    }

    @Override
    public long getDeletionDate() {
        return deletionDate;
    }

    @Override
    public boolean isExplicitlyDeleted() {
        return isExplicitlyDeleted;
    }

    public String getComment() {
        return comment;
    }

    public Account getAccount() {
        return account;
    }
}

--------------------------------------------------------------------------------
/src/main/java/ldbc/finbench/datagen/entities/edges/Deposit.java:
--------------------------------------------------------------------------------
/*
 * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package ldbc.finbench.datagen.entities.edges;

import java.io.Serializable;
import ldbc.finbench.datagen.entities.DynamicActivity;
import ldbc.finbench.datagen.entities.nodes.Account;
import ldbc.finbench.datagen.entities.nodes.Loan;
import ldbc.finbench.datagen.generation.dictionary.Dictionaries;
import ldbc.finbench.datagen.util.RandomGeneratorFarm;

/**
 * Deposit edge from a {@link Loan} to an {@link Account}, carrying the amount
 * deposited. Deletion date and explicit-deletion flag are inherited from the account.
 */
public class Deposit implements DynamicActivity, Serializable {
    private final long loanId;
    private final long accountId;
    private final double amount;
    private final long creationDate;
    private final long deletionDate;
    private final boolean isExplicitlyDeleted;
    private final String comment;

    public Deposit(Loan loan, Account account, double amount, long creationDate, long deletionDate,
                   boolean isExplicitlyDeleted, String comment) {
        this.loanId = loan.getLoanId();
        this.accountId = account.getAccountId();
        this.amount = amount;
        this.creationDate = creationDate;
        this.deletionDate = deletionDate;
        this.isExplicitlyDeleted = isExplicitlyDeleted;
        this.comment = comment;
    }

    /**
     * Creates a deposit edge and registers it on the loan.
     * The creation date is drawn from the LOAN_SUBEVENTS_DATE stream, bounded by
     * the account's deletion date; the comment from COMMON_COMMENT. The two draws
     * happen in this order — do not reorder, generation is stream-deterministic.
     */
    public static void createDeposit(RandomGeneratorFarm farm, Loan loan, Account account, double amount) {
        long creationDate =
            Dictionaries.dates.randomLoanToAccountDate(farm.get(RandomGeneratorFarm.Aspect.LOAN_SUBEVENTS_DATE), loan,
                                                       account, account.getDeletionDate());
        String comment =
            Dictionaries.randomTexts.getUniformDistRandomTextForComments(
                farm.get(RandomGeneratorFarm.Aspect.COMMON_COMMENT));
        Deposit deposit =
            new Deposit(loan, account, amount, creationDate, account.getDeletionDate(), account.isExplicitlyDeleted(),
                        comment);
        loan.addDeposit(deposit);
        // Deliberately not mirrored on the account side (kept from original):
        //account.getDeposits().add(deposit);
    }

    public double getAmount() {
        return amount;
    }

    public long getLoanId() {
        return loanId;
    }

    public long getAccountId() {
        return accountId;
    }

    @Override
    public long getCreationDate() {
        return creationDate;
    }

    @Override
    public long getDeletionDate() {
        return deletionDate;
    }

    @Override
    public boolean isExplicitlyDeleted() {
        return isExplicitlyDeleted;
    }

    public String getComment() {
        return comment;
    }
}

--------------------------------------------------------------------------------
/src/main/java/ldbc/finbench/datagen/entities/edges/PersonGuaranteePerson.java:
--------------------------------------------------------------------------------
/*
 * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package ldbc.finbench.datagen.entities.edges;

import java.io.Serializable;
import ldbc.finbench.datagen.entities.DynamicActivity;
import ldbc.finbench.datagen.entities.nodes.Person;
import ldbc.finbench.datagen.generation.dictionary.Dictionaries;
import ldbc.finbench.datagen.util.RandomGeneratorFarm;

/**
 * Guarantee edge between two {@link Person} nodes, labelled with a relationship
 * kind (e.g. drawn from the guarantee-relationship dictionary).
 */
public class PersonGuaranteePerson implements DynamicActivity, Serializable {
    private final long fromPersonId;
    private final long toPersonId;
    private final long creationDate;
    private final long deletionDate;
    private final boolean isExplicitlyDeleted;
    private final String relationship;
    private final String comment;

    public PersonGuaranteePerson(Person fromPerson, Person toPerson,
                                 long creationDate, long deletionDate, boolean isExplicitlyDeleted, String relation,
                                 String comment) {
        this.fromPersonId = fromPerson.getPersonId();
        this.toPersonId = toPerson.getPersonId();
        this.creationDate = creationDate;
        this.deletionDate = deletionDate;
        this.isExplicitlyDeleted = isExplicitlyDeleted;
        this.relationship = relation;
        this.comment = comment;
    }

    /**
     * Creates a guarantee edge and appends it to {@code fromPerson.getGuaranteeSrc()}.
     * Draw order from the farm (date, relationship, comment) is fixed — do not
     * reorder, generation is stream-deterministic. Guarantee edges are created
     * with deletionDate 0 and isExplicitlyDeleted false, i.e. they are never deleted.
     */
    public static void createPersonGuaranteePerson(RandomGeneratorFarm farm, Person fromPerson, Person toPerson) {
        long creationDate = Dictionaries.dates.randomPersonToPersonDate(
            farm.get(RandomGeneratorFarm.Aspect.PERSON_GUARANTEE_DATE), fromPerson, toPerson);
        String relation = Dictionaries.guaranteeRelationships.getDistributedText(
            farm.get(RandomGeneratorFarm.Aspect.PERSON_GUARANTEE_RELATIONSHIP));
        String comment =
            Dictionaries.randomTexts.getUniformDistRandomTextForComments(
                farm.get(RandomGeneratorFarm.Aspect.COMMON_COMMENT));
        PersonGuaranteePerson personGuaranteePerson =
            new PersonGuaranteePerson(fromPerson, toPerson, creationDate, 0, false, relation, comment);
        fromPerson.getGuaranteeSrc().add(personGuaranteePerson);
    }

    public long getFromPersonId() {
        return fromPersonId;
    }

    public long getToPersonId() {
        return toPersonId;
    }

    @Override
    public long getCreationDate() {
        return creationDate;
    }

    @Override
    public long getDeletionDate() {
        return deletionDate;
    }

    @Override
    public boolean isExplicitlyDeleted() {
        return isExplicitlyDeleted;
    }

    public String getRelationship() {
        return relationship;
    }

    public String getComment() {
        return comment;
    }
}

--------------------------------------------------------------------------------
/src/main/java/ldbc/finbench/datagen/entities/edges/Repay.java:
--------------------------------------------------------------------------------
/*
 * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package ldbc.finbench.datagen.entities.edges;

import java.io.Serializable;
import ldbc.finbench.datagen.entities.DynamicActivity;
import ldbc.finbench.datagen.entities.nodes.Account;
import ldbc.finbench.datagen.entities.nodes.Loan;
import ldbc.finbench.datagen.generation.dictionary.Dictionaries;
import ldbc.finbench.datagen.util.RandomGeneratorFarm;

/**
 * Repayment edge from an {@link Account} to a {@link Loan} (the mirror of
 * {@link Deposit}). Deletion date and flag are inherited from the account.
 */
public class Repay implements DynamicActivity, Serializable {
    private final long accountId;
    private final long loanId;
    private final double amount;
    private final long creationDate;
    private final long deletionDate;
    private final boolean isExplicitlyDeleted;
    private final String comment;

    public Repay(Account account, Loan loan, double amount, long creationDate, long deletionDate,
                 boolean isExplicitlyDeleted, String comment) {
        this.accountId = account.getAccountId();
        this.loanId = loan.getLoanId();
        this.amount = amount;
        this.creationDate = creationDate;
        this.deletionDate = deletionDate;
        this.isExplicitlyDeleted = isExplicitlyDeleted;
        this.comment = comment;
    }

    /**
     * Creates a repay edge and registers it on the loan.
     * Creation date comes from the LOAN_SUBEVENTS_DATE stream bounded by the
     * account's deletion date; the comment from COMMON_COMMENT. Draw order is
     * fixed — do not reorder, generation is stream-deterministic.
     */
    public static void createRepay(RandomGeneratorFarm farm, Account account, Loan loan, double amount) {
        long creationDate =
            Dictionaries.dates.randomAccountToLoanDate(farm.get(RandomGeneratorFarm.Aspect.LOAN_SUBEVENTS_DATE),
                                                       account, loan, account.getDeletionDate());
        String comment =
            Dictionaries.randomTexts.getUniformDistRandomTextForComments(
                farm.get(RandomGeneratorFarm.Aspect.COMMON_COMMENT));
        Repay repay = new Repay(account, loan, amount, creationDate, account.getDeletionDate(),
                                account.isExplicitlyDeleted(), comment);
        loan.addRepay(repay);
        // Deliberately not mirrored on the account side (kept from original):
        //account.getRepays().add(repay);
    }

    public double getAmount() {
        return amount;
    }

    public long getAccountId() {
        return accountId;
    }

    public long getLoanId() {
        return loanId;
    }

    @Override
    public long getCreationDate() {
        return creationDate;
    }

    @Override
    public long getDeletionDate() {
        return deletionDate;
    }

    @Override
    public boolean isExplicitlyDeleted() {
        return isExplicitlyDeleted;
    }

    public String getComment() {
        return comment;
    }
}

--------------------------------------------------------------------------------
/src/main/java/ldbc/finbench/datagen/entities/nodes/Medium.java:
--------------------------------------------------------------------------------
/*
 * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
15 | */ 16 | 17 | package ldbc.finbench.datagen.entities.nodes; 18 | 19 | import java.io.Serializable; 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | import ldbc.finbench.datagen.entities.edges.SignIn; 23 | 24 | public class Medium implements Serializable { 25 | private long mediumId; 26 | private String mediumName; 27 | private final List signIns; 28 | private long creationDate; 29 | private boolean isBlocked; 30 | private long lastLogin; 31 | private String riskLevel; 32 | 33 | public Medium() { 34 | signIns = new ArrayList<>(); 35 | } 36 | 37 | public Medium(long mediumId, String mediumName, long creationDate, boolean isBlocked) { 38 | signIns = new ArrayList<>(); 39 | this.mediumId = mediumId; 40 | this.mediumName = mediumName; 41 | this.creationDate = creationDate; 42 | this.isBlocked = isBlocked; 43 | } 44 | 45 | @Override 46 | public boolean equals(Object obj) { 47 | if (obj instanceof Medium) { 48 | Medium other = (Medium) obj; 49 | return mediumId == other.mediumId; 50 | } 51 | return false; 52 | } 53 | 54 | @Override 55 | public int hashCode() { 56 | return Long.hashCode(mediumId); 57 | } 58 | 59 | public long getMediumId() { 60 | return mediumId; 61 | } 62 | 63 | public void setMediumId(long mediumId) { 64 | this.mediumId = mediumId; 65 | } 66 | 67 | public String getMediumName() { 68 | return mediumName; 69 | } 70 | 71 | public void setMediumName(String mediumName) { 72 | this.mediumName = mediumName; 73 | } 74 | 75 | public List getSignIns() { 76 | return signIns; 77 | } 78 | 79 | public long getCreationDate() { 80 | return creationDate; 81 | } 82 | 83 | public void setCreationDate(long creationDate) { 84 | this.creationDate = creationDate; 85 | } 86 | 87 | public boolean isBlocked() { 88 | return isBlocked; 89 | } 90 | 91 | public void setBlocked(boolean blocked) { 92 | isBlocked = blocked; 93 | } 94 | 95 | public long getLastLogin() { 96 | return lastLogin; 97 | } 98 | 99 | public void setLastLogin(long lastLogin) { 100 | 
this.lastLogin = lastLogin; 101 | } 102 | 103 | public String getRiskLevel() { 104 | return riskLevel; 105 | } 106 | 107 | public void setRiskLevel(String riskLevel) { 108 | this.riskLevel = riskLevel; 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/entities/nodes/PersonOrCompany.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.entities.nodes; 18 | 19 | public enum PersonOrCompany { 20 | PERSON, COMPANY 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/entities/place/Place.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package ldbc.finbench.datagen.entities.place;

import java.io.Serializable;

/**
 * Geographic place node (city, country or continent — see the type constants),
 * with coordinates, population and a Z-order index ({@code zid}) used for
 * spatial ordering elsewhere in the generator.
 */
@SuppressWarnings("serial")
public class Place implements Serializable {

    public static final String CITY = "City";
    public static final String COUNTRY = "Country";
    public static final String CONTINENT = "Continent";

    private int id;
    private int zid;
    private String name;
    private double latitude;
    private double longitude;
    private long population;
    private String type;

    public Place() {
    }

    // NOTE(review): the constructor takes population as int while the field is
    // long — presumably fine for current datasets, but widening the parameter
    // would be the eventual fix; confirm against callers before changing.
    public Place(int id, String name, double longitude, double latitude, int population, String type) {
        this.id = id;
        this.name = name;
        this.longitude = longitude;
        this.latitude = latitude;
        this.population = population;
        this.type = type;
    }

    public int getZId() {
        return zid;
    }

    public void setZId(int zid) {
        this.zid = zid;
    }

    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public double getLongitude() {
        return longitude;
    }

    public void setLongitude(double longitude) {
        this.longitude = longitude;
    }

    public double getLatitude() {
        return latitude;
    }

    public void setLatitude(double latitude) {
        this.latitude = latitude;
    }

    public long getPopulation() {
        return population;
    }

    public void setPopulation(long population) {
        this.population = population;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

}

--------------------------------------------------------------------------------
/src/main/java/ldbc/finbench/datagen/generation/DatagenContext.java:
--------------------------------------------------------------------------------
/*
 * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
15 | */ 16 | 17 | package ldbc.finbench.datagen.generation; 18 | 19 | import ldbc.finbench.datagen.config.DatagenConfiguration; 20 | import ldbc.finbench.datagen.generation.dictionary.Dictionaries; 21 | 22 | public class DatagenContext { 23 | 24 | private static transient volatile boolean initialized = false; 25 | 26 | public static synchronized void initialize(DatagenConfiguration conf) { 27 | if (!initialized) { 28 | DatagenParams.readConf(conf); 29 | Dictionaries.loadDictionaries(); 30 | initialized = true; 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/generation/dictionary/CommonTextDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen.generation.dictionary; 18 | 19 | import java.io.BufferedReader; 20 | import java.io.IOException; 21 | import java.io.InputStreamReader; 22 | import java.nio.charset.StandardCharsets; 23 | import java.util.Objects; 24 | import java.util.Random; 25 | import java.util.TreeMap; 26 | 27 | public class CommonTextDictionary { 28 | private final TreeMap resources; 29 | 30 | public CommonTextDictionary(String filePath, String separator) { 31 | this.resources = new TreeMap<>(); 32 | 33 | try { 34 | InputStreamReader inputStreamReader = new InputStreamReader( 35 | Objects.requireNonNull(getClass().getResourceAsStream(filePath)), StandardCharsets.UTF_8); 36 | BufferedReader dictionary = new BufferedReader(inputStreamReader); 37 | String line; 38 | long totalNum = 0; 39 | while ((line = dictionary.readLine()) != null) { 40 | String[] data = line.split(separator); 41 | String surname = data[0].trim(); 42 | this.resources.put(totalNum, surname); 43 | totalNum++; 44 | } 45 | dictionary.close(); 46 | } catch (IOException e) { 47 | throw new RuntimeException(e); 48 | } 49 | } 50 | 51 | public String getUniformDistRandomText(Random random) { 52 | long index = random.nextInt(resources.size()); 53 | return resources.get(index); 54 | } 55 | 56 | public String getUniformDistRandomTextForComments(Random random) { 57 | StringBuilder text = new StringBuilder(); 58 | for (int i = 0; i < 5; i++) { 59 | text.append(resources.get((long) random.nextInt(resources.size()))).append(" "); 60 | } 61 | return text.toString(); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/generation/dictionary/EmailDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may 
not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.generation.dictionary; 18 | 19 | import java.io.BufferedReader; 20 | import java.io.IOException; 21 | import java.io.InputStreamReader; 22 | import java.util.ArrayList; 23 | import java.util.List; 24 | import java.util.Random; 25 | 26 | 27 | public class EmailDictionary { 28 | private final List emails; 29 | private final List cumulativeDistribution; 30 | 31 | public EmailDictionary(String filePath, String separator) { 32 | try { 33 | BufferedReader emailDictionary = new BufferedReader( 34 | new InputStreamReader(getClass().getResourceAsStream(filePath), "UTF-8")); 35 | 36 | emails = new ArrayList<>(); 37 | cumulativeDistribution = new ArrayList<>(); 38 | 39 | String line; 40 | double cummulativeDist = 0.0; 41 | while ((line = emailDictionary.readLine()) != null) { 42 | String[] data = line.split(separator); 43 | emails.add(data[0]); 44 | if (data.length == 2) { 45 | cummulativeDist += Double.parseDouble(data[1]); 46 | cumulativeDistribution.add(cummulativeDist); 47 | } 48 | } 49 | emailDictionary.close(); 50 | } catch (IOException e) { 51 | throw new RuntimeException(e); 52 | } 53 | } 54 | 55 | public String getRandomEmail(Random randomTop, Random randomEmail) { 56 | int minIdx = 0; 57 | int maxIdx = cumulativeDistribution.size() - 1; 58 | double prob = randomTop.nextDouble(); 59 | if (prob > cumulativeDistribution.get(maxIdx)) { 60 | int idx = randomEmail.nextInt(emails.size() - cumulativeDistribution.size()) + 
cumulativeDistribution 61 | .size(); 62 | return emails.get(idx); 63 | } else if (prob < cumulativeDistribution.get(minIdx)) { 64 | return emails.get(minIdx); 65 | } 66 | 67 | while ((maxIdx - minIdx) > 1) { 68 | int middlePoint = minIdx + (maxIdx - minIdx) / 2; 69 | if (prob > cumulativeDistribution.get(middlePoint)) { 70 | minIdx = middlePoint; 71 | } else { 72 | maxIdx = middlePoint; 73 | } 74 | } 75 | return emails.get(maxIdx); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/generation/dictionary/NumbersGenerator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen.generation.dictionary; 18 | 19 | import java.util.Random; 20 | 21 | public class NumbersGenerator { 22 | 23 | // TODO: add more 24 | private String[] bankCode = {"001", "100", "102", "103", "104", "105", "301", "302", "303", "304", "305", "306", 25 | "307", "308", "309",}; 26 | 27 | // TODO: add more 28 | private String[] districtCode = 29 | {"1100", "1200", "3700", "2100", "1400", "4100", "2200", "2300", "6100", "6200", "6300", 30 | "6400", "6500", "4600", "8100", "8200", "2900", "5000", "4400", "4500", "4300", "4200", "3200", "3300"}; 31 | 32 | public NumbersGenerator() { 33 | } 34 | 35 | public String generatePhonenum(Random random) { 36 | return String.format("%03d", random.nextInt(1000)) 37 | + "-" + String.format("%04d", random.nextInt(10000)); 38 | } 39 | 40 | public String generateOrdernum(Random random) { 41 | return bankCode[random.nextInt(bankCode.length)] 42 | + districtCode[random.nextInt(districtCode.length)] 43 | + String.format("%04d", random.nextInt(1000)) 44 | + String.format("%04d", random.nextInt(10000)); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/generation/dictionary/PercentageTextDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
package ldbc.finbench.datagen.generation.dictionary;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * Dictionary whose entries carry a probability. Entries are drawn according to
 * the cumulative distribution built from the resource file.
 *
 * <p>File format: one entry per line, "text&lt;separator&gt;probability".
 */
public class PercentageTextDictionary {
    private final List<String> resources;
    private final List<Double> cumulativeDistribution;

    public PercentageTextDictionary(String filePath, String separator) {
        resources = new ArrayList<>();
        cumulativeDistribution = new ArrayList<>();

        // try-with-resources: the original never closed the reader on a parse failure.
        try (BufferedReader dictionary = new BufferedReader(
                new InputStreamReader(getClass().getResourceAsStream(filePath), StandardCharsets.UTF_8))) {
            String line;
            double cumulativeDist = 0.0;
            while ((line = dictionary.readLine()) != null) {
                String[] data = line.split(separator);
                cumulativeDist += Double.parseDouble(data[1]);
                resources.add(data[0]);
                cumulativeDistribution.add(cumulativeDist);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** Returns the dictionary entry at the given index (file order). */
    public String getName(int id) {
        return resources.get(id);
    }

    /**
     * Draws an entry according to the cumulative distribution via binary search.
     */
    public String getDistributedText(Random random) {
        double prob = random.nextDouble();
        int minIdx = 0;
        // Fix: the original cast this index to byte, silently truncating it for
        // dictionaries with more than 127 entries.
        int maxIdx = (prob < cumulativeDistribution.get(minIdx)) ? minIdx : cumulativeDistribution.size() - 1;
        // Binary search for the first bucket whose cumulative value covers prob.
        while ((maxIdx - minIdx) > 1) {
            int middlePoint = minIdx + (maxIdx - minIdx) / 2;
            if (prob > cumulativeDistribution.get(middlePoint)) {
                minIdx = middlePoint;
            } else {
                maxIdx = middlePoint;
            }
        }
        return resources.get(maxIdx);
    }
}
package ldbc.finbench.datagen.generation.dictionary;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Random;
import java.util.TreeMap;

/**
 * Dictionary of person surnames, indexed 0..N-1 in file order.
 */
public class PersonNameDictionary {
    private final TreeMap<Long, String> personSurnames;

    public PersonNameDictionary(String filePath, String separator) {
        this.personSurnames = new TreeMap<>();
        // try-with-resources: the original never closed the reader when a line
        // was malformed and an exception escaped.
        try (BufferedReader dictionary = new BufferedReader(new InputStreamReader(
                getClass().getResourceAsStream(filePath), StandardCharsets.UTF_8))) {
            String line;
            long totalNumSurnames = 0;
            while ((line = dictionary.readLine()) != null) {
                String[] data = line.split(separator);
                // assumes each line has at least two columns and the surname is
                // the second one — TODO confirm against the resource file format
                this.personSurnames.put(totalNumSurnames, data[1].trim());
                totalNumSurnames++;
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** Returns a surname chosen uniformly at random. */
    public String getUniformDistRandName(Random random) {
        long nameIndex = random.nextInt(personSurnames.size());
        return personSurnames.get(nameIndex);
    }
}
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.generation.dictionary; 18 | 19 | // Private class used to sort countries by their z-order value. 20 | class PlaceZOrder implements Comparable { 21 | 22 | public int id; 23 | Integer zvalue; 24 | 25 | PlaceZOrder(int id, int zvalue) { 26 | this.id = id; 27 | this.zvalue = zvalue; 28 | } 29 | 30 | public int compareTo(PlaceZOrder obj) { 31 | return zvalue.compareTo(obj.zvalue); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/generation/distribution/AccountDeleteDistribution.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
package ldbc.finbench.datagen.generation.distribution;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * Probability of deleting an account as a function of its max degree, loaded
 * from a resource file containing one probability per line (line i = degree i).
 */
public class AccountDeleteDistribution implements Serializable {

    private double[] distribution;
    private final String distributionFile;

    public AccountDeleteDistribution(String distributionFile) {
        this.distributionFile = distributionFile;
    }

    /** Loads the per-degree delete probabilities; must be called before isDeleted. */
    public void initialize() {
        // try-with-resources: the original leaked the reader (never closed).
        try (BufferedReader distributionBuffer = new BufferedReader(
                new InputStreamReader(getClass().getResourceAsStream(distributionFile), StandardCharsets.UTF_8))) {
            List<Double> temp = new ArrayList<>();
            String line;
            while ((line = distributionBuffer.readLine()) != null) {
                temp.add(Double.valueOf(line));
            }
            distribution = temp.stream().mapToDouble(Double::doubleValue).toArray();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Returns true if an account with the given degree should be deleted.
     * Degrees beyond the table fall back to a 0.99^degree power law.
     */
    public boolean isDeleted(Random random, long maxDegree) {
        if (maxDegree < distribution.length) {
            return random.nextDouble() < distribution[(int) maxDegree];
        } else {
            // support degree more than 1000
            return random.nextDouble() < Math.pow(0.99, maxDegree);
        }
    }
}
you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.generation.distribution; 18 | 19 | public abstract class DegreeDistribution { 20 | 21 | public abstract void initialize(); 22 | 23 | public abstract void reset(long seed); 24 | 25 | public abstract long nextDegree(); 26 | 27 | public double mean(long numPersons) { 28 | return -1; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/generation/distribution/MultiplicityDistribution.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen.generation.distribution; 18 | 19 | public class MultiplicityDistribution { 20 | 21 | public MultiplicityDistribution() { 22 | } 23 | 24 | 25 | public void reset(long seed) { 26 | } 27 | 28 | 29 | public long nextDegree() { 30 | return 0; 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/generation/distribution/PowerLawActivityDeleteDistribution.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
package ldbc.finbench.datagen.generation.distribution;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * Power-law distribution over activity lifetimes (in minutes), used to draw how
 * long after creation an activity is deleted. Bucket boundaries are the fixed
 * {@code minutes} table; bucket probabilities come from the resource file.
 */
public class PowerLawActivityDeleteDistribution {

    private double[] minutes;
    private double[] distribution;
    private String distributionFile;

    public PowerLawActivityDeleteDistribution(String distributionFile) {
        this.distributionFile = distributionFile;

        // Bucket upper bounds in minutes (up to one week = 10080).
        this.minutes =
            new double[] {0, 0.5, 1, 5, 10, 20, 30, 40, 60, 120, 300, 1440, 2880, 4320, 5760, 7200, 8460, 10080};

    }

    /** Loads the cumulative bucket probabilities; must be called before nextDouble. */
    public void initialize() {
        // try-with-resources: the original leaked the reader (never closed).
        // assumes the file has one probability per line and exactly as many lines
        // as the minutes table has entries — TODO confirm against the resource
        try (BufferedReader distributionBuffer = new BufferedReader(
                new InputStreamReader(getClass().getResourceAsStream(distributionFile), StandardCharsets.UTF_8))) {
            List<Double> temp = new ArrayList<>();
            String line;
            while ((line = distributionBuffer.readLine()) != null) {
                temp.add(Double.valueOf(line));
            }
            distribution = temp.stream().mapToDouble(Double::doubleValue).toArray();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Maps a cumulative probability to a lifetime drawn uniformly inside the
     * matching minutes bucket; returns 0 when prob exceeds every bucket.
     */
    public double nextDouble(double prob, Random random) {
        double draw = 0;
        for (int i = 0; i < distribution.length; i++) {
            if (prob < distribution[i]) {
                // Fix: the original indexed minutes[i - 1] unconditionally, which
                // threw ArrayIndexOutOfBoundsException when prob fell in the first
                // bucket (i == 0).
                double lower = (i == 0) ? minutes[0] : minutes[i - 1];
                double upper = minutes[i];
                draw = lower + (upper - lower) * random.nextDouble();
                break;
            }
        }
        return draw;
    }

}
package ldbc.finbench.datagen.generation.distribution;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.Random;
import java.util.TreeMap;

/**
 * Hour-of-day distribution used to place events inside a day. Minutes and
 * seconds are drawn uniformly; the hour follows the loaded distribution.
 */
public class TimeDistribution {
    private Map<Integer, Double> hourDistribution;
    private double[] hourProbs;
    private final double[] hourCumulatives;

    public TimeDistribution(String hourDistributionFile) {
        loadDistribution(hourDistributionFile);
        // Build the cumulative table once so nextHour is a simple scan.
        hourCumulatives = new double[hourProbs.length];
        hourCumulatives[0] = hourProbs[0];
        for (int i = 1; i < hourProbs.length; i++) {
            hourCumulatives[i] = hourCumulatives[i - 1] + hourProbs[i];
        }
    }

    /**
     * Parses the distribution file. Each line is "&lt;probability&gt; &lt;hour&gt;"
     * — probability first, hour second (note the reversed column order).
     */
    public void loadDistribution(String hourDistributionFile) {
        // try-with-resources: the original leaked the reader when parsing threw
        // before reader.close() was reached.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(getClass().getResourceAsStream(hourDistributionFile),
                    StandardCharsets.UTF_8))) {
            hourDistribution = new TreeMap<>();
            String line;
            while ((line = reader.readLine()) != null) {
                String[] data = line.split(" ");
                hourDistribution.put(Integer.parseInt(data[1]), Double.parseDouble(data[0]));
            }
            // TreeMap iterates keys in ascending hour order, so index i == hour i.
            hourProbs = hourDistribution.values().stream().mapToDouble(Double::doubleValue).toArray();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    public Map<Integer, Double> getHourDistribution() {
        return hourDistribution;
    }

    /**
     * Samples an hour according to the distribution; returns -1 only if the
     * probabilities sum to less than rand (e.g. a distribution file that does
     * not sum to 1) — callers should treat that as "no hour".
     */
    public long nextHour(Random random) {
        double rand = random.nextDouble();
        for (int i = 0; i < hourProbs.length; i++) {
            if (rand < hourCumulatives[i]) {
                return i;
            }
        }
        return -1;
    }

    /** Uniform minute in [0, 59]. */
    public long nextMinute(Random random) {
        return (long) (random.nextDouble() * 60);
    }

    /** Uniform second in [0, 59]. */
    public long nextSecond(Random random) {
        return (long) (random.nextDouble() * 60);
    }
}
package ldbc.finbench.datagen.generation.events;

import java.io.Serializable;
import java.util.List;
import java.util.Random;
import ldbc.finbench.datagen.entities.edges.CompanyInvestCompany;
import ldbc.finbench.datagen.entities.nodes.Company;
import ldbc.finbench.datagen.generation.DatagenParams;
import ldbc.finbench.datagen.util.RandomGeneratorFarm;

/**
 * Generates company-invest-company edges for one partition of target companies.
 */
public class CompanyInvestEvent implements Serializable {
    private final RandomGeneratorFarm randomFarm;
    // Kept for symmetry with the other events; companyInvestPartition itself
    // does not draw from it.
    private final Random randIndex;

    public CompanyInvestEvent() {
        randomFarm = new RandomGeneratorFarm();
        randIndex = new Random(DatagenParams.defaultSeed);
    }

    /** Re-seeds all random generators so a partition is reproducible. */
    public void resetState(int seed) {
        randomFarm.resetRandomGenerators(seed);
        randIndex.setSeed(seed);
    }

    /**
     * For each target company, draws a number of investors in
     * [minInvestors, maxInvestors] and links each chosen investor to the target
     * unless investing is disallowed. Disallowed picks are skipped, so a target
     * may end up with fewer investors than drawn.
     */
    public List<Company> companyInvestPartition(List<Company> investors, List<Company> targets) {
        Random numInvestorsRand = randomFarm.get(RandomGeneratorFarm.Aspect.NUMS_COMPANY_INVEST);
        Random chooseInvestorRand = randomFarm.get(RandomGeneratorFarm.Aspect.CHOOSE_COMPANY_INVESTOR);
        for (Company target : targets) {
            int numInvestors = numInvestorsRand.nextInt(
                DatagenParams.maxInvestors - DatagenParams.minInvestors + 1
            ) + DatagenParams.minInvestors;
            for (int i = 0; i < numInvestors; i++) {
                int index = chooseInvestorRand.nextInt(investors.size());
                Company investor = investors.get(index);
                if (cannotInvest(investor, target)) {
                    continue;
                }
                CompanyInvestCompany.createCompanyInvestCompany(randomFarm, investor, target);
            }
        }
        return targets;
    }

    /** Disallows self-investment and duplicate edges in either direction. */
    public boolean cannotInvest(Company investor, Company target) {
        return (investor == target) || investor.hasInvestedBy(target) || target.hasInvestedBy(investor);
    }
}
package ldbc.finbench.datagen.generation.events;

import java.io.Serializable;
import java.util.List;
import java.util.Random;
import ldbc.finbench.datagen.entities.edges.PersonInvestCompany;
import ldbc.finbench.datagen.entities.nodes.Company;
import ldbc.finbench.datagen.entities.nodes.Person;
import ldbc.finbench.datagen.generation.DatagenParams;
import ldbc.finbench.datagen.util.RandomGeneratorFarm;

/**
 * Generates person-invest-company edges for one partition of target companies.
 */
public class PersonInvestEvent implements Serializable {
    private final RandomGeneratorFarm randomFarm;
    // Kept for symmetry with the other events; personInvestPartition itself
    // does not draw from it.
    private final Random randIndex;

    public PersonInvestEvent() {
        randomFarm = new RandomGeneratorFarm();
        randIndex = new Random(DatagenParams.defaultSeed);
    }

    /** Re-seeds all random generators so a partition is reproducible. */
    public void resetState(int seed) {
        randomFarm.resetRandomGenerators(seed);
        randIndex.setSeed(seed);
    }

    /**
     * For each target company, draws a number of person investors in
     * [minInvestors, maxInvestors] and links each chosen investor to the target
     * unless it has already invested. Disallowed picks are skipped, so a target
     * may end up with fewer investors than drawn.
     */
    public List<Company> personInvestPartition(List<Person> investors, List<Company> targets) {
        Random numInvestorsRand = randomFarm.get(RandomGeneratorFarm.Aspect.NUMS_PERSON_INVEST);
        Random chooseInvestorRand = randomFarm.get(RandomGeneratorFarm.Aspect.CHOOSE_PERSON_INVESTOR);
        for (Company target : targets) {
            int numInvestors = numInvestorsRand.nextInt(
                DatagenParams.maxInvestors - DatagenParams.minInvestors + 1
            ) + DatagenParams.minInvestors;
            for (int i = 0; i < numInvestors; i++) {
                int index = chooseInvestorRand.nextInt(investors.size());
                Person investor = investors.get(index);
                if (cannotInvest(investor, target)) {
                    continue;
                }
                PersonInvestCompany.createPersonInvestCompany(randomFarm, investor, target);
            }
        }
        return targets;
    }

    /** Disallows duplicate person-to-company investment edges. */
    public boolean cannotInvest(Person investor, Company target) {
        return target.hasInvestedBy(investor);
    }
}
package ldbc.finbench.datagen.generation.events;

import java.io.Serializable;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import ldbc.finbench.datagen.entities.edges.SignIn;
import ldbc.finbench.datagen.entities.nodes.Account;
import ldbc.finbench.datagen.entities.nodes.Medium;
import ldbc.finbench.datagen.generation.DatagenParams;
import ldbc.finbench.datagen.util.RandomGeneratorFarm;

/**
 * Generates sign-in edges between media and accounts for one block.
 */
public class SignInEvent implements Serializable {
    private final RandomGeneratorFarm randomFarm;
    private final Random randIndex;

    public SignInEvent() {
        randomFarm = new RandomGeneratorFarm();
        randIndex = new Random(DatagenParams.defaultSeed);
    }

    /** Re-seeds all random generators so a block is reproducible. */
    private void resetState(int seed) {
        randomFarm.resetRandomGenerators(seed);
        randIndex.setSeed(seed);
    }

    /**
     * For each medium, picks random accounts and creates 1..maxSignInPerPair
     * sign-ins per valid (medium, account) pair. Invalid pairs (account deleted
     * before the medium plus activity delta) are skipped.
     */
    public List<Medium> signIn(List<Medium> mediums, List<Account> accounts, int blockId) {
        resetState(blockId);

        Random accountsToSignRand = randomFarm.get(RandomGeneratorFarm.Aspect.NUM_ACCOUNTS_SIGNIN_PER_MEDIUM);
        Random multiplicityRandom = randomFarm.get(RandomGeneratorFarm.Aspect.MULTIPLICITY_SIGNIN);
        // NOTE(review): drawn once for the whole block, so every medium signs in
        // to the same number of accounts — confirm this is intended rather than
        // a per-medium draw inside the loop.
        int numAccountsToSign = accountsToSignRand.nextInt(DatagenParams.maxAccountToSignIn);

        for (Medium medium : mediums) {
            for (int i = 0; i < Math.max(1, numAccountsToSign); i++) {
                Account accountToSign = accounts.get(randIndex.nextInt(accounts.size()));
                if (cannotSignIn(medium, accountToSign)) {
                    continue;
                }
                int numSignIn = multiplicityRandom.nextInt(DatagenParams.maxSignInPerPair);
                for (int mid = 0; mid < Math.max(1, numSignIn); mid++) {
                    SignIn.createSignIn(randomFarm, mid, medium, accountToSign);
                }
            }
        }
        return mediums;
    }

    /** A medium cannot sign in to an account that is deleted too soon after the medium's creation. */
    public boolean cannotSignIn(Medium from, Account to) {
        return from.getCreationDate() + DatagenParams.activityDelta > to.getDeletionDate();
    }
}
package ldbc.finbench.datagen.util;

import java.time.Instant;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.Month;
import java.time.ZoneId;

/**
 * Conversions between epoch milliseconds and UTC calendar values, plus a few
 * calendar predicates used by the generators.
 */
public class DateTimeUtils {
    // Fix: was a mutable public static field; made final so callers cannot
    // accidentally swap the zone at runtime.
    public static final ZoneId UTC = ZoneId.of("UTC");

    /** Epoch millis of midnight (start of day) UTC for the given date. */
    public static long toEpochMilli(LocalDate ld) {
        return ld.atStartOfDay(UTC).toInstant().toEpochMilli();
    }

    /** Epoch millis of the given date-time interpreted as UTC. */
    public static long toEpochMilli(LocalDateTime ldt) {
        return ldt.atZone(UTC).toInstant().toEpochMilli();
    }

    public static LocalDate utcDateOfEpochMilli(long epochMilli) {
        return Instant.ofEpochMilli(epochMilli).atZone(UTC).toLocalDate();
    }

    public static LocalDateTime utcDateTimeOfEpochMilli(long epochMilli) {
        return Instant.ofEpochMilli(epochMilli).atZone(UTC).toLocalDateTime();
    }

    /**
     * Travel season is May–June (month in (4, 7)) or after November 23rd.
     */
    public static boolean isTravelSeason(long epochMilli) {
        LocalDate date = utcDateOfEpochMilli(epochMilli);

        int day = date.getDayOfMonth();
        int month = date.getMonthValue();

        if ((month > 4) && (month < 7)) {
            return true;
        }
        return ((month == 11) && (day > 23));
    }

    /**
     * Number of months elapsed since (startMonth, startYear), counting the
     * starting month as month 1.
     */
    public static int getNumberOfMonths(long epochMilli, int startMonth, int startYear) {
        LocalDate date = utcDateOfEpochMilli(epochMilli);
        int month = date.getMonthValue();
        int year = date.getYear();
        return (year - startYear) * 12 + month - (startMonth - 1);
    }

    public static int getYear(long epochMilli) {
        LocalDateTime datetime = utcDateTimeOfEpochMilli(epochMilli);
        return datetime.getYear();
    }

    public static Month getMonth(long epochMilli) {
        LocalDateTime datetime = utcDateTimeOfEpochMilli(epochMilli);
        return datetime.getMonth();
    }
}
15 | */ 16 | 17 | package ldbc.finbench.datagen.util; 18 | 19 | 20 | public class ZOrder { 21 | 22 | private int maxBitNum; 23 | 24 | public ZOrder(int maxNumBit) { 25 | this.maxBitNum = maxNumBit; 26 | } 27 | 28 | public int getZValue(int x, int y) { 29 | 30 | String sx = Integer.toBinaryString(x); 31 | int numberToAddX = maxBitNum - sx.length(); 32 | for (int i = 0; i < numberToAddX; i++) { 33 | sx = "0" + sx; 34 | } 35 | 36 | String sy = Integer.toBinaryString(y); 37 | int numberToAddY = maxBitNum - sy.length(); 38 | for (int i = 0; i < numberToAddY; i++) { 39 | sy = "0" + sy; 40 | } 41 | 42 | String sz = ""; 43 | for (int i = 0; i < sx.length(); i++) { 44 | sz = sz + sx.substring(i, i + 1) + "" + sy.substring(i, i + 1); 45 | } 46 | 47 | return Integer.parseInt(sz, 2); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/resources/README.md: -------------------------------------------------------------------------------- 1 | # About Resources 2 | 3 | Here we list the resources used in the Data Generation. There are three typical kinds of resourses here. 4 | 5 | - Dictionaries: some raw data used as dictionaries to generate data, e.g. dummy names. 6 | - Distributions: distributions that describes the degree distribution. 7 | - Parameters: the parameters that used to define some common configurations or limits in data generation process. 8 | 9 | ## Dictionaries 10 | 11 | To avoid legal problems, we generate data using dummy names. The dummy names are generated by a free tool named 12 | FauxID[1] or copied from SNB. 
13 | 14 | | Dictionary Files | Description | 15 | |------------------|-----------------------------------------------------------------| 16 | | accountTypes.txt | the account types' values generated with ChatGPT[2] | 17 | | companyNames.txt | dummy names generated by fake-company-generator[3] on Fauxid[1] | 18 | | mediumNames.txt | medium types generated with ChatGPT[2] | 19 | | surnames.txt | surnames of persons used in SNB DataGen[4] | 20 | 21 | ## Distributions 22 | 23 | The distributions will be determined based on the real financial data profiling. 24 | 25 | | Distribution Files | Description | 26 | |---------------------------------|-------------------------------------------------------------------------| 27 | | accountDelete.txt | the distribution of the account deletion used in SNB DataGen[4] | 28 | | facebookPowerlawBucket.dat | the Facebook powerlaw bucketed distribution used in SNB DataGen[4] | 29 | | hourDistribution.dat | the distribution of the hour of the day | 30 | | inDegreeRegression.txt | the inDegree distribution in profiling results | 31 | | multiplicity.txt | the multiplicity in profiling results | 32 | | outDegreeRegression.txt | the outDegree distribution in profiling results | 33 | | powerLawAcitivityDeleteDate.txt | the powerlaw distribution to generate deleteDate used in SNB DataGen[4] | 34 | 35 | ## Parameters 36 | 37 | Here are the configuration files and parameters used in data generation, including: 38 | 39 | - params_default.ini: some common/global parameters by default 40 | - scale_factors.xml: a parameter map from the scale factors to the parameters that control the data scale in 41 | generation.
42 | 43 | # Reference 44 | 45 | [1] FauxID: https://fauxid.com 46 | [2] ChatGPT: https://chat.openai.com/ 47 | [3] fake-company-generator: https://fauxid.com/tools/fake-company-generator 48 | [4] SNB DataGen: https://github.com/ldbc/ldbc_snb_datagen_spark/ 49 | -------------------------------------------------------------------------------- /src/main/resources/dictionaries/accountLevels.txt: -------------------------------------------------------------------------------- 1 | Basic level, 0.6 2 | Silver level, 0.2 3 | Gold level, 0.1 4 | Platinum level, 0.05 5 | Diamond level, 0.03 6 | Elite level, 0.02 -------------------------------------------------------------------------------- /src/main/resources/dictionaries/accountTypes.txt: -------------------------------------------------------------------------------- 1 | certificate of deposit 2 | credit card 3 | retirement account 4 | merchant account 5 | escrow account 6 | trust account 7 | foreign currency 8 | corporate account 9 | brokerage account 10 | custodial account 11 | internet account 12 | debit card 13 | prepaid card 14 | -------------------------------------------------------------------------------- /src/main/resources/dictionaries/goodsTypes.txt: -------------------------------------------------------------------------------- 1 | Food 2 | Clothing 3 | Electronics 4 | Furniture 5 | Household appliances 6 | Personal care products 7 | Sports equipment 8 | Books and media 9 | Toys and games 10 | Automotive products 11 | Beauty products 12 | Jewelry 13 | Pet supplies 14 | Health and wellness products 15 | Home decor 16 | Office supplies 17 | Tools and hardware 18 | Garden supplies 19 | Musical instruments 20 | Art and craft supplies -------------------------------------------------------------------------------- /src/main/resources/dictionaries/guaranteeRelationships.txt: -------------------------------------------------------------------------------- 1 | friends, 0.3 2 | business associate, 0.2 3 | parents, 
0.2 4 | siblings, 0.2 5 | other relatives, 0.1 6 | -------------------------------------------------------------------------------- /src/main/resources/dictionaries/loanOrganizations.txt: -------------------------------------------------------------------------------- 1 | American Express 2 | Avant 3 | Bank of America 4 | Best Egg 5 | BlueVine 6 | Capital One 7 | Chase Bank 8 | Citibank 9 | Discover 10 | Earnest 11 | Fundbox 12 | Funding Circle 13 | Fundrise 14 | Kabbage 15 | LendingClub 16 | LendingPoint 17 | LightStream 18 | Marcus by Goldman Sachs 19 | Navy Federal Credit Union 20 | OnDeck 21 | OneMain Financial 22 | PNC Bank 23 | Patch of Land 24 | Peerform 25 | Prosper 26 | RealtyMogul 27 | Rocket Loans 28 | Roofstock 29 | SoFi 30 | State Farm Bank 31 | SunTrust 32 | TD Bank 33 | US Bank 34 | Upgrade 35 | Upstart 36 | Wells Fargo 37 | -------------------------------------------------------------------------------- /src/main/resources/dictionaries/loanUsages.txt: -------------------------------------------------------------------------------- 1 | major purchases 2 | investing 3 | renovations 4 | debt consolidation 5 | business ventures 6 | education 7 | medical expenses 8 | vacations 9 | weddings 10 | funerals 11 | other -------------------------------------------------------------------------------- /src/main/resources/dictionaries/mediumNames.txt: -------------------------------------------------------------------------------- 1 | POS 2 | ATM 3 | WIFI 4 | PHONE 5 | IPv4 6 | IPv6 7 | MAC 8 | QRCode 9 | NFC 10 | RFID -------------------------------------------------------------------------------- /src/main/resources/dictionaries/payTypes.txt: -------------------------------------------------------------------------------- 1 | PayPal 2 | Apple Pay 3 | Google Pay 4 | Alipay 5 | WeChat Pay 6 | Venmo 7 | Cash App 8 | Bank Transfer 9 | Gift Card 10 | Cryptocurrency 11 | Money Order 12 | Cheque 13 | Direct Deposit 14 | E-wallets 15 | Mobile Carrier Billing 16 | 
Invoice Factoring 17 | Payment Plan -------------------------------------------------------------------------------- /src/main/resources/dictionaries/riskLevels.txt: -------------------------------------------------------------------------------- 1 | Low risk 2 | Moderate risk 3 | High risk 4 | Very high risk 5 | Extreme risk 6 | Minimal risk 7 | Significant risk 8 | Severe risk 9 | Critical risk 10 | -------------------------------------------------------------------------------- /src/main/resources/distributions/facebookPowerlawBucket.dat: -------------------------------------------------------------------------------- 1 | 0 1 1 2 | 0 1 2 3 | 0 1.5 3 4 | 1.5 2.2 4 5 | 2.2 3.55 5 6 | 3.55 4.37 6 7 | 4.37 5.37 7 8 | 5.37 6.61 8 9 | 6.61 8.13 9 10 | 8.13 10 10 11 | 10 11.22 11 12 | 11.22 12 12 13 | 12 14 13 14 | 14 16 14 15 | 16 17 15 16 | 17 19 16 17 | 19 20 17 18 | 20 22 18 19 | 22 23 19 20 | 23 25 20 21 | 25 26 21 22 | 26 28 22 23 | 28 30 23 24 | 30 31 24 25 | 31 33 25 26 | 33 35 26 27 | 35 36 27 28 | 36 38 28 29 | 38 40 29 30 | 40 42 30 31 | 42 44 31 32 | 44 46 32 33 | 46 49 33 34 | 49 51 34 35 | 51 54 35 36 | 54 56 36 37 | 56 59 37 38 | 59 61 38 39 | 61 64 39 40 | 64 66 40 41 | 66 69 41 42 | 69 72 42 43 | 72 75 43 44 | 75 78 44 45 | 78 82 45 46 | 82 85 46 47 | 85 88 47 48 | 88 92 48 49 | 92 95 49 50 | 95 99 50 51 | 99 102 51 52 | 102 106 52 53 | 106 110 53 54 | 110 113 54 55 | 113 117 55 56 | 117 122 56 57 | 122 126 57 58 | 126 130 58 59 | 130 135 59 60 | 135 139 60 61 | 139 144 61 62 | 144 149 62 63 | 149 154 63 64 | 154 160 64 65 | 160 166 65 66 | 166 172 66 67 | 172 180 67 68 | 180 188 68 69 | 188 196 69 70 | 196 204 70 71 | 204 211 71 72 | 211 217 72 73 | 217 223 73 74 | 223 229 74 75 | 229 236 75 76 | 236 243 76 77 | 243 252 77 78 | 252 261 78 79 | 261 272 79 80 | 272 283 80 81 | 283 295 81 82 | 295 307 82 83 | 307 320 83 84 | 320 334 84 85 | 334 349 85 86 | 349 365 86 87 | 365 383 87 88 | 383 402 88 89 | 402 423 89 90 | 423 447 90 91 | 447 478 91 92 | 478 
519 92 93 | 519 570 93 94 | 570 623 94 95 | 623 674 95 96 | 674 723 96 97 | 723 781 97 98 | 781 863 98 99 | 863 1029 99 100 | 1029 5000 100 -------------------------------------------------------------------------------- /src/main/resources/distributions/hourDistribution.dat: -------------------------------------------------------------------------------- 1 | 0.2593526012653495 8 2 | 0.05160376982202897 11 3 | 0.048674656913491814 10 4 | 0.0484494454801552 12 5 | 0.04783513640895614 17 6 | 0.04541527278103532 18 7 | 0.04535949212906142 21 8 | 0.04509153664720135 9 9 | 0.04395123499873539 16 10 | 0.04118797695085037 15 11 | 0.04104902499552937 20 12 | 0.04088250920364632 19 13 | 0.04061096101271552 22 14 | 0.04058332383752758 13 15 | 0.03985351173213859 14 16 | 0.03255691753542388 7 17 | 0.028262704185304518 23 18 | 0.01786883161088602 0 19 | 0.01548368454454632 6 20 | 0.008970635808854882 1 21 | 0.005209140099636779 5 22 | 0.0051042119651415006 2 23 | 0.0035019847725102494 3 24 | 0.0031414352992730145 4 -------------------------------------------------------------------------------- /src/main/resources/distributions/inDegreeRegression.txt: -------------------------------------------------------------------------------- 1 | # Sample139, Sample184, Sample177 2 | alpha: 109539041.821,78379700.038,133908623.887 3 | beta: -2.319,-2.319,-2.085 -------------------------------------------------------------------------------- /src/main/resources/distributions/multiplicityPowerlawRegression.txt: -------------------------------------------------------------------------------- 1 | # Sample Hub Vertex 2 | alpha: 27230469.375668973, 1141214.9408893671, 156038.86238756854 3 | beta: -1.573305530151192, -1.488771851133902, -1.1012676833313366 4 | average: 1.759, 1.786, 1.969 -------------------------------------------------------------------------------- /src/main/resources/distributions/outDegreeRegression.txt: 
-------------------------------------------------------------------------------- 1 | # Sample139, Sample184, Sample177 2 | alpha: 20186572.914,14153912.686,20194472.855 3 | beta: -1.720,-1.719,-1.720 -------------------------------------------------------------------------------- /src/main/resources/distributions/powerLawActivityDeleteDate.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 0.17 3 | 0.22 4 | 0.38 5 | 0.46 6 | 0.54 7 | 0.586 8 | 0.61 9 | 0.652 10 | 0.752 11 | 0.812 12 | 0.891 13 | 0.949 14 | 0.973 15 | 0.986 16 | 0.994 17 | 0.998 18 | 1 19 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, file 2 | 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.console.layout.ConversionPattern=%d{ISO8601} [%t] %-5p %c - %m%n 6 | 7 | log4j.appender.file=org.apache.log4j.FileAppender 8 | log4j.appender.file.File=/tmp/spark-events/spark.log 9 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 10 | log4j.appender.file.layout.ConversionPattern=%d{ISO8601} [%t] %-5p %c - %m%n -------------------------------------------------------------------------------- /src/main/resources/params_default.ini: -------------------------------------------------------------------------------- 1 | spark.blockSize:10000 2 | 3 | generator.defaultSeed:23 4 | generator.outputDir:out 5 | generator.startYear:2020 6 | generator.numYears:3 7 | generator.activityDelta:60000 8 | generator.deleteDelta:86400000 9 | generator.numUpdateStreams:1 10 | 11 | company.maxDescriptionLength:200 12 | 13 | account.blockedAccountRatio:0.05 14 | 15 | medium.blockedMediumRatio:0.05 16 | 17 | own.maxAccounts:5 18 | 19 | transfer.degreeDistribution:powerlaw 20 | 
transfer.multiplicityDistribution:powerlaw 21 | transfer.minMultiplicity:1 22 | transfer.maxMultiplicity:100 23 | transfer.maxAmount:10000000 24 | # not used any more 25 | transfer.baseProbCorrelated:0.99 26 | transfer.limitProCorrelated:0.5 27 | transfer.generationMode:loose 28 | transfer.shuffleTimes:1 29 | 30 | withdraw.accountWithdrawFraction:0.3 31 | withdraw.maxWithdrawals:30 32 | withdraw.maxAmount:10000000 33 | 34 | signIn.accountSignedInFraction:1.0 35 | signIn.maxAccountToSignIn:4 36 | signIn.maxMultiplicity:10 37 | 38 | guarantee.personGuaranteeFraction:0.6 39 | guarantee.companyGuaranteeFraction:0.6 40 | guarantee.maxTargetsToGuarantee:3 41 | 42 | loan.personLoanFraction:0.6 43 | loan.companyLoanFraction:0.6 44 | loan.involvedAccountsFraction:0.8 45 | loan.maxLoans:5 46 | loan.minLoanAmount:10000 47 | loan.maxLoanAmount:100000000 48 | loan.numSubEvents:10 49 | loan.maxLoanInterest:0.1 50 | 51 | invest.companyInvestedFraction:1.0 52 | invest.minInvestors:1 53 | invest.maxInvestors:5 -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/factors/AccountItemsGenerator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen.factors 18 | 19 | import org.apache.spark.sql.{SparkSession, functions => F} 20 | import org.apache.spark.sql.functions.max 21 | import org.apache.spark.sql.functions.lit 22 | 23 | object AccountItemsGenerator { 24 | def generateAccountItems(implicit spark: SparkSession): Unit = { 25 | import spark.implicits._ 26 | 27 | val accountRDD = spark.read 28 | .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat") 29 | .option("header", "true") 30 | .option("delimiter", "|") 31 | .load("./out/raw/account/*.csv") 32 | 33 | val transferRDD = spark.read 34 | .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat") 35 | .option("header", "true") 36 | .option("delimiter", "|") 37 | .load("./out/raw/transfer/*.csv") 38 | 39 | val withdrawRDD = spark.read 40 | .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat") 41 | .option("header", "true") 42 | .option("delimiter", "|") 43 | .load("./out/raw/withdraw/*.csv") 44 | 45 | val combinedRDD = transferRDD 46 | .select($"fromId", $"toId", $"amount".cast("double")) 47 | .union(withdrawRDD.select($"fromId", $"toId", $"amount".cast("double"))) 48 | 49 | val maxAmountRDD = combinedRDD 50 | .groupBy($"fromId", $"toId") 51 | .agg(max($"amount").alias("maxAmount")) 52 | 53 | val accountItemsRDD = maxAmountRDD 54 | .groupBy($"fromId") 55 | .agg(F.collect_list(F.array($"toId", $"maxAmount")).alias("items")) 56 | .select($"fromId".alias("account_id"), $"items") 57 | .sort($"account_id") 58 | 59 | val transformedAccountItemsRDD = accountItemsRDD 60 | .withColumn( 61 | "items", 62 | F.expr( 63 | "transform(items, array -> concat('[', concat_ws(',', array), ']'))" 64 | ) 65 | ) 66 | .withColumn( 67 | "items", 68 | F.concat_ws(",", $"items") 69 | ) 70 | .withColumn( 71 | "items", 72 | F.concat(lit("["), $"items", lit("]")) 73 | ) 74 | 75 | transformedAccountItemsRDD 76 | .coalesce(1) 77 | .write 78 | .option("header", "true") 79 | .option("delimiter", 
"|") 80 | .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat") 81 | .save("./out/factor_table/account_items") 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/factors/FactorTable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.factors 18 | 19 | import ldbc.finbench.datagen.model.{Graph, GraphDef, Mode} 20 | import org.apache.spark.sql.DataFrame 21 | 22 | case class FactorTable[M <: Mode]( 23 | name: String, 24 | data: DataFrame, 25 | source: Graph[M] 26 | ) 27 | 28 | case class FactorTableDef[M <: Mode]( 29 | name: String, 30 | sourceDef: GraphDef[M] 31 | ) 32 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/generation/generators/SparkAccountGenerator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.generation.generators 18 | 19 | import ldbc.finbench.datagen.config.DatagenConfiguration 20 | import ldbc.finbench.datagen.entities.nodes.Account 21 | import ldbc.finbench.datagen.generation.{DatagenContext, DatagenParams} 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.sql.SparkSession 24 | 25 | import scala.collection.JavaConverters.asScalaIteratorConverter 26 | 27 | // SparkAccountGenerator is not used to generate account data directly. 28 | object SparkAccountGenerator { 29 | // def apply(conf: DatagenConfiguration, numPartitions: Option[Int] = None)( 30 | // implicit spark: SparkSession): RDD[Account] = { 31 | // val numAccounts = 10000 32 | // 33 | // val accountPartitionGenerator = (blocks: Iterator[Long]) => { 34 | // DatagenContext.initialize(conf) 35 | // val accountGenerator = new AccountGenerator() 36 | // for { 37 | // i <- blocks 38 | // size = Math.min(numAccounts - DatagenParams.blockSize * i, DatagenParams.blockSize) 39 | // account <- accountGenerator. 
40 | // } yield account 41 | // } 42 | // val numAccountBlocks = Math.ceil(numAccounts / DatagenParams.blockSize.toDouble).toInt 43 | // val partitions = numPartitions.getOrElse(spark.sparkContext.defaultParallelism) 44 | // val accountRdd = spark.sparkContext 45 | // .range(0, numAccountBlocks, step = 1, numSlices = partitions) 46 | // .mapPartitions(accountPartitionGenerator) 47 | // 48 | // accountRdd 49 | // } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/generation/generators/SparkCompanyGenerator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen.generation.generators 18 | 19 | import ldbc.finbench.datagen.config.DatagenConfiguration 20 | import ldbc.finbench.datagen.entities.nodes.Company 21 | import ldbc.finbench.datagen.generation.DatagenContext 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.sql.SparkSession 24 | 25 | import scala.collection.JavaConverters.asScalaIteratorConverter 26 | 27 | object SparkCompanyGenerator { 28 | def apply( 29 | numCompanies: Long, 30 | config: DatagenConfiguration, 31 | blockSize: Int 32 | )(implicit spark: SparkSession): RDD[Company] = { 33 | val numBlocks = Math.ceil(numCompanies / blockSize.toDouble).toInt 34 | val partitions = Math.min(numBlocks, spark.sparkContext.defaultParallelism) 35 | 36 | spark.sparkContext 37 | .range(0, numBlocks, step = 1, numSlices = partitions) 38 | .mapPartitions { blocks => 39 | DatagenContext.initialize(config) 40 | val companyGenerator = new CompanyGenerator() 41 | 42 | blocks.flatMap { i => 43 | val size = Math.min(numCompanies - blockSize * i, blockSize) 44 | companyGenerator 45 | .generateCompanyBlock(i.toInt, blockSize) 46 | .asScala 47 | .take(size.toInt) 48 | } 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/generation/generators/SparkMediumGenerator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.generation.generators 18 | 19 | import ldbc.finbench.datagen.config.DatagenConfiguration 20 | import ldbc.finbench.datagen.entities.nodes.Medium 21 | import ldbc.finbench.datagen.generation.DatagenContext 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.sql.SparkSession 24 | 25 | import scala.collection.JavaConverters.asScalaIteratorConverter 26 | 27 | object SparkMediumGenerator { 28 | def apply( 29 | numMedia: Long, 30 | config: DatagenConfiguration, 31 | blockSize: Int 32 | )(implicit spark: SparkSession): RDD[Medium] = { 33 | val numBlocks = Math.ceil(numMedia / blockSize.toDouble).toInt 34 | val partitions = Math.min(numBlocks, spark.sparkContext.defaultParallelism) 35 | 36 | spark.sparkContext 37 | .range(0, numBlocks, step = 1, numSlices = partitions) 38 | .mapPartitions { blocks => 39 | DatagenContext.initialize(config) 40 | val mediumGenerator = new MediumGenerator() 41 | 42 | blocks.flatMap { i => 43 | val size = Math.min(numMedia - blockSize * i, blockSize) 44 | mediumGenerator 45 | .generateMediumBlock(i.toInt, blockSize) 46 | .asScala 47 | .take(size.toInt) 48 | } 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/generation/generators/SparkPersonGenerator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the 
Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.generation.generators 18 | 19 | import ldbc.finbench.datagen.config.DatagenConfiguration 20 | import ldbc.finbench.datagen.entities.nodes.Person 21 | import ldbc.finbench.datagen.generation.DatagenContext 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.sql.SparkSession 24 | 25 | import scala.collection.JavaConverters.asScalaIteratorConverter 26 | 27 | object SparkPersonGenerator { 28 | def apply(numPersons: Long, config: DatagenConfiguration, blockSize: Int)( 29 | implicit spark: SparkSession 30 | ): RDD[Person] = { 31 | val numBlocks = Math.ceil(numPersons / blockSize.toDouble).toInt 32 | val partitions = Math.min(numBlocks, spark.sparkContext.defaultParallelism) 33 | 34 | spark.sparkContext 35 | .range(0, numBlocks, step = 1, numSlices = partitions) 36 | .mapPartitions { blocks => 37 | DatagenContext.initialize(config) 38 | val personGenerator = new PersonGenerator() 39 | 40 | blocks.flatMap { i => 41 | val size = Math.min(numPersons - blockSize * i, blockSize) 42 | personGenerator 43 | .generatePersonBlock(i.toInt, blockSize) 44 | .asScala 45 | .take(size.toInt) 46 | } 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/io/Reader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked 
Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.io 18 | 19 | trait Reader[T] { 20 | type Ret 21 | 22 | def read(self: T): Ret 23 | def exists(self: T): Boolean 24 | } 25 | 26 | object Reader { 27 | type Aux[T, R] = Reader[T] { type Ret = R } 28 | 29 | def apply[T, R](implicit r: Reader.Aux[T, R]): Reader.Aux[T, R] = implicitly[Reader.Aux[T, R]] 30 | 31 | trait ReaderOps[T] { 32 | type Ret 33 | def tcInstance: Reader.Aux[T, Ret] 34 | def self: T 35 | def read: Ret = tcInstance.read(self) 36 | } 37 | 38 | object ReaderOps { 39 | type Aux[T, R] = ReaderOps[T] { type Ret = R } 40 | } 41 | 42 | object ops { 43 | import scala.language.implicitConversions 44 | implicit def toReaderOps[T, R](target: T)(implicit tc: Reader.Aux[T, R]): ReaderOps.Aux[T, R] = 45 | new ReaderOps[T] { 46 | override type Ret = R 47 | override def tcInstance: Aux[T, R] = tc 48 | override def self: T = target 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/io/Writer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the 
License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.io 18 | 19 | trait Writer[S] { 20 | type Data 21 | // def write(self: Data, sink: S): Unit 22 | } 23 | 24 | object Writer { 25 | type Aux[S, D] = Writer[S] { type Data = D } 26 | def apply[S, D](implicit r: Writer.Aux[S, D]): Writer.Aux[S, D] = implicitly[Writer.Aux[S, D]] 27 | 28 | trait WriterOps[Data] { 29 | type Sink 30 | def tcInstance: Writer.Aux[Sink, Data] 31 | def self: Data 32 | } 33 | 34 | object WriterOps { 35 | type Aux[Data, S] = WriterOps[Data] { type Sink = S } 36 | } 37 | 38 | object ops { 39 | import scala.language.implicitConversions 40 | implicit def toWriterOps[Data, S](target: Data)( 41 | implicit tc: Writer.Aux[S, Data]): WriterOps.Aux[Data, S] = new WriterOps[Data] { 42 | override type Sink = S 43 | override def tcInstance: Aux[S, Data] = tc 44 | override def self: Data = target 45 | } 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/io/dataframes.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.io 18 | 19 | import java.net.URI 20 | 21 | import org.apache.hadoop.fs.{FileSystem, Path} 22 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} 23 | import org.apache.spark.sql.types.StructType 24 | 25 | object dataframes { 26 | 27 | case class DataFrameSource( 28 | path: String, 29 | format: String, 30 | formatOptions: Map[String, String] = Map.empty, 31 | schema: Option[StructType] = None 32 | ) 33 | 34 | private class DataFrameReader(implicit spark: SparkSession) extends Reader[DataFrameSource] { 35 | override type Ret = DataFrame 36 | 37 | override def read(self: DataFrameSource): DataFrame = { 38 | spark.read 39 | .format(self.format) 40 | .options(self.formatOptions) 41 | .schema(self.schema.get) 42 | .load(self.path) 43 | } 44 | 45 | override def exists(self: DataFrameSource): Boolean = { 46 | val hadoopPath = new Path(self.path) 47 | val fs = FileSystem.get(URI.create(self.path), spark.sparkContext.hadoopConfiguration) 48 | fs.exists(hadoopPath) 49 | } 50 | } 51 | 52 | trait ReaderInstances { 53 | implicit def dataFrameReader( 54 | implicit spark: SparkSession): Reader.Aux[DataFrameSource, DataFrame] = 55 | new DataFrameReader 56 | } 57 | 58 | case class DataFrameSink(path: String, 59 | format: String, 60 | formatOptions: Map[String, String] = Map.empty, 61 | mode: SaveMode = SaveMode.ErrorIfExists, 62 | partitionBy: Seq[String] = Seq.empty) 63 | 64 | private object DataFrameWriter extends Writer[DataFrameSink] { 65 | override type Data = DataFrame 66 | } 
67 | 68 | trait WriterInstances { 69 | implicit val dataFrameWriter: Writer.Aux[DataFrameSink, DataFrame] = DataFrameWriter 70 | } 71 | 72 | trait Instances extends WriterInstances with ReaderInstances 73 | 74 | object instances extends Instances 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/io/graphs.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.io 18 | 19 | import ldbc.finbench.datagen.model.Mode.Raw.Layout 20 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} 21 | import org.slf4j.{Logger, LoggerFactory} 22 | 23 | import scala.reflect.internal.Mode 24 | 25 | object graphs { 26 | 27 | case class GraphSink( 28 | path: String, 29 | format: String, 30 | formatOptions: Map[String, String] = Map.empty, 31 | saveMode: SaveMode = SaveMode.ErrorIfExists 32 | ) 33 | 34 | case class GraphSource[M <: Mode](implicit spark: SparkSession, en: DataFrame =:= Layout) 35 | extends Reader[GraphSource[M]] { 36 | @transient lazy val log: Logger = LoggerFactory.getLogger(this.getClass) 37 | 38 | override type Ret = this.type 39 | 40 | override def read(self: GraphSource[M]): GraphSource.this.type = ??? 
41 | 42 | override def exists(self: GraphSource[M]): Boolean = ??? 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/io/raw/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.io 18 | 19 | import org.apache.spark.sql.SaveMode 20 | 21 | package object raw { 22 | 23 | sealed trait RawFormat 24 | case object Csv extends RawFormat { override def toString = "org.apache.spark.sql.execution.datasources.csv.CSVFileFormat" } 25 | case object Parquet extends RawFormat { override def toString = "parquet" } 26 | 27 | case class RawSink( 28 | outputDir: String, 29 | format: RawFormat, 30 | partitions: Option[Int] = None, 31 | formatOptions: Map[String, String] = Map.empty, 32 | mode: SaveMode = SaveMode.ErrorIfExists, 33 | partitionBy: Seq[String] = Seq.empty 34 | ) 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/syntax/FluentSyntax.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, 
Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.syntax 18 | 19 | import scala.language.implicitConversions 20 | 21 | trait FluentSyntax { 22 | @`inline` implicit final def fluentSyntaxOps[A](a: A) = new FluentSyntaxOps(a) 23 | } 24 | 25 | final class FluentSyntaxOps[A](private val self: A) extends AnyVal { 26 | 27 | /** Fluent syntax for folding with self as the base item. 28 | */ 29 | def pipeFoldLeft[F](foldable: TraversableOnce[F])(op: (A, F) => A): A = { 30 | foldable.foldLeft(self)(op) 31 | } 32 | 33 | /** Fluent syntax for applying a function on self. d 34 | */ 35 | def pipe[R](f: A => R): R = f(self) 36 | 37 | /** Fluent syntax for applying a side-effect on self. 38 | */ 39 | def tap(f: A => Unit): A = { 40 | f(self) 41 | self 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/syntax/PathSyntax.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.syntax 18 | 19 | import org.apache.hadoop.fs.Path 20 | 21 | import java.net.URI 22 | import scala.language.implicitConversions 23 | 24 | trait PathSyntax { 25 | @`inline` implicit final def pathSyntaxOpsForString[A](a: String): PathSyntaxOpsForString = new PathSyntaxOpsForString(a) 26 | @`inline` implicit final def pathSyntaxOpsForPath[A](a: Path): PathSyntaxOpsForPath = new PathSyntaxOpsForPath(a) 27 | @`inline` implicit final def pathSyntaxOpsForUri[A](a: URI): PathSyntaxOpsForUri = new PathSyntaxOpsForUri(a) 28 | } 29 | 30 | final class PathSyntaxOpsForString(private val self: String) extends AnyVal { 31 | import PathSyntaxOpsHelpers._ 32 | def /(child: String): Path = join(new Path(self), new Path(child)) 33 | def /(child: Path): Path = join(new Path(self), child) 34 | } 35 | 36 | final class PathSyntaxOpsForPath(private val self: Path) extends AnyVal { 37 | import PathSyntaxOpsHelpers._ 38 | def /(child: String): Path = join(self, new Path(child)) 39 | def /(child: Path): Path = join(self, child) 40 | } 41 | 42 | final class PathSyntaxOpsForUri(private val self: URI) extends AnyVal { 43 | import PathSyntaxOpsHelpers._ 44 | def /(child: String): Path = join(new Path(self), new Path(child)) 45 | def /(child: Path): Path = join(new Path(self), child) 46 | } 47 | 48 | private[syntax] object PathSyntaxOpsHelpers { 49 | def join(path1: Path, path2: Path): Path = new Path(ensureTrailingSlashForAbsoluteUri(path1), path2) 50 | 51 | private[this] def 
ensureTrailingSlashForAbsoluteUri(path: Path): Path = { 52 | if (path.isAbsolute) 53 | return path 54 | 55 | val uri = path.toUri 56 | 57 | if (uri.getScheme == null || uri.getPath != "") 58 | return path 59 | 60 | new Path(new URI(uri.getScheme, uri.getAuthority, "/", uri.getQuery, uri.getFragment)) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/syntax/SparkSqlSyntax.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen.syntax 18 | 19 | import org.apache.spark.sql.{Column, ColumnName, DataFrame, Dataset} 20 | 21 | import scala.language.implicitConversions 22 | 23 | trait SparkSqlSyntax { 24 | @`inline` implicit final def datasetOps[A](a: Dataset[A]) = new DatasetOps(a) 25 | @`inline` implicit final def stringToColumnOps[A](a: StringContext) = new StringToColumnOps(a) 26 | } 27 | 28 | final class DatasetOps[A](private val self: Dataset[A]) extends AnyVal { 29 | def |+|(other: Dataset[A]): Dataset[A] = self union other 30 | 31 | def select(columns: Seq[Column]): DataFrame = self.select(columns: _*) 32 | 33 | def partition(expr: Column): (Dataset[A], Dataset[A]) = { 34 | val df = self.cache() 35 | (df.filter(expr), df.filter(!expr || expr.isNull)) 36 | } 37 | } 38 | 39 | final class StringToColumnOps(private val sc: StringContext) extends AnyVal { 40 | def $(args: Any*): ColumnName = new ColumnName(sc.s(args: _*)) 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/syntax/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen 18 | 19 | package object syntax extends SparkSqlSyntax with FluentSyntax with PathSyntax -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/util/Logging.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.util 18 | 19 | import org.slf4j.{Logger, LoggerFactory} 20 | 21 | trait Logging { 22 | @transient lazy val log: Logger = LoggerFactory.getLogger(this.getClass) 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/util/SparkApp.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.util 18 | 19 | import ldbc.finbench.datagen.entities.edges._ 20 | import ldbc.finbench.datagen.entities.nodes._ 21 | import org.apache.spark.SparkConf 22 | import org.apache.spark.sql.SparkSession 23 | 24 | trait SparkApp { 25 | def appName: String 26 | 27 | type ArgsType 28 | 29 | /** execute the data generation process 30 | */ 31 | def run(args: ArgsType): Unit 32 | 33 | /** set the {@link SparkConf} 34 | */ 35 | val sparkConf = setConf(new SparkConf(), defaultSparkConf) 36 | 37 | /** spark entry {@link SparkSession} 38 | */ 39 | implicit def spark: SparkSession = 40 | SparkSession 41 | .builder() 42 | .master("local") 43 | .appName(appName) 44 | .config(sparkConf) 45 | .getOrCreate() 46 | 47 | private def applySparkConf(sparkConf: Map[String, String])( 48 | builder: SparkSession.Builder 49 | ) = 50 | sparkConf.foldLeft(builder) { case (b, (k, v)) => b.config(k, v) } 51 | 52 | def setConf(sparkConf: SparkConf, conf: Map[String, String]): SparkConf = { 53 | conf.map(entry => { 54 | if (!sparkConf.contains(entry._1)) { 55 | sparkConf.set(entry._1, entry._2) 56 | } 57 | }) 58 | registerKyroClasses(sparkConf) 59 | } 60 | 61 | def registerKyroClasses(sparkConf: SparkConf): SparkConf = { 62 | // register kryo classes for nodes 63 | sparkConf.registerKryoClasses( 64 | Array( 65 | classOf[Account], 66 | classOf[Company], 67 | classOf[Loan], 68 | classOf[Medium], 69 | classOf[Person] 70 | ) 71 | ) 72 | // register kryo classes for edges 73 | sparkConf.registerKryoClasses( 74 | Array( 
75 | classOf[CompanyApplyLoan], 76 | classOf[CompanyGuaranteeCompany], 77 | classOf[CompanyInvestCompany], 78 | classOf[CompanyOwnAccount], 79 | classOf[PersonApplyLoan], 80 | classOf[PersonGuaranteePerson], 81 | classOf[PersonInvestCompany], 82 | classOf[PersonOwnAccount], 83 | classOf[Repay], 84 | classOf[SignIn], 85 | classOf[Transfer], 86 | classOf[Withdraw] 87 | ) 88 | ) 89 | sparkConf 90 | } 91 | 92 | def defaultSparkConf: Map[String, String] = Map( 93 | "spark.sql.session.timeZone" -> "GMT", 94 | "spark.sql.sources.useV1SourceList" -> "csv" 95 | ) 96 | 97 | protected lazy val env: SparkEnv = new SparkEnv 98 | 99 | } 100 | 101 | trait DatagenStage extends SparkApp { 102 | override val appName: String = 103 | s"LDBC Finbench Datagen for Spark: ${this.getClass.getSimpleName.stripSuffix("$")}" 104 | } 105 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/util/SparkEnv.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen.util 18 | 19 | import org.apache.spark.sql.SparkSession 20 | 21 | import scala.collection.JavaConverters._ 22 | 23 | class SparkEnv(implicit spark: SparkSession) { 24 | private val sysenv = System.getenv().asScala 25 | private val invalidChars = raw"[.-]" 26 | 27 | def env(key: String): Option[String] = { 28 | sysenv 29 | .get(s"LDBC_FINBENCH_DATAGEN_${camelToUpper(key.replaceAll(invalidChars, "_"))}") 30 | .orElse(spark.conf.getOption(s"spark.ldbc.finbench.datagen.$key")) 31 | } 32 | 33 | val irFormat = env("irFormat").getOrElse("parquet") 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/util/SparkUI.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen.util 18 | 19 | import org.apache.spark.sql.SparkSession 20 | import scala.concurrent.{Future, Await} 21 | import scala.concurrent.ExecutionContext.Implicits.global 22 | 23 | object SparkUI { 24 | def job[T](jobGroup: String, jobDescription: String)(action: => T)( 25 | implicit spark: SparkSession): T = { 26 | spark.sparkContext.setJobGroup(jobGroup, jobDescription) 27 | try { 28 | action 29 | } finally { 30 | spark.sparkContext.clearJobGroup() 31 | } 32 | } 33 | 34 | def jobAsync(jobGroup: String, jobDescription: String)(action: => Unit)( 35 | implicit spark: SparkSession): Future[Unit] = { 36 | spark.sparkContext.setJobGroup(jobGroup, jobDescription) 37 | val future = Future { 38 | action 39 | } 40 | future.onComplete { _ => 41 | spark.sparkContext.clearJobGroup() 42 | } 43 | future 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/util/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen 18 | 19 | import java.util.function.IntFunction 20 | 21 | import com.google.common.base.CaseFormat 22 | 23 | import scala.reflect.ClassTag 24 | 25 | package object util { 26 | def arrayOfSize[A: ClassTag] = new IntFunction[Array[A]] { 27 | override def apply(value: Int) = new Array[A](value) 28 | } 29 | 30 | def simpleNameOf[T: ClassTag] = implicitly[ClassTag[T]].runtimeClass.getSimpleName 31 | 32 | def pascalToCamel(str: String) = CaseFormat.UPPER_CAMEL.to(CaseFormat.LOWER_CAMEL, str) 33 | 34 | def camelToUpper(str: String) = CaseFormat.LOWER_CAMEL.to(CaseFormat.UPPER_UNDERSCORE, str) 35 | 36 | def lower(str: String) = str.toLowerCase 37 | } 38 | -------------------------------------------------------------------------------- /src/test/java/ldbc/finbench/datagen/generators/GeneratorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen.generators; 18 | 19 | import java.util.Map; 20 | import java.util.Random; 21 | import ldbc.finbench.datagen.config.ConfigParser; 22 | import ldbc.finbench.datagen.config.DatagenConfiguration; 23 | import ldbc.finbench.datagen.entities.nodes.Person; 24 | import ldbc.finbench.datagen.generation.DatagenContext; 25 | import ldbc.finbench.datagen.generation.dictionary.Dictionaries; 26 | import ldbc.finbench.datagen.generation.generators.PersonGenerator; 27 | import org.junit.Test; 28 | 29 | public class GeneratorTest { 30 | Map config; 31 | 32 | public GeneratorTest() { 33 | config = ConfigParser.readConfig("src/main/resources/params_default.ini"); 34 | config.putAll(ConfigParser.scaleFactorConf("", "0.1")); // use scale factor 0.1 35 | DatagenContext.initialize(new DatagenConfiguration(config)); 36 | } 37 | 38 | @Test 39 | public void testPersonGenerator() { 40 | PersonGenerator personGenerator = new PersonGenerator(); 41 | Person person = personGenerator.generatePerson(); 42 | assert null != person; 43 | } 44 | 45 | @Test 46 | public void testDatagenContext() { 47 | Random random = new Random(); 48 | System.out.println(Dictionaries.personNames.getUniformDistRandName(random)); 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/test/java/ldbc/finbench/datagen/util/GeneralTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.util; 18 | 19 | import java.util.Map; 20 | import ldbc.finbench.datagen.config.ConfigParser; 21 | import org.junit.Test; 22 | 23 | public class GeneralTest { 24 | 25 | @Test 26 | public void testConfigParser() { 27 | Map config = ConfigParser.readConfig("src/main/resources/params_default.ini"); 28 | System.out.println(config); 29 | assert config.size() > 0; 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/test/scala/ldbc/finbench/datagen/util/UtilPackageSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen.util 18 | 19 | import org.scalatest.BeforeAndAfterAll 20 | import org.scalatest.funsuite.AnyFunSuite 21 | 22 | class UtilPackageSuite extends AnyFunSuite with BeforeAndAfterAll { 23 | 24 | test("simpleNameOf") { 25 | val simpleName = simpleNameOf[String] 26 | assert(simpleName.equals("String")) 27 | } 28 | 29 | test("pascalToCamel") { 30 | val actualResult = pascalToCamel("PersonInvestCompany") 31 | val expectResult = "personInvestCompany" 32 | assert(actualResult.equals(expectResult)) 33 | 34 | val actualEmptyResult = pascalToCamel("") 35 | val expectEmptyResult = "" 36 | assert(actualEmptyResult.equals(expectEmptyResult)) 37 | 38 | assertThrows[NullPointerException](pascalToCamel(null)) 39 | } 40 | 41 | test("camelToUpper") { 42 | val actualResult = camelToUpper("hasTag") 43 | val expectResult = "HAS_TAG" 44 | assert(actualResult.equals(expectResult)) 45 | 46 | val actualEmtpyResult = camelToUpper("") 47 | val expectEmptyResult = "" 48 | assert(actualEmtpyResult.equals(expectEmptyResult)) 49 | 50 | assertThrows[NullPointerException](camelToUpper(null)) 51 | } 52 | 53 | test("lower") { 54 | val actualResult = lower("fasFSsfja_SFASJFA") 55 | val expectResult = "fasfssfja_sfasjfa" 56 | assert(actualResult.equals(expectResult)) 57 | assertThrows[NullPointerException](lower(null)) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # Tools 2 | 3 | - paramgen: 4 | - parameter_curation: a tool for generating parameters for finbench queries 5 | - check_*.py: python scripts used for check the data features like consistency, distribution 6 | - merge_cluster_output.py: a python script to merge the output in cluster mode 7 | - statistic.py: a python script to calculate the statistics of the data 8 | - legacy: some legacy tools 9 | - dataprofiler: a tool for profiling graph data, 
including degree distribution, etc. 10 | - graphgen: a simple tool/example code to generate power-law distributed graph data. 11 | - factorgen: factor table generators in python version 12 | 13 | 14 | ## ParamsGen 15 | 16 | `params_gen.py` uses the CREATE_VALIDATION feature to generate parameters. 17 | 18 | The specific steps are as follows: 19 | 20 | 1. Select vertices of type Account, Person, and Loan from the dataset, and generate a parameter file that meets the input specifications for ldbc_finbench_driver. 21 | 2. Execute CREATE_VALIDATION to generate validation_params.csv. 22 | 3. Select non-empty results from validation_params.csv. 23 | 24 | Example: 25 | 26 | ```bash 27 | python3 params_gen.py 1 # gen tcr1 params 28 | ``` 29 | 30 | Other notes: 31 | 32 | 1. The generated start_timestamp and end_timestamp in the current version are fixed values. 33 | 2. For tcr4 and tcr10, this method is not efficient enough. Use the following Cypher query to search for parameters: 34 | 35 | ```Cypher 36 | // tcr4 37 | MATCH 38 | (n1:Account)-[:transfer]-> 39 | (n2:Account)-[:transfer]-> 40 | (n3:Account)-[:transfer]->(n4:Account) 41 | WHERE 42 | n1.id = n4.id AND n1.id > n2.id AND n2.id > n3.id 43 | WITH 44 | n1.id as n1id, 45 | n2.id as n2id, 46 | n3.id as n3id, 47 | n4.id as n4id 48 | LIMIT 1000 49 | RETURN DISTINCT toString(n1id)+"|"+toString(n2id) 50 | 51 | // tcr10 52 | MATCH 53 | (c:Company)<-[:invest]-(p:Person) 54 | WITH 55 | c.id as cid, 56 | count(p.id) as num, 57 | collect(p.id) as person 58 | WHERE num >= 2 59 | RETURN 60 | tostring(person[0])+"|"+tostring(person[1]) 61 | LIMIT 1000 62 | ``` 63 | -------------------------------------------------------------------------------- /tools/check_consistency.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | import sys 4 | import glob 5 | 6 | print_templ = "| {} | {} | {} | {} |" 7 | 8 | 9 | def get_md5_list(subdir, dir): 10 | md5_list = [] 11 | csvs 
= glob.glob("{}/{}/*.csv".format(dir, subdir)) 12 | for csv in csvs: 13 | with open(csv, "rb") as f: 14 | md5_list.append(hashlib.md5(f.read()).hexdigest()) 15 | return sorted(md5_list) 16 | 17 | 18 | def check_multiple_files(subdir, dir1, dir2): 19 | dir1_list = get_md5_list(subdir, dir1) 20 | dir2_list = get_md5_list(subdir, dir2) 21 | return dir1_list == dir2_list 22 | 23 | 24 | def check_consistency(dir1, dir2): 25 | subdirs1 = [d for d in os.listdir(dir1) if os.path.isdir(os.path.join(dir1, d))] 26 | subdirs2 = [d for d in os.listdir(dir2) if os.path.isdir(os.path.join(dir2, d))] 27 | common_subdirs = set(subdirs1) & set(subdirs2) 28 | 29 | headers = ["Subdir", "Dir1", "Dir2", "Consistency"] 30 | max_len0 = max(max([len(d) for d in common_subdirs]), len(headers[0])) 31 | max_len1 = max(len(dir1), len(headers[1])) 32 | max_len2 = max(len(dir2), len(headers[2])) 33 | max_len3 = max( 34 | len("same"), 35 | len("different"), 36 | len("skipped for more than one file"), 37 | len(headers[3]), 38 | ) 39 | 40 | def align_print(col0: str, col1: str, col2: str, col3: str): 41 | print( 42 | print_templ.format( 43 | col0.center(max_len0), 44 | col1.center(max_len1), 45 | col2.center(max_len2), 46 | col3.center(max_len3), 47 | ) 48 | ) 49 | 50 | align_print(headers[0], headers[1], headers[2], headers[3]) 51 | for subdir in sorted(common_subdirs): 52 | if check_multiple_files(subdir, dir1, dir2): 53 | align_print(subdir, dir1, dir2, "same") 54 | else: 55 | align_print(subdir, dir1, dir2, "different") 56 | 57 | 58 | if __name__ == "__main__": 59 | dir1 = sys.argv[1] 60 | dir2 = sys.argv[2] 61 | check_consistency(dir1, dir2) 62 | -------------------------------------------------------------------------------- /tools/check_deletion.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import sys 3 | import os.path 4 | 5 | from pyspark.sql import SparkSession 6 | 7 | spark = 
SparkSession.builder.appName("check_time").getOrCreate() 8 | 9 | subdirs = [ 10 | "account", 11 | "companyOwnAccount", 12 | "withdraw", 13 | "deposit", 14 | "loantransfer", 15 | "personOwnAccount", 16 | "signIn", 17 | "repay", 18 | "transfer", 19 | ] 20 | 21 | 22 | def read_data(path): 23 | dataframes = [ 24 | spark.read.option("delimiter", "|").csv(csv, header=True, inferSchema=True) 25 | for csv in glob.glob(path) 26 | ] 27 | allTransfer = dataframes[0] 28 | for idx, dataframe in enumerate(dataframes): 29 | if idx == 0: 30 | continue 31 | allTransfer = allTransfer.union(dataframe) 32 | return allTransfer 33 | 34 | 35 | if __name__ == "__main__": 36 | prefix = sys.argv[1] 37 | for subdir in subdirs: 38 | print("Checking {} if deletion before creation......".format(subdir)) 39 | if not os.path.exists(os.path.join(prefix, subdir)): 40 | print("No {} data exists!\n".format(subdir)) 41 | continue 42 | data = read_data(os.path.join(prefix, subdir, "*.csv")) 43 | wrong = data.filter(data["createTime"] >= data["deleteTime"]) 44 | if wrong.count() > 0: 45 | print( 46 | "{} invalid! 
Having {} rows with wrong time\n".format( 47 | subdir, wrong.count() 48 | ) 49 | ) 50 | wrong.show(3) 51 | else: 52 | print("{} passed.\n".format(subdir)) 53 | -------------------------------------------------------------------------------- /tools/check_duplicate.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | import glob 3 | import sys 4 | import os 5 | 6 | spark = SparkSession.builder.appName("check_dup").getOrCreate() 7 | 8 | 9 | def check_dup(subdir, key): 10 | datas = [] 11 | for csv in glob.glob(subdir + "/*.csv"): 12 | datas.append( 13 | spark.read.option("delimiter", "|").csv(csv, header=True, inferSchema=True) 14 | ) 15 | 16 | merged = datas[0] 17 | for df in datas[1:]: 18 | merged = merged.unionAll(df) 19 | 20 | dups = merged.groupBy(key).count().filter("count > 1") 21 | print( 22 | "{}: Total rows: {}, duplicated {}".format(subdir, merged.count(), dups.count()) 23 | ) 24 | dups.show(5) 25 | 26 | 27 | if __name__ == "__main__": 28 | prefix = sys.argv[1] 29 | check_dup(os.path.join(prefix, "account"), "id") 30 | check_dup(os.path.join(prefix, "company"), "id") 31 | check_dup(os.path.join(prefix, "person"), "id") 32 | check_dup(os.path.join(prefix, "medium"), "id") 33 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/.gitignore: -------------------------------------------------------------------------------- 1 | profiler_standalone 2 | build -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.2) 2 | project(Profiler C CXX) 3 | 4 | set(TUGRAPH_HOME /home/qsp/project/tugraph-db/) 5 | 6 | add_executable( 7 | profiler_standalone 8 | profiler.cpp 9 | de_core.cpp 10 | wcc_core.cpp 11 | ${TUGRAPH_HOME}/src/lgraph_api/olap_base.cpp 12 | 
${TUGRAPH_HOME}/src/lgraph_api/lgraph_utils.cpp 13 | ${TUGRAPH_HOME}/src/lgraph_api/olap_profile.cpp) 14 | target_link_libraries(profiler_standalone -static-libstdc++ libstdc++fs.a 15 | libgomp.a pthread dl) 16 | target_include_directories( 17 | profiler_standalone PUBLIC ${TUGRAPH_HOME}/deps/fma-common/ 18 | ${TUGRAPH_HOME}/include ${TUGRAPH_HOME}/src) 19 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/README.md: -------------------------------------------------------------------------------- 1 | # About the data profiler 2 | 3 | This tool as a Data Profiler is developed to profile the data distribution. 4 | It is developed based on [TuGraph][1], an open-source high performance graph database contributed by Ant Group 5 | Co., Ltd. 6 | 7 | The profiling of these metrics is currently supported: 8 | - Count of V(vertices) and E(edges) 9 | - Ratio of E over V 10 | - Edge multiplicity 11 | - In-degree and out-degree distribution including the percentiles 12 | - WCC and Diameter results 13 | 14 | And these features in visualization are supported: 15 | - plot the PowerLaw Distribution of degree 16 | - PowerLaw Distribution Regression 17 | 18 | # How to use 19 | 20 | ## Profile 21 | In order to compile this tool, you need to first pull TuGraph to local and set the TUGRAPH_HOME environment variable in 22 | `CMakeLists.txt` or `compile.sh` to the repository. See `CMakeLists.txt` and `compile.sh` for 23 | more details. 24 | 25 | ## Plot 26 | See `plot.py` for details. 27 | 28 | [1]: https://github.com/TuGraph-db/tugraph-db/ -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/algo.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022 AntGroup. All Rights Reserved. 
*/ 2 | 3 | #pragma once 4 | 5 | #include 6 | #include "lgraph/olap_base.h" 7 | 8 | using namespace lgraph_api; 9 | using namespace lgraph_api::olap; 10 | 11 | /** 12 | * @brief Compute the Dimension Estimation algorithm. 13 | * 14 | * @param[in] graph The graph to compute on. 15 | * @param[in] roots The root vertex id to start de from. 16 | * 17 | * @return return dimension of graph. 18 | */ 19 | size_t DECore(OlapBase& graph, std::set& roots); 20 | 21 | /** 22 | * \brief Compute the weakly connected components. 23 | * 24 | * \param graph The graph to compute on, should be an *undirected* graph. 25 | * \param [in,out] label the ParallelVector to store wcc_label. 26 | */ 27 | void WCCCore(OlapBase& graph, ParallelVector& label); 28 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TUGRAPH_HOME=/home/qsp/project/tugraph-db/ 4 | 5 | g++ -fno-gnu-unique -fPIC -g --std=c++14 \ 6 | -I${TUGRAPH_HOME}/include \ 7 | -I${TUGRAPH_HOME}/src \ 8 | -I${TUGRAPH_HOME}/deps/fma-common \ 9 | -rdynamic -O3 -fopenmp -DNDEBUG \ 10 | -o profiler_standalone \ 11 | stat.cpp "${TUGRAPH_HOME}/build/output/liblgraph.so" -lrt -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/de_core.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022 AntGroup. All Rights Reserved. 
*/ 2 | 3 | #include "lgraph/olap_base.h" 4 | #include "./algo.h" 5 | 6 | using namespace lgraph_api; 7 | using namespace lgraph_api::olap; 8 | 9 | size_t DECore(OlapBase & graph, std::set& roots) { 10 | size_t vertices = graph.NumVertices(); 11 | auto active_in = graph.AllocVertexSubset(); 12 | auto active_out = graph.AllocVertexSubset(); 13 | auto diameter = graph.AllocVertexArray(); 14 | auto curr = graph.AllocVertexArray(); 15 | auto next = graph.AllocVertexArray(); 16 | auto vst = graph.AllocVertexArray(); 17 | 18 | active_in.Fill(); 19 | graph.ProcessVertexActive( 20 | [&](size_t vtx) { 21 | diameter[vtx] = 0; 22 | curr[vtx] = 0; 23 | next[vtx] = 0; 24 | vst[vtx] = 0; 25 | return 0; 26 | }, 27 | active_in); 28 | assert(roots.size() <= 64); 29 | active_in.Clear(); 30 | uint64_t full = 0; 31 | int k = 0; 32 | for (auto vtx : roots) { 33 | curr[vtx] |= (1ul << k); 34 | vst[vtx] |= (1ul << k); 35 | full |= (1ul << k); 36 | diameter[vtx] = 0; 37 | active_in.Add(vtx); 38 | k++; 39 | } 40 | size_t active_vertices = roots.size(); 41 | 42 | size_t i_i = 0; 43 | while (active_vertices > 0) { 44 | i_i++; 45 | active_out.Clear(); 46 | active_vertices = graph.ProcessVertexActive( 47 | [&](size_t src) { 48 | size_t activated = 0; 49 | for (auto edge : graph.OutEdges(src)) { 50 | size_t dst = edge.neighbour; 51 | if (vst[dst] != full) { 52 | graph.AcquireVertexLock(dst); 53 | next[dst] |= curr[src]; 54 | vst[dst] |= curr[src]; 55 | if (diameter[dst] != i_i) { 56 | diameter[dst] = i_i; 57 | active_out.Add(dst); 58 | activated++; 59 | } 60 | graph.ReleaseVertexLock(dst); 61 | } 62 | } 63 | return activated; 64 | }, 65 | active_in); 66 | active_in.Swap(active_out); 67 | curr.Swap(next); 68 | } 69 | 70 | roots.clear(); 71 | size_t max_diameter = 0; 72 | for (size_t vtx = 0; vtx < vertices; vtx++) { 73 | if (diameter[vtx] > max_diameter) { 74 | max_diameter = diameter[vtx]; 75 | roots.clear(); 76 | } 77 | if (diameter[vtx] == max_diameter && roots.size() < 64) { 78 | 
roots.insert(vtx); 79 | } 80 | } 81 | return max_diameter; 82 | } 83 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db139/edges.txt: -------------------------------------------------------------------------------- 1 | V 192957470, E 364134424, E/V 1.88712 2 | Unique edges: 2.07034e+08 / 364134424, Multiplicity: 1.75881 3 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db139/in-out.txt: -------------------------------------------------------------------------------- 1 | DEGREE TOP 100 (2473301,3200099) (1783222,1209419) (1778827,1115677) (417996,1089786) (417883,873399) (416456,862247) (416093,859954) (376752,760379) (363829,751146) (341055,645088) (340476,641870) (339543,603601) (327973,591788) (321914,567589) (246318,514367) (239969,496956) (234698,465880) (233255,439897) (232308,415132) (231423,414446) (230364,340349) (213580,315555) (200366,314544) (195950,305778) (188032,284154) (171577,274382) (168350,254939) (168065,250598) (161465,246216) (158669,240788) (151784,229971) (151696,223614) (150801,219798) (150114,218040) (147190,214775) (141866,211877) (141180,210660) (132081,204953) (130586,203001) (129405,197944) (126987,193862) (125396,188107) (119588,182274) (116911,182266) (108654,180893) (108293,180738) (107732,174453) (106525,169600) (106032,168351) (104989,168317) (104042,166833) (102056,165385) (101565,157632) (101172,156496) (99948,153538) (99908,149475) (99899,148876) (98522,146576) (96846,138774) (96553,133449) (95932,132374) (95518,131027) (91272,130293) (86434,126792) (85669,124702) (84085,122362) (83575,118131) (82737,117889) (82705,117492) (82333,115296) (81683,113106) (80999,112695) (80489,112460) (80179,111925) (79468,110544) (79155,110432) (78675,109815) (78200,107059) (78011,105027) (77708,104274) (77445,104030) (77336,102202) (77225,102008) (73789,101867) (72939,101760) (72318,101721) (71929,98774) 
(71650,98405) (70485,98008) (70017,97270) (69960,94013) (69825,93670) (68818,92162) (67873,91379) (67718,90903) (67480,89878) (66867,87863) (66818,84477) (66405,84039) (65966,82336) 2 | DEGREE PEC 100 2473301 13 8 6 5 4 4 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db139/in_degree_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db139/in_degree_dist.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db139/in_degree_dist_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db139/in_degree_dist_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db139/in_degree_dist_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 109539041.82131267 3 | beta: -2.319428121087157 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db139/out-in.txt: -------------------------------------------------------------------------------- 1 | DEGREE TOP 100 (3200099,2473301) (1209419,1783222) (1115677,1778827) (1089786,417996) (873399,417883) (862247,416456) (859954,416093) (760379,376752) (751146,363829) (645088,341055) (641870,340476) (603601,339543) (591788,327973) (567589,321914) 
(514367,246318) (496956,239969) (465880,234698) (439897,233255) (415132,232308) (414446,231423) (340349,230364) (315555,213580) (314544,200366) (305778,195950) (284154,188032) (274382,171577) (254939,168350) (250598,168065) (246216,161465) (240788,158669) (229971,151784) (223614,151696) (219798,150801) (218040,150114) (214775,147190) (211877,141866) (210660,141180) (204953,132081) (203001,130586) (197944,129405) (193862,126987) (188107,125396) (182274,119588) (182266,116911) (180893,108654) (180738,108293) (174453,107732) (169600,106525) (168351,106032) (168317,104989) (166833,104042) (165385,102056) (157632,101565) (156496,101172) (153538,99948) (149475,99908) (148876,99899) (146576,98522) (138774,96846) (133449,96553) (132374,95932) (131027,95518) (130293,91272) (126792,86434) (124702,85669) (122362,84085) (118131,83575) (117889,82737) (117492,82705) (115296,82333) (113106,81683) (112695,80999) (112460,80489) (111925,80179) (110544,79468) (110432,79155) (109815,78675) (107059,78200) (105027,78011) (104274,77708) (104030,77445) (102202,77336) (102008,77225) (101867,73789) (101760,72939) (101721,72318) (98774,71929) (98405,71650) (98008,70485) (97270,70017) (94013,69960) (93670,69825) (92162,68818) (91379,67873) (90903,67718) (89878,67480) (87863,66867) (84477,66818) (84039,66405) (82336,65966) 2 | DEGREE PEC 100 3200099 33 17 10 6 4 3 3 2 2 2 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db139/out_degree_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db139/out_degree_dist.png 
-------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db139/out_degree_dist_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db139/out_degree_dist_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db139/out_degree_dist_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 20186572.91449603 3 | beta: -1.7197963259066844 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db177/in-out.txt: -------------------------------------------------------------------------------- 1 | DEGREE TOP 100 (2482180,149885218) (1809340,3717832) (1791730,3049232) (422108,1529335) (418298,1358574) (416446,830233) (413525,584009) (377450,552680) (366669,425032) (344063,424147) (342589,413577) (341930,403508) (324830,366893) (323695,337105) (245819,334096) (236200,310579) (236110,274086) (235741,263344) (235192,249659) (233683,246903) (231212,236568) (212737,226417) (200788,217820) (199005,215850) (197283,211030) (175922,205471) (157368,196111) (153269,192106) (152864,185634) (151324,185391) (149506,184095) (141611,182666) (140400,180840) (139353,177872) (136607,176061) (133260,173571) (130547,166558) (129518,154831) (126796,153383) (126188,151620) (116546,151526) (115982,149563) (110005,144547) (109782,143036) (108152,140946) (106538,138909) (106100,138739) (106057,136393) (104552,136387) (101608,133203) (101288,130774) (101130,129113) (100918,128466) (100806,123092) (100663,122190) (99460,120862) (98671,118445) (96911,117918) (95514,115389) (94966,113092) (91107,108581) (90494,106834) (83910,106782) (83766,103335) 
(83741,102309) (83218,101037) (82754,99585) (82732,98936) (81396,97097) (80429,96245) (79373,93535) (79343,92457) (78541,90238) (78206,89948) (78126,88752) (77809,88735) (77388,87973) (77185,86416) (77031,85444) (76909,80630) (74745,79470) (72674,78478) (71937,77435) (71154,76548) (70638,74983) (70208,74799) (70088,74586) (69684,73839) (69496,73679) (67818,73298) (66740,72493) (66639,70731) (66149,69641) (65819,69549) (65703,69129) (65544,68903) (64300,68123) (63466,68048) (62935,67618) (62858,66595) 2 | DEGREE PEC 100 2482180 15 10 8 6 5 5 4 4 4 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db177/in_degree_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db177/in_degree_dist.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db177/in_degree_dist_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db177/in_degree_dist_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db177/in_degree_dist_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 133908623.8869632 3 | beta: -2.0848641590246326 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db177/out-in.txt: 
-------------------------------------------------------------------------------- 1 | DEGREE TOP 100 (149885218,2482180) (3717832,1809340) (3049232,1791730) (1529335,422108) (1358574,418298) (830233,416446) (584009,413525) (552680,377450) (425032,366669) (424147,344063) (413577,342589) (403508,341930) (366893,324830) (337105,323695) (334096,245819) (310579,236200) (274086,236110) (263344,235741) (249659,235192) (246903,233683) (236568,231212) (226417,212737) (217820,200788) (215850,199005) (211030,197283) (205471,175922) (196111,157368) (192106,153269) (185634,152864) (185391,151324) (184095,149506) (182666,141611) (180840,140400) (177872,139353) (176061,136607) (173571,133260) (166558,130547) (154831,129518) (153383,126796) (151620,126188) (151526,116546) (149563,115982) (144547,110005) (143036,109782) (140946,108152) (138909,106538) (138739,106100) (136393,106057) (136387,104552) (133203,101608) (130774,101288) (129113,101130) (128466,100918) (123092,100806) (122190,100663) (120862,99460) (118445,98671) (117918,96911) (115389,95514) (113092,94966) (108581,91107) (106834,90494) (106782,83910) (103335,83766) (102309,83741) (101037,83218) (99585,82754) (98936,82732) (97097,81396) (96245,80429) (93535,79373) (92457,79343) (90238,78541) (89948,78206) (88752,78126) (88735,77809) (87973,77388) (86416,77185) (85444,77031) (80630,76909) (79470,74745) (78478,72674) (77435,71937) (76548,71154) (74983,70638) (74799,70208) (74586,70088) (73839,69684) (73679,69496) (73298,67818) (72493,66740) (70731,66639) (69641,66149) (69549,65819) (69129,65703) (68903,65544) (68123,64300) (68048,63466) (67618,62935) (66595,62858) 2 | DEGREE PEC 100 149885218 27 13 7 4 3 3 2 2 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | -------------------------------------------------------------------------------- 
/tools/legacy/dataprofiler/result/db177/out_degree_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db177/out_degree_dist.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db177/out_degree_dist_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db177/out_degree_dist_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db177/out_degree_dist_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 20194472.85465291 3 | beta: -1.719773728669686 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db184/edges.txt: -------------------------------------------------------------------------------- 1 | V 137733288, E 259405367, E/V 1.88339 2 | Unique edges: 1.45251e+08 / 259405367, Multiplicity: 1.78591 3 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db184/in-out.txt: -------------------------------------------------------------------------------- 1 | DEGREE TOP 100 (1732600,7654441) (1267639,2377914) (1259973,1936613) (828900,1000007) (308786,953128) (296263,952993) (294417,562262) (292892,558103) (292377,511335) (268469,505433) (264011,376547) (240173,351256) (239469,308577) (239292,296902) (230500,292156) (228465,271482) (179536,264649) (166358,259273) (165705,248764) (163129,246845) (163113,184887) (161470,177484) (161333,172270) (149807,170135) 
(140637,167465) (139188,167224) (134272,167043) (133613,164247) (118217,155939) (118020,154629) (111607,153845) (107757,139188) (107671,137904) (107176,134001) (106575,132345) (103729,128671) (97706,126467) (96547,124197) (91013,122166) (90751,117051) (89611,115487) (88550,110171) (87872,107737) (82879,107549) (76867,106927) (76248,105792) (75394,105291) (74913,101305) (74607,98705) (73408,98230) (72142,97844) (71775,96894) (71687,96078) (70989,95640) (70983,94051) (70841,93189) (70133,93163) (69700,91733) (69575,88850) (69207,88689) (67859,87419) (66985,86846) (66082,85448) (64750,85290) (63306,84319) (59816,82857) (59792,82839) (58865,82625) (58732,82198) (57672,78375) (57579,78117) (57466,75798) (57177,75521) (57027,74762) (56474,73389) (55223,73325) (54991,73280) (54932,72705) (54672,70911) (54451,69241) (54281,68911) (53939,68489) (53687,66771) (52786,66404) (50886,64445) (50578,63764) (50276,63105) (49802,62590) (49751,56216) (49068,55041) (48730,54528) (48451,54278) (47581,53784) (47303,52740) (47167,52705) (46741,52576) (46696,51466) (46690,51266) (46033,50903) (45987,50816) 2 | DEGREE PEC 100 1732600 13 8 6 5 4 4 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db184/in_degree_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db184/in_degree_dist.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db184/in_degree_dist_regression.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db184/in_degree_dist_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db184/in_degree_dist_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 78379700.03758368 3 | beta: -2.318714660323696 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db184/out-in.txt: -------------------------------------------------------------------------------- 1 | DEGREE TOP 100 (7654441,1732600) (2377914,1267639) (1936613,1259973) (1000007,828900) (953128,308786) (952993,296263) (562262,294417) (558103,292892) (511335,292377) (505433,268469) (376547,264011) (351256,240173) (308577,239469) (296902,239292) (292156,230500) (271482,228465) (264649,179536) (259273,166358) (248764,165705) (246845,163129) (184887,163113) (177484,161470) (172270,161333) (170135,149807) (167465,140637) (167224,139188) (167043,134272) (164247,133613) (155939,118217) (154629,118020) (153845,111607) (139188,107757) (137904,107671) (134001,107176) (132345,106575) (128671,103729) (126467,97706) (124197,96547) (122166,91013) (117051,90751) (115487,89611) (110171,88550) (107737,87872) (107549,82879) (106927,76867) (105792,76248) (105291,75394) (101305,74913) (98705,74607) (98230,73408) (97844,72142) (96894,71775) (96078,71687) (95640,70989) (94051,70983) (93189,70841) (93163,70133) (91733,69700) (88850,69575) (88689,69207) (87419,67859) (86846,66985) (85448,66082) (85290,64750) (84319,63306) (82857,59816) (82839,59792) (82625,58865) (82198,58732) (78375,57672) (78117,57579) (75798,57466) (75521,57177) (74762,57027) (73389,56474) (73325,55223) (73280,54991) (72705,54932) (70911,54672) (69241,54451) (68911,54281) (68489,53939) (66771,53687) (66404,52786) 
(64445,50886) (63764,50578) (63105,50276) (62590,49802) (56216,49751) (55041,49068) (54528,48730) (54278,48451) (53784,47581) (52740,47303) (52705,47167) (52576,46741) (51466,46696) (51266,46690) (50903,46033) (50816,45987) 2 | DEGREE PEC 100 7654441 33 17 10 6 4 3 3 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db184/out_degree_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db184/out_degree_dist.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db184/out_degree_dist_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db184/out_degree_dist_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db184/out_degree_dist_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 14153912.685932105 3 | beta: -1.7194378338583043 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1.png 
-------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 223563.07183870228 3 | beta: -1.334418448216897 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 220373.12764021207 3 | beta: -1.3299838405402395 4 | -------------------------------------------------------------------------------- 
/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3.txt: -------------------------------------------------------------------------------- 1 | 1,221327 2 | 2,95621 3 | 3,56057 4 | 4,37348 5 | 5,27334 6 | 6,20686 7 | 7,16224 8 | 8,12973 9 | 9,10801 10 | 10,8827 11 | 11,7600 12 | 12,6480 13 | 13,5467 14 | 14,4809 15 | 15,4158 16 | 16,3683 17 | 17,3153 18 | 18,2779 19 | 19,2503 20 | 20,2318 21 | 21,2055 22 | 22,1778 23 | 23,1617 24 | 24,1417 25 | 25,1350 26 | 26,1192 27 | 27,1084 28 | 28,1032 29 | 29,927 30 | 30,838 31 | 31,743 32 | 32,706 33 | 33,652 34 | 34,633 35 | 35,591 36 | 36,480 37 | 37,473 38 | 38,478 39 | 39,415 40 | 40,420 41 | 41,366 42 | 42,312 43 | 43,333 44 | 44,330 45 | 45,292 46 | 46,255 47 | 47,234 48 | 48,226 49 | 49,212 50 | 50,185 51 | 51,195 52 | 52,213 53 | 53,181 54 | 54,149 55 | 55,136 56 | 56,142 57 | 57,131 58 | 58,151 59 | 59,111 60 | 60,124 61 | 61,117 62 | 62,117 63 | 63,91 64 | 64,92 65 | 65,100 66 | 66,89 67 | 67,70 68 | 68,73 69 | 69,64 70 | 70,78 71 | 71,67 72 | 72,57 73 | 73,73 74 | 74,66 75 | 75,68 76 | 76,46 77 | 77,53 78 | 78,43 79 | 79,37 80 | 80,35 81 | 81,53 82 | 82,39 83 | 83,43 84 | 84,33 85 | 85,29 86 | 86,38 87 | 87,37 88 | 88,32 89 | 89,31 90 | 90,46 91 | 91,38 92 | 92,22 93 | 93,23 94 | 94,23 95 | 95,25 96 | 96,33 97 | 97,20 98 | 98,27 99 | 99,21 100 | 100,22 101 | 101,21 102 | 102,17 103 | 103,19 104 | 104,19 105 | 105,14 106 | 106,21 107 | 107,11 108 | 108,22 109 | 109,19 110 | 110,7 111 | 111,12 112 | 112,18 113 | 113,19 114 | 114,14 115 | 115,12 116 | 116,15 117 | 117,16 118 | 118,15 119 | 119,12 120 | 120,17 121 | 
121,16 122 | 122,9 123 | 123,8 124 | 124,4 125 | 125,9 126 | 126,11 127 | 127,9 128 | 128,12 129 | 129,14 130 | 130,14 131 | 131,13 132 | 132,15 133 | 133,6 134 | 134,8 135 | 135,5 136 | 136,8 137 | 137,9 138 | 138,6 139 | 139,7 140 | 140,6 141 | 141,8 142 | 142,3 143 | 143,4 144 | 144,5 145 | 145,8 146 | 146,5 147 | 147,5 148 | 148,4 149 | 149,6 150 | 150,8 151 | 151,5 152 | 152,10 153 | 153,7 154 | 154,6 155 | 155,3 156 | 156,5 157 | 157,6 158 | 158,3 159 | 159,4 160 | 160,6 161 | 161,3 162 | 162,5 163 | 163,1 164 | 164,4 165 | 165,2 166 | 166,7 167 | 167,8 168 | 168,5 169 | 169,3 170 | 170,3 171 | 171,4 172 | 172,5 173 | 173,1 174 | 174,2 175 | 175,2 176 | 176,2 177 | 177,2 178 | 178,6 179 | 179,3 180 | 180,2 181 | 181,3 182 | 182,1 183 | 183,4 184 | 184,4 185 | 185,1 186 | 186,2 187 | 187,4 188 | 188,4 189 | 189,2 190 | 190,3 191 | 191,2 192 | 192,1 193 | 193,2 194 | 194,2 195 | 195,2 196 | 196,1 197 | 197,2 198 | 198,1 199 | 200,1 200 | 201,2 201 | 202,2 202 | 204,2 203 | 205,3 204 | 206,4 205 | 207,1 206 | 208,3 207 | 209,1 208 | 210,2 209 | 212,3 210 | 214,2 211 | 216,1 212 | 218,2 213 | 220,2 214 | 221,2 215 | 222,3 216 | 223,1 217 | 224,2 218 | 225,1 219 | 230,2 220 | 231,4 221 | 232,1 222 | 234,1 223 | 236,2 224 | 237,1 225 | 238,1 226 | 239,1 227 | 241,1 228 | 244,1 229 | 245,2 230 | 246,4 231 | 247,1 232 | 248,1 233 | 251,1 234 | 253,3 235 | 254,3 236 | 255,2 237 | 257,1 238 | 258,2 239 | 259,1 240 | 260,1 241 | 263,1 242 | 264,2 243 | 265,1 244 | 266,3 245 | 268,1 246 | 269,1 247 | 273,2 248 | 286,2 249 | 290,1 250 | 292,2 251 | 294,2 252 | 299,1 253 | 300,1 254 | 301,1 255 | 308,1 256 | 313,1 257 | 316,2 258 | 319,2 259 | 332,1 260 | 333,1 261 | 334,1 262 | 337,1 263 | 346,1 264 | 356,1 265 | 361,1 266 | 362,2 267 | 366,1 268 | 369,1 269 | 370,1 270 | 376,1 271 | 383,1 272 | 387,1 273 | 406,1 274 | 418,1 275 | 427,1 276 | 429,1 277 | 430,1 278 | 434,1 279 | 439,1 280 | 453,1 281 | 473,1 282 | 496,1 283 | 497,1 284 | 509,1 285 | 514,1 286 | 522,1 287 | 
524,1 288 | 529,1 289 | 531,1 290 | 541,1 291 | 554,1 292 | 667,1 293 | 727,1 294 | 731,1 295 | 762,1 296 | 847,1 297 | 866,1 298 | 931,1 299 | 1236,1 300 | 1555,1 301 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 224369.62351644383 3 | beta: -1.3383227574196173 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4.txt: -------------------------------------------------------------------------------- 1 | 1,129328 2 | 2,56359 3 | 3,33288 4 | 4,22557 5 | 5,16138 6 | 6,12218 7 | 7,9438 8 | 8,7807 9 | 9,6411 10 | 10,5208 11 | 11,4376 12 | 12,3810 13 | 13,3189 14 | 14,2830 15 | 15,2393 16 | 16,2153 17 | 17,1870 18 | 18,1639 19 | 19,1431 20 | 20,1293 21 | 21,1166 22 | 22,1084 23 | 23,918 24 | 24,871 25 | 25,775 26 | 26,757 27 | 27,633 28 | 28,575 29 | 29,539 30 | 30,525 31 | 31,458 32 | 32,408 33 | 33,398 34 | 34,354 35 | 35,326 36 | 36,300 37 | 
37,304 38 | 38,257 39 | 39,230 40 | 40,241 41 | 41,224 42 | 42,199 43 | 43,198 44 | 44,165 45 | 45,158 46 | 46,153 47 | 47,120 48 | 48,156 49 | 49,125 50 | 50,134 51 | 51,115 52 | 52,114 53 | 53,94 54 | 54,88 55 | 55,85 56 | 56,77 57 | 57,94 58 | 58,85 59 | 59,84 60 | 60,57 61 | 61,61 62 | 62,57 63 | 63,54 64 | 64,51 65 | 65,46 66 | 66,51 67 | 67,44 68 | 68,40 69 | 69,32 70 | 70,35 71 | 71,41 72 | 72,35 73 | 73,31 74 | 74,40 75 | 75,42 76 | 76,33 77 | 77,40 78 | 78,28 79 | 79,33 80 | 80,34 81 | 81,25 82 | 82,25 83 | 83,32 84 | 84,20 85 | 85,14 86 | 86,18 87 | 87,24 88 | 88,11 89 | 89,15 90 | 90,24 91 | 91,21 92 | 92,13 93 | 93,14 94 | 94,16 95 | 95,16 96 | 96,15 97 | 97,17 98 | 98,18 99 | 99,16 100 | 100,19 101 | 101,10 102 | 102,10 103 | 103,6 104 | 104,14 105 | 105,12 106 | 106,8 107 | 107,8 108 | 108,8 109 | 109,14 110 | 110,4 111 | 111,4 112 | 112,8 113 | 113,13 114 | 114,8 115 | 115,4 116 | 116,6 117 | 117,13 118 | 118,7 119 | 119,9 120 | 120,10 121 | 121,6 122 | 122,7 123 | 123,5 124 | 124,7 125 | 125,6 126 | 126,3 127 | 127,10 128 | 128,7 129 | 129,8 130 | 130,5 131 | 131,5 132 | 132,11 133 | 133,1 134 | 134,5 135 | 135,1 136 | 136,4 137 | 137,6 138 | 138,1 139 | 139,7 140 | 140,8 141 | 141,4 142 | 142,6 143 | 143,4 144 | 144,9 145 | 145,6 146 | 146,3 147 | 147,2 148 | 148,3 149 | 149,4 150 | 150,3 151 | 151,4 152 | 152,2 153 | 153,4 154 | 154,2 155 | 155,2 156 | 156,4 157 | 157,2 158 | 159,2 159 | 160,5 160 | 161,3 161 | 162,4 162 | 163,1 163 | 164,2 164 | 165,1 165 | 166,1 166 | 167,2 167 | 168,3 168 | 169,3 169 | 170,3 170 | 171,2 171 | 172,5 172 | 173,2 173 | 174,1 174 | 175,3 175 | 176,3 176 | 177,2 177 | 178,1 178 | 179,2 179 | 180,1 180 | 181,3 181 | 182,1 182 | 183,2 183 | 184,1 184 | 185,1 185 | 186,2 186 | 187,1 187 | 188,1 188 | 190,2 189 | 191,2 190 | 192,2 191 | 193,2 192 | 194,1 193 | 195,1 194 | 197,1 195 | 198,1 196 | 200,1 197 | 201,1 198 | 202,2 199 | 204,2 200 | 205,1 201 | 207,4 202 | 208,3 203 | 211,3 204 | 213,2 205 | 214,2 206 | 215,1 
207 | 217,2 208 | 219,2 209 | 221,1 210 | 224,1 211 | 227,1 212 | 228,2 213 | 229,1 214 | 232,1 215 | 234,1 216 | 236,2 217 | 237,1 218 | 238,3 219 | 239,2 220 | 240,1 221 | 241,1 222 | 243,1 223 | 244,1 224 | 252,1 225 | 255,1 226 | 256,1 227 | 257,1 228 | 258,1 229 | 259,1 230 | 260,1 231 | 267,1 232 | 270,1 233 | 274,1 234 | 275,1 235 | 279,1 236 | 283,1 237 | 284,1 238 | 290,2 239 | 294,1 240 | 296,1 241 | 297,2 242 | 302,1 243 | 303,3 244 | 304,2 245 | 305,3 246 | 309,1 247 | 316,1 248 | 322,1 249 | 338,1 250 | 344,2 251 | 352,1 252 | 354,1 253 | 364,1 254 | 368,1 255 | 371,1 256 | 372,1 257 | 373,1 258 | 383,1 259 | 401,1 260 | 407,1 261 | 409,2 262 | 426,1 263 | 443,1 264 | 444,1 265 | 449,1 266 | 452,1 267 | 516,1 268 | 521,1 269 | 537,1 270 | 559,1 271 | 592,1 272 | 756,1 273 | 934,1 274 | 1687,1 275 | 2119,1 276 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 131228.70692235493 3 | beta: -1.3302376665752749 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5.png 
-------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5.txt: -------------------------------------------------------------------------------- 1 | 1,127403 2 | 2,56508 3 | 3,32643 4 | 4,22014 5 | 5,16041 6 | 6,12262 7 | 7,9381 8 | 8,7697 9 | 9,6185 10 | 10,5200 11 | 11,4458 12 | 12,3791 13 | 13,3188 14 | 14,2793 15 | 15,2439 16 | 16,2138 17 | 17,1873 18 | 18,1659 19 | 19,1546 20 | 20,1334 21 | 21,1234 22 | 22,1089 23 | 23,921 24 | 24,870 25 | 25,824 26 | 26,748 27 | 27,663 28 | 28,623 29 | 29,564 30 | 30,531 31 | 31,454 32 | 32,405 33 | 33,372 34 | 34,379 35 | 35,335 36 | 36,279 37 | 37,248 38 | 38,237 39 | 39,244 40 | 40,243 41 | 41,203 42 | 42,230 43 | 43,192 44 | 44,196 45 | 45,172 46 | 46,165 47 | 47,169 48 | 48,126 49 | 49,127 50 | 50,122 51 | 51,133 52 | 52,97 53 | 53,97 54 | 54,81 55 | 55,90 56 | 56,98 57 | 57,80 58 | 58,59 59 | 59,72 60 | 60,69 61 | 61,68 62 | 62,57 63 | 63,60 64 | 64,48 65 | 65,57 66 | 66,58 67 | 67,48 68 | 68,48 69 | 69,43 70 | 70,39 71 | 71,29 72 | 72,45 73 | 73,28 74 | 74,38 75 | 75,49 76 | 76,37 77 | 77,36 78 | 78,25 79 | 79,29 80 | 80,32 81 | 81,28 82 | 82,25 83 | 83,30 84 | 84,26 85 | 85,19 86 | 86,21 87 | 87,20 88 | 88,21 89 | 89,28 90 | 90,28 91 | 91,18 92 | 92,17 93 | 93,14 94 | 94,12 95 | 95,11 96 | 96,19 97 | 97,10 98 | 98,15 99 | 99,12 100 | 100,18 101 | 101,13 102 | 102,23 103 | 103,17 104 | 104,9 105 | 105,11 106 | 106,12 107 | 107,10 108 | 108,10 109 | 109,17 110 | 110,11 111 | 111,14 112 | 112,8 113 | 113,12 114 | 114,7 115 | 115,9 116 | 116,6 117 | 117,10 118 | 118,9 119 | 119,5 120 | 120,7 121 | 121,9 122 | 122,7 123 | 123,4 124 | 124,9 125 | 125,5 126 | 126,12 127 | 127,9 128 | 128,6 129 | 129,9 130 | 130,4 131 | 131,4 132 | 132,4 133 | 133,8 134 | 134,5 135 | 135,5 136 | 136,3 137 | 137,4 138 | 138,4 139 | 139,3 140 | 140,4 141 | 141,4 142 | 142,4 143 | 143,5 144 | 144,6 145 | 145,1 146 | 146,4 147 | 147,6 148 | 148,7 149 | 149,1 150 | 
150,4 151 | 151,1 152 | 152,3 153 | 153,4 154 | 156,3 155 | 157,1 156 | 158,2 157 | 159,5 158 | 160,2 159 | 161,4 160 | 162,4 161 | 164,2 162 | 165,1 163 | 166,3 164 | 167,2 165 | 168,3 166 | 169,1 167 | 171,1 168 | 172,3 169 | 174,3 170 | 175,2 171 | 176,1 172 | 178,7 173 | 179,2 174 | 180,2 175 | 181,2 176 | 182,1 177 | 183,2 178 | 184,1 179 | 186,2 180 | 188,1 181 | 189,1 182 | 190,3 183 | 191,4 184 | 192,3 185 | 193,3 186 | 194,1 187 | 196,2 188 | 198,1 189 | 199,2 190 | 200,2 191 | 201,1 192 | 203,5 193 | 204,2 194 | 205,1 195 | 208,2 196 | 209,1 197 | 210,1 198 | 211,1 199 | 212,1 200 | 214,1 201 | 216,1 202 | 217,1 203 | 218,1 204 | 221,2 205 | 223,1 206 | 225,1 207 | 227,1 208 | 228,2 209 | 230,2 210 | 231,1 211 | 232,1 212 | 235,1 213 | 237,2 214 | 239,1 215 | 241,3 216 | 242,1 217 | 243,2 218 | 244,1 219 | 245,1 220 | 246,1 221 | 247,1 222 | 250,1 223 | 253,1 224 | 255,1 225 | 259,1 226 | 263,2 227 | 271,1 228 | 272,3 229 | 278,2 230 | 279,1 231 | 283,1 232 | 288,1 233 | 290,1 234 | 291,1 235 | 300,2 236 | 301,1 237 | 307,1 238 | 315,1 239 | 316,2 240 | 318,1 241 | 319,1 242 | 326,1 243 | 328,1 244 | 330,1 245 | 334,1 246 | 343,1 247 | 346,1 248 | 354,1 249 | 357,1 250 | 361,1 251 | 363,1 252 | 382,1 253 | 415,1 254 | 424,2 255 | 432,1 256 | 435,1 257 | 447,1 258 | 451,1 259 | 480,1 260 | 511,1 261 | 514,1 262 | 707,1 263 | 709,1 264 | 806,1 265 | 872,1 266 | 959,1 267 | 2406,1 268 | 2819,1 269 | 3198,1 270 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5_regression.png -------------------------------------------------------------------------------- 
/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 129436.57093022726 3 | beta: -1.32565715160002 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 27230469.375668973 3 | beta: -1.573305530151192 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2.txt: 
-------------------------------------------------------------------------------- 1 | 1,1127199 2 | 2,456471 3 | 3,231478 4 | 4,147759 5 | 5,98680 6 | 6,73174 7 | 7,51960 8 | 8,41451 9 | 9,31121 10 | 10,26166 11 | 11,20475 12 | 12,17443 13 | 13,13827 14 | 14,11993 15 | 15,9829 16 | 16,8780 17 | 17,7115 18 | 18,6303 19 | 19,5118 20 | 20,4321 21 | 21,3384 22 | 22,2984 23 | 23,2272 24 | 24,2107 25 | 25,1675 26 | 26,1506 27 | 27,1170 28 | 28,1132 29 | 29,934 30 | 30,798 31 | 31,643 32 | 32,675 33 | 33,489 34 | 34,515 35 | 35,354 36 | 36,387 37 | 37,274 38 | 38,283 39 | 39,215 40 | 40,218 41 | 41,157 42 | 42,174 43 | 43,133 44 | 44,128 45 | 45,82 46 | 46,119 47 | 47,95 48 | 48,87 49 | 49,58 50 | 50,76 51 | 51,45 52 | 52,33 53 | 53,31 54 | 54,47 55 | 55,33 56 | 56,48 57 | 57,36 58 | 58,42 59 | 59,29 60 | 60,26 61 | 61,22 62 | 62,24 63 | 63,12 64 | 64,16 65 | 65,17 66 | 66,15 67 | 67,10 68 | 68,15 69 | 69,7 70 | 70,15 71 | 71,7 72 | 72,5 73 | 73,9 74 | 74,14 75 | 75,9 76 | 76,6 77 | 77,7 78 | 78,6 79 | 79,5 80 | 80,3 81 | 81,3 82 | 82,8 83 | 83,3 84 | 84,6 85 | 85,3 86 | 86,2 87 | 87,3 88 | 88,3 89 | 89,2 90 | 90,2 91 | 91,2 92 | 92,2 93 | 94,1 94 | 95,1 95 | 96,3 96 | 99,2 97 | 100,1 98 | 101,2 99 | 103,1 100 | 104,2 101 | 105,1 102 | 106,1 103 | 108,1 104 | 109,1 105 | 111,2 106 | 113,1 107 | 117,1 108 | 122,2 109 | 123,1 110 | 126,1 111 | 131,1 112 | 132,1 113 | 133,2 114 | 137,1 115 | 142,1 116 | 144,1 117 | 159,1 118 | 204,1 119 | 218,1 120 | 230,1 121 | 235,1 122 | 271,1 123 | 294,1 124 | 297,1 125 | 302,1 126 | 305,1 127 | 825,1 128 | 1206,1 129 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2_regression.png 
-------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 1141214.9408893671 3 | beta: -1.488771851133902 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3.txt: -------------------------------------------------------------------------------- 1 | 1,142193 2 | 2,108601 3 | 3,49793 4 | 4,39978 5 | 5,25139 6 | 6,20298 7 | 7,14961 8 | 8,12881 9 | 9,10057 10 | 10,8985 11 | 11,7432 12 | 12,6622 13 | 13,5575 14 | 14,5025 15 | 15,4478 16 | 16,4354 17 | 17,3761 18 | 18,3622 19 | 19,3319 20 | 20,3153 21 | 21,2942 22 | 22,2749 23 | 23,2569 24 | 24,2526 25 | 25,2324 26 | 26,2189 27 | 27,2107 28 | 28,2025 29 | 29,1850 30 | 30,2013 31 | 31,1823 32 | 32,1801 33 | 33,1802 34 | 34,1909 35 | 35,1812 36 | 36,1871 37 | 37,1695 38 | 38,1573 39 | 39,1320 40 | 40,1311 41 | 41,1122 42 | 42,1053 43 | 43,1025 44 | 44,938 45 | 45,810 46 | 46,754 47 | 47,645 48 | 48,600 49 | 49,550 50 | 50,489 51 | 51,434 52 | 52,356 53 | 53,307 54 | 54,309 55 | 55,251 56 | 56,233 57 | 57,160 58 | 58,181 59 | 59,150 60 | 60,110 61 | 61,76 62 | 62,69 63 | 63,47 64 | 64,46 65 | 65,32 66 | 66,37 67 | 67,33 68 | 68,41 69 | 69,19 70 | 70,24 71 | 71,18 72 | 72,21 73 | 73,15 74 | 74,15 75 | 75,11 76 | 76,9 77 | 77,13 78 | 78,8 79 | 79,3 80 | 80,11 81 | 81,6 82 | 82,9 83 | 83,2 84 | 84,6 85 | 85,1 86 | 
86,4 87 | 87,4 88 | 88,3 89 | 89,8 90 | 90,4 91 | 91,3 92 | 92,3 93 | 93,3 94 | 94,3 95 | 96,6 96 | 97,3 97 | 98,3 98 | 100,4 99 | 101,1 100 | 102,3 101 | 113,1 102 | 118,1 103 | 124,1 104 | 129,1 105 | 154,1 106 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 156038.86238756854 3 | beta: -1.1012676833313366 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4.txt: -------------------------------------------------------------------------------- 1 | 1,67717 2 | 2,77379 3 | 3,31850 4 | 4,30428 5 | 5,17902 6 | 6,16002 7 | 7,11361 8 | 8,9941 9 | 9,7763 10 | 10,6769 11 | 11,5319 12 | 12,4602 13 | 13,3824 14 | 14,3488 15 | 15,3163 16 | 16,2965 17 | 17,2572 18 | 18,2406 19 | 19,2317 20 | 20,2207 21 | 21,2083 22 | 22,2032 23 | 23,1876 24 | 24,1884 25 | 25,1865 26 | 26,1918 27 | 27,1784 28 | 28,1824 29 | 29,1745 30 | 30,1860 31 | 31,1891 
32 | 32,2074 33 | 33,2027 34 | 34,2300 35 | 35,2264 36 | 36,2772 37 | 37,2205 38 | 38,2155 39 | 39,1740 40 | 40,1599 41 | 41,1360 42 | 42,1206 43 | 43,1100 44 | 44,992 45 | 45,841 46 | 46,719 47 | 47,643 48 | 48,545 49 | 49,497 50 | 50,470 51 | 51,417 52 | 52,404 53 | 53,359 54 | 54,329 55 | 55,286 56 | 56,261 57 | 57,200 58 | 58,203 59 | 59,160 60 | 60,118 61 | 61,104 62 | 62,64 63 | 63,58 64 | 64,39 65 | 65,47 66 | 66,43 67 | 67,34 68 | 68,33 69 | 69,23 70 | 70,37 71 | 71,34 72 | 72,27 73 | 73,16 74 | 74,10 75 | 75,10 76 | 76,8 77 | 77,8 78 | 78,14 79 | 79,11 80 | 80,6 81 | 81,5 82 | 82,8 83 | 83,4 84 | 84,5 85 | 85,4 86 | 86,4 87 | 87,6 88 | 88,6 89 | 89,3 90 | 90,1 91 | 91,5 92 | 92,1 93 | 93,4 94 | 94,3 95 | 95,1 96 | 96,4 97 | 97,3 98 | 98,1 99 | 99,1 100 | 100,1 101 | 101,3 102 | 103,3 103 | 104,2 104 | 105,1 105 | 106,2 106 | 107,1 107 | 112,1 108 | 113,1 109 | 114,1 110 | 115,1 111 | 116,2 112 | 118,1 113 | 119,2 114 | 122,2 115 | 123,1 116 | 143,1 117 | 191,1 118 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 83519.30033324637 3 | beta: -0.9664620660294009 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5.txt: -------------------------------------------------------------------------------- 1 | 1,1699695 2 | 2,250291 3 | 3,75890 4 | 4,33114 5 | 5,17934 6 | 6,10863 7 | 7,7211 8 | 8,4873 9 | 9,3518 10 | 10,2628 11 | 11,1944 12 | 12,1586 13 | 13,1234 14 | 14,995 15 | 15,817 16 | 16,647 17 | 17,545 18 | 18,463 19 | 19,346 20 | 20,318 21 | 21,281 22 | 22,252 23 | 23,175 24 | 24,219 25 | 25,151 26 | 26,149 27 | 27,112 28 | 28,103 29 | 29,72 30 | 30,68 31 | 31,67 32 | 32,59 33 | 33,55 34 | 34,41 35 | 35,30 36 | 36,28 37 | 37,31 38 | 38,22 39 | 39,25 40 | 40,19 41 | 41,19 42 | 42,14 43 | 43,17 44 | 44,12 45 | 45,11 46 | 46,11 47 | 47,12 48 | 48,13 49 | 49,11 50 | 50,7 51 | 51,6 52 | 52,5 53 | 53,7 54 | 54,5 55 | 55,2 56 | 56,4 57 | 57,3 58 | 58,1 59 | 59,6 60 | 60,5 61 | 61,5 62 | 62,2 63 | 63,2 64 | 64,2 65 | 65,2 66 | 67,2 67 | 68,1 68 | 69,1 69 | 71,3 70 | 72,1 71 | 73,1 72 | 74,2 73 | 75,1 74 | 76,3 75 | 77,1 76 | 78,1 77 | 79,1 78 | 80,2 79 | 82,1 80 | 88,1 81 | 89,1 82 | 92,1 83 | 93,1 84 | 95,1 85 | 96,2 86 | 117,1 87 | 173,1 88 | 215,1 89 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5_regression.txt: 
-------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 1699901.8355022694 3 | beta: -2.782917517119856 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/transfer/in-out.txt: -------------------------------------------------------------------------------- 1 | DEGREE TOP 100 (3048390,149885218) (3046466,7654425) (3043195,3717800) (1809340,3200099) (1129165,3049194) (924712,2377914) (830798,1670911) (712413,1529329) (710260,1358537) (641020,1209411) (632973,1115675) (630067,1089786) (627839,999991) (580524,953094) (578835,859954) (528568,608509) (413525,606974) (412343,603557) (411422,591748) (408226,562251) (399829,439738) (397811,435303) (393778,414443) (391697,408185) (377450,400741) (352612,376543) (309148,366878) (293469,353388) (287538,345340) (277388,340104) (271632,338657) (268182,337096) (258872,315555) (257605,310579) (247652,308547) (239551,305741) (224806,274064) (220635,264638) (218966,263313) (217587,259273) (215537,249205) (214753,246216) (213268,234897) (210157,227524) (194537,223583) (185521,216924) (183980,214773) (180816,211013) (180760,197915) (178397,196080) (176695,192068) (176184,184776) (175973,182230) (173743,179550) (173669,176031) (172947,174463) (172548,173571) (172022,167455) (170749,165347) (170032,163834) (169523,160865) (166546,157632) (164485,156494) (156547,155911) (153269,154617) (148137,153352) (142497,152993) (141041,150277) (139841,149119) (136612,148746) (134140,147731) (132448,143019) (132377,142648) (131275,140915) (131070,137847) (130986,137137) (126796,136712) (122594,136378) (122405,132318) (119256,131018) (119085,130774) (116435,129064) (116335,124674) (113931,124043) (112756,123727) (112516,122166) (110073,117492) (108099,117182) (106688,116283) (106057,115471) (105524,115364) (103445,113458) (102986,113092) (101608,112663) (101394,111914) (101138,110137) (100978,109989) 
(100806,106889) (100680,105286) (99539,104025) 2 | DEGREE PEC 100 3048390 20 13 10 8 7 6 6 5 5 4 4 4 4 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 3 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/transfer/in_degree_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/transfer/in_degree_dist.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/transfer/in_degree_dist_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/transfer/in_degree_dist_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/transfer/in_degree_dist_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 105379188.6439417 3 | beta: -1.9560039218452319 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/transfer/out-in.txt: -------------------------------------------------------------------------------- 1 | DEGREE TOP 100 (149885218,3048390) (7654425,3046466) (3717800,3043195) (3200099,1809340) (3049194,1129165) (2377914,924712) (1670911,830798) (1529329,712413) (1358537,710260) (1209411,641020) (1115675,632973) (1089786,630067) (999991,627839) (953094,580524) (859954,578835) (608509,528568) (606974,413525) (603557,412343) (591748,411422) (562251,408226) 
(439738,399829) (435303,397811) (414443,393778) (408185,391697) (400741,377450) (376543,352612) (366878,309148) (353388,293469) (345340,287538) (340104,277388) (338657,271632) (337096,268182) (315555,258872) (310579,257605) (308547,247652) (305741,239551) (274064,224806) (264638,220635) (263313,218966) (259273,217587) (249205,215537) (246216,214753) (234897,213268) (227524,210157) (223583,194537) (216924,185521) (214773,183980) (211013,180816) (197915,180760) (196080,178397) (192068,176695) (184776,176184) (182230,175973) (179550,173743) (176031,173669) (174463,172947) (173571,172548) (167455,172022) (165347,170749) (163834,170032) (160865,169523) (157632,166546) (156494,164485) (155911,156547) (154617,153269) (153352,148137) (152993,142497) (150277,141041) (149119,139841) (148746,136612) (147731,134140) (143019,132448) (142648,132377) (140915,131275) (137847,131070) (137137,130986) (136712,126796) (136378,122594) (132318,122405) (131018,119256) (130774,119085) (129064,116435) (124674,116335) (124043,113931) (123727,112756) (122166,112516) (117492,110073) (117182,108099) (116283,106688) (115471,106057) (115364,105524) (113458,103445) (113092,102986) (112663,101608) (111914,101394) (110137,101138) (109989,100978) (106889,100806) (105286,100680) (104025,99539) 2 | DEGREE PEC 100 149885218 39 19 11 6 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/transfer/out_degree_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/transfer/out_degree_dist.png -------------------------------------------------------------------------------- 
/* Copyright (c) 2022 AntGroup. All Rights Reserved. */

#include "lgraph/olap_base.h"
#include "./algo.h"

using namespace lgraph_api;
using namespace lgraph_api::olap;

// Weakly connected components by parallel min-label propagation:
// every vertex starts with its own id as its label; each round pushes
// smaller labels along out-edges until a round makes no update.
// On return, label[v] is the smallest vertex id that reached v through
// this propagation (per the out-edge traversal seen here — presumably
// the framework supplies both edge directions for WCC; verify in olap_base).
// NOTE(review): "%lu" assumes size_t == unsigned long; "%zu" would be
// portable — flagged only, as this update is documentation-only.
void WCCCore(OlapBase& graph, ParallelVector& label) {
    // Round 0: all vertices active, each labeled with its own id.
    auto active_in = graph.AllocVertexSubset();
    active_in.Fill();
    auto active_out = graph.AllocVertexSubset();
    size_t num_activations = graph.ProcessVertexActive(
        [&](size_t vi) {
            label[vi] = vi;
            return 1;  // every vertex counts as one activation initially
        },
        active_in);

    // Repeat until a full round produces zero label updates.
    for (int ii = 0; num_activations != 0; ii++) {
        printf("activates(%d) <= %lu\n", ii, num_activations);
        active_out.Clear();
        num_activations = graph.ProcessVertexActive(
            [&](size_t src) {
                size_t num_activations = 0;  // per-vertex tally; shadows the outer counter
                for (auto& edge : graph.OutEdges(src)) {
                    size_t dst = edge.neighbour;
                    // Cheap unlocked test first; re-check under the per-vertex
                    // lock (double-checked) before writing, since another
                    // thread may have lowered label[dst] in between.
                    if (label[src] < label[dst]) {
                        auto lock = graph.GuardVertexLock(dst);
                        if (label[src] < label[dst]) {
                            label[dst] = label[src];
                            num_activations += 1;
                            active_out.Add(dst);  // dst must re-propagate next round
                        }
                    }
                }
                return num_activations;
            },
            active_in);
        // Vertices updated this round form the next round's frontier.
        active_in.Swap(active_out);
    }
}
import os
import pandas as pd


def process_csv(file_path):
    """Read one '|'-delimited CSV file into a DataFrame."""
    return pd.read_csv(file_path, delimiter='|')


loan_folder_path = '../../out/raw/loan'
deposit_folder_path = '../../out/raw/deposit'
output_folder = '../../out/factor_table'

loan_files = [os.path.join(loan_folder_path, file) for file in os.listdir(loan_folder_path) if file.endswith('.csv')]
deposit_files = [os.path.join(deposit_folder_path, file) for file in os.listdir(deposit_folder_path) if file.endswith('.csv')]

loan_df = pd.concat([process_csv(file) for file in loan_files])
deposit_df = pd.concat([process_csv(file) for file in deposit_files])

# Group deposits once instead of re-filtering the whole deposit table for
# every loan id (the original loop was O(#loans * #deposits)).
# Series.unique() preserves first-occurrence order, matching the old output.
accounts_by_loan = {
    loan_id: accounts.tolist()
    for loan_id, accounts in deposit_df.groupby('loanId')['accountId'].unique().items()
}

# Loans with no deposit rows still appear, with an empty account list,
# exactly as the original filter-based loop produced.
result_list = [[loan_id, accounts_by_loan.get(loan_id, [])]
               for loan_id in loan_df['id'].unique()]

result_df = pd.DataFrame(result_list, columns=['loan_id', 'account_list'])

os.makedirs(output_folder, exist_ok=True)
# Build the output path from output_folder instead of a second hard-coded
# literal, so the configured location is honored in one place.
result_df.to_csv(os.path.join(output_folder, 'loan_account_list.csv'), sep='|', index=False)
import os
import pandas as pd
from datetime import datetime


def process_csv(file_path):
    """Read one '|'-delimited CSV file into a DataFrame."""
    return pd.read_csv(file_path, delimiter='|')


def timestamp_to_year_month(timestamp):
    """Format a datetime-like value as 'YYYY-MM'."""
    return timestamp.strftime('%Y-%m')


account_folder_path = '../../out/raw/account'
transfer_folder_path = '../../out/raw/transfer'
withdraw_folder_path = '../../out/raw/withdraw'
output_folder = '../../out/factor_table'

account_files = [os.path.join(account_folder_path, file) for file in os.listdir(account_folder_path) if file.endswith('.csv')]
transfer_files = [os.path.join(transfer_folder_path, file) for file in os.listdir(transfer_folder_path) if file.endswith('.csv')]
withdraw_files = [os.path.join(withdraw_folder_path, file) for file in os.listdir(withdraw_folder_path) if file.endswith('.csv')]

account_df = pd.concat([process_csv(file) for file in account_files])
transfer_df = pd.concat([process_csv(file) for file in transfer_files])
withdraw_df = pd.concat([process_csv(file) for file in withdraw_files])

start_date = datetime(2020, 1, 1)
end_date = datetime(2023, 1, 1)

month_ranges = pd.date_range(start=start_date, end=end_date, freq='MS')
month_columns = [timestamp_to_year_month(month) for month in month_ranges]

# Count outgoing transfers and withdrawals per (account, month) in one
# vectorized pass. The original filtered both tables once per account and
# then walked rows with iterrows — O(#accounts * #rows) overall.
outgoing = pd.concat(
    [transfer_df[['fromId', 'createTime']], withdraw_df[['fromId', 'createTime']]],
    ignore_index=True)
outgoing['month'] = pd.to_datetime(outgoing['createTime'], unit='ms').dt.strftime('%Y-%m')
monthly_counts = outgoing.groupby(['fromId', 'month']).size().unstack(fill_value=0)

# One row per account (in account-file order), zero-filled for accounts or
# months with no activity. Months outside [start_date, end_date] are simply
# dropped; the original raised KeyError on such timestamps instead.
monthly_counts = monthly_counts.reindex(
    index=account_df['id'].unique(), columns=month_columns, fill_value=0).astype(int)

result_df = monthly_counts.reset_index()
result_df.columns = ['account_id'] + month_columns

os.makedirs(output_folder, exist_ok=True)
result_df.to_csv(os.path.join(output_folder, 'month.csv'), sep='|', index=False)
import os
import shutil
import sys


def _copy_csv_files(src_dir, dst_dir):
    """Copy every .csv file found directly in src_dir into dst_dir."""
    for name in os.listdir(src_dir):
        if name.endswith(".csv"):
            # shutil.copy is portable and safe for paths containing spaces,
            # unlike the previous os.system("cp ...") shell-out.
            shutil.copy(os.path.join(src_dir, name), os.path.join(dst_dir, name))


def merge_cluster_output(dir_A, dir_B, output_dir):
    """Merge two cluster output trees into output_dir.

    Every subdirectory of dir_A must also exist in dir_B; the .csv files
    from both copies of each subdirectory are copied into the matching
    subdirectory under output_dir (created if needed).

    Raises AssertionError when an input directory or a required
    subdirectory is missing (kept from the original for compatibility).
    """
    # check if the directories exist
    assert os.path.exists(dir_A), "The directory {} does not exist".format(dir_A)
    assert os.path.exists(dir_B), "The directory {} does not exist".format(dir_B)
    # create the output directory if it does not exist
    os.makedirs(output_dir, exist_ok=True)

    # get all subdirectories in dir_A
    subdirs = [o for o in os.listdir(dir_A) if os.path.isdir(os.path.join(dir_A, o))]
    # validate everything up front so nothing is partially copied before a failure
    for subdir in subdirs:
        assert os.path.exists(
            os.path.join(dir_B, subdir)
        ), "The subdirectory {} does not exist in {}".format(subdir, dir_B)

    for subdir in subdirs:
        new_subdir = os.path.join(output_dir, subdir)
        os.makedirs(new_subdir, exist_ok=True)
        # copy the csv files from both inputs into the merged subdirectory
        _copy_csv_files(os.path.join(dir_A, subdir), new_subdir)
        _copy_csv_files(os.path.join(dir_B, subdir), new_subdir)


if __name__ == "__main__":
    dir_A = sys.argv[1]
    dir_B = sys.argv[2]
    output_dir = sys.argv[3]
    merge_cluster_output(dir_A, dir_B, output_dir)
os.path.join(path, subdir) 24 | if os.path.isdir(subdir_path): 25 | num_entites = 0 26 | for file in glob.glob(os.path.join(subdir_path, "*.csv")): 27 | num_entites += sum(1 for _ in open(file)) - 1 28 | counts[subdir] = num_entites 29 | print_original_counts(counts) 30 | print("\n========== Formatted Output ============\n") 31 | print_formatted_counts(counts) 32 | 33 | 34 | if __name__ == "__main__": 35 | count_entites(sys.argv[1]) 36 | -------------------------------------------------------------------------------- /tools/validate_formula.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | from collections import Counter 4 | 5 | import numpy as np 6 | from matplotlib import pyplot as plt 7 | from scipy.integrate import quad 8 | 9 | data_size = 10000 10 | ind_alphas = [109539041.821, 78379700.038, 133908623.887] 11 | outd_alphas = [20186572.914, 14153912.686, 20194472.855] 12 | ind_betas = np.array([-2.319, -2.319, -2.085]) 13 | outd_betas = np.array([-1.720, -1.719, -1.720]) 14 | start_degree = 1 15 | max_degree = 1000 16 | 17 | 18 | def calc_integral(): 19 | a = ind_alphas[0] 20 | b = ind_betas[0] 21 | 22 | def avg_degree(n): 23 | return math.pow(n, 0.512 - 0.028 * math.log10(n)) 24 | 25 | def powerlaw_func(x): 26 | return a * np.power(x, b) 27 | 28 | areas, _ = quad(powerlaw_func, start_degree, max_degree) 29 | print(np.power((np.power(max_degree, b + 1) + 9 * np.power(0, b + 1)) / 10, 1 / (b + 1))) 30 | 31 | 32 | # According to https://mathworld.wolfram.com/RandomNumber.html 33 | # The formula to transform uniform distribution to powerlaw distribution is: 34 | # x = [(x1^(n+1) - x0^(n+1))*y + x0^(n+1)]^(1/(n+1)) 35 | def draw_powerlaw(): 36 | beta = np.average(outd_betas) 37 | 38 | def powerlaw_func(y): 39 | return (int)(np.power( 40 | (np.power(max_degree, beta + 1) - np.power(start_degree, beta + 1)) * y + np.power(start_degree, beta + 1), 41 | 1 / (beta + 1))) 42 | 43 | degree = 
[powerlaw_func(random.uniform(0, 1)) for _ in range(0, data_size)] 44 | freq = Counter(degree).most_common() 45 | degrees = [] 46 | counts = [] 47 | for deg, count in freq: 48 | degrees.append(deg) 49 | counts.append(count) 50 | plt.scatter(degrees, counts) 51 | plt.loglog() 52 | plt.plot() 53 | plt.show() 54 | 55 | 56 | if __name__ == "__main__": 57 | draw_powerlaw() 58 | -------------------------------------------------------------------------------- /transformation/.gitignore: -------------------------------------------------------------------------------- 1 | *.duckdb* 2 | incremental/ 3 | *.log 4 | -------------------------------------------------------------------------------- /transformation/install-dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | 19 | pip3 install --user duckdb==0.7.1 pytz networkit pandas 20 | -------------------------------------------------------------------------------- /transformation/transform.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | ## Point this path to the directory containing the `raw` directory 20 | FinBench_DATA_ROOT=${PATH_TO_FINBENCH_DATA} 21 | 22 | set -eu 23 | set -o pipefail 24 | 25 | cd "$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" 26 | 27 | rm -rf ${FinBench_DATA_ROOT}/deletes ${FinBench_DATA_ROOT}/inserts 28 | mkdir ${FinBench_DATA_ROOT}/deletes ${FinBench_DATA_ROOT}/inserts 29 | 30 | echo "##### Transform to snapshots and write queries #####" 31 | echo 32 | echo "\${FinBench_DATA_ROOT}: ${FinBench_DATA_ROOT}" 33 | echo 34 | 35 | python3 ./convert_data.py --raw_dir ${FinBench_DATA_ROOT} --output_dir ${FinBench_DATA_ROOT} | tee output.log 36 | --------------------------------------------------------------------------------