├── .github ├── CODEOWNERS └── workflows │ └── ci.yml ├── .gitignore ├── .scalafmt.conf ├── LICENSE.txt ├── NOTICE.txt ├── README.md ├── checkstyle.xml ├── code_of_conduct.md ├── contributing.md ├── data-schema.png ├── ldbc-logo.png ├── logs └── .gitignore ├── paramgen ├── parameter_curation.py ├── search_params.py └── time_select.py ├── pom.xml ├── scripts ├── get-spark-to-home.sh ├── run.py ├── run_cluster.sh ├── run_local.sh └── run_paramgen.sh ├── src ├── main │ ├── java │ │ └── ldbc │ │ │ └── finbench │ │ │ └── datagen │ │ │ ├── config │ │ │ ├── ConfigParser.java │ │ │ ├── DatagenConfiguration.java │ │ │ ├── ScaleFactor.java │ │ │ └── ScaleFactors.java │ │ │ ├── entities │ │ │ ├── DynamicActivity.java │ │ │ ├── edges │ │ │ │ ├── CompanyApplyLoan.java │ │ │ │ ├── CompanyGuaranteeCompany.java │ │ │ │ ├── CompanyInvestCompany.java │ │ │ │ ├── CompanyOwnAccount.java │ │ │ │ ├── Deposit.java │ │ │ │ ├── PersonApplyLoan.java │ │ │ │ ├── PersonGuaranteePerson.java │ │ │ │ ├── PersonInvestCompany.java │ │ │ │ ├── PersonOwnAccount.java │ │ │ │ ├── Repay.java │ │ │ │ ├── SignIn.java │ │ │ │ ├── Transfer.java │ │ │ │ └── Withdraw.java │ │ │ ├── nodes │ │ │ │ ├── Account.java │ │ │ │ ├── Company.java │ │ │ │ ├── Loan.java │ │ │ │ ├── Medium.java │ │ │ │ ├── Person.java │ │ │ │ └── PersonOrCompany.java │ │ │ └── place │ │ │ │ └── Place.java │ │ │ ├── generation │ │ │ ├── DatagenContext.java │ │ │ ├── DatagenParams.java │ │ │ ├── dictionary │ │ │ │ ├── CommonTextDictionary.java │ │ │ │ ├── Dictionaries.java │ │ │ │ ├── EmailDictionary.java │ │ │ │ ├── NumbersGenerator.java │ │ │ │ ├── PercentageTextDictionary.java │ │ │ │ ├── PersonNameDictionary.java │ │ │ │ ├── PlaceDictionary.java │ │ │ │ └── PlaceZOrder.java │ │ │ ├── distribution │ │ │ │ ├── AccountDeleteDistribution.java │ │ │ │ ├── Bucket.java │ │ │ │ ├── DegreeDistribution.java │ │ │ │ ├── GeometricDistribution.java │ │ │ │ ├── MultiplicityDistribution.java │ │ │ │ ├── PowerLawActivityDeleteDistribution.java │ │ │ │ 
├── PowerLawBucketsDistribution.java │ │ │ │ ├── PowerLawFormulaDistribution.java │ │ │ │ └── TimeDistribution.java │ │ │ ├── events │ │ │ │ ├── AccountActivitiesEvent.java │ │ │ │ ├── CompanyActivitiesEvent.java │ │ │ │ ├── CompanyInvestEvent.java │ │ │ │ ├── LoanActivitiesEvents.java │ │ │ │ ├── PersonActivitiesEvent.java │ │ │ │ ├── PersonInvestEvent.java │ │ │ │ └── SignInEvent.java │ │ │ └── generators │ │ │ │ ├── AccountGenerator.java │ │ │ │ ├── CompanyGenerator.java │ │ │ │ ├── DateGenerator.java │ │ │ │ ├── LoanGenerator.java │ │ │ │ ├── MediumGenerator.java │ │ │ │ └── PersonGenerator.java │ │ │ └── util │ │ │ ├── DateTimeUtils.java │ │ │ ├── RandomGeneratorFarm.java │ │ │ └── ZOrder.java │ ├── resources │ │ ├── README.md │ │ ├── dictionaries │ │ │ ├── accountLevels.txt │ │ │ ├── accountNicknames.txt │ │ │ ├── accountTypes.txt │ │ │ ├── businessTypes.txt │ │ │ ├── citiesByCountry.txt │ │ │ ├── companyNames.txt │ │ │ ├── dicLocations.txt │ │ │ ├── emails.txt │ │ │ ├── goodsTypes.txt │ │ │ ├── guaranteeRelationships.txt │ │ │ ├── loanOrganizations.txt │ │ │ ├── loanUsages.txt │ │ │ ├── mediumNames.txt │ │ │ ├── payTypes.txt │ │ │ ├── randomText.txt │ │ │ ├── riskLevels.txt │ │ │ ├── surnames.txt │ │ │ └── urls.txt │ │ ├── distributions │ │ │ ├── accountDelete.txt │ │ │ ├── facebookPowerlawBucket.dat │ │ │ ├── hourDistribution.dat │ │ │ ├── inDegreeRegression.txt │ │ │ ├── multiplicityPowerlawRegression.txt │ │ │ ├── outDegreeRegression.txt │ │ │ └── powerLawActivityDeleteDate.txt │ │ ├── log4j.properties │ │ ├── params_default.ini │ │ └── scale_factors.xml │ └── scala │ │ └── ldbc │ │ └── finbench │ │ └── datagen │ │ ├── LdbcDatagen.scala │ │ ├── factors │ │ ├── AccountItemsGenerator.scala │ │ ├── FactorGenerationStage.scala │ │ └── FactorTable.scala │ │ ├── generation │ │ ├── ActivitySimulator.scala │ │ ├── GenerationStage.scala │ │ ├── generators │ │ │ ├── ActivityGenerator.scala │ │ │ ├── SparkAccountGenerator.scala │ │ │ ├── SparkCompanyGenerator.scala 
│ │ │ ├── SparkMediumGenerator.scala │ │ │ └── SparkPersonGenerator.scala │ │ └── serializers │ │ │ └── ActivitySerializer.scala │ │ ├── io │ │ ├── Reader.scala │ │ ├── Writer.scala │ │ ├── dataframes.scala │ │ ├── graphs.scala │ │ └── raw │ │ │ └── package.scala │ │ ├── model │ │ ├── package.scala │ │ └── raw.scala │ │ ├── syntax │ │ ├── FluentSyntax.scala │ │ ├── PathSyntax.scala │ │ ├── SparkSqlSyntax.scala │ │ └── package.scala │ │ ├── transformation │ │ └── TransformationStage.scala │ │ └── util │ │ ├── Logging.scala │ │ ├── SparkApp.scala │ │ ├── SparkEnv.scala │ │ ├── SparkUI.scala │ │ ├── package.scala │ │ └── sql.scala └── test │ ├── java │ └── ldbc │ │ └── finbench │ │ └── datagen │ │ ├── generators │ │ ├── DistributionTest.java │ │ └── GeneratorTest.java │ │ └── util │ │ └── GeneralTest.java │ └── scala │ └── ldbc │ └── finbench │ └── datagen │ └── util │ └── UtilPackageSuite.scala ├── tools ├── README.md ├── check_consistency.py ├── check_deletion.py ├── check_duplicate.py ├── check_transfer.py ├── legacy │ ├── dataprofiler │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── algo.h │ │ ├── compile.sh │ │ ├── de_core.cpp │ │ ├── plot.py │ │ ├── profiler.cpp │ │ ├── result │ │ │ ├── db139 │ │ │ │ ├── edges.txt │ │ │ │ ├── in-out.txt │ │ │ │ ├── in_degree_dist.png │ │ │ │ ├── in_degree_dist.txt │ │ │ │ ├── in_degree_dist_regression.png │ │ │ │ ├── in_degree_dist_regression.txt │ │ │ │ ├── out-in.txt │ │ │ │ ├── out_degree_dist.png │ │ │ │ ├── out_degree_dist.txt │ │ │ │ ├── out_degree_dist_regression.png │ │ │ │ └── out_degree_dist_regression.txt │ │ │ ├── db177 │ │ │ │ ├── in-out.txt │ │ │ │ ├── in_degree_dist.png │ │ │ │ ├── in_degree_dist.txt │ │ │ │ ├── in_degree_dist_regression.png │ │ │ │ ├── in_degree_dist_regression.txt │ │ │ │ ├── out-in.txt │ │ │ │ ├── out_degree_dist.png │ │ │ │ ├── out_degree_dist.txt │ │ │ │ ├── out_degree_dist_regression.png │ │ │ │ └── out_degree_dist_regression.txt │ │ │ ├── db184 │ │ │ │ ├── edges.txt │ │ │ 
│ ├── in-out.txt │ │ │ │ ├── in_degree_dist.png │ │ │ │ ├── in_degree_dist.txt │ │ │ │ ├── in_degree_dist_regression.png │ │ │ │ ├── in_degree_dist_regression.txt │ │ │ │ ├── out-in.txt │ │ │ │ ├── out_degree_dist.png │ │ │ │ ├── out_degree_dist.txt │ │ │ │ ├── out_degree_dist_regression.png │ │ │ │ └── out_degree_dist_regression.txt │ │ │ ├── hubvertex_indeg │ │ │ │ ├── hub_indeg_1.png │ │ │ │ ├── hub_indeg_1.txt │ │ │ │ ├── hub_indeg_1_regression.png │ │ │ │ ├── hub_indeg_1_regression.txt │ │ │ │ ├── hub_indeg_2.png │ │ │ │ ├── hub_indeg_2.txt │ │ │ │ ├── hub_indeg_2_regression.png │ │ │ │ ├── hub_indeg_2_regression.txt │ │ │ │ ├── hub_indeg_3.png │ │ │ │ ├── hub_indeg_3.txt │ │ │ │ ├── hub_indeg_3_regression.png │ │ │ │ ├── hub_indeg_3_regression.txt │ │ │ │ ├── hub_indeg_4.png │ │ │ │ ├── hub_indeg_4.txt │ │ │ │ ├── hub_indeg_4_regression.png │ │ │ │ ├── hub_indeg_4_regression.txt │ │ │ │ ├── hub_indeg_5.png │ │ │ │ ├── hub_indeg_5.txt │ │ │ │ ├── hub_indeg_5_regression.png │ │ │ │ └── hub_indeg_5_regression.txt │ │ │ ├── hubvertex_outdeg │ │ │ │ ├── hub_outdeg_1.png │ │ │ │ ├── hub_outdeg_1.txt │ │ │ │ ├── hub_outdeg_1_regression.png │ │ │ │ ├── hub_outdeg_1_regression.txt │ │ │ │ ├── hub_outdeg_2.png │ │ │ │ ├── hub_outdeg_2.txt │ │ │ │ ├── hub_outdeg_2_regression.png │ │ │ │ ├── hub_outdeg_2_regression.txt │ │ │ │ ├── hub_outdeg_3.png │ │ │ │ ├── hub_outdeg_3.txt │ │ │ │ ├── hub_outdeg_3_regression.png │ │ │ │ ├── hub_outdeg_3_regression.txt │ │ │ │ ├── hub_outdeg_4.png │ │ │ │ ├── hub_outdeg_4.txt │ │ │ │ ├── hub_outdeg_4_regression.png │ │ │ │ ├── hub_outdeg_4_regression.txt │ │ │ │ ├── hub_outdeg_5.png │ │ │ │ ├── hub_outdeg_5.txt │ │ │ │ ├── hub_outdeg_5_regression.png │ │ │ │ └── hub_outdeg_5_regression.txt │ │ │ └── transfer │ │ │ │ ├── in-out.txt │ │ │ │ ├── in_degree_dist.png │ │ │ │ ├── in_degree_dist.txt │ │ │ │ ├── in_degree_dist_regression.png │ │ │ │ ├── in_degree_dist_regression.txt │ │ │ │ ├── out-in.txt │ │ │ │ ├── out_degree_dist.png │ │ │ │ 
├── out_degree_dist.txt │ │ │ │ ├── out_degree_dist_regression.png │ │ │ │ └── out_degree_dist_regression.txt │ │ └── wcc_core.cpp │ ├── factorgen │ │ ├── factor_table.sh │ │ ├── generate_account.py │ │ ├── loan.py │ │ ├── params_gen.properties │ │ ├── params_gen.py │ │ ├── split_amount.py │ │ └── time_split.py │ └── graphgen │ │ ├── Makefile │ │ ├── README.md │ │ └── graph_gen.c ├── merge_cluster_output.py ├── statistic.py └── validate_formula.py └── transformation ├── .gitignore ├── convert_data.py ├── install-dependencies.sh ├── readwrites.sql ├── snapshot.sql ├── transform.sh └── writes.sql /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # * @qishipengqsp -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | pull_request: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | verify: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Set up JDK 1.8 16 | uses: actions/setup-java@v3 17 | with: 18 | java-version: '8' 19 | distribution: 'adopt' 20 | - name: Cache local Maven repository 21 | uses: actions/cache@v3 22 | with: 23 | path: ~/.m2/repository 24 | key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} 25 | restore-keys: | 26 | ${{ runner.os }}-maven- 27 | - name: Build with Maven 28 | run: mvn --batch-mode --update-snapshots verify -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # project env files 2 | .DS_Store 3 | *.iml 4 | *.log 5 | target/ 6 | .idea/ 7 | .vscode/ 8 | .metals/ 9 | .bloop/ 10 | 11 | # local generated data 12 | out*/ 13 | out*.tar 14 | out*.tar.gz 15 | 16 | # tune 17 | tune/ 18 | tune.log 19 | 20 | # scripts 21 | scripts/*.log 22 | scripts/*.png 23 | 
24 | sf*/ 25 | sf*.tar 26 | sf*.tar.gz 27 | 28 | paramgen/__pycache__/ 29 | tools/paramgen/__pycache__/ -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = "3.7.15" 2 | runner.dialect = scala213 -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | Copyright [2020-]2022 Linked Data Benchmark Council 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![logo](ldbc-logo.png) 2 | 3 | # FinBench DataGen 4 | 5 | ![Build status](https://github.com/ldbc/ldbc_finbench_datagen/actions/workflows/ci.yml/badge.svg?branch=main) 6 | 7 | The LDBC FinBench Data Generator (Datagen) produces the datasets for 8 | the [LDBC FinBench's workloads](https://ldbcouncil.org/benchmarks/finbench/). 9 | 10 | This data generator produces labelled directed property graphs based on the simulation of financial activities in 11 | business systems. The key features include generation, factorization and transformation.
A detailed description of the 12 | schema produced by Datagen, as well as the format of the output files, can be found in the latest version of official 13 | LDBC FinBench specification document. 14 | 15 | ## DataGen Design 16 | 17 | ### Data Schema 18 | 19 | ![Schema](./data-schema.png) 20 | 21 | ### Implementation 22 | 23 | - Generation: Generation simulates financial activities in business systems to produce the raw data. 24 | - Factorization: Factorization profiles of the raw data to produce factor tables used for further parameter curation. 25 | - Transformation: Transformation transforms the raw data to the data for SUT and benchmark driver. 26 | 27 | Note: 28 | 29 | - Generation and Factorization are implemented in Scala while transformation is implemented in Python 30 | under `transformation/`. 31 | - SUT stands for System Under Test. 32 | 33 | ## Quick Start 34 | 35 | ### Pre-requisites 36 | 37 | - Java 8 installed. 38 | - Python3 and related packages installed. See each `install-dependencies.sh` for details. 39 | - Scala 2.12, note that it will be integrated when maven builds. 40 | - Spark deployed. Spark 3.2.x is the recommended runtime to use. The rest of the instructions are provided assuming 41 | Spark 3.2.x. 42 | 43 | ### Workflow 44 | 45 | - Use the spark application to generate the factor tables and raw data. 46 | - Use the python scripts to transform the data to snapshot data and write queries. 47 | 48 | ### Generation of Raw Data 49 | 50 | - Deploy Spark 51 | - use `scripts/get-spark-to-home.sh` to download pre-built spark to home directory and then decompress it. 52 | - Set the PATH environment variable to include the Spark binaries. 53 | - Build the project 54 | - run `mvn clean package -DskipTests` to package the artifacts. 55 | - Run locally with scripts 56 | - See `scripts/run_local.sh` for details. It uses spark-submit to run the data generator. Please make sure you have 57 | the pre-requisites installed and the build is successful. 
58 | - Run in cloud: To be supported 59 | - Run in cluster: To be supported 60 | 61 | ### Transformation of Raw Data 62 | 63 | - set the `${FinBench_DATA_ROOT}` variable in `transformation/transform.sh` and run. 64 | 65 | ## TroubleShooting 66 | 67 | N/A yet 68 | 69 | # Related Work 70 | 71 | - FinBench Specification: https://github.com/ldbc/ldbc_finbench_docs 72 | - FinBench Driver: https://github.com/ldbc/ldbc_finbench_driver 73 | - FinBench Reference Implementation: https://github.com/ldbc/ldbc_finbench_transaction_impls 74 | - FinBench ACID Suite: https://github.com/ldbc/finbench-acid 75 | 76 | -------------------------------------------------------------------------------- /code_of_conduct.md: -------------------------------------------------------------------------------- 1 | For our code of conduct, see: https://github.com/ldbc/community/blob/main/code_of_conduct.md 2 | -------------------------------------------------------------------------------- /contributing.md: -------------------------------------------------------------------------------- 1 | For our contributor's guide, see: https://github.com/ldbc/community/blob/main/contributing.md -------------------------------------------------------------------------------- /data-schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/data-schema.png -------------------------------------------------------------------------------- /ldbc-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/ldbc-logo.png -------------------------------------------------------------------------------- /logs/.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | */*.log 3 | *.png 
-------------------------------------------------------------------------------- /scripts/get-spark-to-home.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eu 4 | cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 5 | 6 | curl https://archive.apache.org/dist/spark/spark-3.2.2/spark-3.2.2-bin-hadoop3.2.tgz | tar -xz -C ${HOME}/ 7 | -------------------------------------------------------------------------------- /scripts/run_cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LDBC_FINBENCH_DATAGEN_JAR=target/ldbc_finbench_datagen-0.2.0-SNAPSHOT-jar-with-dependencies.jar 4 | OUTPUT_DIR=/tmp/finbench-out/ 5 | 6 | echo "start: " `date` 7 | 8 | # Run Spark Application 9 | 10 | #--conf "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version=2" \ 11 | # --num-executors 2 \ 12 | # --conf "spark.shuffle.service.enabled=true" \ 13 | # --conf "spark.dynamicAllocation.enabled=true" \ 14 | # --conf "spark.dynamicAllocation.minExecutors=1" \ 15 | # --conf "spark.dynamicAllocation.maxExecutors=10" \ 16 | # --conf "spark.yarn.maximizeResourceAllocation=true" \ 17 | # --conf "spark.memory.offHeap.enabled=true" \ 18 | # --conf "spark.memory.offHeap.size=100g" \ 19 | time spark-submit --master spark://finbench-large-00:7077 \ 20 | --class ldbc.finbench.datagen.LdbcDatagen \ 21 | --num-executors 2 \ 22 | --conf "spark.default.parallelism=800" \ 23 | --conf "spark.network.timeout=100000" \ 24 | --conf "spark.shuffle.compress=true" \ 25 | --conf "spark.shuffle.spill.compress=true" \ 26 | --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ 27 | --conf "spark.driver.memory=100g" \ 28 | --conf "spark.driver.maxResultSize=0" \ 29 | --conf "spark.executor.memory=400g" \ 30 | --conf "spark.executor.memoryOverheadFactor=0.5" \ 31 | --conf "spark.executor.extraJavaOptions=-XX:+UseG1GC" \ 32 | ${LDBC_FINBENCH_DATAGEN_JAR} \ 
33 | --scale-factor 100 \ 34 | --output-dir ${OUTPUT_DIR} 35 | 36 | echo "End: " `date` 37 | -------------------------------------------------------------------------------- /scripts/run_local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LDBC_FINBENCH_DATAGEN_JAR=target/ldbc_finbench_datagen-0.2.0-SNAPSHOT-jar-with-dependencies.jar 4 | OUTPUT_DIR=out 5 | 6 | # Note: generate factor tables with --generate-factors 7 | 8 | # run locally with the python script 9 | # time python3 scripts/run.py --jar $LDBC_FINBENCH_DATAGEN_JAR --main-class ldbc.finbench.datagen.LdbcDatagen --memory 500g -- --scale-factor 30 --output-dir ${OUTPUT_DIR} 10 | 11 | # run locally with spark-submit command 12 | # **({'spark.driver.extraJavaOptions': '-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005'}), # Debug 13 | # **({'spark.executor.extraJavaOptions': '-verbose:gc -XX:+UseG1GC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps'}), 14 | # --conf "spark.memory.offHeap.enabled=true" \ 15 | # --conf "spark.memory.offHeap.size=100g" \ 16 | # --conf "spark.storage.memoryFraction=0" \ 17 | # --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ 18 | 19 | time spark-submit --master local[*] \ 20 | --class ldbc.finbench.datagen.LdbcDatagen \ 21 | --driver-memory 480g \ 22 | --conf "spark.default.parallelism=500" \ 23 | --conf "spark.shuffle.compress=true" \ 24 | --conf "spark.shuffle.spill.compress=true" \ 25 | --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \ 26 | --conf "spark.memory.offHeap.enabled=true" \ 27 | --conf "spark.memory.offHeap.size=100g" \ 28 | --conf "spark.storage.memoryFraction=0" \ 29 | --conf "spark.driver.maxResultSize=0" \ 30 | --conf "spark.executor.extraJavaOptions=-XX:+UseG1GC" \ 31 | ${LDBC_FINBENCH_DATAGEN_JAR} \ 32 | --scale-factor 10 \ 33 | --output-dir ${OUTPUT_DIR} 34 | 35 | # currently works on SF100 36 | #time spark-submit --master local[*] \ 37 | # 
--class ldbc.finbench.datagen.LdbcDatagen \ 38 | # --driver-memory 400g \ 39 | # --conf "spark.default.parallelism=800" \ 40 | # --conf "spark.shuffle.compress=true" \ 41 | # --conf "spark.shuffle.spill.compress=true" \ 42 | # --conf "spark.kryoserializer.buffer.max=512m" \ 43 | # --conf "spark.driver.maxResultSize=0" \ 44 | # --conf "spark.driver.extraJavaOptions=-Xss512m" \ 45 | # --conf "spark.executor.extraJavaOptions=-Xss512m -XX:+UseG1GC" \ 46 | # --conf "spark.kryo.referenceTracking=false" \ 47 | # ${LDBC_FINBENCH_DATAGEN_JAR} \ 48 | # --scale-factor 100 \ 49 | # --output-dir ${OUTPUT_DIR} 50 | 51 | -------------------------------------------------------------------------------- /scripts/run_paramgen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | LDBC_FINBENCH_DATAGEN_JAR=target/ldbc_finbench_datagen-0.2.0-SNAPSHOT-jar-with-dependencies.jar 4 | OUTPUT_DIR=out/ 5 | 6 | # Note: generate factor tables with --generate-factors 7 | 8 | echo "start factor table generation" 9 | 10 | time spark-submit --master local[*] \ 11 | --class ldbc.finbench.datagen.LdbcDatagen \ 12 | --driver-memory 480g \ 13 | ${LDBC_FINBENCH_DATAGEN_JAR} \ 14 | --output-dir ${OUTPUT_DIR} \ 15 | --factor-format csv \ 16 | --generate-factors 17 | 18 | echo "start parameter curation" -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/config/ConfigParser.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.config; 18 | 19 | import java.io.FileInputStream; 20 | import java.io.IOException; 21 | import java.io.InputStream; 22 | import java.io.InputStreamReader; 23 | import java.nio.charset.StandardCharsets; 24 | import java.util.HashMap; 25 | import java.util.Map; 26 | import java.util.Properties; 27 | 28 | public class ConfigParser { 29 | 30 | public static Map readConfig(String paramsFile) { 31 | try (FileInputStream fis = new FileInputStream(paramsFile)) { 32 | return readConfig(fis); 33 | } catch (IOException e) { 34 | throw new RuntimeException(e); 35 | } 36 | } 37 | 38 | public static Map readConfig(InputStream paramStream) { 39 | Properties properties = new Properties(); 40 | Map res = new HashMap<>(); 41 | try { 42 | properties.load(new InputStreamReader(paramStream, StandardCharsets.UTF_8)); 43 | for (String s : properties.stringPropertyNames()) { 44 | res.put(s, properties.getProperty(s)); 45 | } 46 | return res; 47 | } catch (IOException e) { 48 | System.err.println(e.getMessage()); 49 | throw new RuntimeException(e); 50 | } 51 | } 52 | 53 | public static Map scaleFactorConf(String scaleFactorXml, String scaleFactorId) { 54 | Map conf = new HashMap<>(); 55 | ScaleFactors scaleFactors = ScaleFactors.INSTANCE; 56 | scaleFactors.initialize(scaleFactorXml); // use default if empty 57 | if (!scaleFactors.value.containsKey(scaleFactorId)) { 58 | throw new IllegalArgumentException("Scale factor " + scaleFactorId + " does not exist"); 59 | } 60 | ScaleFactor scaleFactor = 
scaleFactors.value.get(scaleFactorId); 61 | System.out.println("Applied configuration from " + (scaleFactorXml.isEmpty() ? "default" : scaleFactorXml) 62 | + " of scale factor " + scaleFactorId); 63 | for (Map.Entry e : scaleFactor.properties.entrySet()) { 64 | conf.put(e.getKey(), e.getValue()); 65 | } 66 | return conf; 67 | } 68 | 69 | 70 | } 71 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/config/DatagenConfiguration.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.config; 18 | 19 | import java.io.Serializable; 20 | import java.util.Iterator; 21 | import java.util.Map; 22 | 23 | public class DatagenConfiguration implements Iterable>, Serializable { 24 | public final Map map; 25 | 26 | public DatagenConfiguration(Map map) { 27 | this.map = map; 28 | } 29 | 30 | public String get(String key) { 31 | return map.get(key); 32 | } 33 | 34 | public String get(String key, String defaultValue) { 35 | return map.getOrDefault(key, defaultValue); 36 | } 37 | 38 | public String getTrimmed(String name) { 39 | String value = this.get(name); 40 | return null == value ? 
null : value.trim(); 41 | } 42 | 43 | private String getHexDigits(String value) { 44 | boolean negative = false; 45 | String str = value; 46 | String hexString; 47 | if (value.startsWith("-")) { 48 | negative = true; 49 | str = value.substring(1); 50 | } 51 | 52 | if (!str.startsWith("0x") && !str.startsWith("0X")) { 53 | return null; 54 | } else { 55 | hexString = str.substring(2); 56 | if (negative) { 57 | hexString = "-" + hexString; 58 | } 59 | 60 | return hexString; 61 | } 62 | } 63 | 64 | public int getInt(String name, int defaultValue) { 65 | String valueString = this.getTrimmed(name); 66 | if (valueString == null) { 67 | return defaultValue; 68 | } else { 69 | String hexString = this.getHexDigits(valueString); 70 | return hexString != null ? Integer.parseInt(hexString, 16) : Integer.parseInt(valueString); 71 | } 72 | } 73 | 74 | public double getDouble(String name, double defaultValue) { 75 | String valueString = this.getTrimmed(name); 76 | return valueString == null ? defaultValue : Double.parseDouble(valueString); 77 | } 78 | 79 | @Override 80 | public Iterator> iterator() { 81 | return this.map.entrySet().iterator(); 82 | } 83 | 84 | public String getOutputDir() { 85 | return map.get("generator.outputDir"); 86 | } 87 | 88 | public String getFormat() { 89 | return map.get("generator.format"); 90 | } 91 | 92 | public String getPartition() { 93 | return map.get("spark.partition"); 94 | } 95 | 96 | public void printConfig() { 97 | System.out.println("********* Configuration *********"); 98 | map.forEach((key, value) -> System.out.println(key + ": " + value)); 99 | System.out.println("*********************************"); 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/config/ScaleFactor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * 
Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.config; 18 | 19 | import java.util.TreeMap; 20 | 21 | /** One named scale-factor configuration set: a sorted map of generator property name to value. Instances are created and populated by ScaleFactors.initialize() while parsing scale_factors.xml. */ public class ScaleFactor { 22 | /* Property name -> value. Declared raw in this dump; presumably TreeMap<String, String> upstream (ScaleFactors puts String names/values from the XML) — TODO confirm against the original source. */ public TreeMap properties; 23 | 24 | /* Package-private: construction happens inside ScaleFactors.initialize(); starts empty, entries added per <property> element. */ ScaleFactor() { 25 | properties = new TreeMap<>(); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/config/ScaleFactors.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package ldbc.finbench.datagen.config; 18 | 19 | import java.io.IOException; 20 | import java.io.InputStream; 21 | import java.nio.file.Files; 22 | import java.nio.file.Paths; 23 | import java.util.TreeMap; 24 | import javax.xml.parsers.DocumentBuilder; 25 | import javax.xml.parsers.DocumentBuilderFactory; 26 | import javax.xml.parsers.ParserConfigurationException; 27 | import org.w3c.dom.Document; 28 | import org.w3c.dom.Element; 29 | import org.w3c.dom.Node; 30 | import org.w3c.dom.NodeList; 31 | import org.xml.sax.SAXException; 32 | 33 | public class ScaleFactors { 34 | public TreeMap value; 35 | 36 | public static final ScaleFactors INSTANCE = new ScaleFactors(); 37 | 38 | private ScaleFactors() { 39 | } 40 | 41 | public void initialize(String scaleFactorsXml) { 42 | try { 43 | value = new TreeMap<>(); 44 | DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); 45 | InputStream configFile = scaleFactorsXml.isEmpty() 46 | ? ScaleFactors.class.getResourceAsStream("/scale_factors.xml") 47 | : Files.newInputStream(Paths.get(scaleFactorsXml)); 48 | Document doc = builder.parse(configFile); 49 | doc.getDocumentElement().normalize(); 50 | 51 | System.out.println("Reading scale factors from " + (scaleFactorsXml.isEmpty() ? 
"default" : 52 | scaleFactorsXml) + "..."); 53 | NodeList nodes = doc.getElementsByTagName("scale_factor"); 54 | for (int i = 0; i < nodes.getLength(); i++) { 55 | Node node = nodes.item(i); 56 | if (node.getNodeType() == Node.ELEMENT_NODE) { 57 | Element element = (Element) node; 58 | String scaleFactorName = element.getAttribute("name"); 59 | ScaleFactor scaleFactor = new ScaleFactor(); 60 | NodeList properties = ((Element) node).getElementsByTagName("property"); 61 | for (int j = 0; j < properties.getLength(); ++j) { 62 | Element property = (Element) properties.item(j); 63 | String name = property.getElementsByTagName("name").item(0).getTextContent(); 64 | String value = property.getElementsByTagName("value").item(0).getTextContent(); 65 | scaleFactor.properties.put(name, value); 66 | } 67 | System.out.println("Available scale factor configuration set " + scaleFactorName); 68 | value.put(scaleFactorName, scaleFactor); 69 | } 70 | } 71 | System.out.println("Number of scale factors read " + value.size()); 72 | } catch (ParserConfigurationException | IOException | SAXException e) { 73 | throw new RuntimeException(e); 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/entities/DynamicActivity.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package ldbc.finbench.datagen.entities;

/**
 * Contract for entities whose lifetime is bounded by a creation and a deletion
 * timestamp (epoch milliseconds elsewhere in this generator — confirm unit at call sites).
 */
public interface DynamicActivity {

    /** Timestamp at which the entity comes into existence. */
    long getCreationDate();

    /** Timestamp at which the entity ceases to exist. */
    long getDeletionDate();

    /** True when the deletion was an explicit user action rather than an implicit expiry. */
    boolean isExplicitlyDeleted();

}

--------------------------------------------------------------------------------
/src/main/java/ldbc/finbench/datagen/entities/edges/CompanyOwnAccount.java:
--------------------------------------------------------------------------------
/*
 * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package ldbc.finbench.datagen.entities.edges;

import java.io.Serializable;
import ldbc.finbench.datagen.entities.DynamicActivity;
import ldbc.finbench.datagen.entities.nodes.Account;
import ldbc.finbench.datagen.entities.nodes.Company;
import ldbc.finbench.datagen.entities.nodes.PersonOrCompany;
import ldbc.finbench.datagen.generation.dictionary.Dictionaries;
import ldbc.finbench.datagen.util.RandomGeneratorFarm;

/**
 * Ownership edge from a {@link Company} to an {@link Account}.
 * The edge inherits the account's deletion date and explicit-deletion flag
 * (see {@link #createCompanyOwnAccount}).
 */
public class CompanyOwnAccount implements DynamicActivity, Serializable {
    private final long companyId;
    private final long accountId;
    private final long creationDate;
    private final long deletionDate;
    private final boolean isExplicitlyDeleted;
    private final String comment;
    private final Account account; // TODO: can be removed

    public CompanyOwnAccount(Company company, Account account, long creationDate, long deletionDate,
                             boolean isExplicitlyDeleted, String comment) {
        this.companyId = company.getCompanyId();
        this.accountId = account.getAccountId();
        this.account = account; // TODO: can be removed
        this.creationDate = creationDate;
        this.deletionDate = deletionDate;
        this.isExplicitlyDeleted = isExplicitlyDeleted;
        this.comment = comment;
    }

    /**
     * Creates the ownership edge and registers it on the company.
     * Side effects: sets the account's owner type/owner to this company and
     * appends the new edge to {@code company.getCompanyOwnAccounts()}.
     * The comment is drawn from the COMMON_COMMENT random stream; do not reorder
     * draws from the farm — edges are generated deterministically per stream.
     */
    public static void createCompanyOwnAccount(RandomGeneratorFarm farm, Company company, Account account,
                                               long creationDate) {
        account.setOwnerType(PersonOrCompany.COMPANY);
        account.setCompanyOwner(company);
        String comment =
            Dictionaries.randomTexts.getUniformDistRandomTextForComments(
                farm.get(RandomGeneratorFarm.Aspect.COMMON_COMMENT));
        CompanyOwnAccount companyOwnAccount =
            new CompanyOwnAccount(company, account, creationDate, account.getDeletionDate(),
                                  account.isExplicitlyDeleted(), comment);
        company.getCompanyOwnAccounts().add(companyOwnAccount);
    }

    public long getCompanyId() {
        return companyId;
    }

    public long getAccountId() {
        return accountId;
    }

    @Override
    public long getCreationDate() {
        return creationDate;
    }

    @Override
    public long getDeletionDate() {
        return deletionDate;
    }

    @Override
    public boolean isExplicitlyDeleted() {
        return isExplicitlyDeleted;
    }

    public String getComment() {
        return comment;
    }

    public Account getAccount() {
        return account;
    }
}

--------------------------------------------------------------------------------
/src/main/java/ldbc/finbench/datagen/entities/edges/Deposit.java:
--------------------------------------------------------------------------------
/*
 * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package ldbc.finbench.datagen.entities.edges;

import java.io.Serializable;
import ldbc.finbench.datagen.entities.DynamicActivity;
import ldbc.finbench.datagen.entities.nodes.Account;
import ldbc.finbench.datagen.entities.nodes.Loan;
import ldbc.finbench.datagen.generation.dictionary.Dictionaries;
import ldbc.finbench.datagen.util.RandomGeneratorFarm;

/**
 * Deposit edge from a {@link Loan} to an {@link Account}, carrying the amount
 * deposited. Deletion date and explicit-deletion flag are inherited from the account.
 */
public class Deposit implements DynamicActivity, Serializable {
    private final long loanId;
    private final long accountId;
    private final double amount;
    private final long creationDate;
    private final long deletionDate;
    private final boolean isExplicitlyDeleted;
    private final String comment;

    public Deposit(Loan loan, Account account, double amount, long creationDate, long deletionDate,
                   boolean isExplicitlyDeleted, String comment) {
        this.loanId = loan.getLoanId();
        this.accountId = account.getAccountId();
        this.amount = amount;
        this.creationDate = creationDate;
        this.deletionDate = deletionDate;
        this.isExplicitlyDeleted = isExplicitlyDeleted;
        this.comment = comment;
    }

    /**
     * Creates a deposit edge and registers it on the loan.
     * The creation date is drawn from the LOAN_SUBEVENTS_DATE stream, bounded by
     * the account's deletion date; the comment from COMMON_COMMENT. The two draws
     * happen in this order — do not reorder, generation is stream-deterministic.
     */
    public static void createDeposit(RandomGeneratorFarm farm, Loan loan, Account account, double amount) {
        long creationDate =
            Dictionaries.dates.randomLoanToAccountDate(farm.get(RandomGeneratorFarm.Aspect.LOAN_SUBEVENTS_DATE), loan,
                                                       account, account.getDeletionDate());
        String comment =
            Dictionaries.randomTexts.getUniformDistRandomTextForComments(
                farm.get(RandomGeneratorFarm.Aspect.COMMON_COMMENT));
        Deposit deposit =
            new Deposit(loan, account, amount, creationDate, account.getDeletionDate(), account.isExplicitlyDeleted(),
                        comment);
        loan.addDeposit(deposit);
        // Deliberately not mirrored on the account side (kept from original):
        //account.getDeposits().add(deposit);
    }

    public double getAmount() {
        return amount;
    }

    public long getLoanId() {
        return loanId;
    }

    public long getAccountId() {
        return accountId;
    }

    @Override
    public long getCreationDate() {
        return creationDate;
    }

    @Override
    public long getDeletionDate() {
        return deletionDate;
    }

    @Override
    public boolean isExplicitlyDeleted() {
        return isExplicitlyDeleted;
    }

    public String getComment() {
        return comment;
    }
}

--------------------------------------------------------------------------------
/src/main/java/ldbc/finbench/datagen/entities/edges/PersonGuaranteePerson.java:
--------------------------------------------------------------------------------
/*
 * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package ldbc.finbench.datagen.entities.edges;

import java.io.Serializable;
import ldbc.finbench.datagen.entities.DynamicActivity;
import ldbc.finbench.datagen.entities.nodes.Person;
import ldbc.finbench.datagen.generation.dictionary.Dictionaries;
import ldbc.finbench.datagen.util.RandomGeneratorFarm;

/**
 * Guarantee edge between two {@link Person} nodes, labelled with a relationship
 * kind (e.g. drawn from the guarantee-relationship dictionary).
 */
public class PersonGuaranteePerson implements DynamicActivity, Serializable {
    private final long fromPersonId;
    private final long toPersonId;
    private final long creationDate;
    private final long deletionDate;
    private final boolean isExplicitlyDeleted;
    private final String relationship;
    private final String comment;

    public PersonGuaranteePerson(Person fromPerson, Person toPerson,
                                 long creationDate, long deletionDate, boolean isExplicitlyDeleted, String relation,
                                 String comment) {
        this.fromPersonId = fromPerson.getPersonId();
        this.toPersonId = toPerson.getPersonId();
        this.creationDate = creationDate;
        this.deletionDate = deletionDate;
        this.isExplicitlyDeleted = isExplicitlyDeleted;
        this.relationship = relation;
        this.comment = comment;
    }

    /**
     * Creates a guarantee edge and appends it to {@code fromPerson.getGuaranteeSrc()}.
     * Draw order from the farm (date, relationship, comment) is fixed — do not
     * reorder, generation is stream-deterministic. Guarantee edges are created
     * with deletionDate 0 and isExplicitlyDeleted false, i.e. they are never deleted.
     */
    public static void createPersonGuaranteePerson(RandomGeneratorFarm farm, Person fromPerson, Person toPerson) {
        long creationDate = Dictionaries.dates.randomPersonToPersonDate(
            farm.get(RandomGeneratorFarm.Aspect.PERSON_GUARANTEE_DATE), fromPerson, toPerson);
        String relation = Dictionaries.guaranteeRelationships.getDistributedText(
            farm.get(RandomGeneratorFarm.Aspect.PERSON_GUARANTEE_RELATIONSHIP));
        String comment =
            Dictionaries.randomTexts.getUniformDistRandomTextForComments(
                farm.get(RandomGeneratorFarm.Aspect.COMMON_COMMENT));
        PersonGuaranteePerson personGuaranteePerson =
            new PersonGuaranteePerson(fromPerson, toPerson, creationDate, 0, false, relation, comment);
        fromPerson.getGuaranteeSrc().add(personGuaranteePerson);
    }

    public long getFromPersonId() {
        return fromPersonId;
    }

    public long getToPersonId() {
        return toPersonId;
    }

    @Override
    public long getCreationDate() {
        return creationDate;
    }

    @Override
    public long getDeletionDate() {
        return deletionDate;
    }

    @Override
    public boolean isExplicitlyDeleted() {
        return isExplicitlyDeleted;
    }

    public String getRelationship() {
        return relationship;
    }

    public String getComment() {
        return comment;
    }
}

--------------------------------------------------------------------------------
/src/main/java/ldbc/finbench/datagen/entities/edges/Repay.java:
--------------------------------------------------------------------------------
/*
 * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package ldbc.finbench.datagen.entities.edges;

import java.io.Serializable;
import ldbc.finbench.datagen.entities.DynamicActivity;
import ldbc.finbench.datagen.entities.nodes.Account;
import ldbc.finbench.datagen.entities.nodes.Loan;
import ldbc.finbench.datagen.generation.dictionary.Dictionaries;
import ldbc.finbench.datagen.util.RandomGeneratorFarm;

/**
 * Repayment edge from an {@link Account} to a {@link Loan} (the mirror of
 * {@link Deposit}). Deletion date and flag are inherited from the account.
 */
public class Repay implements DynamicActivity, Serializable {
    private final long accountId;
    private final long loanId;
    private final double amount;
    private final long creationDate;
    private final long deletionDate;
    private final boolean isExplicitlyDeleted;
    private final String comment;

    public Repay(Account account, Loan loan, double amount, long creationDate, long deletionDate,
                 boolean isExplicitlyDeleted, String comment) {
        this.accountId = account.getAccountId();
        this.loanId = loan.getLoanId();
        this.amount = amount;
        this.creationDate = creationDate;
        this.deletionDate = deletionDate;
        this.isExplicitlyDeleted = isExplicitlyDeleted;
        this.comment = comment;
    }

    /**
     * Creates a repay edge and registers it on the loan.
     * Creation date comes from the LOAN_SUBEVENTS_DATE stream bounded by the
     * account's deletion date; the comment from COMMON_COMMENT. Draw order is
     * fixed — do not reorder, generation is stream-deterministic.
     */
    public static void createRepay(RandomGeneratorFarm farm, Account account, Loan loan, double amount) {
        long creationDate =
            Dictionaries.dates.randomAccountToLoanDate(farm.get(RandomGeneratorFarm.Aspect.LOAN_SUBEVENTS_DATE),
                                                       account, loan, account.getDeletionDate());
        String comment =
            Dictionaries.randomTexts.getUniformDistRandomTextForComments(
                farm.get(RandomGeneratorFarm.Aspect.COMMON_COMMENT));
        Repay repay = new Repay(account, loan, amount, creationDate, account.getDeletionDate(),
                                account.isExplicitlyDeleted(), comment);
        loan.addRepay(repay);
        // Deliberately not mirrored on the account side (kept from original):
        //account.getRepays().add(repay);
    }

    public double getAmount() {
        return amount;
    }

    public long getAccountId() {
        return accountId;
    }

    public long getLoanId() {
        return loanId;
    }

    @Override
    public long getCreationDate() {
        return creationDate;
    }

    @Override
    public long getDeletionDate() {
        return deletionDate;
    }

    @Override
    public boolean isExplicitlyDeleted() {
        return isExplicitlyDeleted;
    }

    public String getComment() {
        return comment;
    }
}

--------------------------------------------------------------------------------
/src/main/java/ldbc/finbench/datagen/entities/nodes/Medium.java:
--------------------------------------------------------------------------------
/*
 * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
15 | */ 16 | 17 | package ldbc.finbench.datagen.entities.nodes; 18 | 19 | import java.io.Serializable; 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | import ldbc.finbench.datagen.entities.edges.SignIn; 23 | 24 | public class Medium implements Serializable { 25 | private long mediumId; 26 | private String mediumName; 27 | private final List signIns; 28 | private long creationDate; 29 | private boolean isBlocked; 30 | private long lastLogin; 31 | private String riskLevel; 32 | 33 | public Medium() { 34 | signIns = new ArrayList<>(); 35 | } 36 | 37 | public Medium(long mediumId, String mediumName, long creationDate, boolean isBlocked) { 38 | signIns = new ArrayList<>(); 39 | this.mediumId = mediumId; 40 | this.mediumName = mediumName; 41 | this.creationDate = creationDate; 42 | this.isBlocked = isBlocked; 43 | } 44 | 45 | @Override 46 | public boolean equals(Object obj) { 47 | if (obj instanceof Medium) { 48 | Medium other = (Medium) obj; 49 | return mediumId == other.mediumId; 50 | } 51 | return false; 52 | } 53 | 54 | @Override 55 | public int hashCode() { 56 | return Long.hashCode(mediumId); 57 | } 58 | 59 | public long getMediumId() { 60 | return mediumId; 61 | } 62 | 63 | public void setMediumId(long mediumId) { 64 | this.mediumId = mediumId; 65 | } 66 | 67 | public String getMediumName() { 68 | return mediumName; 69 | } 70 | 71 | public void setMediumName(String mediumName) { 72 | this.mediumName = mediumName; 73 | } 74 | 75 | public List getSignIns() { 76 | return signIns; 77 | } 78 | 79 | public long getCreationDate() { 80 | return creationDate; 81 | } 82 | 83 | public void setCreationDate(long creationDate) { 84 | this.creationDate = creationDate; 85 | } 86 | 87 | public boolean isBlocked() { 88 | return isBlocked; 89 | } 90 | 91 | public void setBlocked(boolean blocked) { 92 | isBlocked = blocked; 93 | } 94 | 95 | public long getLastLogin() { 96 | return lastLogin; 97 | } 98 | 99 | public void setLastLogin(long lastLogin) { 100 | 
this.lastLogin = lastLogin; 101 | } 102 | 103 | public String getRiskLevel() { 104 | return riskLevel; 105 | } 106 | 107 | public void setRiskLevel(String riskLevel) { 108 | this.riskLevel = riskLevel; 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/entities/nodes/PersonOrCompany.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.entities.nodes; 18 | 19 | public enum PersonOrCompany { 20 | PERSON, COMPANY 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/entities/place/Place.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package ldbc.finbench.datagen.entities.place;

import java.io.Serializable;

/**
 * Geographic place node (city, country or continent — see the type constants),
 * with coordinates, population and a Z-order index ({@code zid}) used for
 * spatial ordering elsewhere in the generator.
 */
@SuppressWarnings("serial")
public class Place implements Serializable {

    public static final String CITY = "City";
    public static final String COUNTRY = "Country";
    public static final String CONTINENT = "Continent";

    private int id;
    private int zid;
    private String name;
    private double latitude;
    private double longitude;
    private long population;
    private String type;

    public Place() {
    }

    // NOTE(review): the constructor takes population as int while the field is
    // long — presumably fine for current datasets, but widening the parameter
    // would be the eventual fix; confirm against callers before changing.
    public Place(int id, String name, double longitude, double latitude, int population, String type) {
        this.id = id;
        this.name = name;
        this.longitude = longitude;
        this.latitude = latitude;
        this.population = population;
        this.type = type;
    }

    public int getZId() {
        return zid;
    }

    public void setZId(int zid) {
        this.zid = zid;
    }

    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public double getLongitude() {
        return longitude;
    }

    public void setLongitude(double longitude) {
        this.longitude = longitude;
    }

    public double getLatitude() {
        return latitude;
    }

    public void setLatitude(double latitude) {
        this.latitude = latitude;
    }

    public long getPopulation() {
        return population;
    }

    public void setPopulation(long population) {
        this.population = population;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

}

--------------------------------------------------------------------------------
/src/main/java/ldbc/finbench/datagen/generation/DatagenContext.java:
--------------------------------------------------------------------------------
/*
 * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
15 | */ 16 | 17 | package ldbc.finbench.datagen.generation; 18 | 19 | import ldbc.finbench.datagen.config.DatagenConfiguration; 20 | import ldbc.finbench.datagen.generation.dictionary.Dictionaries; 21 | 22 | public class DatagenContext { 23 | 24 | private static transient volatile boolean initialized = false; 25 | 26 | public static synchronized void initialize(DatagenConfiguration conf) { 27 | if (!initialized) { 28 | DatagenParams.readConf(conf); 29 | Dictionaries.loadDictionaries(); 30 | initialized = true; 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/generation/dictionary/CommonTextDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen.generation.dictionary; 18 | 19 | import java.io.BufferedReader; 20 | import java.io.IOException; 21 | import java.io.InputStreamReader; 22 | import java.nio.charset.StandardCharsets; 23 | import java.util.Objects; 24 | import java.util.Random; 25 | import java.util.TreeMap; 26 | 27 | public class CommonTextDictionary { 28 | private final TreeMap resources; 29 | 30 | public CommonTextDictionary(String filePath, String separator) { 31 | this.resources = new TreeMap<>(); 32 | 33 | try { 34 | InputStreamReader inputStreamReader = new InputStreamReader( 35 | Objects.requireNonNull(getClass().getResourceAsStream(filePath)), StandardCharsets.UTF_8); 36 | BufferedReader dictionary = new BufferedReader(inputStreamReader); 37 | String line; 38 | long totalNum = 0; 39 | while ((line = dictionary.readLine()) != null) { 40 | String[] data = line.split(separator); 41 | String surname = data[0].trim(); 42 | this.resources.put(totalNum, surname); 43 | totalNum++; 44 | } 45 | dictionary.close(); 46 | } catch (IOException e) { 47 | throw new RuntimeException(e); 48 | } 49 | } 50 | 51 | public String getUniformDistRandomText(Random random) { 52 | long index = random.nextInt(resources.size()); 53 | return resources.get(index); 54 | } 55 | 56 | public String getUniformDistRandomTextForComments(Random random) { 57 | StringBuilder text = new StringBuilder(); 58 | for (int i = 0; i < 5; i++) { 59 | text.append(resources.get((long) random.nextInt(resources.size()))).append(" "); 60 | } 61 | return text.toString(); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/generation/dictionary/EmailDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may 
not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.generation.dictionary; 18 | 19 | import java.io.BufferedReader; 20 | import java.io.IOException; 21 | import java.io.InputStreamReader; 22 | import java.util.ArrayList; 23 | import java.util.List; 24 | import java.util.Random; 25 | 26 | 27 | public class EmailDictionary { 28 | private final List emails; 29 | private final List cumulativeDistribution; 30 | 31 | public EmailDictionary(String filePath, String separator) { 32 | try { 33 | BufferedReader emailDictionary = new BufferedReader( 34 | new InputStreamReader(getClass().getResourceAsStream(filePath), "UTF-8")); 35 | 36 | emails = new ArrayList<>(); 37 | cumulativeDistribution = new ArrayList<>(); 38 | 39 | String line; 40 | double cummulativeDist = 0.0; 41 | while ((line = emailDictionary.readLine()) != null) { 42 | String[] data = line.split(separator); 43 | emails.add(data[0]); 44 | if (data.length == 2) { 45 | cummulativeDist += Double.parseDouble(data[1]); 46 | cumulativeDistribution.add(cummulativeDist); 47 | } 48 | } 49 | emailDictionary.close(); 50 | } catch (IOException e) { 51 | throw new RuntimeException(e); 52 | } 53 | } 54 | 55 | public String getRandomEmail(Random randomTop, Random randomEmail) { 56 | int minIdx = 0; 57 | int maxIdx = cumulativeDistribution.size() - 1; 58 | double prob = randomTop.nextDouble(); 59 | if (prob > cumulativeDistribution.get(maxIdx)) { 60 | int idx = randomEmail.nextInt(emails.size() - cumulativeDistribution.size()) + 
cumulativeDistribution 61 | .size(); 62 | return emails.get(idx); 63 | } else if (prob < cumulativeDistribution.get(minIdx)) { 64 | return emails.get(minIdx); 65 | } 66 | 67 | while ((maxIdx - minIdx) > 1) { 68 | int middlePoint = minIdx + (maxIdx - minIdx) / 2; 69 | if (prob > cumulativeDistribution.get(middlePoint)) { 70 | minIdx = middlePoint; 71 | } else { 72 | maxIdx = middlePoint; 73 | } 74 | } 75 | return emails.get(maxIdx); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/generation/dictionary/NumbersGenerator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen.generation.dictionary; 18 | 19 | import java.util.Random; 20 | 21 | public class NumbersGenerator { 22 | 23 | // TODO: add more 24 | private String[] bankCode = {"001", "100", "102", "103", "104", "105", "301", "302", "303", "304", "305", "306", 25 | "307", "308", "309",}; 26 | 27 | // TODO: add more 28 | private String[] districtCode = 29 | {"1100", "1200", "3700", "2100", "1400", "4100", "2200", "2300", "6100", "6200", "6300", 30 | "6400", "6500", "4600", "8100", "8200", "2900", "5000", "4400", "4500", "4300", "4200", "3200", "3300"}; 31 | 32 | public NumbersGenerator() { 33 | } 34 | 35 | public String generatePhonenum(Random random) { 36 | return String.format("%03d", random.nextInt(1000)) 37 | + "-" + String.format("%04d", random.nextInt(10000)); 38 | } 39 | 40 | public String generateOrdernum(Random random) { 41 | return bankCode[random.nextInt(bankCode.length)] 42 | + districtCode[random.nextInt(districtCode.length)] 43 | + String.format("%04d", random.nextInt(1000)) 44 | + String.format("%04d", random.nextInt(10000)); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/generation/dictionary/PercentageTextDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
package ldbc.finbench.datagen.generation.dictionary;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * Dictionary whose entries carry a probability. Entries are drawn according to
 * the cumulative distribution built from the resource file.
 *
 * <p>File format: one entry per line, "text&lt;separator&gt;probability".
 */
public class PercentageTextDictionary {
    private final List<String> resources;
    private final List<Double> cumulativeDistribution;

    public PercentageTextDictionary(String filePath, String separator) {
        resources = new ArrayList<>();
        cumulativeDistribution = new ArrayList<>();

        // try-with-resources: the original never closed the reader on a parse failure.
        try (BufferedReader dictionary = new BufferedReader(
                new InputStreamReader(getClass().getResourceAsStream(filePath), StandardCharsets.UTF_8))) {
            String line;
            double cumulativeDist = 0.0;
            while ((line = dictionary.readLine()) != null) {
                String[] data = line.split(separator);
                cumulativeDist += Double.parseDouble(data[1]);
                resources.add(data[0]);
                cumulativeDistribution.add(cumulativeDist);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** Returns the dictionary entry at the given index (file order). */
    public String getName(int id) {
        return resources.get(id);
    }

    /**
     * Draws an entry according to the cumulative distribution via binary search.
     */
    public String getDistributedText(Random random) {
        double prob = random.nextDouble();
        int minIdx = 0;
        // Fix: the original cast this index to byte, silently truncating it for
        // dictionaries with more than 127 entries.
        int maxIdx = (prob < cumulativeDistribution.get(minIdx)) ? minIdx : cumulativeDistribution.size() - 1;
        // Binary search for the first bucket whose cumulative value covers prob.
        while ((maxIdx - minIdx) > 1) {
            int middlePoint = minIdx + (maxIdx - minIdx) / 2;
            if (prob > cumulativeDistribution.get(middlePoint)) {
                minIdx = middlePoint;
            } else {
                maxIdx = middlePoint;
            }
        }
        return resources.get(maxIdx);
    }
}
package ldbc.finbench.datagen.generation.dictionary;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Random;
import java.util.TreeMap;

/**
 * Dictionary of person surnames, indexed 0..N-1 in file order.
 */
public class PersonNameDictionary {
    private final TreeMap<Long, String> personSurnames;

    public PersonNameDictionary(String filePath, String separator) {
        this.personSurnames = new TreeMap<>();
        // try-with-resources: the original never closed the reader when a line
        // was malformed and an exception escaped.
        try (BufferedReader dictionary = new BufferedReader(new InputStreamReader(
                getClass().getResourceAsStream(filePath), StandardCharsets.UTF_8))) {
            String line;
            long totalNumSurnames = 0;
            while ((line = dictionary.readLine()) != null) {
                String[] data = line.split(separator);
                // assumes each line has at least two columns and the surname is
                // the second one — TODO confirm against the resource file format
                this.personSurnames.put(totalNumSurnames, data[1].trim());
                totalNumSurnames++;
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** Returns a surname chosen uniformly at random. */
    public String getUniformDistRandName(Random random) {
        long nameIndex = random.nextInt(personSurnames.size());
        return personSurnames.get(nameIndex);
    }
}
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.generation.dictionary; 18 | 19 | // Private class used to sort countries by their z-order value. 20 | class PlaceZOrder implements Comparable { 21 | 22 | public int id; 23 | Integer zvalue; 24 | 25 | PlaceZOrder(int id, int zvalue) { 26 | this.id = id; 27 | this.zvalue = zvalue; 28 | } 29 | 30 | public int compareTo(PlaceZOrder obj) { 31 | return zvalue.compareTo(obj.zvalue); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/generation/distribution/AccountDeleteDistribution.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
package ldbc.finbench.datagen.generation.distribution;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * Probability of deleting an account as a function of its max degree, loaded
 * from a resource file containing one probability per line (line i = degree i).
 */
public class AccountDeleteDistribution implements Serializable {

    private double[] distribution;
    private final String distributionFile;

    public AccountDeleteDistribution(String distributionFile) {
        this.distributionFile = distributionFile;
    }

    /** Loads the per-degree delete probabilities; must be called before isDeleted. */
    public void initialize() {
        // try-with-resources: the original leaked the reader (never closed).
        try (BufferedReader distributionBuffer = new BufferedReader(
                new InputStreamReader(getClass().getResourceAsStream(distributionFile), StandardCharsets.UTF_8))) {
            List<Double> temp = new ArrayList<>();
            String line;
            while ((line = distributionBuffer.readLine()) != null) {
                temp.add(Double.valueOf(line));
            }
            distribution = temp.stream().mapToDouble(Double::doubleValue).toArray();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Returns true if an account with the given degree should be deleted.
     * Degrees beyond the table fall back to a 0.99^degree power law.
     */
    public boolean isDeleted(Random random, long maxDegree) {
        if (maxDegree < distribution.length) {
            return random.nextDouble() < distribution[(int) maxDegree];
        } else {
            // support degree more than 1000
            return random.nextDouble() < Math.pow(0.99, maxDegree);
        }
    }
}
you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.generation.distribution; 18 | 19 | public abstract class DegreeDistribution { 20 | 21 | public abstract void initialize(); 22 | 23 | public abstract void reset(long seed); 24 | 25 | public abstract long nextDegree(); 26 | 27 | public double mean(long numPersons) { 28 | return -1; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/generation/distribution/MultiplicityDistribution.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen.generation.distribution; 18 | 19 | public class MultiplicityDistribution { 20 | 21 | public MultiplicityDistribution() { 22 | } 23 | 24 | 25 | public void reset(long seed) { 26 | } 27 | 28 | 29 | public long nextDegree() { 30 | return 0; 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/ldbc/finbench/datagen/generation/distribution/PowerLawActivityDeleteDistribution.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
package ldbc.finbench.datagen.generation.distribution;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * Power-law distribution over activity lifetimes (in minutes), used to draw how
 * long after creation an activity is deleted. Bucket boundaries are the fixed
 * {@code minutes} table; bucket probabilities come from the resource file.
 */
public class PowerLawActivityDeleteDistribution {

    private double[] minutes;
    private double[] distribution;
    private String distributionFile;

    public PowerLawActivityDeleteDistribution(String distributionFile) {
        this.distributionFile = distributionFile;

        // Bucket upper bounds in minutes (up to one week = 10080).
        this.minutes =
            new double[] {0, 0.5, 1, 5, 10, 20, 30, 40, 60, 120, 300, 1440, 2880, 4320, 5760, 7200, 8460, 10080};

    }

    /** Loads the cumulative bucket probabilities; must be called before nextDouble. */
    public void initialize() {
        // try-with-resources: the original leaked the reader (never closed).
        // assumes the file has one probability per line and exactly as many lines
        // as the minutes table has entries — TODO confirm against the resource
        try (BufferedReader distributionBuffer = new BufferedReader(
                new InputStreamReader(getClass().getResourceAsStream(distributionFile), StandardCharsets.UTF_8))) {
            List<Double> temp = new ArrayList<>();
            String line;
            while ((line = distributionBuffer.readLine()) != null) {
                temp.add(Double.valueOf(line));
            }
            distribution = temp.stream().mapToDouble(Double::doubleValue).toArray();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Maps a cumulative probability to a lifetime drawn uniformly inside the
     * matching minutes bucket; returns 0 when prob exceeds every bucket.
     */
    public double nextDouble(double prob, Random random) {
        double draw = 0;
        for (int i = 0; i < distribution.length; i++) {
            if (prob < distribution[i]) {
                // Fix: the original indexed minutes[i - 1] unconditionally, which
                // threw ArrayIndexOutOfBoundsException when prob fell in the first
                // bucket (i == 0).
                double lower = (i == 0) ? minutes[0] : minutes[i - 1];
                double upper = minutes[i];
                draw = lower + (upper - lower) * random.nextDouble();
                break;
            }
        }
        return draw;
    }

}
package ldbc.finbench.datagen.generation.distribution;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Map;
import java.util.Random;
import java.util.TreeMap;

/**
 * Hour-of-day distribution used to place events inside a day. Minutes and
 * seconds are drawn uniformly; the hour follows the loaded distribution.
 */
public class TimeDistribution {
    private Map<Integer, Double> hourDistribution;
    private double[] hourProbs;
    private final double[] hourCumulatives;

    public TimeDistribution(String hourDistributionFile) {
        loadDistribution(hourDistributionFile);
        // Build the cumulative table once so nextHour is a simple scan.
        hourCumulatives = new double[hourProbs.length];
        hourCumulatives[0] = hourProbs[0];
        for (int i = 1; i < hourProbs.length; i++) {
            hourCumulatives[i] = hourCumulatives[i - 1] + hourProbs[i];
        }
    }

    /**
     * Parses the distribution file. Each line is "&lt;probability&gt; &lt;hour&gt;"
     * — probability first, hour second (note the reversed column order).
     */
    public void loadDistribution(String hourDistributionFile) {
        // try-with-resources: the original leaked the reader when parsing threw
        // before reader.close() was reached.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(getClass().getResourceAsStream(hourDistributionFile),
                    StandardCharsets.UTF_8))) {
            hourDistribution = new TreeMap<>();
            String line;
            while ((line = reader.readLine()) != null) {
                String[] data = line.split(" ");
                hourDistribution.put(Integer.parseInt(data[1]), Double.parseDouble(data[0]));
            }
            // TreeMap iterates keys in ascending hour order, so index i == hour i.
            hourProbs = hourDistribution.values().stream().mapToDouble(Double::doubleValue).toArray();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    public Map<Integer, Double> getHourDistribution() {
        return hourDistribution;
    }

    /**
     * Samples an hour according to the distribution; returns -1 only if the
     * probabilities sum to less than rand (e.g. a distribution file that does
     * not sum to 1) — callers should treat that as "no hour".
     */
    public long nextHour(Random random) {
        double rand = random.nextDouble();
        for (int i = 0; i < hourProbs.length; i++) {
            if (rand < hourCumulatives[i]) {
                return i;
            }
        }
        return -1;
    }

    /** Uniform minute in [0, 59]. */
    public long nextMinute(Random random) {
        return (long) (random.nextDouble() * 60);
    }

    /** Uniform second in [0, 59]. */
    public long nextSecond(Random random) {
        return (long) (random.nextDouble() * 60);
    }
}
package ldbc.finbench.datagen.generation.events;

import java.io.Serializable;
import java.util.List;
import java.util.Random;
import ldbc.finbench.datagen.entities.edges.CompanyInvestCompany;
import ldbc.finbench.datagen.entities.nodes.Company;
import ldbc.finbench.datagen.generation.DatagenParams;
import ldbc.finbench.datagen.util.RandomGeneratorFarm;

/**
 * Generates company-invest-company edges for one partition of target companies.
 */
public class CompanyInvestEvent implements Serializable {
    private final RandomGeneratorFarm randomFarm;
    // Kept for symmetry with the other events; companyInvestPartition itself
    // does not draw from it.
    private final Random randIndex;

    public CompanyInvestEvent() {
        randomFarm = new RandomGeneratorFarm();
        randIndex = new Random(DatagenParams.defaultSeed);
    }

    /** Re-seeds all random generators so a partition is reproducible. */
    public void resetState(int seed) {
        randomFarm.resetRandomGenerators(seed);
        randIndex.setSeed(seed);
    }

    /**
     * For each target company, draws a number of investors in
     * [minInvestors, maxInvestors] and links each chosen investor to the target
     * unless investing is disallowed. Disallowed picks are skipped, so a target
     * may end up with fewer investors than drawn.
     */
    public List<Company> companyInvestPartition(List<Company> investors, List<Company> targets) {
        Random numInvestorsRand = randomFarm.get(RandomGeneratorFarm.Aspect.NUMS_COMPANY_INVEST);
        Random chooseInvestorRand = randomFarm.get(RandomGeneratorFarm.Aspect.CHOOSE_COMPANY_INVESTOR);
        for (Company target : targets) {
            int numInvestors = numInvestorsRand.nextInt(
                DatagenParams.maxInvestors - DatagenParams.minInvestors + 1
            ) + DatagenParams.minInvestors;
            for (int i = 0; i < numInvestors; i++) {
                int index = chooseInvestorRand.nextInt(investors.size());
                Company investor = investors.get(index);
                if (cannotInvest(investor, target)) {
                    continue;
                }
                CompanyInvestCompany.createCompanyInvestCompany(randomFarm, investor, target);
            }
        }
        return targets;
    }

    /** Disallows self-investment and duplicate edges in either direction. */
    public boolean cannotInvest(Company investor, Company target) {
        return (investor == target) || investor.hasInvestedBy(target) || target.hasInvestedBy(investor);
    }
}
package ldbc.finbench.datagen.generation.events;

import java.io.Serializable;
import java.util.List;
import java.util.Random;
import ldbc.finbench.datagen.entities.edges.PersonInvestCompany;
import ldbc.finbench.datagen.entities.nodes.Company;
import ldbc.finbench.datagen.entities.nodes.Person;
import ldbc.finbench.datagen.generation.DatagenParams;
import ldbc.finbench.datagen.util.RandomGeneratorFarm;

/**
 * Generates person-invest-company edges for one partition of target companies.
 */
public class PersonInvestEvent implements Serializable {
    private final RandomGeneratorFarm randomFarm;
    // Kept for symmetry with the other events; personInvestPartition itself
    // does not draw from it.
    private final Random randIndex;

    public PersonInvestEvent() {
        randomFarm = new RandomGeneratorFarm();
        randIndex = new Random(DatagenParams.defaultSeed);
    }

    /** Re-seeds all random generators so a partition is reproducible. */
    public void resetState(int seed) {
        randomFarm.resetRandomGenerators(seed);
        randIndex.setSeed(seed);
    }

    /**
     * For each target company, draws a number of person investors in
     * [minInvestors, maxInvestors] and links each chosen investor to the target
     * unless it has already invested. Disallowed picks are skipped, so a target
     * may end up with fewer investors than drawn.
     */
    public List<Company> personInvestPartition(List<Person> investors, List<Company> targets) {
        Random numInvestorsRand = randomFarm.get(RandomGeneratorFarm.Aspect.NUMS_PERSON_INVEST);
        Random chooseInvestorRand = randomFarm.get(RandomGeneratorFarm.Aspect.CHOOSE_PERSON_INVESTOR);
        for (Company target : targets) {
            int numInvestors = numInvestorsRand.nextInt(
                DatagenParams.maxInvestors - DatagenParams.minInvestors + 1
            ) + DatagenParams.minInvestors;
            for (int i = 0; i < numInvestors; i++) {
                int index = chooseInvestorRand.nextInt(investors.size());
                Person investor = investors.get(index);
                if (cannotInvest(investor, target)) {
                    continue;
                }
                PersonInvestCompany.createPersonInvestCompany(randomFarm, investor, target);
            }
        }
        return targets;
    }

    /** Disallows duplicate person-to-company investment edges. */
    public boolean cannotInvest(Person investor, Company target) {
        return target.hasInvestedBy(investor);
    }
}
package ldbc.finbench.datagen.generation.events;

import java.io.Serializable;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import ldbc.finbench.datagen.entities.edges.SignIn;
import ldbc.finbench.datagen.entities.nodes.Account;
import ldbc.finbench.datagen.entities.nodes.Medium;
import ldbc.finbench.datagen.generation.DatagenParams;
import ldbc.finbench.datagen.util.RandomGeneratorFarm;

/**
 * Generates sign-in edges between media and accounts for one block.
 */
public class SignInEvent implements Serializable {
    private final RandomGeneratorFarm randomFarm;
    private final Random randIndex;

    public SignInEvent() {
        randomFarm = new RandomGeneratorFarm();
        randIndex = new Random(DatagenParams.defaultSeed);
    }

    /** Re-seeds all random generators so a block is reproducible. */
    private void resetState(int seed) {
        randomFarm.resetRandomGenerators(seed);
        randIndex.setSeed(seed);
    }

    /**
     * For each medium, picks random accounts and creates 1..maxSignInPerPair
     * sign-ins per valid (medium, account) pair. Invalid pairs (account deleted
     * before the medium plus activity delta) are skipped.
     */
    public List<Medium> signIn(List<Medium> mediums, List<Account> accounts, int blockId) {
        resetState(blockId);

        Random accountsToSignRand = randomFarm.get(RandomGeneratorFarm.Aspect.NUM_ACCOUNTS_SIGNIN_PER_MEDIUM);
        Random multiplicityRandom = randomFarm.get(RandomGeneratorFarm.Aspect.MULTIPLICITY_SIGNIN);
        // NOTE(review): drawn once for the whole block, so every medium signs in
        // to the same number of accounts — confirm this is intended rather than
        // a per-medium draw inside the loop.
        int numAccountsToSign = accountsToSignRand.nextInt(DatagenParams.maxAccountToSignIn);

        for (Medium medium : mediums) {
            for (int i = 0; i < Math.max(1, numAccountsToSign); i++) {
                Account accountToSign = accounts.get(randIndex.nextInt(accounts.size()));
                if (cannotSignIn(medium, accountToSign)) {
                    continue;
                }
                int numSignIn = multiplicityRandom.nextInt(DatagenParams.maxSignInPerPair);
                for (int mid = 0; mid < Math.max(1, numSignIn); mid++) {
                    SignIn.createSignIn(randomFarm, mid, medium, accountToSign);
                }
            }
        }
        return mediums;
    }

    /** A medium cannot sign in to an account that is deleted too soon after the medium's creation. */
    public boolean cannotSignIn(Medium from, Account to) {
        return from.getCreationDate() + DatagenParams.activityDelta > to.getDeletionDate();
    }
}
package ldbc.finbench.datagen.util;

import java.time.Instant;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.Month;
import java.time.ZoneId;

/**
 * Conversions between epoch milliseconds and UTC calendar values, plus a few
 * calendar predicates used by the generators.
 */
public class DateTimeUtils {
    // Fix: was a mutable public static field; made final so callers cannot
    // accidentally swap the zone at runtime.
    public static final ZoneId UTC = ZoneId.of("UTC");

    /** Epoch millis of midnight (start of day) UTC for the given date. */
    public static long toEpochMilli(LocalDate ld) {
        return ld.atStartOfDay(UTC).toInstant().toEpochMilli();
    }

    /** Epoch millis of the given date-time interpreted as UTC. */
    public static long toEpochMilli(LocalDateTime ldt) {
        return ldt.atZone(UTC).toInstant().toEpochMilli();
    }

    public static LocalDate utcDateOfEpochMilli(long epochMilli) {
        return Instant.ofEpochMilli(epochMilli).atZone(UTC).toLocalDate();
    }

    public static LocalDateTime utcDateTimeOfEpochMilli(long epochMilli) {
        return Instant.ofEpochMilli(epochMilli).atZone(UTC).toLocalDateTime();
    }

    /**
     * Travel season is May–June (month in (4, 7)) or after November 23rd.
     */
    public static boolean isTravelSeason(long epochMilli) {
        LocalDate date = utcDateOfEpochMilli(epochMilli);

        int day = date.getDayOfMonth();
        int month = date.getMonthValue();

        if ((month > 4) && (month < 7)) {
            return true;
        }
        return ((month == 11) && (day > 23));
    }

    /**
     * Number of months elapsed since (startMonth, startYear), counting the
     * starting month as month 1.
     */
    public static int getNumberOfMonths(long epochMilli, int startMonth, int startYear) {
        LocalDate date = utcDateOfEpochMilli(epochMilli);
        int month = date.getMonthValue();
        int year = date.getYear();
        return (year - startYear) * 12 + month - (startMonth - 1);
    }

    public static int getYear(long epochMilli) {
        LocalDateTime datetime = utcDateTimeOfEpochMilli(epochMilli);
        return datetime.getYear();
    }

    public static Month getMonth(long epochMilli) {
        LocalDateTime datetime = utcDateTimeOfEpochMilli(epochMilli);
        return datetime.getMonth();
    }
}
15 | */ 16 | 17 | package ldbc.finbench.datagen.util; 18 | 19 | 20 | public class ZOrder { 21 | 22 | private int maxBitNum; 23 | 24 | public ZOrder(int maxNumBit) { 25 | this.maxBitNum = maxNumBit; 26 | } 27 | 28 | public int getZValue(int x, int y) { 29 | 30 | String sx = Integer.toBinaryString(x); 31 | int numberToAddX = maxBitNum - sx.length(); 32 | for (int i = 0; i < numberToAddX; i++) { 33 | sx = "0" + sx; 34 | } 35 | 36 | String sy = Integer.toBinaryString(y); 37 | int numberToAddY = maxBitNum - sy.length(); 38 | for (int i = 0; i < numberToAddY; i++) { 39 | sy = "0" + sy; 40 | } 41 | 42 | String sz = ""; 43 | for (int i = 0; i < sx.length(); i++) { 44 | sz = sz + sx.substring(i, i + 1) + "" + sy.substring(i, i + 1); 45 | } 46 | 47 | return Integer.parseInt(sz, 2); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/resources/README.md: -------------------------------------------------------------------------------- 1 | # About Resources 2 | 3 | Here we list the resources used in the Data Generation. There are three typical kinds of resourses here. 4 | 5 | - Dictionaries: some raw data used as dictionaries to generate data, e.g. dummy names. 6 | - Distributions: distributions that describes the degree distribution. 7 | - Parameters: the parameters that used to define some common configurations or limits in data generation process. 8 | 9 | ## Dictionaries 10 | 11 | To avoid legal problems, we generate data using dummy names. The dummy names are generated by a free tool named 12 | FauxID[1] or copied from SNB. 
13 | 14 | | Dictionary Files | Description | 15 | |------------------|-----------------------------------------------------------------| 16 | | accountTypes.txt | the account types' values generated with ChatGPT[2] | 17 | | companyNames.txt | dummy names generated by fake-company-generator[3] on Fauxid[1] | 18 | | mediumNames.txt | medium types generated with ChatGPT[2] | 19 | | surnames.txt | surnames of persons used in SNB DataGen[4] | 20 | 21 | ## Distributions 22 | 23 | The distributions will be determined based on the real financial data profiling. 24 | 25 | | Distribution Files | Description | 26 | |---------------------------------|-------------------------------------------------------------------------| 27 | | accountDelete.txt | the distribution of the account deletion used in SNB DataGen[4] | 28 | | facebookPowerlawBucket.dat | the Facebook powerlaw bucketed distribution used in SNB DataGen[4] | 29 | | hourDistribution.dat | the distribution of the hour of the day | 30 | | inDegreeRegression.txt | the inDegree distribution in profiling results | 31 | | multiplicity.txt | the multiplicity in profiling results | 32 | | outDegreeRegression.txt | the outDegree distribution in profiling results | 33 | | powerLawAcitivityDeleteDate.txt | the powerlaw distribution to generate deleteDate used in SNB DataGen[4] | 34 | 35 | ## Parameters 36 | 37 | Here are the configuration files and parameters used in data generation, including: 38 | 39 | - params_default.ini: some common/global parameters by default 40 | - scale_factors.xml: a parameter map from the scale factors to the parameters that control the data scale in 41 | generation.
42 | 43 | # Reference 44 | 45 | [1] FauxID: https://fauxid.com 46 | [2] ChatGPT: https://chat.openai.com/ 47 | [3] fake-company-generator: https://fauxid.com/tools/fake-company-generator 48 | [4] SNB DataGen: https://github.com/ldbc/ldbc_snb_datagen_spark/ 49 | -------------------------------------------------------------------------------- /src/main/resources/dictionaries/accountLevels.txt: -------------------------------------------------------------------------------- 1 | Basic level, 0.6 2 | Silver level, 0.2 3 | Gold level, 0.1 4 | Platinum level, 0.05 5 | Diamond level, 0.03 6 | Elite level, 0.02 -------------------------------------------------------------------------------- /src/main/resources/dictionaries/accountTypes.txt: -------------------------------------------------------------------------------- 1 | certificate of deposit 2 | credit card 3 | retirement account 4 | merchant account 5 | escrow account 6 | trust account 7 | foreign currency 8 | corporate account 9 | brokerage account 10 | custodial account 11 | internet account 12 | debit card 13 | prepaid card 14 | -------------------------------------------------------------------------------- /src/main/resources/dictionaries/goodsTypes.txt: -------------------------------------------------------------------------------- 1 | Food 2 | Clothing 3 | Electronics 4 | Furniture 5 | Household appliances 6 | Personal care products 7 | Sports equipment 8 | Books and media 9 | Toys and games 10 | Automotive products 11 | Beauty products 12 | Jewelry 13 | Pet supplies 14 | Health and wellness products 15 | Home decor 16 | Office supplies 17 | Tools and hardware 18 | Garden supplies 19 | Musical instruments 20 | Art and craft supplies -------------------------------------------------------------------------------- /src/main/resources/dictionaries/guaranteeRelationships.txt: -------------------------------------------------------------------------------- 1 | friends, 0.3 2 | business associate, 0.2 3 | parents, 
0.2 4 | siblings, 0.2 5 | other relatives, 0.1 6 | -------------------------------------------------------------------------------- /src/main/resources/dictionaries/loanOrganizations.txt: -------------------------------------------------------------------------------- 1 | American Express 2 | Avant 3 | Bank of America 4 | Best Egg 5 | BlueVine 6 | Capital One 7 | Chase Bank 8 | Citibank 9 | Discover 10 | Earnest 11 | Fundbox 12 | Funding Circle 13 | Fundrise 14 | Kabbage 15 | LendingClub 16 | LendingPoint 17 | LightStream 18 | Marcus by Goldman Sachs 19 | Navy Federal Credit Union 20 | OnDeck 21 | OneMain Financial 22 | PNC Bank 23 | Patch of Land 24 | Peerform 25 | Prosper 26 | RealtyMogul 27 | Rocket Loans 28 | Roofstock 29 | SoFi 30 | State Farm Bank 31 | SunTrust 32 | TD Bank 33 | US Bank 34 | Upgrade 35 | Upstart 36 | Wells Fargo 37 | -------------------------------------------------------------------------------- /src/main/resources/dictionaries/loanUsages.txt: -------------------------------------------------------------------------------- 1 | major purchases 2 | investing 3 | renovations 4 | debt consolidation 5 | business ventures 6 | education 7 | medical expenses 8 | vacations 9 | weddings 10 | funerals 11 | other -------------------------------------------------------------------------------- /src/main/resources/dictionaries/mediumNames.txt: -------------------------------------------------------------------------------- 1 | POS 2 | ATM 3 | WIFI 4 | PHONE 5 | IPv4 6 | IPv6 7 | MAC 8 | QRCode 9 | NFC 10 | RFID -------------------------------------------------------------------------------- /src/main/resources/dictionaries/payTypes.txt: -------------------------------------------------------------------------------- 1 | PayPal 2 | Apple Pay 3 | Google Pay 4 | Alipay 5 | WeChat Pay 6 | Venmo 7 | Cash App 8 | Bank Transfer 9 | Gift Card 10 | Cryptocurrency 11 | Money Order 12 | Cheque 13 | Direct Deposit 14 | E-wallets 15 | Mobile Carrier Billing 16 | 
Invoice Factoring 17 | Payment Plan -------------------------------------------------------------------------------- /src/main/resources/dictionaries/riskLevels.txt: -------------------------------------------------------------------------------- 1 | Low risk 2 | Moderate risk 3 | High risk 4 | Very high risk 5 | Extreme risk 6 | Minimal risk 7 | Significant risk 8 | Severe risk 9 | Critical risk 10 | -------------------------------------------------------------------------------- /src/main/resources/distributions/facebookPowerlawBucket.dat: -------------------------------------------------------------------------------- 1 | 0 1 1 2 | 0 1 2 3 | 0 1.5 3 4 | 1.5 2.2 4 5 | 2.2 3.55 5 6 | 3.55 4.37 6 7 | 4.37 5.37 7 8 | 5.37 6.61 8 9 | 6.61 8.13 9 10 | 8.13 10 10 11 | 10 11.22 11 12 | 11.22 12 12 13 | 12 14 13 14 | 14 16 14 15 | 16 17 15 16 | 17 19 16 17 | 19 20 17 18 | 20 22 18 19 | 22 23 19 20 | 23 25 20 21 | 25 26 21 22 | 26 28 22 23 | 28 30 23 24 | 30 31 24 25 | 31 33 25 26 | 33 35 26 27 | 35 36 27 28 | 36 38 28 29 | 38 40 29 30 | 40 42 30 31 | 42 44 31 32 | 44 46 32 33 | 46 49 33 34 | 49 51 34 35 | 51 54 35 36 | 54 56 36 37 | 56 59 37 38 | 59 61 38 39 | 61 64 39 40 | 64 66 40 41 | 66 69 41 42 | 69 72 42 43 | 72 75 43 44 | 75 78 44 45 | 78 82 45 46 | 82 85 46 47 | 85 88 47 48 | 88 92 48 49 | 92 95 49 50 | 95 99 50 51 | 99 102 51 52 | 102 106 52 53 | 106 110 53 54 | 110 113 54 55 | 113 117 55 56 | 117 122 56 57 | 122 126 57 58 | 126 130 58 59 | 130 135 59 60 | 135 139 60 61 | 139 144 61 62 | 144 149 62 63 | 149 154 63 64 | 154 160 64 65 | 160 166 65 66 | 166 172 66 67 | 172 180 67 68 | 180 188 68 69 | 188 196 69 70 | 196 204 70 71 | 204 211 71 72 | 211 217 72 73 | 217 223 73 74 | 223 229 74 75 | 229 236 75 76 | 236 243 76 77 | 243 252 77 78 | 252 261 78 79 | 261 272 79 80 | 272 283 80 81 | 283 295 81 82 | 295 307 82 83 | 307 320 83 84 | 320 334 84 85 | 334 349 85 86 | 349 365 86 87 | 365 383 87 88 | 383 402 88 89 | 402 423 89 90 | 423 447 90 91 | 447 478 91 92 | 478 
519 92 93 | 519 570 93 94 | 570 623 94 95 | 623 674 95 96 | 674 723 96 97 | 723 781 97 98 | 781 863 98 99 | 863 1029 99 100 | 1029 5000 100 -------------------------------------------------------------------------------- /src/main/resources/distributions/hourDistribution.dat: -------------------------------------------------------------------------------- 1 | 0.2593526012653495 8 2 | 0.05160376982202897 11 3 | 0.048674656913491814 10 4 | 0.0484494454801552 12 5 | 0.04783513640895614 17 6 | 0.04541527278103532 18 7 | 0.04535949212906142 21 8 | 0.04509153664720135 9 9 | 0.04395123499873539 16 10 | 0.04118797695085037 15 11 | 0.04104902499552937 20 12 | 0.04088250920364632 19 13 | 0.04061096101271552 22 14 | 0.04058332383752758 13 15 | 0.03985351173213859 14 16 | 0.03255691753542388 7 17 | 0.028262704185304518 23 18 | 0.01786883161088602 0 19 | 0.01548368454454632 6 20 | 0.008970635808854882 1 21 | 0.005209140099636779 5 22 | 0.0051042119651415006 2 23 | 0.0035019847725102494 3 24 | 0.0031414352992730145 4 -------------------------------------------------------------------------------- /src/main/resources/distributions/inDegreeRegression.txt: -------------------------------------------------------------------------------- 1 | # Sample139, Sample184, Sample177 2 | alpha: 109539041.821,78379700.038,133908623.887 3 | beta: -2.319,-2.319,-2.085 -------------------------------------------------------------------------------- /src/main/resources/distributions/multiplicityPowerlawRegression.txt: -------------------------------------------------------------------------------- 1 | # Sample Hub Vertex 2 | alpha: 27230469.375668973, 1141214.9408893671, 156038.86238756854 3 | beta: -1.573305530151192, -1.488771851133902, -1.1012676833313366 4 | average: 1.759, 1.786, 1.969 -------------------------------------------------------------------------------- /src/main/resources/distributions/outDegreeRegression.txt: 
-------------------------------------------------------------------------------- 1 | # Sample139, Sample184, Sample177 2 | alpha: 20186572.914,14153912.686,20194472.855 3 | beta: -1.720,-1.719,-1.720 -------------------------------------------------------------------------------- /src/main/resources/distributions/powerLawActivityDeleteDate.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 0.17 3 | 0.22 4 | 0.38 5 | 0.46 6 | 0.54 7 | 0.586 8 | 0.61 9 | 0.652 10 | 0.752 11 | 0.812 12 | 0.891 13 | 0.949 14 | 0.973 15 | 0.986 16 | 0.994 17 | 0.998 18 | 1 19 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, file 2 | 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.console.layout.ConversionPattern=%d{ISO8601} [%t] %-5p %c - %m%n 6 | 7 | log4j.appender.file=org.apache.log4j.FileAppender 8 | log4j.appender.file.File=/tmp/spark-events/spark.log 9 | log4j.appender.file.layout=org.apache.log4j.PatternLayout 10 | log4j.appender.file.layout.ConversionPattern=%d{ISO8601} [%t] %-5p %c - %m%n -------------------------------------------------------------------------------- /src/main/resources/params_default.ini: -------------------------------------------------------------------------------- 1 | spark.blockSize:10000 2 | 3 | generator.defaultSeed:23 4 | generator.outputDir:out 5 | generator.startYear:2020 6 | generator.numYears:3 7 | generator.activityDelta:60000 8 | generator.deleteDelta:86400000 9 | generator.numUpdateStreams:1 10 | 11 | company.maxDescriptionLength:200 12 | 13 | account.blockedAccountRatio:0.05 14 | 15 | medium.blockedMediumRatio:0.05 16 | 17 | own.maxAccounts:5 18 | 19 | transfer.degreeDistribution:powerlaw 20 | 
transfer.multiplicityDistribution:powerlaw 21 | transfer.minMultiplicity:1 22 | transfer.maxMultiplicity:100 23 | transfer.maxAmount:10000000 24 | # not used any more 25 | transfer.baseProbCorrelated:0.99 26 | transfer.limitProCorrelated:0.5 27 | transfer.generationMode:loose 28 | transfer.shuffleTimes:1 29 | 30 | withdraw.accountWithdrawFraction:0.3 31 | withdraw.maxWithdrawals:30 32 | withdraw.maxAmount:10000000 33 | 34 | signIn.accountSignedInFraction:1.0 35 | signIn.maxAccountToSignIn:4 36 | signIn.maxMultiplicity:10 37 | 38 | guarantee.personGuaranteeFraction:0.6 39 | guarantee.companyGuaranteeFraction:0.6 40 | guarantee.maxTargetsToGuarantee:3 41 | 42 | loan.personLoanFraction:0.6 43 | loan.companyLoanFraction:0.6 44 | loan.involvedAccountsFraction:0.8 45 | loan.maxLoans:5 46 | loan.minLoanAmount:10000 47 | loan.maxLoanAmount:100000000 48 | loan.numSubEvents:10 49 | loan.maxLoanInterest:0.1 50 | 51 | invest.companyInvestedFraction:1.0 52 | invest.minInvestors:1 53 | invest.maxInvestors:5 -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/factors/AccountItemsGenerator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen.factors 18 | 19 | import org.apache.spark.sql.{SparkSession, functions => F} 20 | import org.apache.spark.sql.functions.max 21 | import org.apache.spark.sql.functions.lit 22 | 23 | object AccountItemsGenerator { 24 | def generateAccountItems(implicit spark: SparkSession): Unit = { 25 | import spark.implicits._ 26 | 27 | val accountRDD = spark.read 28 | .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat") 29 | .option("header", "true") 30 | .option("delimiter", "|") 31 | .load("./out/raw/account/*.csv") 32 | 33 | val transferRDD = spark.read 34 | .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat") 35 | .option("header", "true") 36 | .option("delimiter", "|") 37 | .load("./out/raw/transfer/*.csv") 38 | 39 | val withdrawRDD = spark.read 40 | .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat") 41 | .option("header", "true") 42 | .option("delimiter", "|") 43 | .load("./out/raw/withdraw/*.csv") 44 | 45 | val combinedRDD = transferRDD 46 | .select($"fromId", $"toId", $"amount".cast("double")) 47 | .union(withdrawRDD.select($"fromId", $"toId", $"amount".cast("double"))) 48 | 49 | val maxAmountRDD = combinedRDD 50 | .groupBy($"fromId", $"toId") 51 | .agg(max($"amount").alias("maxAmount")) 52 | 53 | val accountItemsRDD = maxAmountRDD 54 | .groupBy($"fromId") 55 | .agg(F.collect_list(F.array($"toId", $"maxAmount")).alias("items")) 56 | .select($"fromId".alias("account_id"), $"items") 57 | .sort($"account_id") 58 | 59 | val transformedAccountItemsRDD = accountItemsRDD 60 | .withColumn( 61 | "items", 62 | F.expr( 63 | "transform(items, array -> concat('[', concat_ws(',', array), ']'))" 64 | ) 65 | ) 66 | .withColumn( 67 | "items", 68 | F.concat_ws(",", $"items") 69 | ) 70 | .withColumn( 71 | "items", 72 | F.concat(lit("["), $"items", lit("]")) 73 | ) 74 | 75 | transformedAccountItemsRDD 76 | .coalesce(1) 77 | .write 78 | .option("header", "true") 79 | .option("delimiter", 
"|") 80 | .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat") 81 | .save("./out/factor_table/account_items") 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/factors/FactorTable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.factors 18 | 19 | import ldbc.finbench.datagen.model.{Graph, GraphDef, Mode} 20 | import org.apache.spark.sql.DataFrame 21 | 22 | case class FactorTable[M <: Mode]( 23 | name: String, 24 | data: DataFrame, 25 | source: Graph[M] 26 | ) 27 | 28 | case class FactorTableDef[M <: Mode]( 29 | name: String, 30 | sourceDef: GraphDef[M] 31 | ) 32 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/generation/generators/SparkAccountGenerator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.generation.generators 18 | 19 | import ldbc.finbench.datagen.config.DatagenConfiguration 20 | import ldbc.finbench.datagen.entities.nodes.Account 21 | import ldbc.finbench.datagen.generation.{DatagenContext, DatagenParams} 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.sql.SparkSession 24 | 25 | import scala.collection.JavaConverters.asScalaIteratorConverter 26 | 27 | // SparkAccountGenerator is not used to generate account data directly. 28 | object SparkAccountGenerator { 29 | // def apply(conf: DatagenConfiguration, numPartitions: Option[Int] = None)( 30 | // implicit spark: SparkSession): RDD[Account] = { 31 | // val numAccounts = 10000 32 | // 33 | // val accountPartitionGenerator = (blocks: Iterator[Long]) => { 34 | // DatagenContext.initialize(conf) 35 | // val accountGenerator = new AccountGenerator() 36 | // for { 37 | // i <- blocks 38 | // size = Math.min(numAccounts - DatagenParams.blockSize * i, DatagenParams.blockSize) 39 | // account <- accountGenerator. 
40 | // } yield account 41 | // } 42 | // val numAccountBlocks = Math.ceil(numAccounts / DatagenParams.blockSize.toDouble).toInt 43 | // val partitions = numPartitions.getOrElse(spark.sparkContext.defaultParallelism) 44 | // val accountRdd = spark.sparkContext 45 | // .range(0, numAccountBlocks, step = 1, numSlices = partitions) 46 | // .mapPartitions(accountPartitionGenerator) 47 | // 48 | // accountRdd 49 | // } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/generation/generators/SparkCompanyGenerator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen.generation.generators 18 | 19 | import ldbc.finbench.datagen.config.DatagenConfiguration 20 | import ldbc.finbench.datagen.entities.nodes.Company 21 | import ldbc.finbench.datagen.generation.DatagenContext 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.sql.SparkSession 24 | 25 | import scala.collection.JavaConverters.asScalaIteratorConverter 26 | 27 | object SparkCompanyGenerator { 28 | def apply( 29 | numCompanies: Long, 30 | config: DatagenConfiguration, 31 | blockSize: Int 32 | )(implicit spark: SparkSession): RDD[Company] = { 33 | val numBlocks = Math.ceil(numCompanies / blockSize.toDouble).toInt 34 | val partitions = Math.min(numBlocks, spark.sparkContext.defaultParallelism) 35 | 36 | spark.sparkContext 37 | .range(0, numBlocks, step = 1, numSlices = partitions) 38 | .mapPartitions { blocks => 39 | DatagenContext.initialize(config) 40 | val companyGenerator = new CompanyGenerator() 41 | 42 | blocks.flatMap { i => 43 | val size = Math.min(numCompanies - blockSize * i, blockSize) 44 | companyGenerator 45 | .generateCompanyBlock(i.toInt, blockSize) 46 | .asScala 47 | .take(size.toInt) 48 | } 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/generation/generators/SparkMediumGenerator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.generation.generators 18 | 19 | import ldbc.finbench.datagen.config.DatagenConfiguration 20 | import ldbc.finbench.datagen.entities.nodes.Medium 21 | import ldbc.finbench.datagen.generation.DatagenContext 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.sql.SparkSession 24 | 25 | import scala.collection.JavaConverters.asScalaIteratorConverter 26 | 27 | object SparkMediumGenerator { 28 | def apply( 29 | numMedia: Long, 30 | config: DatagenConfiguration, 31 | blockSize: Int 32 | )(implicit spark: SparkSession): RDD[Medium] = { 33 | val numBlocks = Math.ceil(numMedia / blockSize.toDouble).toInt 34 | val partitions = Math.min(numBlocks, spark.sparkContext.defaultParallelism) 35 | 36 | spark.sparkContext 37 | .range(0, numBlocks, step = 1, numSlices = partitions) 38 | .mapPartitions { blocks => 39 | DatagenContext.initialize(config) 40 | val mediumGenerator = new MediumGenerator() 41 | 42 | blocks.flatMap { i => 43 | val size = Math.min(numMedia - blockSize * i, blockSize) 44 | mediumGenerator 45 | .generateMediumBlock(i.toInt, blockSize) 46 | .asScala 47 | .take(size.toInt) 48 | } 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/generation/generators/SparkPersonGenerator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the 
Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.generation.generators 18 | 19 | import ldbc.finbench.datagen.config.DatagenConfiguration 20 | import ldbc.finbench.datagen.entities.nodes.Person 21 | import ldbc.finbench.datagen.generation.DatagenContext 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.sql.SparkSession 24 | 25 | import scala.collection.JavaConverters.asScalaIteratorConverter 26 | 27 | object SparkPersonGenerator { 28 | def apply(numPersons: Long, config: DatagenConfiguration, blockSize: Int)( 29 | implicit spark: SparkSession 30 | ): RDD[Person] = { 31 | val numBlocks = Math.ceil(numPersons / blockSize.toDouble).toInt 32 | val partitions = Math.min(numBlocks, spark.sparkContext.defaultParallelism) 33 | 34 | spark.sparkContext 35 | .range(0, numBlocks, step = 1, numSlices = partitions) 36 | .mapPartitions { blocks => 37 | DatagenContext.initialize(config) 38 | val personGenerator = new PersonGenerator() 39 | 40 | blocks.flatMap { i => 41 | val size = Math.min(numPersons - blockSize * i, blockSize) 42 | personGenerator 43 | .generatePersonBlock(i.toInt, blockSize) 44 | .asScala 45 | .take(size.toInt) 46 | } 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/io/Reader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked 
Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.io 18 | 19 | trait Reader[T] { 20 | type Ret 21 | 22 | def read(self: T): Ret 23 | def exists(self: T): Boolean 24 | } 25 | 26 | object Reader { 27 | type Aux[T, R] = Reader[T] { type Ret = R } 28 | 29 | def apply[T, R](implicit r: Reader.Aux[T, R]): Reader.Aux[T, R] = implicitly[Reader.Aux[T, R]] 30 | 31 | trait ReaderOps[T] { 32 | type Ret 33 | def tcInstance: Reader.Aux[T, Ret] 34 | def self: T 35 | def read: Ret = tcInstance.read(self) 36 | } 37 | 38 | object ReaderOps { 39 | type Aux[T, R] = ReaderOps[T] { type Ret = R } 40 | } 41 | 42 | object ops { 43 | import scala.language.implicitConversions 44 | implicit def toReaderOps[T, R](target: T)(implicit tc: Reader.Aux[T, R]): ReaderOps.Aux[T, R] = 45 | new ReaderOps[T] { 46 | override type Ret = R 47 | override def tcInstance: Aux[T, R] = tc 48 | override def self: T = target 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/io/Writer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the 
License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.io 18 | 19 | trait Writer[S] { 20 | type Data 21 | // def write(self: Data, sink: S): Unit 22 | } 23 | 24 | object Writer { 25 | type Aux[S, D] = Writer[S] { type Data = D } 26 | def apply[S, D](implicit r: Writer.Aux[S, D]): Writer.Aux[S, D] = implicitly[Writer.Aux[S, D]] 27 | 28 | trait WriterOps[Data] { 29 | type Sink 30 | def tcInstance: Writer.Aux[Sink, Data] 31 | def self: Data 32 | } 33 | 34 | object WriterOps { 35 | type Aux[Data, S] = WriterOps[Data] { type Sink = S } 36 | } 37 | 38 | object ops { 39 | import scala.language.implicitConversions 40 | implicit def toWriterOps[Data, S](target: Data)( 41 | implicit tc: Writer.Aux[S, Data]): WriterOps.Aux[Data, S] = new WriterOps[Data] { 42 | override type Sink = S 43 | override def tcInstance: Aux[S, Data] = tc 44 | override def self: Data = target 45 | } 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/io/dataframes.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.io 18 | 19 | import java.net.URI 20 | 21 | import org.apache.hadoop.fs.{FileSystem, Path} 22 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} 23 | import org.apache.spark.sql.types.StructType 24 | 25 | object dataframes { 26 | 27 | case class DataFrameSource( 28 | path: String, 29 | format: String, 30 | formatOptions: Map[String, String] = Map.empty, 31 | schema: Option[StructType] = None 32 | ) 33 | 34 | private class DataFrameReader(implicit spark: SparkSession) extends Reader[DataFrameSource] { 35 | override type Ret = DataFrame 36 | 37 | override def read(self: DataFrameSource): DataFrame = { 38 | spark.read 39 | .format(self.format) 40 | .options(self.formatOptions) 41 | .schema(self.schema.get) 42 | .load(self.path) 43 | } 44 | 45 | override def exists(self: DataFrameSource): Boolean = { 46 | val hadoopPath = new Path(self.path) 47 | val fs = FileSystem.get(URI.create(self.path), spark.sparkContext.hadoopConfiguration) 48 | fs.exists(hadoopPath) 49 | } 50 | } 51 | 52 | trait ReaderInstances { 53 | implicit def dataFrameReader( 54 | implicit spark: SparkSession): Reader.Aux[DataFrameSource, DataFrame] = 55 | new DataFrameReader 56 | } 57 | 58 | case class DataFrameSink(path: String, 59 | format: String, 60 | formatOptions: Map[String, String] = Map.empty, 61 | mode: SaveMode = SaveMode.ErrorIfExists, 62 | partitionBy: Seq[String] = Seq.empty) 63 | 64 | private object DataFrameWriter extends Writer[DataFrameSink] { 65 | override type Data = DataFrame 66 | } 
67 | 68 | trait WriterInstances { 69 | implicit val dataFrameWriter: Writer.Aux[DataFrameSink, DataFrame] = DataFrameWriter 70 | } 71 | 72 | trait Instances extends WriterInstances with ReaderInstances 73 | 74 | object instances extends Instances 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/io/graphs.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.io 18 | 19 | import ldbc.finbench.datagen.model.Mode.Raw.Layout 20 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} 21 | import org.slf4j.{Logger, LoggerFactory} 22 | 23 | import scala.reflect.internal.Mode 24 | 25 | object graphs { 26 | 27 | case class GraphSink( 28 | path: String, 29 | format: String, 30 | formatOptions: Map[String, String] = Map.empty, 31 | saveMode: SaveMode = SaveMode.ErrorIfExists 32 | ) 33 | 34 | case class GraphSource[M <: Mode](implicit spark: SparkSession, en: DataFrame =:= Layout) 35 | extends Reader[GraphSource[M]] { 36 | @transient lazy val log: Logger = LoggerFactory.getLogger(this.getClass) 37 | 38 | override type Ret = this.type 39 | 40 | override def read(self: GraphSource[M]): GraphSource.this.type = ??? 
41 | 42 | override def exists(self: GraphSource[M]): Boolean = ??? 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/io/raw/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.io 18 | 19 | import org.apache.spark.sql.SaveMode 20 | 21 | package object raw { 22 | 23 | sealed trait RawFormat 24 | case object Csv extends RawFormat { override def toString = "org.apache.spark.sql.execution.datasources.csv.CSVFileFormat" } 25 | case object Parquet extends RawFormat { override def toString = "parquet" } 26 | 27 | case class RawSink( 28 | outputDir: String, 29 | format: RawFormat, 30 | partitions: Option[Int] = None, 31 | formatOptions: Map[String, String] = Map.empty, 32 | mode: SaveMode = SaveMode.ErrorIfExists, 33 | partitionBy: Seq[String] = Seq.empty 34 | ) 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/syntax/FluentSyntax.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, 
Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.syntax 18 | 19 | import scala.language.implicitConversions 20 | 21 | trait FluentSyntax { 22 | @`inline` implicit final def fluentSyntaxOps[A](a: A) = new FluentSyntaxOps(a) 23 | } 24 | 25 | final class FluentSyntaxOps[A](private val self: A) extends AnyVal { 26 | 27 | /** Fluent syntax for folding with self as the base item. 28 | */ 29 | def pipeFoldLeft[F](foldable: TraversableOnce[F])(op: (A, F) => A): A = { 30 | foldable.foldLeft(self)(op) 31 | } 32 | 33 | /** Fluent syntax for applying a function on self. d 34 | */ 35 | def pipe[R](f: A => R): R = f(self) 36 | 37 | /** Fluent syntax for applying a side-effect on self. 38 | */ 39 | def tap(f: A => Unit): A = { 40 | f(self) 41 | self 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/syntax/PathSyntax.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.syntax 18 | 19 | import org.apache.hadoop.fs.Path 20 | 21 | import java.net.URI 22 | import scala.language.implicitConversions 23 | 24 | trait PathSyntax { 25 | @`inline` implicit final def pathSyntaxOpsForString[A](a: String): PathSyntaxOpsForString = new PathSyntaxOpsForString(a) 26 | @`inline` implicit final def pathSyntaxOpsForPath[A](a: Path): PathSyntaxOpsForPath = new PathSyntaxOpsForPath(a) 27 | @`inline` implicit final def pathSyntaxOpsForUri[A](a: URI): PathSyntaxOpsForUri = new PathSyntaxOpsForUri(a) 28 | } 29 | 30 | final class PathSyntaxOpsForString(private val self: String) extends AnyVal { 31 | import PathSyntaxOpsHelpers._ 32 | def /(child: String): Path = join(new Path(self), new Path(child)) 33 | def /(child: Path): Path = join(new Path(self), child) 34 | } 35 | 36 | final class PathSyntaxOpsForPath(private val self: Path) extends AnyVal { 37 | import PathSyntaxOpsHelpers._ 38 | def /(child: String): Path = join(self, new Path(child)) 39 | def /(child: Path): Path = join(self, child) 40 | } 41 | 42 | final class PathSyntaxOpsForUri(private val self: URI) extends AnyVal { 43 | import PathSyntaxOpsHelpers._ 44 | def /(child: String): Path = join(new Path(self), new Path(child)) 45 | def /(child: Path): Path = join(new Path(self), child) 46 | } 47 | 48 | private[syntax] object PathSyntaxOpsHelpers { 49 | def join(path1: Path, path2: Path): Path = new Path(ensureTrailingSlashForAbsoluteUri(path1), path2) 50 | 51 | private[this] def 
ensureTrailingSlashForAbsoluteUri(path: Path): Path = { 52 | if (path.isAbsolute) 53 | return path 54 | 55 | val uri = path.toUri 56 | 57 | if (uri.getScheme == null || uri.getPath != "") 58 | return path 59 | 60 | new Path(new URI(uri.getScheme, uri.getAuthority, "/", uri.getQuery, uri.getFragment)) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/syntax/SparkSqlSyntax.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen.syntax 18 | 19 | import org.apache.spark.sql.{Column, ColumnName, DataFrame, Dataset} 20 | 21 | import scala.language.implicitConversions 22 | 23 | trait SparkSqlSyntax { 24 | @`inline` implicit final def datasetOps[A](a: Dataset[A]) = new DatasetOps(a) 25 | @`inline` implicit final def stringToColumnOps[A](a: StringContext) = new StringToColumnOps(a) 26 | } 27 | 28 | final class DatasetOps[A](private val self: Dataset[A]) extends AnyVal { 29 | def |+|(other: Dataset[A]): Dataset[A] = self union other 30 | 31 | def select(columns: Seq[Column]): DataFrame = self.select(columns: _*) 32 | 33 | def partition(expr: Column): (Dataset[A], Dataset[A]) = { 34 | val df = self.cache() 35 | (df.filter(expr), df.filter(!expr || expr.isNull)) 36 | } 37 | } 38 | 39 | final class StringToColumnOps(private val sc: StringContext) extends AnyVal { 40 | def $(args: Any*): ColumnName = new ColumnName(sc.s(args: _*)) 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/syntax/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen 18 | 19 | package object syntax extends SparkSqlSyntax with FluentSyntax with PathSyntax -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/util/Logging.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.util 18 | 19 | import org.slf4j.{Logger, LoggerFactory} 20 | 21 | trait Logging { 22 | @transient lazy val log: Logger = LoggerFactory.getLogger(this.getClass) 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/util/SparkApp.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.util 18 | 19 | import ldbc.finbench.datagen.entities.edges._ 20 | import ldbc.finbench.datagen.entities.nodes._ 21 | import org.apache.spark.SparkConf 22 | import org.apache.spark.sql.SparkSession 23 | 24 | trait SparkApp { 25 | def appName: String 26 | 27 | type ArgsType 28 | 29 | /** execute the data generation process 30 | */ 31 | def run(args: ArgsType): Unit 32 | 33 | /** set the {@link SparkConf} 34 | */ 35 | val sparkConf = setConf(new SparkConf(), defaultSparkConf) 36 | 37 | /** spark entry {@link SparkSession} 38 | */ 39 | implicit def spark: SparkSession = 40 | SparkSession 41 | .builder() 42 | .master("local") 43 | .appName(appName) 44 | .config(sparkConf) 45 | .getOrCreate() 46 | 47 | private def applySparkConf(sparkConf: Map[String, String])( 48 | builder: SparkSession.Builder 49 | ) = 50 | sparkConf.foldLeft(builder) { case (b, (k, v)) => b.config(k, v) } 51 | 52 | def setConf(sparkConf: SparkConf, conf: Map[String, String]): SparkConf = { 53 | conf.map(entry => { 54 | if (!sparkConf.contains(entry._1)) { 55 | sparkConf.set(entry._1, entry._2) 56 | } 57 | }) 58 | registerKyroClasses(sparkConf) 59 | } 60 | 61 | def registerKyroClasses(sparkConf: SparkConf): SparkConf = { 62 | // register kryo classes for nodes 63 | sparkConf.registerKryoClasses( 64 | Array( 65 | classOf[Account], 66 | classOf[Company], 67 | classOf[Loan], 68 | classOf[Medium], 69 | classOf[Person] 70 | ) 71 | ) 72 | // register kryo classes for edges 73 | sparkConf.registerKryoClasses( 74 | Array( 
75 | classOf[CompanyApplyLoan], 76 | classOf[CompanyGuaranteeCompany], 77 | classOf[CompanyInvestCompany], 78 | classOf[CompanyOwnAccount], 79 | classOf[PersonApplyLoan], 80 | classOf[PersonGuaranteePerson], 81 | classOf[PersonInvestCompany], 82 | classOf[PersonOwnAccount], 83 | classOf[Repay], 84 | classOf[SignIn], 85 | classOf[Transfer], 86 | classOf[Withdraw] 87 | ) 88 | ) 89 | sparkConf 90 | } 91 | 92 | def defaultSparkConf: Map[String, String] = Map( 93 | "spark.sql.session.timeZone" -> "GMT", 94 | "spark.sql.sources.useV1SourceList" -> "csv" 95 | ) 96 | 97 | protected lazy val env: SparkEnv = new SparkEnv 98 | 99 | } 100 | 101 | trait DatagenStage extends SparkApp { 102 | override val appName: String = 103 | s"LDBC Finbench Datagen for Spark: ${this.getClass.getSimpleName.stripSuffix("$")}" 104 | } 105 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/util/SparkEnv.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen.util 18 | 19 | import org.apache.spark.sql.SparkSession 20 | 21 | import scala.collection.JavaConverters._ 22 | 23 | class SparkEnv(implicit spark: SparkSession) { 24 | private val sysenv = System.getenv().asScala 25 | private val invalidChars = raw"[.-]" 26 | 27 | def env(key: String): Option[String] = { 28 | sysenv 29 | .get(s"LDBC_FINBENCH_DATAGEN_${camelToUpper(key.replaceAll(invalidChars, "_"))}") 30 | .orElse(spark.conf.getOption(s"spark.ldbc.finbench.datagen.$key")) 31 | } 32 | 33 | val irFormat = env("irFormat").getOrElse("parquet") 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/util/SparkUI.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen.util 18 | 19 | import org.apache.spark.sql.SparkSession 20 | import scala.concurrent.{Future, Await} 21 | import scala.concurrent.ExecutionContext.Implicits.global 22 | 23 | object SparkUI { 24 | def job[T](jobGroup: String, jobDescription: String)(action: => T)( 25 | implicit spark: SparkSession): T = { 26 | spark.sparkContext.setJobGroup(jobGroup, jobDescription) 27 | try { 28 | action 29 | } finally { 30 | spark.sparkContext.clearJobGroup() 31 | } 32 | } 33 | 34 | def jobAsync(jobGroup: String, jobDescription: String)(action: => Unit)( 35 | implicit spark: SparkSession): Future[Unit] = { 36 | spark.sparkContext.setJobGroup(jobGroup, jobDescription) 37 | val future = Future { 38 | action 39 | } 40 | future.onComplete { _ => 41 | spark.sparkContext.clearJobGroup() 42 | } 43 | future 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/ldbc/finbench/datagen/util/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen 18 | 19 | import java.util.function.IntFunction 20 | 21 | import com.google.common.base.CaseFormat 22 | 23 | import scala.reflect.ClassTag 24 | 25 | package object util { 26 | def arrayOfSize[A: ClassTag] = new IntFunction[Array[A]] { 27 | override def apply(value: Int) = new Array[A](value) 28 | } 29 | 30 | def simpleNameOf[T: ClassTag] = implicitly[ClassTag[T]].runtimeClass.getSimpleName 31 | 32 | def pascalToCamel(str: String) = CaseFormat.UPPER_CAMEL.to(CaseFormat.LOWER_CAMEL, str) 33 | 34 | def camelToUpper(str: String) = CaseFormat.LOWER_CAMEL.to(CaseFormat.UPPER_UNDERSCORE, str) 35 | 36 | def lower(str: String) = str.toLowerCase 37 | } 38 | -------------------------------------------------------------------------------- /src/test/java/ldbc/finbench/datagen/generators/GeneratorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen.generators; 18 | 19 | import java.util.Map; 20 | import java.util.Random; 21 | import ldbc.finbench.datagen.config.ConfigParser; 22 | import ldbc.finbench.datagen.config.DatagenConfiguration; 23 | import ldbc.finbench.datagen.entities.nodes.Person; 24 | import ldbc.finbench.datagen.generation.DatagenContext; 25 | import ldbc.finbench.datagen.generation.dictionary.Dictionaries; 26 | import ldbc.finbench.datagen.generation.generators.PersonGenerator; 27 | import org.junit.Test; 28 | 29 | public class GeneratorTest { 30 | Map config; 31 | 32 | public GeneratorTest() { 33 | config = ConfigParser.readConfig("src/main/resources/params_default.ini"); 34 | config.putAll(ConfigParser.scaleFactorConf("", "0.1")); // use scale factor 0.1 35 | DatagenContext.initialize(new DatagenConfiguration(config)); 36 | } 37 | 38 | @Test 39 | public void testPersonGenerator() { 40 | PersonGenerator personGenerator = new PersonGenerator(); 41 | Person person = personGenerator.generatePerson(); 42 | assert null != person; 43 | } 44 | 45 | @Test 46 | public void testDatagenContext() { 47 | Random random = new Random(); 48 | System.out.println(Dictionaries.personNames.getUniformDistRandName(random)); 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/test/java/ldbc/finbench/datagen/util/GeneralTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package ldbc.finbench.datagen.util; 18 | 19 | import java.util.Map; 20 | import ldbc.finbench.datagen.config.ConfigParser; 21 | import org.junit.Test; 22 | 23 | public class GeneralTest { 24 | 25 | @Test 26 | public void testConfigParser() { 27 | Map config = ConfigParser.readConfig("src/main/resources/params_default.ini"); 28 | System.out.println(config); 29 | assert config.size() > 0; 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/test/scala/ldbc/finbench/datagen/util/UtilPackageSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ldbc.finbench.datagen.util 18 | 19 | import org.scalatest.BeforeAndAfterAll 20 | import org.scalatest.funsuite.AnyFunSuite 21 | 22 | class UtilPackageSuite extends AnyFunSuite with BeforeAndAfterAll { 23 | 24 | test("simpleNameOf") { 25 | val simpleName = simpleNameOf[String] 26 | assert(simpleName.equals("String")) 27 | } 28 | 29 | test("pascalToCamel") { 30 | val actualResult = pascalToCamel("PersonInvestCompany") 31 | val expectResult = "personInvestCompany" 32 | assert(actualResult.equals(expectResult)) 33 | 34 | val actualEmptyResult = pascalToCamel("") 35 | val expectEmptyResult = "" 36 | assert(actualEmptyResult.equals(expectEmptyResult)) 37 | 38 | assertThrows[NullPointerException](pascalToCamel(null)) 39 | } 40 | 41 | test("camelToUpper") { 42 | val actualResult = camelToUpper("hasTag") 43 | val expectResult = "HAS_TAG" 44 | assert(actualResult.equals(expectResult)) 45 | 46 | val actualEmtpyResult = camelToUpper("") 47 | val expectEmptyResult = "" 48 | assert(actualEmtpyResult.equals(expectEmptyResult)) 49 | 50 | assertThrows[NullPointerException](camelToUpper(null)) 51 | } 52 | 53 | test("lower") { 54 | val actualResult = lower("fasFSsfja_SFASJFA") 55 | val expectResult = "fasfssfja_sfasjfa" 56 | assert(actualResult.equals(expectResult)) 57 | assertThrows[NullPointerException](lower(null)) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | # Tools 2 | 3 | - paramgen: 4 | - parameter_curation: a tool for generating parameters for finbench queries 5 | - check_*.py: python scripts used for check the data features like consistency, distribution 6 | - merge_cluster_output.py: a python script to merge the output in cluster mode 7 | - statistic.py: a python script to calculate the statistics of the data 8 | - legacy: some legacy tools 9 | - dataprofiler: a tool for profiling graph data, 
including degree distribution, etc. 10 | - graphgen: a simple tool/example code to generate power-law distributed graph data. 11 | - factorgen: factor table generators in python version 12 | 13 | 14 | ## ParamsGen 15 | 16 | `params_gen.py` uses the CREATE_VALIDATION feature to generate parameters. 17 | 18 | The specific steps are as follows: 19 | 20 | 1. Select vertices of type Account, Person, and Loan from the dataset, and generate a parameter file that meets the input specifications for ldbc_finbench_driver. 21 | 2. Execute CREATE_VALIDATION to generate validation_params.csv. 22 | 3. Select non-empty results from validation_params.csv. 23 | 24 | Example: 25 | 26 | ```bash 27 | python3 params_gen.py 1 # gen tcr1 params 28 | ``` 29 | 30 | Other notes: 31 | 32 | 1. The generated start_timestamp and end_timestamp in the current version are fixed values. 33 | 2. For tcr4 and tcr10, this method is not efficient enough. Use the following Cypher query to search for parameters: 34 | 35 | ```Cypher 36 | // tcr4 37 | MATCH 38 | (n1:Account)-[:transfer]-> 39 | (n2:Account)-[:transfer]-> 40 | (n3:Account)-[:transfer]->(n4:Account) 41 | WHERE 42 | n1.id = n4.id AND n1.id > n2.id AND n2.id > n3.id 43 | WITH 44 | n1.id as n1id, 45 | n2.id as n2id, 46 | n3.id as n3id, 47 | n4.id as n4id 48 | LIMIT 1000 49 | RETURN DISTINCT toString(n1id)+"|"+toString(n2id) 50 | 51 | // tcr10 52 | MATCH 53 | (c:Company)<-[:invest]-(p:Person) 54 | WITH 55 | c.id as cid, 56 | count(p.id) as num, 57 | collect(p.id) as person 58 | WHERE num >= 2 59 | RETURN 60 | tostring(person[0])+"|"+tostring(person[1]) 61 | LIMIT 1000 62 | ``` 63 | -------------------------------------------------------------------------------- /tools/check_consistency.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | import sys 4 | import glob 5 | 6 | print_templ = "| {} | {} | {} | {} |" 7 | 8 | 9 | def get_md5_list(subdir, dir): 10 | md5_list = [] 11 | csvs 
= glob.glob("{}/{}/*.csv".format(dir, subdir)) 12 | for csv in csvs: 13 | with open(csv, "rb") as f: 14 | md5_list.append(hashlib.md5(f.read()).hexdigest()) 15 | return sorted(md5_list) 16 | 17 | 18 | def check_multiple_files(subdir, dir1, dir2): 19 | dir1_list = get_md5_list(subdir, dir1) 20 | dir2_list = get_md5_list(subdir, dir2) 21 | return dir1_list == dir2_list 22 | 23 | 24 | def check_consistency(dir1, dir2): 25 | subdirs1 = [d for d in os.listdir(dir1) if os.path.isdir(os.path.join(dir1, d))] 26 | subdirs2 = [d for d in os.listdir(dir2) if os.path.isdir(os.path.join(dir2, d))] 27 | common_subdirs = set(subdirs1) & set(subdirs2) 28 | 29 | headers = ["Subdir", "Dir1", "Dir2", "Consistency"] 30 | max_len0 = max(max([len(d) for d in common_subdirs]), len(headers[0])) 31 | max_len1 = max(len(dir1), len(headers[1])) 32 | max_len2 = max(len(dir2), len(headers[2])) 33 | max_len3 = max( 34 | len("same"), 35 | len("different"), 36 | len("skipped for more than one file"), 37 | len(headers[3]), 38 | ) 39 | 40 | def align_print(col0: str, col1: str, col2: str, col3: str): 41 | print( 42 | print_templ.format( 43 | col0.center(max_len0), 44 | col1.center(max_len1), 45 | col2.center(max_len2), 46 | col3.center(max_len3), 47 | ) 48 | ) 49 | 50 | align_print(headers[0], headers[1], headers[2], headers[3]) 51 | for subdir in sorted(common_subdirs): 52 | if check_multiple_files(subdir, dir1, dir2): 53 | align_print(subdir, dir1, dir2, "same") 54 | else: 55 | align_print(subdir, dir1, dir2, "different") 56 | 57 | 58 | if __name__ == "__main__": 59 | dir1 = sys.argv[1] 60 | dir2 = sys.argv[2] 61 | check_consistency(dir1, dir2) 62 | -------------------------------------------------------------------------------- /tools/check_deletion.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import sys 3 | import os.path 4 | 5 | from pyspark.sql import SparkSession 6 | 7 | spark = 
SparkSession.builder.appName("check_time").getOrCreate() 8 | 9 | subdirs = [ 10 | "account", 11 | "companyOwnAccount", 12 | "withdraw", 13 | "deposit", 14 | "loantransfer", 15 | "personOwnAccount", 16 | "signIn", 17 | "repay", 18 | "transfer", 19 | ] 20 | 21 | 22 | def read_data(path): 23 | dataframes = [ 24 | spark.read.option("delimiter", "|").csv(csv, header=True, inferSchema=True) 25 | for csv in glob.glob(path) 26 | ] 27 | allTransfer = dataframes[0] 28 | for idx, dataframe in enumerate(dataframes): 29 | if idx == 0: 30 | continue 31 | allTransfer = allTransfer.union(dataframe) 32 | return allTransfer 33 | 34 | 35 | if __name__ == "__main__": 36 | prefix = sys.argv[1] 37 | for subdir in subdirs: 38 | print("Checking {} if deletion before creation......".format(subdir)) 39 | if not os.path.exists(os.path.join(prefix, subdir)): 40 | print("No {} data exists!\n".format(subdir)) 41 | continue 42 | data = read_data(os.path.join(prefix, subdir, "*.csv")) 43 | wrong = data.filter(data["createTime"] >= data["deleteTime"]) 44 | if wrong.count() > 0: 45 | print( 46 | "{} invalid! 
Having {} rows with wrong time\n".format( 47 | subdir, wrong.count() 48 | ) 49 | ) 50 | wrong.show(3) 51 | else: 52 | print("{} passed.\n".format(subdir)) 53 | -------------------------------------------------------------------------------- /tools/check_duplicate.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | import glob 3 | import sys 4 | import os 5 | 6 | spark = SparkSession.builder.appName("check_dup").getOrCreate() 7 | 8 | 9 | def check_dup(subdir, key): 10 | datas = [] 11 | for csv in glob.glob(subdir + "/*.csv"): 12 | datas.append( 13 | spark.read.option("delimiter", "|").csv(csv, header=True, inferSchema=True) 14 | ) 15 | 16 | merged = datas[0] 17 | for df in datas[1:]: 18 | merged = merged.unionAll(df) 19 | 20 | dups = merged.groupBy(key).count().filter("count > 1") 21 | print( 22 | "{}: Total rows: {}, duplicated {}".format(subdir, merged.count(), dups.count()) 23 | ) 24 | dups.show(5) 25 | 26 | 27 | if __name__ == "__main__": 28 | prefix = sys.argv[1] 29 | check_dup(os.path.join(prefix, "account"), "id") 30 | check_dup(os.path.join(prefix, "company"), "id") 31 | check_dup(os.path.join(prefix, "person"), "id") 32 | check_dup(os.path.join(prefix, "medium"), "id") 33 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/.gitignore: -------------------------------------------------------------------------------- 1 | profiler_standalone 2 | build -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.2) 2 | project(Profiler C CXX) 3 | 4 | set(TUGRAPH_HOME /home/qsp/project/tugraph-db/) 5 | 6 | add_executable( 7 | profiler_standalone 8 | profiler.cpp 9 | de_core.cpp 10 | wcc_core.cpp 11 | ${TUGRAPH_HOME}/src/lgraph_api/olap_base.cpp 12 | 
${TUGRAPH_HOME}/src/lgraph_api/lgraph_utils.cpp 13 | ${TUGRAPH_HOME}/src/lgraph_api/olap_profile.cpp) 14 | target_link_libraries(profiler_standalone -static-libstdc++ libstdc++fs.a 15 | libgomp.a pthread dl) 16 | target_include_directories( 17 | profiler_standalone PUBLIC ${TUGRAPH_HOME}/deps/fma-common/ 18 | ${TUGRAPH_HOME}/include ${TUGRAPH_HOME}/src) 19 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/README.md: -------------------------------------------------------------------------------- 1 | # About the data profiler 2 | 3 | This tool as a Data Profiler is developed to profile the data distribution. 4 | It is developed based on [TuGraph][1], an open-source high performance graph database contributed by Ant Group 5 | Co., Ltd. 6 | 7 | The profiling of these metrics is currently supported: 8 | - Count of V(vertices) and E(edges) 9 | - Ratio of E over V 10 | - Edge multiplicity 11 | - In-degree and out-degree distribution including the percentiles 12 | - WCC and Diameter results 13 | 14 | And these features in visualization are supported: 15 | - plot the PowerLaw Distribution of degree 16 | - PowerLaw Distribution Regression 17 | 18 | # How to use 19 | 20 | ## Profile 21 | In order to compile this tool, you need to first pull TuGraph to local and set the TUGRAPH_HOME environment variable in 22 | `CMakeLists.txt` or `compile.sh` to the repository. See `CMakeLists.txt` and `compile.sh` for 23 | more details. 24 | 25 | ## Plot 26 | See `plot.py` for details. 27 | 28 | [1]: https://github.com/TuGraph-db/tugraph-db/ -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/algo.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022 AntGroup. All Rights Reserved. 
*/ 2 | 3 | #pragma once 4 | 5 | #include 6 | #include "lgraph/olap_base.h" 7 | 8 | using namespace lgraph_api; 9 | using namespace lgraph_api::olap; 10 | 11 | /** 12 | * @brief Compute the Dimension Estimation algorithm. 13 | * 14 | * @param[in] graph The graph to compute on. 15 | * @param[in] roots The root vertex id to start de from. 16 | * 17 | * @return return dimension of graph. 18 | */ 19 | size_t DECore(OlapBase& graph, std::set& roots); 20 | 21 | /** 22 | * \brief Compute the weakly connected components. 23 | * 24 | * \param graph The graph to compute on, should be an *undirected* graph. 25 | * \param [in,out] label the ParallelVector to store wcc_label. 26 | */ 27 | void WCCCore(OlapBase& graph, ParallelVector& label); 28 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TUGRAPH_HOME=/home/qsp/project/tugraph-db/ 4 | 5 | g++ -fno-gnu-unique -fPIC -g --std=c++14 \ 6 | -I${TUGRAPH_HOME}/include \ 7 | -I${TUGRAPH_HOME}/src \ 8 | -I${TUGRAPH_HOME}/deps/fma-common \ 9 | -rdynamic -O3 -fopenmp -DNDEBUG \ 10 | -o profiler_standalone \ 11 | stat.cpp "${TUGRAPH_HOME}/build/output/liblgraph.so" -lrt -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/de_core.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022 AntGroup. All Rights Reserved. 
*/ 2 | 3 | #include "lgraph/olap_base.h" 4 | #include "./algo.h" 5 | 6 | using namespace lgraph_api; 7 | using namespace lgraph_api::olap; 8 | 9 | size_t DECore(OlapBase & graph, std::set& roots) { 10 | size_t vertices = graph.NumVertices(); 11 | auto active_in = graph.AllocVertexSubset(); 12 | auto active_out = graph.AllocVertexSubset(); 13 | auto diameter = graph.AllocVertexArray(); 14 | auto curr = graph.AllocVertexArray(); 15 | auto next = graph.AllocVertexArray(); 16 | auto vst = graph.AllocVertexArray(); 17 | 18 | active_in.Fill(); 19 | graph.ProcessVertexActive( 20 | [&](size_t vtx) { 21 | diameter[vtx] = 0; 22 | curr[vtx] = 0; 23 | next[vtx] = 0; 24 | vst[vtx] = 0; 25 | return 0; 26 | }, 27 | active_in); 28 | assert(roots.size() <= 64); 29 | active_in.Clear(); 30 | uint64_t full = 0; 31 | int k = 0; 32 | for (auto vtx : roots) { 33 | curr[vtx] |= (1ul << k); 34 | vst[vtx] |= (1ul << k); 35 | full |= (1ul << k); 36 | diameter[vtx] = 0; 37 | active_in.Add(vtx); 38 | k++; 39 | } 40 | size_t active_vertices = roots.size(); 41 | 42 | size_t i_i = 0; 43 | while (active_vertices > 0) { 44 | i_i++; 45 | active_out.Clear(); 46 | active_vertices = graph.ProcessVertexActive( 47 | [&](size_t src) { 48 | size_t activated = 0; 49 | for (auto edge : graph.OutEdges(src)) { 50 | size_t dst = edge.neighbour; 51 | if (vst[dst] != full) { 52 | graph.AcquireVertexLock(dst); 53 | next[dst] |= curr[src]; 54 | vst[dst] |= curr[src]; 55 | if (diameter[dst] != i_i) { 56 | diameter[dst] = i_i; 57 | active_out.Add(dst); 58 | activated++; 59 | } 60 | graph.ReleaseVertexLock(dst); 61 | } 62 | } 63 | return activated; 64 | }, 65 | active_in); 66 | active_in.Swap(active_out); 67 | curr.Swap(next); 68 | } 69 | 70 | roots.clear(); 71 | size_t max_diameter = 0; 72 | for (size_t vtx = 0; vtx < vertices; vtx++) { 73 | if (diameter[vtx] > max_diameter) { 74 | max_diameter = diameter[vtx]; 75 | roots.clear(); 76 | } 77 | if (diameter[vtx] == max_diameter && roots.size() < 64) { 78 | 
roots.insert(vtx); 79 | } 80 | } 81 | return max_diameter; 82 | } 83 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db139/edges.txt: -------------------------------------------------------------------------------- 1 | V 192957470, E 364134424, E/V 1.88712 2 | Unique edges: 2.07034e+08 / 364134424, Multiplicity: 1.75881 3 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db139/in-out.txt: -------------------------------------------------------------------------------- 1 | DEGREE TOP 100 (2473301,3200099) (1783222,1209419) (1778827,1115677) (417996,1089786) (417883,873399) (416456,862247) (416093,859954) (376752,760379) (363829,751146) (341055,645088) (340476,641870) (339543,603601) (327973,591788) (321914,567589) (246318,514367) (239969,496956) (234698,465880) (233255,439897) (232308,415132) (231423,414446) (230364,340349) (213580,315555) (200366,314544) (195950,305778) (188032,284154) (171577,274382) (168350,254939) (168065,250598) (161465,246216) (158669,240788) (151784,229971) (151696,223614) (150801,219798) (150114,218040) (147190,214775) (141866,211877) (141180,210660) (132081,204953) (130586,203001) (129405,197944) (126987,193862) (125396,188107) (119588,182274) (116911,182266) (108654,180893) (108293,180738) (107732,174453) (106525,169600) (106032,168351) (104989,168317) (104042,166833) (102056,165385) (101565,157632) (101172,156496) (99948,153538) (99908,149475) (99899,148876) (98522,146576) (96846,138774) (96553,133449) (95932,132374) (95518,131027) (91272,130293) (86434,126792) (85669,124702) (84085,122362) (83575,118131) (82737,117889) (82705,117492) (82333,115296) (81683,113106) (80999,112695) (80489,112460) (80179,111925) (79468,110544) (79155,110432) (78675,109815) (78200,107059) (78011,105027) (77708,104274) (77445,104030) (77336,102202) (77225,102008) (73789,101867) (72939,101760) (72318,101721) (71929,98774) 
(71650,98405) (70485,98008) (70017,97270) (69960,94013) (69825,93670) (68818,92162) (67873,91379) (67718,90903) (67480,89878) (66867,87863) (66818,84477) (66405,84039) (65966,82336) 2 | DEGREE PEC 100 2473301 13 8 6 5 4 4 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db139/in_degree_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db139/in_degree_dist.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db139/in_degree_dist_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db139/in_degree_dist_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db139/in_degree_dist_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 109539041.82131267 3 | beta: -2.319428121087157 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db139/out-in.txt: -------------------------------------------------------------------------------- 1 | DEGREE TOP 100 (3200099,2473301) (1209419,1783222) (1115677,1778827) (1089786,417996) (873399,417883) (862247,416456) (859954,416093) (760379,376752) (751146,363829) (645088,341055) (641870,340476) (603601,339543) (591788,327973) (567589,321914) 
(514367,246318) (496956,239969) (465880,234698) (439897,233255) (415132,232308) (414446,231423) (340349,230364) (315555,213580) (314544,200366) (305778,195950) (284154,188032) (274382,171577) (254939,168350) (250598,168065) (246216,161465) (240788,158669) (229971,151784) (223614,151696) (219798,150801) (218040,150114) (214775,147190) (211877,141866) (210660,141180) (204953,132081) (203001,130586) (197944,129405) (193862,126987) (188107,125396) (182274,119588) (182266,116911) (180893,108654) (180738,108293) (174453,107732) (169600,106525) (168351,106032) (168317,104989) (166833,104042) (165385,102056) (157632,101565) (156496,101172) (153538,99948) (149475,99908) (148876,99899) (146576,98522) (138774,96846) (133449,96553) (132374,95932) (131027,95518) (130293,91272) (126792,86434) (124702,85669) (122362,84085) (118131,83575) (117889,82737) (117492,82705) (115296,82333) (113106,81683) (112695,80999) (112460,80489) (111925,80179) (110544,79468) (110432,79155) (109815,78675) (107059,78200) (105027,78011) (104274,77708) (104030,77445) (102202,77336) (102008,77225) (101867,73789) (101760,72939) (101721,72318) (98774,71929) (98405,71650) (98008,70485) (97270,70017) (94013,69960) (93670,69825) (92162,68818) (91379,67873) (90903,67718) (89878,67480) (87863,66867) (84477,66818) (84039,66405) (82336,65966) 2 | DEGREE PEC 100 3200099 33 17 10 6 4 3 3 2 2 2 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db139/out_degree_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db139/out_degree_dist.png 
-------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db139/out_degree_dist_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db139/out_degree_dist_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db139/out_degree_dist_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 20186572.91449603 3 | beta: -1.7197963259066844 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db177/in-out.txt: -------------------------------------------------------------------------------- 1 | DEGREE TOP 100 (2482180,149885218) (1809340,3717832) (1791730,3049232) (422108,1529335) (418298,1358574) (416446,830233) (413525,584009) (377450,552680) (366669,425032) (344063,424147) (342589,413577) (341930,403508) (324830,366893) (323695,337105) (245819,334096) (236200,310579) (236110,274086) (235741,263344) (235192,249659) (233683,246903) (231212,236568) (212737,226417) (200788,217820) (199005,215850) (197283,211030) (175922,205471) (157368,196111) (153269,192106) (152864,185634) (151324,185391) (149506,184095) (141611,182666) (140400,180840) (139353,177872) (136607,176061) (133260,173571) (130547,166558) (129518,154831) (126796,153383) (126188,151620) (116546,151526) (115982,149563) (110005,144547) (109782,143036) (108152,140946) (106538,138909) (106100,138739) (106057,136393) (104552,136387) (101608,133203) (101288,130774) (101130,129113) (100918,128466) (100806,123092) (100663,122190) (99460,120862) (98671,118445) (96911,117918) (95514,115389) (94966,113092) (91107,108581) (90494,106834) (83910,106782) (83766,103335) 
(83741,102309) (83218,101037) (82754,99585) (82732,98936) (81396,97097) (80429,96245) (79373,93535) (79343,92457) (78541,90238) (78206,89948) (78126,88752) (77809,88735) (77388,87973) (77185,86416) (77031,85444) (76909,80630) (74745,79470) (72674,78478) (71937,77435) (71154,76548) (70638,74983) (70208,74799) (70088,74586) (69684,73839) (69496,73679) (67818,73298) (66740,72493) (66639,70731) (66149,69641) (65819,69549) (65703,69129) (65544,68903) (64300,68123) (63466,68048) (62935,67618) (62858,66595) 2 | DEGREE PEC 100 2482180 15 10 8 6 5 5 4 4 4 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db177/in_degree_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db177/in_degree_dist.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db177/in_degree_dist_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db177/in_degree_dist_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db177/in_degree_dist_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 133908623.8869632 3 | beta: -2.0848641590246326 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db177/out-in.txt: 
-------------------------------------------------------------------------------- 1 | DEGREE TOP 100 (149885218,2482180) (3717832,1809340) (3049232,1791730) (1529335,422108) (1358574,418298) (830233,416446) (584009,413525) (552680,377450) (425032,366669) (424147,344063) (413577,342589) (403508,341930) (366893,324830) (337105,323695) (334096,245819) (310579,236200) (274086,236110) (263344,235741) (249659,235192) (246903,233683) (236568,231212) (226417,212737) (217820,200788) (215850,199005) (211030,197283) (205471,175922) (196111,157368) (192106,153269) (185634,152864) (185391,151324) (184095,149506) (182666,141611) (180840,140400) (177872,139353) (176061,136607) (173571,133260) (166558,130547) (154831,129518) (153383,126796) (151620,126188) (151526,116546) (149563,115982) (144547,110005) (143036,109782) (140946,108152) (138909,106538) (138739,106100) (136393,106057) (136387,104552) (133203,101608) (130774,101288) (129113,101130) (128466,100918) (123092,100806) (122190,100663) (120862,99460) (118445,98671) (117918,96911) (115389,95514) (113092,94966) (108581,91107) (106834,90494) (106782,83910) (103335,83766) (102309,83741) (101037,83218) (99585,82754) (98936,82732) (97097,81396) (96245,80429) (93535,79373) (92457,79343) (90238,78541) (89948,78206) (88752,78126) (88735,77809) (87973,77388) (86416,77185) (85444,77031) (80630,76909) (79470,74745) (78478,72674) (77435,71937) (76548,71154) (74983,70638) (74799,70208) (74586,70088) (73839,69684) (73679,69496) (73298,67818) (72493,66740) (70731,66639) (69641,66149) (69549,65819) (69129,65703) (68903,65544) (68123,64300) (68048,63466) (67618,62935) (66595,62858) 2 | DEGREE PEC 100 149885218 27 13 7 4 3 3 2 2 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | -------------------------------------------------------------------------------- 
/tools/legacy/dataprofiler/result/db177/out_degree_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db177/out_degree_dist.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db177/out_degree_dist_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db177/out_degree_dist_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db177/out_degree_dist_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 20194472.85465291 3 | beta: -1.719773728669686 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db184/edges.txt: -------------------------------------------------------------------------------- 1 | V 137733288, E 259405367, E/V 1.88339 2 | Unique edges: 1.45251e+08 / 259405367, Multiplicity: 1.78591 3 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db184/in-out.txt: -------------------------------------------------------------------------------- 1 | DEGREE TOP 100 (1732600,7654441) (1267639,2377914) (1259973,1936613) (828900,1000007) (308786,953128) (296263,952993) (294417,562262) (292892,558103) (292377,511335) (268469,505433) (264011,376547) (240173,351256) (239469,308577) (239292,296902) (230500,292156) (228465,271482) (179536,264649) (166358,259273) (165705,248764) (163129,246845) (163113,184887) (161470,177484) (161333,172270) (149807,170135) 
(140637,167465) (139188,167224) (134272,167043) (133613,164247) (118217,155939) (118020,154629) (111607,153845) (107757,139188) (107671,137904) (107176,134001) (106575,132345) (103729,128671) (97706,126467) (96547,124197) (91013,122166) (90751,117051) (89611,115487) (88550,110171) (87872,107737) (82879,107549) (76867,106927) (76248,105792) (75394,105291) (74913,101305) (74607,98705) (73408,98230) (72142,97844) (71775,96894) (71687,96078) (70989,95640) (70983,94051) (70841,93189) (70133,93163) (69700,91733) (69575,88850) (69207,88689) (67859,87419) (66985,86846) (66082,85448) (64750,85290) (63306,84319) (59816,82857) (59792,82839) (58865,82625) (58732,82198) (57672,78375) (57579,78117) (57466,75798) (57177,75521) (57027,74762) (56474,73389) (55223,73325) (54991,73280) (54932,72705) (54672,70911) (54451,69241) (54281,68911) (53939,68489) (53687,66771) (52786,66404) (50886,64445) (50578,63764) (50276,63105) (49802,62590) (49751,56216) (49068,55041) (48730,54528) (48451,54278) (47581,53784) (47303,52740) (47167,52705) (46741,52576) (46696,51466) (46690,51266) (46033,50903) (45987,50816) 2 | DEGREE PEC 100 1732600 13 8 6 5 4 4 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db184/in_degree_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db184/in_degree_dist.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db184/in_degree_dist_regression.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db184/in_degree_dist_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db184/in_degree_dist_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 78379700.03758368 3 | beta: -2.318714660323696 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db184/out-in.txt: -------------------------------------------------------------------------------- 1 | DEGREE TOP 100 (7654441,1732600) (2377914,1267639) (1936613,1259973) (1000007,828900) (953128,308786) (952993,296263) (562262,294417) (558103,292892) (511335,292377) (505433,268469) (376547,264011) (351256,240173) (308577,239469) (296902,239292) (292156,230500) (271482,228465) (264649,179536) (259273,166358) (248764,165705) (246845,163129) (184887,163113) (177484,161470) (172270,161333) (170135,149807) (167465,140637) (167224,139188) (167043,134272) (164247,133613) (155939,118217) (154629,118020) (153845,111607) (139188,107757) (137904,107671) (134001,107176) (132345,106575) (128671,103729) (126467,97706) (124197,96547) (122166,91013) (117051,90751) (115487,89611) (110171,88550) (107737,87872) (107549,82879) (106927,76867) (105792,76248) (105291,75394) (101305,74913) (98705,74607) (98230,73408) (97844,72142) (96894,71775) (96078,71687) (95640,70989) (94051,70983) (93189,70841) (93163,70133) (91733,69700) (88850,69575) (88689,69207) (87419,67859) (86846,66985) (85448,66082) (85290,64750) (84319,63306) (82857,59816) (82839,59792) (82625,58865) (82198,58732) (78375,57672) (78117,57579) (75798,57466) (75521,57177) (74762,57027) (73389,56474) (73325,55223) (73280,54991) (72705,54932) (70911,54672) (69241,54451) (68911,54281) (68489,53939) (66771,53687) (66404,52786) 
(64445,50886) (63764,50578) (63105,50276) (62590,49802) (56216,49751) (55041,49068) (54528,48730) (54278,48451) (53784,47581) (52740,47303) (52705,47167) (52576,46741) (51466,46696) (51266,46690) (50903,46033) (50816,45987) 2 | DEGREE PEC 100 7654441 33 17 10 6 4 3 3 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db184/out_degree_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db184/out_degree_dist.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db184/out_degree_dist_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/db184/out_degree_dist_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/db184/out_degree_dist_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 14153912.685932105 3 | beta: -1.7194378338583043 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1.png 
-------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_1_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 223563.07183870228 3 | beta: -1.334418448216897 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_2_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 220373.12764021207 3 | beta: -1.3299838405402395 4 | -------------------------------------------------------------------------------- 
/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3.txt: -------------------------------------------------------------------------------- 1 | 1,221327 2 | 2,95621 3 | 3,56057 4 | 4,37348 5 | 5,27334 6 | 6,20686 7 | 7,16224 8 | 8,12973 9 | 9,10801 10 | 10,8827 11 | 11,7600 12 | 12,6480 13 | 13,5467 14 | 14,4809 15 | 15,4158 16 | 16,3683 17 | 17,3153 18 | 18,2779 19 | 19,2503 20 | 20,2318 21 | 21,2055 22 | 22,1778 23 | 23,1617 24 | 24,1417 25 | 25,1350 26 | 26,1192 27 | 27,1084 28 | 28,1032 29 | 29,927 30 | 30,838 31 | 31,743 32 | 32,706 33 | 33,652 34 | 34,633 35 | 35,591 36 | 36,480 37 | 37,473 38 | 38,478 39 | 39,415 40 | 40,420 41 | 41,366 42 | 42,312 43 | 43,333 44 | 44,330 45 | 45,292 46 | 46,255 47 | 47,234 48 | 48,226 49 | 49,212 50 | 50,185 51 | 51,195 52 | 52,213 53 | 53,181 54 | 54,149 55 | 55,136 56 | 56,142 57 | 57,131 58 | 58,151 59 | 59,111 60 | 60,124 61 | 61,117 62 | 62,117 63 | 63,91 64 | 64,92 65 | 65,100 66 | 66,89 67 | 67,70 68 | 68,73 69 | 69,64 70 | 70,78 71 | 71,67 72 | 72,57 73 | 73,73 74 | 74,66 75 | 75,68 76 | 76,46 77 | 77,53 78 | 78,43 79 | 79,37 80 | 80,35 81 | 81,53 82 | 82,39 83 | 83,43 84 | 84,33 85 | 85,29 86 | 86,38 87 | 87,37 88 | 88,32 89 | 89,31 90 | 90,46 91 | 91,38 92 | 92,22 93 | 93,23 94 | 94,23 95 | 95,25 96 | 96,33 97 | 97,20 98 | 98,27 99 | 99,21 100 | 100,22 101 | 101,21 102 | 102,17 103 | 103,19 104 | 104,19 105 | 105,14 106 | 106,21 107 | 107,11 108 | 108,22 109 | 109,19 110 | 110,7 111 | 111,12 112 | 112,18 113 | 113,19 114 | 114,14 115 | 115,12 116 | 116,15 117 | 117,16 118 | 118,15 119 | 119,12 120 | 120,17 121 | 
121,16 122 | 122,9 123 | 123,8 124 | 124,4 125 | 125,9 126 | 126,11 127 | 127,9 128 | 128,12 129 | 129,14 130 | 130,14 131 | 131,13 132 | 132,15 133 | 133,6 134 | 134,8 135 | 135,5 136 | 136,8 137 | 137,9 138 | 138,6 139 | 139,7 140 | 140,6 141 | 141,8 142 | 142,3 143 | 143,4 144 | 144,5 145 | 145,8 146 | 146,5 147 | 147,5 148 | 148,4 149 | 149,6 150 | 150,8 151 | 151,5 152 | 152,10 153 | 153,7 154 | 154,6 155 | 155,3 156 | 156,5 157 | 157,6 158 | 158,3 159 | 159,4 160 | 160,6 161 | 161,3 162 | 162,5 163 | 163,1 164 | 164,4 165 | 165,2 166 | 166,7 167 | 167,8 168 | 168,5 169 | 169,3 170 | 170,3 171 | 171,4 172 | 172,5 173 | 173,1 174 | 174,2 175 | 175,2 176 | 176,2 177 | 177,2 178 | 178,6 179 | 179,3 180 | 180,2 181 | 181,3 182 | 182,1 183 | 183,4 184 | 184,4 185 | 185,1 186 | 186,2 187 | 187,4 188 | 188,4 189 | 189,2 190 | 190,3 191 | 191,2 192 | 192,1 193 | 193,2 194 | 194,2 195 | 195,2 196 | 196,1 197 | 197,2 198 | 198,1 199 | 200,1 200 | 201,2 201 | 202,2 202 | 204,2 203 | 205,3 204 | 206,4 205 | 207,1 206 | 208,3 207 | 209,1 208 | 210,2 209 | 212,3 210 | 214,2 211 | 216,1 212 | 218,2 213 | 220,2 214 | 221,2 215 | 222,3 216 | 223,1 217 | 224,2 218 | 225,1 219 | 230,2 220 | 231,4 221 | 232,1 222 | 234,1 223 | 236,2 224 | 237,1 225 | 238,1 226 | 239,1 227 | 241,1 228 | 244,1 229 | 245,2 230 | 246,4 231 | 247,1 232 | 248,1 233 | 251,1 234 | 253,3 235 | 254,3 236 | 255,2 237 | 257,1 238 | 258,2 239 | 259,1 240 | 260,1 241 | 263,1 242 | 264,2 243 | 265,1 244 | 266,3 245 | 268,1 246 | 269,1 247 | 273,2 248 | 286,2 249 | 290,1 250 | 292,2 251 | 294,2 252 | 299,1 253 | 300,1 254 | 301,1 255 | 308,1 256 | 313,1 257 | 316,2 258 | 319,2 259 | 332,1 260 | 333,1 261 | 334,1 262 | 337,1 263 | 346,1 264 | 356,1 265 | 361,1 266 | 362,2 267 | 366,1 268 | 369,1 269 | 370,1 270 | 376,1 271 | 383,1 272 | 387,1 273 | 406,1 274 | 418,1 275 | 427,1 276 | 429,1 277 | 430,1 278 | 434,1 279 | 439,1 280 | 453,1 281 | 473,1 282 | 496,1 283 | 497,1 284 | 509,1 285 | 514,1 286 | 522,1 287 | 
524,1 288 | 529,1 289 | 531,1 290 | 541,1 291 | 554,1 292 | 667,1 293 | 727,1 294 | 731,1 295 | 762,1 296 | 847,1 297 | 866,1 298 | 931,1 299 | 1236,1 300 | 1555,1 301 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_3_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 224369.62351644383 3 | beta: -1.3383227574196173 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4.txt: -------------------------------------------------------------------------------- 1 | 1,129328 2 | 2,56359 3 | 3,33288 4 | 4,22557 5 | 5,16138 6 | 6,12218 7 | 7,9438 8 | 8,7807 9 | 9,6411 10 | 10,5208 11 | 11,4376 12 | 12,3810 13 | 13,3189 14 | 14,2830 15 | 15,2393 16 | 16,2153 17 | 17,1870 18 | 18,1639 19 | 19,1431 20 | 20,1293 21 | 21,1166 22 | 22,1084 23 | 23,918 24 | 24,871 25 | 25,775 26 | 26,757 27 | 27,633 28 | 28,575 29 | 29,539 30 | 30,525 31 | 31,458 32 | 32,408 33 | 33,398 34 | 34,354 35 | 35,326 36 | 36,300 37 | 
37,304 38 | 38,257 39 | 39,230 40 | 40,241 41 | 41,224 42 | 42,199 43 | 43,198 44 | 44,165 45 | 45,158 46 | 46,153 47 | 47,120 48 | 48,156 49 | 49,125 50 | 50,134 51 | 51,115 52 | 52,114 53 | 53,94 54 | 54,88 55 | 55,85 56 | 56,77 57 | 57,94 58 | 58,85 59 | 59,84 60 | 60,57 61 | 61,61 62 | 62,57 63 | 63,54 64 | 64,51 65 | 65,46 66 | 66,51 67 | 67,44 68 | 68,40 69 | 69,32 70 | 70,35 71 | 71,41 72 | 72,35 73 | 73,31 74 | 74,40 75 | 75,42 76 | 76,33 77 | 77,40 78 | 78,28 79 | 79,33 80 | 80,34 81 | 81,25 82 | 82,25 83 | 83,32 84 | 84,20 85 | 85,14 86 | 86,18 87 | 87,24 88 | 88,11 89 | 89,15 90 | 90,24 91 | 91,21 92 | 92,13 93 | 93,14 94 | 94,16 95 | 95,16 96 | 96,15 97 | 97,17 98 | 98,18 99 | 99,16 100 | 100,19 101 | 101,10 102 | 102,10 103 | 103,6 104 | 104,14 105 | 105,12 106 | 106,8 107 | 107,8 108 | 108,8 109 | 109,14 110 | 110,4 111 | 111,4 112 | 112,8 113 | 113,13 114 | 114,8 115 | 115,4 116 | 116,6 117 | 117,13 118 | 118,7 119 | 119,9 120 | 120,10 121 | 121,6 122 | 122,7 123 | 123,5 124 | 124,7 125 | 125,6 126 | 126,3 127 | 127,10 128 | 128,7 129 | 129,8 130 | 130,5 131 | 131,5 132 | 132,11 133 | 133,1 134 | 134,5 135 | 135,1 136 | 136,4 137 | 137,6 138 | 138,1 139 | 139,7 140 | 140,8 141 | 141,4 142 | 142,6 143 | 143,4 144 | 144,9 145 | 145,6 146 | 146,3 147 | 147,2 148 | 148,3 149 | 149,4 150 | 150,3 151 | 151,4 152 | 152,2 153 | 153,4 154 | 154,2 155 | 155,2 156 | 156,4 157 | 157,2 158 | 159,2 159 | 160,5 160 | 161,3 161 | 162,4 162 | 163,1 163 | 164,2 164 | 165,1 165 | 166,1 166 | 167,2 167 | 168,3 168 | 169,3 169 | 170,3 170 | 171,2 171 | 172,5 172 | 173,2 173 | 174,1 174 | 175,3 175 | 176,3 176 | 177,2 177 | 178,1 178 | 179,2 179 | 180,1 180 | 181,3 181 | 182,1 182 | 183,2 183 | 184,1 184 | 185,1 185 | 186,2 186 | 187,1 187 | 188,1 188 | 190,2 189 | 191,2 190 | 192,2 191 | 193,2 192 | 194,1 193 | 195,1 194 | 197,1 195 | 198,1 196 | 200,1 197 | 201,1 198 | 202,2 199 | 204,2 200 | 205,1 201 | 207,4 202 | 208,3 203 | 211,3 204 | 213,2 205 | 214,2 206 | 215,1 
207 | 217,2 208 | 219,2 209 | 221,1 210 | 224,1 211 | 227,1 212 | 228,2 213 | 229,1 214 | 232,1 215 | 234,1 216 | 236,2 217 | 237,1 218 | 238,3 219 | 239,2 220 | 240,1 221 | 241,1 222 | 243,1 223 | 244,1 224 | 252,1 225 | 255,1 226 | 256,1 227 | 257,1 228 | 258,1 229 | 259,1 230 | 260,1 231 | 267,1 232 | 270,1 233 | 274,1 234 | 275,1 235 | 279,1 236 | 283,1 237 | 284,1 238 | 290,2 239 | 294,1 240 | 296,1 241 | 297,2 242 | 302,1 243 | 303,3 244 | 304,2 245 | 305,3 246 | 309,1 247 | 316,1 248 | 322,1 249 | 338,1 250 | 344,2 251 | 352,1 252 | 354,1 253 | 364,1 254 | 368,1 255 | 371,1 256 | 372,1 257 | 373,1 258 | 383,1 259 | 401,1 260 | 407,1 261 | 409,2 262 | 426,1 263 | 443,1 264 | 444,1 265 | 449,1 266 | 452,1 267 | 516,1 268 | 521,1 269 | 537,1 270 | 559,1 271 | 592,1 272 | 756,1 273 | 934,1 274 | 1687,1 275 | 2119,1 276 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_4_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 131228.70692235493 3 | beta: -1.3302376665752749 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5.png 
-------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5.txt: -------------------------------------------------------------------------------- 1 | 1,127403 2 | 2,56508 3 | 3,32643 4 | 4,22014 5 | 5,16041 6 | 6,12262 7 | 7,9381 8 | 8,7697 9 | 9,6185 10 | 10,5200 11 | 11,4458 12 | 12,3791 13 | 13,3188 14 | 14,2793 15 | 15,2439 16 | 16,2138 17 | 17,1873 18 | 18,1659 19 | 19,1546 20 | 20,1334 21 | 21,1234 22 | 22,1089 23 | 23,921 24 | 24,870 25 | 25,824 26 | 26,748 27 | 27,663 28 | 28,623 29 | 29,564 30 | 30,531 31 | 31,454 32 | 32,405 33 | 33,372 34 | 34,379 35 | 35,335 36 | 36,279 37 | 37,248 38 | 38,237 39 | 39,244 40 | 40,243 41 | 41,203 42 | 42,230 43 | 43,192 44 | 44,196 45 | 45,172 46 | 46,165 47 | 47,169 48 | 48,126 49 | 49,127 50 | 50,122 51 | 51,133 52 | 52,97 53 | 53,97 54 | 54,81 55 | 55,90 56 | 56,98 57 | 57,80 58 | 58,59 59 | 59,72 60 | 60,69 61 | 61,68 62 | 62,57 63 | 63,60 64 | 64,48 65 | 65,57 66 | 66,58 67 | 67,48 68 | 68,48 69 | 69,43 70 | 70,39 71 | 71,29 72 | 72,45 73 | 73,28 74 | 74,38 75 | 75,49 76 | 76,37 77 | 77,36 78 | 78,25 79 | 79,29 80 | 80,32 81 | 81,28 82 | 82,25 83 | 83,30 84 | 84,26 85 | 85,19 86 | 86,21 87 | 87,20 88 | 88,21 89 | 89,28 90 | 90,28 91 | 91,18 92 | 92,17 93 | 93,14 94 | 94,12 95 | 95,11 96 | 96,19 97 | 97,10 98 | 98,15 99 | 99,12 100 | 100,18 101 | 101,13 102 | 102,23 103 | 103,17 104 | 104,9 105 | 105,11 106 | 106,12 107 | 107,10 108 | 108,10 109 | 109,17 110 | 110,11 111 | 111,14 112 | 112,8 113 | 113,12 114 | 114,7 115 | 115,9 116 | 116,6 117 | 117,10 118 | 118,9 119 | 119,5 120 | 120,7 121 | 121,9 122 | 122,7 123 | 123,4 124 | 124,9 125 | 125,5 126 | 126,12 127 | 127,9 128 | 128,6 129 | 129,9 130 | 130,4 131 | 131,4 132 | 132,4 133 | 133,8 134 | 134,5 135 | 135,5 136 | 136,3 137 | 137,4 138 | 138,4 139 | 139,3 140 | 140,4 141 | 141,4 142 | 142,4 143 | 143,5 144 | 144,6 145 | 145,1 146 | 146,4 147 | 147,6 148 | 148,7 149 | 149,1 150 | 
150,4 151 | 151,1 152 | 152,3 153 | 153,4 154 | 156,3 155 | 157,1 156 | 158,2 157 | 159,5 158 | 160,2 159 | 161,4 160 | 162,4 161 | 164,2 162 | 165,1 163 | 166,3 164 | 167,2 165 | 168,3 166 | 169,1 167 | 171,1 168 | 172,3 169 | 174,3 170 | 175,2 171 | 176,1 172 | 178,7 173 | 179,2 174 | 180,2 175 | 181,2 176 | 182,1 177 | 183,2 178 | 184,1 179 | 186,2 180 | 188,1 181 | 189,1 182 | 190,3 183 | 191,4 184 | 192,3 185 | 193,3 186 | 194,1 187 | 196,2 188 | 198,1 189 | 199,2 190 | 200,2 191 | 201,1 192 | 203,5 193 | 204,2 194 | 205,1 195 | 208,2 196 | 209,1 197 | 210,1 198 | 211,1 199 | 212,1 200 | 214,1 201 | 216,1 202 | 217,1 203 | 218,1 204 | 221,2 205 | 223,1 206 | 225,1 207 | 227,1 208 | 228,2 209 | 230,2 210 | 231,1 211 | 232,1 212 | 235,1 213 | 237,2 214 | 239,1 215 | 241,3 216 | 242,1 217 | 243,2 218 | 244,1 219 | 245,1 220 | 246,1 221 | 247,1 222 | 250,1 223 | 253,1 224 | 255,1 225 | 259,1 226 | 263,2 227 | 271,1 228 | 272,3 229 | 278,2 230 | 279,1 231 | 283,1 232 | 288,1 233 | 290,1 234 | 291,1 235 | 300,2 236 | 301,1 237 | 307,1 238 | 315,1 239 | 316,2 240 | 318,1 241 | 319,1 242 | 326,1 243 | 328,1 244 | 330,1 245 | 334,1 246 | 343,1 247 | 346,1 248 | 354,1 249 | 357,1 250 | 361,1 251 | 363,1 252 | 382,1 253 | 415,1 254 | 424,2 255 | 432,1 256 | 435,1 257 | 447,1 258 | 451,1 259 | 480,1 260 | 511,1 261 | 514,1 262 | 707,1 263 | 709,1 264 | 806,1 265 | 872,1 266 | 959,1 267 | 2406,1 268 | 2819,1 269 | 3198,1 270 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5_regression.png -------------------------------------------------------------------------------- 
/tools/legacy/dataprofiler/result/hubvertex_indeg/hub_indeg_5_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 129436.57093022726 3 | beta: -1.32565715160002 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_1_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 27230469.375668973 3 | beta: -1.573305530151192 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2.txt: 
-------------------------------------------------------------------------------- 1 | 1,1127199 2 | 2,456471 3 | 3,231478 4 | 4,147759 5 | 5,98680 6 | 6,73174 7 | 7,51960 8 | 8,41451 9 | 9,31121 10 | 10,26166 11 | 11,20475 12 | 12,17443 13 | 13,13827 14 | 14,11993 15 | 15,9829 16 | 16,8780 17 | 17,7115 18 | 18,6303 19 | 19,5118 20 | 20,4321 21 | 21,3384 22 | 22,2984 23 | 23,2272 24 | 24,2107 25 | 25,1675 26 | 26,1506 27 | 27,1170 28 | 28,1132 29 | 29,934 30 | 30,798 31 | 31,643 32 | 32,675 33 | 33,489 34 | 34,515 35 | 35,354 36 | 36,387 37 | 37,274 38 | 38,283 39 | 39,215 40 | 40,218 41 | 41,157 42 | 42,174 43 | 43,133 44 | 44,128 45 | 45,82 46 | 46,119 47 | 47,95 48 | 48,87 49 | 49,58 50 | 50,76 51 | 51,45 52 | 52,33 53 | 53,31 54 | 54,47 55 | 55,33 56 | 56,48 57 | 57,36 58 | 58,42 59 | 59,29 60 | 60,26 61 | 61,22 62 | 62,24 63 | 63,12 64 | 64,16 65 | 65,17 66 | 66,15 67 | 67,10 68 | 68,15 69 | 69,7 70 | 70,15 71 | 71,7 72 | 72,5 73 | 73,9 74 | 74,14 75 | 75,9 76 | 76,6 77 | 77,7 78 | 78,6 79 | 79,5 80 | 80,3 81 | 81,3 82 | 82,8 83 | 83,3 84 | 84,6 85 | 85,3 86 | 86,2 87 | 87,3 88 | 88,3 89 | 89,2 90 | 90,2 91 | 91,2 92 | 92,2 93 | 94,1 94 | 95,1 95 | 96,3 96 | 99,2 97 | 100,1 98 | 101,2 99 | 103,1 100 | 104,2 101 | 105,1 102 | 106,1 103 | 108,1 104 | 109,1 105 | 111,2 106 | 113,1 107 | 117,1 108 | 122,2 109 | 123,1 110 | 126,1 111 | 131,1 112 | 132,1 113 | 133,2 114 | 137,1 115 | 142,1 116 | 144,1 117 | 159,1 118 | 204,1 119 | 218,1 120 | 230,1 121 | 235,1 122 | 271,1 123 | 294,1 124 | 297,1 125 | 302,1 126 | 305,1 127 | 825,1 128 | 1206,1 129 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2_regression.png 
-------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_2_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 1141214.9408893671 3 | beta: -1.488771851133902 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3.txt: -------------------------------------------------------------------------------- 1 | 1,142193 2 | 2,108601 3 | 3,49793 4 | 4,39978 5 | 5,25139 6 | 6,20298 7 | 7,14961 8 | 8,12881 9 | 9,10057 10 | 10,8985 11 | 11,7432 12 | 12,6622 13 | 13,5575 14 | 14,5025 15 | 15,4478 16 | 16,4354 17 | 17,3761 18 | 18,3622 19 | 19,3319 20 | 20,3153 21 | 21,2942 22 | 22,2749 23 | 23,2569 24 | 24,2526 25 | 25,2324 26 | 26,2189 27 | 27,2107 28 | 28,2025 29 | 29,1850 30 | 30,2013 31 | 31,1823 32 | 32,1801 33 | 33,1802 34 | 34,1909 35 | 35,1812 36 | 36,1871 37 | 37,1695 38 | 38,1573 39 | 39,1320 40 | 40,1311 41 | 41,1122 42 | 42,1053 43 | 43,1025 44 | 44,938 45 | 45,810 46 | 46,754 47 | 47,645 48 | 48,600 49 | 49,550 50 | 50,489 51 | 51,434 52 | 52,356 53 | 53,307 54 | 54,309 55 | 55,251 56 | 56,233 57 | 57,160 58 | 58,181 59 | 59,150 60 | 60,110 61 | 61,76 62 | 62,69 63 | 63,47 64 | 64,46 65 | 65,32 66 | 66,37 67 | 67,33 68 | 68,41 69 | 69,19 70 | 70,24 71 | 71,18 72 | 72,21 73 | 73,15 74 | 74,15 75 | 75,11 76 | 76,9 77 | 77,13 78 | 78,8 79 | 79,3 80 | 80,11 81 | 81,6 82 | 82,9 83 | 83,2 84 | 84,6 85 | 85,1 86 | 
86,4 87 | 87,4 88 | 88,3 89 | 89,8 90 | 90,4 91 | 91,3 92 | 92,3 93 | 93,3 94 | 94,3 95 | 96,6 96 | 97,3 97 | 98,3 98 | 100,4 99 | 101,1 100 | 102,3 101 | 113,1 102 | 118,1 103 | 124,1 104 | 129,1 105 | 154,1 106 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_3_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 156038.86238756854 3 | beta: -1.1012676833313366 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4.txt: -------------------------------------------------------------------------------- 1 | 1,67717 2 | 2,77379 3 | 3,31850 4 | 4,30428 5 | 5,17902 6 | 6,16002 7 | 7,11361 8 | 8,9941 9 | 9,7763 10 | 10,6769 11 | 11,5319 12 | 12,4602 13 | 13,3824 14 | 14,3488 15 | 15,3163 16 | 16,2965 17 | 17,2572 18 | 18,2406 19 | 19,2317 20 | 20,2207 21 | 21,2083 22 | 22,2032 23 | 23,1876 24 | 24,1884 25 | 25,1865 26 | 26,1918 27 | 27,1784 28 | 28,1824 29 | 29,1745 30 | 30,1860 31 | 31,1891 
32 | 32,2074 33 | 33,2027 34 | 34,2300 35 | 35,2264 36 | 36,2772 37 | 37,2205 38 | 38,2155 39 | 39,1740 40 | 40,1599 41 | 41,1360 42 | 42,1206 43 | 43,1100 44 | 44,992 45 | 45,841 46 | 46,719 47 | 47,643 48 | 48,545 49 | 49,497 50 | 50,470 51 | 51,417 52 | 52,404 53 | 53,359 54 | 54,329 55 | 55,286 56 | 56,261 57 | 57,200 58 | 58,203 59 | 59,160 60 | 60,118 61 | 61,104 62 | 62,64 63 | 63,58 64 | 64,39 65 | 65,47 66 | 66,43 67 | 67,34 68 | 68,33 69 | 69,23 70 | 70,37 71 | 71,34 72 | 72,27 73 | 73,16 74 | 74,10 75 | 75,10 76 | 76,8 77 | 77,8 78 | 78,14 79 | 79,11 80 | 80,6 81 | 81,5 82 | 82,8 83 | 83,4 84 | 84,5 85 | 85,4 86 | 86,4 87 | 87,6 88 | 88,6 89 | 89,3 90 | 90,1 91 | 91,5 92 | 92,1 93 | 93,4 94 | 94,3 95 | 95,1 96 | 96,4 97 | 97,3 98 | 98,1 99 | 99,1 100 | 100,1 101 | 101,3 102 | 103,3 103 | 104,2 104 | 105,1 105 | 106,2 106 | 107,1 107 | 112,1 108 | 113,1 109 | 114,1 110 | 115,1 111 | 116,2 112 | 118,1 113 | 119,2 114 | 122,2 115 | 123,1 116 | 143,1 117 | 191,1 118 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_4_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 83519.30033324637 3 | beta: -0.9664620660294009 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5.txt: -------------------------------------------------------------------------------- 1 | 1,1699695 2 | 2,250291 3 | 3,75890 4 | 4,33114 5 | 5,17934 6 | 6,10863 7 | 7,7211 8 | 8,4873 9 | 9,3518 10 | 10,2628 11 | 11,1944 12 | 12,1586 13 | 13,1234 14 | 14,995 15 | 15,817 16 | 16,647 17 | 17,545 18 | 18,463 19 | 19,346 20 | 20,318 21 | 21,281 22 | 22,252 23 | 23,175 24 | 24,219 25 | 25,151 26 | 26,149 27 | 27,112 28 | 28,103 29 | 29,72 30 | 30,68 31 | 31,67 32 | 32,59 33 | 33,55 34 | 34,41 35 | 35,30 36 | 36,28 37 | 37,31 38 | 38,22 39 | 39,25 40 | 40,19 41 | 41,19 42 | 42,14 43 | 43,17 44 | 44,12 45 | 45,11 46 | 46,11 47 | 47,12 48 | 48,13 49 | 49,11 50 | 50,7 51 | 51,6 52 | 52,5 53 | 53,7 54 | 54,5 55 | 55,2 56 | 56,4 57 | 57,3 58 | 58,1 59 | 59,6 60 | 60,5 61 | 61,5 62 | 62,2 63 | 63,2 64 | 64,2 65 | 65,2 66 | 67,2 67 | 68,1 68 | 69,1 69 | 71,3 70 | 72,1 71 | 73,1 72 | 74,2 73 | 75,1 74 | 76,3 75 | 77,1 76 | 78,1 77 | 79,1 78 | 80,2 79 | 82,1 80 | 88,1 81 | 89,1 82 | 92,1 83 | 93,1 84 | 95,1 85 | 96,2 86 | 117,1 87 | 173,1 88 | 215,1 89 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/hubvertex_outdeg/hub_outdeg_5_regression.txt: 
-------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 1699901.8355022694 3 | beta: -2.782917517119856 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/transfer/in-out.txt: -------------------------------------------------------------------------------- 1 | DEGREE TOP 100 (3048390,149885218) (3046466,7654425) (3043195,3717800) (1809340,3200099) (1129165,3049194) (924712,2377914) (830798,1670911) (712413,1529329) (710260,1358537) (641020,1209411) (632973,1115675) (630067,1089786) (627839,999991) (580524,953094) (578835,859954) (528568,608509) (413525,606974) (412343,603557) (411422,591748) (408226,562251) (399829,439738) (397811,435303) (393778,414443) (391697,408185) (377450,400741) (352612,376543) (309148,366878) (293469,353388) (287538,345340) (277388,340104) (271632,338657) (268182,337096) (258872,315555) (257605,310579) (247652,308547) (239551,305741) (224806,274064) (220635,264638) (218966,263313) (217587,259273) (215537,249205) (214753,246216) (213268,234897) (210157,227524) (194537,223583) (185521,216924) (183980,214773) (180816,211013) (180760,197915) (178397,196080) (176695,192068) (176184,184776) (175973,182230) (173743,179550) (173669,176031) (172947,174463) (172548,173571) (172022,167455) (170749,165347) (170032,163834) (169523,160865) (166546,157632) (164485,156494) (156547,155911) (153269,154617) (148137,153352) (142497,152993) (141041,150277) (139841,149119) (136612,148746) (134140,147731) (132448,143019) (132377,142648) (131275,140915) (131070,137847) (130986,137137) (126796,136712) (122594,136378) (122405,132318) (119256,131018) (119085,130774) (116435,129064) (116335,124674) (113931,124043) (112756,123727) (112516,122166) (110073,117492) (108099,117182) (106688,116283) (106057,115471) (105524,115364) (103445,113458) (102986,113092) (101608,112663) (101394,111914) (101138,110137) (100978,109989) 
(100806,106889) (100680,105286) (99539,104025) 2 | DEGREE PEC 100 3048390 20 13 10 8 7 6 6 5 5 4 4 4 4 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 3 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/transfer/in_degree_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/transfer/in_degree_dist.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/transfer/in_degree_dist_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/transfer/in_degree_dist_regression.png -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/transfer/in_degree_dist_regression.txt: -------------------------------------------------------------------------------- 1 | formula: y = alpha * (x^beta) 2 | alpha: 105379188.6439417 3 | beta: -1.9560039218452319 4 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/transfer/out-in.txt: -------------------------------------------------------------------------------- 1 | DEGREE TOP 100 (149885218,3048390) (7654425,3046466) (3717800,3043195) (3200099,1809340) (3049194,1129165) (2377914,924712) (1670911,830798) (1529329,712413) (1358537,710260) (1209411,641020) (1115675,632973) (1089786,630067) (999991,627839) (953094,580524) (859954,578835) (608509,528568) (606974,413525) (603557,412343) (591748,411422) (562251,408226) 
(439738,399829) (435303,397811) (414443,393778) (408185,391697) (400741,377450) (376543,352612) (366878,309148) (353388,293469) (345340,287538) (340104,277388) (338657,271632) (337096,268182) (315555,258872) (310579,257605) (308547,247652) (305741,239551) (274064,224806) (264638,220635) (263313,218966) (259273,217587) (249205,215537) (246216,214753) (234897,213268) (227524,210157) (223583,194537) (216924,185521) (214773,183980) (211013,180816) (197915,180760) (196080,178397) (192068,176695) (184776,176184) (182230,175973) (179550,173743) (176031,173669) (174463,172947) (173571,172548) (167455,172022) (165347,170749) (163834,170032) (160865,169523) (157632,166546) (156494,164485) (155911,156547) (154617,153269) (153352,148137) (152993,142497) (150277,141041) (149119,139841) (148746,136612) (147731,134140) (143019,132448) (142648,132377) (140915,131275) (137847,131070) (137137,130986) (136712,126796) (136378,122594) (132318,122405) (131018,119256) (130774,119085) (129064,116435) (124674,116335) (124043,113931) (123727,112756) (122166,112516) (117492,110073) (117182,108099) (116283,106688) (115471,106057) (115364,105524) (113458,103445) (113092,102986) (112663,101608) (111914,101394) (110137,101138) (109989,100978) (106889,100806) (105286,100680) (104025,99539) 2 | DEGREE PEC 100 149885218 39 19 11 6 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | -------------------------------------------------------------------------------- /tools/legacy/dataprofiler/result/transfer/out_degree_dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ldbc/ldbc_finbench_datagen/d126523519dca6978a6aacf0c1a8df9be48c30ad/tools/legacy/dataprofiler/result/transfer/out_degree_dist.png -------------------------------------------------------------------------------- 
/* Copyright (c) 2022 AntGroup. All Rights Reserved. */

#include "lgraph/olap_base.h"
#include "./algo.h"

using namespace lgraph_api;
using namespace lgraph_api::olap;

// Weakly connected components by parallel min-label propagation:
// every vertex starts with its own id as its label; each round pushes
// smaller labels along out-edges until a round makes no update.
// On return, label[v] is the smallest vertex id that reached v through
// this propagation (per the out-edge traversal seen here — presumably
// the framework supplies both edge directions for WCC; verify in olap_base).
// NOTE(review): "%lu" assumes size_t == unsigned long; "%zu" would be
// portable — flagged only, as this update is documentation-only.
void WCCCore(OlapBase& graph, ParallelVector& label) {
    // Round 0: all vertices active, each labeled with its own id.
    auto active_in = graph.AllocVertexSubset();
    active_in.Fill();
    auto active_out = graph.AllocVertexSubset();
    size_t num_activations = graph.ProcessVertexActive(
        [&](size_t vi) {
            label[vi] = vi;
            return 1;  // every vertex counts as one activation initially
        },
        active_in);

    // Repeat until a full round produces zero label updates.
    for (int ii = 0; num_activations != 0; ii++) {
        printf("activates(%d) <= %lu\n", ii, num_activations);
        active_out.Clear();
        num_activations = graph.ProcessVertexActive(
            [&](size_t src) {
                size_t num_activations = 0;  // per-vertex tally; shadows the outer counter
                for (auto& edge : graph.OutEdges(src)) {
                    size_t dst = edge.neighbour;
                    // Cheap unlocked test first; re-check under the per-vertex
                    // lock (double-checked) before writing, since another
                    // thread may have lowered label[dst] in between.
                    if (label[src] < label[dst]) {
                        auto lock = graph.GuardVertexLock(dst);
                        if (label[src] < label[dst]) {
                            label[dst] = label[src];
                            num_activations += 1;
                            active_out.Add(dst);  // dst must re-propagate next round
                        }
                    }
                }
                return num_activations;
            },
            active_in);
        // Vertices updated this round form the next round's frontier.
        active_in.Swap(active_out);
    }
}
import os
import pandas as pd


def process_csv(file_path):
    """Read one '|'-delimited CSV file into a DataFrame."""
    return pd.read_csv(file_path, delimiter='|')


loan_folder_path = '../../out/raw/loan'
deposit_folder_path = '../../out/raw/deposit'
output_folder = '../../out/factor_table'

loan_files = [os.path.join(loan_folder_path, file) for file in os.listdir(loan_folder_path) if file.endswith('.csv')]
deposit_files = [os.path.join(deposit_folder_path, file) for file in os.listdir(deposit_folder_path) if file.endswith('.csv')]

loan_df = pd.concat([process_csv(file) for file in loan_files])
deposit_df = pd.concat([process_csv(file) for file in deposit_files])

# Group deposits once instead of re-filtering the whole deposit table for
# every loan id (the original loop was O(#loans * #deposits)).
# Series.unique() preserves first-occurrence order, matching the old output.
accounts_by_loan = {
    loan_id: accounts.tolist()
    for loan_id, accounts in deposit_df.groupby('loanId')['accountId'].unique().items()
}

# Loans with no deposit rows still appear, with an empty account list,
# exactly as the original filter-based loop produced.
result_list = [[loan_id, accounts_by_loan.get(loan_id, [])]
               for loan_id in loan_df['id'].unique()]

result_df = pd.DataFrame(result_list, columns=['loan_id', 'account_list'])

os.makedirs(output_folder, exist_ok=True)
# Build the output path from output_folder instead of a second hard-coded
# literal, so the configured location is honored in one place.
result_df.to_csv(os.path.join(output_folder, 'loan_account_list.csv'), sep='|', index=False)
import os
import pandas as pd
from datetime import datetime


def process_csv(file_path):
    """Read one '|'-delimited CSV file into a DataFrame."""
    return pd.read_csv(file_path, delimiter='|')


def timestamp_to_year_month(timestamp):
    """Format a datetime-like value as 'YYYY-MM'."""
    return timestamp.strftime('%Y-%m')


account_folder_path = '../../out/raw/account'
transfer_folder_path = '../../out/raw/transfer'
withdraw_folder_path = '../../out/raw/withdraw'
output_folder = '../../out/factor_table'

account_files = [os.path.join(account_folder_path, file) for file in os.listdir(account_folder_path) if file.endswith('.csv')]
transfer_files = [os.path.join(transfer_folder_path, file) for file in os.listdir(transfer_folder_path) if file.endswith('.csv')]
withdraw_files = [os.path.join(withdraw_folder_path, file) for file in os.listdir(withdraw_folder_path) if file.endswith('.csv')]

account_df = pd.concat([process_csv(file) for file in account_files])
transfer_df = pd.concat([process_csv(file) for file in transfer_files])
withdraw_df = pd.concat([process_csv(file) for file in withdraw_files])

start_date = datetime(2020, 1, 1)
end_date = datetime(2023, 1, 1)

month_ranges = pd.date_range(start=start_date, end=end_date, freq='MS')
month_columns = [timestamp_to_year_month(month) for month in month_ranges]

# Count outgoing transfers and withdrawals per (account, month) in one
# vectorized pass. The original filtered both tables once per account and
# then walked rows with iterrows — O(#accounts * #rows) overall.
outgoing = pd.concat(
    [transfer_df[['fromId', 'createTime']], withdraw_df[['fromId', 'createTime']]],
    ignore_index=True)
outgoing['month'] = pd.to_datetime(outgoing['createTime'], unit='ms').dt.strftime('%Y-%m')
monthly_counts = outgoing.groupby(['fromId', 'month']).size().unstack(fill_value=0)

# One row per account (in account-file order), zero-filled for accounts or
# months with no activity. Months outside [start_date, end_date] are simply
# dropped; the original raised KeyError on such timestamps instead.
monthly_counts = monthly_counts.reindex(
    index=account_df['id'].unique(), columns=month_columns, fill_value=0).astype(int)

result_df = monthly_counts.reset_index()
result_df.columns = ['account_id'] + month_columns

os.makedirs(output_folder, exist_ok=True)
result_df.to_csv(os.path.join(output_folder, 'month.csv'), sep='|', index=False)
import os
import shutil
import sys


def _copy_csv_files(src_dir, dst_dir):
    """Copy every .csv file found directly in src_dir into dst_dir."""
    for name in os.listdir(src_dir):
        if name.endswith(".csv"):
            # shutil.copy is portable and safe for paths containing spaces,
            # unlike the previous os.system("cp ...") shell-out.
            shutil.copy(os.path.join(src_dir, name), os.path.join(dst_dir, name))


def merge_cluster_output(dir_A, dir_B, output_dir):
    """Merge two cluster output trees into output_dir.

    Every subdirectory of dir_A must also exist in dir_B; the .csv files
    from both copies of each subdirectory are copied into the matching
    subdirectory under output_dir (created if needed).

    Raises AssertionError when an input directory or a required
    subdirectory is missing (kept from the original for compatibility).
    """
    # check if the directories exist
    assert os.path.exists(dir_A), "The directory {} does not exist".format(dir_A)
    assert os.path.exists(dir_B), "The directory {} does not exist".format(dir_B)
    # create the output directory if it does not exist
    os.makedirs(output_dir, exist_ok=True)

    # get all subdirectories in dir_A
    subdirs = [o for o in os.listdir(dir_A) if os.path.isdir(os.path.join(dir_A, o))]
    # validate everything up front so nothing is partially copied before a failure
    for subdir in subdirs:
        assert os.path.exists(
            os.path.join(dir_B, subdir)
        ), "The subdirectory {} does not exist in {}".format(subdir, dir_B)

    for subdir in subdirs:
        new_subdir = os.path.join(output_dir, subdir)
        os.makedirs(new_subdir, exist_ok=True)
        # copy the csv files from both inputs into the merged subdirectory
        _copy_csv_files(os.path.join(dir_A, subdir), new_subdir)
        _copy_csv_files(os.path.join(dir_B, subdir), new_subdir)


if __name__ == "__main__":
    dir_A = sys.argv[1]
    dir_B = sys.argv[2]
    output_dir = sys.argv[3]
    merge_cluster_output(dir_A, dir_B, output_dir)
os.path.join(path, subdir) 24 | if os.path.isdir(subdir_path): 25 | num_entites = 0 26 | for file in glob.glob(os.path.join(subdir_path, "*.csv")): 27 | num_entites += sum(1 for _ in open(file)) - 1 28 | counts[subdir] = num_entites 29 | print_original_counts(counts) 30 | print("\n========== Formatted Output ============\n") 31 | print_formatted_counts(counts) 32 | 33 | 34 | if __name__ == "__main__": 35 | count_entites(sys.argv[1]) 36 | -------------------------------------------------------------------------------- /tools/validate_formula.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | from collections import Counter 4 | 5 | import numpy as np 6 | from matplotlib import pyplot as plt 7 | from scipy.integrate import quad 8 | 9 | data_size = 10000 10 | ind_alphas = [109539041.821, 78379700.038, 133908623.887] 11 | outd_alphas = [20186572.914, 14153912.686, 20194472.855] 12 | ind_betas = np.array([-2.319, -2.319, -2.085]) 13 | outd_betas = np.array([-1.720, -1.719, -1.720]) 14 | start_degree = 1 15 | max_degree = 1000 16 | 17 | 18 | def calc_integral(): 19 | a = ind_alphas[0] 20 | b = ind_betas[0] 21 | 22 | def avg_degree(n): 23 | return math.pow(n, 0.512 - 0.028 * math.log10(n)) 24 | 25 | def powerlaw_func(x): 26 | return a * np.power(x, b) 27 | 28 | areas, _ = quad(powerlaw_func, start_degree, max_degree) 29 | print(np.power((np.power(max_degree, b + 1) + 9 * np.power(0, b + 1)) / 10, 1 / (b + 1))) 30 | 31 | 32 | # According to https://mathworld.wolfram.com/RandomNumber.html 33 | # The formula to transform uniform distribution to powerlaw distribution is: 34 | # x = [(x1^(n+1) - x0^(n+1))*y + x0^(n+1)]^(1/(n+1)) 35 | def draw_powerlaw(): 36 | beta = np.average(outd_betas) 37 | 38 | def powerlaw_func(y): 39 | return (int)(np.power( 40 | (np.power(max_degree, beta + 1) - np.power(start_degree, beta + 1)) * y + np.power(start_degree, beta + 1), 41 | 1 / (beta + 1))) 42 | 43 | degree = 
[powerlaw_func(random.uniform(0, 1)) for _ in range(0, data_size)] 44 | freq = Counter(degree).most_common() 45 | degrees = [] 46 | counts = [] 47 | for deg, count in freq: 48 | degrees.append(deg) 49 | counts.append(count) 50 | plt.scatter(degrees, counts) 51 | plt.loglog() 52 | plt.plot() 53 | plt.show() 54 | 55 | 56 | if __name__ == "__main__": 57 | draw_powerlaw() 58 | -------------------------------------------------------------------------------- /transformation/.gitignore: -------------------------------------------------------------------------------- 1 | *.duckdb* 2 | incremental/ 3 | *.log 4 | -------------------------------------------------------------------------------- /transformation/install-dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | 19 | pip3 install --user duckdb==0.7.1 pytz networkit pandas 20 | -------------------------------------------------------------------------------- /transformation/transform.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Copyright © 2022 Linked Data Benchmark Council (info@ldbcouncil.org) 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | 19 | ## Point this path to the directory containing the `raw` directory 20 | FinBench_DATA_ROOT=${PATH_TO_FINBENCH_DATA} 21 | 22 | set -eu 23 | set -o pipefail 24 | 25 | cd "$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" 26 | 27 | rm -rf ${FinBench_DATA_ROOT}/deletes ${FinBench_DATA_ROOT}/inserts 28 | mkdir ${FinBench_DATA_ROOT}/deletes ${FinBench_DATA_ROOT}/inserts 29 | 30 | echo "##### Transform to snapshots and write queries #####" 31 | echo 32 | echo "\${FinBench_DATA_ROOT}: ${FinBench_DATA_ROOT}" 33 | echo 34 | 35 | python3 ./convert_data.py --raw_dir ${FinBench_DATA_ROOT} --output_dir ${FinBench_DATA_ROOT} | tee output.log 36 | --------------------------------------------------------------------------------