├── .clang-format ├── .clangd ├── .gitattributes ├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── compile_souffle.sh ├── data ├── CA-HepTH │ └── edge.facts ├── Gnutella31 │ └── edge.facts ├── JOIN.pdf ├── SF.cedge │ └── edge.facts ├── com-dblp │ └── edge.facts ├── cspa │ ├── httpd │ │ ├── assign.facts │ │ └── dereference.facts │ ├── linux │ │ ├── assign.facts │ │ └── dereference.facts │ └── postgresql │ │ ├── assign.facts │ │ └── dereference.facts ├── data_10.txt ├── data_22.txt ├── data_39994.txt ├── ego-Facebook │ └── edge.facts ├── employee.txt ├── fc_ocean │ └── edge.facts ├── fe-sphere │ └── edge.facts ├── fe_body │ └── edge.facts ├── hpc_talk.txt ├── loc-Brightkite │ └── edge.facts ├── tc.png ├── tc_new.png ├── usroad │ └── edge.facts └── vsp_finan │ └── edge.facts ├── docker └── runpod.dockerfile ├── include ├── dynamic_dispatch.h ├── exception.cuh ├── lie.cuh ├── print.cuh ├── relation.cuh ├── relational_algebra.cuh ├── timer.cuh └── tuple.cuh ├── install_souffle.sh ├── run_cspa_all.sh ├── run_sg_all.sh ├── run_tc_all.sh ├── src ├── acopy.cu ├── copy.cu ├── join.cu ├── lie.cu ├── print.cu ├── relation.cu └── tuple.cu └── test ├── cspa.cu ├── cuDF ├── load_test.py ├── reachability.py ├── sg.json └── sg.py ├── datastructure.cu ├── merge.cu ├── path_length.cu ├── sg.cu ├── sort.cu ├── souffle ├── bip.dl ├── choice_total.dl ├── cspa ├── cspa.cpp ├── cspa.dl ├── cspa.slog ├── cspa.slogc ├── path_length.dl ├── sg.cpp ├── sg.dl ├── spanning.dl ├── tc ├── tc.cpp └── tc.dl └── tc.cu /.clang-format: -------------------------------------------------------------------------------- 1 | IndentWidth: 4 2 | -------------------------------------------------------------------------------- /.clangd: -------------------------------------------------------------------------------- 1 | CompileFlags: 2 | Remove: 3 | - -forward-unknown-to-host-compiler 4 | - -rdc=true 5 | - --generate-code* 6 | Add: 7 | - -xcuda 8 | - -std=c++20 9 | - --cuda-gpu-arch=sm_60 10 | - --cuda-path=/opt/cuda 11 | - -L/opt/cuda/lib 12 | - -I/opt/cuda/include 13 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.facts filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | .cache 3 | .vscode 4 | */*.csv 5 | */**/log 6 | .idea/ 7 | cmake-build-debug/ 8 | cluster 9 | test/cuDF/*.log -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9 FATAL_ERROR) 2 | project(tc_gpu LANGUAGES CXX CUDA) 3 | 4 | set(gpu_ra_src "${PROJECT_SOURCE_DIR}/src") 5 | set(gpu_ra_include "${PROJECT_SOURCE_DIR}/include") 6 | set(test_dir "${PROJECT_SOURCE_DIR}/test") 7 | 8 | file(GLOB source_file_gpu_ra 9 | "${gpu_ra_include}/exception.cuh" 10 | "${gpu_ra_include}/dynamic_dispatch.h" 11 | "${gpu_ra_include}/print.cuh" 12 | "${gpu_ra_include}/relation.cuh" 13 | "${gpu_ra_include}/relational_algebra.cuh" 14 | "${gpu_ra_include}/timer.cuh" 15 | "${gpu_ra_include}/tuple.cuh" 16 | "${gpu_ra_include}/lie.cuh" 17 | "${gpu_ra_src}/tuple.cu" 18 | "${gpu_ra_src}/print.cu" 19 | "${gpu_ra_src}/relation.cu" 20 | "${gpu_ra_src}/lie.cu" 21 | "${gpu_ra_src}/copy.cu" 22 | "${gpu_ra_src}/acopy.cu" 23 
| "${gpu_ra_src}/join.cu" 24 | ) 25 | 26 | file(GLOB source_file_tc 27 | "${test_dir}/tc.cu" 28 | ) 29 | 30 | file(GLOB source_path_lenght 31 | "${test_dir}/path_length.cu" 32 | ) 33 | 34 | file(GLOB source_cspa 35 | "${test_dir}/cspa.cu" 36 | ) 37 | 38 | file(GLOB source_file_datastructure 39 | "${test_dir}/datastructure.cu" 40 | ) 41 | 42 | file(GLOB source_file_sg 43 | "${test_dir}/sg.cu" 44 | ) 45 | 46 | file(GLOB source_file_test 47 | "${test_dir}/sort.cu" 48 | ) 49 | 50 | file(GLOB source_file_merge 51 | "${test_dir}/merge.cu" 52 | ) 53 | 54 | add_library(gpu_ra "${source_file_gpu_ra}") 55 | target_compile_features(gpu_ra PUBLIC cxx_std_20) 56 | set_target_properties(gpu_ra PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 57 | 58 | add_executable(TC ${source_file_tc}) 59 | target_link_libraries(TC gpu_ra) 60 | target_compile_features(TC PUBLIC cxx_std_20) 61 | set_target_properties(TC PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 62 | 63 | add_executable(PLEN ${source_path_lenght}) 64 | target_link_libraries(PLEN gpu_ra) 65 | target_compile_features(PLEN PUBLIC cxx_std_20) 66 | set_target_properties(PLEN PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 67 | 68 | add_executable(CSPA ${source_cspa}) 69 | target_link_libraries(CSPA gpu_ra) 70 | target_compile_features(CSPA PUBLIC cxx_std_20) 71 | set_target_properties(CSPA PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 72 | 73 | add_executable(DATASTRUCTURE ${source_file_datastructure}) 74 | target_link_libraries(DATASTRUCTURE gpu_ra) 75 | target_compile_features(DATASTRUCTURE PUBLIC cxx_std_20) 76 | set_target_properties(DATASTRUCTURE PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 77 | 78 | add_executable(SG ${source_file_sg}) 79 | target_link_libraries(SG gpu_ra) 80 | target_compile_features(SG PUBLIC cxx_std_20) 81 | set_target_properties(SG PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 82 | 83 | add_executable(TEST ${source_file_test}) 84 | target_link_libraries(TEST gpu_ra) 85 | target_compile_features(TEST PUBLIC cxx_std_20) 86 | set_target_properties(TEST PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 87 | 88 | add_executable(MERGE ${source_file_merge}) 89 | target_link_libraries(MERGE gpu_ra) 90 | target_compile_features(MERGE PUBLIC cxx_std_20) 91 | set_target_properties(MERGE PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 92 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 HARP Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Datasets 3 | - Datasets are listed in [data folder](data). 4 | 5 | ## Dependencies 6 | ### Hardware 7 | - The complete benchmark of the CUDA-based transitive closure computation experiment can be executed on an Nvidia A100 GPU with a minimum of 40 GB GPU memory. The ThetaGPU single-GPU node is a suitable choice. 8 | - Partial benchmarks can be run on other Nvidia GPUs, but they may result in program termination for certain datasets due to limited GPU memory, leading to an instance of the `std::bad_alloc: cudaErrorMemoryAllocation: out of memory` error. 9 | 10 | ### NVIDIA CUDA Toolkit (version 11.4.2 or later) 11 | - Download and install the NVIDIA CUDA Toolkit from the NVIDIA website: [https://developer.nvidia.com/cuda-toolkit-archive](https://developer.nvidia.com/cuda-toolkit-archive) 12 | - Follow the installation instructions for your operating system. Make sure to install version 11.4.2 or later. 13 | ### CMake 14 | - Download and install CMake (version 3.9 or later) from the CMake website: [https://cmake.org/download/](https://cmake.org/download/) 15 | ### Thrust 16 | - The patch [https://github.com/NVIDIA/thrust/pull/1832/files](https://github.com/NVIDIA/thrust/pull/1832/files) must be applied to fix an integer overflow in `thrust::reduce`. 17 | 18 | ## Transitive Closure Computation 19 | - Transitive closure computation is a fundamental operation in graph analytics and relational algebra. 20 | - We present a CUDA-based implementation of transitive closure computation that is optimized for sparse graphs. 21 | - Build and run instructions are provided below: 22 | ```shell 23 | cmake --no-warn-unused-cli -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=TRUE -S./ -B./build 24 | cd build 25 | make 26 | ``` 27 | This will build the `TC` executable using the nvcc compiler. 28 | - The `TC` executable takes a single argument: the path to the input file containing the graph data. For example: 29 | ```shell 30 | ./TC ../data/data_5.txt 31 | ``` 32 | ### Run instructions for Polaris 33 | - Run using an interactive node: 34 | ```shell 35 | ssh @polaris.alcf.anl.gov 36 | qsub -I -l select=1 -l filesystems=home:eagle -l walltime=1:00:00 -q debug -A dist_relational_alg 37 | module load gcc 38 | cd slog-gpu-backend 39 | git fetch 40 | git reset --hard origin/main 41 | rm -rf build 42 | module purge 43 | module load gcc 44 | module load cmake 45 | module load cudatoolkit-standalone 46 | cmake --no-warn-unused-cli -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=TRUE -S./ -B./build 47 | cd build 48 | make 49 | ./TC ../data/data_5.txt 50 | ``` 51 | - Transfer a file from the local machine to Polaris: 52 | ```shell 53 | scp data_68993773.txt arsho@polaris.alcf.anl.gov:/home/arsho/slog-gpu-backend/data/ 54 | ``` 55 | ### (Optional) Memory check: 56 | - After creating the build folder and the `TC` executable, run the following commands to check for memory leaks and errors: 57 | ```shell 58 | cuda-memcheck ./TC ../data/data_7035.txt 59 | ========= CUDA-MEMCHECK 60 | ...
61 | TC time: 48.691 62 | ========= ERROR SUMMARY: 0 errors 63 | compute-sanitizer ./TC ../data/data_7035.txt 64 | ========= COMPUTE-SANITIZER 65 | ... 66 | TC time: 0.668892 67 | ========= ERROR SUMMARY: 0 errors 68 | ``` 69 | 70 | ### Run cuDF on Polaris 71 | ```shell 72 | ssh @polaris.alcf.anl.gov 73 | qsub -I -l select=1 -l filesystems=home:grand:eagle -l walltime=1:00:00 -q debug -A dist_relational_alg 74 | module purge 75 | module load conda/2023-10-04 76 | conda activate 77 | pip install --extra-index-url https://pypi.nvidia.com cudf-cu11 78 | python test/cuDF/sg.py 79 | 80 | (2022-09-08/base) arsho::x3004c0s7b0n0 { ~/slog-gpu-backend/test/cuDF }-> python sg.py 81 | | Dataset | Number of rows | SG size | Iterations | Time (s) | 82 | | --- | --- | --- | --- | --- | 83 | | hipc | 5 | 4 | 3 | 0.016371 | 84 | Error in fe_body. Message: std::bad_alloc: out_of_memory: CUDA error at: /__w/rmm/rmm/include/rmm/mr/device/cuda_memory_resource.hpp:70: cudaErrorMemoryAllocation out of memory 85 | Error in loc-Brightkite. Message: std::bad_alloc: out_of_memory: CUDA error at: /__w/rmm/rmm/include/rmm/mr/device/cuda_memory_resource.hpp:70: cudaErrorMemoryAllocation out of memory 86 | Error in fe_sphere. Message: std::bad_alloc: out_of_memory: CUDA error at: /__w/rmm/rmm/include/rmm/mr/device/cuda_memory_resource.hpp:70: cudaErrorMemoryAllocation out of memory 87 | | CA-HepTh | 51971 | 74618689 | 9 | 21.241212 | 88 | | Dataset | Number of rows | SG size | Iterations | Time (s) | 89 | | --- | --- | --- | --- | --- | 90 | | ego-Facebook | 88234 | 15018986 | 13 | 19.074940 | 91 | | wiki-Vote | 103689 | 5376338 | 4 | 2.603751 | 92 | | luxembourg_osm | 119666 | 245221 | 326 | 2.215113 | 93 | | cti | 48232 | 14503742 | 44 | 3.857438 | 94 | | fe_ocean | 409593 | 65941441 | 77 | 45.979235 | 95 | | wing | 121544 | 647999 | 8 | 0.204277 | 96 | | delaunay_n16 | 196575 | 25994011 | 85 | 14.832548 | 97 | Error in usroads. Message: std::bad_alloc: out_of_memory: CUDA error at: /__w/rmm/rmm/include/rmm/mr/device/cuda_memory_resource.hpp:70: cudaErrorMemoryAllocation out of memory 98 | Error in p2p-Gnutella31. 
Message: std::bad_alloc: out_of_memory: CUDA error at: /__w/rmm/rmm/include/rmm/mr/device/cuda_memory_resource.hpp:70: cudaErrorMemoryAllocation out of memory 99 | | p2p-Gnutella09 | 26013 | 62056583 | 14 | 13.705286 | 100 | | p2p-Gnutella04 | 39994 | 116931333 | 18 | 48.947088 | 101 | | cal.cedge | 21693 | 23519 | 58 | 0.259069 | 102 | | TG.cedge | 23874 | 608090 | 54 | 0.719743 | 103 | | OL.cedge | 7035 | 285431 | 56 | 0.385674 | 104 | ``` 105 | 106 | ### Examples 107 | a TC example 108 | ``` 109 | Relation *edge_2__2_1 = new Relation(); 110 | Relation *path_2__1_2 = new Relation(); 111 | 112 | load_relation(path_2__1_2, "path_2__1_2", 2, raw_graph_data, 113 | graph_edge_counts, 1, 0, grid_size, block_size); 114 | load_relation(edge_2__2_1, "edge_2__2_1", 2, raw_reverse_graph_data, 115 | graph_edge_counts, 1, 0, grid_size, block_size); 116 | 117 | LIE tc_scc(grid_size, block_size); 118 | tc_scc.add_relations(edge_2__2_1, true); 119 | tc_scc.add_relations(path_2__1_2, false); 120 | float join_detail[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; 121 | tuple_generator_hook reorder_path_host; 122 | cudaMemcpyFromSymbol(&reorder_path_host, reorder_path_device, 123 | sizeof(tuple_generator_hook)); 124 | tuple_copy_hook cp_1_host; 125 | cudaMemcpyFromSymbol(&cp_1_host, cp_1_device, sizeof(tuple_copy_hook)); 126 | tc_scc.add_ra(RelationalJoin(edge_2__2_1, FULL, path_2__1_2, DELTA, 127 | path_2__1_2, reorder_path_host, nullptr, 128 | LEFT, grid_size, block_size, join_detail)); 129 | 130 | tc_scc.fixpoint_loop(); 131 | ``` 132 | 133 | ### References 134 | - [Getting Started on ThetaGPU](https://docs.alcf.anl.gov/theta-gpu/getting-started/) 135 | - [Getting Started on Polaris](https://docs.alcf.anl.gov/polaris/getting-started/) 136 | - [CUDA — Memory Model blog](https://medium.com/analytics-vidhya/cuda-memory-model-823f02cef0bf) 137 | - [CUDA - Pinned memory](https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/) 138 | - [Stanford Large Network Dataset Collection](https://snap.stanford.edu/data/index.html) 139 | -------------------------------------------------------------------------------- /compile_souffle.sh: -------------------------------------------------------------------------------- 1 | echo "Compiling souffle Queries" 2 | souffle -o build/TC.souffle -j 32 3 | 4 | -------------------------------------------------------------------------------- /data/CA-HepTH/edge.facts: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8edcd3ddb7db3ad8306c07ebe8e57b3fb558f559f3dfd32b5a6e430529772257 3 | size 658567 4 | -------------------------------------------------------------------------------- /data/Gnutella31/edge.facts: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:9c5735f9868a150251841ba0b42a6b47655e61feac849232c469157c9f37bdd6 3 | size 1852859 4 | -------------------------------------------------------------------------------- /data/JOIN.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harp-lab/gdlog/65a6ee960ced8d04bc725ccdfa68f004f8479226/data/JOIN.pdf -------------------------------------------------------------------------------- /data/SF.cedge/edge.facts: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid 
sha256:a9947ea406d23dada5de5dc26f20780a255a68b7d7bab0ff83adf01598ed8949 3 | size 2838411 4 | -------------------------------------------------------------------------------- /data/com-dblp/edge.facts: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d98b4fc436446c28e452071813cf205e39f333e74e79a286a4b8edabe5ff680e 3 | size 13931327 4 | -------------------------------------------------------------------------------- /data/cspa/httpd/assign.facts: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:31402653628de04569ee38d5dd079dab4f84b36ab685a0b567261b42a6cb4af6 3 | size 5336279 4 | -------------------------------------------------------------------------------- /data/cspa/httpd/dereference.facts: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f391e13c3b42cb0acad7e0fa816d43ac987f0e0de1e7a9fef498045a9ff8e26c 3 | size 16879392 4 | -------------------------------------------------------------------------------- /data/cspa/linux/assign.facts: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:bbf8b176590d6cf2c2296c4135bd8de6bb71237b4207e3a52c8aa53b238e5d83 3 | size 31737404 4 | -------------------------------------------------------------------------------- /data/cspa/linux/dereference.facts: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5aca2cb3aabfecd209b823b687df409a861936728d8dacd177d4a190137d91b0 3 | size 120237633 4 | -------------------------------------------------------------------------------- /data/cspa/postgresql/assign.facts: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1227deac9deeb851fbe5e1e4266100b03c88a70c9f5f046483824accae98b41a 3 | size 18852299 4 | -------------------------------------------------------------------------------- /data/cspa/postgresql/dereference.facts: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:0493fe455145ec8aa6d0d3543c9dfc35282493fa8174ff69f2128630988b6a0b 3 | size 54020989 4 | -------------------------------------------------------------------------------- /data/data_10.txt: -------------------------------------------------------------------------------- 1 | 1 2 2 | 1 5 3 | 1 6 4 | 2 3 5 | 2 6 6 | 3 4 7 | 3 7 8 | 4 5 9 | 4 6 10 | 5 6 11 | -------------------------------------------------------------------------------- /data/data_22.txt: -------------------------------------------------------------------------------- 1 | 1 2 2 | 2 3 3 | 3 4 4 | 4 5 5 | 5 6 6 | 6 7 7 | 7 8 8 | 8 9 9 | 9 10 10 | 10 11 11 | 11 12 12 | 12 13 13 | 13 14 14 | 14 15 15 | 15 16 16 | 16 17 17 | 17 18 18 | 18 19 19 | 19 20 20 | 20 21 21 | 21 22 22 | 22 23 23 | -------------------------------------------------------------------------------- /data/ego-Facebook/edge.facts: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a23ba0e1930d856fe71c3355969ca2a53756de3ea9ccae486fd7cb4294a59567 3 | size 854362 4 | 
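Note: every `edge.facts` entry under `data/` shown here is a Git LFS pointer file (`.gitattributes` tracks `*.facts` with LFS), not the graph data itself. Below is a minimal sketch of fetching the actual datasets before running the benchmark executables, assuming Git LFS is installed and the clone points at the LFS-enabled remote:

```shell
# Replace the pointer files with the real *.facts datasets.
git lfs install   # one-time: enable the Git LFS filters for this user
git lfs pull      # download the LFS objects referenced by the pointer files
```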
-------------------------------------------------------------------------------- /data/employee.txt: -------------------------------------------------------------------------------- 1 | 1,1 2 | 1,2 3 | 1,3 4 | 1,55 5 | 1,539 6 | 2,1 7 | 2,2 8 | 2,3 -------------------------------------------------------------------------------- /data/fc_ocean/edge.facts: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:0232e4991188da19342d6d25196e1ce5afd739fb54e47e3d589b4d80a3f6da7a 3 | size 5098905 4 | -------------------------------------------------------------------------------- /data/fe-sphere/edge.facts: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f96dfcb180d181e81edcb6c52dd9948dc20c9b3ec2adae9190ba349a73064a85 3 | size 523204 4 | -------------------------------------------------------------------------------- /data/fe_body/edge.facts: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:86e184c5c5092f963b421def09cc0a8dae3afdfa26113d14384a76c069455c3e 3 | size 1884551 4 | -------------------------------------------------------------------------------- /data/hpc_talk.txt: -------------------------------------------------------------------------------- 1 | 1 2 2 | 1 3 3 | 2 4 4 | 3 4 5 | 4 5 6 | -------------------------------------------------------------------------------- /data/loc-Brightkite/edge.facts: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:cc2eebc46f5c1ecd84122d5b02fde29d5b534226a214a7ea6608401815f29a16 3 | size 2289578 4 | -------------------------------------------------------------------------------- /data/tc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harp-lab/gdlog/65a6ee960ced8d04bc725ccdfa68f004f8479226/data/tc.png -------------------------------------------------------------------------------- /data/tc_new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harp-lab/gdlog/65a6ee960ced8d04bc725ccdfa68f004f8479226/data/tc_new.png -------------------------------------------------------------------------------- /data/usroad/edge.facts: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a77677c832e2a217187e8ba5d9b23794eba814eecb77dce3386acac043ff4cb4 3 | size 2031370 4 | -------------------------------------------------------------------------------- /data/vsp_finan/edge.facts: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:fb16bab5380960ff88e9bb748e35735c722d6cc9a88595d7692b57e8eb97b80a 3 | size 6724114 4 | -------------------------------------------------------------------------------- /docker/runpod.dockerfile: -------------------------------------------------------------------------------- 1 | FROM stargazermiao/gdlog-env:11.8 2 | 3 | COPY --chown=gdlog:gdlog . /opt/gdlog 4 | WORKDIR /opt/gdlog 5 | 6 | # RUN rm -r build 7 | RUN cmake -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo -Bbuild . 
&& cd build && make -j 8 | RUN chmod -R 757 /opt/gdlog 9 | 10 | # CMD [ "/start.sh" ] 11 | -------------------------------------------------------------------------------- /include/dynamic_dispatch.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | template 4 | struct dynamic_dispatch : Variants... { 5 | using Variants::operator()...; 6 | }; 7 | template 8 | dynamic_dispatch(Variants...) -> dynamic_dispatch; 9 | -------------------------------------------------------------------------------- /include/exception.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | // #include 4 | #include 5 | 6 | #define checkCuda(ans) \ 7 | { gpuAssert((ans), __FILE__, __LINE__); } 8 | 9 | inline void gpuAssert(cudaError_t code, const char *file, int line, 10 | bool abort = true) { 11 | if (code != cudaSuccess) { 12 | fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, 13 | line); 14 | if (abort) { 15 | cudaDeviceReset(); 16 | exit(code); 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /include/lie.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "relational_algebra.cuh" 3 | #include 4 | #include 5 | 6 | /** 7 | * @brief Logical inference engine(LIE). Compute fixpoint for a datalog rule SCC 8 | * (Strongly Connected Component). 9 | * 10 | */ 11 | struct LIE { 12 | // all relation operator used in this LIE 13 | std::vector ra_ops; 14 | 15 | // all relations may have new data in this SCC 16 | std::vector update_relations; 17 | // all relation won't be changed in this SCC 18 | std::vector static_relations; 19 | 20 | // temporary relations, these relations's FULL version won't be stored, 21 | // delta version of these relation will be cleared after used in join 22 | std::vector tmp_relations; 23 | 24 | // GPU grid size 25 | int grid_size; 26 | // GPU block size 27 | int block_size; 28 | 29 | bool reload_full_flag = true; 30 | int max_iteration = INT_MAX; 31 | 32 | LIE(int grid_size, int block_size) 33 | : grid_size(grid_size), block_size(block_size) {} 34 | 35 | /** 36 | * @brief compute fixpoint for current LIE 37 | * 38 | */ 39 | void fixpoint_loop(); 40 | 41 | /** 42 | * @brief Add a relation to SCC, all relation must be added before fixpoint 43 | * loop begin 44 | * 45 | * @param rel relation to add 46 | * @param static_flag whether a relation appears in output relation position 47 | * or not 48 | */ 49 | void add_relations(Relation *rel, bool static_flag); 50 | 51 | /** 52 | * @brief add a temporary relation (a relation only have DELTA/NEWT) 53 | * 54 | * @param rel 55 | */ 56 | void add_tmp_relation(Relation *rel); 57 | 58 | /** 59 | * @brief add a Relation Algebra operation 60 | * 61 | * @param op 62 | */ 63 | void add_ra(ra_op op); 64 | // void ra(ra_op op); 65 | }; 66 | -------------------------------------------------------------------------------- /include/print.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "relation.cuh" 3 | #include "tuple.cuh" 4 | // test helper 5 | 6 | void print_hashes(GHashRelContainer* target, const char *rel_name); 7 | 8 | void print_tuple_rows(GHashRelContainer* target, const char *rel_name); 9 | 10 | void print_tuple_raw_data(GHashRelContainer* target, const char *rel_name); 11 | 12 | void print_memory_usage(); 13 | 14 | void 
print_tuple_list(tuple_type* tuples, tuple_size_t rows, tuple_size_t arity); 15 | 16 | tuple_size_t get_free_memory(); 17 | 18 | tuple_size_t get_total_memory(); 19 | -------------------------------------------------------------------------------- /include/relation.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "tuple.cuh" 3 | #include 4 | #include 5 | 6 | #ifndef RADIX_SORT_THRESHOLD 7 | #define RADIX_SORT_THRESHOLD 0 8 | #endif 9 | #ifndef FULL_BUFFER_VEC_MULTIPLIER 10 | #define FULL_BUFFER_VEC_MULTIPLIER 8 11 | #endif 12 | 13 | enum RelationVersion { DELTA, FULL, NEWT }; 14 | 15 | /** 16 | * @brief A hash table entry 17 | * TODO: no need for struct actually, a u64[2] should be enough, easier to init 18 | * 19 | */ 20 | struct MEntity { 21 | // index position in actual index_arrary 22 | u64 key; 23 | // tuple position in actual data_arrary 24 | tuple_size_t value; 25 | }; 26 | 27 | #define EMPTY_HASH_ENTRY ULONG_MAX 28 | /** 29 | * @brief a C-style hashset indexing based relation container. 30 | * Actual data is still stored using sorted set. 31 | * Different from normal btree relation, using hash table storing the 32 | * index to accelarte range fetch. Good: 33 | * - fast range fetch, in Shovon's ATC paper it shows great 34 | * performance. 35 | * - fast serialization, its very GPU friendly and also easier for MPI 36 | * inter-rank comm transmission. Bad: 37 | * - need reconstruct index very time tuple is inserted (need more 38 | * reasonable algorithm). 39 | * - sorting is a issue, each update need resort everything seems 40 | * stupid. 41 | * 42 | */ 43 | struct GHashRelContainer { 44 | // open addressing hashmap for indexing 45 | MEntity *index_map = nullptr; 46 | tuple_size_t index_map_size = 0; 47 | float index_map_load_factor; 48 | 49 | // index prefix length 50 | // don't have to be u64,int is enough 51 | // u64 *index_columns; 52 | tuple_size_t index_column_size; 53 | 54 | // dependent postfix column always at the end of tuple 55 | int dependent_column_size = 0; 56 | 57 | // the pointer to flatten tuple, all tuple pointer here need to be sorted 58 | tuple_type *tuples = nullptr; 59 | // flatten tuple data 60 | column_type *data_raw = nullptr; 61 | // number of tuples 62 | tuple_size_t tuple_counts = 0; 63 | // actual tuple rows in flatten data, this maybe different from 64 | // tuple_counts when deduplicated 65 | tuple_size_t data_raw_row_size = 0; 66 | int arity; 67 | bool tmp_flag = false; 68 | 69 | GHashRelContainer(int arity, int indexed_column_size, 70 | int dependent_column_size, bool tmp_flag = false) 71 | : arity(arity), index_column_size(indexed_column_size), 72 | dependent_column_size(dependent_column_size), tmp_flag(tmp_flag){}; 73 | }; 74 | 75 | enum JoinDirection { LEFT, RIGHT }; 76 | 77 | /** 78 | * @brief fill in index hash table for a relation in parallel, assume index is 79 | * correctly initialized, data has been loaded , deduplicated and sorted 80 | * 81 | * @param target the hashtable to init 82 | * @return dedeuplicated_bitmap 83 | */ 84 | __global__ void calculate_index_hash(GHashRelContainer *target, 85 | tuple_indexed_less cmp); 86 | 87 | /** 88 | * @brief count how many non empty hash entry in index map 89 | * 90 | * @param target target relation hash table 91 | * @param size return the size 92 | * @return __global__ 93 | */ 94 | __global__ void count_index_entry_size(GHashRelContainer *target, 95 | tuple_size_t *size); 96 | 97 | /** 98 | * @brief rehash to make index map more compact, 
the new index hash size is 99 | * already update in target new index already inited to empty table and have new 100 | * size. 101 | * 102 | * @param target 103 | * @param old_index_map index map before compaction 104 | * @param old_index_map_size original size of index map before compaction 105 | * @return __global__ 106 | */ 107 | __global__ void shrink_index_map(GHashRelContainer *target, 108 | MEntity *old_index_map, 109 | tuple_size_t old_index_map_size); 110 | 111 | /** 112 | * @brief a CUDA kernel init the index entry map of a hashtabl 113 | * 114 | * @param target the hashtable to init 115 | * @return void 116 | */ 117 | __global__ void init_index_map(GHashRelContainer *target); 118 | 119 | /** 120 | * @brief a helper function to init an unsorted tuple arrary from raw data. This 121 | * function turn a flatten raw data array into a tuple array contains pointers 122 | * to raw data array 123 | * 124 | * @param tuples result tuple array 125 | * @param raw_data flatten raw tuples 1-D array 126 | * @param arity arity of reltaion 127 | * @param rows tuple number 128 | * @return void 129 | */ 130 | __global__ void init_tuples_unsorted(tuple_type *tuples, column_type *raw_data, 131 | int arity, tuple_size_t rows); 132 | 133 | /** 134 | * @brief for all tuples in outer table, match same prefix with inner table 135 | * 136 | * @note can we use pipeline here? since many matching may acually missing 137 | * 138 | * @param inner_table the hashtable to iterate 139 | * @param outer_table the hashtable to match 140 | * @param join_column_counts number of join columns (inner and outer must agree 141 | * on this) 142 | * @param return value stored here, size of joined tuples 143 | * @return void 144 | */ 145 | __global__ void get_join_result_size(GHashRelContainer *inner_table, 146 | GHashRelContainer *outer_table, 147 | int join_column_counts, 148 | tuple_generator_hook tp_gen, 149 | tuple_predicate tp_pred, 150 | tuple_size_t *join_result_size); 151 | 152 | /** 153 | * @brief compute the join result 154 | * 155 | * @param inner_table 156 | * @param outer_table 157 | * @param join_column_counts 158 | * @param output_reorder_array reorder array for output relation column 159 | * selection, arrary pos < inner->arity is index in inner, > is index in outer. 
160 | * @param output_arity output relation arity 161 | * @param output_raw_data join result, need precompute the size 162 | * @return __global__ 163 | */ 164 | __global__ void 165 | get_join_result(GHashRelContainer *inner_table, GHashRelContainer *outer_table, 166 | int join_column_counts, tuple_generator_hook tp_gen, 167 | tuple_predicate tp_pred, int output_arity, 168 | column_type *output_raw_data, tuple_size_t *res_count_array, 169 | tuple_size_t *res_offset, JoinDirection direction); 170 | 171 | __global__ void flatten_tuples_raw_data(tuple_type *tuple_pointers, 172 | column_type *raw, 173 | tuple_size_t tuple_counts, int arity); 174 | 175 | __global__ void get_copy_result(tuple_type *src_tuples, 176 | column_type *dest_raw_data, int output_arity, 177 | tuple_size_t tuple_counts, 178 | tuple_copy_hook tp_gen); 179 | 180 | ////////////////////////////////////////////////////// 181 | // CPU functions 182 | 183 | /** 184 | * @brief load raw data into relation container 185 | * 186 | * @param target hashtable struct in host 187 | * @param arity 188 | * @param data raw data on host 189 | * @param data_row_size 190 | * @param index_columns index columns id in host 191 | * @param index_column_size 192 | * @param index_map_load_factor 193 | * @param grid_size 194 | * @param block_size 195 | * @param gpu_data_flag if data is a GPU memory address directly assign to 196 | * target's data_raw 197 | * @param sorted_flag whether input raw data tuples are sorted (use sorted array 198 | * will be fasted, avoid extra sorting) 199 | * @param build_index_flag whether this relation container need indexing. 200 | */ 201 | void load_relation_container( 202 | GHashRelContainer *target, int arity, column_type *data, 203 | tuple_size_t data_row_size, tuple_size_t index_column_size, 204 | int dependent_column_size, float index_map_load_factor, int grid_size, 205 | int block_size, float *detail_time, bool gpu_data_flag = false, 206 | bool sorted_flag = false, bool build_index_flag = true, 207 | bool tuples_array_flag = true); 208 | 209 | void repartition_relation_index(GHashRelContainer *target, int arity, 210 | column_type *data, tuple_size_t data_row_size, 211 | tuple_size_t index_column_size, 212 | int dependent_column_size, 213 | float index_map_load_factor, int grid_size, 214 | int block_size, float *detail_time); 215 | 216 | /** 217 | * @brief copy a relation into an **empty** relation 218 | * 219 | * @param dst 220 | * @param src 221 | */ 222 | void copy_relation_container(GHashRelContainer *dst, GHashRelContainer *src, 223 | int grid_size, int block_size); 224 | 225 | /** 226 | * @brief recreate index for a full relation container 227 | * 228 | * @param target 229 | * @param arity 230 | * @param tuples 231 | * @param data_row_size 232 | * @param index_column_size 233 | * @param dependent_column_size 234 | * @param index_map_load_factor 235 | * @param grid_size 236 | * @param block_size 237 | */ 238 | void reload_full_temp(GHashRelContainer *target, int arity, tuple_type *tuples, 239 | tuple_size_t data_row_size, 240 | tuple_size_t index_column_size, int dependent_column_size, 241 | float index_map_load_factor, int grid_size, 242 | int block_size); 243 | 244 | /** 245 | * @brief clean all data in a relation container 246 | * 247 | * @param target 248 | */ 249 | void free_relation_container(GHashRelContainer *target); 250 | 251 | enum MonotonicOrder { DESC, ASC, UNSPEC }; 252 | 253 | /** 254 | * @brief actual relation class used in semi-naive eval 255 | * 256 | */ 257 | struct Relation { 258 | int 
arity; 259 | // the first columns of a relation will be use to 260 | // build relation index, and only indexed columns can be used to join 261 | int index_column_size; 262 | std::string name; 263 | 264 | // the last will be used a dependant columns, 265 | // these column can be used to store recurisve aggreagtion/choice 266 | // domain's result, these columns can't be used as index columns 267 | int dependent_column_size = 0; 268 | bool index_flag = true; 269 | bool tmp_flag = false; 270 | 271 | GHashRelContainer *delta; 272 | GHashRelContainer *newt; 273 | GHashRelContainer *full; 274 | 275 | // TODO: out dataed remove these, directly use GHashRelContainer 276 | // **full** a buffer for tuple pointer in full 277 | tuple_size_t current_full_size = 0; 278 | tuple_type *tuple_full; 279 | 280 | tuple_type *tuple_merge_buffer; 281 | tuple_size_t tuple_merge_buffer_size = 0; 282 | bool pre_allocated_merge_buffer_flag = true; 283 | bool fully_disable_merge_buffer_flag = true; 284 | // 285 | 286 | // delta relation generate in each iteration, all index stripped 287 | std::vector buffered_delta_vectors; 288 | 289 | // reserved properties for monotonic aggregation 290 | MonotonicOrder monotonic_order = MonotonicOrder::DESC; 291 | 292 | /** 293 | * @brief store the data in DELTA into full relation (this won't free 294 | * delta) 295 | * 296 | * @param grid_size 297 | * @param block_size 298 | */ 299 | void flush_delta(int grid_size, int block_size, float *detail_time); 300 | }; 301 | 302 | /** 303 | * @brief load tuples to FULL relation of target relation 304 | * 305 | * @param target target relation 306 | * @param name name of relation 307 | * @param arity 308 | * @param data raw flatten tuple need loaded into target relation 309 | * @param data_row_size number of tuples to load 310 | * @param index_column_size number of columns used to index 311 | * @param dependent_column_size 312 | * @param grid_size 313 | * @param block_size 314 | */ 315 | void load_relation(Relation *target, std::string name, int arity, 316 | column_type *data, tuple_size_t data_row_size, 317 | tuple_size_t index_column_size, int dependent_column_size, 318 | int grid_size, int block_size, bool tmp_flag = false); 319 | -------------------------------------------------------------------------------- /include/relational_algebra.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "relation.cuh" 3 | #include "tuple.cuh" 4 | #include 5 | #include 6 | 7 | // for fixing 8 | #ifndef MAX_REDUCE_SIZE 9 | #define MAX_REDUCE_SIZE 80000000 10 | #endif 11 | 12 | // function hook describ how inner and outer tuple are reordered to result tuple 13 | 14 | /** 15 | * @brief Relation Algerbra kernal for JOIN ⋈ 16 | * 17 | */ 18 | struct RelationalJoin { 19 | 20 | // relation to compare, this relation must has index 21 | Relation *inner_rel; 22 | RelationVersion inner_ver; 23 | // serialized relation, every tuple in this relation will be iterated and 24 | // joined with tuples in inner relation 25 | Relation *outer_rel; 26 | RelationVersion outer_ver; 27 | 28 | // the relation to store the generated join result 29 | Relation *output_rel; 30 | // hook function will be mapped on every join result tuple 31 | tuple_generator_hook tuple_generator; 32 | // filter to be applied on every join result tuple 33 | tuple_predicate tuple_pred; 34 | 35 | // TODO: reserved for optimization 36 | JoinDirection direction; 37 | int grid_size; 38 | int block_size; 39 | 40 | // flag for benchmark, this will 
disable sorting on result 41 | bool disable_load = false; 42 | 43 | // join time for debug and profiling 44 | float *detail_time; 45 | 46 | RelationalJoin(Relation *inner_rel, RelationVersion inner_ver, 47 | Relation *outer_rel, RelationVersion outer_ver, 48 | Relation *output_rel, tuple_generator_hook tp_gen, 49 | tuple_predicate tp_pred, JoinDirection direction, 50 | int grid_size, int block_size, float *detail_time) 51 | : inner_rel(inner_rel), inner_ver(inner_ver), outer_rel(outer_rel), 52 | outer_ver(outer_ver), output_rel(output_rel), tuple_generator(tp_gen), 53 | tuple_pred(tp_pred), direction(direction), grid_size(grid_size), 54 | block_size(block_size), detail_time(detail_time){}; 55 | 56 | void operator()(); 57 | }; 58 | 59 | /** 60 | * @brief Relation Algerbra kernal for PROJECTION Π 61 | * 62 | */ 63 | struct RelationalCopy { 64 | Relation *src_rel; 65 | RelationVersion src_ver; 66 | Relation *dest_rel; 67 | tuple_copy_hook tuple_generator; 68 | tuple_predicate tuple_pred; 69 | 70 | int grid_size; 71 | int block_size; 72 | bool copied = false; 73 | 74 | RelationalCopy(Relation *src, RelationVersion src_ver, Relation *dest, 75 | tuple_copy_hook tuple_generator, tuple_predicate tuple_pred, 76 | int grid_size, int block_size) 77 | : src_rel(src), src_ver(src_ver), dest_rel(dest), 78 | tuple_generator(tuple_generator), tuple_pred(tuple_pred), 79 | grid_size(grid_size), block_size(block_size) {} 80 | 81 | void operator()(); 82 | }; 83 | 84 | /** 85 | * @brief Relation Algebra kernel for sync up different indices of the same 86 | * relation. This RA operator must be added in the end of each SCC, it will 87 | * directly change the DELTA version of dest relation 88 | * 89 | */ 90 | struct RelationalACopy { 91 | Relation *src_rel; 92 | Relation *dest_rel; 93 | // function will be mapped on all tuple copied 94 | tuple_copy_hook tuple_generator; 95 | // filter for copied tuple 96 | tuple_predicate tuple_pred; 97 | 98 | int grid_size; 99 | int block_size; 100 | 101 | RelationalACopy(Relation *src, Relation *dest, 102 | tuple_copy_hook tuple_generator, tuple_predicate tuple_pred, 103 | int grid_size, int block_size) 104 | : src_rel(src), dest_rel(dest), tuple_generator(tuple_generator), 105 | tuple_pred(tuple_pred), grid_size(grid_size), block_size(block_size) { 106 | } 107 | 108 | void operator()(); 109 | }; 110 | 111 | /** 112 | * @brief possible RA types 113 | * 114 | */ 115 | using ra_op = std::variant; 116 | 117 | enum RAtypes { JOIN, COPY, ACOPY }; 118 | -------------------------------------------------------------------------------- /include/timer.cuh: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | // #include 4 | 5 | struct KernelTimer { 6 | cudaEvent_t start; 7 | cudaEvent_t stop; 8 | 9 | KernelTimer() { 10 | cudaEventCreate(&start); 11 | cudaEventCreate(&stop); 12 | } 13 | 14 | ~KernelTimer() { 15 | cudaEventDestroy(start); 16 | cudaEventDestroy(stop); 17 | } 18 | 19 | void start_timer() { cudaEventRecord(start, 0); } 20 | 21 | void stop_timer() { cudaEventRecord(stop, 0); } 22 | 23 | float get_spent_time() { 24 | float elapsed; 25 | cudaEventSynchronize(stop); 26 | cudaEventElapsedTime(&elapsed, start, stop); 27 | elapsed /= 1000.0; 28 | return elapsed; 29 | } 30 | }; 31 | 32 | struct Output { 33 | int block_size; 34 | int grid_size; 35 | long int input_rows; 36 | long int hashtable_rows; 37 | double load_factor; 38 | double initialization_time; 39 | double memory_clear_time; 40 | double read_time; 41 | double 
reverse_time; 42 | double hashtable_build_time; 43 | long int hashtable_build_rate; 44 | double join_time; 45 | double projection_time; 46 | double deduplication_time; 47 | double union_time; 48 | double total_time; 49 | const char *dataset_name; 50 | }; 51 | -------------------------------------------------------------------------------- /include/tuple.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | // #include 3 | #include 4 | 5 | using u64 = unsigned long long; 6 | using u32 = unsigned long; 7 | 8 | using column_type = u32; 9 | using tuple_type = column_type *; 10 | using tuple_size_t = u64; 11 | 12 | // TODO: use thrust vector as tuple type?? 13 | // using t_gpu_index = thrust::device_vector; 14 | // using t_gpu_tuple = thrust::device_vector; 15 | 16 | // using t_data_internal = thrust::device_vector; 17 | /** 18 | * @brief u64* to store the actual relation tuples, for serialize concern 19 | * 20 | */ 21 | using t_data_internal = u64 *; 22 | 23 | typedef void (*tuple_generator_hook)(tuple_type, tuple_type, tuple_type); 24 | typedef void (*tuple_copy_hook)(tuple_type, tuple_type); 25 | typedef bool (*tuple_predicate)(tuple_type); 26 | 27 | // struct tuple_generator_hook { 28 | // __host__ __device__ 29 | // void operator()(tuple_type inner, tuple_type outer, tuple_type newt) {}; 30 | // }; 31 | 32 | /** 33 | * @brief TODO: remove this use comparator function 34 | * 35 | * @param t1 36 | * @param t2 37 | * @param l 38 | * @return true 39 | * @return false 40 | */ 41 | __host__ __device__ inline bool tuple_eq(tuple_type t1, tuple_type t2, 42 | tuple_size_t l) { 43 | for (int i = 0; i < l; i++) { 44 | if (t1[i] != t2[i]) { 45 | return false; 46 | } 47 | } 48 | return true; 49 | } 50 | 51 | struct t_equal { 52 | u64 arity; 53 | 54 | t_equal(tuple_size_t arity) { this->arity = arity; } 55 | 56 | __host__ __device__ bool operator()(const tuple_type &lhs, 57 | const tuple_type &rhs) { 58 | for (int i = 0; i < arity; i++) { 59 | if (lhs[i] != rhs[i]) { 60 | return false; 61 | } 62 | } 63 | return true; 64 | } 65 | }; 66 | 67 | /** 68 | * @brief fnv1-a hash used in original slog backend 69 | * 70 | * @param start_ptr 71 | * @param prefix_len 72 | * @return __host__ __device__ 73 | */ 74 | __host__ __device__ inline column_type prefix_hash(tuple_type start_ptr, 75 | column_type prefix_len) { 76 | const column_type base = 2166136261U; 77 | const column_type prime = 16777619U; 78 | 79 | column_type hash = base; 80 | for (column_type i = 0; i < prefix_len; ++i) { 81 | column_type chunk = (column_type)start_ptr[i]; 82 | hash ^= chunk & 255U; 83 | hash *= prime; 84 | for (char j = 0; j < 3; ++j) { 85 | chunk = chunk >> 8; 86 | hash ^= chunk & 255U; 87 | hash *= prime; 88 | } 89 | } 90 | return hash; 91 | } 92 | 93 | // change to std 94 | struct tuple_indexed_less { 95 | 96 | // u64 *index_columns; 97 | tuple_size_t index_column_size; 98 | int arity; 99 | 100 | tuple_indexed_less(tuple_size_t index_column_size, int arity) { 101 | // this->index_columns = index_columns; 102 | this->index_column_size = index_column_size; 103 | this->arity = arity; 104 | } 105 | 106 | __host__ __device__ bool operator()(const tuple_type &lhs, 107 | const tuple_type &rhs) { 108 | // fetch the index 109 | // compare hash first, could be index very different but share the same 110 | // hash 111 | // same hash 112 | if (lhs == 0) { 113 | return false; 114 | } 115 | if (rhs == 0) { 116 | return true; 117 | } 118 | for (tuple_size_t i = 0; i < arity; i++) { 119 | if 
(lhs[i] < rhs[i]) { 120 | return true; 121 | } else if (lhs[i] > rhs[i]) { 122 | return false; 123 | } 124 | } 125 | return false; 126 | } 127 | }; 128 | 129 | struct tuple_indexed_less2 { 130 | 131 | // u64 *index_columns; 132 | tuple_size_t index_column_size; 133 | int arity; 134 | 135 | tuple_indexed_less2(tuple_size_t index_column_size, int arity) { 136 | // this->index_columns = index_columns; 137 | this->index_column_size = index_column_size; 138 | this->arity = arity; 139 | } 140 | 141 | __host__ __device__ bool operator()(const tuple_type &lhs, 142 | const tuple_type &rhs) { 143 | // fetch the index 144 | // compare hash first, could be index very different but share the same 145 | // hash 146 | // same hash 147 | if (lhs == 0) { 148 | return false; 149 | } 150 | if (rhs == 0) { 151 | return true; 152 | } 153 | if (lhs[0] < rhs[0]) { 154 | return true; 155 | } else if (lhs[0] > rhs[0]) { 156 | return false; 157 | } else { 158 | return lhs[1] < rhs[1]; 159 | } 160 | return false; 161 | } 162 | }; 163 | 164 | 165 | struct tuple_weak_less { 166 | 167 | int arity; 168 | 169 | tuple_weak_less(int arity) { this->arity = arity; } 170 | 171 | __host__ __device__ bool operator()(const tuple_type &lhs, 172 | const tuple_type &rhs) { 173 | 174 | for (u64 i = 0; i < arity; i++) { 175 | if (lhs[i] < rhs[i]) { 176 | return true; 177 | } else if (lhs[i] > rhs[i]) { 178 | return false; 179 | } 180 | } 181 | return false; 182 | }; 183 | }; 184 | 185 | // cuda kernel extract the k th column from tuples 186 | __global__ void extract_column(tuple_type *tuples, tuple_size_t rows, 187 | tuple_size_t k, column_type *column); 188 | 189 | __global__ void compute_hash(tuple_type *tuples, tuple_size_t rows, 190 | tuple_size_t index_column_size, 191 | column_type *hashes); 192 | 193 | void sort_tuples(tuple_type *tuples, tuple_size_t rows, tuple_size_t arity, 194 | tuple_size_t index_column_size, int grid_size, int block_size); 195 | 196 | void sort_tuple_by_hash(tuple_type *tuples, tuple_size_t rows, 197 | tuple_size_t arity, tuple_size_t index_column_size, 198 | int grid_size, int block_size); 199 | -------------------------------------------------------------------------------- /install_souffle.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harp-lab/gdlog/65a6ee960ced8d04bc725ccdfa68f004f8479226/install_souffle.sh -------------------------------------------------------------------------------- /run_cspa_all.sh: -------------------------------------------------------------------------------- 1 | 2 | echo "Preparing code and building CSPA" 3 | git stash && git checkout hash_diff 4 | cmake -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo -Bbuild . && cd build && make -j 5 | cd .. 6 | echo ">>>>>>>>>>>>>>>>>> Testing SG >>>>>>>>>>>>>>>>>" 7 | echo " >>>>> Testing GDlog: " 8 | echo "Generating result for TABEL IV" 9 | echo "Dataset : httpd" 10 | ./build/CSPA ./data/cspa/httpd 11 | echo "Dataset : linux" 12 | ./build/CSPA ./data/cspa/linux 13 | echo "Dataset : postgresql" 14 | ./build/CSPA ./data/cspa/postgresql 15 | 16 | -------------------------------------------------------------------------------- /run_sg_all.sh: -------------------------------------------------------------------------------- 1 | echo "Preparing code and building SG" 2 | git stash && git checkout hash_diff 3 | cmake -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo -Bbuild . && cd build && make -j 4 | cd .. 
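# Pre-flight check (a sketch, assuming the *.facts inputs are tracked with Git LFS
# as declared in .gitattributes): abort if the data files are still LFS pointers.
if grep -q "git-lfs" ./data/fe_body/edge.facts; then
    echo "data/*/edge.facts are still Git LFS pointer files; run 'git lfs pull' first"
    exit 1
fi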
5 | echo ">>>>>>>>>>>>>>>>>> Testing SG >>>>>>>>>>>>>>>>>" 6 | echo " >>>>> Testing GDlog: " 7 | echo "Generating result for TABLE III" 8 | echo "Dataset : fe_body" 9 | ./build/SG ./data/fe_body/edge.facts 10 | echo "Dataset : loc-Brightkite" 11 | ./build/SG ./data/loc-Brightkite/edge.facts 12 | echo "Dataset : fe-sphere" 13 | ./build/SG ./data/fe-sphere/edge.facts 14 | echo "Dataset : CA-HepTH" 15 | ./build/SG ./data/CA-HepTH/edge.facts 16 | echo "Dataset : SF.cedge" 17 | ./build/SG ./data/SF.cedge/edge.facts 18 | echo "Dataset : ego-Facebook" 19 | ./build/SG ./data/ego-Facebook/edge.facts 20 | -------------------------------------------------------------------------------- /run_tc_all.sh: -------------------------------------------------------------------------------- 1 | echo "Preparing code and building TC" 2 | git stash && git checkout main 3 | cmake -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo -Bbuild . && cd build && make -j 4 | cd .. 5 | echo ">>>>>>>>>>>>>>>>>> Testing REACH >>>>>>>>>>>>>>>>>" 6 | echo " >>>>> Testing GDlog: " 7 | echo " >>>>> Generating result for TABLE I" 8 | echo " >>>>> Dataset : usroad with EBM" 9 | ./build/TC ./data/usroad/edge.facts 0 10 | echo " >>>>> Dataset : usroad without EBM" 11 | ./build/TC ./data/usroad/edge.facts 1 12 | echo " >>>>> " 13 | echo " >>>>> Dataset : vsp_finan with EBM" 14 | ./build/TC ./data/vsp_finan/edge.facts 0 15 | echo " >>>>> Dataset : vsp_finan without EBM" 16 | ./build/TC ./data/vsp_finan/edge.facts 1 17 | echo " >>>>> " 18 | echo " >>>>> Dataset : fc_ocean with EBM" 19 | ./build/TC ./data/fc_ocean/edge.facts 0 20 | echo " >>>>> Dataset : fc_ocean without EBM" 21 | ./build/TC ./data/fc_ocean/edge.facts 1 22 | echo " >>>>> Dataset : com-dblp with EBM" 23 | ./build/TC ./data/com-dblp/edge.facts 0 24 | echo " >>>>> Dataset : com-dblp without EBM" 25 | ./build/TC ./data/com-dblp/edge.facts 1 26 | echo " >>>>> " 27 | echo " >>>>> Dataset : Gnutella31 with EBM" 28 | ./build/TC ./data/Gnutella31/edge.facts 0 29 | echo " >>>>> Dataset : Gnutella31 without EBM" 30 | ./build/TC ./data/Gnutella31/edge.facts 1 31 | echo " >>>>> " 32 | 33 | echo " >>>>> Testing GDlog: " 34 | echo " >>>>> Generating result for TABLE II" 35 | # echo "Dataset : usroad" 36 | # ./build/TC ./data/data_165435.txt 37 | echo "Dataset : fc_ocean" 38 | ./build/TC ./data/fc_ocean/edge.facts 0 39 | echo "Dataset : com-dblp" 40 | ./build/TC ./data/com-dblp/edge.facts 0 41 | echo "Dataset : vsp_finan" 42 | ./build/TC ./data/vsp_finan/edge.facts 0 43 | echo "Dataset : Gnutella31" 44 | ./build/TC ./data/Gnutella31/edge.facts 0 45 | echo "Dataset : fe_body" 46 | ./build/TC ./data/fe_body/edge.facts 0 47 | echo "Dataset : SF.cedge" 48 | ./build/TC ./data/SF.cedge/edge.facts 0 49 | 50 | -------------------------------------------------------------------------------- /src/acopy.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "../include/exception.cuh" 8 | #include "../include/print.cuh" 9 | #include "../include/relational_algebra.cuh" 10 | #include "../include/timer.cuh" 11 | 12 | void RelationalACopy::operator()() { 13 | 14 | GHashRelContainer *src = src_rel->newt; 15 | GHashRelContainer *dest = dest_rel->newt; 16 | std::cout << "ACopy " << src_rel->name << " to " << dest_rel->name 17 | << std::endl; 18 | 19 | if (src->tuple_counts == 0) { 20 | free_relation_container(dest); 21 | dest->tuple_counts = 0; 22 | return; 23 | } 24 | 25 | int output_arity =
dest_rel->arity; 26 | column_type *copied_raw_data; 27 | u64 copied_raw_data_size = 28 | src->tuple_counts * output_arity * sizeof(column_type); 29 | checkCuda(cudaMalloc((void **)&copied_raw_data, copied_raw_data_size)); 30 | checkCuda(cudaMemset(copied_raw_data, 0, copied_raw_data_size)); 31 | get_copy_result<<>>(src->tuples, copied_raw_data, 32 | output_arity, src->tuple_counts, 33 | tuple_generator); 34 | checkCuda(cudaGetLastError()); 35 | checkCuda(cudaDeviceSynchronize()); 36 | 37 | free_relation_container(dest); 38 | float detail_time[5] = {0, 0, 0, 0, 0}; 39 | // TODO: swap to repartition_relation_index in future 40 | load_relation_container(dest, dest->arity, copied_raw_data, 41 | src->tuple_counts, src->index_column_size, 42 | dest->dependent_column_size, 0.8, grid_size, 43 | block_size, detail_time, true, false, true); 44 | checkCuda(cudaDeviceSynchronize()); 45 | // print_tuple_rows(dest, "delta"); 46 | // merge delta to full immediately here 47 | // dest_rel->flush_delta(grid_size, block_size); 48 | // std::cout << dest->tuple_counts << std::endl; 49 | // print_tuple_rows(dest, "acopied"); 50 | } 51 | -------------------------------------------------------------------------------- /src/copy.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "../include/exception.cuh" 8 | #include "../include/print.cuh" 9 | #include "../include/relational_algebra.cuh" 10 | #include "../include/timer.cuh" 11 | 12 | void RelationalCopy::operator()() { 13 | checkCuda(cudaDeviceSynchronize()); 14 | GHashRelContainer *src; 15 | if (src_ver == DELTA) { 16 | src = src_rel->delta; 17 | } else { 18 | src = src_rel->full; 19 | } 20 | GHashRelContainer *dest = dest_rel->newt; 21 | // std::cout << "Copy " << src_rel->name << " to " << dest_rel->name 22 | // << std::endl; 23 | 24 | if (src->tuple_counts == 0) { 25 | dest_rel->newt->tuple_counts = 0; 26 | return; 27 | } 28 | 29 | int output_arity = dest_rel->arity; 30 | column_type *copied_raw_data; 31 | u64 copied_raw_data_size = 32 | src->tuple_counts * output_arity * sizeof(column_type); 33 | checkCuda(cudaMalloc((void **)&copied_raw_data, copied_raw_data_size)); 34 | checkCuda(cudaMemset(copied_raw_data, 0, copied_raw_data_size)); 35 | get_copy_result<<>>(src->tuples, copied_raw_data, 36 | output_arity, src->tuple_counts, 37 | tuple_generator); 38 | checkCuda(cudaGetLastError()); 39 | checkCuda(cudaDeviceSynchronize()); 40 | float load_relation_container_time[5] = {0, 0, 0, 0, 0}; 41 | 42 | if (dest->tuples == nullptr || dest->tuple_counts == 0) { 43 | free_relation_container(dest); 44 | load_relation_container( 45 | dest, dest->arity, copied_raw_data, src->tuple_counts, 46 | src->index_column_size, dest->dependent_column_size, 0.8, grid_size, 47 | block_size, load_relation_container_time, true, false, false); 48 | } else { 49 | GHashRelContainer *tmp = new GHashRelContainer( 50 | dest->arity, dest->index_column_size, dest->dependent_column_size); 51 | load_relation_container( 52 | tmp, dest->arity, copied_raw_data, src->tuple_counts, 53 | src->index_column_size, dest->dependent_column_size, 0.8, grid_size, 54 | block_size, load_relation_container_time, true, false, false); 55 | checkCuda(cudaDeviceSynchronize()); 56 | // merge to newt 57 | GHashRelContainer *old_newt = dest; 58 | tuple_type *tp_buffer; 59 | u64 tp_buffer_mem_size = 60 | (old_newt->tuple_counts + src->tuple_counts) * sizeof(tuple_type); 61 | checkCuda(cudaMalloc((void 
**)&tp_buffer, tp_buffer_mem_size)); 62 | checkCuda(cudaMemset(tp_buffer, 0, tp_buffer_mem_size)); 63 | tuple_type *tp_buffer_end = thrust::merge( 64 | thrust::device, old_newt->tuples, 65 | old_newt->tuples + old_newt->tuple_counts, tmp->tuples, 66 | tmp->tuples + tmp->tuple_counts, tp_buffer, 67 | tuple_indexed_less(dest->index_column_size, output_arity)); 68 | // checkCuda(cudaDeviceSynchronize()); 69 | // checkCuda(cudaFree(tmp->tuples)); 70 | // checkCuda(cudaFree(old_newt->tuples)); 71 | tp_buffer_end = thrust::unique(thrust::device, tp_buffer, tp_buffer_end, 72 | t_equal(output_arity)); 73 | checkCuda(cudaDeviceSynchronize()); 74 | tuple_size_t new_newt_counts = tp_buffer_end - tp_buffer; 75 | // std::cout << " >>>>>>>>>> " << new_newt_counts << std::endl; 76 | column_type *new_newt_raw; 77 | u64 new_newt_raw_mem_size = 78 | new_newt_counts * output_arity * sizeof(column_type); 79 | checkCuda(cudaMalloc((void **)&new_newt_raw, new_newt_raw_mem_size)); 80 | flatten_tuples_raw_data<<>>( 81 | tp_buffer, new_newt_raw, new_newt_counts, output_arity); 82 | checkCuda(cudaGetLastError()); 83 | checkCuda(cudaDeviceSynchronize()); 84 | checkCuda(cudaFree(tp_buffer)); 85 | free_relation_container(old_newt); 86 | free_relation_container(tmp); 87 | load_relation_container(dest, output_arity, new_newt_raw, 88 | new_newt_counts, dest->index_column_size, 89 | dest->dependent_column_size, 0.8, grid_size, 90 | block_size, load_relation_container_time, true, 91 | true, false); 92 | // delete tmp; 93 | } 94 | // std::cout << "copy finish " << std::endl; 95 | } 96 | -------------------------------------------------------------------------------- /src/join.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "../include/exception.cuh" 8 | #include "../include/print.cuh" 9 | #include "../include/relational_algebra.cuh" 10 | #include "../include/timer.cuh" 11 | 12 | void RelationalJoin::operator()() { 13 | 14 | bool output_is_tmp = output_rel->tmp_flag; 15 | GHashRelContainer *inner; 16 | if (inner_ver == DELTA) { 17 | inner = inner_rel->delta; 18 | } else { 19 | inner = inner_rel->full; 20 | } 21 | GHashRelContainer *outer; 22 | if (outer_ver == DELTA) { 23 | outer = outer_rel->delta; 24 | } else if (outer_ver == FULL) { 25 | outer = outer_rel->full; 26 | } else { 27 | // temp relation can be outer relation 28 | outer = outer_rel->newt; 29 | } 30 | int output_arity = output_rel->arity; 31 | // GHashRelContainer* output = output_rel->newt; 32 | 33 | // std::cout << "inner " << inner_rel->name << " : " << inner->tuple_counts 34 | // << " outer " << outer_rel->name << " : " << outer->tuple_counts 35 | // << std::endl; 36 | // print_tuple_rows(inner, "inner"); 37 | // print_tuple_rows(outer, "outer"); 38 | if (outer->tuples == nullptr || outer->tuple_counts == 0) { 39 | outer->tuple_counts = 0; 40 | return; 41 | } 42 | if (inner->tuples == nullptr || inner->tuple_counts == 0) { 43 | outer->tuple_counts = 0; 44 | return; 45 | } 46 | 47 | KernelTimer timer; 48 | // checkCuda(cudaDeviceSynchronize()); 49 | GHashRelContainer *inner_device; 50 | checkCuda(cudaMalloc((void **)&inner_device, sizeof(GHashRelContainer))); 51 | checkCuda(cudaMemcpy(inner_device, inner, sizeof(GHashRelContainer), 52 | cudaMemcpyHostToDevice)); 53 | GHashRelContainer *outer_device; 54 | checkCuda(cudaMalloc((void **)&outer_device, sizeof(GHashRelContainer))); 55 | checkCuda(cudaMemcpy(outer_device, outer, 
sizeof(GHashRelContainer), 56 | cudaMemcpyHostToDevice)); 57 | 58 | tuple_size_t *result_counts_array; 59 | checkCuda(cudaMalloc((void **)&result_counts_array, 60 | outer->tuple_counts * sizeof(tuple_size_t))); 61 | checkCuda(cudaMemset(result_counts_array, 0, 62 | outer->tuple_counts * sizeof(tuple_size_t))); 63 | 64 | // print_tuple_rows(outer, "inber"); 65 | // checkCuda(cudaDeviceSynchronize()); 66 | timer.start_timer(); 67 | checkCuda(cudaDeviceSynchronize()); 68 | get_join_result_size<<>>( 69 | inner_device, outer_device, outer->index_column_size, tuple_generator, 70 | tuple_pred, result_counts_array); 71 | checkCuda(cudaGetLastError()); 72 | checkCuda(cudaDeviceSynchronize()); 73 | timer.stop_timer(); 74 | this->detail_time[0] += timer.get_spent_time(); 75 | 76 | timer.start_timer(); 77 | tuple_size_t total_result_rows = 0; 78 | for (tuple_size_t i = 0; i < outer->tuple_counts; i = i + MAX_REDUCE_SIZE) { 79 | tuple_size_t reduce_size = MAX_REDUCE_SIZE; 80 | if (i + MAX_REDUCE_SIZE > outer->tuple_counts) { 81 | reduce_size = outer->tuple_counts - i; 82 | } 83 | tuple_size_t reduce_v = thrust::reduce( 84 | thrust::device, result_counts_array + i, 85 | result_counts_array + i + reduce_size, 0); 86 | total_result_rows += reduce_v; 87 | // checkCuda(cudaDeviceSynchronize()); 88 | } 89 | 90 | // std::cout << output_rel->name << " " << outer->index_column_size 91 | // << " join result size(non dedup) " << total_result_rows 92 | // << std::endl; 93 | // print_memory_usage(); 94 | tuple_size_t *result_counts_offset; 95 | checkCuda(cudaMalloc((void **)&result_counts_offset, 96 | outer->tuple_counts * sizeof(tuple_size_t))); 97 | checkCuda(cudaMemcpy(result_counts_offset, result_counts_array, 98 | outer->tuple_counts * sizeof(tuple_size_t), 99 | cudaMemcpyDeviceToDevice)); 100 | thrust::exclusive_scan(thrust::device, result_counts_offset, 101 | result_counts_offset + outer->tuple_counts, 102 | result_counts_offset); 103 | 104 | checkCuda(cudaDeviceSynchronize()); 105 | timer.stop_timer(); 106 | detail_time[1] += timer.get_spent_time(); 107 | 108 | timer.start_timer(); 109 | column_type *join_res_raw_data; 110 | u64 join_res_raw_data_mem_size = 111 | total_result_rows * output_arity * sizeof(column_type); 112 | checkCuda( 113 | cudaMalloc((void **)&join_res_raw_data, join_res_raw_data_mem_size)); 114 | checkCuda(cudaMemset(join_res_raw_data, 0, join_res_raw_data_mem_size)); 115 | get_join_result<<>>( 116 | inner_device, outer_device, outer->index_column_size, tuple_generator, 117 | tuple_pred, output_arity, join_res_raw_data, result_counts_array, 118 | result_counts_offset, direction); 119 | checkCuda(cudaGetLastError()); 120 | checkCuda(cudaDeviceSynchronize()); 121 | timer.stop_timer(); 122 | detail_time[2] += timer.get_spent_time(); 123 | checkCuda(cudaFree(result_counts_array)); 124 | checkCuda(cudaFree(result_counts_offset)); 125 | 126 | float load_relation_container_time[5] = {0, 0, 0, 0, 0}; 127 | // // reload newt 128 | // free_relation(output_newt); 129 | // newt don't need index 130 | if (output_rel->newt->tuples == nullptr || 131 | output_rel->newt->tuple_counts == 0) { 132 | if (disable_load) { 133 | return; 134 | } 135 | if (!output_is_tmp) { 136 | load_relation_container( 137 | output_rel->newt, output_arity, join_res_raw_data, 138 | total_result_rows, output_rel->index_column_size, 139 | output_rel->dependent_column_size, 0.8, grid_size, block_size, 140 | load_relation_container_time, true, false, false); 141 | } else { 142 | // temporary relation doesn't need index nor sort 143 
| // std::cout << "use tmp >>>>>>>>>>>>>>>>>>>>>>>>>>>>>" << 144 | // std::endl; 145 | load_relation_container( 146 | output_rel->newt, output_arity, join_res_raw_data, 147 | total_result_rows, output_rel->index_column_size, 148 | output_rel->dependent_column_size, 0.8, grid_size, block_size, 149 | load_relation_container_time, true, true, false); 150 | output_rel->newt->tmp_flag = true; 151 | } 152 | checkCuda(cudaDeviceSynchronize()); 153 | detail_time[3] += load_relation_container_time[0]; 154 | detail_time[4] += load_relation_container_time[1]; 155 | detail_time[5] += load_relation_container_time[2]; 156 | // print_tuple_rows(output_rel->newt, "newt after join"); 157 | } else { 158 | // TODO: handle the case out put relation is temp relation 159 | // data in current newt, merge 160 | if (!output_is_tmp) { 161 | GHashRelContainer *newt_tmp = new GHashRelContainer( 162 | output_rel->arity, output_rel->index_column_size, 163 | output_rel->dependent_column_size); 164 | GHashRelContainer *old_newt = output_rel->newt; 165 | load_relation_container( 166 | newt_tmp, output_arity, join_res_raw_data, total_result_rows, 167 | output_rel->index_column_size, 168 | output_rel->dependent_column_size, 0.8, grid_size, block_size, 169 | load_relation_container_time, true, false, false); 170 | detail_time[3] += load_relation_container_time[0]; 171 | detail_time[4] += load_relation_container_time[1]; 172 | detail_time[5] += load_relation_container_time[2]; 173 | // checkCuda(cudaDeviceSynchronize()); 174 | tuple_type *tp_buffer; 175 | u64 tp_buffer_mem_size = 176 | (newt_tmp->tuple_counts + old_newt->tuple_counts) * 177 | sizeof(tuple_type); 178 | checkCuda(cudaMalloc((void **)&tp_buffer, tp_buffer_mem_size)); 179 | cudaMemset(tp_buffer, 0, tp_buffer_mem_size); 180 | timer.start_timer(); 181 | tuple_type *tp_buffer_end = thrust::merge( 182 | thrust::device, newt_tmp->tuples, 183 | newt_tmp->tuples + newt_tmp->tuple_counts, old_newt->tuples, 184 | old_newt->tuples + old_newt->tuple_counts, tp_buffer, 185 | tuple_indexed_less(output_rel->index_column_size, 186 | output_rel->arity)); 187 | // checkCuda(cudaDeviceSynchronize()); 188 | timer.stop_timer(); 189 | detail_time[6] += timer.get_spent_time(); 190 | // cudaFree(newt_tmp->tuples); 191 | // cudaFree(old_newt->tuples); 192 | timer.start_timer(); 193 | tp_buffer_end = 194 | thrust::unique(thrust::device, tp_buffer, tp_buffer_end, 195 | t_equal(output_rel->arity)); 196 | checkCuda(cudaDeviceSynchronize()); 197 | timer.stop_timer(); 198 | detail_time[7] += timer.get_spent_time(); 199 | tuple_size_t new_newt_counts = tp_buffer_end - tp_buffer; 200 | // std::cout << " >>>>>>>>>> " << new_newt_counts * 201 | // output_rel->arity * sizeof(column_type) << std::endl; 202 | 203 | timer.start_timer(); 204 | column_type *new_newt_raw; 205 | u64 new_newt_raw_mem_size = 206 | new_newt_counts * output_rel->arity * sizeof(column_type); 207 | checkCuda( 208 | cudaMalloc((void **)&new_newt_raw, new_newt_raw_mem_size)); 209 | checkCuda(cudaMemset(new_newt_raw, 0, new_newt_raw_mem_size)); 210 | flatten_tuples_raw_data<<>>( 211 | tp_buffer, new_newt_raw, new_newt_counts, output_rel->arity); 212 | checkCuda(cudaGetLastError()); 213 | checkCuda(cudaDeviceSynchronize()); 214 | timer.stop_timer(); 215 | detail_time[4] += timer.get_spent_time(); 216 | checkCuda(cudaFree(tp_buffer)); 217 | free_relation_container(old_newt); 218 | free_relation_container(newt_tmp); 219 | // TODO: free newt_tmp pointer 220 | load_relation_container( 221 | output_rel->newt, output_arity, 
new_newt_raw, new_newt_counts, 222 | output_rel->index_column_size, 223 | output_rel->dependent_column_size, 0.8, grid_size, block_size, 224 | load_relation_container_time, true, true, false); 225 | checkCuda(cudaDeviceSynchronize()); 226 | } else { 227 | // output relation is tmp relation, directly merge without sort 228 | GHashRelContainer *old_newt = output_rel->newt; 229 | column_type *newt_tmp_raw; 230 | u64 newt_tmp_raw_mem_size = 231 | (old_newt->tuple_counts + total_result_rows) * 232 | output_rel->arity * sizeof(column_type); 233 | tuple_size_t new_newt_counts = 234 | old_newt->tuple_counts + total_result_rows; 235 | checkCuda( 236 | cudaMalloc((void **)&newt_tmp_raw, newt_tmp_raw_mem_size)); 237 | checkCuda(cudaMemcpy(newt_tmp_raw, old_newt->data_raw, 238 | old_newt->tuple_counts * old_newt->arity * 239 | sizeof(column_type), 240 | cudaMemcpyDeviceToDevice)); 241 | checkCuda(cudaMemcpy( 242 | &(newt_tmp_raw[old_newt->tuple_counts * old_newt->arity]), 243 | join_res_raw_data, 244 | total_result_rows * output_rel->arity * sizeof(column_type), 245 | cudaMemcpyDeviceToDevice)); 246 | free_relation_container(old_newt); 247 | checkCuda(cudaFree(join_res_raw_data)); 248 | load_relation_container( 249 | output_rel->newt, output_arity, newt_tmp_raw, new_newt_counts, 250 | output_rel->index_column_size, 251 | output_rel->dependent_column_size, 0.8, grid_size, block_size, 252 | load_relation_container_time, true, true, false); 253 | checkCuda(cudaDeviceSynchronize()) 254 | } 255 | 256 | detail_time[3] += load_relation_container_time[0]; 257 | detail_time[4] += load_relation_container_time[1]; 258 | detail_time[5] += load_relation_container_time[2]; 259 | // print_tuple_rows(output_rel->newt, "join merge newt"); 260 | // delete newt_tmp; 261 | } 262 | 263 | // print_tuple_rows(output_rel->newt, "output_newtr"); 264 | // checkCuda(cudaDeviceSynchronize()); 265 | // std::cout << output_rel->name << " join result size " << 266 | // output_rel->newt->tuple_counts < 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | void LIE::add_ra(ra_op op) { ra_ops.push_back(op); } 14 | 15 | void LIE::add_relations(Relation *rel, bool static_flag) { 16 | if (static_flag) { 17 | static_relations.push_back(rel); 18 | } else { 19 | update_relations.push_back(rel); 20 | // add delta and newt for it 21 | } 22 | } 23 | 24 | void LIE::add_tmp_relation(Relation *rel) { tmp_relations.push_back(rel); } 25 | 26 | void LIE::fixpoint_loop() { 27 | 28 | int iteration_counter = 0; 29 | float join_time = 0; 30 | float merge_time = 0; 31 | float rebuild_time = 0; 32 | float flatten_time = 0; 33 | float set_diff_time = 0; 34 | float rebuild_delta_time = 0; 35 | float flatten_full_time = 0; 36 | float memory_alloc_time = 0; 37 | 38 | float join_get_size_time = 0; 39 | float join_get_result_time = 0; 40 | float rebuild_newt_time = 0; 41 | KernelTimer timer; 42 | 43 | float rebuild_rel_sort_time = 0; 44 | float rebuild_rel_unique_time = 0; 45 | float rebuild_rel_index_time = 0; 46 | 47 | // std::cout << "start lie .... 
" << std::endl; 48 | // init full tuple buffer for all relation involved 49 | for (Relation *rel : update_relations) { 50 | checkCuda(cudaMalloc((void **)&rel->tuple_full, 51 | rel->full->tuple_counts * sizeof(tuple_type))); 52 | checkCuda(cudaMemcpy(rel->tuple_full, rel->full->tuples, 53 | rel->full->tuple_counts * sizeof(tuple_type), 54 | cudaMemcpyDeviceToDevice)); 55 | rel->current_full_size = rel->full->tuple_counts; 56 | copy_relation_container(rel->delta, rel->full, grid_size, block_size); 57 | checkCuda(cudaDeviceSynchronize()); 58 | // std::cout << "wwwwwwwwww" << rel->delta->tuple_counts << std::endl; 59 | } 60 | 61 | while (true) { 62 | for (auto &ra_op : ra_ops) { 63 | timer.start_timer(); 64 | std::visit(dynamic_dispatch{[](RelationalJoin &op) { 65 | // timer.start_timer(); 66 | op(); 67 | }, 68 | [](RelationalACopy &op) { op(); }, 69 | [](RelationalCopy &op) { 70 | if (op.src_ver == FULL) { 71 | if (!op.copied) { 72 | op(); 73 | op.copied = true; 74 | } 75 | } else { 76 | op(); 77 | } 78 | }}, 79 | ra_op); 80 | timer.stop_timer(); 81 | join_time += timer.get_spent_time(); 82 | } 83 | 84 | // clean tmp relation 85 | for (Relation *rel : tmp_relations) { 86 | free_relation_container(rel->newt); 87 | } 88 | 89 | // std::cout << "Iteration " << iteration_counter 90 | // << " popluating new tuple" << std::endl; 91 | // merge delta into full 92 | bool fixpoint_flag = true; 93 | for (Relation *rel : update_relations) { 94 | // std::cout << rel->name << std::endl; 95 | // if (rel->newt->tuple_counts != 0) { 96 | // fixpoint_flag = false; 97 | // } 98 | if (iteration_counter == 0) { 99 | free_relation_container(rel->delta); 100 | } 101 | // drop the index of delta once merged, because it won't be used in 102 | // next iter when migrate more general case, this operation need to 103 | // be put off to end of all RA operation in current iteration 104 | if (rel->delta->index_map != nullptr) { 105 | checkCuda(cudaFree(rel->delta->index_map)); 106 | rel->delta->index_map = nullptr; 107 | } 108 | if (rel->delta->tuples != nullptr) { 109 | checkCuda(cudaFree(rel->delta->tuples)); 110 | rel->delta->tuples = nullptr; 111 | } 112 | 113 | timer.start_timer(); 114 | if (rel->newt->tuple_counts == 0) { 115 | rel->delta = 116 | new GHashRelContainer(rel->arity, rel->index_column_size, 117 | rel->dependent_column_size); 118 | // std::cout << "iteration " << iteration_counter << " relation " 119 | // << rel->name << " no new tuple added" << std::endl; 120 | continue; 121 | } 122 | tuple_type *deduplicated_newt_tuples; 123 | u64 deduplicated_newt_tuples_mem_size = 124 | rel->newt->tuple_counts * sizeof(tuple_type); 125 | checkCuda(cudaMalloc((void **)&deduplicated_newt_tuples, 126 | deduplicated_newt_tuples_mem_size)); 127 | checkCuda(cudaMemset(deduplicated_newt_tuples, 0, 128 | deduplicated_newt_tuples_mem_size)); 129 | ////// 130 | 131 | tuple_type *deuplicated_end = thrust::set_difference( 132 | thrust::device, rel->newt->tuples, 133 | rel->newt->tuples + rel->newt->tuple_counts, rel->tuple_full, 134 | rel->tuple_full + rel->current_full_size, 135 | deduplicated_newt_tuples, 136 | tuple_indexed_less(rel->full->index_column_size, 137 | rel->full->arity - 138 | rel->dependent_column_size)); 139 | // checkCuda(cudaDeviceSynchronize()); 140 | tuple_size_t deduplicate_size = 141 | deuplicated_end - deduplicated_newt_tuples; 142 | 143 | if (deduplicate_size != 0) { 144 | fixpoint_flag = false; 145 | } 146 | timer.stop_timer(); 147 | set_diff_time += timer.get_spent_time(); 148 | 149 | column_type 
*deduplicated_raw; 150 | u64 dedeuplicated_raw_mem_size = 151 | deduplicate_size * rel->newt->arity * sizeof(column_type); 152 | checkCuda(cudaMalloc((void **)&deduplicated_raw, 153 | dedeuplicated_raw_mem_size)); 154 | checkCuda( 155 | cudaMemset(deduplicated_raw, 0, dedeuplicated_raw_mem_size)); 156 | flatten_tuples_raw_data<<>>( 157 | deduplicated_newt_tuples, deduplicated_raw, deduplicate_size, 158 | rel->newt->arity); 159 | checkCuda(cudaGetLastError()); 160 | checkCuda(cudaDeviceSynchronize()); 161 | checkCuda(cudaFree(deduplicated_newt_tuples)); 162 | 163 | free_relation_container(rel->newt); 164 | 165 | timer.start_timer(); 166 | float load_detail_time[5] = {0, 0, 0, 0, 0}; 167 | rel->delta = new GHashRelContainer( 168 | rel->arity, rel->index_column_size, rel->dependent_column_size); 169 | load_relation_container( 170 | rel->delta, rel->full->arity, deduplicated_raw, 171 | deduplicate_size, rel->full->index_column_size, 172 | rel->full->dependent_column_size, 173 | rel->full->index_map_load_factor, grid_size, block_size, 174 | load_detail_time, true, true, true); 175 | // checkCuda(cudaDeviceSynchronize()); 176 | timer.stop_timer(); 177 | rebuild_delta_time += timer.get_spent_time(); 178 | rebuild_rel_sort_time += load_detail_time[0]; 179 | rebuild_rel_unique_time += load_detail_time[1]; 180 | rebuild_rel_index_time += load_detail_time[2]; 181 | 182 | // auto old_full = rel->tuple_full; 183 | float flush_detail_time[5] = {0, 0, 0, 0, 0}; 184 | timer.start_timer(); 185 | rel->flush_delta(grid_size, block_size, flush_detail_time); 186 | timer.stop_timer(); 187 | merge_time += flush_detail_time[1]; 188 | memory_alloc_time += flush_detail_time[0]; 189 | memory_alloc_time += flush_detail_time[2]; 190 | // checkCuda(cudaFree(old_full)); 191 | 192 | // print_tuple_rows(rel->full, "Path full after load newt"); 193 | // std::cout << "iteration " << iteration_counter << " relation " 194 | // << rel->name 195 | // << " finish dedup new tuples : " << deduplicate_size 196 | // << " delta tuple size: " << rel->delta->tuple_counts 197 | // << " full counts " << rel->current_full_size << std::endl; 198 | } 199 | checkCuda(cudaDeviceSynchronize()); 200 | // std::cout << "Iteration " << iteration_counter << " finish populating" 201 | // << std::endl; 202 | 203 | iteration_counter++; 204 | // if (iteration_counter >= 3) { 205 | // break; 206 | // } 207 | 208 | if (fixpoint_flag || iteration_counter > max_iteration) { 209 | // print_memory_usage(); 210 | // std::cout << "Iteration : " << iteration_counter 211 | // << "Join time: " << join_time 212 | // << " ; merge full time: " << merge_time 213 | // << " ; memory alloc time: " << memory_alloc_time 214 | // << " ; rebuild delta time: " << rebuild_delta_time 215 | // << " ; set diff time: " << set_diff_time << std::endl; 216 | break; 217 | } 218 | } 219 | // merge full after reach fixpoint 220 | timer.start_timer(); 221 | if (reload_full_flag) { 222 | // std::cout << "Start merge full" << std::endl; 223 | for (Relation *rel : update_relations) { 224 | // if (rel->current_full_size <= rel->full->tuple_counts) { 225 | // continue; 226 | // } 227 | column_type *new_full_raw_data; 228 | u64 new_full_raw_data_mem_size = 229 | rel->current_full_size * rel->full->arity * sizeof(column_type); 230 | checkCuda(cudaMalloc((void **)&new_full_raw_data, 231 | new_full_raw_data_mem_size)); 232 | checkCuda(cudaMemset(new_full_raw_data, 0, new_full_raw_data_mem_size)); 233 | flatten_tuples_raw_data<<>>( 234 | rel->tuple_full, new_full_raw_data, rel->current_full_size, 
235 | rel->full->arity); 236 | checkCuda(cudaGetLastError()); 237 | checkCuda(cudaDeviceSynchronize()); 238 | // cudaFree(tuple_merge_buffer); 239 | float load_detail_time[5] = {0, 0, 0, 0, 0}; 240 | load_relation_container( 241 | rel->full, rel->full->arity, new_full_raw_data, 242 | rel->current_full_size, rel->full->index_column_size, 243 | rel->full->dependent_column_size, rel->full->index_map_load_factor, 244 | grid_size, block_size, load_detail_time, true, true, true); 245 | checkCuda(cudaDeviceSynchronize()); 246 | rebuild_rel_sort_time += load_detail_time[0]; 247 | rebuild_rel_unique_time += load_detail_time[1]; 248 | rebuild_rel_index_time += load_detail_time[2]; 249 | // std::cout << "Finished! " << rel->name << " has " 250 | // << rel->full->tuple_counts << std::endl; 251 | for (auto &delta_b : rel->buffered_delta_vectors) { 252 | free_relation_container(delta_b); 253 | } 254 | free_relation_container(rel->delta); 255 | free_relation_container(rel->newt); 256 | } 257 | } else { 258 | // for (Relation *rel : update_relations) { 259 | // std::cout << "Finished! " << rel->name << " has " 260 | // << rel->full->tuple_counts << std::endl; 261 | // } 262 | } 263 | timer.stop_timer(); 264 | float merge_full_time = timer.get_spent_time(); 265 | 266 | std::cout << " memory alloc time: " << memory_alloc_time 267 | << " ; Join time: " << join_time 268 | << " ; merge full time: " << merge_time 269 | << " ; rebuild full time: " << merge_full_time 270 | << " ; rebuild delta time: " << rebuild_delta_time 271 | << " ; set diff time: " << set_diff_time << std::endl; 272 | std::cout << "Rebuild relation detail time : rebuild rel sort time: " 273 | << rebuild_rel_sort_time 274 | << " ; rebuild rel unique time: " << rebuild_rel_unique_time 275 | << " ; rebuild rel index time: " << rebuild_rel_index_time 276 | << std::endl; 277 | } 278 | -------------------------------------------------------------------------------- /src/print.cu: -------------------------------------------------------------------------------- 1 | #include "../include/print.cuh" 2 | #include 3 | #include 4 | 5 | void print_hashes(GHashRelContainer *target, const char *rel_name) { 6 | MEntity *host_map; 7 | cudaMallocHost((void **)&host_map, 8 | target->index_map_size * sizeof(MEntity)); 9 | cudaMemcpy(host_map, target->index_map, 10 | target->index_map_size * sizeof(MEntity), 11 | cudaMemcpyDeviceToHost); 12 | std::cout << "Relation hash >>> " << rel_name << std::endl; 13 | for (tuple_size_t i = 0; i < target->index_map_size; i++) { 14 | std::cout << host_map[i].key << " " << host_map[i].value 15 | << std::endl; 16 | } 17 | std::cout << "end <<<" << std::endl; 18 | cudaFreeHost(host_map); 19 | } 20 | 21 | void print_tuple_rows(GHashRelContainer* target, const char *rel_name) { 22 | // sort first 23 | tuple_type* natural_ordered; 24 | cudaMalloc((void**) &natural_ordered, target->tuple_counts * sizeof(tuple_type)); 25 | cudaMemcpy(natural_ordered, target->tuples, target->tuple_counts * sizeof(tuple_type), 26 | cudaMemcpyDeviceToDevice); 27 | thrust::sort(thrust::device, natural_ordered, natural_ordered+target->tuple_counts, 28 | tuple_weak_less(target->arity)); 29 | 30 | tuple_type* tuples_host; 31 | cudaMallocHost((void**) &tuples_host, target->tuple_counts * sizeof(tuple_type)); 32 | cudaMemcpy(tuples_host, natural_ordered, target->tuple_counts * sizeof(tuple_type), 33 | cudaMemcpyDeviceToHost); 34 | std::cout << "Relation tuples >>> " << rel_name << std::endl; 35 | std::cout << "Total tuples counts: " << target->tuple_counts << 
std::endl; 36 | u32 pt_size = target->tuple_counts; 37 | if (target->tuple_counts > 3000) { 38 | pt_size = 3000; 39 | } 40 | for (tuple_size_t i = 0; i < pt_size; i++) { 41 | tuple_type cur_tuple = tuples_host[i]; 42 | 43 | tuple_type cur_tuple_host; 44 | cudaMallocHost((void**) &cur_tuple_host, target->arity * sizeof(column_type)); 45 | cudaMemcpy(cur_tuple_host, cur_tuple, target->arity * sizeof(column_type), 46 | cudaMemcpyDeviceToHost); 47 | // if (cur_tuple_host[0] != 1966) { 48 | // continue; 49 | // } 50 | for (int j = 0; j < target->arity; j++) { 51 | 52 | std::cout << cur_tuple_host[j] << "\t"; 53 | } 54 | std::cout << std::endl; 55 | cudaFreeHost(cur_tuple_host); 56 | } 57 | if (target->tuple_counts > 3000) { 58 | std::cout << "........." << std::endl; 59 | } 60 | std::cout << "end <<<" << std::endl; 61 | 62 | cudaFreeHost(tuples_host); 63 | cudaFree(natural_ordered); 64 | } 65 | 66 | void print_tuple_raw_data(GHashRelContainer* target, const char *rel_name) { 67 | column_type* raw_data_host; 68 | u64 mem_raw = target->data_raw_row_size * target->arity * sizeof(column_type); 69 | cudaMallocHost((void**) &raw_data_host, mem_raw); 70 | cudaMemcpy(raw_data_host, target->data_raw, mem_raw, cudaMemcpyDeviceToHost); 71 | std::cout << "Relation raw tuples >>> " << rel_name << std::endl; 72 | std::cout << "Total raw tuples counts: " << target->data_raw_row_size << std::endl; 73 | for (tuple_size_t i = 0; i < target->data_raw_row_size; i++) { 74 | if (raw_data_host[i*target->arity] != 3) { 75 | continue; 76 | } 77 | for (int j = 0; j < target->arity; j++) { 78 | std::cout << raw_data_host[i*target->arity + j] << " "; 79 | } 80 | std::cout << std::endl; 81 | } 82 | cudaFreeHost(raw_data_host); 83 | } 84 | 85 | void print_memory_usage(){ 86 | int num_gpus; 87 | size_t free, total; 88 | cudaGetDeviceCount( &num_gpus ); 89 | for ( int gpu_id = 0; gpu_id < num_gpus; gpu_id++ ) { 90 | cudaSetDevice( gpu_id ); 91 | int id; 92 | cudaGetDevice( &id ); 93 | cudaMemGetInfo( &free, &total ); 94 | std::cout << "GPU " << id << " memory: free=" << free << ", total=" << total << std::endl; 95 | } 96 | } 97 | 98 | tuple_size_t get_free_memory() { 99 | int num_gpus; 100 | size_t free, total; 101 | cudaGetDeviceCount( &num_gpus ); 102 | for ( int gpu_id = 0; gpu_id < num_gpus; gpu_id++ ) { 103 | cudaSetDevice( gpu_id ); 104 | int id; 105 | cudaGetDevice( &id ); 106 | cudaMemGetInfo( &free, &total ); 107 | return free; 108 | } 109 | return 0; 110 | } 111 | 112 | tuple_size_t get_total_memory() { 113 | int num_gpus; 114 | size_t free, total; 115 | cudaGetDeviceCount( &num_gpus ); 116 | for ( int gpu_id = 0; gpu_id < num_gpus; gpu_id++ ) { 117 | cudaSetDevice( gpu_id ); 118 | int id; 119 | cudaGetDevice( &id ); 120 | cudaMemGetInfo( &free, &total ); 121 | return total; 122 | } 123 | return 0; 124 | } 125 | 126 | void print_tuple_list(tuple_type* tuples, tuple_size_t rows, tuple_size_t arity) { 127 | tuple_type* tuples_host; 128 | cudaMallocHost((void**) &tuples_host, rows * sizeof(tuple_type)); 129 | cudaMemcpy(tuples_host, tuples, rows * sizeof(tuple_type), 130 | cudaMemcpyDeviceToHost); 131 | if (rows > 100) { 132 | rows = 100; 133 | } 134 | for (tuple_size_t i = 0; i < rows; i++) { 135 | tuple_type cur_tuple = tuples_host[i]; 136 | 137 | tuple_type cur_tuple_host; 138 | cudaMallocHost((void**) &cur_tuple_host, arity * sizeof(column_type)); 139 | cudaMemcpy(cur_tuple_host, cur_tuple, arity * sizeof(column_type), 140 | cudaMemcpyDeviceToHost); 141 | for (tuple_size_t j = 0; j < arity; j++) { 142 | std::cout 
<< cur_tuple_host[j] << " "; 143 | } 144 | std::cout << std::endl; 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /src/tuple.cu: -------------------------------------------------------------------------------- 1 | 2 | #include "../include/exception.cuh" 3 | #include "../include/tuple.cuh" 4 | #include 5 | 6 | __global__ void extract_column(tuple_type *tuples, tuple_size_t rows, 7 | tuple_size_t k, column_type *column) { 8 | int index = (blockIdx.x * blockDim.x) + threadIdx.x; 9 | if (index >= rows) 10 | return; 11 | 12 | int stride = blockDim.x * gridDim.x; 13 | for (tuple_size_t i = index; i < rows; i += stride) { 14 | column[i] = tuples[i][k]; 15 | } 16 | } 17 | 18 | __global__ void compute_hash(tuple_type *tuples, tuple_size_t rows, 19 | tuple_size_t index_column_size, 20 | column_type *hashes) { 21 | int index = (blockIdx.x * blockDim.x) + threadIdx.x; 22 | if (index >= rows) 23 | return; 24 | 25 | int stride = blockDim.x * gridDim.x; 26 | for (tuple_size_t i = index; i < rows; i += stride) { 27 | hashes[i] = (column_type)prefix_hash(tuples[i], index_column_size); 28 | } 29 | } 30 | 31 | void sort_tuples(tuple_type *tuples, tuple_size_t rows, tuple_size_t arity, 32 | tuple_size_t index_column_size, int grid_size, 33 | int block_size) { 34 | 35 | column_type *col_tmp; 36 | cudaMalloc((void **)&col_tmp, rows * sizeof(column_type)); 37 | for (int k = arity - 1; k >= 0; k--) { 38 | extract_column<<>>(tuples, rows, k, col_tmp); 39 | checkCuda(cudaGetLastError()); 40 | checkCuda(cudaDeviceSynchronize()); 41 | thrust::stable_sort_by_key(thrust::device, col_tmp, col_tmp + rows, 42 | tuples); 43 | checkCuda(cudaDeviceSynchronize()); 44 | } 45 | cudaFree(col_tmp); 46 | } 47 | 48 | void sort_tuple_by_hash(tuple_type *tuples, tuple_size_t rows, 49 | tuple_size_t arity, tuple_size_t index_column_size, 50 | int grid_size, int block_size) { 51 | column_type *col_tmp; 52 | cudaMalloc((void **)&col_tmp, rows * sizeof(column_type)); 53 | compute_hash<<>>(tuples, rows, index_column_size, 54 | col_tmp); 55 | checkCuda(cudaGetLastError()); 56 | checkCuda(cudaDeviceSynchronize()); 57 | thrust::stable_sort_by_key(thrust::device, col_tmp, col_tmp + rows, tuples); 58 | checkCuda(cudaDeviceSynchronize()); 59 | cudaFree(col_tmp); 60 | } 61 | -------------------------------------------------------------------------------- /test/cspa.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "../include/exception.cuh" 12 | #include "../include/lie.cuh" 13 | #include "../include/print.cuh" 14 | #include "../include/timer.cuh" 15 | 16 | ////////////////////////////////////////////////////// 17 | 18 | long int get_row_size(const char *data_path) { 19 | std::ifstream f; 20 | f.open(data_path); 21 | char c; 22 | long i = 0; 23 | while (f.get(c)) 24 | if (c == '\n') 25 | ++i; 26 | f.close(); 27 | return i; 28 | } 29 | 30 | enum ColumnT { U64, U32 }; 31 | 32 | column_type *get_relation_from_file(const char *file_path, int total_rows, 33 | int total_columns, char separator, 34 | ColumnT ct) { 35 | column_type *data = 36 | (column_type *)malloc(total_rows * total_columns * sizeof(column_type)); 37 | FILE *data_file = fopen(file_path, "r"); 38 | for (int i = 0; i < total_rows; i++) { 39 | for (int j = 0; j < total_columns; j++) { 40 | if (j != (total_columns - 1)) { 41 | if (ct == U64) { 42 | fscanf(data_file, 
"%lld%c", &data[(i * total_columns) + j], 43 | &separator); 44 | } else { 45 | fscanf(data_file, "%ld%c", &data[(i * total_columns) + j], 46 | &separator); 47 | } 48 | } else { 49 | if (ct == U64) { 50 | fscanf(data_file, "%lld", &data[(i * total_columns) + j]); 51 | } else { 52 | fscanf(data_file, "%ld", &data[(i * total_columns) + j]); 53 | } 54 | } 55 | } 56 | } 57 | return data; 58 | } 59 | 60 | ////////////////////////////////////////////////////////////////// 61 | 62 | __device__ void cp_2_1__1(tuple_type input, tuple_type outpt) { 63 | outpt[0] = input[0]; 64 | outpt[1] = input[0]; 65 | }; 66 | __device__ tuple_copy_hook cp_2_1__1_device = cp_2_1__1; 67 | __device__ void cp_2_1__2(tuple_type input, tuple_type outpt) { 68 | outpt[0] = input[1]; 69 | outpt[1] = input[1]; 70 | }; 71 | __device__ tuple_copy_hook cp_2_1__2_device = cp_2_1__2; 72 | 73 | __device__ void cp_2_1__1_2(tuple_type input, tuple_type outpt) { 74 | outpt[0] = input[1]; 75 | outpt[1] = input[0]; 76 | }; 77 | __device__ tuple_copy_hook cp_2_1__1_2_device = cp_2_1__1_2; 78 | __device__ void cp_2_1__2_1(tuple_type input, tuple_type outpt) { 79 | outpt[0] = input[0]; 80 | outpt[1] = input[1]; 81 | }; 82 | __device__ tuple_copy_hook cp_2_1__2_1_device = cp_2_1__2_1; 83 | 84 | __device__ void join_10_11(tuple_type inner, tuple_type outer, 85 | tuple_type output) { 86 | output[1] = inner[1]; 87 | output[0] = outer[1]; 88 | } 89 | __device__ tuple_generator_hook join_10_11_device = join_10_11; 90 | 91 | __device__ void join_01_11(tuple_type inner, tuple_type outer, 92 | tuple_type output) { 93 | output[0] = inner[1]; 94 | output[1] = outer[1]; 95 | } 96 | __device__ tuple_generator_hook join_01_11_device = join_01_11; 97 | 98 | //////////////////////////////////////////////////////////////// 99 | 100 | void analysis_bench(const char *dataset_path, int block_size, int grid_size) { 101 | KernelTimer timer; 102 | int relation_columns = 2; 103 | std::chrono::high_resolution_clock::time_point time_point_begin; 104 | std::chrono::high_resolution_clock::time_point time_point_end; 105 | 106 | double spent_time; 107 | 108 | // load the input relation 109 | std::stringstream assign_fact_ss; 110 | assign_fact_ss << dataset_path << "/assign.facts"; 111 | std::stringstream dereference_fact_ss; 112 | dereference_fact_ss << dataset_path << "/dereference.facts"; 113 | // std::cout << assign_fact_ss.str() << std::endl; 114 | tuple_size_t assign_counts = get_row_size(assign_fact_ss.str().c_str()); 115 | std::cout << "Input assign rows: " << assign_counts << std::endl; 116 | column_type *raw_assign_data = get_relation_from_file( 117 | assign_fact_ss.str().c_str(), assign_counts, 2, '\t', U32); 118 | std::cout << "reversing assign ... " << std::endl; 119 | column_type *raw_reverse_assign_data = 120 | (column_type *)malloc(assign_counts * 2 * sizeof(column_type)); 121 | for (tuple_size_t i = 0; i < assign_counts; i++) { 122 | raw_reverse_assign_data[i * 2 + 1] = raw_assign_data[i * 2]; 123 | raw_reverse_assign_data[i * 2] = raw_assign_data[i * 2 + 1]; 124 | } 125 | 126 | tuple_size_t dereference_counts = 127 | get_row_size(dereference_fact_ss.str().c_str()); 128 | std::cout << "Input dereference rows: " << dereference_counts << std::endl; 129 | column_type *raw_dereference_data = get_relation_from_file( 130 | dereference_fact_ss.str().c_str(), dereference_counts, 2, '\t', U32); 131 | std::cout << "reversing dereference ... 
" << std::endl; 132 | column_type *raw_reverse_dereference_data = 133 | (column_type *)malloc(dereference_counts * 2 * sizeof(column_type)); 134 | for (tuple_size_t i = 0; i < dereference_counts; i++) { 135 | raw_reverse_dereference_data[i * 2 + 1] = raw_dereference_data[i * 2]; 136 | raw_reverse_dereference_data[i * 2] = raw_dereference_data[i * 2 + 1]; 137 | } 138 | 139 | timer.start_timer(); 140 | 141 | Relation *assign_2__2_1 = new Relation(); 142 | load_relation(assign_2__2_1, "assign_2__2_1", 2, raw_reverse_assign_data, 143 | assign_counts, 1, 0, grid_size, block_size); 144 | 145 | Relation *dereference_2__1_2 = new Relation(); 146 | load_relation(dereference_2__1_2, "dereference_2__1_2", 2, 147 | raw_dereference_data, dereference_counts, 1, 0, grid_size, 148 | block_size); 149 | Relation *dereference_2__2_1 = new Relation(); 150 | load_relation(dereference_2__2_1, "dereference_2__2_1", 2, 151 | raw_reverse_dereference_data, dereference_counts, 1, 0, 152 | grid_size, block_size); 153 | timer.stop_timer(); 154 | std::cout << "Build hash table time: " << timer.get_spent_time() 155 | << std::endl; 156 | 157 | // scc init 158 | Relation *value_flow_2__1_2 = new Relation(); 159 | load_relation(value_flow_2__1_2, "value_flow_2__1_2", 2, nullptr, 0, 1, 0, 160 | grid_size, block_size); 161 | Relation *value_flow_2__2_1 = new Relation(); 162 | load_relation(value_flow_2__2_1, "value_flow_2__2_1", 2, nullptr, 0, 1, 0, 163 | grid_size, block_size); 164 | 165 | Relation *memory_alias_2__1_2 = new Relation(); 166 | load_relation(memory_alias_2__1_2, "memory_alias_2__1_2", 2, nullptr, 0, 1, 167 | 0, grid_size, block_size); 168 | Relation *memory_alias_2__2_1 = new Relation(); 169 | load_relation(memory_alias_2__2_1, "memory_alias_2__2_1", 2, nullptr, 0, 1, 170 | 0, grid_size, block_size); 171 | 172 | timer.start_timer(); 173 | time_point_begin = std::chrono::high_resolution_clock::now(); 174 | LIE init_scc(grid_size, block_size); 175 | init_scc.add_relations(value_flow_2__1_2, false); 176 | init_scc.add_relations(value_flow_2__2_1, false); 177 | init_scc.add_relations(memory_alias_2__1_2, false); 178 | init_scc.add_relations(memory_alias_2__2_1, false); 179 | init_scc.add_relations(assign_2__2_1, true); 180 | tuple_copy_hook cp_2_1__1_host; 181 | checkCuda(cudaMemcpyFromSymbol(&cp_2_1__1_host, cp_2_1__1_device, 182 | sizeof(tuple_copy_hook))); 183 | tuple_copy_hook cp_2_1__2_host; 184 | checkCuda(cudaMemcpyFromSymbol(&cp_2_1__2_host, cp_2_1__2_device, 185 | sizeof(tuple_copy_hook))); 186 | tuple_copy_hook cp_2_1__1_2_host; 187 | checkCuda(cudaMemcpyFromSymbol(&cp_2_1__1_2_host, cp_2_1__1_2_device, 188 | sizeof(tuple_copy_hook))); 189 | tuple_copy_hook cp_2_1__2_1_host; 190 | checkCuda(cudaMemcpyFromSymbol(&cp_2_1__1_host, cp_2_1__1_device, 191 | sizeof(tuple_copy_hook))); 192 | init_scc.add_ra(RelationalCopy(assign_2__2_1, FULL, value_flow_2__1_2, 193 | cp_2_1__1_host, nullptr, grid_size, 194 | block_size)); 195 | init_scc.add_ra(RelationalCopy(assign_2__2_1, FULL, value_flow_2__1_2, 196 | cp_2_1__2_host, nullptr, grid_size, 197 | block_size)); 198 | init_scc.add_ra(RelationalCopy(assign_2__2_1, FULL, value_flow_2__1_2, 199 | cp_2_1__1_2_host, nullptr, grid_size, 200 | block_size)); 201 | 202 | init_scc.add_ra(RelationalCopy(assign_2__2_1, FULL, memory_alias_2__1_2, 203 | cp_2_1__1_host, nullptr, grid_size, 204 | block_size)); 205 | init_scc.add_ra(RelationalCopy(assign_2__2_1, FULL, memory_alias_2__1_2, 206 | cp_2_1__2_host, nullptr, grid_size, 207 | block_size)); 208 | 209 | 
init_scc.add_ra(RelationalCopy(value_flow_2__1_2, DELTA, value_flow_2__2_1, 210 | cp_2_1__1_2_host, nullptr, grid_size, 211 | block_size)); 212 | init_scc.add_ra(RelationalCopy(memory_alias_2__1_2, DELTA, memory_alias_2__2_1, 213 | cp_2_1__1_2_host, nullptr, grid_size, 214 | block_size)); 215 | init_scc.fixpoint_loop(); 216 | 217 | timer.stop_timer(); 218 | time_point_end = std::chrono::high_resolution_clock::now(); 219 | std::cout << "init scc time: " << timer.get_spent_time() << std::endl; 220 | std::cout << "init scc time (chono): " 221 | << std::chrono::duration_cast( 222 | time_point_end - time_point_begin) 223 | .count() 224 | << std::endl; 225 | 226 | // scc analysis 227 | Relation *value_flow_forward_2__1_2 = new Relation(); 228 | load_relation(value_flow_forward_2__1_2, "value_flow_forward_2__1_2", 2, 229 | nullptr, 0, 1, 0, grid_size, block_size); 230 | 231 | Relation *value_flow_forward_2__2_1 = new Relation(); 232 | load_relation(value_flow_forward_2__2_1, "value_flow_forward_2__2_1", 2, 233 | nullptr, 0, 1, 0, grid_size, block_size); 234 | 235 | Relation *value_alias_2__1_2 = new Relation(); 236 | value_alias_2__1_2->index_flag = false; 237 | load_relation(value_alias_2__1_2, "value_alias_2__1_2", 2, nullptr, 0, 1, 0, 238 | grid_size, block_size); 239 | 240 | Relation *tmp_rel_def = new Relation(); 241 | tmp_rel_def->index_flag = false; 242 | load_relation(tmp_rel_def, "tmp_rel_def", 2, nullptr, 0, 1, 0, grid_size, 243 | block_size); 244 | Relation *tmp_rel_ma1 = new Relation(); 245 | tmp_rel_ma1->index_flag = false; 246 | load_relation(tmp_rel_ma1, "tmp_rel_ma1", 2, nullptr, 0, 1, 0, grid_size, 247 | block_size, true); 248 | Relation *tmp_rel_ma2 = new Relation(); 249 | tmp_rel_ma2->index_flag = false; 250 | load_relation(tmp_rel_ma2, "tmp_rel_ma2", 2, nullptr, 0, 1, 0, grid_size, 251 | block_size, true); 252 | 253 | LIE analysis_scc(grid_size, block_size); 254 | 255 | analysis_scc.add_relations(assign_2__2_1, true); 256 | analysis_scc.add_relations(dereference_2__1_2, true); 257 | analysis_scc.add_relations(dereference_2__2_1, true); 258 | 259 | analysis_scc.add_relations(value_flow_2__1_2, false); 260 | analysis_scc.add_relations(value_flow_2__2_1, false); 261 | analysis_scc.add_relations(memory_alias_2__1_2, false); 262 | analysis_scc.add_relations(memory_alias_2__2_1, false); 263 | analysis_scc.add_relations(value_alias_2__1_2, false); 264 | 265 | // join order matters for temp! 266 | analysis_scc.add_tmp_relation(tmp_rel_def); 267 | analysis_scc.add_tmp_relation(tmp_rel_ma1); 268 | analysis_scc.add_tmp_relation(tmp_rel_ma2); 269 | 270 | float join_detail[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; 271 | 272 | // join_vf_vfvf: ValueFlow(x, y) :- ValueFlow(x, z), ValueFlow(z, y). 273 | tuple_generator_hook join_10_11_host; 274 | checkCuda(cudaMemcpyFromSymbol(&join_10_11_host, join_10_11_device, 275 | sizeof(tuple_generator_hook))); 276 | tuple_generator_hook join_01_11_host; 277 | checkCuda(cudaMemcpyFromSymbol(&join_01_11_host, join_01_11_device, 278 | sizeof(tuple_generator_hook))); 279 | analysis_scc.add_ra( 280 | RelationalJoin(value_flow_2__1_2, FULL, value_flow_2__2_1, DELTA, 281 | value_flow_2__1_2, join_10_11_host, nullptr, LEFT, 282 | grid_size, block_size, join_detail)); 283 | analysis_scc.add_ra( 284 | RelationalJoin(value_flow_2__2_1, FULL, value_flow_2__1_2, DELTA, 285 | value_flow_2__1_2, join_01_11_host, nullptr, LEFT, 286 | grid_size, block_size, join_detail)); 287 | 288 | // join_va_vf_vf: ValueAlias(x, y) :- ValueFlow(z, x), ValueFlow(z, y). 
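// [Editorial sketch, not part of the original benchmark] This rule is a
// self-join of ValueFlow on its first (indexed) column. A minimal host-side
// version over hard-coded pairs standing in for ValueFlow(z, x) shows the
// same derivation; the GPU pipeline additionally deduplicates the result and
// merges it into the newt version of ValueAlias:
{
    int vf[][2] = {{1, 4}, {1, 5}, {2, 6}};        // ValueFlow(z, x)
    for (auto &a : vf)                             // ValueFlow(z, x)
        for (auto &b : vf)                         // ValueFlow(z, y)
            if (a[0] == b[0])                      // join on z
                std::cout << "ValueAlias(" << a[1] << ", " << b[1] << ")\n";
}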
289 | // v1 290 | analysis_scc.add_ra( 291 | RelationalJoin(value_flow_2__1_2, FULL, value_flow_2__1_2, DELTA, 292 | value_alias_2__1_2, join_01_11_host, nullptr, LEFT, 293 | grid_size, block_size, join_detail)); 294 | // v2 295 | analysis_scc.add_ra( 296 | RelationalJoin(value_flow_2__1_2, FULL, value_flow_2__1_2, DELTA, 297 | value_alias_2__1_2, join_10_11_host, nullptr, LEFT, 298 | grid_size, block_size, join_detail)); 299 | 300 | // join_vf_am: ValueFlow(x, y) :- Assign(x, z), MemoryAlias(z, y). 301 | analysis_scc.add_ra( 302 | RelationalJoin(assign_2__2_1, FULL, memory_alias_2__1_2, DELTA, 303 | value_flow_2__1_2, join_01_11_host, nullptr, LEFT, 304 | grid_size, block_size, join_detail)); 305 | 306 | // tmp_rel_def(z, x) :- Dereference(y, x), ValueAlias(y, z) 307 | analysis_scc.add_ra( 308 | RelationalJoin(dereference_2__1_2, FULL, value_alias_2__1_2, DELTA, 309 | tmp_rel_def, join_10_11_host, nullptr, LEFT, grid_size, 310 | block_size, join_detail)); 311 | 312 | // WARNING: tmp relation can only in outer because it doesn't include 313 | // index! 314 | // join_ma_d_tmp: MemoryAlias(x, w) :- Dereference(z, w) , tmp_rel_def(z,x) 315 | analysis_scc.add_ra( 316 | RelationalJoin(dereference_2__1_2, FULL, tmp_rel_def, NEWT, 317 | memory_alias_2__1_2, join_10_11_host, nullptr, LEFT, 318 | grid_size, block_size, join_detail)); 319 | 320 | // ValueAlias(x,y) :- 321 | // ValueFlow(z,x), 322 | // MemoryAlias(z,w), 323 | // ValueFlow(w,y). 324 | // ValueFlow DELTA 1, 2 <> MemoryAlias FULL 1, 2 <> ValueFlow FULL 2, 1 325 | // ValueFlow FULL 1, 2 <> MemoryAlias DELTA 1, 2 <> ValueFlow FULL 2, 1 326 | // ValueFlow FULL 1, 2 <> MemoryAlias FULL 1, 2 <> ValueFlow DELTA 2, 1 327 | // join_tmp_vf_ma : tmp_rel_ma(w, x) :- ValueFlow(z, x), MemoryAlias(z, w). 328 | // join_va_tmp_vf : ValueAlias(x, y) :- tmp_rel_ma(w, x), ValueFlow(w,y). 
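// [Editorial sketch, not part of the original benchmark] The ternary
// ValueAlias rule is evaluated as the two binary joins named above,
// materializing tmp_rel_ma in between. A host-side version of the same two
// steps over hypothetical tuples:
{
    int vf[][2] = {{1, 4}, {2, 5}};     // ValueFlow(z, x) and ValueFlow(w, y)
    int ma[][2] = {{1, 2}};             // MemoryAlias(z, w)
    for (auto &f : vf)
        for (auto &m : ma)
            if (f[0] == m[0])           // join on z  -> tmp_rel_ma(w, x)
                for (auto &g : vf)
                    if (g[0] == m[1])   // join on w  -> ValueAlias(x, y)
                        std::cout << "ValueAlias(" << f[1] << ", " << g[1]
                                  << ")\n";
}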
329 | // v1 330 | analysis_scc.add_ra( 331 | RelationalJoin(memory_alias_2__1_2, FULL , value_flow_2__1_2, DELTA, 332 | tmp_rel_ma1, join_01_11_host, nullptr, LEFT, grid_size, 333 | block_size, join_detail)); 334 | analysis_scc.add_ra( 335 | RelationalJoin(value_flow_2__1_2, FULL, memory_alias_2__1_2, DELTA, 336 | tmp_rel_ma1, join_10_11_host, nullptr, LEFT, grid_size, 337 | block_size, join_detail)); 338 | 339 | analysis_scc.add_ra( 340 | RelationalJoin(value_flow_2__1_2, FULL, tmp_rel_ma1, NEWT, 341 | value_alias_2__1_2, join_10_11_host, nullptr, LEFT, 342 | grid_size, block_size, join_detail)); 343 | 344 | analysis_scc.add_ra( 345 | RelationalJoin(memory_alias_2__2_1, FULL , value_flow_2__1_2, DELTA, 346 | tmp_rel_ma2, join_01_11_host, nullptr, LEFT, grid_size, 347 | block_size, join_detail)); 348 | analysis_scc.add_ra( 349 | RelationalJoin(value_flow_2__1_2, FULL, tmp_rel_ma2, NEWT, 350 | value_alias_2__1_2, join_01_11_host, nullptr, LEFT, 351 | grid_size, block_size, join_detail)); 352 | 353 | analysis_scc.add_ra(RelationalACopy(value_flow_2__1_2, value_flow_2__2_1, 354 | cp_2_1__1_2_host, nullptr, grid_size, 355 | block_size)); 356 | analysis_scc.add_ra(RelationalACopy(memory_alias_2__1_2, memory_alias_2__2_1, 357 | cp_2_1__1_2_host, nullptr, grid_size, 358 | block_size)); 359 | time_point_begin = std::chrono::high_resolution_clock::now(); 360 | timer.start_timer(); 361 | analysis_scc.fixpoint_loop(); 362 | // print_tuple_rows(value_flow_2__1_2->full, "value_flow_2__1_2"); 363 | timer.stop_timer(); 364 | time_point_end = std::chrono::high_resolution_clock::now(); 365 | std::cout << "analysis scc time: " << timer.get_spent_time() << std::endl; 366 | std::cout << "analysis scc time (chono): " 367 | << std::chrono::duration_cast( 368 | time_point_end - time_point_begin) 369 | .count() 370 | << std::endl; 371 | std::cout << "join detail: " << std::endl; 372 | std::cout << "compute size time: " << join_detail[0] << std::endl; 373 | std::cout << "reduce + scan time: " << join_detail[1] << std::endl; 374 | std::cout << "fetch result time: " << join_detail[2] << std::endl; 375 | std::cout << "sort time: " << join_detail[3] << std::endl; 376 | std::cout << "build index time: " << join_detail[5] << std::endl; 377 | std::cout << "merge time: " << join_detail[6] << std::endl; 378 | std::cout << "unique time: " << join_detail[4] + join_detail[7] << std::endl; 379 | } 380 | 381 | int main(int argc, char *argv[]) { 382 | int device_id; 383 | int number_of_sm; 384 | cudaGetDevice(&device_id); 385 | cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 386 | device_id); 387 | int max_threads_per_block; 388 | cudaDeviceGetAttribute(&max_threads_per_block, cudaDevAttrMaxThreadsPerBlock, 0); 389 | std::cout << "num of sm " << number_of_sm << " num of thread per block " << max_threads_per_block << std::endl; 390 | std::cout << "using " << EMPTY_HASH_ENTRY << " as empty hash entry" 391 | << std::endl; 392 | int block_size, grid_size; 393 | block_size = 512; 394 | grid_size = 32 * number_of_sm; 395 | std::locale loc(""); 396 | analysis_bench(argv[1], block_size, grid_size); 397 | return 0; 398 | } 399 | -------------------------------------------------------------------------------- /test/cuDF/load_test.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | import cudf 4 | import time 5 | import json 6 | import cupy 7 | 8 | REPEAT = 1000 9 | 10 | def get_dataset(filename, column_names=['column 1', 'column 2'], 11 | rows=None): 12 | if rows 
!= None: 13 | nrows = rows 14 | else: 15 | nrows = int(re.search('\d+|$', filename).group()) 16 | return cudf.read_csv(filename, sep='\t', header=None, 17 | names=column_names, nrows=nrows) 18 | 19 | 20 | def loading_time_benchmark(dataset, rows=None): 21 | graph = get_dataset(dataset, rows=rows) 22 | arr_cupy = cudf.to_cupy() 23 | size = len(arr_cupy) 24 | time_start = time.time() 25 | for _ in range(REPEAT): 26 | re_constructed = cudf.from_cupy(arr_cupy) 27 | time_end = time.time() 28 | return time_end - time_start, size 29 | 30 | def main(): 31 | datasets = { 32 | "ego-Facebook": "../data/data_88234.txt", 33 | "wiki-Vote": "../data/data_103689.txt", 34 | "luxembourg_osm": "../data/data_119666.txt", 35 | "fe_sphere": "../data/data_49152.txt", 36 | "fe_body": "../data/data_163734.txt", 37 | "cti": "../data/data_48232.txt", 38 | "fe_ocean": "../data/data_409593.txt", 39 | "wing": "../data/data_121544.txt", 40 | "loc-Brightkite": "../data/data_214078.txt", 41 | "delaunay_n16": "../data/data_196575.txt", 42 | "usroads": "../data/data_165435.txt", 43 | "CA-HepTh": "../data/data_51971.txt", 44 | "SF.cedge": "../data/data_223001.txt", 45 | "p2p-Gnutella31": "../data/data_147892.txt", 46 | "p2p-Gnutella09": "../data/data_26013.txt", 47 | "p2p-Gnutella04": "../data/data_39994.txt", 48 | "cal.cedge": "../data/data_21693.txt", 49 | "TG.cedge": "../data/data_23874.txt", 50 | "OL.cedge": "../data/data_7035.txt", 51 | } 52 | results = {} 53 | for dataset_name, dataset_path in datasets.items(): 54 | loading_time, size = loading_time_benchmark(dataset_path) 55 | results[dataset_name] = { 56 | "loading_time": loading_time, 57 | "tuple/s: ": size * REPEAT / loading_time 58 | } 59 | print(json.dumps(results, indent=4)) 60 | 61 | if __name__ == "__main__": 62 | main() 63 | -------------------------------------------------------------------------------- /test/cuDF/reachability.py: -------------------------------------------------------------------------------- 1 | import re 2 | import cudf 3 | import time 4 | import json 5 | 6 | 7 | def display_time(time_start, time_end, message): 8 | time_took = time_end - time_start 9 | print(f"Debug: {message}: {time_took:.6f}s") 10 | 11 | 12 | def get_join(relation_1, relation_2, column_names=['column 1', 'column 2']): 13 | return relation_1.merge(relation_2, on=column_names[0], 14 | how="inner", 15 | suffixes=('_relation_1', '_relation_2')) 16 | 17 | 18 | def get_projection(result, column_names=['column 1', 'column 2']): 19 | temp = result.drop([column_names[0]], axis=1).drop_duplicates() 20 | temp.columns = column_names 21 | return temp 22 | 23 | 24 | def get_union(relation_1, relation_2): 25 | return cudf.concat([relation_1, relation_2], 26 | ignore_index=True).drop_duplicates() 27 | 28 | 29 | def get_dataset(filename, column_names=['column 1', 'column 2'], 30 | rows=None): 31 | if rows != None: 32 | nrows = rows 33 | else: 34 | nrows = int(re.search('\d+|$', filename).group()) 35 | return cudf.read_csv(filename, sep='\t', header=None, 36 | names=column_names, nrows=nrows) 37 | 38 | 39 | def get_transitive_closure(dataset): 40 | COLUMN_NAMES = ['column 1', 'column 2'] 41 | rows = int(re.search('\d+|$', dataset).group()) 42 | start_time_outer = time.perf_counter() 43 | relation_1 = get_dataset(dataset, COLUMN_NAMES, rows) 44 | relation_2 = relation_1.copy() 45 | relation_2.columns = COLUMN_NAMES[::-1] 46 | temp_result = relation_1 47 | i = 0 48 | while True: 49 | temp_projection = get_projection(get_join(relation_2, relation_1, 50 | COLUMN_NAMES), COLUMN_NAMES) 51 | x 
= len(temp_projection) 52 | previous_result_size = len(temp_result) 53 | temp_result = get_union(temp_result, temp_projection) 54 | current_result_size = len(temp_result) 55 | if previous_result_size == current_result_size: 56 | i += 1 57 | break 58 | del relation_2 59 | relation_2 = temp_projection 60 | relation_2.columns = COLUMN_NAMES[::-1] 61 | i += 1 62 | del temp_projection 63 | # print(f"i: {i}, projection size: {x}, rows: {current_result_size}") 64 | end_time_outer = time.perf_counter() 65 | time_took = end_time_outer - start_time_outer 66 | time_took = f"{time_took:.6f}" 67 | # print(temp_result) 68 | return rows, len(temp_result), i, time_took 69 | 70 | 71 | def generate_benchmark(iterative=True, datasets=None): 72 | result = [] 73 | if iterative: 74 | print("| Number of rows | TC size | Iterations | Time (s) |") 75 | print("| --- | --- | --- | --- |") 76 | increment = 1000 77 | n = 990 78 | count = 0 79 | while n < 11000: 80 | try: 81 | dataset = f"../data/data_{n}.txt" 82 | n = int(re.search('\d+|$', dataset).group()) 83 | record = get_transitive_closure(dataset) 84 | result.append(record) 85 | print( 86 | f"| {record[0]} | {record[1]} | {record[2]} | {record[3]:.6f} |") 87 | n += increment 88 | except Exception as ex: 89 | print(str(ex)) 90 | break 91 | count += 1 92 | if datasets: 93 | print("| Dataset | Number of rows | TC size | Iterations | Time (s) |") 94 | print("| --- | --- | --- | --- | --- |") 95 | for key, dataset in datasets.items(): 96 | try: 97 | record = get_transitive_closure(dataset) 98 | record = list(record) 99 | record.insert(0, key) 100 | result.append(record) 101 | message = " | ".join([str(s) for s in record]) 102 | message = "| " + message + " |" 103 | print(message) 104 | except Exception as ex: 105 | print(str(ex)) 106 | break 107 | print("\n") 108 | with open('transitive_closure.json', 'w') as f: 109 | json.dump(result, f) 110 | 111 | 112 | if __name__ == "__main__": 113 | generate_benchmark(iterative=False, datasets={ 114 | "ego-Facebook": "../data/data_88234.txt", 115 | "wiki-Vote": "../data/data_103689.txt", 116 | "luxembourg_osm": "../data/data_119666.txt", 117 | "fe_sphere": "../data/data_49152.txt", 118 | # "fe_body": "../data/data_163734.txt", 119 | "cti": "../data/data_48232.txt", 120 | # "fe_ocean": "../data/data_409593.txt", 121 | "wing": "../data/data_121544.txt", 122 | # "loc-Brightkite": "../data/data_214078.txt", 123 | "delaunay_n16": "../data/data_196575.txt", 124 | # "usroads": "../data/data_165435.txt", 125 | "CA-HepTh": "../data/data_51971.txt", 126 | "SF.cedge": "../data/data_223001.txt", 127 | # "p2p-Gnutella31": "../data/data_147892.txt", 128 | "p2p-Gnutella09": "../data/data_26013.txt", 129 | "p2p-Gnutella04": "../data/data_39994.txt", 130 | "cal.cedge": "../data/data_21693.txt", 131 | "TG.cedge": "../data/data_23874.txt", 132 | "OL.cedge": "../data/data_7035.txt", 133 | }) 134 | 135 | 136 | -------------------------------------------------------------------------------- /test/cuDF/sg.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "CA-HepTh", 4 | 51971, 5 | 74618689, 6 | 9, 7 | "21.241212" 8 | ], 9 | [ 10 | "ego-Facebook", 11 | 88234, 12 | 15018986, 13 | 13, 14 | "19.074940" 15 | ], 16 | [ 17 | "wiki-Vote", 18 | 103689, 19 | 5376338, 20 | 4, 21 | "2.603751" 22 | ], 23 | [ 24 | "luxembourg_osm", 25 | 119666, 26 | 245221, 27 | 326, 28 | "2.215113" 29 | ], 30 | [ 31 | "cti", 32 | 48232, 33 | 14503742, 34 | 44, 35 | "3.857438" 36 | ], 37 | [ 38 | "fe_ocean", 39 | 409593, 40 | 
65941441, 41 | 77, 42 | "45.979235" 43 | ], 44 | [ 45 | "wing", 46 | 121544, 47 | 647999, 48 | 8, 49 | "0.204277" 50 | ], 51 | [ 52 | "delaunay_n16", 53 | 196575, 54 | 25994011, 55 | 85, 56 | "14.832548" 57 | ], 58 | [ 59 | "p2p-Gnutella09", 60 | 26013, 61 | 62056583, 62 | 14, 63 | "13.705286" 64 | ], 65 | [ 66 | "p2p-Gnutella04", 67 | 39994, 68 | 116931333, 69 | 18, 70 | "48.947088" 71 | ], 72 | [ 73 | "cal.cedge", 74 | 21693, 75 | 23519, 76 | 58, 77 | "0.259069" 78 | ], 79 | [ 80 | "TG.cedge", 81 | 23874, 82 | 608090, 83 | 54, 84 | "0.719743" 85 | ], 86 | [ 87 | "OL.cedge", 88 | 7035, 89 | 285431, 90 | 56, 91 | "0.385674" 92 | ] 93 | ] -------------------------------------------------------------------------------- /test/cuDF/sg.py: -------------------------------------------------------------------------------- 1 | import re 2 | import cudf 3 | import time 4 | import json 5 | 6 | REPEAT = 3 7 | 8 | def display_time(time_start, time_end, message): 9 | time_took = time_end - time_start 10 | print(f"Debug: {message}: {time_took:.6f}s") 11 | 12 | 13 | def get_join(relation_1, relation_2, column_names=['column 1', 'column 2']): 14 | return relation_1.merge(relation_2, on=column_names[0], 15 | how="inner", 16 | suffixes=('_relation_1', '_relation_2')) 17 | 18 | 19 | def get_projection(result, column_names=['column 1', 'column 2'], remove_same_val=False): 20 | temp = result.drop([column_names[0]], axis=1).drop_duplicates() 21 | temp.columns = column_names 22 | if remove_same_val: 23 | temp = temp.loc[(temp[column_names[0]] != temp[column_names[1]])] 24 | return temp 25 | 26 | 27 | def get_union(relation_1, relation_2): 28 | return cudf.concat([relation_1, relation_2], 29 | ignore_index=True).drop_duplicates() 30 | 31 | 32 | def get_dataset(filename, column_names=['column 1', 'column 2'], 33 | rows=None): 34 | if rows != None: 35 | nrows = rows 36 | else: 37 | nrows = int(re.search('\d+|$', filename).group()) 38 | return cudf.read_csv(filename, sep='\t', header=None, 39 | names=column_names, nrows=nrows) 40 | 41 | 42 | def get_sg(dataset): 43 | COLUMN_NAMES = ['column 1', 'column 2'] 44 | rows = int(re.search('\d+|$', dataset).group()) 45 | start_time_outer = time.perf_counter() 46 | relation_1 = get_dataset(dataset, COLUMN_NAMES, rows) 47 | relation_2 = relation_1.copy() 48 | # sg(x, y): - edge(p, x), edge(p, y), x != y. 49 | temp_result = get_projection(get_join(relation_1, relation_2, 50 | COLUMN_NAMES), COLUMN_NAMES, remove_same_val=True) 51 | i = 0 52 | relation_2 = temp_result 53 | while True: 54 | # tmp(b, x): - edge(a, x), sg(a, b). 55 | temp_projection = get_projection(get_join(relation_1, relation_2, 56 | COLUMN_NAMES), COLUMN_NAMES) 57 | temp_projection.columns = COLUMN_NAMES[::-1] 58 | # sg(x, y): - tmp(b, x), edge(b, y). 
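        # [Editorial note, illustrative only] With a hypothetical edge set
        # {(p, a), (p, b), (a, c), (b, d)}: the base rule above gives
        # sg = {(a, b), (b, a)}; the tmp rule then derives
        # tmp = {(b, c), (a, d)}, and the join below with edge produces the
        # new same-generation pairs {(c, d), (d, c)}, which get_union adds
        # to the accumulated result.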
59 | temp_projection_2 = get_projection(get_join(temp_projection, relation_1, 60 | COLUMN_NAMES), COLUMN_NAMES) 61 | relation_2 = temp_projection_2 62 | previous_result_size = len(temp_result) 63 | temp_result = get_union(temp_result, relation_2) 64 | current_result_size = len(temp_result) 65 | if previous_result_size == current_result_size: 66 | i += 1 67 | break 68 | i += 1 69 | del temp_projection 70 | del temp_projection_2 71 | end_time_outer = time.perf_counter() 72 | time_took = end_time_outer - start_time_outer 73 | time_took = f"{time_took:.6f}" 74 | return rows, len(temp_result), int(i), time_took 75 | 76 | 77 | def generate_benchmark(datasets=None): 78 | result = [] 79 | print("| Dataset | Number of rows | SG size | Iterations | Time (s) |") 80 | print("| --- | --- | --- | --- | --- |") 81 | for key, dataset in datasets.items(): 82 | time_took = [] 83 | record = None 84 | try: 85 | # Omit the warm up round timing 86 | warm_up = get_sg(dataset) 87 | for i in range(REPEAT): 88 | try: 89 | record = get_sg(dataset) 90 | time_took.append(float(record[3])) 91 | except Exception as ex: 92 | print(str(ex)) 93 | record = list(record) 94 | record[3] = f"{(sum(time_took) / REPEAT):.6f}" 95 | record.insert(0, key) 96 | result.append(record) 97 | message = " | ".join([str(s) for s in record]) 98 | message = "| " + message + " |" 99 | print(message) 100 | except Exception as ex: 101 | print(f"Error in {key}. Message: {str(ex)}") 102 | print("\n") 103 | with open('sg.json', 'w') as f: 104 | json.dump(result, f) 105 | 106 | 107 | if __name__ == "__main__": 108 | generate_benchmark(datasets={ 109 | "hipc": "../../data/data_5.txt", 110 | "fe_body": "../../data/data_163734.txt", 111 | "loc-Brightkite": "../../data/data_214078.txt", 112 | "fe_sphere": "../../data/data_49152.txt", 113 | "CA-HepTh": "../../data/data_51971.txt", 114 | # "SF.cedge": "../../data/data_223001.txt", 115 | "ego-Facebook": "../../data/data_88234.txt", 116 | "wiki-Vote": "../../data/data_103689.txt", 117 | "luxembourg_osm": "../../data/data_119666.txt", 118 | "cti": "../../data/data_48232.txt", 119 | "fe_ocean": "../../data/data_409593.txt", 120 | "wing": "../../data/data_121544.txt", 121 | "delaunay_n16": "../../data/data_196575.txt", 122 | "usroads": "../../data/data_165435.txt", 123 | "p2p-Gnutella31": "../../data/data_147892.txt", 124 | "p2p-Gnutella09": "../../data/data_26013.txt", 125 | "p2p-Gnutella04": "../../data/data_39994.txt", 126 | "cal.cedge": "../../data/data_21693.txt", 127 | "TG.cedge": "../../data/data_23874.txt", 128 | "OL.cedge": "../../data/data_7035.txt", 129 | }) 130 | -------------------------------------------------------------------------------- /test/datastructure.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "../include/exception.cuh" 12 | #include "../include/lie.cuh" 13 | #include "../include/print.cuh" 14 | #include "../include/timer.cuh" 15 | 16 | long int get_row_size(const char *data_path) { 17 | std::ifstream f; 18 | f.open(data_path); 19 | char c; 20 | long i = 0; 21 | while (f.get(c)) 22 | if (c == '\n') 23 | ++i; 24 | f.close(); 25 | return i; 26 | } 27 | 28 | enum ColumnT { U64, U32 }; 29 | 30 | column_type *get_relation_from_file(const char *file_path, int total_rows, 31 | int total_columns, char separator, 32 | ColumnT ct) { 33 | column_type *data = 34 | (column_type *)malloc(total_rows * total_columns * 
sizeof(column_type)); 35 | FILE *data_file = fopen(file_path, "r"); 36 | for (int i = 0; i < total_rows; i++) { 37 | for (int j = 0; j < total_columns; j++) { 38 | if (j != (total_columns - 1)) { 39 | if (ct == U64) { 40 | fscanf(data_file, "%lld%c", &data[(i * total_columns) + j], 41 | &separator); 42 | } else { 43 | fscanf(data_file, "%ld%c", &data[(i * total_columns) + j], 44 | &separator); 45 | } 46 | } else { 47 | if (ct == U64) { 48 | fscanf(data_file, "%lld", &data[(i * total_columns) + j]); 49 | } else { 50 | fscanf(data_file, "%ld", &data[(i * total_columns) + j]); 51 | } 52 | } 53 | } 54 | } 55 | return data; 56 | } 57 | 58 | __device__ void reorder_path(tuple_type inner, tuple_type outer, 59 | tuple_type newt) { 60 | newt[0] = inner[1]; 61 | newt[1] = outer[1]; 62 | }; 63 | __device__ tuple_generator_hook reorder_path_device = reorder_path; 64 | 65 | void datastructure_bench(const char *dataset_path, int block_size, 66 | int grid_size) { 67 | KernelTimer timer; 68 | int relation_columns = 2; 69 | std::chrono::high_resolution_clock::time_point time_point_begin; 70 | std::chrono::high_resolution_clock::time_point time_point_end; 71 | time_point_begin = std::chrono::high_resolution_clock::now(); 72 | double spent_time; 73 | 74 | // load the raw graph 75 | tuple_size_t graph_edge_counts = get_row_size(dataset_path); 76 | std::cout << "Input graph rows: " << graph_edge_counts << std::endl; 77 | // u64 graph_edge_counts = 2100; 78 | column_type *raw_graph_data = 79 | get_relation_from_file(dataset_path, graph_edge_counts, 2, '\t', U32); 80 | column_type *raw_reverse_graph_data = 81 | (column_type *)malloc(graph_edge_counts * 2 * sizeof(column_type)); 82 | std::cout << "reversing graph ... " << std::endl; 83 | for (tuple_size_t i = 0; i < graph_edge_counts; i++) { 84 | raw_reverse_graph_data[i * 2 + 1] = raw_graph_data[i * 2]; 85 | raw_reverse_graph_data[i * 2] = raw_graph_data[i * 2 + 1]; 86 | } 87 | column_type *raw_graph_data_gpu; 88 | cudaMalloc((void **)&raw_graph_data_gpu, 89 | graph_edge_counts * 2 * sizeof(column_type)); 90 | cudaMemcpy(raw_graph_data_gpu, raw_graph_data, 91 | graph_edge_counts * 2 * sizeof(column_type), 92 | cudaMemcpyHostToDevice); 93 | tuple_type *raw_graph_data_gpu_tuple; 94 | cudaMalloc((void **)&raw_graph_data_gpu_tuple, 95 | graph_edge_counts * sizeof(tuple_type)); 96 | init_tuples_unsorted<<>>( 97 | raw_graph_data_gpu_tuple, raw_graph_data_gpu, 2, graph_edge_counts); 98 | checkCuda(cudaDeviceSynchronize()); 99 | thrust::sort(thrust::device, raw_graph_data_gpu_tuple, 100 | raw_graph_data_gpu_tuple + graph_edge_counts, 101 | tuple_indexed_less(1, 2)); 102 | checkCuda(cudaDeviceSynchronize()); 103 | column_type *raw_graph_data_gpu_sorted; 104 | cudaMalloc((void **)&raw_graph_data_gpu_sorted, 105 | graph_edge_counts * 2 * sizeof(column_type)); 106 | flatten_tuples_raw_data<<>>( 107 | raw_graph_data_gpu_tuple, raw_graph_data_gpu_sorted, graph_edge_counts, 108 | 2); 109 | std::cout << "finish reverse graph." 
<< std::endl; 110 | 111 | std::cout << "Testing datastructure build <<<<<<<<<<<<<<< " << std::endl; 112 | int REPEAT = 100; 113 | float build_table_time = 0; 114 | 115 | for (int i = 0; i < REPEAT; i++) { 116 | Relation *path_2__1_2 = new Relation(); 117 | path_2__1_2->index_flag = false; 118 | // cudaMallocHost((void **)&path_2__1_2, sizeof(Relation)); 119 | // std::cout << "edge size " << graph_edge_counts << std::endl; 120 | // load_relation(path_2__1_2, "path_2__1_2", 2, raw_graph_data, 121 | // graph_edge_counts, 1, 0, grid_size, block_size); 122 | path_2__1_2->full = new GHashRelContainer(2, 1, 0); 123 | timer.start_timer(); 124 | float load_detail_time[5] = {0, 0, 0, 0, 0}; 125 | load_relation_container(path_2__1_2->full, 2, raw_graph_data_gpu_sorted, 126 | graph_edge_counts, 1, 0, 0.8, grid_size, 127 | block_size, load_detail_time, true, true); 128 | timer.stop_timer(); 129 | build_table_time += timer.get_spent_time(); 130 | // load_relation(edge_2__2_1, "edge_2__2_1", 2, raw_reverse_graph_data, 131 | // graph_edge_counts, 1, 0, grid_size, block_size); 132 | path_2__1_2->full->tuple_counts = 0; 133 | path_2__1_2->full->index_map_size = 0; 134 | path_2__1_2->full->data_raw_row_size = 0; 135 | if (path_2__1_2->full->index_map != nullptr) { 136 | checkCuda(cudaFree(path_2__1_2->full->index_map)); 137 | path_2__1_2->full->index_map = nullptr; 138 | } 139 | if (path_2__1_2->full->tuples != nullptr) { 140 | checkCuda(cudaFree(path_2__1_2->full->tuples)); 141 | path_2__1_2->full->tuples = nullptr; 142 | } 143 | } 144 | 145 | std::cout << "Graph size: " << graph_edge_counts << std::endl; 146 | std::cout << "Build hash table time: " << build_table_time << std::endl; 147 | std::cout << "HashTable build ratio : " 148 | << graph_edge_counts * REPEAT / build_table_time << std::endl; 149 | 150 | Relation *edge_2__2_1 = new Relation(); 151 | // cudaMallocHost((void **)&edge_2__2_1, sizeof(Relation)); 152 | Relation *path_2__1_2 = new Relation(); 153 | path_2__1_2->index_flag = false; 154 | // cudaMallocHost((void **)&path_2__1_2, sizeof(Relation)); 155 | std::cout << "edge size " << graph_edge_counts << std::endl; 156 | load_relation(path_2__1_2, "path_2__1_2", 2, raw_graph_data, 157 | graph_edge_counts, 1, 0, grid_size, block_size); 158 | load_relation(edge_2__2_1, "edge_2__2_1", 2, raw_reverse_graph_data, 159 | graph_edge_counts, 1, 0, grid_size, block_size); 160 | tuple_generator_hook reorder_path_host; 161 | cudaMemcpyFromSymbol(&reorder_path_host, reorder_path_device, 162 | sizeof(tuple_generator_hook)); 163 | std::cout << "Testing datastructure query <<<<<<<<<<<<<<< " << std::endl; 164 | 165 | float join_time[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; 166 | RelationalJoin join_test(edge_2__2_1, FULL, path_2__1_2, FULL, path_2__1_2, 167 | reorder_path_host, nullptr, LEFT, grid_size, 168 | block_size, join_time); 169 | float query_time = 0; 170 | for (int i = 0; i < REPEAT; i++) { 171 | join_test.disable_load = true; 172 | timer.start_timer(); 173 | join_test(); 174 | timer.stop_timer(); 175 | query_time += timer.get_spent_time(); 176 | } 177 | 178 | std::cout << "Query time: " << query_time << std::endl; 179 | std::cout << "HashTable query ratio : " 180 | << graph_edge_counts * REPEAT / query_time << std::endl; 181 | } 182 | 183 | int main(int argc, char *argv[]) { 184 | int device_id; 185 | int number_of_sm; 186 | cudaGetDevice(&device_id); 187 | cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 188 | device_id); 189 | std::cout << "num of sm " << number_of_sm << std::endl; 190 
| std::cout << "using " << EMPTY_HASH_ENTRY << " as empty hash entry" 191 | << std::endl; 192 | int block_size, grid_size; 193 | block_size = 512; 194 | grid_size = 32 * number_of_sm; 195 | std::locale loc(""); 196 | 197 | datastructure_bench(argv[1], block_size, grid_size); 198 | return 0; 199 | } 200 | -------------------------------------------------------------------------------- /test/merge.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "../include/lie.cuh" 16 | #include "../include/print.cuh" 17 | #include "../include/timer.cuh" 18 | 19 | long int get_row_size(const char *data_path) { 20 | std::ifstream f; 21 | f.open(data_path); 22 | char c; 23 | long i = 0; 24 | while (f.get(c)) 25 | if (c == '\n') 26 | ++i; 27 | f.close(); 28 | return i; 29 | } 30 | 31 | enum ColumnT { U64, U32 }; 32 | 33 | column_type *get_relation_from_file(const char *file_path, int total_rows, 34 | int total_columns, char separator, 35 | ColumnT ct) { 36 | column_type *data = 37 | (column_type *)malloc(total_rows * total_columns * sizeof(column_type)); 38 | FILE *data_file = fopen(file_path, "r"); 39 | for (int i = 0; i < total_rows; i++) { 40 | for (int j = 0; j < total_columns; j++) { 41 | if (j != (total_columns - 1)) { 42 | if (ct == U64) { 43 | fscanf(data_file, "%lld%c", &data[(i * total_columns) + j], 44 | &separator); 45 | } else { 46 | fscanf(data_file, "%ld%c", &data[(i * total_columns) + j], 47 | &separator); 48 | } 49 | } else { 50 | if (ct == U64) { 51 | fscanf(data_file, "%lld", &data[(i * total_columns) + j]); 52 | } else { 53 | fscanf(data_file, "%ld", &data[(i * total_columns) + j]); 54 | } 55 | } 56 | } 57 | } 58 | return data; 59 | } 60 | 61 | __device__ void reorder_path(tuple_type inner, tuple_type outer, 62 | tuple_type newt) { 63 | newt[0] = inner[1]; 64 | newt[1] = outer[1]; 65 | }; 66 | __device__ tuple_generator_hook reorder_path_device = reorder_path; 67 | 68 | int main(int argc, char *argv[]) { 69 | auto dataset_path = argv[1]; 70 | int device_id; 71 | int number_of_sm; 72 | cudaGetDevice(&device_id); 73 | cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 74 | device_id); 75 | int max_threads_per_block; 76 | cudaDeviceGetAttribute(&max_threads_per_block, 77 | cudaDevAttrMaxThreadsPerBlock, 0); 78 | std::cout << "num of sm " << number_of_sm << " num of thread per block " 79 | << max_threads_per_block << std::endl; 80 | std::cout << "using " << EMPTY_HASH_ENTRY << " as empty hash entry" 81 | << std::endl; 82 | ; 83 | int block_size, grid_size; 84 | block_size = 512; 85 | grid_size = 32 * number_of_sm; 86 | std::locale loc(""); 87 | 88 | int relation_columns = 2; 89 | std::chrono::high_resolution_clock::time_point time_point_begin; 90 | std::chrono::high_resolution_clock::time_point time_point_end; 91 | time_point_begin = std::chrono::high_resolution_clock::now(); 92 | double spent_time; 93 | KernelTimer timer; 94 | 95 | // load the raw graph 96 | tuple_size_t graph_edge_counts = get_row_size(dataset_path); 97 | std::cout << "Input graph rows: " << graph_edge_counts << std::endl; 98 | // u64 graph_edge_counts = 2100; 99 | column_type *raw_graph_data = 100 | get_relation_from_file(dataset_path, graph_edge_counts, 2, '\t', U32); 101 | column_type *raw_reverse_graph_data = 102 | (column_type *)malloc(graph_edge_counts * 2 * 
sizeof(column_type)); 103 | std::cout << "reversing graph ... " << std::endl; 104 | for (tuple_size_t i = 0; i < graph_edge_counts; i++) { 105 | raw_reverse_graph_data[i * 2 + 1] = raw_graph_data[i * 2]; 106 | raw_reverse_graph_data[i * 2] = raw_graph_data[i * 2 + 1]; 107 | } 108 | std::cout << "finish reverse graph." << std::endl; 109 | 110 | int REPEAT = 10; 111 | // init the tuples 112 | time_point_end = std::chrono::high_resolution_clock::now(); 113 | spent_time = std::chrono::duration_cast>( 114 | time_point_end - time_point_begin) 115 | .count(); 116 | std::cout << "init tuples time: " << spent_time << std::endl; 117 | column_type *tuple_hashvs; 118 | cudaMalloc((void **)&tuple_hashvs, graph_edge_counts * sizeof(column_type)); 119 | column_type *col_tmp; 120 | cudaMalloc((void **)&col_tmp, graph_edge_counts * sizeof(column_type)); 121 | 122 | // load raw data into edge relation 123 | time_point_begin = std::chrono::high_resolution_clock::now(); 124 | Relation *edge_2__2_1 = new Relation(); 125 | // cudaMallocHost((void **)&edge_2__2_1, sizeof(Relation)); 126 | Relation *path_2__1_2 = new Relation(); 127 | path_2__1_2->index_flag = false; 128 | // cudaMallocHost((void **)&path_2__1_2, sizeof(Relation)); 129 | std::cout << "edge size " << graph_edge_counts << std::endl; 130 | load_relation(path_2__1_2, "path_2__1_2", 2, raw_graph_data, 131 | graph_edge_counts, 1, 0, grid_size, block_size); 132 | load_relation(edge_2__2_1, "edge_2__2_1", 2, raw_reverse_graph_data, 133 | graph_edge_counts, 1, 0, grid_size, block_size); 134 | LIE tc_scc(grid_size, block_size); 135 | tc_scc.max_iteration = 277; 136 | tc_scc.reload_full_flag = false; 137 | tc_scc.add_relations(edge_2__2_1, true); 138 | tc_scc.add_relations(path_2__1_2, false); 139 | float join_detail[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; 140 | tuple_generator_hook reorder_path_host; 141 | cudaMemcpyFromSymbol(&reorder_path_host, reorder_path_device, 142 | sizeof(tuple_generator_hook)); 143 | tuple_copy_hook cp_1_host; 144 | RelationalJoin join_op(edge_2__2_1, FULL, path_2__1_2, DELTA, path_2__1_2, 145 | reorder_path_host, nullptr, LEFT, grid_size, 146 | block_size, join_detail); 147 | tc_scc.add_ra(join_op); 148 | timer.start_timer(); 149 | tc_scc.fixpoint_loop(); 150 | timer.stop_timer(); 151 | 152 | std::cout << "Path counts " << path_2__1_2->full->tuple_counts << std::endl; 153 | // print_tuple_rows(path_2__2_1->full, "full"); 154 | std::cout << "TC time: " << timer.get_spent_time() << std::endl; 155 | std::cout << "join detail: " << std::endl; 156 | std::cout << "compute size time: " << join_detail[0] << std::endl; 157 | std::cout << "reduce + scan time: " << join_detail[1] << std::endl; 158 | std::cout << "fetch result time: " << join_detail[2] << std::endl; 159 | std::cout << "sort time: " << join_detail[3] << std::endl; 160 | std::cout << "build index time: " << join_detail[5] << std::endl; 161 | std::cout << "merge time: " << join_detail[6] << std::endl; 162 | std::cout << "unique time: " << join_detail[4] + join_detail[7] 163 | << std::endl; 164 | 165 | join_op(); 166 | print_memory_usage(); 167 | // deduplicate with full 168 | time_point_begin = std::chrono::high_resolution_clock::now(); 169 | std::cout << "start deduplicate with full ..." 
<< std::endl; 170 | tuple_type *dedup_buf; 171 | cudaMalloc((void **)&dedup_buf, 172 | path_2__1_2->current_full_size * sizeof(tuple_type)); 173 | cudaDeviceSynchronize(); 174 | tuple_type *dedup_buf_end = thrust::set_difference( 175 | thrust::device, path_2__1_2->newt->tuples, 176 | path_2__1_2->newt->tuples + path_2__1_2->newt->tuple_counts, 177 | path_2__1_2->tuple_full, 178 | path_2__1_2->tuple_full + path_2__1_2->current_full_size, dedup_buf, 179 | tuple_indexed_less(path_2__1_2->full->index_column_size, 180 | path_2__1_2->full->arity - 181 | path_2__1_2->dependent_column_size)); 182 | tuple_size_t tp_counts = dedup_buf_end - dedup_buf; 183 | time_point_end = std::chrono::high_resolution_clock::now(); 184 | spent_time = std::chrono::duration_cast>( 185 | time_point_end - time_point_begin) 186 | .count(); 187 | std::cout << "deduplicate with full time: " << spent_time << std::endl; 188 | 189 | // test merge speed 190 | 191 | tuple_type *merge_buf; 192 | std::cout << "start merge test ..." << std::endl; 193 | std::cout << "full size " << path_2__1_2->full->tuple_counts << std::endl; 194 | 195 | double alloc_time = 0; 196 | for (int i = 0; i < REPEAT; i++) { 197 | timer.start_timer(); 198 | cudaMalloc((void **)&merge_buf, (path_2__1_2->full->tuple_counts + 199 | tp_counts) * sizeof(tuple_type)); 200 | timer.stop_timer(); 201 | alloc_time += timer.get_spent_time(); 202 | cudaFree(merge_buf); 203 | merge_buf = nullptr; 204 | } 205 | cudaMalloc((void **)&merge_buf, (path_2__1_2->full->tuple_counts + 206 | tp_counts) * sizeof(tuple_type)); 207 | std::cout << "alloc merge buf time: " << alloc_time << std::endl; 208 | 209 | std::cout << "start merge test 2 ..." << std::endl; 210 | 211 | double resize_time = 0; 212 | for (int i = 0; i < REPEAT; i++) { 213 | thrust::device_vector full_buf_vec(path_2__1_2->full->tuples, path_2__1_2->full->tuples + path_2__1_2->full->tuple_counts); 214 | timer.start_timer(); 215 | full_buf_vec.resize(path_2__1_2->full->tuple_counts+ tp_counts); 216 | timer.stop_timer(); 217 | resize_time += timer.get_spent_time(); 218 | } 219 | 220 | std::cout << "resize merge buf time: " << resize_time << std::endl; 221 | 222 | std::cout << "dedup size " << tp_counts << std::endl; 223 | print_memory_usage(); 224 | timer.start_timer(); 225 | for (int i = 0; i < REPEAT; i++) { 226 | thrust::merge(thrust::device, path_2__1_2->tuple_full, 227 | path_2__1_2->tuple_full + path_2__1_2->current_full_size, 228 | dedup_buf, dedup_buf_end, merge_buf, 229 | tuple_indexed_less(path_2__1_2->full->index_column_size, 230 | path_2__1_2->full->arity)); 231 | } 232 | timer.stop_timer(); 233 | std::cout << "merge int once time: " << timer.get_spent_time() << std::endl; 234 | 235 | // std::cout << "start merge test 2 ..." << std::endl; 236 | // thrust::device_vector full_buf_vec(path_2__1_2->full->tuples, path_2__1_2->full->tuples + path_2__1_2->full->tuple_counts); 237 | // thrust::device_vector dedup_buf_vec(dedup_buf, dedup_buf_end); 238 | // for (int i = 0; i < REPEAT; i++) { 239 | // timer.start_timer(); 240 | // thrust::merge(thrust::device, full_buf_vec.begin(), 241 | // full_buf_vec.end(), 242 | // dedup_buf_vec.begin(), dedup_buf_vec.end(), merge_buf, 243 | // tuple_indexed_less(path_2__1_2->full->index_column_size, 244 | // path_2__1_2->full->arity)); 245 | // timer.stop_timer(); 246 | // } 247 | 248 | // std::cout << "start multi merge test ..." 
<< std::endl; 249 | // tuple_size_t merge_step = 5000; 250 | // time_point_begin = std::chrono::high_resolution_clock::now(); 251 | // for(tuple_size_t i = 0; i < path_2__1_2->full->tuple_counts; i += merge_step) { 252 | // tuple_size_t merge_size = merge_step; 253 | // if (i + merge_step > path_2__1_2->full->tuple_counts) { 254 | // merge_size = path_2__1_2->full->tuple_counts - i; 255 | // } 256 | // cudaDeviceSynchronize(); 257 | // thrust::merge(thrust::device, path_2__1_2->tuple_full + i, 258 | // path_2__1_2->tuple_full + i + merge_size, 259 | // dedup_buf, dedup_buf_end, merge_buf, 260 | // tuple_indexed_less(path_2__1_2->full->index_column_size, 261 | // path_2__1_2->full->arity)); 262 | // } 263 | // cudaDeviceSynchronize(); 264 | // time_point_end = std::chrono::high_resolution_clock::now(); 265 | // spent_time = std::chrono::duration_cast>( 266 | // time_point_end - time_point_begin) 267 | // .count(); 268 | // std::cout << "multi merge time 1: " << spent_time << std::endl; 269 | 270 | // std::cout << "start multi merge test 2 ..." << std::endl; 271 | // timer.start_timer(); 272 | // tuple_type *merge_buf_2; 273 | // cudaMalloc((void **)&merge_buf_2, path_2__1_2->full->tuple_counts * sizeof(tuple_type)); 274 | // tuple_type *merge_buf_3; 275 | // cudaMalloc((void **)&merge_buf_3, path_2__1_2->full->tuple_counts * sizeof(tuple_type)); 276 | // tuple_size_t cur_merged_size = 0; 277 | // print_memory_usage(); 278 | // cudaDeviceSynchronize(); 279 | // time_point_begin = std::chrono::high_resolution_clock::now(); 280 | // for(tuple_size_t i = 0; i < path_2__1_2->full->tuple_counts; i += merge_step) { 281 | // tuple_size_t merge_size = merge_step; 282 | // if (i + merge_step > path_2__1_2->full->tuple_counts) { 283 | // merge_size = path_2__1_2->full->tuple_counts - i; 284 | // } 285 | // thrust::merge(thrust::device, path_2__1_2->tuple_full + i, 286 | // path_2__1_2->tuple_full + i + merge_size, 287 | // merge_buf_2, merge_buf_2 + cur_merged_size, merge_buf_2, 288 | // tuple_indexed_less(path_2__1_2->full->index_column_size, 289 | // path_2__1_2->full->arity)); 290 | // cudaDeviceSynchronize(); 291 | // cur_merged_size += merge_size; 292 | // } 293 | // time_point_end = std::chrono::high_resolution_clock::now(); 294 | // spent_time = std::chrono::duration_cast>( 295 | // time_point_end - time_point_begin) 296 | // .count(); 297 | // std::cout << "multi merge time 2: " << spent_time << std::endl; 298 | cudaFree(merge_buf); 299 | 300 | std::cout << "stupid test .... 
" << std::endl; 301 | thrust::host_vector H1(4); 302 | // initialize individual elements 303 | H1[0] = 14; 304 | H1[1] = 20; 305 | H1[2] = 38; 306 | H1[3] = 46; 307 | thrust::host_vector H2(3); 308 | // initialize individual elements 309 | H2[0] = 12; 310 | H2[1] = 31; 311 | H2[2] = 53; 312 | thrust::device_vector h1_device = H1; 313 | thrust::device_vector h2_device = H2; 314 | // h1_device.resize(7); 315 | thrust::merge(thrust::device, h1_device.begin(), h1_device.begin()+4, 316 | h2_device.begin(), h2_device.end(), h1_device.begin(), thrust::less()); 317 | thrust::host_vector H3 = h1_device; 318 | for (int i = 0; i < H3.size(); i++) { 319 | std::cout << H3[i] << std::endl; 320 | } 321 | 322 | return 0; 323 | } 324 | -------------------------------------------------------------------------------- /test/path_length.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "../include/exception.cuh" 11 | #include "../include/lie.cuh" 12 | #include "../include/print.cuh" 13 | #include "../include/timer.cuh" 14 | 15 | ////////////////////////////////////////////////////// 16 | 17 | long int get_row_size(const char *data_path) { 18 | std::ifstream f; 19 | f.open(data_path); 20 | char c; 21 | long i = 0; 22 | while (f.get(c)) 23 | if (c == '\n') 24 | ++i; 25 | f.close(); 26 | return i; 27 | } 28 | 29 | column_type *get_relation_from_file(const char *file_path, int total_rows, 30 | int total_columns, char separator) { 31 | column_type *data = 32 | (column_type *)malloc(total_rows * total_columns * sizeof(column_type)); 33 | FILE *data_file = fopen(file_path, "r"); 34 | for (int i = 0; i < total_rows; i++) { 35 | for (int j = 0; j < total_columns; j++) { 36 | if (j != (total_columns - 1)) { 37 | fscanf(data_file, "%lld%c", &data[(i * total_columns) + j], 38 | &separator); 39 | } else { 40 | fscanf(data_file, "%lld", &data[(i * total_columns) + j]); 41 | } 42 | } 43 | } 44 | return data; 45 | } 46 | 47 | ////////////////////////////////////////////////////////////////// 48 | 49 | __device__ void reorder_path(tuple_type inner, tuple_type outer, 50 | tuple_type newt) { 51 | newt[0] = inner[1]; 52 | newt[1] = outer[1]; 53 | newt[2] = outer[2] + 1; 54 | }; 55 | __device__ tuple_generator_hook reorder_path_device = reorder_path; 56 | 57 | void analysis_bench(const char *dataset_path, int block_size, int grid_size) { 58 | KernelTimer timer; 59 | int relation_columns = 2; 60 | std::chrono::high_resolution_clock::time_point time_point_begin; 61 | std::chrono::high_resolution_clock::time_point time_point_end; 62 | time_point_begin = std::chrono::high_resolution_clock::now(); 63 | double spent_time; 64 | 65 | // load the raw graph 66 | u64 graph_edge_counts = get_row_size(dataset_path); 67 | std::cout << "Input graph rows: " << graph_edge_counts << std::endl; 68 | // u64 graph_edge_counts = 2100; 69 | column_type *raw_graph_data = 70 | get_relation_from_file(dataset_path, graph_edge_counts, 2, '\t'); 71 | column_type *raw_reverse_graph_data; 72 | u64 raw_reverse_graph_data_mem_size = 73 | graph_edge_counts * 2 * sizeof(column_type); 74 | cudaMallocHost((void **)&raw_reverse_graph_data, 75 | raw_reverse_graph_data_mem_size); 76 | cudaMemset(raw_reverse_graph_data, 0, raw_reverse_graph_data_mem_size); 77 | column_type *raw_path_data; 78 | u64 raw_path_data_mem_size = graph_edge_counts * 3 * sizeof(column_type); 79 | cudaMallocHost((void **)&raw_path_data, 80 | 
raw_path_data_mem_size); 81 | cudaMemset(raw_path_data, 0, raw_path_data_mem_size); 82 | 83 | std::cout << "init path ... " << std::endl; 84 | for (u64 i = 0; i < graph_edge_counts; i++) { 85 | raw_path_data[i * 3] = raw_graph_data[i * 2]; 86 | raw_path_data[i * 3 + 1] = raw_graph_data[i * 2 + 1]; 87 | raw_path_data[i * 3 + 2] = 1; 88 | } 89 | 90 | std::cout << "reversing graph ... " << std::endl; 91 | for (u64 i = 0; i < graph_edge_counts; i++) { 92 | raw_reverse_graph_data[i * 2 + 1] = raw_graph_data[i * 2]; 93 | raw_reverse_graph_data[i * 2] = raw_graph_data[i * 2 + 1]; 94 | } 95 | std::cout << "finish reverse graph." << std::endl; 96 | 97 | timer.start_timer(); 98 | Relation *edge_2__2_1 = new Relation(); 99 | // cudaMallocHost((void **)&edge_2__2_1, sizeof(Relation)); 100 | Relation *path_3__1_2_3 = new Relation(); 101 | path_3__1_2_3->index_flag = false; 102 | // cudaMallocHost((void **)&path_3__1_2_3, sizeof(Relation)); 103 | std::cout << "edge size " << graph_edge_counts << std::endl; 104 | load_relation(path_3__1_2_3, "path_3__1_2_3", 3, raw_path_data, 105 | graph_edge_counts, 1, 1, grid_size, block_size); 106 | load_relation(edge_2__2_1, "edge_2__2_1", 2, raw_reverse_graph_data, 107 | graph_edge_counts, 1, 0, grid_size, block_size); 108 | timer.stop_timer(); 109 | // double kernel_spent_time = timer.get_spent_time(); 110 | std::cout << "Build hash table time: " << timer.get_spent_time() 111 | << std::endl; 112 | 113 | timer.start_timer(); 114 | LIE tc_scc(grid_size, block_size); 115 | tc_scc.add_relations(edge_2__2_1, true); 116 | tc_scc.add_relations(path_3__1_2_3, false); 117 | float join_detail[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; 118 | tuple_generator_hook reorder_path_host; 119 | cudaMemcpyFromSymbol(&reorder_path_host, reorder_path_device, 120 | sizeof(tuple_generator_hook)); 121 | tc_scc.add_ra(RelationalJoin(edge_2__2_1, FULL, path_3__1_2_3, DELTA, 122 | path_3__1_2_3, reorder_path_host, nullptr, 123 | LEFT, grid_size, block_size, join_detail)); 124 | tc_scc.fixpoint_loop(); 125 | timer.stop_timer(); 126 | // print_tuple_rows(path_3__1_2_3->full, "full path"); 127 | std::cout << "PLEN time: " << timer.get_spent_time() << std::endl; 128 | std::cout << "join detail: " << std::endl; 129 | std::cout << "compute size time: " << join_detail[0] << std::endl; 130 | std::cout << "reduce + scan time: " << join_detail[1] << std::endl; 131 | std::cout << "fetch result time: " << join_detail[2] << std::endl; 132 | std::cout << "sort time: " << join_detail[3] << std::endl; 133 | std::cout << "build index time: " << join_detail[5] << std::endl; 134 | std::cout << "merge time: " << join_detail[6] << std::endl; 135 | std::cout << "unique time: " << join_detail[4] + join_detail[7] << std::endl; 136 | } 137 | 138 | int main(int argc, char *argv[]) { 139 | int device_id; 140 | int number_of_sm; 141 | cudaGetDevice(&device_id); 142 | cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 143 | device_id); 144 | std::cout << "num of sm " << number_of_sm << std::endl; 145 | std::cout << "using " << EMPTY_HASH_ENTRY << " as empty hash entry" 146 | << std::endl; 147 | int block_size, grid_size; 148 | block_size = 512; 149 | grid_size = 32 * number_of_sm; 150 | std::locale loc(""); 151 | 152 | analysis_bench(argv[1], block_size, grid_size); 153 | return 0; 154 | } 155 | -------------------------------------------------------------------------------- /test/sg.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 
#include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "../include/exception.cuh" 11 | #include "../include/lie.cuh" 12 | #include "../include/print.cuh" 13 | #include "../include/timer.cuh" 14 | 15 | ////////////////////////////////////////////////////// 16 | 17 | long int get_row_size(const char *data_path) { 18 | std::ifstream f; 19 | f.open(data_path); 20 | char c; 21 | long i = 0; 22 | while (f.get(c)) 23 | if (c == '\n') 24 | ++i; 25 | f.close(); 26 | return i; 27 | } 28 | 29 | enum ColumnT { U64, U32 }; 30 | 31 | column_type *get_relation_from_file(const char *file_path, int total_rows, 32 | int total_columns, char separator, 33 | ColumnT ct) { 34 | column_type *data = 35 | (column_type *)malloc(total_rows * total_columns * sizeof(column_type)); 36 | FILE *data_file = fopen(file_path, "r"); 37 | for (int i = 0; i < total_rows; i++) { 38 | for (int j = 0; j < total_columns; j++) { 39 | if (j != (total_columns - 1)) { 40 | if (ct == U64) { 41 | fscanf(data_file, "%lld%c", &data[(i * total_columns) + j], 42 | &separator); 43 | } else { 44 | fscanf(data_file, "%ld%c", &data[(i * total_columns) + j], 45 | &separator); 46 | } 47 | } else { 48 | if (ct == U64) { 49 | fscanf(data_file, "%lld", &data[(i * total_columns) + j]); 50 | } else { 51 | fscanf(data_file, "%ld", &data[(i * total_columns) + j]); 52 | } 53 | } 54 | } 55 | } 56 | return data; 57 | } 58 | 59 | ////////////////////////////////////////////////////////////////// 60 | 61 | 62 | __device__ void reorder_path(tuple_type inner, tuple_type outer, 63 | tuple_type newt) { 64 | newt[0] = inner[1]; 65 | newt[1] = outer[1]; 66 | }; 67 | __device__ void reorder_path1(tuple_type inner, tuple_type outer, 68 | tuple_type newt) { 69 | newt[0] = outer[1]; 70 | newt[1] = inner[1]; 71 | }; 72 | 73 | // sg(x, y) :- edge(a, x), edge(b, y), sg(a, b) 74 | // __device__ void reorder_path1_3arity(tuple_type inner1, tuple_type inner2, tuple_type outer, 75 | // tuple_type newt) { 76 | // newt[0] = inner1[1]; 77 | // newt[1] = inner2[1]; 78 | // }; 79 | 80 | __device__ tuple_generator_hook reorder_path_device = reorder_path; 81 | __device__ tuple_generator_hook reorder_path1_device = reorder_path1; 82 | // __device__ tuple_generator_hook reorder_path1_3arity_device = reorder_path1_3arity; 83 | 84 | __device__ void cp_1(tuple_type src, tuple_type dest) { 85 | dest[0] = src[1]; 86 | dest[1] = src[0]; 87 | } 88 | __device__ tuple_copy_hook cp_1_device = cp_1; 89 | 90 | __device__ bool tuple_pred_eq_11(tuple_type t) { return t[0] != t[1]; } 91 | __device__ tuple_predicate tuple_pred_eq_11_device = tuple_pred_eq_11; 92 | 93 | void analysis_bench(const char *dataset_path, int block_size, int grid_size) { 94 | KernelTimer timer; 95 | int relation_columns = 2; 96 | std::chrono::high_resolution_clock::time_point time_point_begin; 97 | std::chrono::high_resolution_clock::time_point time_point_end; 98 | time_point_begin = std::chrono::high_resolution_clock::now(); 99 | double spent_time; 100 | 101 | // load the raw graph 102 | tuple_size_t graph_edge_counts = get_row_size(dataset_path); 103 | std::cout << "Input graph rows: " << graph_edge_counts << std::endl; 104 | // u64 graph_edge_counts = 2100; 105 | column_type *raw_graph_data = 106 | get_relation_from_file(dataset_path, graph_edge_counts, 2, '\t', U32); 107 | column_type *raw_reverse_graph_data = 108 | (column_type *)malloc(graph_edge_counts * 2 * sizeof(column_type)); 109 | std::cout << "reversing graph ... 
" << std::endl; 110 | for (tuple_size_t i = 0; i < graph_edge_counts; i++) { 111 | raw_reverse_graph_data[i * 2 + 1] = raw_graph_data[i * 2]; 112 | raw_reverse_graph_data[i * 2] = raw_graph_data[i * 2 + 1]; 113 | } 114 | std::cout << "finish reverse graph." << std::endl; 115 | 116 | timer.start_timer(); 117 | Relation *edge_2__1_2 = new Relation(); 118 | Relation *edge_2__2_1 = new Relation(); 119 | load_relation(edge_2__2_1, "edge_2__2_1", 2, raw_reverse_graph_data, 120 | graph_edge_counts, 1, 0, grid_size, block_size); 121 | // cudaMallocHost((void **)&edge_2__2_1, sizeof(Relation)); 122 | Relation *sg_2__1_2 = new Relation(); 123 | sg_2__1_2->index_flag = false; 124 | // cudaMallocHost((void **)&path_2__1_2, sizeof(Relation)); 125 | std::cout << "edge size " << graph_edge_counts << std::endl; 126 | load_relation(sg_2__1_2, "sg_2__2_1", 2, nullptr, 0, 1, 0, grid_size, 127 | block_size); 128 | load_relation(edge_2__1_2, "edge_2__1_2", 2, raw_graph_data, 129 | graph_edge_counts, 1, 0, grid_size, block_size); 130 | timer.stop_timer(); 131 | // double kernel_spent_time = timer.get_spent_time(); 132 | std::cout << "Build hash table time: " << timer.get_spent_time() 133 | << std::endl; 134 | float join_detail[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; 135 | timer.start_timer(); 136 | LIE init_scc(grid_size, block_size); 137 | init_scc.add_relations(edge_2__1_2, true); 138 | init_scc.add_relations(sg_2__1_2, false); 139 | // sg(x, y) :- edge(p, x), edge(p, y), x != y. 140 | // sg:y,x 141 | tuple_generator_hook reorder_path_host; 142 | cudaMemcpyFromSymbol(&reorder_path_host, reorder_path_device, 143 | sizeof(tuple_generator_hook)); 144 | tuple_predicate tuple_pred_eq_11_host; 145 | cudaMemcpyFromSymbol(&tuple_pred_eq_11_host, tuple_pred_eq_11_device, 146 | sizeof(tuple_predicate)); 147 | init_scc.add_ra(RelationalJoin( 148 | edge_2__1_2, FULL, edge_2__1_2, FULL, sg_2__1_2, reorder_path_host, 149 | tuple_pred_eq_11_host, LEFT, grid_size, block_size, join_detail)); 150 | init_scc.fixpoint_loop(); 151 | timer.stop_timer(); 152 | std::cout << "sg init counts " << sg_2__1_2->full->tuple_counts 153 | << std::endl; 154 | std::cout << "sg init time: " << timer.get_spent_time() << std::endl; 155 | 156 | LIE sg_lie(grid_size, block_size); 157 | Relation *tmp = new Relation(); 158 | load_relation(tmp, "tmp", 2, nullptr, 0, 1, 0, grid_size, block_size); 159 | tmp->index_flag = false; 160 | sg_lie.add_relations(edge_2__1_2, true); 161 | sg_lie.add_relations(sg_2__1_2, false); 162 | 163 | sg_lie.add_tmp_relation(tmp); 164 | // sg(x, y) :- edge(a, x), sg(a, b), edge(b, y). 165 | // tmp(b,x) :- edge(a, x), sg(a, b). 166 | tuple_generator_hook reorder_path1_host; 167 | cudaMemcpyFromSymbol(&reorder_path1_host, reorder_path1_device, 168 | sizeof(tuple_generator_hook)); 169 | sg_lie.add_ra(RelationalJoin(edge_2__1_2, FULL, sg_2__1_2, DELTA, tmp, 170 | reorder_path1_host, nullptr, LEFT, grid_size, 171 | block_size, join_detail)); 172 | // sg(x, y) :- edge(b, y), tmp(b, x). 
173 | sg_lie.add_ra(RelationalJoin(edge_2__1_2, FULL, tmp, NEWT, sg_2__1_2, 174 | reorder_path1_host, nullptr, LEFT, grid_size, 175 | block_size, join_detail)); 176 | timer.start_timer(); 177 | sg_lie.fixpoint_loop(); 178 | timer.stop_timer(); 179 | std::cout << "sg counts " << sg_2__1_2->full->tuple_counts << std::endl; 180 | std::cout << "sg time: " << timer.get_spent_time() << std::endl; 181 | 182 | std::cout << "join detail: " << std::endl; 183 | std::cout << "compute size time: " << join_detail[0] << std::endl; 184 | std::cout << "reduce + scan time: " << join_detail[1] << std::endl; 185 | std::cout << "fetch result time: " << join_detail[2] << std::endl; 186 | std::cout << "sort time: " << join_detail[3] << std::endl; 187 | std::cout << "build index time: " << join_detail[5] << std::endl; 188 | std::cout << "merge time: " << join_detail[6] << std::endl; 189 | std::cout << "unique time: " << join_detail[4] + join_detail[7] 190 | << std::endl; 191 | } 192 | 193 | int main(int argc, char *argv[]) { 194 | int device_id; 195 | int number_of_sm; 196 | cudaGetDevice(&device_id); 197 | cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 198 | device_id); 199 | std::cout << "num of sm " << number_of_sm << std::endl; 200 | std::cout << "using " << EMPTY_HASH_ENTRY << " as empty hash entry" 201 | << std::endl; 202 | int block_size, grid_size; 203 | block_size = 512; 204 | grid_size = 32 * number_of_sm; 205 | std::locale loc(""); 206 | 207 | analysis_bench(argv[1], block_size, grid_size); 208 | return 0; 209 | } 210 | -------------------------------------------------------------------------------- /test/sort.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "../include/relation.cuh" 14 | #include "../include/timer.cuh" 15 | #include "../include/relational_algebra.cuh" 16 | #include "../include/print.cuh" 17 | 18 | #define EMPTY_HASH_ENTRY ULLONG_MAX 19 | 20 | using u64 = unsigned long long; 21 | using u32 = unsigned long; 22 | 23 | using column_type = u32; 24 | using tuple_type = column_type *; 25 | using tuple_size_t = u64; 26 | using t_data_internal = u64 *; 27 | 28 | typedef void (*tuple_generator_hook)(tuple_type, tuple_type, tuple_type); 29 | typedef void (*tuple_copy_hook)(tuple_type, tuple_type); 30 | typedef bool (*tuple_predicate)(tuple_type); 31 | 32 | // struct tuple_generator_hook { 33 | // __host__ __device__ 34 | // void operator()(tuple_type inner, tuple_type outer, tuple_type newt) {}; 35 | // }; 36 | 37 | 38 | // 32 bit version of fnv1-a 39 | __host__ __device__ inline u32 prefix_hash_32(tuple_type start_ptr, 40 | u64 prefix_len) { 41 | const u32 base = 2166136261U; 42 | const u32 prime = 16777619U; 43 | 44 | u32 hash = base; 45 | for (u64 i = 0; i < prefix_len; ++i) { 46 | u32 chunk = (u32)start_ptr[i]; 47 | hash ^= chunk & 255U; 48 | hash *= prime; 49 | for (char j = 0; j < 3; ++j) { 50 | chunk = chunk >> 8; 51 | hash ^= chunk & 255U; 52 | hash *= prime; 53 | } 54 | } 55 | return hash; 56 | } 57 | 58 | // 32bit xxhash version prefix hash 59 | __host__ __device__ inline u32 prefix_hash_xxhash_32(tuple_type start_ptr, 60 | u64 prefix_len) { 61 | const u32 prime = 2654435761U; 62 | u32 hash = 0; 63 | for (u64 i = 0; i < prefix_len; ++i) { 64 | u32 chunk = (u32)start_ptr[i]; 65 | hash += chunk * prime; 66 | hash += (hash << 13); 67 | hash ^= (hash >> 7); 68 | hash += 
(hash << 3); 69 | hash ^= (hash >> 17); 70 | hash += (hash << 5); 71 | } 72 | return hash; 73 | } 74 | 75 | long int get_row_size(const char *data_path) { 76 | std::ifstream f; 77 | f.open(data_path); 78 | char c; 79 | long i = 0; 80 | while (f.get(c)) 81 | if (c == '\n') 82 | ++i; 83 | f.close(); 84 | return i; 85 | } 86 | 87 | enum ColumnT { U64, U32 }; 88 | 89 | column_type *get_relation_from_file(const char *file_path, int total_rows, 90 | int total_columns, char separator, 91 | ColumnT ct) { 92 | column_type *data = 93 | (column_type *)malloc(total_rows * total_columns * sizeof(column_type)); 94 | FILE *data_file = fopen(file_path, "r"); 95 | for (int i = 0; i < total_rows; i++) { 96 | for (int j = 0; j < total_columns; j++) { 97 | if (j != (total_columns - 1)) { 98 | if (ct == U64) { 99 | fscanf(data_file, "%lld%c", &data[(i * total_columns) + j], 100 | &separator); 101 | } else { 102 | fscanf(data_file, "%ld%c", &data[(i * total_columns) + j], 103 | &separator); 104 | } 105 | } else { 106 | if (ct == U64) { 107 | fscanf(data_file, "%lld", &data[(i * total_columns) + j]); 108 | } else { 109 | fscanf(data_file, "%ld", &data[(i * total_columns) + j]); 110 | } 111 | } 112 | } 113 | } 114 | return data; 115 | } 116 | 117 | // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 118 | 119 | // Number of bits per pass 120 | const int BITS_PER_PASS = 4; 121 | 122 | // Number of bins per pass 123 | const int BINS_PER_PASS = 1 << BITS_PER_PASS; 124 | 125 | // Number of threads per block 126 | const int THREADS_PER_BLOCK = 256; 127 | 128 | // Radix sort kernel 129 | __global__ void radix_sort_kernel(u32 *data, int *temp, int *histogram, 130 | int num_elements, int pass) { 131 | // Compute the global thread ID 132 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 133 | 134 | // Compute the local thread ID within the warp 135 | int lane = threadIdx.x & 31; 136 | 137 | // Compute the histogram index for this thread 138 | int index = (data[tid] >> (pass * BITS_PER_PASS)) & (BINS_PER_PASS - 1); 139 | 140 | // Compute the starting index for this bin in the temp array 141 | int start = histogram[index * blockDim.x + lane]; 142 | 143 | // Compute the ending index for this bin in the temp array 144 | int end = start + histogram[index * blockDim.x + blockDim.x - 1]; 145 | 146 | // Copy the element to the temp array 147 | temp[start + lane] = data[tid]; 148 | 149 | // Increment the histogram count for this bin 150 | atomicAdd(&histogram[index * blockDim.x + lane], 1); 151 | 152 | // Wait for all threads to finish updating the histogram 153 | __syncthreads(); 154 | 155 | // Compute the starting index for this thread's bin in the temp array 156 | start = histogram[index * blockDim.x + lane]; 157 | 158 | // Copy the element to the temp array 159 | temp[start + lane] = data[tid]; 160 | 161 | // Wait for all threads to finish copying to the temp array 162 | __syncthreads(); 163 | 164 | // Update the data array with the sorted elements 165 | data[tid] = temp[tid]; 166 | } 167 | 168 | // Radix sort function 169 | void radix_sort(column_type *data, int arity, int num_elements) { 170 | // Allocate memory for the temp array and histogram 171 | int max_threads_per_block; 172 | cudaDeviceGetAttribute(&max_threads_per_block, cudaDevAttrMaxThreadsPerBlock, 0); 173 | int *temp, *histogram; 174 | cudaMalloc(&temp, num_elements * sizeof(int)); 175 | cudaMalloc(&histogram, BINS_PER_PASS * THREADS_PER_BLOCK * sizeof(int)); 176 | 177 | // Initialize the histogram to zero 178 | cudaMemset(histogram, 
0, BINS_PER_PASS * THREADS_PER_BLOCK * sizeof(int)); 179 | column_type pass_cnt = sizeof(column_type) * 8 * arity / BITS_PER_PASS; 180 | 181 | // Perform the radix sort passes 182 | for (column_type pass = 0; pass < pass_cnt; pass++) { 183 | // Launch the radix sort kernel 184 | radix_sort_kernel<<<(num_elements + THREADS_PER_BLOCK - 1) / 185 | THREADS_PER_BLOCK, 186 | THREADS_PER_BLOCK>>>(data+arity, temp, histogram, 187 | num_elements, pass); 188 | 189 | // Clear the histogram for the next pass 190 | cudaMemset(histogram, 0, 191 | BINS_PER_PASS * THREADS_PER_BLOCK * sizeof(int)); 192 | } 193 | 194 | // Free the memory 195 | cudaFree(temp); 196 | cudaFree(histogram); 197 | } 198 | 199 | struct t_equal_n { 200 | u64 arity; 201 | tuple_type rhs; 202 | 203 | t_equal_n(tuple_size_t arity, tuple_type target) { this->arity = arity; this->rhs = target; } 204 | 205 | __host__ __device__ bool operator()(tuple_type lhs) { 206 | for (int i = 0; i < arity; i++) { 207 | if (lhs[i] != rhs[i]) { 208 | return false; 209 | } 210 | } 211 | return true; 212 | } 213 | }; 214 | 215 | __device__ void reorder_path(tuple_type inner, tuple_type outer, 216 | tuple_type newt) { 217 | newt[0] = inner[1]; 218 | newt[1] = outer[1]; 219 | }; 220 | __device__ tuple_generator_hook reorder_path_device = reorder_path; 221 | 222 | int main(int argc, char *argv[]) { 223 | auto dataset_path = argv[1]; 224 | int device_id; 225 | int number_of_sm; 226 | cudaGetDevice(&device_id); 227 | cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 228 | device_id); 229 | int max_threads_per_block; 230 | cudaDeviceGetAttribute(&max_threads_per_block, cudaDevAttrMaxThreadsPerBlock, 0); 231 | std::cout << "num of sm " << number_of_sm << " num of thread per block " << max_threads_per_block << std::endl; 232 | std::cout << "using " << EMPTY_HASH_ENTRY << " as empty hash entry" 233 | << std::endl;; 234 | int block_size, grid_size; 235 | block_size = 512; 236 | grid_size = 32 * number_of_sm; 237 | std::locale loc(""); 238 | 239 | int relation_columns = 2; 240 | std::chrono::high_resolution_clock::time_point time_point_begin; 241 | std::chrono::high_resolution_clock::time_point time_point_end; 242 | time_point_begin = std::chrono::high_resolution_clock::now(); 243 | double spent_time; 244 | 245 | // load the raw graph 246 | tuple_size_t graph_edge_counts = get_row_size(dataset_path); 247 | std::cout << "Input graph rows: " << graph_edge_counts << std::endl; 248 | // u64 graph_edge_counts = 2100; 249 | column_type *raw_graph_data = 250 | get_relation_from_file(dataset_path, graph_edge_counts, 2, '\t', U32); 251 | column_type *raw_reverse_graph_data = 252 | (column_type *)malloc(graph_edge_counts * 2 * sizeof(column_type)); 253 | std::cout << "reversing graph ... " << std::endl; 254 | for (tuple_size_t i = 0; i < graph_edge_counts; i++) { 255 | raw_reverse_graph_data[i * 2 + 1] = raw_graph_data[i * 2]; 256 | raw_reverse_graph_data[i * 2] = raw_graph_data[i * 2 + 1]; 257 | } 258 | std::cout << "finish reverse graph." 
<< std::endl; 259 | 260 | // copy the graph to device 261 | column_type *d_graph_data; 262 | cudaMalloc((void **)&d_graph_data, 263 | graph_edge_counts * relation_columns * sizeof(column_type)); 264 | cudaMemcpy(d_graph_data, raw_graph_data, 265 | graph_edge_counts * relation_columns * sizeof(column_type), 266 | cudaMemcpyHostToDevice); 267 | 268 | int REPEAT = 1; 269 | // init the tuples 270 | tuple_type *tuples; 271 | cudaMalloc(&tuples, graph_edge_counts * sizeof(tuple_type)); 272 | time_point_begin = std::chrono::high_resolution_clock::now(); 273 | for (int i = 0; i < REPEAT; i++) { 274 | init_tuples_unsorted<<>>( 275 | tuples, d_graph_data, relation_columns, graph_edge_counts); 276 | } 277 | cudaDeviceSynchronize(); 278 | time_point_end = std::chrono::high_resolution_clock::now(); 279 | spent_time = std::chrono::duration_cast>( 280 | time_point_end - time_point_begin) 281 | .count(); 282 | std::cout << "init tuples time: " << spent_time << std::endl; 283 | column_type *tuple_hashvs; 284 | cudaMalloc((void **)&tuple_hashvs, graph_edge_counts * sizeof(column_type)); 285 | column_type *col_tmp; 286 | cudaMalloc((void **)&col_tmp, graph_edge_counts * sizeof(column_type)); 287 | 288 | time_point_end = std::chrono::high_resolution_clock::now(); 289 | // compute hash for tuples 290 | for (int i = 0; i < REPEAT; i++) { 291 | compute_hash<<>>(tuples, graph_edge_counts, 1, 292 | tuple_hashvs); 293 | cudaDeviceSynchronize(); 294 | } 295 | time_point_end = std::chrono::high_resolution_clock::now(); 296 | spent_time = std::chrono::duration_cast>( 297 | time_point_end - time_point_begin) 298 | .count(); 299 | std::cout << "compute hash time: " << spent_time << std::endl; 300 | 301 | // sort the tuples using thrust 302 | double sort_hash_time = 0; 303 | for (int i = 0; i < REPEAT; i++) { 304 | time_point_begin = std::chrono::high_resolution_clock::now(); 305 | 306 | extract_column<<>>(tuples, graph_edge_counts, 1, 307 | col_tmp); 308 | cudaDeviceSynchronize(); 309 | thrust::stable_sort_by_key(thrust::device, col_tmp, 310 | col_tmp + graph_edge_counts, tuples); 311 | cudaDeviceSynchronize(); 312 | extract_column<<>>(tuples, graph_edge_counts, 0, 313 | col_tmp); 314 | cudaDeviceSynchronize(); 315 | thrust::stable_sort_by_key(thrust::device, col_tmp, 316 | col_tmp + graph_edge_counts, tuples); 317 | compute_hash<<>>(tuples, graph_edge_counts, 1, 318 | tuple_hashvs); 319 | cudaDeviceSynchronize(); 320 | thrust::stable_sort_by_key(thrust::device, tuple_hashvs, 321 | tuple_hashvs + graph_edge_counts, tuples); 322 | cudaDeviceSynchronize(); 323 | time_point_end = std::chrono::high_resolution_clock::now(); 324 | sort_hash_time += 325 | std::chrono::duration_cast>( 326 | time_point_end - time_point_begin) 327 | .count(); 328 | // print_tuple_list(tuples, graph_edge_counts, 2); 329 | // recover prepare for next sort 330 | init_tuples_unsorted<<>>( 331 | tuples, d_graph_data, relation_columns, graph_edge_counts); 332 | } 333 | std::cout << "sort hash time: " << sort_hash_time << std::endl; 334 | 335 | // sort the tuples using thrust with tuple_indexed_less 336 | double sort_comp_time = 0; 337 | for (int i = 0; i < REPEAT; i++) { 338 | time_point_begin = std::chrono::high_resolution_clock::now(); 339 | thrust::sort(thrust::device, tuples, tuples + graph_edge_counts, 340 | tuple_indexed_less(1, 2)); 341 | cudaDeviceSynchronize(); 342 | time_point_end = std::chrono::high_resolution_clock::now(); 343 | sort_comp_time += 344 | std::chrono::duration_cast>( 345 | time_point_end - time_point_begin) 346 | .count(); 347 
| // print_tuple_list(tuples, graph_edge_counts, 2); 348 | init_tuples_unsorted<<>>( 349 | tuples, d_graph_data, relation_columns, graph_edge_counts); 350 | } 351 | std::cout << "sort using tuple_indexed_less time: " << sort_comp_time 352 | << std::endl; 353 | 354 | 355 | // load raw data into edge relation 356 | time_point_begin = std::chrono::high_resolution_clock::now(); 357 | Relation *edge_2__2_1 = new Relation(); 358 | // cudaMallocHost((void **)&edge_2__2_1, sizeof(Relation)); 359 | Relation *path_2__1_2 = new Relation(); 360 | path_2__1_2->index_flag = false; 361 | // cudaMallocHost((void **)&path_2__1_2, sizeof(Relation)); 362 | std::cout << "edge size " << graph_edge_counts << std::endl; 363 | load_relation(path_2__1_2, "path_2__1_2", 2, raw_graph_data, 364 | graph_edge_counts, 1, 0, grid_size, block_size); 365 | load_relation(edge_2__2_1, "edge_2__2_1", 2, raw_reverse_graph_data, 366 | graph_edge_counts, 1, 0, grid_size, block_size); 367 | time_point_end = std::chrono::high_resolution_clock::now(); 368 | // double kernel_spent_time = timer.get_spent_time(); 369 | double init_relation_time = 370 | std::chrono::duration_cast>( 371 | time_point_end - time_point_begin) 372 | .count(); 373 | std::cout << "Build hash table time: " << init_relation_time << std::endl; 374 | 375 | tuple_generator_hook reorder_path_host; 376 | cudaMemcpyFromSymbol(&reorder_path_host, reorder_path_device, 377 | sizeof(tuple_generator_hook)); 378 | float join_detail[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; 379 | RelationalJoin join_test(edge_2__2_1, FULL, path_2__1_2, FULL, path_2__1_2, 380 | reorder_path_host, nullptr, LEFT, grid_size, 381 | block_size, join_detail); 382 | time_point_begin = std::chrono::high_resolution_clock::now(); 383 | join_test(); 384 | time_point_end = std::chrono::high_resolution_clock::now(); 385 | double join_test_time = 386 | std::chrono::duration_cast>( 387 | time_point_end - time_point_begin) 388 | .count(); 389 | std::cout << "join test time: " << join_test_time << std::endl; 390 | std::cout << "join detail: " << std::endl; 391 | std::cout << "compute size time: " << join_detail[0] << std::endl; 392 | std::cout << "reduce + scan time: " << join_detail[1] << std::endl; 393 | std::cout << "fetch result time: " << join_detail[2] << std::endl; 394 | std::cout << "sort time: " << join_detail[3] << std::endl; 395 | std::cout << "build index time: " << join_detail[5] << std::endl; 396 | std::cout << "merge time: " << join_detail[6] << std::endl; 397 | std::cout << "unique time: " << join_detail[4] + join_detail[7] << std::endl; 398 | // test thrust set_difference time on path's newt and full 399 | tuple_type* deduped_tuples; 400 | cudaMalloc(&deduped_tuples, path_2__1_2->newt->tuple_counts * sizeof(tuple_type)); 401 | 402 | time_point_begin = std::chrono::high_resolution_clock::now(); 403 | for (int i = 0; i < 10; i++) { 404 | thrust::set_difference(thrust::device, path_2__1_2->newt->tuples, 405 | path_2__1_2->newt->tuples + path_2__1_2->newt->tuple_counts, 406 | path_2__1_2->full->tuples, path_2__1_2->full->tuples + path_2__1_2->full->tuple_counts, 407 | deduped_tuples, tuple_indexed_less(1, 2)); 408 | cudaDeviceSynchronize(); 409 | } 410 | time_point_end = std::chrono::high_resolution_clock::now(); 411 | double set_difference_time = 412 | std::chrono::duration_cast>( 413 | time_point_end - time_point_begin) 414 | .count(); 415 | std::cout << "set_difference time: " << set_difference_time << std::endl; 416 | 417 | // sequential set_difference 418 | tuple_type* deduped_tuples_seq; 419 | 
cudaMalloc(&deduped_tuples_seq, path_2__1_2->newt->tuple_counts * sizeof(tuple_type)); 420 | time_point_begin = std::chrono::high_resolution_clock::now(); 421 | for (int i = 0; i < 10; i++) { 422 | tuple_type* full_t_end = path_2__1_2->full->tuples + path_2__1_2->full->tuple_counts; 423 | for (auto i = 0; i < path_2__1_2->newt->tuple_counts ; i++) { 424 | auto cur_newt_tuple = path_2__1_2->newt->tuples[i]; 425 | 426 | auto res =thrust::find_if(thrust::device, path_2__1_2->full->tuples, path_2__1_2->full->tuples + path_2__1_2->full->tuple_counts, 427 | t_equal_n(path_2__1_2->arity, cur_newt_tuple)); 428 | cudaDeviceSynchronize(); 429 | if (res != full_t_end) { 430 | deduped_tuples_seq[i] = cur_newt_tuple; 431 | } 432 | } 433 | } 434 | time_point_end = std::chrono::high_resolution_clock::now(); 435 | double set_difference_time_seq = 436 | std::chrono::duration_cast>( 437 | time_point_end - time_point_begin) 438 | .count(); 439 | std::cout << "set_difference time seq: " << set_difference_time_seq << std::endl; 440 | 441 | return 0; 442 | } 443 | -------------------------------------------------------------------------------- /test/souffle/bip.dl: -------------------------------------------------------------------------------- 1 | .decl edge(v1:symbol, v2:symbol) 2 | .input edge 3 | .decl matching(v1:symbol, v2:symbol) choice-domain v1, v2 4 | .decl notBipartiteMatching() 5 | 6 | matching(x,y) :- edge(x,y). 7 | 8 | // No two edges share an endpoint. 9 | notBipartiteMatching() :- matching(x, y), matching(x, z), z != y. 10 | notBipartiteMatching() :- matching(y, x), matching(z, x), z != y. 11 | 12 | .printsize notBipartiteMatching 13 | -------------------------------------------------------------------------------- /test/souffle/choice_total.dl: -------------------------------------------------------------------------------- 1 | .decl domain(x:symbol) 2 | .input domain 3 | .decl list(prev:symbol, data:symbol) choice-domain prev, data 4 | .decl notTotalOrder() 5 | .printsize notTotalOrder 6 | 7 | list("nil", "head"). 8 | list(x,y) :- domain(y), list(_,x). 9 | 10 | // every node has only one sucessor. 11 | notTotalOrder() :- list(p, x), list(p, y), x != y. 12 | // every node has only one predecessor. 13 | notTotalOrder() :- list(pa, x), list(pb, x), pa != pb. 14 | // every node is in the list exactly once. 15 | notTotalOrder() :- domain(x), !list(x,_), !list(_,x). 16 | -------------------------------------------------------------------------------- /test/souffle/cspa: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harp-lab/gdlog/65a6ee960ced8d04bc725ccdfa68f004f8479226/test/souffle/cspa -------------------------------------------------------------------------------- /test/souffle/cspa.dl: -------------------------------------------------------------------------------- 1 | // 2,1 2 | .decl assign(src: number, dest: number) 3 | .input assign(IO=file, deliminator="\t") 4 | // 1,2 2,1 5 | .decl dereference(src: number, dest: number) 6 | .input dereference(IO=file, deliminator="\t") 7 | 8 | //1,2 2,1 9 | .decl ValueFlow(src: number, dest: number) 10 | .printsize ValueFlow 11 | .output ValueFlow 12 | 13 | // 1,2 2,1 14 | .decl ValueAlias(src: number, dest: number) 15 | .printsize ValueAlias 16 | .output ValueAlias 17 | 18 | // 1,2 19 | .decl MemoryAlias(src: number, dest: number) 20 | .printsize MemoryAlias 21 | 22 | .decl tmp(src: number, dest: number) 23 | .printsize tmp 24 | 25 | ValueFlow(x, y) :- ValueFlow(x, z), ValueFlow(z, y). 
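// Recursive rules: ValueFlow is closed under composition (rule above); ValueAlias
// holds for pairs reachable from a common ValueFlow source, optionally through a
// MemoryAlias step; MemoryAlias relates the dereference targets of value-aliased
// pointers. The base and reflexive rules over assign follow the recursive ones.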
26 | ValueAlias(x, y) :- ValueFlow(z, x), ValueFlow(z, y). 27 | ValueFlow(x, y) :- assign(x, z), MemoryAlias(z, y). 28 | // tmp(z, x) :- dereference(y, x), ValueAlias(y, z). 29 | MemoryAlias(x, w) :- dereference(y, x), ValueAlias(y, z), dereference(z, w). 30 | // MemoryAlias(x, w) :- tmp(z, x), dereference(z, w). 31 | ValueAlias(x, y) :- ValueFlow(z, x), MemoryAlias(z, w), ValueFlow(w, y). 32 | 33 | 34 | ValueFlow(y, x) :- assign(y, x). 35 | ValueFlow(x, x) :- assign(x, y). 36 | ValueFlow(x, x) :- assign(y, x). 37 | 38 | MemoryAlias(x, x) :- assign(y, x). 39 | MemoryAlias(x, x) :- assign(x, y). 40 | -------------------------------------------------------------------------------- /test/souffle/cspa.slog: -------------------------------------------------------------------------------- 1 | ; ValueFlow(y, x) :- Assign(y, x). 2 | ; ValueFlow(x, y) :- Assign(x, z), MemoryAlias(z, y). 3 | ; ValueFlow(x, y) :- ValueFlow(x, z), ValueFlow(z, y). 4 | ; MemoryAlias(x, w) :- Dereference(y, x), ValueAlias(y, z), Dereference(z, w). 5 | ; ValueAlias(x, y) :- ValueFlow(z, x), ValueFlow(z, y). 6 | ; ValueAlias(x, y) :- ValueFlow(z, x), MemoryAlias(z, w),ValueFlow(w, y). 7 | ; ValueFlow(x, x) :- Assign(x, y). 8 | ; ValueFlow(x, x) :- Assign(y, x). 9 | ; MemoryAlias(x, x) :- Assign(y, x). 10 | ; MemoryAlias(x, x) :- Assign(x, y). 11 | 12 | [(value-flow y x) <-- (assign y x)] 13 | [(value-flow x y) <-- (assign x z) (memory-alias z y)] 14 | [(value-flow x y) <-- (value-flow x z) (value-flow z y)] 15 | [(memory-alias x w) <-- (dereference y x) (value-alias y z) -- (dereference z w)] 16 | [(value-alias x y) <-- (value-flow z x) (value-flow z y)] 17 | [(value-alias x y) <-- (memory-alias z w) (value-flow w y) -- (value-flow z x)] 18 | [(value-flow x x) <-- (assign x y)] 19 | [(value-flow x x) <-- (assign y x)] 20 | [(memory-alias x x) <-- (assign y x)] 21 | [(memory-alias x x) <-- (assign x y)] 22 | -------------------------------------------------------------------------------- /test/souffle/cspa.slogc: -------------------------------------------------------------------------------- 1 | ( 2 | slog-prog 3 | ((relation-decl rel__value__alias__2__1 value-alias 1 #f 2 (1)) 4 | (relation-decl rel__value__flow__2__1 value-flow 1 #f 2 (1)) 5 | (relation-decl rel__value__flow__2__1__2 value-flow 2 #t 2 (1 2)) 6 | (relation-decl 7 | rel___dollorrule7__inter__body__2__2 8 | $rule7-inter-body 9 | 1 10 | #f 11 | 2 12 | (2)) 13 | (relation-decl rel__dereference__2__1__2 dereference 2 #t 2 (1 2)) 14 | (relation-decl rel__value__flow__2__2 value-flow 1 #f 2 (2)) 15 | (relation-decl rel__assign__2__2 assign 1 #f 2 (2)) 16 | (relation-decl rel__assign__2__1__2 assign 2 #t 2 (1 2)) 17 | (relation-decl rel__value__alias__2__1__2 value-alias 2 #t 2 (1 2)) 18 | (relation-decl rel__memory__alias__2__1 memory-alias 1 #f 2 (1)) 19 | (relation-decl rel__memory__alias__2__2 memory-alias 1 #f 2 (2)) 20 | (relation-decl 21 | rel___dollorrule7__inter__body__2__1__2 22 | $rule7-inter-body 23 | 2 24 | #t 25 | 2 26 | (1 2)) 27 | (relation-decl 28 | rel___dollorrule10__inter__body__2__1__2 29 | $rule10-inter-body 30 | 2 31 | #t 32 | 2 33 | (1 2)) 34 | (relation-decl rel__dereference__2__1 dereference 1 #f 2 (1)) 35 | (relation-decl rel__memory__alias__2__1__2 memory-alias 2 #t 2 (1 2)) 36 | (relation-decl 37 | rel___dollorrule10__inter__body__2__2 38 | $rule10-inter-body 39 | 1 40 | #f 41 | 2 42 | (2))) 43 | ((scc-decl 44 | scc0 45 | 0 46 | #f 47 | ((scc-rel rel__assign__2__1__2 #f #f) 48 | (scc-rel rel__value__flow__2__1__2 #t #f)) 49 | ((copy 
rel__value__flow__2__1__2 rel__assign__2__1__2 FULL (1 1)))) 50 | (scc-decl 51 | scc1 52 | 1 53 | #f 54 | ((scc-rel rel__assign__2__1__2 #f #f) 55 | (scc-rel rel__memory__alias__2__1__2 #t #f)) 56 | ((copy rel__memory__alias__2__1__2 rel__assign__2__1__2 FULL (1 1)))) 57 | (scc-decl 58 | scc2 59 | 2 60 | #f 61 | ((scc-rel rel__assign__2__1__2 #f #f) 62 | (scc-rel rel__memory__alias__2__1__2 #t #f)) 63 | ((copy rel__memory__alias__2__1__2 rel__assign__2__1__2 FULL (0 0)))) 64 | (scc-decl 65 | scc3 66 | 3 67 | #f 68 | ((scc-rel rel__assign__2__1__2 #f #f) 69 | (scc-rel rel__value__flow__2__1__2 #t #f)) 70 | ((copy rel__value__flow__2__1__2 rel__assign__2__1__2 FULL (0 1)))) 71 | (scc-decl 72 | scc4 73 | 4 74 | #f 75 | ((scc-rel rel__dereference__2__1 #t #f) 76 | (scc-rel rel__dereference__2__1__2 #t #f)) 77 | ((acopy rel__dereference__2__1 rel__dereference__2__1__2 DELTA (0 2 1)))) 78 | (scc-decl 79 | scc5 80 | 5 81 | #f 82 | ((scc-rel rel__assign__2__2 #t #f) (scc-rel rel__assign__2__1__2 #t #f)) 83 | ((acopy rel__assign__2__2 rel__assign__2__1__2 DELTA (1 2 0)))) 84 | (scc-decl 85 | scc6 86 | 6 87 | #f 88 | ((scc-rel rel__assign__2__1__2 #f #f) 89 | (scc-rel rel__value__flow__2__1__2 #t #f)) 90 | ((copy rel__value__flow__2__1__2 rel__assign__2__1__2 FULL (0 0)))) 91 | (scc-decl 92 | scc7 93 | 7 94 | #t 95 | ((scc-rel rel__value__alias__2__1__2 #t #f) 96 | (scc-rel rel__value__alias__2__1 #t #f) 97 | (scc-rel rel__value__flow__2__1 #t #f) 98 | (scc-rel rel__value__flow__2__1__2 #t #f) 99 | (scc-rel rel___dollorrule7__inter__body__2__2 #t #t) 100 | (scc-rel rel__value__flow__2__2 #t #f) 101 | (scc-rel rel__assign__2__2 #f #f) 102 | (scc-rel rel__memory__alias__2__1 #t #f) 103 | (scc-rel rel__memory__alias__2__2 #t #f) 104 | (scc-rel rel___dollorrule7__inter__body__2__1__2 #t #t) 105 | (scc-rel rel___dollorrule10__inter__body__2__1__2 #t #t) 106 | (scc-rel rel__dereference__2__1 #f #f) 107 | (scc-rel rel__memory__alias__2__1__2 #t #f) 108 | (scc-rel rel___dollorrule10__inter__body__2__2 #t #t)) 109 | ((join 110 | rel__value__flow__2__1__2 111 | rel__value__flow__2__1 112 | DELTA 113 | rel__value__flow__2__2 114 | DELTA 115 | (4 2)) 116 | (join 117 | rel___dollorrule7__inter__body__2__1__2 118 | rel__memory__alias__2__2 119 | DELTA 120 | rel__value__flow__2__1 121 | DELTA 122 | (4 2)) 123 | (join 124 | rel__value__alias__2__1__2 125 | rel__value__flow__2__1 126 | DELTA 127 | rel__value__flow__2__1 128 | FULL 129 | (2 4)) 130 | (acopy 131 | rel___dollorrule10__inter__body__2__2 132 | rel___dollorrule10__inter__body__2__1__2 133 | DELTA 134 | (1 2 0)) 135 | (join 136 | rel__value__alias__2__1__2 137 | rel__value__flow__2__1 138 | FULL 139 | rel__value__flow__2__1 140 | DELTA 141 | (2 4)) 142 | (acopy rel__value__flow__2__2 rel__value__flow__2__1__2 DELTA (1 2 0)) 143 | (acopy rel__value__flow__2__1 rel__value__flow__2__1__2 DELTA (0 2 1)) 144 | (acopy rel__memory__alias__2__1 rel__memory__alias__2__1__2 DELTA (0 2 1)) 145 | (join 146 | rel__value__flow__2__1__2 147 | rel__value__flow__2__1 148 | DELTA 149 | rel__value__flow__2__2 150 | FULL 151 | (4 2)) 152 | (join 153 | rel__value__alias__2__1__2 154 | rel___dollorrule7__inter__body__2__2 155 | DELTA 156 | rel__value__flow__2__1 157 | DELTA 158 | (4 2)) 159 | (join 160 | rel__value__alias__2__1__2 161 | rel___dollorrule7__inter__body__2__2 162 | FULL 163 | rel__value__flow__2__1 164 | DELTA 165 | (4 2)) 166 | (join 167 | rel___dollorrule7__inter__body__2__1__2 168 | rel__memory__alias__2__2 169 | DELTA 170 | rel__value__flow__2__1 171 | FULL 
172 | (4 2)) 173 | (join 174 | rel___dollorrule7__inter__body__2__1__2 175 | rel__memory__alias__2__2 176 | FULL 177 | rel__value__flow__2__1 178 | DELTA 179 | (4 2)) 180 | (acopy rel__memory__alias__2__2 rel__memory__alias__2__1__2 DELTA (1 2 0)) 181 | (acopy rel__value__alias__2__1 rel__value__alias__2__1__2 DELTA (0 2 1)) 182 | (join 183 | rel__value__alias__2__1__2 184 | rel___dollorrule7__inter__body__2__2 185 | DELTA 186 | rel__value__flow__2__1 187 | FULL 188 | (4 2)) 189 | (join 190 | rel__value__flow__2__1__2 191 | rel__value__flow__2__1 192 | FULL 193 | rel__value__flow__2__2 194 | DELTA 195 | (4 2)) 196 | (join 197 | rel___dollorrule10__inter__body__2__1__2 198 | rel__value__alias__2__1 199 | DELTA 200 | rel__dereference__2__1 201 | FULL 202 | (4 2)) 203 | (acopy 204 | rel___dollorrule7__inter__body__2__2 205 | rel___dollorrule7__inter__body__2__1__2 206 | DELTA 207 | (1 2 0)) 208 | (join 209 | rel__value__flow__2__1__2 210 | rel__assign__2__2 211 | FULL 212 | rel__memory__alias__2__1 213 | DELTA 214 | (2 4)) 215 | (join 216 | rel__value__alias__2__1__2 217 | rel__value__flow__2__1 218 | DELTA 219 | rel__value__flow__2__1 220 | DELTA 221 | (2 4)) 222 | (join 223 | rel__memory__alias__2__1__2 224 | rel__dereference__2__1 225 | FULL 226 | rel___dollorrule10__inter__body__2__2 227 | DELTA 228 | (4 2))))) 229 | ((0 7) (1 7) (2 7) (3 7) (4 7) (5 7) (6 7))) 230 | -------------------------------------------------------------------------------- /test/souffle/path_length.dl: -------------------------------------------------------------------------------- 1 | 2 | .decl edge(from:number, to:number) 3 | .input edge(IO=file, filename="../../data/data_3.txt", deliminator="\t") 4 | 5 | 6 | .decl path(from:number, to:number, l: number) choice-domain (from, to) 7 | .output path 8 | 9 | path(from, to, 1) :- edge(from, to). 10 | path(from, to, l+1) :- edge(from, mid), path(mid, to, l). 11 | 12 | .printsize path 13 | -------------------------------------------------------------------------------- /test/souffle/sg.dl: -------------------------------------------------------------------------------- 1 | 2 | .decl edge(x: number, y: number) 3 | .input edge(IO=file, filename="../../data/data_39994.txt", deliminator="\t") 4 | 5 | .decl sg(x: number, y: number) 6 | // .decl sg_init(x: number, y: number) 7 | // sg_init(x, y) :- edge(p, x), edge(p, y), x != y. 8 | 9 | 10 | sg(x, y) :- edge(p, x), edge(p, y), x != y. 11 | sg(x, y) :- edge(a, x), sg(a, b), edge(b, y). 12 | 13 | .printsize sg 14 | // .printsize sg_init 15 | -------------------------------------------------------------------------------- /test/souffle/spanning.dl: -------------------------------------------------------------------------------- 1 | .decl edge(v: number, u:number) 2 | .input edge(IO=file, filename="../../data/data_39994.txt", deliminator="\t") 3 | 4 | .decl start_node(v: number) 5 | // start_node(1). 6 | start_node(10). 7 | // start_node(32). 8 | // start_node(45). 9 | // start_node(56). 10 | // start_node(886). 11 | // start_node(9851). 12 | // start_node(5682). 13 | // start_node(3301). 14 | // start_node(11234). 15 | // start_node(v) :- edge(v,_). 16 | 17 | .decl st(v:number, u:number) choice-domain u 18 | st(99999999, v) :- start_node(v). 19 | st(v,u) :- st(_, v), edge(v,u). 
20 | .output st 21 | -------------------------------------------------------------------------------- /test/souffle/tc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harp-lab/gdlog/65a6ee960ced8d04bc725ccdfa68f004f8479226/test/souffle/tc -------------------------------------------------------------------------------- /test/souffle/tc.dl: -------------------------------------------------------------------------------- 1 | 2 | .decl edge(from:number, to:number) 3 | .input edge(IO=file, filename="../../../dataset/vsp_finan512_scagr7-2c_rlfddd/vsp_finan512_scagr7-2c_rlfddd.mtx", deliminator="\t") 4 | 5 | 6 | 7 | .decl path(from:number, to:number) 8 | 9 | path(from, to) :- edge(from, to). 10 | path(from, to) :- path(from, mid), path(mid, to). 11 | 12 | .printsize path 13 | 14 | // .decl path1_join(from:number, to:number) 15 | // .output path1_join 16 | // path1_join(from, to) :- edge(from, mid), edge(mid, to). 17 | // .printsize path1_join 18 | 19 | // .decl path1(from:number, to:number) 20 | // .output path1 21 | // path1(from, to) :- edge(from, to). 22 | // path1(from, to) :- edge(from, mid), edge(mid, to). 23 | // .printsize path1 24 | 25 | // .decl path2(from:number, to:number) 26 | // // path2(from, to) :- path1(from, to). 27 | // path2(from, to) :- edge(from, mid), path1(mid, to). 28 | // .printsize path2 29 | // .output path2 30 | 31 | // .decl path3(from:number, to:number) 32 | // path3(from, to) :- path2(from, to). 33 | // path3(from, to) :- edge(from, mid), path2(mid, to). 34 | // .printsize path3 35 | 36 | // .decl path3_join(from:number, to:number) 37 | // path3_join(from, to) :- edge(from, mid), path2(mid, to). 38 | // .printsize path3_join 39 | // .output path3_join 40 | -------------------------------------------------------------------------------- /test/tc.cu: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <fstream> 3 | #include <chrono> 4 | #include <cstdio> 5 | #include <cstdlib> 6 | #include <cstring> 7 | #include <locale> 8 | #include <string> 9 | 10 | #include "../include/exception.cuh" 11 | #include "../include/lie.cuh" 12 | #include "../include/timer.cuh" 13 | #include "../include/print.cuh" 14 | 15 | ////////////////////////////////////////////////////// 16 | 17 | long int get_row_size(const char *data_path) { 18 | std::ifstream f; 19 | f.open(data_path); 20 | char c; 21 | long i = 0; 22 | while (f.get(c)) 23 | if (c == '\n') 24 | ++i; 25 | f.close(); 26 | return i; 27 | } 28 | 29 | enum ColumnT{ U64, U32}; 30 | 31 | column_type *get_relation_from_file(const char *file_path, int total_rows, 32 | int total_columns, char separator, 33 | ColumnT ct) { 34 | column_type *data = 35 | (column_type *)malloc(total_rows * total_columns * sizeof(column_type)); 36 | FILE *data_file = fopen(file_path, "r"); 37 | for (int i = 0; i < total_rows; i++) { 38 | for (int j = 0; j < total_columns; j++) { 39 | if (j != (total_columns - 1)) { 40 | if (ct == U64){ 41 | fscanf(data_file, "%lld%c", &data[(i * total_columns) + j], 42 | &separator); 43 | } else { 44 | fscanf(data_file, "%ld%c", &data[(i * total_columns) + j], 45 | &separator); 46 | } 47 | } else { 48 | if (ct == U64) { 49 | fscanf(data_file, "%lld", &data[(i * total_columns) + j]); 50 | } else { 51 | fscanf(data_file, "%ld", &data[(i * total_columns) + j]); 52 | } 53 | } 54 | } 55 | } 56 | return data; 57 | } 58 | 59 | ////////////////////////////////////////////////////////////////// 60 | 61 | __device__ void reorder_path(tuple_type inner, tuple_type outer, 62 | 
tuple_type newt) { 63 | newt[0] = inner[1]; 64 | newt[1] = outer[1]; 65 | }; 66 | __device__ void reorder_path1(tuple_type inner, tuple_type outer, 67 | tuple_type newt) { 68 | newt[0] = outer[1]; 69 | newt[1] = inner[1]; 70 | }; 71 | __device__ tuple_generator_hook reorder_path_device = reorder_path; 72 | __device__ tuple_generator_hook reorder_path1_device = reorder_path1; 73 | 74 | __device__ void cp_1(tuple_type src, tuple_type dest) { 75 | dest[0] = src[1]; 76 | dest[1] = src[0]; 77 | } 78 | __device__ tuple_copy_hook cp_1_device = cp_1; 79 | 80 | void analysis_bench(const char *dataset_path, int block_size, int grid_size, bool fully_disable_buffer = false) { 81 | KernelTimer timer; 82 | int relation_columns = 2; 83 | std::chrono::high_resolution_clock::time_point time_point_begin; 84 | std::chrono::high_resolution_clock::time_point time_point_end; 85 | time_point_begin = std::chrono::high_resolution_clock::now(); 86 | double spent_time; 87 | 88 | // load the raw graph 89 | tuple_size_t graph_edge_counts = get_row_size(dataset_path); 90 | std::cout << "Input graph rows: " << graph_edge_counts << std::endl; 91 | // u64 graph_edge_counts = 2100; 92 | column_type *raw_graph_data = 93 | get_relation_from_file(dataset_path, graph_edge_counts, 2, '\t', U32); 94 | column_type *raw_reverse_graph_data = 95 | (column_type *)malloc(graph_edge_counts * 2 * sizeof(column_type)); 96 | 97 | // std::cout << "reversing graph ... " << std::endl; 98 | for (tuple_size_t i = 0; i < graph_edge_counts; i++) { 99 | raw_reverse_graph_data[i * 2 + 1] = raw_graph_data[i * 2]; 100 | raw_reverse_graph_data[i * 2] = raw_graph_data[i * 2 + 1]; 101 | } 102 | // std::cout << "finish reverse graph." << std::endl; 103 | 104 | timer.start_timer(); 105 | Relation *edge_2__2_1 = new Relation(); 106 | edge_2__2_1->fully_disable_merge_buffer_flag = fully_disable_buffer; 107 | // cudaMallocHost((void **)&edge_2__2_1, sizeof(Relation)); 108 | Relation *path_2__1_2 = new Relation(); 109 | path_2__1_2->fully_disable_merge_buffer_flag = fully_disable_buffer; 110 | path_2__1_2->index_flag = false; 111 | // cudaMallocHost((void **)&path_2__1_2, sizeof(Relation)); 112 | std::cout << "edge size " << graph_edge_counts << std::endl; 113 | load_relation(path_2__1_2, "path_2__1_2", 2, raw_graph_data, 114 | graph_edge_counts, 1, 0, grid_size, block_size); 115 | load_relation(edge_2__2_1, "edge_2__2_1", 2, raw_reverse_graph_data, 116 | graph_edge_counts, 1, 0, grid_size, block_size); 117 | timer.stop_timer(); 118 | // // double kernel_spent_time = timer.get_spent_time(); 119 | // std::cout << "Build hash table time: " << timer.get_spent_time() 120 | // << std::endl; 121 | 122 | timer.start_timer(); 123 | LIE tc_scc(grid_size, block_size); 124 | tc_scc.reload_full_flag = false; 125 | tc_scc.add_relations(edge_2__2_1, true); 126 | tc_scc.add_relations(path_2__1_2, false); 127 | float join_detail[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; 128 | tuple_generator_hook reorder_path_host; 129 | cudaMemcpyFromSymbol(&reorder_path_host, reorder_path_device, 130 | sizeof(tuple_generator_hook)); 131 | tuple_copy_hook cp_1_host; 132 | cudaMemcpyFromSymbol(&cp_1_host, cp_1_device, sizeof(tuple_copy_hook)); 133 | tc_scc.add_ra(RelationalJoin(edge_2__2_1, FULL, path_2__1_2, DELTA, 134 | path_2__1_2, reorder_path_host, nullptr, 135 | LEFT, grid_size, block_size, join_detail)); 136 | 137 | tc_scc.fixpoint_loop(); 138 | 139 | timer.stop_timer(); 140 | std::cout << "Path counts " << path_2__1_2->full->tuple_counts << std::endl; 141 | // 
print_tuple_rows(path_2__2_1->full, "full"); 142 | std::cout << "TC time: " << timer.get_spent_time() << std::endl; 143 | // std::cout << "join detail: " << std::endl; 144 | // std::cout << "compute size time: " << join_detail[0] << std::endl; 145 | // std::cout << "reduce + scan time: " << join_detail[1] << std::endl; 146 | // std::cout << "fetch result time: " << join_detail[2] << std::endl; 147 | // std::cout << "sort time: " << join_detail[3] << std::endl; 148 | // std::cout << "build index time: " << join_detail[5] << std::endl; 149 | // std::cout << "merge time: " << join_detail[6] << std::endl; 150 | // std::cout << "unique time: " << join_detail[4] + join_detail[7] << std::endl; 151 | } 152 | 153 | void analysis_bench2(const char *dataset_path, int block_size, int grid_size) { 154 | KernelTimer timer; 155 | int relation_columns = 2; 156 | std::chrono::high_resolution_clock::time_point time_point_begin; 157 | std::chrono::high_resolution_clock::time_point time_point_end; 158 | time_point_begin = std::chrono::high_resolution_clock::now(); 159 | double spent_time; 160 | 161 | // load the raw graph 162 | tuple_size_t graph_edge_counts = get_row_size(dataset_path); 163 | std::cout << "Input graph rows: " << graph_edge_counts << std::endl; 164 | // u64 graph_edge_counts = 2100; 165 | column_type *raw_graph_data = 166 | get_relation_from_file(dataset_path, graph_edge_counts, 2, '\t', U32); 167 | column_type *raw_reverse_graph_data = 168 | (column_type *)malloc(graph_edge_counts * 2 * sizeof(column_type)); 169 | 170 | // std::cout << "reversing graph ... " << std::endl; 171 | for (tuple_size_t i = 0; i < graph_edge_counts; i++) { 172 | raw_reverse_graph_data[i * 2 + 1] = raw_graph_data[i * 2]; 173 | raw_reverse_graph_data[i * 2] = raw_graph_data[i * 2 + 1]; 174 | } 175 | // std::cout << "finish reverse graph." 
<< std::endl; 176 | 177 | timer.start_timer(); 178 | Relation *path_2__1_2 = new Relation(); 179 | // cudaMallocHost((void **)&path_2__1_2, sizeof(Relation)); 180 | Relation *path_2__2_1 = new Relation(); 181 | // cudaMallocHost((void **)&path_2__2_1, sizeof(Relation)); 182 | // std::cout << "edge size " << graph_edge_counts << std::endl; 183 | load_relation(path_2__1_2, "path_2__1_2", 2, raw_graph_data, 184 | graph_edge_counts, 1, 0, grid_size, block_size); 185 | load_relation(path_2__2_1, "path_2__2_1", 2, nullptr, 0, 1, 0, grid_size, 186 | block_size); 187 | timer.stop_timer(); 188 | // double kernel_spent_time = timer.get_spent_time(); 189 | // std::cout << "Build hash table time: " << timer.get_spent_time() 190 | // << std::endl; 191 | 192 | timer.start_timer(); 193 | LIE tc_scc(grid_size, block_size); 194 | tc_scc.add_relations(path_2__2_1, false); 195 | tc_scc.add_relations(path_2__1_2, false); 196 | float join_time[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // sized like join_detail above; the join records several timing entries 197 | tuple_generator_hook reorder_path_host; 198 | cudaMemcpyFromSymbol(&reorder_path_host, reorder_path1_device, 199 | sizeof(tuple_generator_hook)); 200 | tuple_copy_hook cp_1_host; 201 | cudaMemcpyFromSymbol(&cp_1_host, cp_1_device, sizeof(tuple_copy_hook)); 202 | tc_scc.add_ra(RelationalACopy(path_2__1_2, path_2__2_1, cp_1_host, nullptr, 203 | grid_size, block_size)); 204 | tc_scc.add_ra(RelationalJoin(path_2__1_2, FULL, path_2__2_1, DELTA, 205 | path_2__1_2, reorder_path_host, nullptr, LEFT, 206 | grid_size, block_size, join_time)); 207 | 208 | tc_scc.fixpoint_loop(); 209 | 210 | timer.stop_timer(); 211 | std::cout << "Path counts " << path_2__1_2->full->tuple_counts << std::endl; 212 | // print_tuple_rows(path_2__2_1->full, "full"); 213 | std::cout << "TC time: " << timer.get_spent_time() << std::endl; 214 | } 215 | 216 | int main(int argc, char *argv[]) { 217 | int device_id; 218 | int number_of_sm; 219 | cudaGetDevice(&device_id); 220 | cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 221 | device_id); 222 | // std::cout << "num of sm " << number_of_sm << std::endl; 223 | // std::cout << "using " << EMPTY_HASH_ENTRY << " as empty hash entry" 224 | // << std::endl; 225 | int block_size, grid_size; 226 | block_size = 512; 227 | grid_size = 32 * number_of_sm; 228 | std::locale loc(""); 229 | if (strcmp(argv[2], "1") == 0) 230 | analysis_bench(argv[1], block_size, grid_size, true); 231 | else 232 | analysis_bench(argv[1], block_size, grid_size, false); 233 | 234 | return 0; 235 | } 236 | --------------------------------------------------------------------------------