├── .gitignore
├── .gitmodules
├── 1_download_benchmark.sh
├── 2_build_docker.sh
├── 3_launch_docker.sh
├── 4_build_all.sh
├── 5_run_all.sh
├── 6_gen_all.sh
├── LICENSE
├── README.md
├── clean.sh
├── code
    ├── CMakeLists.txt
    ├── include
    │   ├── clara
    │   │   ├── clara.hpp
    │   │   └── clara_textflow.hpp
    │   ├── commons
    │   │   ├── NFA.h
    │   │   ├── NFALoader.h
    │   │   ├── SymbolStream.h
    │   │   ├── common_func.h
    │   │   ├── compatible_group_helper.h
    │   │   ├── device_intrinsics.h
    │   │   ├── graph.h
    │   │   ├── graph_helper.h
    │   │   ├── group_graph.h
    │   │   ├── my_bitset.h
    │   │   ├── nfa_utils.h
    │   │   ├── node.h
    │   │   ├── precompute_table.h
    │   │   ├── report_formatter.h
    │   │   ├── validate.h
    │   │   └── vasim_helper.h
    │   ├── gpunfautils
    │   │   ├── abstract_gpunfa.h
    │   │   ├── array2.h
    │   │   ├── common.h
    │   │   └── utils.h
    │   ├── moderngpu
    │   │   ├── context.hxx
    │   │   ├── cpp11.hxx
    │   │   ├── cta_load_balance.hxx
    │   │   ├── cta_merge.hxx
    │   │   ├── cta_mergesort.hxx
    │   │   ├── cta_reduce.hxx
    │   │   ├── cta_scan.hxx
    │   │   ├── cta_search.hxx
    │   │   ├── cta_segscan.hxx
    │   │   ├── cta_segsort.hxx
    │   │   ├── intrinsics.hxx
    │   │   ├── kernel_bulkinsert.hxx
    │   │   ├── kernel_bulkremove.hxx
    │   │   ├── kernel_compact.hxx
    │   │   ├── kernel_intervalmove.hxx
    │   │   ├── kernel_join.hxx
    │   │   ├── kernel_load_balance.hxx
    │   │   ├── kernel_merge.hxx
    │   │   ├── kernel_mergesort.hxx
    │   │   ├── kernel_reduce.hxx
    │   │   ├── kernel_scan.hxx
    │   │   ├── kernel_segreduce.hxx
    │   │   ├── kernel_segsort.hxx
    │   │   ├── kernel_sortedsearch.hxx
    │   │   ├── kernel_workcreate.hxx
    │   │   ├── launch_box.hxx
    │   │   ├── launch_params.hxx
    │   │   ├── loadstore.hxx
    │   │   ├── memory.hxx
    │   │   ├── meta.hxx
    │   │   ├── operators.hxx
    │   │   ├── search.hxx
    │   │   ├── sort_networks.hxx
    │   │   ├── transform.hxx
    │   │   ├── tuple.hxx
    │   │   ├── types.hxx
    │   │   └── util.hxx
    │   └── pugixml
    │   │   ├── pugiconfig.hpp
    │   │   ├── pugixml.cpp
    │   │   └── pugixml.hpp
    ├── scripts
    │   ├── collect_keyword_list_throughput.txt
    │   ├── collect_results.py
    │   ├── configs
    │   │   ├── app_spec_ngap_new
    │   │   ├── app_spec_ngap_new_quickvalidation
    │   │   ├── app_spec_ngap_new_quickvalidation_part1
    │   │   ├── app_spec_ngap_new_quickvalidation_part2
    │   │   ├── app_spec_ngap_new_quickvalidation_part3
    │   │   ├── exec_config_ngap_groups_best
    │   │   ├── exec_config_ngap_groups_best_4degree
    │   │   ├── exec_config_ngap_groups_best_4degree_oneinput
    │   │   ├── exec_config_ngap_groups_best_oneinput
    │   │   ├── exec_config_ngap_groups_design_NAP
    │   │   ├── exec_config_ngap_groups_design_NAP_4degree
    │   │   ├── exec_config_ngap_groups_design_cpu
    │   │   ├── exec_config_ngap_groups_design_cpu_oneinput
    │   │   ├── exec_config_ngap_groups_design_sota
    │   │   ├── exec_config_ngap_groups_design_sota_4degree
    │   │   ├── exec_config_ngap_groups_design_sota_4degree_oneinput
    │   │   ├── exec_config_ngap_groups_design_sota_oneinput
    │   │   ├── exec_config_ngap_groups_design_sota_runahead
    │   │   ├── exec_config_ngap_groups_design_sota_runahead_4degree
    │   │   ├── exec_config_ngap_groups_design_sota_runahead_4degree_oneinput
    │   │   ├── exec_config_ngap_groups_design_sota_runahead_oneinput
    │   │   ├── exec_config_ngap_groups_nap_default
    │   │   ├── exec_config_ngap_groups_nap_default_4degree
    │   │   ├── exec_config_ngap_groups_nap_default_4degree_oneinput
    │   │   └── exec_config_ngap_groups_nap_default_oneinput
    │   ├── launch_exps.py
    │   └── llcommons.py
    └── src
    │   ├── asyncap
    │       ├── CMakeLists.txt
    │       ├── Makefile
    │       ├── include
    │       │   ├── run_ahead_approach.h
    │       │   ├── run_ahead_kernels.h
    │       │   └── scan_kernels.h
    │       └── src
    │       │   ├── main.cu
    │       │   └── run_ahead_approach.cu
    │   ├── commons
    │       ├── NFA.cpp
    │       ├── NFALoader.cpp
    │       ├── SymbolStream.cpp
    │       ├── common_func.cpp
    │       ├── compatible_group_helper.cpp
    │       ├── graph.cu
    │       ├── graph_helper.cpp
    │       ├── nfa_utils.cpp
    │       ├── node.cpp
    │       ├── precompute_table.cu
    │       ├── report_formatter.cpp
    │       ├── validate.cpp
    │       └── vasim_helper.cpp
    │   ├── gpunfautils
    │       ├── abstract_gpunfa.cu
    │       ├── common.cpp
    │       └── utils.cu
    │   ├── infant
    │       ├── device_funcs.h
    │       ├── infant.cu
    │       ├── infant.h
    │       ├── infant_config.h
    │       ├── infant_kernels.cu
    │       ├── infant_kernels.h
    │       └── main.cu
    │   ├── ngap
    │       ├── kernel.h
    │       ├── kernel_bap.cu
    │       ├── kernel_helper.h
    │       ├── kernel_ngap_O0.cu
    │       ├── kernel_ngap_O1.cu
    │       ├── kernel_ngap_O3.cu
    │       ├── kernel_ngap_O4.cu
    │       ├── kernel_ngap_OA.cu
    │       ├── main.cu
    │       ├── ngap.cu
    │       ├── ngap.h
    │       ├── ngap_buffer.cu
    │       ├── ngap_buffer.h
    │       └── ngap_option.h
    │   ├── obat
    │       ├── Makefile
    │       ├── main.cu
    │       ├── one_byte_a_time_kernels.h
    │       ├── one_byte_at_a_time.cu
    │       ├── one_byte_at_a_time.h
    │       └── option_config.h
    │   └── ppopp12
    │       ├── main.cu
    │       ├── ppopp12.cu
    │       ├── ppopp12.h
    │       ├── ppopp12_kernels.cu
    │       ├── ppopp12_kernels.h
    │       └── ppopp12_option.h
├── docker
    └── Dockerfile
├── env.sh
├── ref_results
    ├── fig13_throughput.pdf
    ├── fig14_breakdown.pdf
    ├── fig20_latency.pdf
    ├── raw
    │   ├── throughput_cpu
    │   │   ├── throughput_cpu_part1.csv
    │   │   └── throughput_cpu_part2.csv
    │   ├── throughput_cpu_oneinput
    │   │   ├── throughput_cpu_part1.csv
    │   │   └── throughput_cpu_part2.csv
    │   ├── throughput_gpu_nap_best
    │   │   ├── throughput_gpu_napbest_part1.csv
    │   │   ├── throughput_gpu_napbest_part2.csv
    │   │   └── throughput_gpu_napbest_part3.csv
    │   ├── throughput_gpu_nap_best_oneinput
    │   │   ├── throughput_gpu_napbest_oneinput_part1.csv
    │   │   ├── throughput_gpu_napbest_oneinput_part2.csv
    │   │   └── throughput_gpu_napbest_oneinput_part3.csv
    │   ├── throughput_gpu_nap_breakdown
    │   │   ├── throughput_nap_breakdown_part1.csv
    │   │   ├── throughput_nap_breakdown_part2.csv
    │   │   └── throughput_nap_breakdown_part3.csv
    │   ├── throughput_gpu_nap_default_adp
    │   │   ├── throughput_gpu_nap_default_part1.csv
    │   │   ├── throughput_gpu_nap_default_part2.csv
    │   │   └── throughput_gpu_nap_default_part3.csv
    │   ├── throughput_gpu_nap_default_adp_oneinput
    │   │   ├── throughput_gpu_nap_default_oneinput_part1.csv
    │   │   ├── throughput_gpu_nap_default_oneinput_part2.csv
    │   │   └── throughput_gpu_nap_default_oneinput_part3.csv
    │   ├── throughput_gpu_runahead
    │   │   ├── throughput_gpu_runahead_part1.csv
    │   │   ├── throughput_gpu_runahead_part2.csv
    │   │   └── throughput_gpu_runahead_part3.csv
    │   ├── throughput_gpu_runahead_oneinput
    │   │   ├── throughput_gpu_runahead_oneinput_part1.csv
    │   │   ├── throughput_gpu_runahead_oneinput_part2.csv
    │   │   └── throughput_gpu_runahead_oneinput_part3.csv
    │   ├── throughput_gpu_sota_best
    │   │   ├── throughput_gpu_sota_part1.csv
    │   │   └── throughput_gpu_sota_part3.csv
    │   ├── throughput_gpu_sota_best_oneinput
    │   │   ├── throughput_gpu_sota_oneinput_part1.csv
    │   │   └── throughput_gpu_sota_oneinput_part3.csv
    │   └── v100
    │   │   ├── throughput_gpu_nap_best
    │   │       ├── throughput_gpu_napbest_part1.csv
    │   │       ├── throughput_gpu_napbest_part2.csv
    │   │       └── throughput_gpu_napbest_part3.csv
    │   │   ├── throughput_gpu_nap_default_adp
    │   │       ├── throughput_gpu_nap_default_part1.csv
    │   │       ├── throughput_gpu_nap_default_part2.csv
    │   │       └── throughput_gpu_nap_default_part3.csv
    │   │   ├── throughput_gpu_runahead
    │   │       ├── throughput_gpu_runahead_part1.csv
    │   │       ├── throughput_gpu_runahead_part2.csv
    │   │       └── throughput_gpu_runahead_part3.csv
    │   │   └── throughput_gpu_sota_best
    │   │       ├── throughput_gpu_sota_part1.csv
    │   │       └── throughput_gpu_sota_part3.csv
    ├── tab4_throughput.csv
    └── tab6_latency.csv
├── scripts
    ├── dict_config.py
    ├── gen-breakdown-fig14.sh
    ├── gen-latency-fig20tab6.sh
    ├── gen-throughput-fig13tab4.sh
    ├── plot_throughput_gpu_nap_breakdown.py
    ├── plot_throughput_gpu_sota.py
    ├── plot_throughput_gpu_sota_oneinput.py
    ├── run-breakdown.sh
    ├── run-latency.sh
    ├── run-throughput.sh
    ├── run_experiments.sh
    ├── run_throughput.sh
    ├── run_throughput_NAP_breakdown.sh
    ├── run_throughput_cpu.sh
    ├── run_throughput_cpu_oneinput.sh
    ├── run_throughput_gpu_nap_best.sh
    ├── run_throughput_gpu_nap_best_oneinput.sh
    ├── run_throughput_gpu_nap_defalut.sh
    ├── run_throughput_gpu_nap_defalut_oneinput.sh
    ├── run_throughput_gpu_sota_best.sh
    ├── run_throughput_gpu_sota_best_oneinput.sh
    ├── run_throughput_runahead.sh
    ├── run_throughput_runahead_oneinput.sh
    ├── table_throughput.py
    └── table_throughput_oneinput.py
└── small_dataset
    ├── apple.anml
    └── inputstream.txt


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.o
 2 | *.pyc
 3 | code/build
 4 | code/log
 5 | code/results
 6 | .vscode
 7 | raw_results/*
 8 | results/*
 9 | exp_table3/*
10 | characterize/*
11 | froniter_length*/*
12 | execution_path*/*
13 | scripts/__pycache__/*
14 | automata_benchmark_original/*
15 | automata_benchmark*.tar.gz
16 | code/src/asyncap/bin/*
17 | # The tarball glob above also covers automata_benchmark_original.tar.gz, the name 1_download_benchmark.sh writes.


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "hscompile"]
2 | 	path = hscompile
3 | 	url = https://github.com/getianao/hscompile.git
4 | 


--------------------------------------------------------------------------------
/1_download_benchmark.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e  # stop on a failed cd/wget instead of unpacking a missing or partial archive
3 | cd "${NGAP_ROOT:?NGAP_ROOT must point to the repository root}"  # an unset variable would otherwise make cd go to $HOME
4 | wget https://hkustgz-my.sharepoint.com/:u:/g/personal/tge601_connect_hkust-gz_edu_cn/EbRBcgYV7Z1KrGLk56PjswsBAmdDwfen2zdXTknP5owEAg\?e\=5bWc4W\&download=1 -O automata_benchmark_original.tar.gz
5 | tar -zxvf automata_benchmark_original.tar.gz


--------------------------------------------------------------------------------
/2_build_docker.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Build the ngap-ae image from the docker/ directory under NGAP_ROOT; fail early when NGAP_ROOT is unset.
3 | docker build -t ngap-ae "${NGAP_ROOT:?NGAP_ROOT must point to the repository root}/docker"


--------------------------------------------------------------------------------
/3_launch_docker.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Start an interactive, auto-removed GPU container with NGAP_ROOT bind-mounted at /ngAP; fail early when NGAP_ROOT is unset.
3 | docker run -it --rm --gpus all -v "${NGAP_ROOT:?NGAP_ROOT must point to the repository root}:/ngAP" ngap-ae:latest /bin/bash


--------------------------------------------------------------------------------
/4_build_all.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | : "${NGAP_ROOT:?NGAP_ROOT must point to the repository root}"  # every path below is built from it
 3 | # GPU Schemes
 4 | cd "${NGAP_ROOT}/code" && mkdir -p build && cd build || exit 1   # never run cmake/make from the wrong directory
 5 | cmake -DCMAKE_BUILD_TYPE=Release ..
 6 | make -j"$(nproc)"                            # bounded parallelism; a bare -j spawns an unlimited number of jobs
 7 | 
 8 | # CPU Schemes
 9 | cd "${NGAP_ROOT}/hscompile/lib/hyperscan" && mkdir -p build && cd build || exit 1
10 | cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc ..
11 | make -j"$(nproc)"
12 | cd "${NGAP_ROOT}/hscompile/lib/mnrl/C++" || exit 1
13 | sed -i 's/CC = .*/CC = g++-5/g' Makefile     # requires GCC-5.
14 | make || make                                 # the first attempt may fail; retry once automatically
15 | cd "${NGAP_ROOT}/hscompile" && mkdir -p build && cd build || exit 1
16 | cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc       \
17 |     -DHS_SOURCE_DIR="${NGAP_ROOT}/hscompile/lib/hyperscan"  \
18 |     -DMNRL_SOURCE_DIR="${NGAP_ROOT}/hscompile/lib/mnrl/C++" \
19 |     ..
20 | make -j"$(nproc)"


--------------------------------------------------------------------------------
/5_run_all.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | cd "${NGAP_ROOT:-.}" || exit 1      # ./scripts/... is relative to the repository root; stay in the current directory when NGAP_ROOT is unset
3 | time ./scripts/run-throughput.sh    # 10 hrs
4 | time ./scripts/run-breakdown.sh     # 5 hrs
5 | time ./scripts/run-latency.sh       # 1 hr


--------------------------------------------------------------------------------
/6_gen_all.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | cd "${NGAP_ROOT:-.}" || exit 1      # ./scripts/... is relative to the repository root; stay in the current directory when NGAP_ROOT is unset
3 | ./scripts/gen-throughput-fig13tab4.sh
4 | ./scripts/gen-breakdown-fig14.sh
5 | ./scripts/gen-latency-fig20tab6.sh


--------------------------------------------------------------------------------
/clean.sh:
--------------------------------------------------------------------------------
1 | # Docker
2 | 
3 | # docker ps -a
4 | # docker image ls
5 | # docker stop <container-id>   # stop/rm take a container ID or name (see 'docker ps -a'), not an image tag
6 | # docker rm <container-id>
7 | # docker rmi ngap-ae
8 | 


--------------------------------------------------------------------------------
/code/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
  2 | project(GPUNFA2019 LANGUAGES C CXX CUDA)
  3 | # project() is called first so that languages are enabled after project setup;
  4 | # C stays enabled because CMAKE_C_FLAGS is extended below.
  5 | find_package(CUDA REQUIRED)
  6 | find_package(OpenMP REQUIRED)
  7 | find_package(TBB REQUIRED COMPONENTS tbb)
  8 | if (OPENMP_FOUND)
  9 |     set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
 10 |     set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
 11 |     set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
 12 | endif()
 13 | 
 14 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 15 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 16 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 17 | 
 18 | set(CMAKE_CXX_STANDARD 17)
 19 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
 20 | set(CMAKE_CXX_EXTENSIONS OFF)
 21 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++17 -lineinfo --expt-extended-lambda -O3 -Wno-deprecated-gpu-targets -arch=sm_86 --keep")
 22 | # set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++17 -lineinfo --expt-extended-lambda -O3 -Wno-deprecated-gpu-targets -arch=sm_70 --keep")
 23 | # set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++17 -lineinfo --expt-extended-lambda -g -G -Wno-deprecated-gpu-targets -arch=sm_86")
 24 | # -arch=sm_86
 25 | 
 26 | if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
 27 |   set(CMAKE_CUDA_ARCHITECTURES 86)
 28 | endif()
 29 | 
 30 | add_definitions(-DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT)
 31 | 
 32 | include_directories(.)
 33 | include_directories(include)
 34 | include_directories(${CUDA_INCLUDE_DIRS})
 35 | 
 36 | file(GLOB_RECURSE SOURCES RELATIVE ${CMAKE_SOURCE_DIR} "src/commons/*")
 37 | add_library(gpunfacommons ${SOURCES})
 38 | target_link_libraries(gpunfacommons tbb)
 39 | target_include_directories(gpunfacommons PRIVATE include/commons include/pugixml )
 40 | 
 41 | 
 42 | file(GLOB_RECURSE SOURCES1 RELATIVE ${CMAKE_SOURCE_DIR} "src/gpunfautils/*")
 43 | cuda_add_library(gpunfautils ${SOURCES1})
 44 | target_include_directories(gpunfautils PRIVATE include/commons include/gpunfautils)
 45 | set_property(TARGET gpunfautils PROPERTY CUDA_STANDARD 17)
 46 | 
 47 | 
 48 | # CUDA Projects
 49 | # obat
 50 | SET(PROJ "obat")
 51 | file(GLOB_RECURSE SOURCES2 RELATIVE ${CMAKE_SOURCE_DIR} "src/${PROJ}/*")
 52 | cuda_add_executable(${PROJ} ${SOURCES2})
 53 | set_property(TARGET ${PROJ} PROPERTY CUDA_STANDARD 11)
 54 | target_link_libraries(${PROJ} gpunfacommons gpunfautils)
 55 | target_include_directories(${PROJ} PRIVATE include/commons;include/gpunfautils)
 56 | 
 57 | # # infant
 58 | # file(GLOB_RECURSE SOURCES_INFANT RELATIVE ${CMAKE_SOURCE_DIR} "src/infant/*")
 59 | # cuda_add_executable(infant ${SOURCES_INFANT})
 60 | # set_property(TARGET infant PROPERTY CUDA_STANDARD 11)
 61 | # target_link_libraries(infant gpunfacommons gpunfautils)
 62 | 
 63 | # ppopp12
 64 | file(GLOB_RECURSE SOURCES_PPOPP RELATIVE ${CMAKE_SOURCE_DIR} "src/ppopp12/*")
 65 | cuda_add_executable(ppopp12 ${SOURCES_PPOPP})
 66 | set_property(TARGET ppopp12 PROPERTY CUDA_STANDARD 11)
 67 | target_link_libraries(ppopp12 gpunfacommons gpunfautils)
 68 | target_include_directories(ppopp12 PRIVATE include/commons;include/gpunfautils)
 69 | 
 70 | # Optional size overrides. add_definitions is used because add_compile_definitions needs CMake 3.12, above the 3.8 minimum declared on line 1.
 71 | if(DEFINED DATA_BUFFER_SIZE)
 72 |   message("DATA_BUFFER_SIZE is defined to ${DATA_BUFFER_SIZE}")
 73 |   add_definitions(-DDATA_BUFFER_SIZE=${DATA_BUFFER_SIZE})
 74 | endif()
 75 | if(DEFINED DATA_BUFFER_SIZE_FRONTIER)
 76 |   message("DATA_BUFFER_SIZE_FRONTIER is defined to ${DATA_BUFFER_SIZE_FRONTIER}")
 77 |   add_definitions(-DDATA_BUFFER_SIZE_FRONTIER=${DATA_BUFFER_SIZE_FRONTIER})
 78 | endif()
 79 | if(DEFINED RESULTS_SIZE)
 80 |   message("RESULTS_SIZE is defined to ${RESULTS_SIZE}")
 81 |   add_definitions(-DRESULTS_SIZE=${RESULTS_SIZE})
 82 | endif()
 83 | 
 84 | 
 85 | # ngap
 86 | file(GLOB_RECURSE SOURCES_NGAP RELATIVE ${CMAKE_SOURCE_DIR} "src/ngap/*")
 87 | cuda_add_executable(ngap ${SOURCES_NGAP})
 88 | set_property(TARGET ngap PROPERTY CUDA_STANDARD 11)
 89 | target_link_libraries(ngap gpunfacommons gpunfautils)
 90 | target_include_directories(ngap PRIVATE include/commons;include/gpunfautils)
 91 | 
 92 | # asyncap
 93 | add_subdirectory(src/asyncap)
 94 | add_custom_target(asyncap ALL
 95 |     COMMAND make -C ${CMAKE_CURRENT_SOURCE_DIR}/src/asyncap all
 96 |     COMMENT "Running Makefile in asyncap"
 97 | )
 98 | add_dependencies(asyncap gpunfautils gpunfacommons)
 99 | set_property(
100 |     TARGET asyncap
101 |     APPEND
102 |     PROPERTY ADDITIONAL_CLEAN_FILES 
103 |     ${CMAKE_CURRENT_SOURCE_DIR}/src/asyncap/bin
104 |     ${CMAKE_CURRENT_SOURCE_DIR}/build/bin/asyncap
105 | )
106 | 
107 | 
108 | 
109 | 


--------------------------------------------------------------------------------
/code/include/commons/NFA.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * NFA.h
  3 |  *
  4 |  *  Created on: Apr 29, 2018
  5 |  *      Author: hyliu
  6 |  */
  7 | 
  8 | #ifndef NFA_H_
  9 | #define NFA_H_
 10 | 
 11 | #include <string>
 12 | #include <vector>
 13 | #include <map>
 14 | #include <list>
 15 | #include <unordered_map>
 16 | #include <bitset>
 17 | #include <memory>
 18 | #include <set>
 19 | #include "node.h"
 20 | 
 21 | 
 22 | using std::set;
 23 | using std::unique_ptr;
 24 | using std::bitset;
 25 | using std::string;
 26 | using std::map;
 27 | using std::vector;
 28 | using std::list;
 29 | using std::unordered_map;
 30 | using std::pair;
 31 | using std::make_pair;
 32 | 
 33 | 
 34 | class NFA { // Declaration only: no member function has a body in this header.
 35 | 
 36 | public:
 37 | 	NFA();
 38 | 	NFA(int V);
 39 | 	virtual ~NFA();
 40 | 
 41 | 	void addNode(Node *n);
 42 | 
 43 | 	void addNode(Node *n, int intid); // overload that also takes an explicit integer id
 44 | 
 45 | 	void addEdge(string from_str_id, string to_str_id); // both endpoints are named by string id
 46 | 
 47 | 	int size() const;
 48 | 	int edge_size() const;
 49 | 	int always_active_nodes_num; // public data member, not an accessor
 50 | 	int start_active_nodes_num; // public data member, not an accessor
 51 | 
 52 | 	void mark_cc_id();
 53 | 	int get_num_cc() const;
 54 | 
 55 | 	Node* get_node_by_int_id(int iid) const;
 56 | 	Node* get_node_by_str_id(string sid) const;
 57 | 
 58 | 	int get_num_transitions() const;
 59 | 
 60 | 	int get_int_id_by_str_id(string str_id) const;
 61 | 
 62 | 	void print();
 63 | 
 64 | 	void calc_scc();
 65 | 	void topo_sort();
 66 | 
 67 | 	int get_num_scc() const;
 68 | 
 69 | 	vector<string> get_nodes_by_original_id(string original_id) const;
 70 | 
 71 | 	vector<string> get_adj(string str) const;
 72 | 	vector<string> get_from(string str_id) const;
 73 | 
 74 | 	int get_indegree_of_node(string str_id) const;
 75 | 	int get_outdegree_of_node(string str_id) const; 
 76 | 
 77 | 	bool has_node(string str) const;
 78 | 	bool has_node(int int_id) const;
 79 | 
 80 | 	void to_dot_file(string dotfile) const;
 81 | 
 82 | 	/** NOTE(review): this note used to say "return the removed node's intid",
 83 | 		but the declared return type is Node (by value) — confirm what it carries.
 84 | 		This function must be called followed by an addNode(Node *n, int intid);
 85 | 		where the intid is the previous one. 
 86 | 		or there will be an inconsistency. 
 87 | 
 88 | 
 89 | 	**/
 90 | 	Node remove_node_unsafe(string str_id);
 91 | 
 92 | 	void remove_edge(string from_node, string to_node);
 93 | 
 94 | 	int get_num_topoorder() const;
 95 | 
 96 | 	set<uint8_t> get_alphabet_in_nfa_wo_wildcard() const;
 97 | 
 98 | 	set<uint8_t> get_alphabet_in_nodes_wo_wildcard_wo_nottype() const;
 99 | 
100 | 	int get_num_states_leq_topo(int topo);
101 | 	
102 | 	int get_dag();
103 | 
104 | 	bool has_self_loop(int sid) const;
105 | 	bool has_self_loop(string str_id) const;
106 | 	
107 | 	void remove_self_loop(int sid);
108 | 	void remove_self_loop(string str_id);
109 | 
110 | 	int has_self_loop_plus_large_matchset() const; // returns int, not bool — TODO confirm whether it is a count or a flag
111 | 	unordered_map<string, vector<string> > adj; // public; keyed by string id — presumably the lists get_adj() returns, TODO confirm
112 | 
113 | 
114 | private:
115 | 
116 | 	// ----- for separate CCs -------------------------------
117 | 	void calc_bidirected_graph();
118 | 	void clear_visit_flag();
119 | 	void dfs(int start_iid, int cc_id);
120 | 
121 | 	unordered_map<string, vector<string> > from_node; // presumably the reverse of adj (see get_from) — TODO confirm
122 | 
123 | 	unordered_map<string, int> strid_to_intid;
124 | 	
125 | 	unordered_map<int, Node * > node_pool; // raw Node pointers keyed by int; ownership is not visible in this header
126 | 	
127 | 	int V; // n nodes;
128 | 	int E; // presumably the edge count reported by edge_size() — TODO confirm
129 | 
130 | 	unordered_map<int, list<int>> bi_directed_eq_graph; 
131 | 	int num_cc;
132 | 
133 | 
134 | 	unordered_map<string, vector<string> > original_id_to_nodes;
135 | 
136 | };
137 | 
138 | 
139 | 
140 | 
141 | 
142 | 
143 | 
144 | #endif /* NFA_H_ */
145 | 
146 | 
147 | 
148 | 
149 | 
150 | 
151 | 
152 | 


--------------------------------------------------------------------------------
/code/include/commons/NFALoader.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * NFALoader.h
 3 |  *
 4 |  *  Created on: Apr 29, 2018
 5 |  *      Author: hyliu
 6 |  */
 7 | 
 8 | #ifndef NFALOADER_H_
 9 | #define NFALOADER_H_
10 | 
11 | #include <memory>
12 | #include "NFA.h"
13 | #include <string>
14 | #include "pugixml/pugixml.hpp"
15 | //#include "mnrl.hpp"
16 | 
17 | using std::string;
18 | // Each loader returns a raw NFA pointer; who frees it is not stated here — TODO confirm with callers.
19 | NFA *load_nfa_from_anml(string filename);
20 | 
21 | //NFA *load_nfa_from_mnrl(string filename);
22 | // NOTE(review): presumably picks a loader from the file name (the MNRL loader is disabled above) — confirm in NFALoader.cpp.
23 | NFA *load_nfa_from_file(string filename);
24 | 
25 | 
26 | #endif /* NFALOADER_H_ */
27 | 
28 | 
29 | 


--------------------------------------------------------------------------------
/code/include/commons/SymbolStream.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SymbolStream.h
 3 |  *
 4 |  *  Created on: May 1, 2018
 5 |  *      Author: hyliu
 6 |  */
 7 | 
 8 | #ifndef SYMBOLSTREAM_H_
 9 | #define SYMBOLSTREAM_H_
10 | 
11 | 
12 | #include <string>
13 | #include <set>
14 | #include <vector>
15 | 
16 | using std::string;
17 | using std::set;
18 | using std::vector;
19 | 
20 | 
21 | class SymbolStream { // byte-symbol buffer; members without a body here are defined outside this header
22 | public:
23 | 	SymbolStream();
24 | 	virtual ~SymbolStream();
25 | 	void readFromFile(string filename);
26 | 	const set<uint8_t>& calc_alphabet();
27 | 	uint8_t get_position(int pos) const;
28 | 	void set_position(int pos, uint8_t c);
29 | 
30 | 	void push_back(uint8_t c) { // append one symbol to the end of the buffer
31 | 		input.push_back(c);
32 | 	}
33 | 
34 | 	void concat(SymbolStream &s) { // append every symbol of s; s may be this same object
35 | 		if (&s == this) { // inserting a vector's own iterator range into itself is undefined, so copy first
36 | 			const vector<uint8_t> snapshot(input);
37 | 			input.insert(input.end(), snapshot.begin(), snapshot.end());
38 | 			return;
39 | 		}
40 | 		input.insert(input.end(), s.input.begin(), s.input.end());
41 | 	}
42 | 
43 | 	int get_length() const;
44 | 	int size() const { // number of stored symbols, narrowed from the vector's size type to int
45 | 		return input.size();
46 | 	}
47 | 
48 | 	SymbolStream slice(int start, int len) const;
49 | 	void padding_to_base(int base);
50 | 
51 | private:
52 | 	vector<uint8_t> input;
53 | 	string fromFile;
54 | 	set<uint8_t> alphabet;
55 | };
56 | 
57 | 
58 | 
59 | #endif /* SYMBOLSTREAM_H_ */
60 | 


--------------------------------------------------------------------------------
/code/include/commons/graph_helper.h:
--------------------------------------------------------------------------------
 1 | #ifndef GRAPH_HELPER_H
 2 | #define GRAPH_HELPER_H
 3 | 
 4 | 
 5 | #include <algorithm>
 6 | #include <iostream>
 7 | #include <vector>
 8 | #include <map>
 9 | #include <list>
10 | 
11 | using std::map;
12 | using std::vector;
13 | using std::fill;
14 | using std::cout;
15 | using std::endl;
16 | 
17 | 
18 | 
19 | class DAG { // for topological sort. Declaration only: all members are defined outside this header.
20 | private:
21 | 	int V; // presumably the vertex count passed to the constructor — TODO confirm
22 | 	std::map<int, std::list<int> > adj; // lists of ints keyed by int; presumably successor lists filled by addEdge — TODO confirm
23 | 	std::map<int, int> in_degree;
24 | 	bool **gg; // NOTE(review): raw pointer members plus a user-declared destructor and no copy control — copying a DAG may be unsafe; confirm in the .cpp
25 | 
26 | 	std::vector<std::vector<int> > topo_order; // same type that get_topo_order() returns by reference
27 | 	bool *flag;
28 | 
29 | public:
30 | 	DAG (int V);
31 | 	~DAG();
32 | 	void addEdge(int v, int w);
33 | 	void topological_sort();
34 | 	std::vector<std::vector<int> > &get_topo_order(); // non-const reference: callers can modify the returned vectors
35 | 
36 | };
37 | 
38 | 
39 | // A class that represents a directed graph
40 | class MyGraph
41 | {
42 |     int V;    // No. of vertices
43 |     std::map<int, std::list<int> > adj;    // Adjacency lists keyed by vertex id (a std::map, not an array)
44 |     std::map<int, int> scc;    // same type that get_scc() returns by value
45 |     int time_, *pre, *low, *stk, tops, sccN;    // working state; presumably used by tarjan() — TODO confirm
46 |     void tarjan(int s);
47 | 
48 |     //bool dag;
49 | 
50 | 
51 |     bool *visited;
52 |     int *bfs_layer;    // exposed read-only through get_bfs_layers()
53 | 
54 | public:
55 |     MyGraph(int V);   // Constructor
56 |     ~MyGraph();
57 |     void addEdge(int v, int w);   // function to add an edge to graph
58 | 
59 |     void calc_SCC();
60 |     int get_n_scc();
61 | 
62 |     void bfs();
63 |     
64 |     //bool is_dag() const {
65 |      //   return dag;
66 |     //}
67 | 
68 |     std::map<int, int> get_scc();
69 | 
70 |     const int *get_bfs_layers() {    // returns the internal bfs_layer pointer without copying
71 |         return bfs_layer;
72 |     } 
73 | 
74 |     void print_SCC();
75 | 
76 | };
77 | 
78 | 
79 | #endif


--------------------------------------------------------------------------------
/code/include/commons/group_graph.h:
--------------------------------------------------------------------------------
  1 | #ifndef GROUP_GRAPH_H_
  2 | #define GROUP_GRAPH_H_
  3 | 
  4 | #include "graph.h"
  5 | 
  6 | class GroupCsr {
  7 | public:
  8 |   int size;
  9 |   Csr *groups_csr;   // array obtained from cudaMalloc in init()
 10 |   Csr *h_groups_csr; // array obtained from new[] in init()
 11 |   // Build one Csr per graph, store it in h_groups_csr and copy it into groups_csr.
 12 |   void init(std::vector<Graph *> &gs) {
 13 |     size = gs.size();
 14 |     h_groups_csr = new Csr[size];
 15 |     CHECK_ERROR(cudaMalloc(&groups_csr, sizeof(Csr) * size));
 16 |     for (int g = 0; g < size; ++g) {
 17 |       Graph *src = gs[g];
 18 |       Csr packed(*src);
 19 |       packed.fromCoo(src->edge_pairs->get_host());
 20 |       packed.moveToDevice();
 21 |       h_groups_csr[g] = packed;
 22 |       CHECK_ERROR(cudaMemcpy(static_cast<void *>(groups_csr + g), &packed,
 23 |                              sizeof(Csr), cudaMemcpyHostToDevice));
 24 |     }
 25 |   }
 26 |   // Free both arrays; nothing happens when size is not positive.
 27 |   void release() {
 28 |     if (size <= 0)
 29 |       return;
 30 |     CHECK_ERROR(cudaFree(static_cast<void *>(groups_csr)));
 31 |     delete[] h_groups_csr;
 32 |   }
 33 | };
 34 | 
 35 | class GroupMatchset {
 36 | public:
 37 |   int size;
 38 |   Matchset *groups_ms; // array obtained from cudaMalloc in init()
 39 |   // Allocate one Matchset slot per graph and fill each slot from
 40 |   // Graph::get_matchset_device(use_soa).
 41 |   void init(std::vector<Graph *> &gs, bool use_soa) {
 42 |     size = gs.size();
 43 |     CHECK_ERROR(cudaMalloc(&groups_ms, sizeof(Matchset) * size));
 44 |     for (int g = 0; g < size; ++g) {
 45 |       Matchset slot = gs[g]->get_matchset_device(use_soa);
 46 |       CHECK_ERROR(cudaMemcpy(static_cast<void *>(groups_ms + g), &slot,
 47 |                              sizeof(Matchset), cudaMemcpyHostToDevice));
 48 |     }
 49 |   }
 50 |   // Free the slot array; nothing happens when size is not positive.
 51 |   void release() {
 52 |     if (size <= 0)
 53 |       return;
 54 |     CHECK_ERROR(cudaFree(static_cast<void *>(groups_ms)));
 55 |   }
 56 | };
 57 | 
 58 | class GroupNodeAttrs {
 59 | public:
 60 |   int size;
 61 |   uint8_t **groups_node_attrs; // pointer table obtained from cudaMalloc in init()
 62 |   // Store, for every graph, the pointer returned by node_attrs->get_dev()
 63 |   // into the matching entry of the table.
 64 |   void init(std::vector<Graph *> &gs) {
 65 |     size = gs.size();
 66 |     CHECK_ERROR(cudaMalloc(&groups_node_attrs, sizeof(uint8_t *) * size));
 67 |     for (int g = 0; g < size; ++g) {
 68 |       uint8_t *attrs = gs[g]->node_attrs->get_dev();
 69 |       CHECK_ERROR(cudaMemcpy(static_cast<void *>(groups_node_attrs + g),
 70 |                              static_cast<void *>(&attrs), sizeof(uint8_t *),
 71 |                              cudaMemcpyHostToDevice));
 72 |     }
 73 |   }
 74 |   // Free the pointer table only; nothing happens when size is not positive.
 75 |   void release() {
 76 |     if (size <= 0)
 77 |       return;
 78 |     // The per-entry buffers are not freed here: the loop that called
 79 |     // cudaFree on each groups_node_attrs[i] is disabled.
 80 |     CHECK_ERROR(cudaFree(static_cast<void *>(groups_node_attrs)));
 81 |   }
 82 | };
 83 | 
 84 | class GroupAAS {
 85 | public:
 86 |   int size;
 87 |   int **groups_always_active_states; // pointer table obtained from cudaMalloc in init()
 88 |   // Store, for every graph, the pointer returned by
 89 |   // always_active_nodes->get_dev() into the matching entry of the table.
 90 |   void init(std::vector<Graph *> &gs) {
 91 |     size = gs.size();
 92 |     CHECK_ERROR(cudaMalloc(&groups_always_active_states, sizeof(int *) * size));
 93 |     for (int g = 0; g < size; ++g) {
 94 |       int *states = gs[g]->always_active_nodes->get_dev();
 95 |       CHECK_ERROR(cudaMemcpy(
 96 |           static_cast<void *>(groups_always_active_states + g),
 97 |           static_cast<void *>(&states), sizeof(int *),
 98 |           cudaMemcpyHostToDevice));
 99 |     }
100 |   }
101 |   // Free the pointer table only; nothing happens when size is not positive.
102 |   void release() {
103 |     if (size <= 0)
104 |       return;
105 |     // The per-entry buffers are not freed here: the loop that called
106 |     // cudaFree on each groups_always_active_states[i] is disabled.
107 |     CHECK_ERROR(cudaFree(static_cast<void *>(groups_always_active_states)));
108 |   }
109 | };
110 | 
111 | #endif


--------------------------------------------------------------------------------
/code/include/commons/my_bitset.h:
--------------------------------------------------------------------------------
  1 | 
  2 | #pragma once
  3 | 
  4 | #include <bitset>
  5 | #include <iostream>
  6 | #include <math.h>
  7 | 
  8 | typedef struct My_bitset256 { // 256 bits in eight 32-bit words; bit k lives at bit (k % 32) of data[k / 32]
  9 |   uint32_t data[8];
 10 | 
 11 |   __host__ __device__ My_bitset256() { memset(data, 0, sizeof(data)); }
 12 | 
 13 |   __host__ __device__ My_bitset256(const My_bitset256 &other) {
 14 |     memcpy(data, other.data, sizeof(data));
 15 |   }
 16 | 
 17 |   __host__ __device__ ~My_bitset256() {}
 18 |   // Clear every bit. Always returns cudaSuccess.
 19 |   __host__ __device__ cudaError_t reset() {
 20 |     memset(data, 0, sizeof(data));
 21 |     return cudaSuccess;
 22 |   }
 23 |   // OR the low bit of `value` into bit `offset`; a zero value leaves the bit as it was. Always returns cudaSuccess.
 24 |   __host__ __device__ cudaError_t set(uint8_t offset, int value) {
 25 |     int pos = (offset / 32);
 26 |     data[pos] |= static_cast<uint32_t>(value & 1) << (offset % 32); // unsigned shift: bit 31 must not be produced by a signed int
 27 |     return cudaSuccess;
 28 |   }
 29 |   // Return true when bit `offset` is set; const so it also works on const objects.
 30 |   __host__ __device__ bool test(uint8_t offset) const {
 31 |     int pos = (offset / 32);
 32 |     return (data[pos] & (static_cast<uint32_t>(1) << (offset % 32))) != 0;
 33 |   }
 34 | 
 35 |   __host__ __device__ My_bitset256 &operator=(const My_bitset256 &other) {
 36 |     if (this != &other) memcpy(data, other.data, sizeof(data)); // memcpy must not be given identical source and destination
 37 |     return *this;
 38 |   }
 39 |   // Host only. Because set() only ORs, the else-branch is a no-op: bits already set here are never cleared.
 40 |   void fromBitset(std::bitset<256> column) {
 41 |     for (int i = 0; i < 256; i++) {
 42 |       if (column.test(i)) {
 43 |         set(i, 1);
 44 |       } else {
 45 |         set(i, 0);
 46 |       }
 47 |     }
 48 |   }
 49 | 
 50 | } My_bitset256;
 51 | 
 52 | struct My_bitsetN { // N bits in a new[]-allocated array of `size` 32-bit words
 53 |   uint32_t N;
 54 |   uint32_t size;
 55 |   uint32_t *data;
 56 | 
 57 |   __host__ __device__ My_bitsetN(int N = 256) : N(N) {
 58 |     // `N` below is the int constructor argument, which shadows the member.
 59 |     this->size = (N - 1) / 32 + 1;
 60 |     data = new uint32_t[size];
 61 |     memset(data, 0, sizeof(uint32_t) * size);
 62 |   }
 63 |   // Deep copy. N and size come from `other`; the old `this->N = N` read the uninitialized member.
 64 |   __host__ __device__ My_bitsetN(const My_bitsetN &other) {
 65 |     this->N = other.N;
 66 |     this->size = other.size;
 67 |     data = new uint32_t[size];
 68 |     memcpy(data, other.data, sizeof(uint32_t) * size);
 69 |   }
 70 | 
 71 |   __host__ __device__ ~My_bitsetN() { delete[] data; }
 72 |   // Clear every bit. Always returns cudaSuccess.
 73 |   __host__ __device__ cudaError_t reset() {
 74 |     memset(data, 0, sizeof(uint32_t) * size);
 75 |     return cudaSuccess;
 76 |   }
 77 |   // OR the low bit of `value` into bit `offset` (no range check); a zero value leaves the bit as it was.
 78 |   __host__ __device__ cudaError_t set(int offset, int value) {
 79 |     int pos = (offset / 32);
 80 |     data[pos] |= static_cast<uint32_t>(value & 1) << (offset % 32); // unsigned shift: bit 31 must not be produced by a signed int
 81 |     return cudaSuccess;
 82 |   }
 83 |   // Return true when bit `offset` is set (no range check); const so it also works on const objects.
 84 |   __host__ __device__ bool test(int offset) const {
 85 |     int pos = (offset / 32);
 86 |     return (data[pos] & (static_cast<uint32_t>(1) << (offset % 32))) != 0;
 87 |   }
 88 |   // Deep-copy assignment: self-assignment is a no-op, the old buffer is released, and sizes follow `other`.
 89 |   __host__ __device__ My_bitsetN &operator=(const My_bitsetN &other) {
 90 |     if (this == &other) return *this;
 91 |     uint32_t *fresh = new uint32_t[other.size]; // allocate before releasing the old buffer
 92 |     memcpy(fresh, other.data, sizeof(uint32_t) * other.size);
 93 |     delete[] data; // the old code overwrote `data` without freeing it
 94 |     data = fresh; N = other.N; size = other.size;
 95 |     return *this;
 96 |   }
 97 |   // void fromBitset(std::bitset<N> column) {
 98 |   //   for (int i = 0; i < N; i++) {
 99 |   //     if (column.test(i)) {
100 |   //       set(i, 1);
101 |   //     } else {
102 |   //       set(i, 0);
103 |   //     }
104 |   //   }
105 |   // }
106 | };
107 | 
108 | 
109 | 
110 | 


--------------------------------------------------------------------------------
/code/include/commons/node.h:
--------------------------------------------------------------------------------
  1 | #ifndef NODE_H_
  2 | #define NODE_H_
  3 | 
  4 | #include <string>
  5 | #include <vector>
  6 | #include <map>
  7 | #include <list>
  8 | #include <unordered_map>
  9 | #include <bitset>
 10 | #include <memory>
 11 | #include <set>
 12 | #include "vasim_helper.h"
 13 | 
 14 | 
 15 | using std::set;
 16 | using std::unique_ptr;
 17 | using std::bitset;
 18 | using std::string;
 19 | using std::map;
 20 | using std::vector;
 21 | using std::list;
 22 | using std::unordered_map;
 23 | using std::pair;
 24 | using std::make_pair;
 25 | 
 26 | enum NODE_START_ENUM { // start kinds; NOTE(review): presumably the values stored in Node::start - confirm
 27 | 	START=1, 
 28 | 	START_ALWAYS_ENABLED=2
 29 | };
 30 | 
 31 | 
 32 | class Node {
 33 | public:
 34 | 	Node();
 35 | 	
 36 | 	~Node();
 37 | 	// String identifiers. NOTE(review): how original_id and str_id differ is not visible in this header.
 38 | 	string original_id;
 39 | 
 40 | 	string str_id;
 41 | 	// Integer ids. NOTE(review): presumably state id, connected-component id and id within the component - confirm.
 42 | 	int sid;
 43 | 	int cc_id;
 44 | 	int cc_local_id;
 45 | 	// NOTE(review): presumably strongly-connected-component id and topological order - confirm.
 46 | 	int scc_id;
 47 | 	int topo_order;
 48 | 
 49 | 	int bfs_layer;
 50 | 	
 51 | 	// One bit per 8-bit input symbol; match2() tests the bit for the given symbol.
 52 | 	bitset<256> symbol_set;
 53 | 	
 54 | 	string symbol_set_str;
 55 | 
 56 | 	bool complete; 
 57 | 	bool complement;
 58 | 	int match_set_range;
 59 | 	// NOTE(review): presumably holds a NODE_START_ENUM value - confirm.
 60 | 	int start;
 61 | 
 62 | 	
 63 | 	bool report = false;
 64 | 
 65 | 	// Added for MNRL support.
 66 | 	string report_code;
 67 | 	bool report_eod = false; 
 68 | 
 69 | 
 70 | 	bool visited = false;
 71 | 
 72 | 	void symbol_set_to_bit();
 73 |     // Returns true when the bit for `input` is set in symbol_set.
 74 |     inline bool match2(uint8_t input) const {
 75 |         return symbol_set.test(input);
 76 |     }
 77 | 
 78 |     bool is_start_always_enabled() const;
 79 |     bool is_start() const;
 80 |     bool is_report() const;
 81 | 
 82 |     bool is_wildcard() const;
 83 | 
 84 |     // A node whose symbol set is the complement of a single symbol is classified as a "not"-type node.
 85 |     bool is_not_type_node() const; 
 86 | 
 87 |     int num_of_accept_symbol() const;
 88 | 
 89 |     void remap_alphabet(const map<int, int> &remap_table);
 90 | 
 91 |     int num_of_1_in_matchset() const; 
 92 | 
 93 | 
 94 |     double hot_degree; 
 95 | 
 96 |     int cg_id;
 97 | 
 98 | };
 99 | 
100 | 
101 | 
102 | 
103 | 
104 | 
105 | 
106 | #endif /*NODE_H */
107 | 
108 | 


--------------------------------------------------------------------------------
/code/include/commons/precompute_table.h:
--------------------------------------------------------------------------------
  1 | #ifndef PRECOMPUTE_TABLE_H
  2 | #define PRECOMPUTE_TABLE_H
  3 | #include <vector>
  4 | #include <cstdint>
  5 | #include <assert.h>
  6 | #include <stdio.h>
  7 | 
  8 | class PrecTable {
  9 | public:
 10 |   uint64_t size = 0;
 11 |   int depth = 0;
 12 |   int cutoff = 0; // was uninitialized after default construction
 13 |   uint64_t nonzeroVerticesNum = 0;
 14 |   uint64_t nonzeroResultsNum = 0;
 15 |   bool isCompress = true; // same default as allocate(); was uninitialized after default construction
 16 |   int maxkey = 0;
 17 | 
 18 |   std::vector<int *> vertices;
 19 |   std::vector<int> vertices_length;
 20 |   std::vector<int *> results;
 21 |   std::vector<int> results_length;
 22 | 
 23 |   std::vector<uint32_t> nonzeroVerticesMap; // from index to vertex
 24 |   std::vector<uint32_t> nonzeroResultsMap;  // from index to result
 25 |   // Device-side pointers. They start out null so an unpopulated table never holds garbage addresses.
 26 |   int *d_vertices = nullptr;
 27 |   int *d_vertices_offsets = nullptr;
 28 |   int *d_results = nullptr;
 29 |   int *d_results_offsets = nullptr;
 30 | 
 31 |   uint32_t* d_nonzeroVerticesMap = nullptr; // from index to vertex
 32 |   uint32_t* d_nonzeroResultsMap = nullptr; // from index to result
 33 | 
 34 |   PrecTable(){}
 35 | 
 36 |   PrecTable(uint64_t size);
 37 | 
 38 |   void allocate(uint64_t size, int depth, bool isCompress = true);
 39 | 
 40 |   void setVertices(uint32_t index, std::vector<int> &v);
 41 | 
 42 |   void setResults(uint32_t index, std::vector<int> &v);
 43 | 
 44 | 
 45 |   int printHistogram();
 46 | 
 47 |   void calcCutoff();
 48 | 
 49 |   void calcCutoffMedian();
 50 | 
 51 |   void toDevice(bool use_uvm = false);
 52 | 
 53 |   void releaseHost();
 54 | 
 55 |   void releaseDevice();
 56 | 
 57 |   // Device lookup: position of `symbol` in d_nonzeroVerticesMap (-1 if absent), or `symbol` itself when not compressed.
 58 |   // template <typename T>
 59 |   __device__ __forceinline__ int getVertexSymbolIndex(uint32_t symbol) {
 60 |     if(isCompress)
 61 |       return binary_search(d_nonzeroVerticesMap, nonzeroVerticesNum, symbol);
 62 |     else
 63 |       return (int)symbol;
 64 |   }
 65 |   // Device lookup: position of `symbol` in d_nonzeroResultsMap (-1 if absent), or `symbol` itself when not compressed.
 66 |   // template <typename T>
 67 |   __device__ __forceinline__ int getResultSymbolIndex(uint32_t symbol) {
 68 |     if(isCompress)
 69 |       return binary_search(d_nonzeroResultsMap, nonzeroResultsNum, symbol);
 70 |     else
 71 |       return (int)symbol;
 72 |   }
 73 |   // Host lookup in nonzeroVerticesMap; always searches, isCompress is not consulted. data() is valid even when empty.
 74 |   // template <typename T>
 75 |   __host__ __forceinline__ int getVertexSymbolIndexHost(uint32_t symbol) {
 76 |     return binary_search(nonzeroVerticesMap.data(), nonzeroVerticesNum, symbol);
 77 |   }
 78 |   // Host lookup in nonzeroResultsMap; always searches, isCompress is not consulted. data() is valid even when empty.
 79 |   // template <typename T>
 80 |   __host__ __forceinline__ int getResultSymbolIndexHost(uint32_t symbol) {
 81 |     return binary_search(nonzeroResultsMap.data(), nonzeroResultsNum, symbol);
 82 |   }
 83 |   // Binary search for x in arr[0, n), which must be sorted ascending. Returns its index or -1; n <= 0 gives -1.
 84 |   // template <typename T>
 85 |   __device__ __host__ __forceinline__ int binary_search(uint32_t *arr,
 86 |                                                              int n,
 87 |                                                              uint32_t x) {
 88 |     int start = 0;
 89 |     int end = n - 1;
 90 |     while (start <= end) {
 91 |       int mid = start + (end - start) / 2; // same midpoint as (start + end) / 2 without int overflow
 92 |       if (arr[mid] == x)
 93 |         return (int)mid;
 94 |       else if (arr[mid] < x)
 95 |         start = mid + 1;
 96 |       else
 97 |         end = mid - 1;
 98 |     }
 99 |     return -1;
100 |   }
101 | };
102 | 
103 | #endif


--------------------------------------------------------------------------------
/code/include/commons/report_formatter.h:
--------------------------------------------------------------------------------
 1 | #ifndef REPORT_FORMATTER_H_
 2 | #define REPORT_FORMATTER_H_
 3 | 
 4 | #include <string>
 5 | #include <vector>
 6 | 
 7 | using std::vector;
 8 | 
 9 | using std::string;
10 | 
11 | 
12 | class report {
13 | 
14 | public:
15 | 	 int offset;
16 | 	 string str_id;
17 | 	 int cc;
18 | 	 int input_stream_id;
19 | 
20 | 	 // Orders reports lexicographically by
21 | 	 // (input_stream_id, offset, cc, str_id).
22 | 	 //
23 | 	 // str_id is the last tie-breaker. The previous version never ordered
24 | 	 // by str_id (its str_id branch re-tested `cc < r.cc`, which was already
25 | 	 // known to be false), so reports differing only in str_id were
26 | 	 // equivalent under operator< while unequal under operator==. After a
27 | 	 // sort, equal reports could then end up non-adjacent, which defeats
28 | 	 // adjacent-duplicate removal such as std::unique.
29 | 	 //
30 | 	 // Returns true when *this sorts strictly before r.
31 | 	 bool operator < (const report &r ) const {
32 | 			if (input_stream_id != r.input_stream_id) {
33 | 				return input_stream_id < r.input_stream_id;
34 | 			}
35 | 			if (offset != r.offset) {
36 | 				return offset < r.offset;
37 | 			}
38 | 			if (cc != r.cc) {
39 | 				return cc < r.cc;
40 | 			}
41 | 			return str_id < r.str_id;
42 | 		}
43 | 
44 | 
45 |     // Two reports are equal when all four fields match.
46 |     bool operator == (const report &r) const {
47 |     	return offset == r.offset && str_id == r.str_id && cc == r.cc && input_stream_id == r.input_stream_id;
48 |     }
49 | 		report(){};
50 | 
51 |     report(int offset, string str_id, int cc, int input_stream_id);
52 | 
53 | };
54 | 
55 | 
56 | class report_formatter {
57 | public:
58 | 	report_formatter();
59 | 	// NOTE(review): presumably writes the collected reports to `filename`, de-duplicated when `unique` is true - confirm in the implementation.
60 | 	void print_to_file(string filename, bool unique=true);
61 | 
62 | 	void add_report(report rp);
63 | 	// Number of reports currently stored (the container's size_t is narrowed to int).
64 | 	int size() const {
65 | 		return reports.size();
66 | 	}
67 | 
68 | // private:
69 | 	vector<report> reports;
70 | 
71 | };
72 | 
73 | #endif


--------------------------------------------------------------------------------
/code/include/commons/validate.h:
--------------------------------------------------------------------------------
 1 | #ifndef VALIDATE_H_
 2 | #define VALIDATE_H_
 3 | 
 4 | #include "graph.h"
 5 | #include "group_graph.h"
 6 | 
 7 | namespace automata_utils { // prototypes only; NOTE(review): presumably host-side reference runs and result checking - confirm in the implementation
 8 | 	  void automataGroupsReference(std::vector<Graph *> &gs,
 9 | 																uint8_t *input_str, int num_seg,
10 | 																int input_length,
11 | 																std::vector<uint64_t> *results,
12 | 																std::vector<uint64_t> *db_results,
13 | 																int debug_iter, GroupCsr gcsr);
14 | 		void automataReference(Graph &g, uint8_t *input_str, int num_seg,
15 | 			int input_length,
16 | 			std::vector<uint64_t> *results,
17 | 			std::vector<uint64_t> *db_results,
18 | 			int debug_iter, Csr csr);
19 | 		// NOTE(review): presumably returns true when `results` agrees with `ref_results` - confirm.
20 | 		bool automataValidation(std::vector<uint64_t> *results,
21 | 																std::vector<uint64_t> *ref_results,
22 | 																bool ifPrintBoth);
23 | }
24 | 
25 | #endif


--------------------------------------------------------------------------------
/code/include/commons/vasim_helper.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | 
 4 | #include <string>
 5 | #include <bitset>
 6 | 
 7 | 
 8 | 
 9 | /**
10 |  * helper functions
11 |  * From VASim
12 |  */
13 | namespace VASim { // prototypes only; behaviour below is inferred from names and signatures
14 | 	void find_and_replace(std::string & source, std::string const & find, std::string const & replace); // NOTE(review): presumably edits `source` in place - confirm
15 | 	void setRange(std::bitset<256> &column, int start, int end, int value); // NOTE(review): whether `end` is inclusive is not visible here
16 | 	void parseSymbolSet(std::bitset<256> &column, std::string symbol_set); // NOTE(review): presumably fills `column` from the textual symbol set - confirm
17 | }
18 | 
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/code/include/gpunfautils/abstract_gpunfa.h:
--------------------------------------------------------------------------------
  1 | #ifndef ABSTRACT_NFA_PROCESSING_ALGORITHM
  2 | #define ABSTRACT_NFA_PROCESSING_ALGORITHM
  3 | 
  4 | 
  5 | #include <algorithm>
  6 | #include <iostream>
  7 | #include <vector>
  8 | #include <map>
  9 | #include <list>
 10 | #include <cassert>
 11 | #include <set>
 12 | #include "commons/NFA.h"
 13 | #include "array2.h"
 14 | #include "utils.h"
 15 | #include "common.h"
 16 | #include "commons/SymbolStream.h"
 17 | 
 18 | using std::map;
 19 | using std::vector;
 20 | using std::fill;
 21 | using std::cout;
 22 | using std::endl;
 23 | using std::pair;
 24 | using std::set;
 25 | using std::make_pair;
 26 | 
 27 | 
 28 | 
 29 | class abstract_algorithm {
 30 | public:
 31 | 	explicit abstract_algorithm(NFA *nfa);
 32 | 	virtual ~abstract_algorithm();
 33 | 	// Processing stages. Only launch_kernel() must be overridden; the other two default to no-ops.
 34 | 	virtual void preprocessing() {}
 35 | 	virtual void launch_kernel() = 0;
 36 | 	virtual void postprocessing() {}
 37 | 
 38 | 	virtual void set_alphabet(set<uint8_t> alphabet);
 39 | 	virtual const SymbolStream& get_symbol_stream(int i) const;
 40 | 	virtual void add_symbol_stream(SymbolStream ss);
 41 | 	// Number of symbol streams currently held.
 42 | 	virtual int get_num_streams() const {
 43 | 		return symbol_streams.size();
 44 | 	}
 45 | 
 46 | 	void set_block_size(int block_size);
 47 | 
 48 | 	void set_output_file(string output_filename) {
 49 | 		output_file = output_filename;
 50 | 	}
 51 | 
 52 | 	void set_output_buffer_size(unsigned long long int ob_size) {
 53 | 		output_buffer_size = ob_size;
 54 | 	}
 55 | 
 56 | 	void set_NFA(NFA *nfa) {
 57 | 		this->nfa = nfa;  // the parameter shadows the member, so this-> is required
 58 | 	}
 59 | 
 60 | 	// Report generation can be switched off to save the time and
 61 | 	// space that producing reports would cost.
 62 | 	void turn_off_report() {
 63 | 		report_on = false;
 64 | 	}
 65 | 
 66 | 	void turn_on_report() {
 67 | 		report_on = true;
 68 | 	}
 69 | 	// Sets report_on to the inverse of report_off and prints the resulting state.
 70 | 	void set_report_off(bool &report_off,
 71 | 											unsigned long long int result_capacity,
 72 | 											long long int quick_result_number) {
 73 | 			// A non-negative estimate whose 1.5x margin reaches the capacity
 74 | 			// forces report_off to true (the caller's flag is updated as well).
 75 | 			const bool capacity_at_risk = quick_result_number >= 0 &&
 76 | 					result_capacity <= quick_result_number * 1.5;
 77 | 			if (capacity_at_risk && !report_off) {
 78 | 					printf("Warning: The number of results may exceed the "
 79 | 									"capacity limit. "
 80 | 									"Set report_off=true.\n");
 81 | 					report_off = true;
 82 | 			}
 83 | 			report_on = !report_off;
 84 | 			if (!report_on) {
 85 | 					printf("Report off.\n");
 86 | 			} else {
 87 | 					printf("Report on.\n");
 88 | 			}	}
 89 | 
 90 | 	Array2<uint8_t> *concat_input_streams_to_array2(); 
 91 | 	
 92 | 	void set_padding_input_stream(int pad) {
 93 | 		padding_input_stream = pad;
 94 | 	}
 95 | 
 96 | 
 97 | 	void set_max_cc_size_limit(int max_cc_size_limit) {
 98 | 		this->max_cc_size_limit = max_cc_size_limit;  // shadowed member
 99 | 	}
100 | 	
101 | 	void set_read_input(bool b) {
102 | 		read_input = b;
103 | 	}
104 | 
105 | 	bool validation;
106 | 	NFA *nfa;
107 | 	vector<NFA *> ccs; 
108 | 	
109 | protected:
110 | 
111 | 	int max_cc_size_limit;
112 | 
113 | 	int padding_input_stream;
114 | 
115 | 	unsigned long long int  output_buffer_size;
116 | 	
117 | 	vector<SymbolStream> symbol_streams;
118 | 
119 | 	set<uint8_t> alphabet;
120 | 
121 | 	int block_size;
122 | 
123 | 	bool report_on;       // whether reports are generated
124 | 
125 | 	string output_file;
126 | 
127 | 	bool read_input;
128 | 
129 | };
130 | 
131 | 
132 | 
133 | 
134 | #endif


--------------------------------------------------------------------------------
/code/include/gpunfautils/array2.h:
--------------------------------------------------------------------------------
  1 | 
  2 | #ifndef ARRAY2_H_
  3 | #define ARRAY2_H_
  4 | 
  5 | 
  6 | #include <cuda_runtime.h>
  7 | #include <cuda.h>
  8 | #include <cassert>
  9 | #include <iostream>
 10 | #include <cstring>
 11 | 
 12 | using std::cout;
 13 | using std::cerr;
 14 | using std::endl;
 15 | 
 16 | 
 17 | template <class T>
 18 | class Array2 {
 19 | public:
 20 | 	Array2(int arr_length):  Array2(arr_length, "noname_array") {
 21 | 		
 22 | 	}
 23 | 	// Allocates arr_length elements on the host (new[]) and on the device (cudaMalloc); exits the process if cudaMalloc fails.
 24 | 	Array2(int arr_length, std::string arr_id) : h_arr(NULL), d_arr(NULL) {
 25 | 		this->arr_id = arr_id;
 26 | 
 27 | 		// cout << "arr_length = " << arr_length << endl;
 28 | 
 29 | 		this->arr_length = arr_length;
 30 | 		this->element_bytes = sizeof(T);
 31 | 
 32 | 		if (arr_length <= 0) {
 33 | 			// Empty array: nothing is allocated and both pointers stay NULL. A negative length is a caller bug.
 34 | 			assert(arr_length >= 0);
 35 | 			return;
 36 | 		}
 37 | 
 38 | 		h_arr = new T[arr_length];
 39 | 
 40 | 		//assert(element_bytes * arr_length > 0);
 41 | 
 42 | 		auto errcode = cudaMalloc(&d_arr, 1ULL * element_bytes * arr_length);
 43 | 		
 44 | 		if (errcode != cudaSuccess) {
 45 | 			cerr << "try to allocate " << 1ULL * arr_length * element_bytes << " byte of memory failed" << " arrid = " << arr_id << endl;
 46 | 			cerr << "cannot allocate cuda memory " << errcode << endl; 
 47 | 			exit(-1);
 48 | 		}
 49 | 
 50 | 	}
 51 | 
 52 | 	// Frees both buffers. NOTE(review): the class defines no copy operations, so copying an Array2 would double-free.
 53 | 	virtual ~Array2() {
 54 | 		// Both pointers are NULL for a zero-length array (see the constructor).
 55 | 		delete [] h_arr;
 56 | 		// delete[] and cudaFree both treat NULL as a no-op, so no NULL assertion
 57 | 		// is needed; the former assertions aborted when an empty array was destroyed.
 58 | 		cudaFree(d_arr);
 59 | 
 60 | 	}
 61 | 
 62 | 
 63 | 	int size() const {
 64 | 		return arr_length;
 65 | 	}
 66 | 
 67 | 	int size_of_T() const {
 68 | 		return element_bytes;
 69 | 	}
 70 | 	// Total payload size in bytes, computed in 64-bit arithmetic.
 71 | 	unsigned long long num_of_byte() const {
 72 | 		return 1ULL * element_bytes * arr_length;
 73 | 	}
 74 | 
 75 | 	T *get_dev() const {
 76 | 		return d_arr;
 77 | 	} 
 78 | 	// Bounds-checked (assert) read of the host copy.
 79 | 	T get(int idx) const {
 80 | 		assert(idx >= 0 && idx < size());
 81 | 		return h_arr[idx];
 82 | 	}
 83 | 
 84 | 	T *get_host() const {
 85 | 		return h_arr;
 86 | 	}
 87 | 	// Zero-fills the host copy only; the device copy is untouched.
 88 | 	void clear_to_zero() {
 89 | 		if (h_arr != NULL) memset(h_arr, 0, num_of_byte()); // h_arr is NULL for an empty array
 90 | 	}
 91 | 
 92 | 	// Assigns val to every element of the host copy.
 93 | 	void fill(T val) {
 94 | 		for (int i = 0; i < arr_length; i++) {
 95 | 			h_arr[i] = val;
 96 | 		}
 97 | 	}
 98 | 
 99 | 	// Bounds-checked (assert) read; which buffer is read depends on the compilation pass (__CUDA_ARCH__).
100 | 	T operator[] (int idx) {
101 | 		assert(idx >= 0 && idx < size());
102 | #ifdef __CUDA_ARCH__
103 |     return d_arr[idx];
104 | #else
105 |     return h_arr[idx];
106 | #endif
107 | 	}
108 | 	// Writes v into the host copy; prints the offending index before asserting when idx is out of range.
109 | 	void set(int idx, T v) {
110 | 		if (!(idx >= 0 && idx < size())) {
111 | 			cout << "assert(idx >= 0 && idx < size());  " << idx << endl;
112 | 			assert(idx >= 0 && idx < size());
113 | 		}
114 | 
115 | 		h_arr[idx] = v;
116 | 
117 | 	}
118 | 	// Copies the whole host buffer to the device; exits the process on failure. No-op for an empty array.
119 | 	void copy_to_device() {
120 | 
121 | 		//cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind )
122 | 		if (num_of_byte() == 0) {
123 | 			// std::cout<< "FAIL: copy_to_device !!!!!\n";
124 | 			// printf("%d, %d", element_bytes ,arr_length);
125 | 			return;
126 | 		}
127 | 
128 | 
129 | 		auto errcode = cudaMemcpy(d_arr, h_arr, num_of_byte(), cudaMemcpyHostToDevice);
130 | 
131 | 		if (errcode != cudaSuccess) {
132 | 			cout << "trying to copy " << num_of_byte() << " byte to device " << endl;
133 | 			cout << "cannot copy to device error code = " << errcode << endl; 
134 | 			exit(-1);
135 | 		}
136 | 	}
137 | 	// Copies the whole device buffer back to the host; exits the process on failure. No-op for an empty array.
138 | 	void copy_back() {
139 | 		if (num_of_byte() == 0) return; // mirrors copy_to_device(): nothing was allocated
140 | 		auto errcode = cudaMemcpy(h_arr, d_arr, num_of_byte(), cudaMemcpyDeviceToHost);
141 | 		if (errcode != cudaSuccess) {
142 | 			cerr << "cannot copy back" << "  " << errcode << endl; 
143 | 			exit(-1);
144 | 		}
145 | 	}
146 | 	T *copy_to_host(int num_of_element) { // returns a new[] buffer that the caller must delete[]
147 | 		assert(num_of_element <= arr_length);
148 | 		
149 | 		T *arr = new T[num_of_element];
150 | 		auto errcode = cudaMemcpy(arr, d_arr, 1ULL * sizeof(T) * num_of_element, cudaMemcpyDeviceToHost);
151 | 		if (errcode != cudaSuccess) {
152 | 			cerr << "cannot copy_to_host " << "  " << errcode << endl; 
153 | 			exit(-1);
154 | 		}
155 | 
156 | 		return arr;
157 | 		
158 | 	}
159 | 	// Prints the host copy, space separated, for debugging.
160 | 	void print() const {
161 | 		cout << "print for debug array2 length = " << arr_length <<  endl;
162 | 		for (int i = 0; i < arr_length; i++) {
163 | 			cout << h_arr[i] << " " ;
164 | 		}
165 | 
166 | 		cout << endl;
167 | 	}
168 | 	// Like copy_to_host() but issues cudaMemcpyAsync; NOTE(review): the caller presumably has to synchronize before reading the buffer - confirm.
169 | 	T *copy_to_host_async(int num_of_element) {
170 | 		assert(num_of_element <= arr_length);
171 | 		
172 | 		T *arr = new T[num_of_element];
173 | 		auto errcode = cudaMemcpyAsync(arr, d_arr, 1ULL * sizeof(T) * num_of_element, cudaMemcpyDeviceToHost);
174 | 		if (errcode != cudaSuccess) {
175 | 			cerr << "cannot copy_to_host " << "  " << errcode << endl; 
176 | 			exit(-1);
177 | 		}
178 | 
179 | 		return arr;	
180 | 	}
181 | 
182 | // private:
183 | 	int arr_length;
184 | 	int element_bytes;
185 | 
186 | 	T *h_arr; 
187 | 	T *d_arr;
188 | 
189 | 	std::string arr_id;
190 | 
191 | };
192 | 
193 | 
194 | #endif


--------------------------------------------------------------------------------
/code/include/gpunfautils/common.h:
--------------------------------------------------------------------------------
  1 | #ifndef COMMON_H_
  2 | #define COMMON_H_
  3 | 
  4 | #include <unordered_map>
  5 | #include <iostream>
  6 | #include <string>
  7 | #include <algorithm>
  8 | #include <cassert>
  9 | #include <queue>
 10 | #include <stack>
 11 | #include <memory>
 12 | #include <map>
 13 | #include <vector>
 14 | #include <iomanip>
 15 | #include <bitset>
 16 | 
 17 | using std::vector;
 18 | using std::string;
 19 | using std::make_pair;
 20 | using std::pair;
 21 | 
 22 | const int ALPHABET_SIZE = 256; // NOTE(review): presumably the number of distinct 8-bit input symbols - confirm
 23 | 
 24 | const int EMPTY_ENTRY = 56789; // NOTE(review): looks like a sentinel value; its users are not visible in this header
 25 | 
 26 | enum remap_node_type { // NOTE(review): presumably selects the criterion used to renumber (remap) nodes - confirm against the users of this enum
 27 | 	NONE = 0,
 28 | 	REPORT = 1,
 29 | 	TOPO_ORDER = 2,
 30 | 	BFS_LAYER = 3,
 31 | 	OUTDEGREE = 4,
 32 | 	COMPLETE = 5,
 33 | 	COMPLETE_AND_TOP = 6,
 34 | 	COMPLETE_AND_BFS = 7
 35 | 
 36 | };
 37 | 
 38 | 
 39 | struct match_pair {
 40 | 	int symbol_offset;
 41 | 	int state_id;
 42 | 
 43 | 	// Lexicographic "less than" over (symbol_offset, state_id):
 44 | 	// symbol_offset decides first; state_id only breaks ties.
 45 | 	bool operator< (const match_pair& o) const {
 46 | 		const bool same_offset = (symbol_offset == o.symbol_offset);
 47 | 		if (!same_offset) {
 48 | 			// Offsets differ, so they alone fix the order.
 49 | 			return symbol_offset < o.symbol_offset;
 50 | 		}
 51 | 
 52 | 		// Equal offsets: fall back to the state id.
 53 | 		return state_id < o.state_id;
 54 | 	}
 55 | };
 56 | 
 57 | 
 58 | struct match3 {
 59 | 	int symbol_offset;
 60 | 	int state_id; 
 61 | 	int nfa; 
 62 | 
 63 | 	// Kept for existing callers: orders a match3 against a match_pair by (symbol_offset, state_id).
 64 | 	bool operator< (const match_pair& o) const {
 65 | 		if (symbol_offset != o.symbol_offset) return symbol_offset < o.symbol_offset;
 66 | 		return state_id < o.state_id;
 67 | 	}
 68 | 
 69 | 	// Orders two match3 values by (symbol_offset, state_id, nfa). Previously no match3-vs-match3
 70 | 	// comparison existed, so match3 could not be sorted or used as an ordered key.
 71 | 	bool operator< (const match3& o) const {
 72 | 		if (symbol_offset != o.symbol_offset) return symbol_offset < o.symbol_offset;
 73 | 		if (state_id != o.state_id) return state_id < o.state_id;
 74 | 		return nfa < o.nfa;
 75 | 	}
 76 | };
 77 | 
 78 | 
 79 | 
 80 | struct match_entry { // plain record of four ints; NOTE(review): presumably one reported match - confirm against its users
 81 | 	int symbol_offset;
 82 | 	int state_id;
 83 | 	int cc_id;
 84 | 	int stream_id;
 85 | };
 86 | 
 87 | std::ostream& operator<<(std::ostream& os, const match_pair &obj); // stream output for match_pair; declared here, defined elsewhere
 88 | 
 89 | 
 90 | 
 91 | template<int DEGREE_LIMIT>
 92 | struct STE_dev {
 93 | 	int32_t ms[8]; // 8 * 32 = 256; local memory. 
 94 | 	
 95 | 	int   edge_dst[DEGREE_LIMIT]; // NOTE(review): presumably destination ids of the outgoing edges - confirm
 96 | 	
 97 | 	char     attribute;  // is report? 
 98 | 	int degree; // NOTE(review): presumably the number of valid entries in edge_dst - confirm
 99 | 
100 | }; 
101 | 
102 | 
103 | 
104 | struct STE_dev4 {
105 | 	int32_t ms[8]; // 8 * 32 = 256; local memory. 
106 | 
107 | 	unsigned long long edges; // 64-bit edge field; NOTE(review): the packing of edges into it is not visible in this header
108 | 	
109 | 	char     attribute;  // is report? 
110 | 	int degree;
111 | 
112 | }; 
113 | 
114 | 
115 | struct STE_dev4_compressed_matchset {
116 | 	int32_t ms[8]; // 8 * 32 = 256; local memory. 
117 | 
118 | 	// 64-bit edge field; NOTE(review): the packing of edges into it is not visible in this header.
119 | 	unsigned long long edges;
120 | 	
121 | 	char     attribute;  // is report? 
122 | 	//       attribute has 8 bit..  
123 | 
124 | 	//       complete; complement;  
125 | 
126 | 	//uint8_t start;
127 | 	//uint8_t end; 
128 | 	// NOTE(review): presumably start and end packed into one word, replacing the two commented-out bytes above - confirm.
129 | 	unsigned int start_end;
130 | 
131 | 	int degree;	
132 | 
133 | }; 
134 | 
135 | 
136 | 
137 | struct STE_dev4_compressed_matchset_allcomplete { // same fields as STE_dev4_compressed_matchset minus the ms[8] array
138 | 	unsigned long long edges;
139 | 	
140 | 	char     attribute;  // is report? 
141 | 
142 | 	unsigned int start_end;
143 | 
144 | 	int degree;	
145 | }; 
146 | 
147 | 
148 | 
149 | 
150 | 
151 | // Revised implementation. 20190121
152 | 
153 | struct STE_nodeinfo_new_imp {
154 | 	unsigned long long edges;
155 | 	// Four 8-bit bit-fields declared on unsigned int, 32 bits in total.
156 | 	unsigned int attribute : 8; 
157 | 	unsigned int start : 8;
158 | 	unsigned int end : 8;
159 | 	unsigned int degree : 8;
160 | };
161 | 
162 | 
163 | 
164 | struct STE_nodeinfo_new_imp2 { // like STE_nodeinfo_new_imp but without the start/end bit-fields
165 |     unsigned long long edges;
166 |     unsigned int attribute : 8;
167 |     unsigned int degree : 8;
168 | };
169 | 
170 | 
171 | 
172 | struct STE_matchset_new_imp { // holds only the 256-bit match set
173 | 	int32_t ms[8]; // 8 * 32 = 256; local memory. 
174 | };
175 | 
176 | 
177 | struct STE_nodeinfo_new_imp_withcg {
178 | 	unsigned long long edges;
179 | 	// Four 8-bit bit-fields declared on unsigned int, 32 bits in total.
180 | 	unsigned int attribute : 8; 
181 | 	unsigned int start :  8;
182 | 	unsigned int end :  8;
183 | 	unsigned int degree : 8;
184 | 
185 | 	// cg_id ---> write position in gpu kernel. 
186 | 	uint16_t cg_id;
187 | 	uint16_t cg_of_to_edges[4]; // NOTE(review): presumably the cg_id of each of up to four edge targets - confirm
188 | };
189 | 
190 | struct matchset_t { // eight 32-bit words = 256 bits, the same size as the ms[8] arrays above
191 | 	int32_t m[8];
192 | };
193 | 
194 | 
195 | 
196 | 
197 | #endif
198 | 
199 | 
200 | 
201 | 
202 | 
203 | 
204 | 
205 | 
206 | 
207 | 
208 | 
209 | 
210 | 
211 | 
212 | 
213 | 
214 | 
215 | 
216 | 
217 | 
218 | 
219 | 
220 | 
221 | 
222 | 
223 | 
224 | 
225 | 
226 | 
227 | 
228 | 
229 | 
230 | 
231 | 
232 | 
233 | 
234 | 
235 | 
236 | 
237 | 
238 | 
239 | 
240 | 
241 | 
242 | 
243 | 
244 | 
245 | 
246 | 
247 | 
248 | 
249 | 
250 | 
251 | 
252 | 
253 | 
254 | 
255 | 
256 | 
257 | 
258 | 
259 | 
260 | 
261 | 
262 | 
263 | 


--------------------------------------------------------------------------------
/code/include/moderngpu/cta_mergesort.hxx:
--------------------------------------------------------------------------------
  1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
  2 | #pragma once
  3 | 
  4 | #include "cta_merge.hxx"
  5 | #include "sort_networks.hxx"
  6 | 
  7 | BEGIN_MGPU_NAMESPACE
  8 | 
  9 | MGPU_HOST_DEVICE int out_of_range_flags(int first, int vt, int count) { // bit i is set when slot first + i is >= count
 10 |   int out_of_range = min(vt, first + vt - count); // how many of the vt slots lie at or beyond count, capped at vt
 11 |   int head_flags = 0;
 12 |   if(out_of_range > 0) {
 13 |     const int mask = (1<< vt) - 1; // low vt bits set
 14 |     head_flags = mask & (~mask>> out_of_range); // keeps bits [vt - out_of_range, vt); assumes >> on the negative ~mask shifts in ones
 15 |   }
 16 |   return head_flags;
 17 | }
 18 | 
 19 | MGPU_HOST_DEVICE merge_range_t compute_mergesort_frame(int partition,
 20 |   int coop, int spacing) {
 21 |   // Each of the two lists being merged spans half of the coop partitions.
 22 |   const int half_len = spacing * (coop / 2);
 23 |   // Clear the low bits of partition selected by (coop - 1) to find the
 24 |   // first partition of this merge operation.
 25 |   const int first_part = partition & ~(coop - 1);
 26 |   const int a_lo = spacing * first_part;
 27 |   const int b_lo = a_lo + half_len;
 28 |   // Same positional initializer order as before:
 29 |   // start of A, end of A, start of B, end of B.
 30 |   return merge_range_t {
 31 |     a_lo, a_lo + half_len, b_lo, b_lo + half_len
 32 |   };
 33 | }
 34 | 
 35 | MGPU_HOST_DEVICE merge_range_t compute_mergesort_range(int count, 
 36 |   int partition, int coop, int spacing) {
 37 |   // Take the unclipped frame, then clamp every bound except a_begin to count.
 38 |   const merge_range_t full = compute_mergesort_frame(partition, coop, spacing);
 39 |   const int a_hi = min(count, full.a_end);
 40 |   const int b_lo = min(count, full.b_begin);
 41 |   const int b_hi = min(count, full.b_end);
 42 |   return merge_range_t {
 43 |     full.a_begin, a_hi,
 44 |     b_lo, b_hi
 45 |   };
 46 | }
 47 | 
 48 | MGPU_HOST_DEVICE merge_range_t compute_mergesort_range(int count, 
 49 |   int partition, int coop, int spacing, int mp0, int mp1) {
 50 |   // Narrows the clipped range using mp0/mp1; NOTE(review): presumably the merge-path offsets at this partition's start and end - confirm with the caller.
 51 |   merge_range_t range = compute_mergesort_range(count, partition, 
 52 |     coop, spacing);
 53 | 
 54 |   // Locate the diagonal from the start of the A sublist.
 55 |   int diag = spacing * partition - range.a_begin;
 56 | 
 57 |   // The end partition of the last cta for each merge operation is computed
 58 |   // and stored as the begin partition for the subsequent merge. i.e. it is
 59 |   // the same partition but in the wrong coordinate system, so its 0 when it
 60 |   // should be listSize. Correct that by checking if this is the last cta
 61 |   // in this merge operation.
 62 |   if(coop - 1 != ((coop - 1) & partition)) {
 63 |     range.a_end = range.a_begin + mp1;
 64 |     range.b_end = min(count, range.b_begin + diag + spacing - mp1);
 65 |   }
 66 |   // The begin bounds are advanced by mp0 on every partition, including the last one.
 67 |   range.a_begin = range.a_begin + mp0;
 68 |   range.b_begin = min(count, range.b_begin + diag - mp0);
 69 | 
 70 |   return range;
 71 | }
 72 | 
 73 | template<int nt, int vt, typename key_t, typename val_t>
 74 | struct cta_sort_t {
 75 |   enum { 
 76 |     has_values = !std::is_same<val_t, empty_t>::value, // false when val_t is empty_t
 77 |     num_passes = s_log2(nt) // number of merge passes run by block_sort
 78 |   };
 79 |   // Scratch storage. Keys and values share the same memory (union), so they are staged one after the other.
 80 |   union storage_t {
 81 |     key_t keys[nt * vt + 1];
 82 |     val_t vals[nt * vt];
 83 |   };
 84 | 
 85 |   static_assert(is_pow2(nt), "cta_sort_t requires pow2 number of threads");
 86 |   // One merge pass over the per-thread arrays, with coop = 2 << pass threads cooperating on each merge.
 87 |   template<typename comp_t>
 88 |   MGPU_DEVICE kv_array_t<key_t, val_t, vt> 
 89 |   merge_pass(kv_array_t<key_t, val_t, vt> x, int tid, int count, 
 90 |     int pass, comp_t comp, storage_t& storage) const {
 91 | 
 92 |     // Divide the CTA's keys into lists.
 93 |     int coop = 2<< pass;
 94 |     merge_range_t range = compute_mergesort_range(count, tid, coop, vt);
 95 |     int diag = vt * tid - range.a_begin;
 96 | 
 97 |     // Store the keys into shared memory for searching.
 98 |     reg_to_shared_thread<nt, vt>(x.keys, tid, storage.keys);
 99 |     
100 |     // Search for the merge path for this thread within its list.
101 |     int mp = merge_path<bounds_lower>(storage.keys, range, diag, comp);
102 | 
103 |     // Run a serial merge and return.
104 |     merge_pair_t<key_t, vt> merge = serial_merge<bounds_lower, vt>(
105 |       storage.keys, range.partition(mp, diag), comp);
106 |     x.keys = merge.keys;
107 | 
108 |     if(has_values) {
109 |       // Reorder values through shared memory.
110 |       reg_to_shared_thread<nt, vt>(x.vals, tid, storage.vals);
111 |       x.vals = shared_gather<nt, vt>(storage.vals, merge.indices);
112 |     }
113 | 
114 |     return x;
115 |   }
116 |   // Sorts within each thread first, then runs num_passes merge passes. `count` is the number of valid items.
117 |   template<typename comp_t>
118 |   MGPU_DEVICE kv_array_t<key_t, val_t, vt> 
119 |   block_sort(kv_array_t<key_t, val_t, vt> x, int tid, int count,
120 |     comp_t comp, storage_t& storage) const {
121 | 
122 |     // Sort the inputs within each thread. If any threads have fewer than
123 |     // vt items, use the segmented sort network to prevent out-of-range
124 |     // elements from contaminating the sort.
125 |     if(count < nt * vt) {
126 |       int head_flags = out_of_range_flags(vt * tid, vt, count);
127 |       x = odd_even_sort(x, comp, head_flags);
128 |     } else
129 |       x = odd_even_sort(x, comp);
130 | 
131 |     // Merge threads starting with a pair until all values are merged.
132 |     for(int pass = 0; pass < num_passes; ++pass)
133 |       x = merge_pass(x, tid, count, pass, comp, storage);
134 |     
135 |     return x;
136 |   }
137 | };
138 | 
139 | 
140 | END_MGPU_NAMESPACE
141 | 


--------------------------------------------------------------------------------
/code/include/moderngpu/cta_reduce.hxx:
--------------------------------------------------------------------------------
  1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
  2 | #pragma once
  3 | #include "loadstore.hxx"
  4 | #include "intrinsics.hxx"
  5 | 
  6 | BEGIN_MGPU_NAMESPACE
  7 | 
  8 | // requires __CUDA_ARCH__ >= 300.
  9 | // group_size can be any power-of-two <= warp_size.
 10 | // shfl_reduce_t returns the reduction only in lane 0.
 11 | template<typename type_t, int group_size>
 12 | struct shfl_reduce_t {
 13 |   // Reduction across group_size lanes built from shuffle-down steps, one step per pass.
 14 |   static_assert(group_size <= warp_size && is_pow2(group_size),
 15 |     "shfl_reduce_t must operate on a pow2 number of threads <= warp_size (32)");
 16 |   enum { num_passes = s_log2(group_size) };
 17 |   // lane: the caller's lane index; x: its input; count: the bound used to ignore partners (lane + offset >= count).
 18 |   template<typename op_t = plus_t<type_t> >
 19 |   MGPU_DEVICE type_t reduce(int lane, type_t x, int count, op_t op = op_t()) {
 20 |     if(count == group_size) { // full group: the combine is delegated to shfl_down_op
 21 |       iterate<num_passes>([&](int pass) {
 22 |         int offset = 1<< pass;
 23 |         x = shfl_down_op(x, offset, op, group_size);
 24 |       });
 25 |     } else { // partial group: combine only when the partner lane is below count
 26 |       iterate<num_passes>([&](int pass) {
 27 |         int offset = 1<< pass;
 28 |         type_t y = shfl_down(x, offset, group_size);
 29 |         if(lane + offset < count) x = op(x, y);
 30 |       });
 31 |     }
 32 |     return x;
 33 |   }
 34 | };
 35 | 
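 36 | // cta_reduce_t computes the reduction of all inputs; thread 0 always returns it.
 37 | // With all_return (default true) the result is also broadcast to every thread.
 38 | // Pass all_return = false to save that broadcast; only thread 0's value is then the total.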
 39 | template<int nt, typename type_t>
 40 | struct cta_reduce_t {
 41 |   // Reduction of one value per thread across the nt threads of a CTA.
 42 |   enum { 
 43 |     group_size = min(nt, (int)warp_size), // threads that perform the final cooperative step
 44 |     num_passes = s_log2(group_size),
 45 |     num_items = nt / group_size // passed to strided_iterate as its item count
 46 |   };
 47 | 
 48 |   static_assert(0 == nt % warp_size, 
 49 |     "cta_reduce_t requires num threads to be a multiple of warp_size (32)");
 50 |   // Scratch buffer. The fallback path alternates between two group_size-wide halves, hence the 2 * group_size lower bound.
 51 |   struct storage_t {
 52 |     struct { type_t data[max(nt, 2 * group_size)]; };
 53 |   };
 54 | 
 55 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
 56 |   // Shuffle-based path, compiled for device code with __CUDA_ARCH__ >= 300.
 57 |   typedef shfl_reduce_t<type_t, group_size> group_reduce_t;
 58 |   // Reduces x over the CTA with op. Thread 0 gets the total; every thread does when all_return is true.
 59 |   template<typename op_t = plus_t<type_t> >
 60 |   MGPU_DEVICE type_t reduce(int tid, type_t x, storage_t& storage, 
 61 |     int count = nt, op_t op = op_t(), bool all_return = true) const {
 62 | 
 63 |     // Store your data into shared memory.
 64 |     storage.data[tid] = x;
 65 |     __syncthreads();
 66 | 
 67 |     if(tid < group_size) {
 68 |       // Each thread scans within its lane.
 69 |       strided_iterate<group_size, num_items>([&](int i, int j) {
 70 |         if(i > 0) x = op(x, storage.data[j]);
 71 |       }, tid, count);
 72 | 
 73 |       // Cooperative reduction.
 74 |       x = group_reduce_t().reduce(tid, x, min(count, (int)group_size), op);
 75 | 
 76 |       if(all_return) storage.data[tid] = x;
 77 |     }
 78 |     __syncthreads();
 79 |     // Broadcast: every thread reads the total that thread 0 stored in data[0].
 80 |     if(all_return) {
 81 |       x = storage.data[0];
 82 |       __syncthreads();
 83 |     }
 84 |     return x;
 85 |   }
 86 | 
 87 | #else
 88 |   // Fallback path without shuffles: same signature, the reduction is carried out through storage.data.
 89 |   template<typename op_t = plus_t<type_t> >
 90 |   MGPU_DEVICE type_t reduce(int tid, type_t x, storage_t& storage, 
 91 |     int count = nt, op_t op = op_t(), bool all_return = true) const {
 92 | 
 93 |     // Store your data into shared memory.
 94 |     storage.data[tid] = x;
 95 |     __syncthreads();
 96 | 
 97 |     if(tid < group_size) {
 98 |       // Each thread scans within its lane.
 99 |       strided_iterate<group_size, num_items>([&](int i, int j) {
100 |         type_t y = storage.data[j];
101 |         if(i > 0) x = op(x, y);
102 |       }, tid, count);
103 |       storage.data[tid] = x;
104 |     }
105 |     __syncthreads();
106 |     // `first` selects the half of data in use; it starts so that after num_passes toggles it ends at 0.
107 |     int count2 = min(count, int(group_size));
108 |     int first = (1 & num_passes) ? group_size : 0;
109 |     if(tid < group_size)
110 |       storage.data[first + tid] = x;
111 |     __syncthreads();
112 |     // Each pass reads the partner from the current half, then writes the partial result into the other half.
113 |     iterate<num_passes>([&](int pass) {
114 |       if(tid < group_size) {
115 |         int offset = 1 << pass;
116 |         if(tid + offset < count2) 
117 |           x = op(x, storage.data[first + offset + tid]);
118 |         first = group_size - first;
119 |         storage.data[first + tid] = x;
120 |       }
121 |       __syncthreads();
122 |     });
123 |     // Broadcast: every thread reads the total from data[0].
124 |     if(all_return) {
125 |       x = storage.data[0];
126 |       __syncthreads();
127 |     }
128 |     return x;
129 |   }
130 | 
131 | #endif
132 | };
133 | 
134 | END_MGPU_NAMESPACE
135 | 


--------------------------------------------------------------------------------
/code/include/moderngpu/cta_search.hxx:
--------------------------------------------------------------------------------
  1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
  2 | #pragma once
  3 | 
  4 | #include "cta_merge.hxx"
  5 | 
  6 | BEGIN_MGPU_NAMESPACE
  7 | 
  8 | template<bounds_t bounds, typename keys_it, typename int_t, typename key_t, 
  9 |   typename comp_t>
 10 | MGPU_HOST_DEVICE int_t binary_search(keys_it keys, int_t count, key_t key,
 11 |   comp_t comp) {
 12 |   // Returns the lower- or upper-bound index (per bounds) of key in keys[0, count).
 13 |   int_t begin = 0;
 14 |   int_t end = count;
 15 |   while(begin < end) {
 16 |     int_t mid = begin + (end - begin) / 2;   // begin + end could overflow int_t.
 17 |     key_t key2 = keys[mid];
 18 |     bool pred = (bounds_upper == bounds) ? 
 19 |       !comp(key, key2) :   // upper bound: step past keys not greater than key.
 20 |       comp(key2, key);     // otherwise: step past keys strictly less than key.
 21 |     if(pred) begin = mid + 1;
 22 |     else end = mid;
 23 |   }
 24 |   return begin;
 25 | }
 26 | 
 27 | ////////////////////////////////////////////////////////////////////////////////
 28 | // TODO: Implement a moderngpu V1 style vectorized sorted search.
 29 | 
 30 | template<typename type_t, int vt>
 31 | struct search_result_t {
 32 |   array_t<type_t, vt> keys;   // Key consumed on each of the vt iterations.
 33 |   array_t<int, vt> indices;   // Its source index plus a_offset or b_offset.
 34 |   int decisions;              // Bit i is set if iteration i advanced A.
 35 |   int matches_a;              // Bit i is set if the A key taken then matched.
 36 |   int matches_b;              // Bit i is set if the B key taken then matched.
 37 | };
 38 | 
 39 | template<int vt, bounds_t bounds, bool range_check, typename type_t, 
 40 |   typename comp_t>
 41 | MGPU_DEVICE search_result_t<type_t, vt> 
 42 | serial_search(const type_t* keys_shared, merge_range_t range,
 43 |   int a_offset, int b_offset, comp_t comp, bool sync = true) {
 44 |   // Runs vt merge steps over range, recording the side taken and any match.
 45 |   type_t a_key = keys_shared[range.a_begin];
 46 |   type_t b_key = keys_shared[range.b_begin];
 47 |   type_t a_prev = type_t(), b_prev = type_t();
 48 | 
 49 |   int a_start = 0;
 50 |   int b_start = range.a_end;    // Assume the b_keys start right after the end
 51 |                                 // of the a_keys.
 52 |   if(range.a_begin > 0) a_prev = keys_shared[range.a_begin - 1];
 53 |   if(range.b_begin > b_start) b_prev = keys_shared[range.b_begin - 1];
 54 | 
 55 |   search_result_t<type_t, vt> result = search_result_t<type_t, vt>();
 56 | 
 57 |   iterate<vt>([&](int i) {
 58 |     // This is almost the same body as serial_merge, except for the match
 59 |     // criterion below.
 60 |     bool p = merge_predicate<bounds, range_check>(a_key, b_key, range, comp);
 61 |     // p == true: this step consumes from A; otherwise it consumes from B.
 62 |     if(p) {
 63 |       bool match = (bounds_upper == bounds) ?
 64 |         (!range_check || range.b_begin > b_start) && 
 65 |           !comp(b_prev, a_key) :
 66 |         (!range_check || range.b_valid()) && 
 67 |           !comp(a_key, b_key);
 68 |       // Record that A advanced on step i and whether its key matched.
 69 |       result.decisions |= 1<< i;
 70 |       result.matches_a |= (int)match<< i;
 71 |       a_prev = a_key;
 72 | 
 73 |     } else {
 74 |       bool match = (bounds_upper == bounds) ?
 75 |         (!range_check || (range.a_valid() && range.b_valid())) && 
 76 |           !comp(b_key, a_key) :
 77 |         (!range_check || (range.b_valid() && range.a_begin > a_start)) && 
 78 |           !comp(a_prev, b_key);
 79 |       // B advanced: decisions bit i stays clear; only the match is recorded.
 80 |       result.matches_b |= (int)match<< i;
 81 |       b_prev = b_key;
 82 |     }
 83 | 
 84 |     // Same advancement behavior as serial_merge.
 85 |     int index = p ? range.a_begin : range.b_begin;
 86 | 
 87 |     result.keys[i] = p ? a_key : b_key;
 88 |     result.indices[i] = index + (p ? a_offset : b_offset);
 89 |     // NOTE(review): may read one slot past the consumed range - confirm padding.
 90 |     type_t c_key = keys_shared[++index];
 91 |     if(p) a_key = c_key, range.a_begin = index;
 92 |     else b_key = c_key, range.b_begin = index;
 93 |   });
 94 |   // Barrier is on by default; presumably it guards reuse of keys_shared.
 95 |   if(sync) __syncthreads();
 96 | 
 97 |   return result;
 98 | }
 99 | 
100 | END_MGPU_NAMESPACE
101 | 


--------------------------------------------------------------------------------
/code/include/moderngpu/cta_segscan.hxx:
--------------------------------------------------------------------------------
  1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
  2 | #pragma once
  3 | 
  4 | #include "cta_scan.hxx"
  5 | 
  6 | BEGIN_MGPU_NAMESPACE
  7 | 
  8 | template<typename type_t>
  9 | struct segscan_result_t {
 10 |   type_t scan;          // Previous thread's scan value, or init w/o carry-in.
 11 |   type_t reduction;     // Scan value held by the last thread of the CTA.
 12 |   bool has_carry_in;    // Previous thread's carry-out flag; false for tid 0.
 13 |   int left_lane;        // Segment-start thread index found for this thread.
 14 | };
 15 | 
 16 | template<int nt, typename type_t>
 17 | struct cta_segscan_t {
 18 |   enum { num_warps = nt / warp_size };
 19 |   // delta serves find_left_lane; values/packed serve segscan (same memory).
 20 |   union storage_t {
 21 |     int delta[num_warps + nt];   // warp ballots, then per-warp start lanes.
 22 |     struct { type_t values[2 * nt]; int packed[nt]; };
 23 |   };
 24 |   // Returns the nearest thread <= tid with a head flag, else 0 (if clz(0) == 32).
 25 |   MGPU_DEVICE int find_left_lane(int tid, bool has_head_flag, 
 26 |     storage_t& storage) const {
 27 | 
 28 |     int warp = tid / warp_size;
 29 |     int lane = (warp_size - 1) & tid;
 30 |     int warp_mask = 0xffffffff>> (31 - lane);   // inclusive search.
 31 |     int cta_mask = 0x7fffffff>> (31 - lane);    // exclusive search.
 32 |     // NOTE(review): __ballot is the legacy (non-_sync) vote - confirm toolchain.
 33 |     // Build a head flag bitfield and store it into shared memory.
 34 |     int warp_bits = __ballot(has_head_flag);
 35 |     storage.delta[warp] = warp_bits;
 36 |     __syncthreads();
 37 |     // Thread t < num_warps finds the last head flag in the warps before warp t.
 38 |     if(tid < num_warps) {
 39 |       int cta_bits = __ballot(0 != storage.delta[tid]);
 40 |       int warp_segment = 31 - clz(cta_mask & cta_bits);
 41 |       int start = (-1 != warp_segment) ?
 42 |         (31 - clz(storage.delta[warp_segment]) + 32 * warp_segment) : 0;
 43 |       storage.delta[num_warps + tid] = start;
 44 |     }
 45 |     __syncthreads();
 46 | 
 47 |     // Find the closest flag to the left of this thread within the warp.
 48 |     // Include the flag for this thread.
 49 |     int start = 31 - clz(warp_mask & warp_bits);
 50 |     if(-1 != start) start += ~31 & tid;   // add this warp's first thread index.
 51 |     else start = storage.delta[num_warps + warp];
 52 |     __syncthreads();   // delta aliases values/packed: finish reading first.
 53 | 
 54 |     return start;
 55 |   }
 56 |   // Segmented scan of x under op; a head flag starts a new segment.
 57 |   template<typename op_t = plus_t<type_t> >
 58 |   MGPU_DEVICE segscan_result_t<type_t> segscan(int tid, bool has_head_flag,
 59 |     bool has_carry_out, type_t x, storage_t& storage, type_t init = type_t(),
 60 |     op_t op = op_t()) const {
 61 |     // Threads without a carry-out contribute init instead of x.
 62 |     if(!has_carry_out) x = init;
 63 | 
 64 |     int left_lane = find_left_lane(tid, has_head_flag, storage);
 65 |     int tid_delta = tid - left_lane;
 66 | 
 67 |     // Store the has_carry_out flag.
 68 |     storage.packed[tid] = (int)has_carry_out | (left_lane<< 1);
 69 | 
 70 |     // Run an inclusive scan.
 71 |     int first = 0;
 72 |     storage.values[first + tid] = x;
 73 |     __syncthreads();
 74 |     // Re-read the packed flags stored by the segment's first thread.
 75 |     int packed = storage.packed[left_lane];
 76 |     left_lane = packed>> 1;
 77 |     tid_delta = tid - left_lane;
 78 |     if(0 == (1 & packed)) --tid_delta;   // head has no carry-out: leave it out.
 79 | 
 80 |     iterate<s_log2(nt)>([&](int pass) {
 81 |       int offset = 1<< pass;
 82 |       if(tid_delta >= offset)
 83 |         x = op(x, storage.values[first + tid - offset]);
 84 |       first = nt - first;   // swap the two halves of values.
 85 |       storage.values[first + tid] = x;
 86 |       __syncthreads();
 87 |     });
 88 | 
 89 |     // Get the exclusive scan by fetching the preceding element. Also return
 90 |     // the carry-out value as the total.
 91 |     bool has_carry_in = tid ? (0 != (1 & storage.packed[tid - 1])) : false;
 92 | 
 93 |     segscan_result_t<type_t> result { 
 94 |       (has_carry_in && tid) ? storage.values[first + tid - 1] : init,
 95 |       storage.values[first + nt - 1],
 96 |       has_carry_in,
 97 |       left_lane
 98 |     };
 99 |     __syncthreads();   // presumably lets the caller reuse storage right away.
100 | 
101 |     return result;
102 |   }
103 | };
104 | 
105 | END_MGPU_NAMESPACE
106 | 


--------------------------------------------------------------------------------
/code/include/moderngpu/kernel_bulkinsert.hxx:
--------------------------------------------------------------------------------
 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
 2 | #pragma once
 3 | #include "kernel_merge.hxx"
 4 | 
 5 | BEGIN_MGPU_NAMESPACE
 6 | 
 7 | // Bulk insert as a key-value merge: a_insert keys (values from a) against a
 8 | // counting sequence from 0 (values from b). Keys are discarded; values go to c.
 9 | template<typename launch_t = empty_t, typename a_it, typename insert_it, 
10 |   typename b_it, typename c_it>
11 | void bulk_insert(a_it a, insert_it a_insert, int insert_size, b_it b, 
12 |   int source_size, c_it c, context_t& context) {
13 |   counting_iterator_t<int> source_positions(0);
14 |   merge<launch_t>(a_insert, a, insert_size, source_positions, b, source_size,
15 |     discard_iterator_t<int>(), c, mgpu::less_t<int>(), context);
16 | }
17 | 
18 | END_MGPU_NAMESPACE
19 | 


--------------------------------------------------------------------------------
/code/include/moderngpu/kernel_bulkremove.hxx:
--------------------------------------------------------------------------------
 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
 2 | #pragma once
 3 | #include "search.hxx"
 4 | 
 5 | BEGIN_MGPU_NAMESPACE
 6 | 
 7 | template<typename launch_arg_t = empty_t,
 8 |   typename input_it, typename indices_it, typename output_it>
 9 | void bulk_remove(input_it input, int count, indices_it indices, 
10 |   int num_indices, output_it output, context_t& context) {
11 |   // Copies input[0, count) to output, omitting the positions listed in indices.
12 |   typedef typename conditional_typedef_t<launch_arg_t, 
13 |     launch_box_t<
14 |       arch_20_cta<128, 15>,
15 |       arch_35_cta<128, 11>,
16 |       arch_52_cta<128, 15>
17 |     >
18 |   >::type_t launch_t;
19 | 
20 |   typedef typename std::iterator_traits<input_it>::value_type type_t;
21 | 
22 |   // Map the removal indices into tiles.
23 |   mem_t<int> partitions = binary_search_partitions<bounds_lower>(indices, 
24 |     count, num_indices, launch_t::nv(context), context);
25 |   const int* p_data = partitions.data();
26 | 
27 |   auto k = [=]MGPU_DEVICE(int tid, int cta) {
28 |     typedef typename launch_t::sm_ptx params_t; 
29 |     enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt };
30 |     
31 |     __shared__ union {
32 |       int indices[nv + 1];
33 |     } shared;
34 | 
35 |     range_t tile = get_tile(cta, nv, count);
36 | 
37 |     // Search the begin and end iterators to load.
38 |     int begin = p_data[cta];
39 |     int end = p_data[cta + 1]; 
40 |     int b_count = end - begin;
41 |     // Survivors fill the front of the buffer; removal indices sit at its tail.
42 |     int* a_shared = shared.indices;
43 |     int* b_shared = shared.indices + tile.count() - b_count;
44 | 
45 |     // Store the indices to shared memory.
46 |     // TODO: MODIFY MEM_TO_SHARED TO UNCONDITIONALLY WRITE TO FULL SMEM.
47 |     mem_to_shared<nt, vt>(indices + begin, tid, b_count, b_shared, false);
48 |     __syncthreads();   // b_shared must be complete before any thread searches it.
49 |     // Binary search into the remove array to prepare a range for the thread.
50 |     merge_range_t range = {
51 |       // a range
52 |       vt * tid, 
53 |       tile.count(), 
54 |       
55 |       // b range
56 |       binary_search<bounds_lower>(b_shared, b_count, 
57 |         tile.begin + vt * tid, less_t<int>()),
58 |       b_count
59 |     };
60 | 
61 |     // Emit all values that aren't removed.
62 |     iterate<vt>([&](int i) {
63 |       bool p = range.a_valid() && (!range.b_valid() || 
64 |         tile.begin + range.a_begin < b_shared[range.b_begin]);
65 |       if(p)
66 |         a_shared[range.a_begin - range.b_begin] = tile.begin + range.a_begin;
67 |       else 
68 |         ++range.b_begin;
69 |       ++range.a_begin;
70 |     });
71 |     __syncthreads();
72 | 
73 |     // Pull the gather indices out of shared memory in strided order.
74 |     array_t<int, vt> gather = shared_to_reg_strided<nt, vt>(
75 |       shared.indices, tid);
76 | 
77 |     // Gather the elements from input.
78 |     int num_move = tile.count() - b_count;
79 |     array_t<type_t, vt> values;
80 |     strided_iterate<nt, vt, 0>([&](int i, int j) {
81 |       values[i] = input[gather[i]];
82 |     }, tid, num_move);
83 | 
84 |     // Stream to output.
85 |     reg_to_mem_strided<nt, vt>(values, tid, num_move, 
86 |       output + tile.begin - begin);   // begin removals precede this tile.
87 |   };
88 |   cta_transform<launch_t>(k, count, context);
89 | }
90 | 
91 | END_MGPU_NAMESPACE
92 | 


--------------------------------------------------------------------------------
/code/include/moderngpu/kernel_intervalmove.hxx:
--------------------------------------------------------------------------------
 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
 2 | #pragma once
 3 | #include "kernel_load_balance.hxx"
 4 | 
 5 | BEGIN_MGPU_NAMESPACE
 6 | 
 7 | template<typename launch_arg_t = empty_t, typename input_it, 
 8 |   typename segments_it, typename output_it>
 9 | void interval_expand(input_it input, int count, segments_it segments,
10 |   int num_segments, output_it output, context_t& context) {
11 |   // Each work-item stores the value that transform_lbs cached from input for
12 |   // it (presumably the entry belonging to the item's segment).
13 |   typedef typename std::iterator_traits<input_it>::value_type value_t;
14 |   auto expand_item = []MGPU_DEVICE(int dest, int seg, int rank,
15 |     tuple<value_t> cached, output_it out) {
16 |     out[dest] = get<0>(cached);
17 |   };
18 |   transform_lbs<launch_arg_t>(expand_item, count, segments, num_segments,
19 |     make_tuple(input), context, output);
20 | }
21 | 
22 | template<typename launch_arg_t = empty_t, typename input_it, 
23 |   typename segments_it, typename gather_it, typename output_it>
24 | void interval_gather(input_it input, int count, segments_it segments,
25 |   int num_segments, gather_it gather, output_it output, context_t& context) {
26 |   // Each work-item reads input at (cached gather offset + rank) and writes
27 |   // the value to output at its own index.
28 |   auto gather_item = []MGPU_DEVICE(int dest, int seg, int rank,
29 |     tuple<int> offsets, input_it in, output_it out) {
30 |     out[dest] = in[get<0>(offsets) + rank];
31 |   };
32 |   transform_lbs<launch_arg_t>(gather_item, count, segments, num_segments,
33 |     make_tuple(gather), context, input, output);
34 | }
35 | 
36 | template<typename launch_arg_t = empty_t, typename input_it, 
37 |   typename segments_it, typename scatter_it, typename output_it>
38 | void interval_scatter(input_it input, int count, segments_it segments,
39 |   int num_segments, scatter_it scatter, output_it output, context_t& context) {
40 |   // Each work-item reads input at its own index and writes the value to
41 |   // output at (cached scatter offset + rank).
42 |   auto scatter_item = []MGPU_DEVICE(int src, int seg, int rank,
43 |     tuple<int> offsets, input_it in, output_it out) {
44 |     out[get<0>(offsets) + rank] = in[src];
45 |   };
46 |   transform_lbs<launch_arg_t>(scatter_item, count, segments, num_segments,
47 |     make_tuple(scatter), context, input, output);
48 | }
49 | 
50 | template<typename launch_arg_t = empty_t, 
51 |   typename input_it, typename segments_it, typename scatter_it,
52 |   typename gather_it, typename output_it>
53 | void interval_move(input_it input, int count, segments_it segments,
54 |   int num_segments, scatter_it scatter, gather_it gather, output_it output, 
55 |   context_t& context) {
56 |   // Copy input[gather offset + rank] to output[scatter offset + rank].
57 |   auto move_item = []MGPU_DEVICE(int index, int seg, int rank,
58 |     tuple<int, int> offsets, input_it in, output_it out) {
59 |     const int to = get<0>(offsets) + rank;     // scatter side.
60 |     const int from = get<1>(offsets) + rank;   // gather side.
61 |     out[to] = in[from];
62 |   };
63 |   transform_lbs<launch_arg_t>(move_item, count, segments, num_segments,
64 |     make_tuple(scatter, gather), context, input, output);
65 | }
66 | 
67 | END_MGPU_NAMESPACE
68 | 


--------------------------------------------------------------------------------
/code/include/moderngpu/kernel_join.hxx:
--------------------------------------------------------------------------------
 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
 2 | #pragma once
 3 | #include "kernel_sortedsearch.hxx"
 4 | #include "kernel_scan.hxx"
 5 | #include "kernel_load_balance.hxx"
 6 | 
 7 | BEGIN_MGPU_NAMESPACE
 8 | 
 9 | template<typename launch_arg_t = empty_t, 
10 |   typename a_it, typename b_it, typename comp_t>
11 | mem_t<int2> inner_join(a_it a, int a_count, b_it b, int b_count, 
12 |   comp_t comp, context_t& context) {
13 |   // Returns (a_index, b_index) pairs; presumably a and b are sorted by comp.
14 |   // Compute lower and upper bounds of a into b.
15 |   mem_t<int> lower(a_count, context);
16 |   mem_t<int> upper(a_count, context);
17 |   sorted_search<bounds_lower, launch_arg_t>(a, a_count, b, b_count, 
18 |     lower.data(), comp, context);
19 |   sorted_search<bounds_upper, launch_arg_t>(a, a_count, b, b_count, 
20 |     upper.data(), comp, context);
21 | 
22 |   // Compute output ranges by scanning upper - lower. Retrieve the reduction
23 |   // of the scan, which specifies the size of the output array to allocate.
24 |   mem_t<int> scanned_sizes(a_count, context);
25 |   const int* lower_data = lower.data();
26 |   const int* upper_data = upper.data();
27 | 
28 |   mem_t<int> count(1, context);   // single int receiving the scan's total.
29 |   transform_scan<int>([=]MGPU_DEVICE(int index) {
30 |     return upper_data[index] - lower_data[index];   // matches for a[index].
31 |   }, a_count, scanned_sizes.data(), plus_t<int>(), count.data(), context);
32 | 
33 |   // Allocate an int2 output array and use load-balancing search to compute
34 |   // the join.
35 |   int join_count = from_mem(count)[0];   // presumably a device-to-host copy.
36 |   mem_t<int2> output(join_count, context);
37 |   int2* output_data = output.data();
38 | 
39 |   // Use load-balancing search on the segments. The output is a pair with
40 |   // a_index = seg and b_index = lower_data[seg] + rank.
41 |   auto k = [=]MGPU_DEVICE(int index, int seg, int rank, tuple<int> lower) {
42 |     output_data[index] = make_int2(seg, get<0>(lower) + rank);
43 |   };
44 |   transform_lbs<launch_arg_t>(k, join_count, scanned_sizes.data(), a_count,
45 |     make_tuple(lower_data), context);
46 | 
47 |   return output;
48 | }
49 | 
50 | END_MGPU_NAMESPACE
51 | 


--------------------------------------------------------------------------------
/code/include/moderngpu/kernel_load_balance.hxx:
--------------------------------------------------------------------------------
 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
 2 | #pragma once
 3 | #include "cta_load_balance.hxx"
 4 | #include "search.hxx"
 5 | 
 6 | BEGIN_MGPU_NAMESPACE
 7 | 
 8 | template<typename launch_arg_t = empty_t, typename func_t, 
 9 |   typename segments_it, typename pointers_t, typename... args_t>
10 | void transform_lbs(func_t f, int count, segments_it segments, 
11 |   int num_segments, pointers_t caching_iterators, context_t& context,
12 |   args_t... args) {
13 |   // Calls f(index, seg, rank, cached_values, args...) once per work-item.
14 |   typedef typename conditional_typedef_t<launch_arg_t, 
15 |     launch_box_t<
16 |       arch_20_cta<128, 11, 9>,
17 |       arch_35_cta<128,  7, 5>,
18 |       arch_52_cta<128, 11, 9>
19 |     >
20 |   >::type_t launch_t;
21 | 
22 |   typedef typename std::iterator_traits<segments_it>::value_type int_t;
23 |   typedef tuple_iterator_value_t<pointers_t> value_t;
24 |   // Per-CTA partition boundaries, consumed by load_balance() in the kernel.
25 |   mem_t<int_t> mp = load_balance_partitions(count, segments, num_segments,
26 |     launch_t::nv(context), context);
27 |   const int_t* mp_data = mp.data();
28 | 
29 |   auto k = [=]MGPU_DEVICE(int tid, int cta, args_t... args) {
30 | 
31 |     typedef typename launch_t::sm_ptx params_t;
32 |     enum { nt = params_t::nt, vt = params_t::vt, vt0 = params_t::vt0 };
33 |     typedef cta_load_balance_t<nt, vt> load_balance_t;
34 |     typedef detail::cached_segment_load_t<nt, pointers_t> cached_load_t;
35 |     // Both phases share one buffer; presumably the helpers sync between uses.
36 |     __shared__ union {
37 |       typename load_balance_t::storage_t lbs;
38 |       typename cached_load_t::storage_t cached;
39 |     } shared;
40 | 
41 |     // Compute the load-balancing search and materialize (index, seg, rank)
42 |     // arrays.
43 |     auto lbs = load_balance_t().load_balance(count, segments, num_segments,
44 |       tid, cta, mp_data, shared.lbs);
45 | 
46 |     // Load from the cached iterators. Use the placement range, not the 
47 |     // merge-path range for situating the segments.
48 |     array_t<value_t, vt> cached_values = cached_load_t::template load<vt0>(
49 |       tid, lbs.merge_range.a_count(), lbs.placement.range.b_range(), 
50 |       lbs.segments, shared.cached, caching_iterators);
51 | 
52 |     // Call the user-supplied functor f.
53 |     strided_iterate<nt, vt, vt0>([=](int i, int j) {
54 |       int index = lbs.merge_range.a_begin + j;   // global work-item index.
55 |       int seg = lbs.segments[i];
56 |       int rank = lbs.ranks[i];
57 | 
58 |       f(index, seg, rank, cached_values[i], args...);
59 |     }, tid, lbs.merge_range.a_count());
60 |   };
61 |   cta_transform<launch_t>(k, count + num_segments, context, args...);
62 | }
63 | 
64 | // Overload without caching iterators: forwards an empty tuple in their place.
65 | template<typename launch_arg_t = empty_t, typename func_t, 
66 |   typename segments_it, typename... args_t>
67 | void transform_lbs(func_t f, int count, segments_it segments, 
68 |   int num_segments, context_t& context, args_t... args) {
69 |   // Adapter that swallows the (empty) cached tuple before invoking f.
70 |   auto adapter = [=]MGPU_DEVICE(int index, int seg, int rank, tuple<>,
71 |     args_t... extra) {
72 |     f(index, seg, rank, extra...);
73 |   };
74 |   transform_lbs<launch_arg_t>(adapter, count, segments, num_segments,
75 |     tuple<>(), context, args...);
76 | }
77 | 
78 | template<typename launch_arg_t = empty_t, typename segments_it,
79 |   typename output_it>
80 | void load_balance_search(int count, segments_it segments, 
81 |   int num_segments, output_it output, context_t& context) {
82 |   auto store = [=]MGPU_DEVICE(int item, int segment, int rank) {
83 |     output[item] = segment;   // each work-item records its segment index.
84 |   };
85 |   transform_lbs<launch_arg_t>(store, count, segments, num_segments, context);
86 | }
87 | 
88 | END_MGPU_NAMESPACE
89 | 


--------------------------------------------------------------------------------
/code/include/moderngpu/kernel_merge.hxx:
--------------------------------------------------------------------------------
 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
 2 | #pragma once
 3 | #include "cta_merge.hxx"
 4 | #include "search.hxx"
 5 | 
 6 | BEGIN_MGPU_NAMESPACE
 7 | 
 8 | // Key-value merge.
 9 | template<typename launch_arg_t = empty_t,
10 |   typename a_keys_it, typename a_vals_it, 
11 |   typename b_keys_it, typename b_vals_it,
12 |   typename c_keys_it, typename c_vals_it, 
13 |   typename comp_t>
14 | void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count, 
15 |   b_keys_it b_keys, b_vals_it b_vals, int b_count,
16 |   c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context) {
17 |   // Merges the a and b key/value streams into c, ordering keys with comp.
18 |   typedef typename conditional_typedef_t<launch_arg_t, 
19 |     launch_box_t<
20 |       arch_20_cta<128, 15>,
21 |       arch_35_cta<128, 11>,
22 |       arch_52_cta<128, 15>
23 |     >
24 |   >::type_t launch_t;
25 | 
26 |   typedef typename std::iterator_traits<a_keys_it>::value_type type_t;
27 |   typedef typename std::iterator_traits<a_vals_it>::value_type val_t;
28 |   enum { has_values = !std::is_same<val_t, empty_t>::value };
29 |   // has_values is false when a_vals yields empty_t, i.e. a keys-only merge.
30 |   mem_t<int> partitions = merge_path_partitions<bounds_lower>(a_keys, a_count, 
31 |     b_keys, b_count, launch_t::nv(context), comp, context);
32 |   int* mp_data = partitions.data();
33 | 
34 |   auto k = [=] MGPU_DEVICE (int tid, int cta) {
35 |     typedef typename launch_t::sm_ptx params_t;
36 |     enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt };
37 |     // keys and indices alias; presumably the helpers sync between the uses.
38 |     __shared__ union {
39 |       type_t keys[nv + 1];
40 |       int indices[nv];
41 |     } shared;
42 | 
43 |     // Load the range for this CTA and merge the values into register.
44 |     int mp0 = mp_data[cta + 0];
45 |     int mp1 = mp_data[cta + 1];
46 |     merge_range_t range = compute_merge_range(a_count, b_count, cta, nv, 
47 |       mp0, mp1);
48 | 
49 |     merge_pair_t<type_t, vt> merge = cta_merge_from_mem<bounds_lower, nt, vt>(
50 |       a_keys, b_keys, range, tid, comp, shared.keys);
51 |     // Each CTA writes its outputs starting at offset nv * cta.
52 |     int dest_offset = nv * cta;
53 |     reg_to_mem_thread<nt>(merge.keys, tid, range.total(), c_keys + dest_offset,
54 |       shared.keys);
55 | 
56 |     if(has_values) {
57 |       // Transpose the indices from thread order to strided order.
58 |       array_t<int, vt> indices = reg_thread_to_strided<nt>(merge.indices, tid, 
59 |         shared.indices);
60 | 
61 |       // Gather the input values and merge into the output values.
62 |       transfer_two_streams_strided<nt>(a_vals + range.a_begin, range.a_count(),
63 |         b_vals + range.b_begin, range.b_count(), indices, tid, 
64 |         c_vals + dest_offset);
65 |     }
66 |   };
67 |   cta_transform<launch_t>(k, a_count + b_count, context);
68 | }
69 | 
70 | // Keys-only merge: passes null empty_t value streams to the key-value merge.
71 | template<typename launch_t = empty_t,
72 |   typename a_keys_it, typename b_keys_it, typename c_keys_it,
73 |   typename comp_t>
74 | void merge(a_keys_it a_keys, int a_count, b_keys_it b_keys, int b_count,
75 |   c_keys_it c_keys, comp_t comp, context_t& context) {
76 |   const empty_t* no_input_vals = nullptr;
77 |   empty_t* no_output_vals = nullptr;
78 |   merge<launch_t>(a_keys, no_input_vals, a_count, b_keys, no_input_vals,
79 |     b_count, c_keys, no_output_vals, comp, context);
80 | }
81 | 
82 | END_MGPU_NAMESPACE
83 | 


--------------------------------------------------------------------------------
/code/include/moderngpu/kernel_reduce.hxx:
--------------------------------------------------------------------------------
 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
 2 | #pragma once
 3 | 
 4 | #include "cta_reduce.hxx"
 5 | #include "memory.hxx"
 6 | #include "transform.hxx"
 7 | #include "operators.hxx"
 8 | 
 9 | BEGIN_MGPU_NAMESPACE
10 | 
11 | template<typename launch_arg_t = empty_t, typename input_it, 
12 |   typename output_it, typename op_t>
13 | void reduce(input_it input, int count, output_it reduction, op_t op, 
14 |   context_t& context) {
15 |   // Reduces input[0, count) with op and stores the result at *reduction.
16 |   typedef typename conditional_typedef_t<launch_arg_t, 
17 |     launch_params_t<128, 8>
18 |   >::type_t launch_t;
19 | 
20 |   typedef typename std::iterator_traits<input_it>::value_type type_t;
21 | 
22 |   int num_ctas = launch_t::cta_dim(context).num_ctas(count);
23 |   mem_t<type_t> partials(num_ctas, context);   // one partial result per CTA.
24 |   type_t* partials_data = partials.data();
25 | 
26 |   auto k = [=] MGPU_DEVICE(int tid, int cta) {
27 |     typedef typename launch_t::sm_ptx params_t;
28 |     enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt };
29 |     typedef cta_reduce_t<nt, type_t> reduce_t;
30 |     __shared__ typename reduce_t::storage_t shared_reduce;
31 | 
32 |     // Load the data for the first tile for each cta.
33 |     range_t tile = get_tile(cta, nv, count);
34 |     array_t<type_t, vt> x = mem_to_reg_strided<nt, vt>(input + tile.begin, 
35 |       tid, tile.count());
36 | 
37 |     // Reduce the multiple values per thread into a scalar.
38 |     type_t scalar;   // NOTE(review): presumably unset in threads with no item.
39 |     strided_iterate<nt, vt>([&](int i, int j) {
40 |       scalar = i ? op(scalar, x[i]) : x[0];
41 |     }, tid, tile.count());
42 | 
43 |     // Reduce to a scalar per CTA.
44 |     scalar = reduce_t().reduce(tid, scalar, shared_reduce, 
45 |       min(tile.count(), (int)nt), op, false);   // false: no broadcast needed.
46 | 
47 |     if(!tid) {
48 |       if(1 == num_ctas) *reduction = scalar;   // single CTA: final answer.
49 |       else partials_data[cta] = scalar;
50 |     }
51 |   };
52 |   cta_launch<launch_t>(k, num_ctas, context);
53 | 
54 |   // Recursively call reduce until there's just one scalar.
55 |   if(num_ctas > 1)
56 |     reduce<launch_params_t<512, 4> >(partials_data, num_ctas, reduction, op, 
57 |       context);
58 | }
59 | 
60 | template<typename launch_arg_t = empty_t, typename func_t, 
61 |   typename output_it, typename op_t>
62 | void transform_reduce(func_t f, int count, output_it reduction, op_t op, 
63 |   context_t& context) {
64 |   // Wrap f as a load iterator of the output's value type, then reduce it.
65 |   typedef typename std::iterator_traits<output_it>::value_type value_t;
66 |   auto loader = make_load_iterator<value_t>(f);
67 |   reduce<launch_arg_t>(loader, count, reduction, op, context);
68 | }
69 | 
70 | END_MGPU_NAMESPACE
71 | 


--------------------------------------------------------------------------------
/code/include/moderngpu/kernel_sortedsearch.hxx:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "cta_merge.hxx"
 3 | #include "search.hxx"
 4 | 
 5 | BEGIN_MGPU_NAMESPACE
 6 | 
 7 | template<bounds_t bounds, typename launch_arg_t = empty_t,
 8 |   typename needles_it, typename haystack_it, typename indices_it,
 9 |   typename comp_it>
10 | void sorted_search(needles_it needles, int num_needles, haystack_it haystack,
11 |   int num_haystack, indices_it indices, comp_it comp, context_t& context) {
12 |   // For each needle, writes to indices its bound (per bounds) in the haystack.
13 |   typedef typename conditional_typedef_t<launch_arg_t, 
14 |     launch_box_t<
15 |       arch_20_cta<128, 15>,
16 |       arch_35_cta<128, 11>,
17 |       arch_52_cta<128, 15>
18 |     >
19 |   >::type_t launch_t;
20 | 
21 |   typedef typename std::iterator_traits<needles_it>::value_type type_t;
22 | 
23 |   // Partition the needles and haystacks into tiles.
24 |   mem_t<int> partitions = merge_path_partitions<bounds>(needles, num_needles,
25 |     haystack, num_haystack, launch_t::nv(context), comp, context);
26 |   const int* mp_data = partitions.data();
27 | 
28 |   auto k = [=]MGPU_DEVICE(int tid, int cta) {
29 |     typedef typename launch_t::sm_ptx params_t;
30 |     enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt };
31 |     
32 |     __shared__ union {
33 |       type_t keys[nv + 1];
34 |       int indices[nv];
35 |     } shared;
36 | 
37 |     // Load the range for this CTA and merge the values into register.
38 |     int mp0 = mp_data[cta + 0];
39 |     int mp1 = mp_data[cta + 1];
40 |     merge_range_t range = compute_merge_range(num_needles, num_haystack, cta,
41 |       nv, mp0, mp1);
42 | 
43 |     // Merge the values needles and haystack.
44 |     merge_pair_t<type_t, vt> merge = cta_merge_from_mem<bounds, nt, vt>(
45 |       needles, haystack, range, tid, comp, shared.keys);
46 | 
47 |     // Store the needle indices to shared memory.
48 |     iterate<vt>([&](int i) {
49 |       if(merge.indices[i] < range.a_count()) {   // this slot holds a needle.
50 |         int needle = merge.indices[i];
51 |         int haystack_index = range.b_begin + vt * tid + i - needle;
52 |         shared.indices[needle] = haystack_index;   // no longer shadows haystack.
53 |       }
54 |     });
55 |     __syncthreads();
56 | 
57 |     shared_to_mem<nt, vt>(shared.indices, tid, range.a_count(), 
58 |       indices + range.a_begin);
59 |   };
60 | 
61 |   cta_transform<launch_t>(k, num_needles + num_haystack, context);
62 | }
63 | 
64 | END_MGPU_NAMESPACE
65 | 


--------------------------------------------------------------------------------
/code/include/moderngpu/launch_box.hxx:
--------------------------------------------------------------------------------
 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
 2 | #pragma once
 3 | 
 4 | #include "context.hxx"
 5 | 
 6 | BEGIN_MGPU_NAMESPACE
 7 | 
 8 | // Specializable launch parameters: the entry launch_box_t lists last in its inherit_t<> chain.
 9 | struct launch_box_default_t {
10 |   typedef launch_cta_t<0, 0, 0> sm_00;  // placeholder parameters: nt = 0, vt = 0, vt0 = 0
11 |   typedef empty_t sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53;  // every architecture slot starts out as empty_t
12 | 
13 |   template<typename new_base_t>
14 |   using rebind = launch_box_default_t;  // new_base_t is ignored; rebinding always yields this same type
15 | };
16 | 
17 | template<typename... params_v>
18 | struct launch_box_t : inherit_t<params_v..., launch_box_default_t> { 
19 |   typedef inherit_t<params_v..., launch_box_default_t> base_t; 
20 |   // Resolve sm_20 from the pair (base_t::sm_20, base_t::sm_00).
21 |   typedef typename conditional_typedef_t<
22 |     typename base_t::sm_20, typename base_t::sm_00
23 |   >::type_t sm_20;
24 | 
25 | #define INHERIT_LAUNCH_PARAMS(new_ver, old_ver) \
26 |   typedef typename conditional_typedef_t< \
27 |     typename base_t::sm_##new_ver, sm_##old_ver \
28 |   >::type_t sm_##new_ver;
29 |   // Resolve each newer sm_XX from (base_t::sm_XX, an already-resolved older sm_YY).
30 |   INHERIT_LAUNCH_PARAMS(21, 20)
31 |   INHERIT_LAUNCH_PARAMS(30, 21)
32 |   INHERIT_LAUNCH_PARAMS(32, 30)
33 |   INHERIT_LAUNCH_PARAMS(35, 30)  // paired with sm_30, not sm_32
34 |   INHERIT_LAUNCH_PARAMS(37, 35)
35 |   INHERIT_LAUNCH_PARAMS(50, 35)  // paired with sm_35, not sm_37
36 |   INHERIT_LAUNCH_PARAMS(52, 50)
37 |   INHERIT_LAUNCH_PARAMS(53, 50)  // paired with sm_50, not sm_52
38 | #undef INHERIT_LAUNCH_PARAMS
39 |   // Overwrite the params defined for sm_00 so that the host-side compiler
40 |   // has all expected symbols available to it.
41 |   typedef sm_53 sm_00;
42 |   typedef MGPU_LAUNCH_PARAMS(launch_box_t) sm_ptx;
43 |   // Runtime lookup of {nt, vt}; same ladder as the #if chain defining MGPU_SM_TAG.
44 |   static cta_dim_t cta_dim(int ptx_version) {
45 |     // Ptx version from cudaFuncGetAttributes.
46 |     if     (ptx_version == 53) return cta_dim_t { sm_53::nt, sm_53::vt };
47 |     else if(ptx_version >= 52) return cta_dim_t { sm_52::nt, sm_52::vt };
48 |     else if(ptx_version >= 50) return cta_dim_t { sm_50::nt, sm_50::vt };
49 |     else if(ptx_version == 37) return cta_dim_t { sm_37::nt, sm_37::vt };
50 |     else if(ptx_version >= 35) return cta_dim_t { sm_35::nt, sm_35::vt };
51 |     else if(ptx_version == 32) return cta_dim_t { sm_32::nt, sm_32::vt };
52 |     else if(ptx_version >= 30) return cta_dim_t { sm_30::nt, sm_30::vt };
53 |     else if(ptx_version >= 21) return cta_dim_t { sm_21::nt, sm_21::vt };
54 |     else if(ptx_version >= 20) return cta_dim_t { sm_20::nt, sm_20::vt };
55 |     else return cta_dim_t { -1, 0 };  // below 20: no parameter set, flagged by nt = -1
56 |   }
57 |   // Convenience overloads that read the PTX version from the context.
58 |   static cta_dim_t cta_dim(const context_t& context) {
59 |     return cta_dim(context.ptx_version());
60 |   }
61 | 
62 |   static int nv(const context_t& context) {
63 |     return cta_dim(context.ptx_version()).nv();
64 |   }
65 | };
66 | 
67 | 
68 | template<typename launch_box, typename func_t, typename... args_t>
69 | int occupancy(func_t f, const context_t& context, args_t... args) {
70 |   // Returns (blocks of launch_box_cta_k resident per multiprocessor at this
71 |   // launch box's block size, with zero dynamic shared memory) multiplied by
72 |   // the device's multiprocessor count. Throws cuda_exception_t on failure.
73 |   const int threads_per_cta = launch_box::cta_dim(context).nt;
74 |   int blocks_per_sm;
75 |   const cudaError_t status = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
76 |     &blocks_per_sm, &launch_box_cta_k<launch_box, func_t, args_t...>,
77 |     threads_per_cta, (size_t)0);
78 |   if(status != cudaSuccess) throw cuda_exception_t(status);
79 |   return context.props().multiProcessorCount * blocks_per_sm;
80 | }
81 | 
82 | END_MGPU_NAMESPACE
83 | 


--------------------------------------------------------------------------------
/code/include/moderngpu/launch_params.hxx:
--------------------------------------------------------------------------------
  1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
  2 | #pragma once
  3 | 
  4 | #include "meta.hxx"
  5 | #include "tuple.hxx"
  6 | 
  7 | #ifdef __CUDA_ARCH__  // select the sm_XX typedef name that matches the architecture being compiled
  8 | #if   __CUDA_ARCH__ == 530
  9 |   #define MGPU_SM_TAG sm_53
 10 | #elif __CUDA_ARCH__ >= 520  // also taken by every architecture above 530
 11 |   #define MGPU_SM_TAG sm_52
 12 | #elif __CUDA_ARCH__ >= 500
 13 |   #define MGPU_SM_TAG sm_50
 14 | #elif __CUDA_ARCH__ == 370
 15 |   #define MGPU_SM_TAG sm_37
 16 | #elif __CUDA_ARCH__ >= 350
 17 |   #define MGPU_SM_TAG sm_35
 18 | #elif __CUDA_ARCH__ == 320
 19 |   #define MGPU_SM_TAG sm_32
 20 | #elif __CUDA_ARCH__ >= 300
 21 |   #define MGPU_SM_TAG sm_30
 22 | #elif __CUDA_ARCH__ >= 210
 23 |   #define MGPU_SM_TAG sm_21
 24 | #elif __CUDA_ARCH__ >= 200
 25 |   #define MGPU_SM_TAG sm_20
 26 | #else
 27 |   #error "Modern GPU v3 does not support builds for sm_1.x"
 28 | #endif
 29 | #else // __CUDA_ARCH__ is not defined: use the sm_00 entry
 30 |   #define MGPU_SM_TAG sm_00
 31 | #endif
 32 | 
 33 | #define MGPU_LAUNCH_PARAMS(launch_box) \
 34 |   typename launch_box::MGPU_SM_TAG  // names the member typedef selected above
 35 | #define MGPU_LAUNCH_BOUNDS(launch_box) \
 36 |   __launch_bounds__(launch_box::sm_ptx::nt, launch_box::sm_ptx::occ)  // nt and occ of the active parameter set
 37 | 
 38 | BEGIN_MGPU_NAMESPACE
 39 | 
 40 | struct MGPU_ALIGN(8) cta_dim_t {  // CTA shape carried as a plain runtime value
 41 |   int nt, vt;  // nt is passed as the thread-block size at launch; vt presumably counts work-items per thread - confirm
 42 |   int nv() const { return nt * vt; }  // product used as the per-CTA tile size
 43 |   int num_ctas(int count) const {  // CTAs needed for count work-items: div_up(count, nv())
 44 |     return div_up(count, nv());
 45 |   }
 46 | };
 47 | 
 48 | namespace detail {
 49 | 
 50 | // Due to a bug in the compiler we need to expand make_restrict() before
 51 | // branching on cta < num_ctas.
 52 | template<typename func_t, typename... args_t>
 53 | MGPU_DEVICE void restrict_forward(func_t f, int tid, int cta, int num_ctas,
 54 |   args_t... args) {
 55 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300  // the bounds test is compiled only for targets below sm_30
 56 |   if(cta < num_ctas)  // cta_launch may round the grid up to 256 x div_up(num_ctas, 256) blocks there, so surplus CTAs can exist
 57 | #endif 
 58 |     f(tid, cta, args...);  // unconditional on sm_30 and newer
 59 | }
 60 | 
 61 | }  // namespace detail
 62 | 
 63 | // Generic thread cta kernel: forwards (tid, cta, args...) to the functor f.
 64 | template<typename launch_box, typename func_t, typename... args_t>
 65 | __global__ MGPU_LAUNCH_BOUNDS(launch_box)
 66 | void launch_box_cta_k(func_t f, int num_ctas, args_t... args) {
 67 |   // Reducing threadIdx.x modulo nt may help strength reduction because the
 68 |   // compiler now knows the range of tid: [0, nt).
 69 |   typedef typename launch_box::sm_ptx params_t;
 70 |   int tid = (int)(threadIdx.x % (unsigned)params_t::nt);
 71 |   int cta = blockIdx.x;
 72 | 
 73 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
 74 |   cta += gridDim.x * blockIdx.y;  // fold the 2-D grid that cta_launch may use below sm_30 into a linear CTA index
 75 | #endif
 76 | 
 77 |   detail::restrict_forward(f, tid, cta, num_ctas, make_restrict(args)...);
 78 | }
 79 | 
 80 | // Dummy kernel for retrieving PTX version. NOTE(review): the query itself is not in this header - confirm where it is issued.
 81 | template<int dummy_arg>
 82 | __global__ void dummy_k() { }  // empty body; dummy_arg is never referenced
 83 | 
 84 | template<int nt_, int vt_ = 1, int vt0_ = vt_, int occ_= 0>
 85 | struct launch_cta_t {  // compile-time CTA parameters exposed as enum constants
 86 |   enum { nt = nt_, vt = vt_, vt0 = vt0_, occ = occ_ };  // nt and occ feed __launch_bounds__ through MGPU_LAUNCH_BOUNDS; vt0 defaults to vt
 87 | };
 88 | 
 89 | #define DEF_ARCH_STRUCT(ver)                                                  \
 90 |   template<typename params_t, typename base_t = empty_t>                      \
 91 |   struct arch_##ver : base_t {                                                \
 92 |     typedef params_t sm_##ver;                                                \
 93 |                                                                               \
 94 |     template<typename new_base_t>                                             \
 95 |     using rebind = arch_##ver<params_t, new_base_t>;                          \
 96 |   };                                                                          \
 97 |                                                                               \
 98 |   template<int nt, int vt = 1, int vt0 = vt, int occ = 0>                     \
 99 |   using arch_##ver##_cta = arch_##ver<launch_cta_t<nt, vt, vt0, occ> >;
100 | // Each use below defines arch_XX (derives from base_t, publishes params_t as sm_XX) and the alias arch_XX_cta wrapping launch_cta_t.
101 | DEF_ARCH_STRUCT(20)
102 | DEF_ARCH_STRUCT(21)
103 | DEF_ARCH_STRUCT(30)
104 | DEF_ARCH_STRUCT(32)
105 | DEF_ARCH_STRUCT(35)
106 | DEF_ARCH_STRUCT(37)
107 | DEF_ARCH_STRUCT(50)
108 | DEF_ARCH_STRUCT(52)
109 | DEF_ARCH_STRUCT(53)
110 | 
111 | #undef DEF_ARCH_STRUCT  // the generator macro is not visible past this point
112 | 
113 | struct context_t;
114 | 
115 | // Launch parameters fixed at compile time, independent of the target device.
116 | template<int nt, int vt, int vt0 = vt, int occ = 0>
117 | struct launch_params_t : launch_cta_t<nt, vt, vt0, occ> {
118 |   using sm_ptx = launch_params_t;  // the single parameter set for every target
119 | 
120 |   // CTA shape taken directly from the template arguments.
121 |   static cta_dim_t cta_dim() {
122 |     cta_dim_t dim = { nt, vt };
123 |     return dim;
124 |   }
125 | 
126 |   // The PTX-version and context overloads take the same arguments as
127 |   // launch_box_t's members and ignore them.
128 |   static cta_dim_t cta_dim(int) { return cta_dim(); }
129 |   static cta_dim_t cta_dim(const context_t&) { return cta_dim(); }
130 | 
131 |   // Work-items per CTA: nt * vt.
132 |   static int nv(const context_t&) {
133 |     return cta_dim().nv();
134 |   }
135 | };
136 | 
137 | END_MGPU_NAMESPACE
138 | 


--------------------------------------------------------------------------------
/code/include/moderngpu/memory.hxx:
--------------------------------------------------------------------------------
  1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
  2 | #pragma once
  3 | 
  4 | #include "transform.hxx"
  5 | #include "context.hxx"
  6 | 
  7 | BEGIN_MGPU_NAMESPACE
  8 | 
  9 | ////////////////////////////////////////////////////////////////////////////////
 10 | // Memory functions on raw pointers.
 11 | 
 12 | template<typename type_t>
 13 | cudaError_t htoh(type_t* dest, const type_t* source, size_t count) {
 14 |   // Host-to-host copy of count elements; always reports cudaSuccess.
 15 |   if(0 != count) memcpy(dest, source, count * sizeof(type_t));
 16 |   return cudaSuccess;
 17 | }
 18 | 
 19 | template<typename type_t>
 20 | cudaError_t dtoh(type_t* dest, const type_t* source, size_t count) {
 21 |   // Copy count elements from device memory to host memory. A zero-length
 22 |   // request succeeds without calling cudaMemcpy.
 23 |   if(!count) return cudaSuccess;
 24 |   const size_t num_bytes = sizeof(type_t) * count;
 25 |   return cudaMemcpy(dest, source, num_bytes, cudaMemcpyDeviceToHost);
 26 | }
 27 | 
 28 | template<typename type_t>
 29 | cudaError_t htod(type_t* dest, const type_t* source, size_t count) {
 30 |   // Copy count elements from host memory to device memory. A zero-length
 31 |   // request succeeds without calling cudaMemcpy.
 32 |   if(!count) return cudaSuccess;
 33 |   const size_t num_bytes = sizeof(type_t) * count;
 34 |   return cudaMemcpy(dest, source, num_bytes, cudaMemcpyHostToDevice);
 35 | }
 36 | 
 37 | template<typename type_t>
 38 | cudaError_t dtod(type_t* dest, const type_t* source, size_t count) {
 39 |   // Copy count elements between two device buffers. A zero-length request
 40 |   // succeeds without calling cudaMemcpy.
 41 |   if(!count) return cudaSuccess;
 42 |   const size_t num_bytes = sizeof(type_t) * count;
 43 |   return cudaMemcpy(dest, source, num_bytes, cudaMemcpyDeviceToDevice);
 44 | }
 45 | 
 46 | template<typename type_t>
 47 | cudaError_t dtoh(std::vector<type_t>& dest, const type_t* source, 
 48 |   size_t count) {
 49 |   dest.resize(count);  // size the host vector to exactly count elements before copying into it
 50 |   return dtoh(dest.data(), source, count);  // the pointer overload performs the device-to-host copy
 51 | }
 52 | 
 53 | template<typename type_t>
 54 | cudaError_t htod(type_t* dest, const std::vector<type_t>& source) {  // uploads the whole vector; dest must have room for source.size() elements
 55 |   return htod(dest, source.data(), source.size());
 56 | }
 57 | 
 58 | ////////////////////////////////////////////////////////////////////////////////
 59 | // Memory functions on mem_t.
 60 | 
 61 | template<typename type_t>
 62 | mem_t<type_t> to_mem(const std::vector<type_t>& data, context_t& context) {
 63 |   mem_t<type_t> device(data.size(), context);  // one device element per host element
 64 |   const cudaError_t status = htod(device.data(), data);
 65 |   if(status != cudaSuccess) throw cuda_exception_t(status);  // upload failed
 66 |   return device;
 67 | }
 68 | 
 69 | template<typename type_t>
 70 | std::vector<type_t> from_mem(const mem_t<type_t>& mem) {
 71 |   std::vector<type_t> downloaded;  // resized to mem.size() by the vector overload of dtoh
 72 |   const cudaError_t status = dtoh(downloaded, mem.data(), mem.size());
 73 |   if(status != cudaSuccess) throw cuda_exception_t(status);  // download failed
 74 |   return downloaded;
 75 | }
 76 | 
 77 | template<typename type_t, typename func_t>
 78 | mem_t<type_t> fill_function(func_t f, size_t count, context_t& context) {  // returns a count-element mem_t whose element i is f(i)
 79 |   mem_t<type_t> mem(count, context);
 80 |   type_t* p = mem.data();  // raw pointer so the [=] lambda captures a pointer rather than the mem_t object
 81 |   transform([=]MGPU_DEVICE(int index) {
 82 |     p[index] = f(index);  // f is invoked inside the device lambda
 83 |   }, count, context);
 84 |   return mem;
 85 | }
 86 | 
 87 | template<typename type_t>
 88 | mem_t<type_t> fill(type_t value, size_t count, context_t& context) {  // returns a count-element mem_t with every element set to value
 89 |   // We'd prefer to call fill_function and pass a lambda that returns value,
 90 |   // but that can create tokens that are too long for VS2013.
 91 |   mem_t<type_t> mem(count, context);
 92 |   type_t* p = mem.data();  // raw pointer so the [=] lambda captures a pointer rather than the mem_t object
 93 |   transform([=]MGPU_DEVICE(int index) {
 94 |     p[index] = value;  // value is captured by copy
 95 |   }, count, context);
 96 |   return mem;
 97 | }
 98 | 
 99 | template<typename it_t>
100 | auto copy_to_mem(it_t input, size_t count, context_t& context) -> 
101 |   mem_t<typename std::iterator_traits<it_t>::value_type> {
102 |   // Materializes input[0 .. count) into a new mem_t of the iterator's value_type.
103 |   typedef typename std::iterator_traits<it_t>::value_type type_t;
104 |   mem_t<type_t> mem(count, context);
105 |   type_t* p = mem.data();
106 |   transform([=]MGPU_DEVICE(int index) {
107 |     p[index] = input[index];  // input is indexed inside the device lambda, so it must be usable from device code
108 |   }, count, context);
109 |   return mem;
110 | }
111 | 
112 | inline std::mt19937& get_mt19937() {  // accessor for one engine shared by every caller
113 |   static std::mt19937 mt19937;  // function-local static, default-constructed (no explicit seed)
114 |   return mt19937;  // returned by reference, so each draw advances the shared state
115 | }
116 | 
117 | mem_t<int> inline fill_random(int a, int b, size_t count, bool sorted, 
118 |   context_t& context) {
119 |   // Host-side draw of count integers uniform in [a, b], optionally sorted.
120 |   std::uniform_int_distribution<int> uniform(a, b);
121 |   std::vector<int> host(count);
122 |   for(size_t i = 0; i < host.size(); ++i) {
123 |     host[i] = uniform(get_mt19937());
124 |   }
125 |   if(sorted) {
126 |     std::sort(host.begin(), host.end());
127 |   }
128 |   return to_mem(host, context);
129 | }
130 | 
131 | END_MGPU_NAMESPACE
132 | 


--------------------------------------------------------------------------------
/code/include/moderngpu/search.hxx:
--------------------------------------------------------------------------------
 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
 2 | #pragma once
 3 | 
 4 | #include "loadstore.hxx"
 5 | #include "operators.hxx"
 6 | #include "cta_search.hxx"
 7 | #include "memory.hxx"
 8 | #include "context.hxx"
 9 | 
10 | BEGIN_MGPU_NAMESPACE
11 | 
12 | template<bounds_t bounds, typename a_keys_it, typename b_keys_it,
13 |   typename comp_t>
14 | mem_t<int> merge_path_partitions(a_keys_it a, int64_t a_count, b_keys_it b,
15 |   int64_t b_count, int64_t spacing, comp_t comp, context_t& context) {
16 |   // Returns div_up(a_count + b_count, spacing) + 1 merge_path results, one per diagonal spaced `spacing` apart.
17 |   typedef int int_t;
18 |   int num_partitions = (int)div_up(a_count + b_count, spacing) + 1;  // the +1 adds the closing diagonal at a_count + b_count
19 |   mem_t<int_t> mem(num_partitions, context);
20 |   int_t* p = mem.data();
21 |   transform([=]MGPU_DEVICE(int index) {
22 |     int_t diag = (int_t)min(spacing * index, a_count + b_count);  // 64-bit product, clamped to the combined length before narrowing
23 |     p[index] = merge_path<bounds>(a, (int_t)a_count, b, (int_t)b_count,
24 |       diag, comp);
25 |   }, num_partitions, context);
26 |   return mem;
27 | }
28 | 
29 | template<typename segments_it>
30 | auto load_balance_partitions(int64_t dest_count, segments_it segments, 
31 |   int num_segments, int spacing, context_t& context) -> 
32 |   mem_t<typename std::iterator_traits<segments_it>::value_type> {
33 |   // Merge-path partitions of the counting sequence 0, 1, ... (dest_count items) against segments, with bounds_upper and less_t.
34 |   typedef typename std::iterator_traits<segments_it>::value_type int_t;  // NOTE(review): merge_path_partitions returns mem_t<int>, so this presumably requires int_t == int - confirm
35 |   return merge_path_partitions<bounds_upper>(counting_iterator_t<int_t>(0), 
36 |     dest_count, segments, num_segments, spacing, less_t<int_t>(), context);
37 | }
38 | 
39 | template<bounds_t bounds, typename keys_it>
40 | mem_t<int> binary_search_partitions(keys_it keys, int count, int num_items,
41 |   int spacing, context_t& context) {
42 |   // Returns div_up(count, spacing) + 1 binary_search results over keys[0 .. num_items), one per key spaced `spacing` apart.
43 |   int num_partitions = div_up(count, spacing) + 1;  // the +1 adds the closing entry for key == count
44 |   mem_t<int> mem(num_partitions, context);
45 |   int* p = mem.data();
46 |   transform([=]MGPU_DEVICE(int index) {
47 |     int key = min(spacing * index, count);  // NOTE(review): 32-bit product here, whereas merge_path_partitions multiplies in int64_t - confirm count near INT_MAX is not expected
48 |     p[index] = binary_search<bounds>(keys, num_items, key, less_t<int>());
49 |   }, num_partitions, context);
50 |   return mem;
51 | }
52 | 
53 | END_MGPU_NAMESPACE
54 | 


--------------------------------------------------------------------------------
/code/include/moderngpu/sort_networks.hxx:
--------------------------------------------------------------------------------
 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
 2 | #pragma once
 3 | #include "operators.hxx"
 4 | 
 5 | BEGIN_MGPU_NAMESPACE
 6 | 
 7 | ////////////////////////////////////////////////////////////////////////////////
 8 | // Odd-even transposition sorting network. Sorts keys and values in-place in
 9 | // register.
10 | // http://en.wikipedia.org/wiki/Odd%E2%80%93even_sort
11 | 
12 | template<typename type_t, int vt, typename comp_t>
13 | MGPU_HOST_DEVICE array_t<type_t, vt> 
14 | odd_even_sort(array_t<type_t, vt> x, comp_t comp, int flags = 0) {  // x is taken by value and the sorted copy is returned
15 |   iterate<vt>([&](int I) {
16 |     PRAGMA_UNROLL
17 |     for(int i = 1 & I; i < vt - 1; i += 2) {  // the parity of I selects whether pairs start at index 0 or index 1
18 |       if((0 == ((2<< i) & flags)) && comp(x[i + 1], x[i]))  // a set bit (2 << i) in flags disables the compare-exchange of pair (i, i + 1)
19 |         swap(x[i], x[i + 1]);
20 |     }
21 |   });
22 |   return x;
23 | }
24 | 
25 | template<typename key_t, typename val_t, int vt, typename comp_t>
26 | MGPU_HOST_DEVICE kv_array_t<key_t, val_t, vt> 
27 | odd_even_sort(kv_array_t<key_t, val_t, vt> x, comp_t comp, int flags = 0) {  // key/value overload: only keys are compared
28 |   iterate<vt>([&](int I) {
29 |     PRAGMA_UNROLL
30 |     for(int i = 1 & I; i < vt - 1; i += 2) {  // the parity of I selects whether pairs start at index 0 or index 1
31 |       if((0 == ((2<< i) & flags)) && comp(x.keys[i + 1], x.keys[i])) {  // a set bit (2 << i) in flags disables the compare-exchange of pair (i, i + 1)
32 |         swap(x.keys[i], x.keys[i + 1]);
33 |         swap(x.vals[i], x.vals[i + 1]);  // each value moves together with its key
34 |       }
35 |     }
36 |   });
37 |   return x;
38 | }
39 | 
40 | ////////////////////////////////////////////////////////////////////////////////
41 | // TODO: Batcher Odd-Even Mergesort network
42 | // Unstable but executes much faster than the transposition sort.
43 | // http://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort
44 | #if 0  // disabled: everything up to the matching #endif is excluded from compilation
45 | template<int width, int low, int count>
46 | struct odd_even_mergesort_t {  // empty placeholder
47 | 
48 | };
49 | 
50 | template<typename key_t, typename val_t, int vt, typename comp_t>
51 | MGPU_HOST_DEVICE kv_array_t<key_t, val_t, vt> 
52 | odd_even_mergesort(kv_array_t<key_t, val_t, vt> x, int flags = 0) {
53 |   return kv_array_t<key_t, val_t, vt>();  // stub: returns a value-initialized array and ignores x
54 | }
55 | #endif
56 | 
57 | END_MGPU_NAMESPACE
58 | 


--------------------------------------------------------------------------------
/code/include/moderngpu/transform.hxx:
--------------------------------------------------------------------------------
  1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
  2 | #pragma once
  3 | 
  4 | #include <random>
  5 | #include <algorithm>
  6 | #include <cuda.h>
  7 | #include "launch_box.hxx"
  8 | 
  9 | BEGIN_MGPU_NAMESPACE
 10 | 
 11 | ////////////////////////////////////////////////////////////////////////////////
 12 | // Launch a grid given a number of CTAs.
 13 | 
 14 | template<typename launch_box, typename func_t, typename... args_t>
 15 | void cta_launch(func_t f, int num_ctas, context_t& context, args_t... args) {  // launches launch_box_cta_k over num_ctas CTAs on context.stream()
 16 |   cta_dim_t cta = launch_box::cta_dim(context.ptx_version());
 17 |   dim3 grid_dim(num_ctas);
 18 |   if(context.ptx_version() < 30 && num_ctas > 65535)
 19 |     grid_dim = dim3(256, div_up(num_ctas, 256));  // 2-D grid; it may hold more than num_ctas blocks, which restrict_forward filters with cta < num_ctas
 20 |   // A zero-CTA request performs no launch at all.
 21 |   if(num_ctas)
 22 |     launch_box_cta_k<launch_box, func_t>
 23 |       <<<grid_dim, cta.nt, 0, context.stream()>>>(f, num_ctas, args...);  // cta.nt threads per block, 0 bytes of dynamic shared memory
 24 | }
 25 | 
 26 | template<int nt, int vt = 1, typename func_t, typename... args_t>
 27 | void cta_launch(func_t f, int num_ctas, context_t& context, args_t... args) {  // shorthand taking nt and vt directly instead of a launch box type
 28 |   cta_launch<launch_params_t<nt, vt> >(f, num_ctas, context, args...);
 29 | }
 30 | 
 31 | ////////////////////////////////////////////////////////////////////////////////
 32 | // Launch a grid given a number of work-items.
 33 | 
 34 | template<typename launch_box, typename func_t, typename... args_t>
 35 | void cta_transform(func_t f, int count, context_t& context, args_t... args) {
 36 |   // Launch div_up(count, nv) CTAs, where nv is the launch box's tile size.
 37 |   const int tile_size = launch_box::cta_dim(context.ptx_version()).nv();
 38 |   cta_launch<launch_box>(f, div_up(count, tile_size), context, args...);
 39 | }
 40 | 
 41 | template<int nt, int vt = 1, typename func_t, typename... args_t>
 42 | void cta_transform(func_t f, int count, context_t& context, args_t... args) {  // shorthand taking nt and vt directly instead of a launch box type
 43 |   cta_transform<launch_params_t<nt, vt> >(f, count, context, args...);
 44 | }
 45 | 
 46 | ////////////////////////////////////////////////////////////////////////////////
 47 | // Launch persistent CTAs and loop through num_ctas values.
 48 | 
 49 | template<typename launch_box, typename func_t, typename... args_t>
 50 | void cta_launch(func_t f, const int* num_tiles, context_t& context, 
 51 |   args_t... args) {
 52 |   // A fixed-size grid is launched and each CTA loops over the tile count read from *num_tiles.
 53 |   // Over-subscribe the device by a factor of 8.
 54 |   // This reduces the penalty if we can't schedule all the CTAs to run 
 55 |   // concurrently.
 56 |   int num_ctas = 8 * occupancy<launch_box>(f, context);  // NOTE(review): occupancy is queried with f and no args_t, not with the wrapper k below - confirm this approximation is intended
 57 | 
 58 |   auto k = [=] MGPU_DEVICE(int tid, int cta, args_t... args) {
 59 |     int count = *num_tiles;  // dereferenced inside the kernel, so num_tiles must be readable from device code
 60 |     while(cta < count) {
 61 |       f(tid, cta, args...);
 62 |       cta += num_ctas;  // advance by the number of launched CTAs
 63 |     }
 64 |   };
 65 |   cta_launch<launch_box>(k, num_ctas, context, args...);
 66 | }
 67 | 
 68 | ////////////////////////////////////////////////////////////////////////////////
 69 | // Ordinary transform launch. This uses the standard launch box mechanism 
 70 | // so we can query its occupancy and other things.
 71 | 
 72 | namespace detail {
 73 | 
 74 | template<typename launch_t>
 75 | struct transform_f {  // CTA functor used by transform(): applies f to each index of this CTA's tile
 76 |   template<typename func_t, typename... args_t>
 77 |   MGPU_DEVICE void operator()(int tid, int cta, func_t f, 
 78 |     size_t count, args_t... args) {
 79 | 
 80 |     typedef typename launch_t::sm_ptx params_t;
 81 |     enum { nt = params_t::nt, vt = params_t::vt, vt0 = params_t::vt0 };
 82 | 
 83 |     range_t range = get_tile(cta, nt * vt, count);  // [nt*vt*cta, min(count, nt*vt*(cta+1))); count is narrowed to int here
 84 | 
 85 |     strided_iterate<nt, vt, vt0>([=](int i, int j) {
 86 |       f(range.begin + j, args...);  // j is the offset within the tile; i is not used
 87 |     }, tid, range.count());  
 88 |   }
 89 | };
 90 | 
 91 | }  // namespace detail
 92 | 
 93 | template<typename launch_t, typename func_t, typename... args_t>
 94 | void transform(func_t f, size_t count, context_t& context, args_t... args) {  // runs f(index, args...) on the device via detail::transform_f
 95 |   cta_transform<launch_t>(detail::transform_f<launch_t>(), count, 
 96 |     context, f, count, args...);  // count appears twice: the first sizes the grid (narrowed to int), the second is forwarded to transform_f
 97 | }
 98 | 
 99 | template<size_t nt = 128, int vt = 1, typename func_t, typename... args_t>
100 | void transform(func_t f, size_t count, context_t& context, args_t... args) {  // shorthand with defaults nt = 128, vt = 1
101 |   transform<launch_params_t<nt, vt> >(f, count, context, args...);  // nt is a size_t here but an int parameter of launch_params_t
102 | }
103 | 
104 | END_MGPU_NAMESPACE
105 | 


--------------------------------------------------------------------------------
/code/include/moderngpu/types.hxx:
--------------------------------------------------------------------------------
  1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com
  2 | #pragma once
  3 | 
  4 | #include "meta.hxx"
  5 | #include "operators.hxx"
  6 | 
  7 | BEGIN_MGPU_NAMESPACE
  8 | 
  9 | struct cuda_exception_t : std::exception {  // exception type carrying a cudaError_t
 10 |   cudaError_t result;  // the error code passed to the constructor
 11 | 
 12 |   cuda_exception_t(cudaError_t result_) : result(result_) { }
 13 |   virtual const char* what() const noexcept { 
 14 |     return cudaGetErrorString(result);  // message text comes from the CUDA runtime
 15 |   }
 16 | };
 17 | 
 18 | 
 19 | template<typename type_t, int size>
 20 | struct array_t {  // fixed-size array wrapper with host/device element access
 21 |   type_t data[size];
 22 | 
 23 |   MGPU_HOST_DEVICE type_t operator[](int i) const { return data[i]; }  // const access returns the element by value
 24 |   MGPU_HOST_DEVICE type_t& operator[](int i) { return data[i]; }  // mutable access returns a reference; no bounds check
 25 | 
 26 |   array_t() = default;
 27 |   array_t(const array_t&) = default;
 28 |   array_t& operator=(const array_t&) = default;
 29 | 
 30 |   // Fill the array with x.
 31 |   MGPU_HOST_DEVICE array_t(type_t x) {  // not explicit, so a single type_t converts implicitly to a filled array
 32 |     iterate<size>([&](int i) { data[i] = x; });  
 33 |   }
 34 | };
 35 | 
 36 | template<typename type_t>
 37 | struct array_t<type_t, 0> {  // zero-size specialization: holds no storage
 38 |   MGPU_HOST_DEVICE type_t operator[](int i) const { return type_t(); }  // any index yields a value-initialized type_t
 39 |   MGPU_HOST_DEVICE type_t& operator[](int i) { return *(type_t*)nullptr; }  // dereferences a null pointer: present so code compiles, must never actually be executed
 40 | };
 41 | 
 42 | // Reduce on components of array_t; with no components the result is type_t() (assuming iterate<0> makes no calls).
 43 | template<typename type_t, int size, typename op_t = plus_t<type_t> >
 44 | MGPU_HOST_DEVICE type_t reduce(array_t<type_t, size> x, op_t op = op_t()) {
 45 |   type_t a = type_t();  // value-initialized so an empty array never returns an indeterminate value
 46 |   iterate<size>([&](int i) {
 47 |     a = i ? op(a, x[i]) : x[i];  // i == 0 seeds with x[0]; every later i folds as op(a, x[i])
 48 |   });
 49 |   return a;
 50 | }
 51 | 
 52 | // Call the operator component-wise on all components.
 53 | template<typename type_t, int size, typename op_t>
 54 | MGPU_HOST_DEVICE array_t<type_t, size> combine(array_t<type_t, size> x,
 55 |   array_t<type_t, size> y, op_t op) {
 56 |   // Returns z with z[i] = op(x[i], y[i]) for every i in [0, size).
 57 |   array_t<type_t, size> z;
 58 |   iterate<size>([&](int i) { z[i] = op(x[i], y[i]); });
 59 |   return z;
 60 | }
 61 | 
 62 | template<typename type_t, int size>
 63 | MGPU_HOST_DEVICE array_t<type_t, size> operator+(
 64 |   array_t<type_t, size> a, array_t<type_t, size> b) {
 65 |   return combine(a, b, plus_t<type_t>());  // component-wise combine using plus_t
 66 | }
 67 | 
 68 | template<typename type_t, int size>
 69 | MGPU_HOST_DEVICE array_t<type_t, size> operator-(
 70 |   array_t<type_t, size> a, array_t<type_t, size> b) {
 71 |   return combine(a, b, minus_t<type_t>());  // component-wise combine using minus_t
 72 | }
 73 | 
 74 | 
 75 | template<typename key_t, typename val_t, int size>
 76 | struct kv_array_t {  // two parallel fixed-size arrays: keys[i] is paired with vals[i]
 77 |   array_t<key_t, size> keys;
 78 |   array_t<val_t, size> vals;
 79 | };
 80 | 
 81 | enum bounds_t {  // search-bound selector passed as a template argument to merge_path / binary_search
 82 |   bounds_lower,  // presumably lower-bound semantics (first position not less than the key) - confirm in cta_search.hxx
 83 |   bounds_upper  // presumably upper-bound semantics (first position greater than the key) - confirm in cta_search.hxx
 84 | };
 85 | 
 86 | struct MGPU_ALIGN(8) range_t {  // half-open index interval [begin, end)
 87 |   int begin, end;
 88 |   MGPU_HOST_DEVICE int size() const { return end - begin; }  // negative when end < begin
 89 |   MGPU_HOST_DEVICE int count() const { return size(); }  // alias of size()
 90 |   MGPU_HOST_DEVICE bool valid() const { return end > begin; }  // true only for a non-empty interval
 91 | };
 92 | 
 93 | MGPU_HOST_DEVICE range_t get_tile(int cta, int nv, int count) {  // interval of CTA number cta when count items are split into tiles of nv
 94 |   return range_t { nv * cta, min(count, nv * (cta + 1)) };  // the end is clamped to count, so the last tile may be partial
 95 | }
 96 | 
 97 | 
 98 | struct MGPU_ALIGN(16) merge_range_t {  // a pair of half-open intervals, [a_begin, a_end) and [b_begin, b_end)
 99 |   int a_begin, a_end, b_begin, b_end;
100 | 
101 |   MGPU_HOST_DEVICE int a_count() const { return a_end - a_begin; }
102 |   MGPU_HOST_DEVICE int b_count() const { return b_end - b_begin; }
103 |   MGPU_HOST_DEVICE int total() const { return a_count() + b_count(); }  // combined length of both intervals
104 | 
105 |   MGPU_HOST_DEVICE range_t a_range() const { 
106 |     return range_t { a_begin, a_end };
107 |   }
108 |   MGPU_HOST_DEVICE range_t b_range() const {
109 |     return range_t { b_begin, b_end };
110 |   }
111 | 
112 |   MGPU_HOST_DEVICE merge_range_t to_local() const {  // same lengths re-based to 0: a becomes [0, a_count), b follows as [a_count, total)
113 |     return merge_range_t { 0, a_count(), a_count(), total() };
114 |   }
115 |   
116 |   // Partition from mp to the end.
117 |   MGPU_HOST_DEVICE merge_range_t partition(int mp0, int diag) const {  // skips mp0 items of a and diag - mp0 items of b; both ends are kept
118 |     return merge_range_t { a_begin + mp0, a_end, b_begin + diag - mp0, b_end };
119 |   }
120 | 
121 |   // Partition from mp0 to mp1.
122 |   MGPU_HOST_DEVICE merge_range_t partition(int mp0, int diag0,
123 |     int mp1, int diag1) const {  // both ends are offsets from a_begin / b_begin; a_end and b_end are not consulted
124 |     return merge_range_t { 
125 |       a_begin + mp0, 
126 |       a_begin + mp1,
127 |       b_begin + diag0 - mp0,
128 |       b_begin + diag1 - mp1
129 |     };
130 |   }
131 | 
132 |   MGPU_HOST_DEVICE bool a_valid() const {  // true when the a interval is non-empty
133 |     return a_begin < a_end; 
134 |   }
135 |   MGPU_HOST_DEVICE bool b_valid() const {  // true when the b interval is non-empty
136 |     return b_begin < b_end;
137 |   }
138 | };
139 | 
140 | template<typename type_t, int size>
141 | struct merge_pair_t {  // per-thread result of a CTA merge: size keys and one int index per key
142 |   array_t<type_t, size> keys;
143 |   array_t<int, size> indices;  // sorted_search compares these against range.a_count() to tell needle entries apart
144 | };
145 | 
146 | 
147 | END_MGPU_NAMESPACE
148 | 


--------------------------------------------------------------------------------
/code/include/moderngpu/util.hxx:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include "types.hxx"
 3 | #include <cstdarg>
 4 | #include <string>
 5 | 
 6 | BEGIN_MGPU_NAMESPACE
 7 | 
 8 | namespace detail {
 9 | 
10 | inline std::string stringprintf(const char* format, ...) {
11 |   // Pass 1: measure the formatted length, excluding the terminating null.
12 |   va_list args;
13 |   va_start(args, format);
14 |   int len = vsnprintf(0, 0, format, args);
15 |   va_end(args);
16 |   // A negative length signals a formatting error; resizing to it would wrap
17 |   // to a huge size_t. Return an empty string for that case and for len == 0.
18 |   std::string text;
19 |   if(len <= 0) return text;
20 |   text.resize(len);
21 |   va_start(args, format);  // pass 2: write len characters plus the null
22 |   vsnprintf(&text[0], len + 1, format, args);
23 |   va_end(args);
24 |   return text;
25 | }
26 | 
27 | } // namespace detail
28 | 
29 | END_MGPU_NAMESPACE
30 | 
31 | 


--------------------------------------------------------------------------------
/code/include/pugixml/pugiconfig.hpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * pugixml parser - version 1.9
 3 |  * --------------------------------------------------------
 4 |  * Copyright (C) 2006-2019, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
 5 |  * Report bugs and download new versions at https://pugixml.org/
 6 |  *
 7 |  * This library is distributed under the MIT License. See notice at the end
 8 |  * of this file.
 9 |  *
10 |  * This work is based on the pugxml parser, which is:
11 |  * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
12 |  */
13 | 
14 | #ifndef HEADER_PUGICONFIG_HPP
15 | #define HEADER_PUGICONFIG_HPP
16 | 
17 | // Uncomment this to enable wchar_t mode
18 | // #define PUGIXML_WCHAR_MODE
19 | 
20 | // Uncomment this to enable compact mode
21 | // #define PUGIXML_COMPACT
22 | 
23 | // Uncomment this to disable XPath
24 | // #define PUGIXML_NO_XPATH
25 | 
26 | // Uncomment this to disable STL
27 | // #define PUGIXML_NO_STL
28 | 
29 | // Uncomment this to disable exceptions
30 | // #define PUGIXML_NO_EXCEPTIONS
31 | 
32 | // Set this to control attributes for public classes/functions, i.e.:
33 | // #define PUGIXML_API __declspec(dllexport) // to export all public symbols from DLL
34 | // #define PUGIXML_CLASS __declspec(dllimport) // to import all classes from DLL
35 | // #define PUGIXML_FUNCTION __fastcall // to set calling conventions to all public functions to fastcall
36 | // In absence of PUGIXML_CLASS/PUGIXML_FUNCTION definitions PUGIXML_API is used instead
37 | 
38 | // Tune these constants to adjust memory-related behavior
39 | // #define PUGIXML_MEMORY_PAGE_SIZE 32768
40 | // #define PUGIXML_MEMORY_OUTPUT_STACK 10240
41 | // #define PUGIXML_MEMORY_XPATH_PAGE_SIZE 4096
42 | 
43 | // Header-only version: enabled here because the define below is left uncommented
44 | 
45 | #define PUGIXML_HEADER_ONLY
46 | 
47 | // Uncomment this to enable long long support
48 | // #define PUGIXML_HAS_LONG_LONG
49 | 
50 | #endif
51 | 
52 | /**
53 |  * Copyright (c) 2006-2019 Arseny Kapoulkine
54 |  *
55 |  * Permission is hereby granted, free of charge, to any person
56 |  * obtaining a copy of this software and associated documentation
57 |  * files (the "Software"), to deal in the Software without
58 |  * restriction, including without limitation the rights to use,
59 |  * copy, modify, merge, publish, distribute, sublicense, and/or sell
60 |  * copies of the Software, and to permit persons to whom the
61 |  * Software is furnished to do so, subject to the following
62 |  * conditions:
63 |  *
64 |  * The above copyright notice and this permission notice shall be
65 |  * included in all copies or substantial portions of the Software.
66 |  *
67 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
68 |  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
69 |  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
70 |  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
71 |  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
72 |  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
73 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
74 |  * OTHER DEALINGS IN THE SOFTWARE.
75 |  */
76 | 


--------------------------------------------------------------------------------
/code/scripts/collect_keyword_list_throughput.txt:
--------------------------------------------------------------------------------
1 | [
2 |    ('throughput', 'float'),
3 | ]


--------------------------------------------------------------------------------
/code/scripts/configs/app_spec_ngap_new_quickvalidation_part2:
--------------------------------------------------------------------------------
 1 | {
 2 |     "root": "/ngAP/automata_benchmark_original",
 3 |     "apps": [
 4 |         {
 5 |             "name": "Snort",
 6 |             "input": "AutomataZoo/Snort/benchmarks/inputs/wrccdc2012.pcap",
 7 |             "mnrl": "AutomataZoo/Snort/benchmarks/automata/snort.mnrl",
 8 |             "anml_no_ORs": "AutomataZoo/Snort/benchmarks/anml_remove_or/automata_0.anml",
 9 |             "optimized_ANML": "AutomataZoo/Snort/benchmarks/optimized_ANML/automata_0.anml",
10 |             "hs": "AutomataZoo/Snort/benchmarks/hs/automata.hs",
11 |             "automata": "AutomataZoo/Snort/benchmarks/anml_remove_or/automata_0.anml",
12 |             "quick_validation": "128259",
13 |         },
14 |         {
15 |             "name": "FileCarving",
16 |             "input": "AutomataZoo/FileCarving/benchmarks/inputs/fat32_files.input",
17 |             "anml": "AutomataZoo/FileCarving/benchmarks/automata/file_carver.anml",
18 |             "anml_no_ORs": "AutomataZoo/FileCarving/benchmarks/anml_remove_or/automata_0.anml",
19 |             "optimized_ANML": "AutomataZoo/FileCarving/benchmarks/optimized_ANML/automata_0.anml",
20 |             "mnrl": "AutomataZoo/FileCarving/benchmarks/mnrl/automata_0.mnrl",
21 |             "hs": "AutomataZoo/FileCarving/benchmarks/hs/automata.hs",
22 |             "automata": "AutomataZoo/FileCarving/benchmarks/anml_remove_or/automata_0.anml",
23 |             "quick_validation": "0",
24 |         },
25 |         {
26 |             "name": "ClamAV",
27 |             "input": "AutomataZoo/ClamAV/benchmarks/inputs/clamav.input",
28 |             "anml": "AutomataZoo/ClamAV/benchmarks/automata/clamav.anml",
29 |             "anml_no_ORs": "AutomataZoo/ClamAV/benchmarks/anml_remove_or/automata_0.anml",
30 |             "optimized_ANML": "AutomataZoo/ClamAV/benchmarks/optimized_ANML/automata_0.anml",
31 |             "mnrl": "AutomataZoo/ClamAV/benchmarks/mnrl/automata_0.mnrl",
32 |             "hs": "AutomataZoo/ClamAV/benchmarks/hs/automata.hs",
33 |             "automata": "AutomataZoo/ClamAV/benchmarks/anml_remove_or/automata_0.anml",
34 |             "quick_validation": "1",
35 |         },
36 |     ],
37 |     
38 |     "exclude_apps": [
39 |         "Hamming_N1000_l22_d5",
40 |         "Hamming_N1000_l31_d10",
41 |         "Levenshtein_l24d5",
42 |         "Levenshtein_l37d10",
43 |         "RandomForest_20_400_270",
44 |         "RandomForest_20_800_200",
45 |         "SeqMatch_BIBLE_w6_p10",
46 |         "Fermi",
47 |         "FileCarving",
48 |         "smallFileCarving",
49 |     ],
50 | }


--------------------------------------------------------------------------------
/code/scripts/configs/app_spec_ngap_new_quickvalidation_part3:
--------------------------------------------------------------------------------
 1 | {
 2 |     "root": "/ngAP/automata_benchmark_original",
 3 |     "apps": [
 4 |         {
 5 |             "name": "smallSnort",
 6 |             "input": "AutomataZoo/Snort/benchmarks/inputs/wrccdc2012.pcap",
 7 |             "mnrl": "AutomataZoo/Snort/benchmarks/automata/snort.mnrl",
 8 |             "anml_no_ORs": "AutomataZoo/Snort/benchmarks/anml_remove_or/automata_0.anml",
 9 |             "optimized_ANML": "AutomataZoo/Snort/benchmarks/optimized_ANML/automata_0.anml",
10 |             "hs": "AutomataZoo/Snort/benchmarks/hs/automata.hs",
11 |             "automata": "AutomataZoo/Snort/benchmarks/anml_remove_or/automata_0.anml",
12 |             "quick_validation": "128259",
13 |             "validation": 0,
14 |         },
15 |         {
16 |             "name": "smallFileCarving",
17 |             "input": "AutomataZoo/FileCarving/benchmarks/inputs/fat32_files.input",
18 |             "anml": "AutomataZoo/FileCarving/benchmarks/automata/file_carver.anml",
19 |             "anml_no_ORs": "AutomataZoo/FileCarving/benchmarks/anml_remove_or/automata_0.anml",
20 |             "optimized_ANML": "AutomataZoo/FileCarving/benchmarks/optimized_ANML/automata_0.anml",
21 |             "mnrl": "AutomataZoo/FileCarving/benchmarks/mnrl/automata_0.mnrl",
22 |             "hs": "AutomataZoo/FileCarving/benchmarks/hs/automata.hs",
23 |             "automata": "AutomataZoo/FileCarving/benchmarks/anml_remove_or/automata_0.anml",
24 |             "quick_validation": "0",
25 |         },
26 |         {
27 |             "name": "smallClamAV",
28 |             "input": "AutomataZoo/ClamAV/benchmarks/inputs/clamav.input",
29 |             "anml": "AutomataZoo/ClamAV/benchmarks/automata/clamav.anml",
30 |             "anml_no_ORs": "AutomataZoo/ClamAV/benchmarks/anml_remove_or/automata_0.anml",
31 |             "optimized_ANML": "AutomataZoo/ClamAV/benchmarks/optimized_ANML/automata_0.anml",
32 |             "mnrl": "AutomataZoo/ClamAV/benchmarks/mnrl/automata_0.mnrl",
33 |             "hs": "AutomataZoo/ClamAV/benchmarks/hs/automata.hs",
34 |             "automata": "AutomataZoo/ClamAV/benchmarks/anml_remove_or/automata_0.anml",
35 |             "quick_validation": "1",
36 |         },
37 |     ],
38 |     
39 |     "exclude_apps": [
40 |         "Hamming_N1000_l22_d5",
41 |         "Hamming_N1000_l31_d10",
42 |         "Levenshtein_l24d5",
43 |         "Levenshtein_l37d10",
44 |         "RandomForest_20_400_270",
45 |         "RandomForest_20_800_200",
46 |         "SeqMatch_BIBLE_w6_p10",
47 |         "Fermi",
48 |         "FileCarving",
49 |         "smallFileCarving",
50 |     ],
51 | }


--------------------------------------------------------------------------------
/code/scripts/configs/exec_config_ngap_groups_design_cpu:
--------------------------------------------------------------------------------
 1 | {
 2 |     "exp_parameters" : {
 3 |         "before-hyperscan" : [
 4 |             ("exec", 'hsrun', 'nocombination'),
 5 |             ("output-name", "before-hyperscan_", 'nocombination'),
 6 |             ("t", ["12"]),
 7 |             ("i", ["1000000"]),
 8 |             ("d", ["600"]),
 9 |         ],
10 |     },
11 |     "exp_times"    : 1,
12 |     "out_prefix"   : "output",
13 |     "input_suffix" : "1MB",
14 |     "exclude_configs" : ["o1-nonblocking-aas", "o1-nonblocking-unique"],
15 | }


--------------------------------------------------------------------------------
/code/scripts/configs/exec_config_ngap_groups_design_cpu_oneinput:
--------------------------------------------------------------------------------
 1 | {
 2 |     "exp_parameters" : {
 3 |         "before-hyperscan" : [
 4 |             ("exec", 'hsrun', 'nocombination'),
 5 |             ("output-name", "before-hyperscan_", 'nocombination'),
 6 |             ("t", ["12"]),
 7 |             ("i", ["1000000"]),
 8 |             ("d", ["1"]),
 9 |         ],
10 |     },
11 |     "exp_times"    : 1,
12 |     "out_prefix"   : "output",
13 |     "input_suffix" : "1MB",
14 |     "exclude_configs" : ["o1-nonblocking-aas", "o1-nonblocking-unique"],
15 | }


--------------------------------------------------------------------------------
/code/scripts/configs/exec_config_ngap_groups_design_sota_runahead:
--------------------------------------------------------------------------------
 1 | {
 2 |     "exp_parameters" : {
 3 |         "before-runahead-cc4" : [
 4 |             ("exec", 'asyncap', 'nocombination'),
 5 |             ("output-name", "before-runahead-cc4_", 'nocombination'),
 6 |             ("algorithm", ["runahead"]),
 7 |             ("input-len", ["1000000"]),
 8 |             ("report-off", ['true']),
 9 |             ("duplicate-input-stream", ['600']),
10 |             ("one-output-capacity", ['104619400']),
11 |             ("scanning-R", ['999999999']),
12 |             ("block-size", ['128']),
13 |             ("record-ir", ['0']),
14 |             ("blockDimX", ['-1']),
15 |             ("num-streams", ['4']),
16 |             ("merge-cc", ['4']),
17 |             ("shrmem-wl", ['1']),
18 |             ("shr_wl_len", ['4']),
19 |             ("remove-degree", ['false']),
20 |             ("quit-degree", ['false']),
21 |         ],
22 | 
23 |     },
24 | 
25 |     'exp_times'    : 1,
26 |     'out_prefix'   : 'output',
27 |     'input_suffix' : '1MB'
28 | }
29 | 
30 | 
31 | 
32 | 
33 | 


--------------------------------------------------------------------------------
/code/scripts/configs/exec_config_ngap_groups_design_sota_runahead_4degree:
--------------------------------------------------------------------------------
 1 | {
 2 |     "exp_parameters" : {
 3 |         "before-runahead-cc4" : [
 4 |             ("exec", 'asyncap', 'nocombination'),
 5 |             ("output-name", "before-runahead-cc4_", 'nocombination'),
 6 |             ("algorithm", ["runahead"]),
 7 |             ("input-len", ["1000000"]),
 8 |             ("report-off", ['true']),
 9 |             ("duplicate-input-stream", ['600']),
10 |             ("one-output-capacity", ['104619400']),
11 |             ("scanning-R", ['999999999']),
12 |             ("block-size", ['128']),
13 |             ("record-ir", ['0']),
14 |             ("blockDimX", ['-1']),
15 |             ("num-streams", ['4']),
16 |             ("merge-cc", ['4']),
17 |             ("shrmem-wl", ['1']),
18 |             ("shr_wl_len", ['4']),
19 |             ("remove-degree", ['true']),
20 |             ("quit-degree", ['false']),
21 |         ],
22 | 
23 |     },
24 | 
25 |     'exp_times'    : 1,
26 |     'out_prefix'   : 'output',
27 |     'input_suffix' : '1MB'
28 | }
29 | 
30 | 
31 | 
32 | 
33 | 


--------------------------------------------------------------------------------
/code/scripts/configs/exec_config_ngap_groups_design_sota_runahead_4degree_oneinput:
--------------------------------------------------------------------------------
 1 | {
 2 |     "exp_parameters" : {
 3 |         "before-runahead-cc4" : [
 4 |             ("exec", 'asyncap', 'nocombination'),
 5 |             ("output-name", "before-runahead-cc4_", 'nocombination'),
 6 |             ("algorithm", ["runahead"]),
 7 |             ("input-len", ["1000000"]),
 8 |             ("report-off", ['true']),
 9 |             ("duplicate-input-stream", ['1']),
10 |             ("one-output-capacity", ['104619400']),
11 |             ("scanning-R", ['999999999']),
12 |             ("block-size", ['128']),
13 |             ("record-ir", ['0']),
14 |             ("blockDimX", ['-1']),
15 |             ("num-streams", ['4']),
16 |             ("merge-cc", ['4']),
17 |             ("shrmem-wl", ['1']),
18 |             ("shr_wl_len", ['4']),
19 |             ("remove-degree", ['true']),
20 |             ("quit-degree", ['false']),
21 |         ],
22 | 
23 |     },
24 | 
25 |     'exp_times'    : 1,
26 |     'out_prefix'   : 'output',
27 |     'input_suffix' : '1MB'
28 | }
29 | 
30 | 
31 | 
32 | 
33 | 


--------------------------------------------------------------------------------
/code/scripts/configs/exec_config_ngap_groups_design_sota_runahead_oneinput:
--------------------------------------------------------------------------------
 1 | {
 2 |     "exp_parameters" : {
 3 |         "before-runahead-cc4" : [
 4 |             ("exec", 'asyncap', 'nocombination'),
 5 |             ("output-name", "before-runahead-cc4_", 'nocombination'),
 6 |             ("algorithm", ["runahead"]),
 7 |             ("input-len", ["1000000"]),
 8 |             ("report-off", ['true']),
 9 |             ("duplicate-input-stream", ['1']),
10 |             ("one-output-capacity", ['104619400']),
11 |             ("scanning-R", ['999999999']),
12 |             ("block-size", ['128']),
13 |             ("record-ir", ['0']),
14 |             ("blockDimX", ['-1']),
15 |             ("num-streams", ['4']),
16 |             ("merge-cc", ['4']),
17 |             ("shrmem-wl", ['1']),
18 |             ("shr_wl_len", ['4']),
19 |             ("remove-degree", ['false']),
20 |             ("quit-degree", ['false']),
21 |         ],
22 | 
23 |     },
24 | 
25 |     'exp_times'    : 1,
26 |     'out_prefix'   : 'output',
27 |     'input_suffix' : '1MB'
28 | }
29 | 
30 | 
31 | 
32 | 
33 | 


--------------------------------------------------------------------------------
/code/scripts/llcommons.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | import os, errno
  3 | import collections
  4 | import scipy
  5 | import scipy.stats
  6 | import pandas as pd
  7 | import argparse
  8 | import math
  9 | 
 10 | # begin from Prof. Sree  -----------------------------
 11 | def critlevel(n, level_perc):
 12 |     import scipy.stats
 13 |     # not the same alpha as in the eqns ...
 14 |     alpha = level_perc / 100.0
 15 | 
 16 |     if n > 32:
 17 |         return scipy.stats.norm.interval(alpha)[1]
 18 |     else:
 19 |         return scipy.stats.t.interval(alpha, n - 1)[1]
 20 | 
 21 | def calc_ci(stdev, n, level_perc=95):
 22 |     t1 = critlevel(n, level_perc)
 23 |     se = stdev / math.sqrt(n)
 24 |     zt = t1*se
 25 |     return zt
 26 | # end  ------------------------------------------
 27 | 
 28 | 
# Factory for an arbitrarily deep auto-vivifying dict: every missing key
# yields another defaultdict built by this same factory.
nested_dict = lambda: collections.defaultdict(nested_dict)
 30 | 
 31 | def get_layer1_folders(path):
 32 |     d = path
 33 |     print(path)
 34 |     return filter(lambda x: os.path.isdir(os.path.join(d, x)), os.listdir(d))
 35 | 
 36 | 
 37 | def get_layer1_files(mypath):
 38 |     print(mypath)
 39 |     onlyfiles = [f for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]
 40 |     return onlyfiles
 41 | 
 42 | 
 43 | def create_dirs_on_path(filepath):
 44 |     if not os.path.exists(os.path.dirname(filepath)):
 45 |         try:
 46 |             os.makedirs(os.path.dirname(filepath))
 47 |         except OSError as exc: # Guard against race condition
 48 |             if exc.errno != errno.EEXIST:
 49 |                 raise
 50 | 
 51 | 
 52 | def replace_string_based_on_map(ss, mp):
 53 |     ss1 = ss
 54 |     for kw in mp:
 55 |         ss1 = ss1.replace(kw, mp[kw])
 56 | 
 57 |     return ss1
 58 | 
 59 | 
 60 | def read_file_to_string(filepath):
 61 |     #print os.getcwd()
 62 |     with open(filepath, 'r') as myfile:
 63 |         data=myfile.read()
 64 | 
 65 |     return data
 66 | 
 67 | 
 68 | def get1Minput(path):
 69 |     for subdir, dirs, files in os.walk(path):
 70 |         #print files
 71 |         for ff in files:
 72 |             if ff.find('1MB') != -1:
 73 |                 return ff
 74 | 
 75 | 
 76 | def get_anml(path):
 77 |     for subdir, dirs, files in os.walk(path):
 78 |         #print files
 79 |         for ff in files:
 80 |             if ff.endswith('.anml'):
 81 |                 return os.path.abspath(os.path.join(path, ff))
 82 | 
 83 | def get_hs(path):
 84 |     for subdir, dirs, files in os.walk(path):
 85 |         #print files
 86 |         for ff in files:
 87 |             if ff.endswith('.hs'):
 88 |                 return os.path.abspath(os.path.join(path, ff))
 89 | 
 90 | 
 91 | def get_file_path(path, suffix):
 92 |     files = get_layer1_files(path)
 93 | 
 94 |     res = []
 95 |     for f in files:
 96 |         assert(os.path.isfile(os.path.join(path, f)))
 97 | 
 98 |         filename_wo_ext = os.path.splitext(f)[0]
 99 |         if filename_wo_ext.endswith(suffix):
100 |             res.append(os.path.abspath(os.path.join(path, f)))
101 | 
102 |     assert(len(res) == 1)
103 | 
104 |     return res[0]
105 | 
106 | 
107 | 
108 | 
def mean_confidence_interval(data, confidence=0.95):
    """Return (mean, lower, upper) of the confidence interval of the mean.

    data       -- sequence of numeric samples.
    confidence -- two-sided confidence level as a fraction (default 0.95).

    The half-width is the standard error of the mean times the Student t
    quantile with len(data) - 1 degrees of freedom.
    """
    # numpy is not imported at module level in this file; import it here so
    # the function no longer fails with NameError on `np`.
    import numpy as np

    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
    return m, m-h, m+h
115 | 
116 | 
117 | 
if __name__ == '__main__':
    # just a few tests.
    # Ad-hoc smoke checks that only print results; they expect a
    # '../benchmarks' tree (with per-app 'inputs' folders) relative to the
    # current working directory.
    print(get_layer1_folders('../benchmarks'))
    print(get_layer1_files('../benchmarks/Brill/inputs'))
    for app in get_layer1_folders('../benchmarks'):
        print(get_file_path('../benchmarks/%s/inputs' % app, '1MB'))
124 |     
125 |     


--------------------------------------------------------------------------------
/code/src/asyncap/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # add_custom_target(runahead
2 | #     COMMAND make -C ${CMAKE_CURRENT_SOURCE_DIR}
3 | #     COMMENT "Running Makefile in Runahead"
4 | # )
5 | 
6 | 


--------------------------------------------------------------------------------
/code/src/asyncap/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: directories
 2 | 
 3 | 
 4 | #ifeq ($(CXXFLAGS),)
 5 | CC = g++
 6 | AR = ar
 7 | NVCC=/usr/local/cuda/bin/nvcc
 8 | SM=sm_80
 9 | 
10 | # -I../mnrl/include
11 | CXXFLAGS= -std=c++14 -O3 -Iinclude -I../../include  -I../../include/commons -I../../include/pugixml -I../../include/gpunfautils -w
12 | CUDA_INCLUDE=-I/usr/local/cuda/include
13 | NVCCFLAGS+= $(CUDA_INCLUDE) -O3 -D_FORCE_INLINES -arch ${SM}  --default-stream per-thread --compiler-options -Wno-deprecated -lineinfo  -cudart shared -rdc=true -use_fast_math -extra-device-vectorization -restrict 
14 | # -Xptxas -v
15 | #endif
16 | 
17 | #NVCCFLAGS += -I../cub #-I../obat/include
18 | 
19 | objects := obj/run_ahead_approach.o obj/main.o
20 | libs_objects = obj/run_ahead_approach.o
21 | 
22 | LDFLAGS= -L../../build/lib -lgpunfacommons -lgpunfautils -lpthread -ltbb
23 | 
24 | 
25 | all: directories bin/asyncap
26 | 
27 | directories:
28 | 	mkdir -p obj
29 | 	mkdir -p export_lib
30 | 	mkdir -p bin
31 | 
32 | export_lib/libgpunfa_runahead.so: $(libs_objects)
33 | 	$(NVCC)  ${NVCCFLAGS} ${CXXFLAGS}  -shared  --compiler-options '-fPIC' ${libs_objects}  ${LDFLAGS} -o $@ 
34 | 
35 | 
36 | # bin/asyncap: ${objects} export_lib/libgpunfa_runahead.so
37 | bin/asyncap: ${objects}
38 | 	$(NVCC) ${NVCCFLAGS}   ${CXXFLAGS} ${objects} ${LDFLAGS} -o $@ 
39 | 	cp -r bin/* ../../build/bin/
40 | 
41 | obj/run_ahead_approach.o: 
42 | 	$(NVCC) -c ${CXXFLAGS} ${NVCCFLAGS} --shared --compiler-options '-fPIC'  src/run_ahead_approach.cu ${LDFLAGS} -o $@
43 | 
44 | obj/main.o:
45 | 	nvcc -c ${CXXFLAGS} ${NVCCFLAGS} src/main.cu ${LDFLAGS} -o $@
46 | 
47 | clean:
48 | 	rm -rf obj
49 | 	rm -rf export_lib
50 | 
51 | 


--------------------------------------------------------------------------------
/code/src/commons/SymbolStream.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * SymbolStream.cpp
  3 |  *
  4 |  *  Created on: May 1, 2018
  5 |  *      Author: hyliu
  6 |  */
  7 | 
  8 | #include "SymbolStream.h"
  9 | 
 10 | #include <string>
 11 | #include <set>
 12 | #include <vector>
 13 | #include <iostream>
 14 | #include <fstream>
 15 | #include <iterator>
 16 | #include <sstream>
 17 | #include <cassert>
 18 | #include <cstdlib>
 19 | #include <time.h>
 20 | 
 21 | using std::string;
 22 | using std::set;
 23 | using std::vector;
 24 | using std::endl;
 25 | using std::cout;
 26 | using std::ios;
 27 | 
 28 | 
 29 | SymbolStream::SymbolStream() {
 30 | }
 31 | 
 32 | SymbolStream::~SymbolStream() {
 33 | }
 34 | 
 35 | 
 36 | const set<uint8_t>&  SymbolStream::calc_alphabet() {
 37 | 	this->alphabet.clear();
 38 | 	for (int i = 0; i < input.size(); i++) {
 39 | 		alphabet.insert(input[i]);
 40 | 	}
 41 | 
 42 | 	//cout << "size_of_alphabet = " << alphabet.size() << endl;
 43 | 	return alphabet;
 44 | }
 45 | 
 46 | 
// Returns the number of symbols currently held in `input`
// (input.size() converted to int).
int SymbolStream::get_length() const {
    return input.size();
}
 50 | 
 51 | 
 52 | /**
 53 |  * From VASim
 54 |  */
 55 | static vector<unsigned char> file2CharVector(string fn) {
 56 | 
 57 |     // open the file:
 58 |     std::ifstream file(fn, std::ios::binary);
 59 |     if(file.fail()){
 60 |         if(errno == ENOENT) {
 61 |             cout<< " Error: no such input file." << endl;
 62 |             exit(-1);
 63 |         }
 64 |     }
 65 | 
 66 |     // get its size:
 67 |     std::streampos fileSize;
 68 | 
 69 |     file.seekg(0, std::ios::end);
 70 |     fileSize = file.tellg();
 71 |     file.seekg(0, ios::beg);
 72 | 
 73 |     // Stop eating new lines in binary mode!!!
 74 |     file.unsetf(std::ios::skipws);
 75 | 
 76 |     // reserve capacity
 77 |     std::vector<unsigned char> vec;
 78 |     vec.reserve(fileSize);
 79 | 
 80 |     // read the data:
 81 |     vec.insert(vec.begin(),
 82 |                std::istream_iterator<unsigned char>(file),
 83 |                std::istream_iterator<unsigned char>());
 84 | 
 85 |     return vec;
 86 | 
 87 | }
 88 | 
 89 | static vector<unsigned char> generateRandomCharVector() {
 90 |   std::string path = "./random.txt";
 91 |   std::ofstream randomFile(path);
 92 |   std::vector<unsigned char> vec;
 93 |   std::srand(time(0));
 94 | 
 95 |   printf("Save random file to %s\n", path.c_str());
 96 |   if (randomFile.is_open()) {
 97 |     for (int i = 0; i < 1000050; i++) {
 98 |       char c = rand() % 256;
 99 |       vec.push_back(c);
100 |       randomFile << c;
101 |     }
102 |     randomFile.close();
103 |   }
104 |   return vec;
105 | }
106 | 
107 | void SymbolStream::readFromFile(string filename) {
108 |     string input_fn = filename;
109 |     bool randomInput = false;
110 |     vector<unsigned char> input2;
111 | 
112 |     auto hasEnding = [](std::string const &fullString,
113 |                         std::string const &ending) -> bool {
114 |       if (fullString.length() >= ending.length()) {
115 |         return (0 == fullString.compare(fullString.length() - ending.length(),
116 |                                         ending.length(), ending));
117 |       } else {
118 |         return false;
119 |       }
120 |     };
121 | 
122 |     if (hasEnding(input_fn, "random")) {
123 |       randomInput = true;
124 |       std::cout << "generate random input\n";
125 |       input2 = generateRandomCharVector();
126 |     } else {
127 |       cout << "read input stream from file = " << input_fn << endl;
128 |       input2 = file2CharVector(input_fn);
129 |     }
130 | 
131 |     input.clear();
132 | 
133 |     // copy bytes to unsigned ints
134 |     uint32_t counter = 0;
135 | 
136 |     for(uint8_t val : input2){
137 |     	input.push_back(val);
138 |     }
139 | 
140 |     cout << "input_stream_size = " << input.size() << endl;
141 | 
142 |     this->fromFile = filename;
143 | }
144 | 
145 | 
// Returns the symbol stored at index `pos` of `input`.
// No bounds check is performed here (unlike set_position, which asserts).
uint8_t SymbolStream::get_position(int pos) const {
	return this->input[pos];
}
149 | 
// Overwrites the symbol at index `pos` with `c`.
// Asserts that 0 <= pos < size(); the check disappears when NDEBUG is set.
void SymbolStream::set_position(int pos, uint8_t c) {
    assert(pos >= 0 && pos < size());
    this->input[pos] = c;
}
154 | 
155 | SymbolStream SymbolStream::slice(int start, int len) const {
156 |     SymbolStream res;
157 |     
158 |     assert(start >= 0);
159 |     assert(len >= 0);
160 |     assert(start < this->input.size());
161 | 
162 |     if (start + len > this->input.size()) {
163 |         len = this->input.size() - start;
164 |         cout << "the input is shorter than the length specified, just slice to end" << endl;
165 |     }
166 | 
167 |     assert(start + len <= this->input.size());
168 |     
169 |     for (int i = start; i < start + len; i++) {
170 |         res.input.push_back(this->input[i]);
171 |     }
172 | 
173 |     return res;
174 | }
175 | 
176 | 
177 | 
178 | 
179 | void SymbolStream::padding_to_base(int base) {
180 |     if (base <= 0) {
181 |         return;
182 |     }
183 | 
184 |     while (this->size() % base != 0) {
185 |         this->input.push_back( (uint8_t) 0);
186 |     }
187 | }


--------------------------------------------------------------------------------
/code/src/commons/common_func.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "common_func.h"
 3 | 
 4 | #include "NFA.h"
 5 | #include <unordered_map>
 6 | #include <iostream>
 7 | #include <string>
 8 | #include <algorithm>
 9 | #include <cassert>
10 | #include <queue>
11 | #include <stack>
12 | #include <memory>
13 | #include <map>
14 | #include <vector>
15 | #include <iomanip>
16 | #include <fstream>
17 | #include <climits>
18 | #include "nfa_utils.h"
19 | 
20 | #include <sys/types.h>
21 | #include <sys/stat.h>
22 | 
23 | 
24 | 
25 | using std::ifstream;
26 | using std::string;
27 | using std::endl;
28 | using std::cout;
29 | using std::pair;
30 | 
31 | 
32 | 
33 | 
34 | 
35 | void tools::create_path_if_not_exists(string path) {
36 | 		struct stat info;
37 | 
38 | 		if( stat( path.c_str(), &info ) != 0 )
39 | 		{
40 | 		    printf( "cannot access %s\n", path.c_str() );
41 | 
42 | 		    const int dir_err = mkdir(path.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
43 | 			if (-1 == dir_err)
44 | 			{
45 |     			printf("Error creating directory!n");
46 |     			exit(1);
47 | 			} else {
48 | 				puts("success create dir ");
49 | 			}
50 | 
51 | 		}
52 | 
53 | 		else if( info.st_mode & S_IFDIR )  // S_ISDIR() doesn't exist on my windows  
54 | 	    {
55 | 	    	printf( "%s is a directory\n", path.c_str() );
56 | 	    		
57 | 	    }
58 | 		else
59 | 	    {
60 | 	    	printf( "%s is no directory\n", path.c_str() );
61 | 	    	exit(-1);
62 | 	    }
63 | 
64 | }


--------------------------------------------------------------------------------
/code/src/commons/node.cpp:
--------------------------------------------------------------------------------
  1 | #include "node.h"
  2 | #include <string>
  3 | #include <vector>
  4 | #include <map>
  5 | #include <list>
  6 | #include <unordered_map>
  7 | #include <bitset>
  8 | #include <memory>
  9 | #include <set>
 10 | #include <iostream>
 11 | #include "vasim_helper.h"
 12 | 
 13 | using std::cout;
 14 | using std::endl;
 15 | 
 16 | using namespace VASim;
 17 | 
 18 | using std::set;
 19 | using std::unique_ptr;
 20 | using std::bitset;
 21 | using std::string;
 22 | using std::map;
 23 | using std::vector;
 24 | using std::list;
 25 | using std::unordered_map;
 26 | using std::pair;
 27 | using std::make_pair;
 28 | 
 29 | 
 30 | 
 31 | Node::Node() {
 32 | 
 33 | 	str_id = "";
 34 | 	sid = -1;
 35 | 	symbol_set.reset();
 36 | 	symbol_set_str = "";
 37 | 	start = 0;
 38 | 	str_id = "";
 39 | 	cc_id = 0;
 40 | 	
 41 | 	scc_id = -1;
 42 | 	topo_order = -1;
 43 | 
 44 | 	this->original_id = "undefined";
 45 | 
 46 | 	report = false;
 47 | 
 48 | 	complete = false;
 49 | 	complement = false;
 50 | 
 51 | 
 52 | 	this->hot_degree = 0.0;
 53 | 
 54 | 	
 55 | 	cg_id = -1;
 56 | 
 57 | 	visited = false;
 58 | }
 59 | 
 60 | Node::~Node() {
 61 | }
 62 | 
 63 | 
// True when `start` equals NODE_START_ENUM::START_ALWAYS_ENABLED.
bool Node::is_start_always_enabled() const {
	return (start == NODE_START_ENUM::START_ALWAYS_ENABLED);
}
 67 | 
// True when `start` is either START or START_ALWAYS_ENABLED.
bool Node::is_start() const {
	return (start == NODE_START_ENUM::START || start == NODE_START_ENUM::START_ALWAYS_ENABLED);
}
 71 | 
// Returns the `report` flag of this node.
bool Node::is_report() const {
	return report;
}
 75 | 
 76 | 
// Passes `symbol_set` and `symbol_set_str` to parseSymbolSet().
// NOTE(review): presumably this fills the bitset from the textual symbol
// set; confirm against parseSymbolSet in vasim_helper.h.
void Node::symbol_set_to_bit() {
	parseSymbolSet(this->symbol_set, this->symbol_set_str);
}
 80 | 
 81 | 
 82 | 
// True when every bit of `symbol_set` is set, i.e. the node accepts every
// symbol the set can represent.
bool Node::is_wildcard() const {
	return symbol_set.all();
}
 86 | 
 87 | 
 88 | 
// if the symbol set is a reverse of one symbol, we classify this to not type. 
// NOTE(review): the code returns true when exactly ONE bit of symbol_set is
// set. A complement of a single symbol would have all bits but one set, so
// the comment above and the check below may disagree; confirm which is
// intended (possibly in combination with the `complement` flag).
bool Node::is_not_type_node() const {
	return (symbol_set.count() == 1);
} 
 93 | 
 94 | 
 95 | 
// Returns the number of bits set in `symbol_set`.
int Node::num_of_accept_symbol() const {
	return (symbol_set.count());
}
 99 | 
100 | 
101 | 
102 | 
103 | void Node::remap_alphabet(const map<int, int> &remap_table) {
104 | 	bitset<256> remapped_symbol_set;
105 | 	remapped_symbol_set.reset();
106 | 	for (auto it : remap_table) {
107 | 		int k = it.first;
108 | 		int v = it.second;
109 | 		//cout << "remap_alphabet_node " << k << " " << v << endl;
110 | 
111 | 		if (this->symbol_set.test(k)) {
112 | 			remapped_symbol_set.set(v);
113 | 		}
114 | 	}
115 | 
116 | 	this->symbol_set = remapped_symbol_set;
117 | 
118 | }
119 | 
120 | 
121 | 
122 | int Node::num_of_1_in_matchset() const {
123 | 	int n = 0; 
124 | 	for (int i = 0; i < 256; i++) {
125 | 		auto symbol = (uint8_t) i;
126 | 		if (this->match2(i)) {
127 | 			n++;
128 | 		}
129 | 	}
130 | 	return n;
131 | }


--------------------------------------------------------------------------------
/code/src/commons/report_formatter.cpp:
--------------------------------------------------------------------------------
 1 | #include "report_formatter.h"
 2 | 
 3 | #include <vector>
 4 | #include <algorithm>
 5 | #include <string>
 6 | #include <fstream>
 7 | #include <iostream>
 8 | #include <execution>
 9 | 
10 | using std::cout;
11 | using std::endl;
12 | using std::vector;
13 | using std::string;
14 | 
// Builds a report record by copying the four arguments into the members of
// the same name: offset, str_id, cc and input_stream_id.
report::report(int offset, string str_id, int cc, int input_stream_id):
offset(offset),
str_id(str_id),
cc(cc),
input_stream_id(input_stream_id)
{
}
22 | 
23 | report_formatter::report_formatter() {}
24 | 
// Appends `rp` to the list of collected reports.
void report_formatter::add_report(report rp) {
	this->reports.push_back(rp);
}
28 | 
29 | 
// Sorts the collected reports in place (parallel execution policy) and,
// when `unique1` is true, removes adjacent duplicates. The report counts
// are printed to stdout.
// NOTE: despite the name, nothing is written to `filename` at present; the
// file output below is commented out, so `filename` is unused.
void report_formatter::print_to_file(string filename, bool unique1) {
	cout << "report_fomatter_print_to_file_num_report = " << reports.size() << endl;
	std::sort(std::execution::par_unseq, reports.begin(), reports.end());

	if (unique1) {
		// std::unique only drops consecutive equal elements, which is why
		// the sort above has to run first.
		reports.erase( std::unique(std::execution::par_unseq, reports.begin(), reports.end() ), reports.end() );
		cout << "report_fomatter_print_to_file_unique_num_report = " << reports.size() << endl;	
	}

	// std::ofstream out(filename);
	// cout<<filename<<"\n";

	// for (auto it : reports) {
	// 	cout << it.offset << "\t" << it.str_id << endl; 
	// }


	// out.close();

}


--------------------------------------------------------------------------------
/code/src/gpunfautils/abstract_gpunfa.cu:
--------------------------------------------------------------------------------
 1 | #include "gpunfautils/abstract_gpunfa.h"
 2 | 
 3 | #include "gpunfautils/array2.h"
 4 | 
 5 | 
 6 | 
 7 | abstract_algorithm::abstract_algorithm(NFA *nfa) : 
 8 | nfa(nfa), 
 9 | padding_input_stream(4), 
10 | max_cc_size_limit(-1),
11 | read_input(true)
12 | {
13 | 
14 | 	block_size = 256;
15 | 
16 | 	output_file = "reports.txt";
17 | 	
18 | 	this->report_on = true;
19 | 	
20 | 	// set default alphabet
21 | 	this->alphabet.clear();
22 | 
23 | 	for (int i = 0; i < 256; i++) {
24 | 		this->alphabet.insert( (uint8_t) i );
25 | 	}
26 | 	
27 | }
28 | 
29 | 
30 | abstract_algorithm::~abstract_algorithm() {
31 | }
32 | 
// Replaces the current alphabet with a copy of `alphabet`.
void abstract_algorithm::set_alphabet(set<uint8_t> alphabet) {
	this->alphabet = alphabet;
}
36 | 
// Returns a reference to the i-th registered symbol stream.
// Asserts 0 <= i < symbol_streams.size(); unchecked when NDEBUG is set.
const SymbolStream& abstract_algorithm::get_symbol_stream(int i) const {
	assert(i >= 0 && i < symbol_streams.size() );
	
	return symbol_streams[i];
}
42 | 
43 | 
// Appends a copy of `ss` to the list of input streams.
void abstract_algorithm::add_symbol_stream(SymbolStream ss) {
	symbol_streams.push_back(ss);
}
47 | 
48 | 
// Overrides the block size (the constructor default is 256).
void abstract_algorithm::set_block_size(int block_size) {
	this->block_size = block_size;

}
53 | 
54 | 
55 | Array2<uint8_t> *abstract_algorithm::concat_input_streams_to_array2() {
56 |   assert(symbol_streams.size() > 0);
57 | 
58 |   // cout << "padding_input_stream = " << this->padding_input_stream << endl;
59 | 
60 |   // for (int i = 0; i < this->symbol_streams.size(); i++) {
61 |   //   symbol_streams[i].padding_to_base(this->padding_input_stream);
62 |   // }
63 | 
64 |   int length = symbol_streams[0].get_length();
65 | 
66 |   for (auto ss : symbol_streams) {
67 |     assert(length == ss.get_length());
68 |   }
69 | 
70 |   auto input = new Array2<uint8_t>(symbol_streams.size() * length);
71 | 
72 |   int t = 0;
73 | 
74 |   for (auto ss : symbol_streams) {
75 |     for (int p = 0; p < ss.get_length(); p++) {
76 |       input->set(t++, ss.get_position(p));
77 |     }
78 |   }
79 | 
80 |   return input;
81 | }
82 | 
83 | 
84 | 
85 | 
86 | 
87 | 
88 | 


--------------------------------------------------------------------------------
/code/src/gpunfautils/common.cpp:
--------------------------------------------------------------------------------
 1 | #include "common.h"
 2 | 
 3 | #include <unordered_map>
 4 | #include <iostream>
 5 | #include <string>
 6 | #include <algorithm>
 7 | #include <cassert>
 8 | #include <queue>
 9 | #include <stack>
10 | #include <memory>
11 | #include <map>
12 | #include <vector>
13 | #include <iomanip>
14 | #include <fstream>
15 | 
16 | 
17 | using std::string;
18 | 
19 | 
20 | 
21 | std::ostream& operator<<(std::ostream& os, const match_pair &obj) {
22 |     os << obj.symbol_offset << " " << obj.state_id << ' ';
23 |     return os;
24 | }
25 | 
26 | 


--------------------------------------------------------------------------------
/code/src/infant/device_funcs.h:
--------------------------------------------------------------------------------
 1 | #ifndef INFANT_KERNELS_DEV_FUNC_H_
 2 | #define INFANT_KERNELS_DEV_FUNC_H_
 3 | 
 4 | 
 5 | 
 6 | #define  OUTPUT_BUFFER_TB  256
 7 | 
 8 | __device__ inline bool get_bit(int *arr, int len, int n_bit) {
 9 |     int n_cell = n_bit / (sizeof(int) * 8);
10 |     int offset = n_bit % (sizeof(int) * 8);
11 | 
12 |     return arr[n_cell] & (1 << offset);
13 | }
14 | 
15 | 
16 | 
// Set bit `n_bit` of a bit vector stored as an array of int cells.
// The cell is updated with atomicOr, so bits of the same cell can be set
// from several threads without losing updates.
//   arr   - the cells; bit k lives in arr[k / bits-per-int]
//   len   - not used by this function
//   n_bit - index of the bit to set
__device__ inline void set_bit(int *arr, int len, int n_bit) {
    const int cell_index  = n_bit / (sizeof(int) * 8);
    const int bit_in_cell = n_bit % (sizeof(int) * 8);
    const int mask = (1 << bit_in_cell);

    atomicOr(&arr[cell_index], mask);
}
24 | 
25 | 
// Test bit `n_bit` of a single integral value.
//   ele   - the value to inspect
//   n_bit - bit position, 0 = least significant
// Returns true when the bit is set.
//
// The element is shifted rather than an `int` literal: `1 << n_bit` is an
// int-width mask, so for a 64-bit T any n_bit >= 32 would be an out-of-range
// shift and test the wrong bit.
template <class T>
__device__ inline bool get_bit_single(T ele, int n_bit) {
	return ((ele >> n_bit) & 1) != 0;
}
30 | 
31 | 
32 | #endif


--------------------------------------------------------------------------------
/code/src/infant/infant.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * infant.h
  3 |  *
  4 |  *  Created on: May 16, 2018
  5 |  *      Author: hyliu
  6 |  */
  7 | 
  8 | #ifndef INFANT_H_
  9 | #define INFANT_H_
 10 | 
 11 | #include <algorithm>
 12 | #include <memory>
 13 | #include "commons/NFA.h"
 14 | #include "commons/SymbolStream.h"
 15 | #include <string>
 16 | 
 17 | #include "gpunfautils/array2.h"
 18 | #include <fstream>
 19 | #include "infant_kernels.h"
 20 | #include "infant_config.h"
 21 | #include "gpunfautils/abstract_gpunfa.h"
 22 | 
 23 | using std::string;
 24 | using std::unique_ptr;
 25 | using std::pair;
 26 | 
 27 | 
// Transition table of one NFA, organised per input symbol.
//
// The object keeps *references* to the NFA and to the alphabet it was built
// from (see the private members), so both must outlive the table.
// NOTE(review): the raw pointers below (transitions, enabled_bitvec,
// always_enabled) are presumably allocated in the constructor and released in
// the destructor; the class declares no copy constructor, so copying an
// instance would share them — confirm against the .cu/.cpp implementation.
class AlphabetBasedTransitionTable {
public:
	AlphabetBasedTransitionTable(const NFA& nfa, const set<uint8_t>& alphabet);
	~AlphabetBasedTransitionTable();

	void print_basic_stats();

	void init_state_vector();

	int get_transition_table_length() const;

	// Transition list as pairs of int.
	// NOTE(review): presumably (source state, destination state) — confirm.
	const pair<int, int> *get_transitions() const;

	// Per-symbol arrays with 256 entries each (see `len` / `index` below).
	const int *get_len() const;
	const int *get_index() const;

	const NFA& get_according_NFA() const;

	int get_length_of_state_bitvec() const;
	const int *get_enabled_bitvec() const;

	// Statistic: returns the stored max_edge_list_length_of_symbol.
	int get_max_edge_list_length_of_symbol() const {
		return max_edge_list_length_of_symbol;
	}

	// Statistic: stored sum of edge-list lengths divided by the alphabet size.
	// The alphabet must be non-empty (asserted).
	double get_avg_edge_list_length_of_symbol() const {
		assert(alphabet.size() > 0);
		return this->sum_edge_list_length_of_symbol / alphabet.size();
	}

private:
	const NFA& nfa; 
	const set<uint8_t> &alphabet;
	
	pair<int, int> * transitions;
	int len[256];    // symbol_trans_len: one entry per possible byte value
	int index[256];  // presumably symbol_trans_index (the original comment repeated "symbol_trans_len") — TODO confirm

	int num_transitions;
	int transition_table_length;

	int V; 

	int *enabled_bitvec; 
	int state_bitvec_length; 

	bool *always_enabled;


// statistics
	int max_edge_list_length_of_symbol;
	double sum_edge_list_length_of_symbol;

};
 82 | 
 83 | 
 84 | 
 85 | 
// iNFAnt NFA-matching algorithm, built on abstract_algorithm.
//
// Holds a list of NFAs with one AlphabetBasedTransitionTable per NFA, plus the
// Array2 buffers that carry the tables, input streams and state vectors.
// NOTE(review): get_symbol_stream, add_symbol_stream and set_alphabet are also
// defined by abstract_algorithm; the declarations here hide (or override, if
// the base ones are virtual) those versions — confirm which is intended.
class iNFAnt : public abstract_algorithm {
public:
	vector<NFA *> old_ccs;
	// Set through set_option(); stores the address of the caller's object, so
	// that object must stay alive while this instance uses it.
	infant_config* opt;

	iNFAnt(NFA *nfa);
	~iNFAnt();

	int get_num_nfa() const;

	const AlphabetBasedTransitionTable& get_trans_table(int i) const;
	//void add_transition_table(AlphabetBasedTransitionTable *tt);
	void add_NFA(NFA *nfa);
	const NFA *get_NFA(int index) const;


	const SymbolStream& get_symbol_stream(int i) const;
	void add_symbol_stream(SymbolStream ss);

	// Host-side preparation steps.
	void init_host_transition_tables();
	void prepare_host_state_info();
	void prepare_host_input_streams();
	
	void prepare_state_vector();

	void allocate_device_data_structures();

	void calc_state_bitvec_length();

	const AlphabetBasedTransitionTable *get_transition_table(int k) const;

	
	void set_alphabet(set<uint8_t> alphabet);

	void copy_to_device();

	void launch_kernel() override;

	void to_reports() const;

	void set_num_state_per_group(int num_state_per_group) {
		this->num_state_per_group = num_state_per_group;
	}
	// Keeps a pointer to `opt`; no copy is made.
	void set_option(infant_config &opt){
		this->opt = &opt;
	}

private:
	int num_state_per_group;

	vector<NFA *> nfas;
	vector<AlphabetBasedTransitionTable * > transition_tables; // equals to num_nfa

	// --------------------------------------------------------------------
	Array2<int> *arr_src_table; 
	Array2<int> *arr_dst_table; 


	Array2<int> *arr_start_position_transition_tables; // length equals to num_nfa

    Array2<int>  *arr_symbol_trans_len;
	Array2<int>  *arr_symbol_trans_index;

    Array2<char> *arr_states_status; // 00    is always enabled; is output;  
    Array2<int>  *arr_state_start_position;
    Array2<int>  *num_of_state_per_tb;

	// input streams
	Array2<uint8_t> *arr_input_streams;
	Array2<uint8_t> *arr_input_streams2;
	//int input_stream_length,
	
	// state vector
	Array2<int> *arr_enabled_bitvec;
	int state_bitvec_length; // num of int per block
	
	// output processing
	// NOTE(review): element type is int here, while infant_kernel_one_output
	// takes `unsigned long long int *match_count` — confirm this buffer is not
	// the one passed to that parameter.
	Array2<int> 		*arr_match_count;


};
167 | 
168 | 
169 | #endif /* INFANT_H_ */
170 | 
171 | 
172 | 
173 | 
174 | 
175 | 


--------------------------------------------------------------------------------
/code/src/infant/infant_config.h:
--------------------------------------------------------------------------------
 1 | #ifndef INFANT_CONFIG
 2 | #define INFANT_CONFIG
 3 | 
 4 | #include "commons/common_func.h"
 5 | 
 6 | using namespace clara;
 7 | 
 8 | class infant_config : public common_gpunfa_options {
 9 | public:
10 |     infant_config() : common_gpunfa_options()               
11 |     {
12 |         this->num_state_per_group = this->block_size;
13 | 
14 |         auto additional_parser =
15 |             Opt(num_state_per_group,
16 |                 "num_state_per_group")["--num-state-per-group"](
17 |                 "number of state per group in infant. ") |
18 |             Opt(validation,
19 |                 "validation")["--validation"]("fake validation (do nothing)");
20 | 
21 |         parser = parser | additional_parser;
22 |     }
23 | 
24 |     int num_state_per_group;
25 |     bool validation;
26 | };
27 | 
28 | 
29 | #endif
30 | 
31 | 
32 | 
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/code/src/infant/infant_kernels.h:
--------------------------------------------------------------------------------
 1 | #ifndef INFANT_KERNELS_H_
 2 | #define INFANT_KERNELS_H_
 3 | 
 4 | #include "gpunfautils/common.h"
 5 | #include <cuda.h>
 6 | 
 7 | 
/**
*
*  iNFAnt kernel variant that stores all matches into ONE output array.
*
*  Every thread block receives the same match_array / match_count pair, so
*  thread blocks presumably have to compete on match_count when appending
*  (the original note was truncated after "compete on matc" — TODO confirm).
*
*/
__global__ void infant_kernel_one_output(
	int  *src_table, 
	int  *dst_table, 
	int  *trans_table_start_position, 
    int  *symbol_trans_len,
	int  *symbol_trans_index,

    char *states_status, // 00    is always enabled; is output;  
    int  *state_start_position,
    int  *num_of_states_per_tb,

	// input streams
	uint8_t *input_streams,
	int input_stream_length,
	
	// state vector
	int *enabled_bitvec,
	//int *active_bitvec,
	int state_bitvec_length, // num of int per block

	// output processing
	match_entry  *match_array, // fixed size for each thread block,
	unsigned long long int match_array_capacity,
	unsigned long long int 	*match_count,
	bool report_on // NOTE(review): presumably disables match recording when false — confirm in the kernel body


);
43 | 
44 | 
45 | #endif /* INFANT_KERNELS_H_ */
46 | 
47 | 
48 | 
49 | 
50 | 
51 | 


--------------------------------------------------------------------------------
/code/src/ngap/kernel.h:
--------------------------------------------------------------------------------
 1 | #ifndef NGAP_KERNELS_H_
 2 | #define NGAP_KERNELS_H_
 3 | 
 4 | #include "graph.h"
 5 | #include "group_graph.h"
 6 | #include "my_bitset.h"
 7 | #include "ngap_buffer.h"
 8 | 
// Kernel declarations of the ngap engine. All kernels take the same argument
// list apart from the buffer type: a (Non)BlockingBuffer passed by value, the
// input bytes with their size, and the per-group NFA data
// (GroupMatchset, GroupNodeAttrs, GroupAAS, GroupCsr).
// The short tags (O0, NAP, O1, ...) name the optimisation level each kernel
// implements.
// NOTE(review): template definitions are not visible in this header; they
// presumably live in the .cu file that instantiates them — confirm.

// O0
__global__ void advanceAndFilterBlockingGroups(
    BlockingBuffer blb, uint8_t *arr_input_streams, int arr_input_streams_size,
    GroupMatchset gms, GroupNodeAttrs gna, GroupAAS gaas, GroupCsr gcsr);

// NAP
template <bool unique, bool record_fs>
__global__ void advanceAndFilterNonBlockingNAPGroups(
    NonBlockingBuffer nblb, uint8_t *arr_input_streams,
    int arr_input_streams_size, GroupMatchset gms, GroupNodeAttrs gna,
    GroupAAS gaas, GroupCsr gcsr);

// O1
template <bool unique, bool record_fs>
__global__ void advanceAndFilterNonBlockingGroups(NonBlockingBuffer nblb,
                                                  uint8_t *arr_input_streams,
                                                  int arr_input_streams_size,
                                                  GroupMatchset gms,
                                                  GroupNodeAttrs gna,
                                                  GroupAAS gaas, GroupCsr gcsr);

// O3 (precompute_depth is a compile-time parameter of this variant)
template <bool unique, int precompute_depth, bool record_fs>
__global__ void advanceAndFilterNonBlockingPrecGroups(
    NonBlockingBuffer nblb, uint8_t *arr_input_streams,
    int arr_input_streams_size, GroupMatchset gms, GroupNodeAttrs gna,
    GroupAAS gaas, GroupCsr gcsr);

// O4 (two variants: R1 and R2)
template <bool unique>
__global__ void advanceAndFilterNonBlockingR1Groups(
    NonBlockingBuffer nblb, uint8_t *arr_input_streams,
    int arr_input_streams_size, GroupMatchset gms, GroupNodeAttrs gna,
    GroupAAS gaas, GroupCsr gcsr);

template <bool unique>
__global__ void advanceAndFilterNonBlockingR2Groups(
    NonBlockingBuffer nblb, uint8_t *arr_input_streams,
    int arr_input_streams_size, GroupMatchset gms, GroupNodeAttrs gna,
    GroupAAS gaas, GroupCsr gcsr);

// OA (takes every template switch of the variants above plus adaptive_aas)
template <bool unique, int precompute_depth, bool record_fs, bool adaptive_aas>
__global__ void advanceAndFilterNonBlockingAllGroups(
    NonBlockingBuffer nblb, uint8_t *arr_input_streams,
    int arr_input_streams_size, GroupMatchset gms, GroupNodeAttrs gna,
    GroupAAS gaas, GroupCsr gcsr);
56 | 
57 | #endif


--------------------------------------------------------------------------------
/code/src/ngap/main.cu:
--------------------------------------------------------------------------------
  1 | #include <string>
  2 | 
  3 | #include "NFA.h"
  4 | #include "NFALoader.h"
  5 | #include "SymbolStream.h"
  6 | #include "graph.h"
  7 | #include "kernel.h"
  8 | #include "nfa_utils.h"
  9 | #include "ngap.h"
 10 | #include "ngap_buffer.h"
 11 | 
 12 | #include "ngap_option.h"
 13 | #include "node.h"
 14 | #include "utils.h"
 15 | 
 16 | int main(int argc, char *argv[]) {
 17 |   printf("Command: ");
 18 |   for (int i = 0; i < argc; i++)
 19 |     printf("%s ", argv[i]);
 20 |   printf("\n");
 21 | 
 22 |   ngap_option opt;
 23 | 
 24 |   auto result = opt.parse(argc, argv);
 25 | 
 26 |   if (!result) {
 27 |     std::cerr << "Error in command line: " << result.errorMessage()
 28 |               << std::endl;
 29 |     exit(1);
 30 |   }
 31 | 
 32 |   if (opt.showHelp) {
 33 |     cout << opt.getHelp();
 34 |   }
 35 | 
 36 |   std::string automata_filename = opt.nfa_filename;
 37 |   std::string input_filename = opt.input_filename;
 38 |   int start_pos = opt.input_start_pos, input_length = opt.input_len;
 39 |   std::string algo = opt.algorithm;
 40 |   std::string output_file_name = opt.report_filename;
 41 |   int dup_input_stream = opt.duplicate_input_stream;
 42 |   unsigned long long int one_output_capacity = opt.output_capacity;
 43 |   int block_size = opt.block_size;
 44 |   int max_size_of_cc = opt.max_nfa_size;
 45 |   int split_entire_inputstream_to_chunk_size = opt.split_chunk_size;
 46 | 
 47 |   SymbolStream ss, old_ss;
 48 |   old_ss.readFromFile(input_filename);
 49 |   if (start_pos != -1 && input_length != -1) {
 50 |     assert(start_pos >= 0);
 51 |     old_ss = old_ss.slice(start_pos, input_length);
 52 |   }
 53 |   // cout << "input_stream_size = " << ss.size() << endl;
 54 |   auto ab = old_ss.calc_alphabet();
 55 | 
 56 |   auto nfa = load_nfa_from_file(automata_filename);
 57 |   nfa_utils::print_nfa_info(nfa);
 58 | 
 59 |   Graph g;
 60 |   // g.ReadANML(automata_filename);
 61 |   g.ReadNFA(nfa);
 62 |   printf("ReadANML finish \n");
 63 |   g.copyToDevice();
 64 | 
 65 |   ngap pl(nfa, g);
 66 |   pl.set_ngap_option(&opt);
 67 |   pl.set_max_cc_size_limit(max_size_of_cc);
 68 |   pl.preprocessing();
 69 |   auto grouped_nfas = nfa_utils::group_nfas_by_num(opt.group_num, pl.ccs);
 70 |   printf("grouped_nfas.size = %zu pl.num_seg=%d\n", grouped_nfas.size(),
 71 |          pl.num_seg);
 72 |   std::vector<Graph *> gs;
 73 |   for (auto nfa : grouped_nfas) {
 74 |     Graph *g = new Graph();
 75 |     g->ReadNFA(nfa);
 76 |     g->copyToDevice();
 77 |     gs.push_back(g);
 78 |   }
 79 |   assert(gs.size() == opt.group_num);
 80 | 
 81 |   cout << "Input Stream Info:\n";
 82 |   cout << "    input_start_pos = " << start_pos << endl;
 83 |   cout << "    input_length = " << input_length << endl;
 84 |   cout << "    split_entire_inputstream_to_chunk_size = "
 85 |        << split_entire_inputstream_to_chunk_size << endl;
 86 |   cout << "    dup_input_stream = " << dup_input_stream << endl;
 87 | 
 88 |   for (int i = 0; i < dup_input_stream; i++) {
 89 |     ss.concat(old_ss);
 90 |   }
 91 |   if (split_entire_inputstream_to_chunk_size > 0) {
 92 |     int sslen = ss.size();
 93 |     int num_seg = sslen / split_entire_inputstream_to_chunk_size;
 94 |     pl.num_seg = num_seg;
 95 |     // cout << "num_seg_" << i << " = " << num_seg << endl;
 96 |     for (int j = 0; j < num_seg; j++) {
 97 |       int start_pos1 = j * split_entire_inputstream_to_chunk_size;
 98 |       auto ss_seg =
 99 |           ss.slice(start_pos1, split_entire_inputstream_to_chunk_size);
100 |       pl.add_symbol_stream(ss_seg);
101 |     }
102 |   }
103 | 
104 |   pl.set_nfa_group(gs);
105 |   pl.set_report_off(opt.report_off, opt.output_capacity,
106 |                     opt.duplicate_input_stream * opt.quick_validation);
107 |   pl.set_output_file(output_file_name);
108 |   pl.set_num_segment_per_ss(1);
109 |   pl.set_output_buffer_size(one_output_capacity);
110 |   pl.set_block_size(block_size);
111 |   pl.set_alphabet(ab);
112 |   pl.prepare_original_input_streams(ss);
113 | 
114 |   if (algo == "blockinggroups") {
115 |     pl.launch_blocking_groups(); // BAP
116 |   } else if (algo == "NAPgroups") {
117 |     pl.launch_non_blocking_nap_groups(); // NAP
118 |   } else if (algo == "nonblockinggroups") {
119 |     pl.launch_non_blocking_groups(); // O1
120 |   } else if (algo == "nonblockingr1groups") {
121 |     pl.launch_non_blocking_r1_groups(); // O4
122 |   } else if (algo == "nonblockingr2groups") {
123 |     pl.launch_non_blocking_r2_groups(); // O4
124 |   } else if (algo == "nonblockingpcgroups") {
125 |     pl.launch_non_blocking_prec_groups(); // O3
126 |   } else if (algo == "nonblockingallgroups") {
127 |     pl.launch_non_blocking_all_groups(); // OA
128 |   } else {
129 |     cout << "not supported algoritm " << algo << endl;
130 |   }
131 | 
132 |   delete nfa;
133 |   for (auto g : gs)
134 |     delete g;
135 |   printf("FINISHED!\n");
136 |   return 0;
137 | }
138 | 


--------------------------------------------------------------------------------
/code/src/ngap/ngap_buffer.h:
--------------------------------------------------------------------------------
  1 | #ifndef NGAP_BUFFER_H_
  2 | #define NGAP_BUFFER_H_
  3 | 
  4 | #include "graph.h"
  5 | #include "my_bitset.h"
  6 | 
  7 | #include "ngap_option.h"
  8 | #include "precompute_table.h"
  9 | #include "utils.h"
 10 | 
 11 | // #define DEBUG_PL_FILTER
 12 | // #define DEBUG_PL_ADVANCE
 13 | // #define DEBUG_SHADOW_BUFFER
 14 | // #define DEBUG_PL_FILTER_ITER
 15 | #define DEBUG_ITER 10000
 16 | // #define DEBUG_MAX_BUFFER_SIZE  // set data_buffer_stream_size large enough
 17 | // #define DEBUG_PL_KERNEL_LAUNCH
 18 | 
 19 | #define DEBUG_PL_ADVANCE_CONCAT // do not comment it out
 20 | 
 21 | // #define DEBUG_FRONTIER_SIZE
 22 | // #define MULTI_BLOCKS
 23 | #define USE_CSR
 24 | // #define USE_PRECOMP_ONCE
 25 | // #define USE_PRECOMP_TWICE
 26 | 
 27 | #ifndef DATA_BUFFER_SIZE
 28 | // #define DATA_BUFFER_SIZE 300000000
 29 | #define DATA_BUFFER_SIZE 1000000000LL
 30 | #endif
 31 | 
 32 | #ifndef DATA_BUFFER_SIZE_FRONTIER
 33 | #define DATA_BUFFER_SIZE_FRONTIER 2000000000
 34 | #endif
 35 | 
 36 | #ifndef RESULTS_SIZE
 37 | #define RESULTS_SIZE 80000000
 38 | #endif
 39 | 
 40 | #define MAX_THREADS_PER_BLOCK 256
 41 | #define MIN_BLOCKS_PER_MP 16
 42 | 
 43 | #define BLOCK_SIZE 256
 44 | 
 45 | // #define PRINT_INDEX_QUEUE
 46 | 
// Buffer bundle used by the blocking kernel (advanceAndFilterBlockingGroups
// takes a BlockingBuffer by value).
// NOTE(review): members prefixed with d_ are presumably device pointers set up
// by init()/init_nfagroups() and freed by release() — confirm in the .cu file.
// "froniter" in the member names is a misspelling of "frontier"; the names are
// kept because other code refers to them.
class BlockingBuffer {
public:
  // Capacities (element counts presumably — TODO confirm units).
  int buffer_capacity;
  int buffer_capacity_per_block;
  unsigned long long int results_capacity;
  bool unique;

  // Work buffer and result storage.
  int *d_buffer;
  int *d_buffer_idx;
  int *d_buffer_size;
  uint64_t *d_results;
  unsigned long long int *d_results_size;

  // Frontier statistics.
  int *d_froniter_length;
  int *d_froniter_end;

  int *d_froniter_divergence_end;
  int *d_froniter_divergence_advance;
  int *d_froniter_divergence_filter;
  int *d_froniter_workload_end;
  int *d_froniter_workload;

  int group_num;
  int num_seg;

  bool motivate_worklist_length;

  bool report_off;

  // Set up the buffers for a single graph.
  __host__ void init(Array2<uint8_t> *input_stream, int input_total_size,
                     int input_num, int multi_ss_size, Graph &graph,
                     ngap_option *plo);
  // Set up the buffers for a list of per-group graphs.
  __host__ void init_nfagroups(Array2<uint8_t> *input_stream,
                               int input_total_size, int input_num,
                               int multi_ss_size, std::vector<Graph *> gs,
                               ngap_option *plo);

  __host__ void release();
};
 86 | 
// Buffer bundle used by the non-blocking kernels (every
// advanceAndFilterNonBlocking*Groups kernel takes a NonBlockingBuffer by
// value).
// NOTE(review): members prefixed with d_ are presumably device pointers set up
// by init()/init_nfagroups()/reset() and freed by release() — confirm in the
// .cu file. "froniter" is a misspelling of "frontier" kept for compatibility.
class NonBlockingBuffer {
public:
  // Capacities are 64-bit here (DATA_BUFFER_SIZE is defined as 1000000000LL).
  long long int buffer_capacity;
  long long int buffer_capacity_per_block;
  unsigned long long int results_capacity;
  int data_buffer_fetch_size = 64;

  // Always-active-state (AAS) insertion schedule.
  int add_aas_start = 1000;
  int add_aas_interval;

  int active_threshold;

  bool unique;
  int unique_frequency;

  // Work buffers (two sets plus a "test" set).
  int *d_buffer;
  int *d_buffer_idx;
  int *d_buffer2;
  int *d_buffer_idx2;

  int *d_buffer_test;
  int *d_buffer_idx_test;
  uint *d_buffer_end_tmp_test;

  uint *d_buffer_start;
  uint *d_buffer_end;
  uint *d_buffer_end_tmp;
  // Result storage.
  uint64_t *d_results;
  uint32_t *d_results_v;
  uint32_t *d_results_i;
  unsigned long long int *d_results_size;
  int *d_symbol_table;
  int *d_newest_idx;

  // Precomputed tables for one and two symbols, with their report variants.
  int *prec_once_offset;
  int *prec_once;
  int *prec_twice_offset;
  int *prec_twice;
  int *prec_once_report_offset;
  int *prec_once_report;
  int *prec_twice_report_offset;
  int *prec_twice_report;

  int *preresult;
  int *preresult_iter;
  int *preresult_size;
  int *preresult_end;

  int *d_fakeiter;
  int *d_fakeiter_size;
  int *d_fakeiter2;
  int *d_fakeiter_size2;
  int d_fakeiter_capacity;
  int *cutoffnum;

  int *d_froniter_length;
  int *d_froniter_end;

  // O3
  PrecTable *h_pts;
  PrecTable *d_pts;
  int precompute_depth = 0;
  int precompute_cutoff;

  int group_num;
  int num_seg;

  bool report_off;

  // Set up the buffers for a single graph.
  __host__ void init(Array2<uint8_t> *input_stream, int input_total_size,
                     int input_num, int multi_ss_size, Graph &graph,
                     ngap_option *plo);
  // Set up the buffers for a list of per-group graphs.
  __host__ void init_nfagroups(Array2<uint8_t> *input_stream,
                               int input_total_size, int input_num,
                               int multi_ss_size, std::vector<Graph *> gs,
                               ngap_option *plo);

  __host__ void release(bool isGroup = false);

  __host__ void reset(Array2<uint8_t> *input_stream, int input_total_size,
                      int multi_ss_size, int group_num, std::vector<Graph *> gs,
                      ngap_option *plo);
};
170 | 
171 | #endif


--------------------------------------------------------------------------------
/code/src/ngap/ngap_option.h:
--------------------------------------------------------------------------------
 1 | #ifndef NGAP_OPTION_H_
 2 | #define NGAP_OPTION_H_
 3 | 
 4 | #include "commons/common_func.h"
 5 | 
 6 | class ngap_option : public common_gpunfa_options {
 7 | public:
 8 |   ngap_option() : common_gpunfa_options() {
 9 |     this->algorithm = "graph";
10 |     this->num_state_per_group = this->block_size;
11 | 
12 |     auto additional_parser =
13 |         Opt(add_aas_start, "start number")["--add-aan-start"](
14 |             "the number of iteration to added always active state before "
15 |             "execution") |
16 |         Opt(add_aas_interval, "interval number")["--add-aas-interval"](
17 |             "the number of iteration to added always active state during "
18 |             "execution") |
19 |         Opt(unique, "true/false")["--unique"]("unique during execution") |
20 |         Opt(active_threshold, "active-threshold")["--active-threshold"](
21 |             "the active thread number to enable work privatization") |
22 |         Opt(validation, "true/false")["--validation"]("enable validation") |
23 |         Opt(use_soa, "true/false")["--use-soa"](
24 |             "change the data layout of NFA topology") |
25 |         Opt(precompute_cutoff, "precompute-cutoff")["--precompute-cutoff"](
26 |             "the threshold for table load balance") |
27 |         Opt(precompute_depth, "precompute-depth")["--precompute-depth"](
28 |             "the prefix length for the memiozation table") |
29 |         Opt(data_buffer_fetch_size,
30 |             "data-buffer-fetch-size")["--data-buffer-fetch-size"](
31 |             "the number of states taken from the buffer in each iteration") |
32 |         Opt(motivate_worklist_length,
33 |             "true/false")["--motivate-worklist-length"](
34 |             "record worklist length") |
35 |         Opt(num_state_per_group,
36 |             "num_state_per_group")["--num-state-per-group"](
37 |             "number of state per group.") |
38 |         Opt(group_num, "group_num")["--group-num"]("the group number for CCs") |
39 |         Opt(tuning, "true/false")["--tuning"]("enable tuning") |
40 |         Opt(pc_use_uvm, "true/false")["--pc-use-uvm"](
41 |             "use uvm to store memiozation tables") |
42 |         Opt(adaptive_aas, "true/false")["--adaptive-aas"](
43 |             "use adaptive strategy for interval number") |
44 |         Opt(try_adaptive_aas, "true/false")["--try-adaptive-aas"](
45 |             "retry when adaptive strategy  failed") |
46 |         Opt(compress_prec_table, "true/false")["--compress-prec-table"](
47 |             "compress memiozation tables");
48 |             
49 |     parser = parser | additional_parser;
50 |   }
51 | 
52 |   uint32_t data_buffer_fetch_size = 128;
53 |   int add_aas_start = 0;
54 |   int add_aas_interval = 1;
55 |   bool unique = false;
56 |   bool validation = true;
57 |   int active_threshold = 20;
58 |   bool use_soa = false;
59 |   int precompute_cutoff = -1;
60 |   int precompute_depth = 0;
61 |   bool motivate_worklist_length = false;
62 |   int num_state_per_group;
63 |   int group_num = 10;
64 |   bool compress_prec_table = true;
65 |   bool tuning = false;
66 |   bool pc_use_uvm = false;
67 |   bool adaptive_aas = false;
68 |   bool try_adaptive_aas = false;
69 | };
70 | 
71 | #endif
72 | 


--------------------------------------------------------------------------------
/code/src/obat/Makefile:
--------------------------------------------------------------------------------
 1 | all: nvcc clang
 2 | 
 3 | nvcc:
 4 | 	nvcc -O0 --std=c++11 -Xptxas='-v' --source-in-ptx -m64 main.cu one_byte_at_a_time.cu -arch=sm_50 -I../../include -L../../build/lib -lgpunfacommons -lgpunfautils -keep -o obat1_nvcc
 5 | 
 6 | clang:
 7 | 	clang++ -O0 --std=c++11 main.cu one_byte_at_a_time.cu --cuda-path=${CUDA_ROOT} --cuda-gpu-arch=sm_50 -lcudart_static -ldl -lrt -pthread -L/home/hyliu/gcc65/install/lib64  -lstdc++ -D_GLIBCXX_USE_CXX11_ABI=0 -L${CUDA_ROOT}/lib64 -I../../include -L../../build/lib -lgpunfacommons -lgpunfautils -save-temps -o obat1_clang
 8 | 
 9 | clean:
10 | 	rm -f *.o
11 | 	rm -f *.ii
12 | 	rm -f *.i
13 | 	rm -f *.ptx
14 | 	rm -f *fatbin*
15 | 	rm -f *cubin*
16 | 	rm -f *stub*
17 | 	rm -f *sm_*
18 | 	rm -f *cudafe*
19 | 	rm -f *module_id
20 | 	rm -f *dlink*
21 | 	rm -f a.out
22 | 	rm -f *.png
23 | 	rm -f *.txt
24 | 	rm -f *.ll
25 | 	rm -f *.bc
26 | 	rm -f *.s
27 | 	rm -f *.cui
28 | 	rm -f obat1*
29 | 


--------------------------------------------------------------------------------
/code/src/obat/one_byte_at_a_time.h:
--------------------------------------------------------------------------------
  1 | #ifndef ONEBYTE_AT_A_TIME
  2 | #define ONEBYTE_AT_A_TIME
  3 | 
  4 | 
  5 | #include <algorithm>
  6 | #include <iostream>
  7 | #include <vector>
  8 | #include <map>
  9 | #include <list>
 10 | #include <cassert>
 11 | #include <set>
 12 | #include "commons/NFA.h"
 13 | #include <gpunfautils/utils.h>
 14 | #include <gpunfautils/array2.h>
 15 | #include <gpunfautils/common.h>
 16 | #include "commons/SymbolStream.h"
 17 | #include <gpunfautils/abstract_gpunfa.h>
 18 | #include <unordered_map>
 19 | #include "commons/report_formatter.h"
 20 | #include "option_config.h"
 21 | 
 22 | 
 23 | using std::unordered_map;
 24 | 
 25 | 
 26 | 
// "One byte at a time" (OBAT) family of NFA-matching algorithms, built on
// abstract_algorithm. The class exposes several alternative execution
// strategies (OBAT_baseline_2, obat_MC, hotstart_*, ...) next to the
// launch_kernel() override, plus the preprocessing and reporting helpers they
// share.
class one_byte_at_a_time : public abstract_algorithm {
public:
	// Set through set_option(); stores the address of the caller's object, so
	// that object must stay alive while this instance uses it.
	obat_config* opt;
	vector<NFA *> old_ccs;
	one_byte_at_a_time(NFA *nfa);
	virtual ~one_byte_at_a_time();

	void preprocessing_enable_active();

	void check_grouped_nfa_sizes();

	void preprocessing_active_active(); 

	void launch_kernel() override;

	void prepare_output_buffer();

	void print_reports(string filename, report_formatter &rf);

	void organize_reports2(Array2<match_entry> *output_buffer, int buffer_size, const vector<NFA*> &grouped_nfas1, report_formatter& rf);

	void remap_intid_of_nodes(remap_node_type tp);

	// ("boudary" is a misspelling of "boundary"; the name is kept for callers.)
	void remap_intid_of_nodes_with_boudary(remap_node_type tp, vector<NFA *> &grouped_nfa, const vector<int> &boundaries);

	void hotstart_aa();

    //OBAT series
    void OBAT_baseline_2();
    void obat_MC();

    // important. Activity based hot cold approach.
    void test_hotcold_nodup_queue_mc_CaH();

    // hotstart
    void hotstart_ea();
    void hotstart_ea_without_MC2();

	void set_node_active_freq_map(map<string, double> freq_map);
	void set_hot_limit_by_bfs_layer(int hot_limit_by_bfs_layer);
	void set_active_queue_size(int queuesize);

	// Stores the threshold in the private member cold_thres.
	void set_cold_threshold(double cold_threshold) {
		this->cold_thres = cold_threshold;
	}
	// Keeps a pointer to `opt`; no copy is made.
	void set_option(obat_config &opt){
		this->opt = &opt;
	}

	void print_node_matchset_complete_info(const vector<NFA*> &ccs);

	void test_data_movement_read_input_stream_only(int num_tb_x);

	void test_data_movement_read_input_stream_only2(int num_tb_x);

    // Public configuration flags, written directly by the caller.
    bool hot_stage_only;
    bool remap_input_stream;

    int packing;
    string packing_filename;

private:
	bool record_cold_warp_active_array;

	int history_queue_capacity;

	int active_queue_size;

	int hot_limit_by_bfs_layer;

	int max_indegree_of_cold_states; 

	double cold_thres;

	int profile_length;

	map<string, double> freq_map;

	vector<NFA *> grouped_nfas;

	Array2<match_entry> *real_output_array;
	Array2<unsigned long long int>     *tail_of_real_output_array;

	remap_node_type remap_node_id;


};
114 | 
115 | 
116 | 
117 | #endif
118 | 


--------------------------------------------------------------------------------
/code/src/obat/option_config.h:
--------------------------------------------------------------------------------
 1 | #ifndef OBAT_CONFIG
 2 | #define OBAT_CONFIG
 3 | 
 4 | #include "commons/common_func.h"
 5 | 
 6 | using namespace clara;
 7 | 
 8 | // Options for the obat binary: common_gpunfa_options plus hot/cold, packing and
 9 | // profiling switches. Every field below is bound to a clara Opt and OR-ed into `parser`.
10 | class obat_config : public common_gpunfa_options {
11 | public:
12 |     // Initializers are listed in member-declaration order, which is the order C++
13 |     // actually runs them in (keeps -Wreorder quiet and the list honest).
14 |     obat_config() : common_gpunfa_options(),
15 |                     hot_stage_only(false),
16 |                     remap_input_stream(false),
17 |                     bfs_hot_ratio(0.0),
18 |                     hot_n_state_limit(-1),
19 |                     hotcold_filter_filename(""),
20 |                     active_queue_size(1024),
21 |                     packing(0),
22 |                     packing_activation_file(""),
23 |                     cold_threshold(0.001),
24 |                     num_of_blocks_read_input_only(1),
25 |                     validation{true}
26 |                     {
27 |         auto additional_parser =
28 |                 Opt(bfs_hot_ratio, "bfs_hot_ratio")["--hot-limit-by-bfs-ratio"]
29 |                         ("The ratio of states to be fixed mapped to threads that offloaed by bfs-layer")
30 |                 | Opt(hotcold_filter_filename, "hot cold filter filename")
31 |                 ["--hot-cold-filter"]
32 |                 ("The file specifies which states are hot and thereby fixed mapped to threads.")
33 |                 | Opt(active_queue_size, "active_queue_size")["--active-queue-size"]["-q"]("worklist size in shared memory")
34 |                 | Opt(hot_n_state_limit, "hot_n_state_limit")["--hot-limit-by-bfs"]("hot-limit-by-bfs")
35 |                 | Opt(hot_stage_only, "hot_stage_only")["--hot-stage-only"]("only execute hot stage. "
36 |                                                                             "Only works in hotstart_ea and hotstart_aa")
37 |                 | Opt(remap_input_stream, "remap_input_stream")["--remap-input-stream"]
38 |                                        ("remap input stream to thread block. (Testing now only applicable to hotstart ea)")
39 |                 | Opt(packing, "packing")["--packing"]("The way of packing NFAs to thread blocks. Default: 0. Random 1. ")
40 |                 | Opt(packing_activation_file, "packing_activation_file")["--packing-file"]("packing activation ratio file")
41 |                 | Opt(cold_threshold, "cold_threshold")["--cold-threshold"]("If we use profiling, what ratio can be considered to be cold")
42 |                 | Opt(num_of_blocks_read_input_only, "num_of_blocks_read_input_only")["--num-of-blocks-read-input-only"]("only for characterization. ")
43 |                 | Opt(validation,  "validation")["--validation"]("fake validation (do nothing)");
44 |         parser = parser | additional_parser;
45 |     }
46 | 
47 |     bool hot_stage_only;             // --hot-stage-only (default false)
48 |     bool remap_input_stream;         // --remap-input-stream (default false)
49 |     double bfs_hot_ratio;            // --hot-limit-by-bfs-ratio (default 0.0)
50 |     int hot_n_state_limit;           // --hot-limit-by-bfs (default -1)
51 |     string hotcold_filter_filename;  // --hot-cold-filter (default empty)
52 |     int active_queue_size;           // --active-queue-size / -q (default 1024)
53 | 
54 |     int packing;                     // --packing (default 0)
55 |     string packing_activation_file;  // --packing-file (default empty)
56 | 
57 |     double cold_threshold;           // --cold-threshold (default 0.001)
58 | 
59 |     int num_of_blocks_read_input_only;  // --num-of-blocks-read-input-only (default 1)
60 |     bool validation;                 // --validation (default true)
61 | };
62 | 
63 | 
64 | #endif
65 | 
66 | 
67 | 
68 | 
69 | 
70 | 


--------------------------------------------------------------------------------
/code/src/ppopp12/main.cu:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <numeric>
  3 | #include <stdlib.h>
  4 | #include <cuda_runtime.h>
  5 | #include "NFA.h"
  6 | #include "NFALoader.h"
  7 | #include <string>
  8 | #include <getopt.h>
  9 | #include <memory>
 10 | #include <set>
 11 | #include "SymbolStream.h"
 12 | #include "ppopp12.h"
 13 | #include "utils.h"
 14 | #include "node.h"
 15 | #include "nfa_utils.h"
 16 | #include <clara/clara.hpp>
 17 | #include "moderngpu/context.hxx"
 18 | #include "moderngpu/util.hxx"
 19 | #include "ppopp12_option.h"
 20 | 
 21 | 
 22 | using namespace clara;
 23 | 
 24 | using std::set;
 25 | using std::unique_ptr;
 26 | using std::string;
 27 | using std::cout;
 28 | using std::endl;
 29 | 
 30 | 
 31 | int main(int argc, char *argv[])
 32 | {
 33 |     // Echo the full command line so the log records how the run was invoked.
 34 |     printf("Command: ");
 35 |     for (int i = 0; i < argc; i++)
 36 |         printf("%s ", argv[i]);
 37 |     printf("\n");
 38 | 
 39 |     ppopp12_config cfg;
 40 |     auto result = cfg.parse( argc, argv );
 41 |     if( !result )
 42 |     {
 43 |         std::cerr << "Error in command line: " << result.errorMessage() << std::endl;
 44 |         return 1;
 45 |     }
 46 | 
 47 |     // --help is informational: print the usage text and stop instead of running with unset paths.
 48 |     if (cfg.showHelp) {
 49 |         cout << cfg.getHelp();
 50 |         return 0;
 51 |     }
 52 | 
 53 |     string automata_filename = cfg.nfa_filename;
 54 |     string input_filename = cfg.input_filename;
 55 |     int start_pos = cfg.input_start_pos, input_length = cfg.input_len;
 56 |     string algo = cfg.algorithm;
 57 |     string output_file_name = cfg.report_filename;
 58 |     int dup_input_stream = cfg.duplicate_input_stream;
 59 |     unsigned long long int one_output_capacity = cfg.output_capacity;
 60 |     int block_size = cfg.block_size;
 61 |     int max_size_of_cc = cfg.max_nfa_size;
 62 |     int split_entire_inputstream_to_chunk_size = cfg.split_chunk_size;
 63 | 
 64 |     // Checked explicitly: assert() vanishes under NDEBUG and a zero chunk size would divide by zero below.
 65 |     if (split_entire_inputstream_to_chunk_size != -1 &&
 66 |         split_entire_inputstream_to_chunk_size <= 0) {
 67 |         std::cerr << "Error: split chunk size must be -1 (disabled) or positive" << std::endl;
 68 |         return 1;
 69 |     }
 70 | 
 71 |     SymbolStream ss;
 72 |     ss.readFromFile(input_filename);
 73 | 
 74 |     // Restrict the input to a window only when both start and length are set (-1 = unset).
 75 |     if (start_pos != -1 && input_length != -1) {
 76 |         if (start_pos < 0) {
 77 |             std::cerr << "Error: input start position must be non-negative" << std::endl;
 78 |             return 1;
 79 |         }
 80 |         ss = ss.slice(start_pos, input_length);
 81 |     }
 82 | 
 83 |     auto ab = ss.calc_alphabet();
 84 |     auto nfa = load_nfa_from_file(automata_filename);
 85 |     cout << "nfa_size_original = " << nfa->size() << endl;
 86 |     nfa_utils::print_starting_node_info(nfa);
 87 |     int active_state_array_size = block_size;
 88 |     ppopp12 p12(nfa);
 89 | 
 90 |     cout << "dup_input_stream = " << dup_input_stream << endl;
 91 |     cout << "split_entire_inputstream_to_chunk_size = " << split_entire_inputstream_to_chunk_size << endl;
 92 |     p12.set_max_cc_size_limit(max_size_of_cc);
 93 | 
 94 |     for (int i = 0; i < dup_input_stream; i++) {
 95 |         if (split_entire_inputstream_to_chunk_size == -1) {
 96 |             p12.add_symbol_stream(ss);
 97 |         } else {
 98 |             // Only whole chunks are queued; a trailing partial chunk is dropped.
 99 |             int sslen = ss.size();
100 |             int num_seg = sslen / split_entire_inputstream_to_chunk_size;
101 |             cout << "num_seg_" << i << " = " << num_seg << endl;
102 |             for (int j = 0; j < num_seg; j++) {
103 |                 int start_pos1 = j * split_entire_inputstream_to_chunk_size;
104 |                 auto ss_seg = ss.slice(start_pos1, split_entire_inputstream_to_chunk_size);
105 |                 p12.add_symbol_stream(ss_seg);
106 |             }
107 |         }
108 |     }
109 | 
110 |     p12.set_report_off(cfg.report_off, cfg.output_capacity,
111 |                     cfg.duplicate_input_stream * cfg.quick_validation);
112 |     p12.set_output_file(output_file_name);
113 |     p12.set_num_segment_per_ss(1);
114 |     p12.set_output_buffer_size(one_output_capacity);
115 |     p12.set_block_size(block_size);
116 |     p12.set_active_state_array_size(active_state_array_size);
117 |     p12.set_alphabet(ab);
118 |     p12.set_option(cfg);
119 |     p12.validation = cfg.validation;
120 | 
121 |     p12.preprocessing();
122 |     // Dispatch on the requested kernel variant; an unknown name now yields a non-zero exit code.
123 |     int exit_code = 0;
124 |     if (algo == "ppopp12") {
125 |         p12.launch_kernel();
126 |     } else if (algo == "ppopp12_inputshropt") {
127 |         p12.launch_kernel_readinputchunk();
128 |     } else {
129 |         cout <<"not supported algoritm " << algo << endl;
130 |         exit_code = 1;
131 |     }
132 |     delete nfa;
133 |     cout<< "FINISHED\n";
134 |     return exit_code;
135 | }
136 | 
137 | 
138 | 
139 | 
140 | 


--------------------------------------------------------------------------------
/code/src/ppopp12/ppopp12.h:
--------------------------------------------------------------------------------
  1 | #ifndef PPOPP12_H_
  2 | #define PPOPP12_H_
  3 | 
  4 | #include <algorithm>
  5 | #include <iostream>
  6 | #include <vector>
  7 | #include <map>
  8 | #include <list>
  9 | #include <cassert>
 10 | #include <set>
 11 | #include "NFA.h"
 12 | #include "array2.h"
 13 | #include "utils.h"
 14 | #include "common.h"
 15 | #include "SymbolStream.h"
 16 | #include <cuda.h>
 17 | #include "abstract_gpunfa.h"
 18 | #include "ppopp12_option.h"
 19 | #include "compatible_group_helper.h"
 20 | 
 21 | using std::map;
 22 | using std::vector;
 23 | using std::fill;
 24 | using std::cout;
 25 | using std::endl;
 26 | using std::pair;
 27 | using std::set;
 28 | using std::make_pair;
 29 | 
 30 | 
 31 | // Declares the ppopp12 algorithm object (an abstract_algorithm). Only the three small
 32 | // accessors below are defined inline; every other member function is defined outside this header.
 33 | class ppopp12 : public abstract_algorithm {
 34 | public:
 35 | 	ppopp12_config* opt;
 36 | 	vector<NFA *> old_ccs;
 37 | 	ppopp12(NFA *nfa);
 38 | 	~ppopp12();
 39 | 	// Setters; main.cu calls these before preprocessing().
 40 | 	void set_block_size(int blocksize);
 41 | 	void set_active_state_array_size(int active_state_array_size);
 42 | 	void set_alphabet(set<uint8_t> alphabet);
 43 | 
 44 | 	void group_nfas();
 45 | 	// Overrides a virtual declared by abstract_algorithm (base class not shown in this header).
 46 | 	virtual void preprocessing() override;
 47 | 
 48 | 	int get_num_states_gpu() const;
 49 | 	// prepare_*: individual setup steps. NOTE(review): presumably invoked from preprocessing() - confirm in the implementation file.
 50 | 	void prepare_transition_table();
 51 | 
 52 | 	void prepare_states_status();
 53 | 
 54 | 	void prepare_initial_active_state_array();
 55 | 
 56 | 	void prepare_state_start_position_tb();
 57 | 	void prepare_compatible_grps();
 58 | 
 59 | 	void prepare_input_streams();
 60 | 
 61 | 	void prepare_outputs();
 62 | 	// Launch entry points: main.cu calls launch_kernel() for algorithm "ppopp12" and launch_kernel_readinputchunk() for "ppopp12_inputshropt".
 63 | 	void launch_kernel();
 64 | 
 65 | 	void launch_kernel_readinputchunk();
 66 | 	
 67 | 	void print_reports(string filename);
 68 | 	// Inline setter for num_segment_per_ss.
 69 | 	void  set_num_segment_per_ss(int nn) {
 70 | 		this->num_segment_per_ss = nn;
 71 | 	}
 72 | 	void set_option(ppopp12_config &opt){
 73 | 		this->opt = &opt;
 74 | 	}
 75 | 	// set_option() keeps only the address of the caller's config, so that object must outlive this one.
 76 | 
 77 | 	int get_num_segment_per_ss() const {
 78 | 		return num_segment_per_ss;
 79 | 	}
 80 | 		
 81 | private:
 82 | 	// for debug
 83 | 	NFA* select_one_nfa_by_id(string str_id);
 84 | 	
 85 | 	void calc_str_id_to_compatible_group_per_block();
 86 | 	// Written by set_active_state_array_size(); main.cu passes the block size.
 87 | 	int active_state_array_size;
 88 | 
 89 | 	map<int, vector<int> > nfa_group_tb;
 90 | 	int num_nfa_chunk;
 91 | 	map<int, int> num_compatible_groups_cc;
 92 | 	// Raw Array2<> pointers below. NOTE(review): allocation and release are not visible here - presumably prepare_*() and ~ppopp12(); confirm.
 93 | 	Array2<int> *state_start_position_tb; 
 94 | 
 95 | 	Array2<int> *num_state_tb;
 96 | 	Array2<int> *array_compatible_group;
 97 | 	Array2<int4> *trans_table;
 98 | 
 99 | 	Array2<int8_t> *states_status;
100 | 	Array2<int> *initial_active_state_array;
101 | 
102 | 	// input
103 | 	Array2<uint8_t> *arr_input_streams;
104 | 
105 | 	// output
106 |     Array2<match_entry> *match_array;
107 |     Array2<unsigned long long int > *match_count;
108 | 	// State string id -> compatible-group index; each map's scope is noted on the line after it.
109 | 	map<string, int> str_id_to_compatible_group; 
110 | 	// per cc 
111 | 	map<string, int> str_id_to_compatible_group_per_block;
112 | 	// per block
113 | 
114 | 	vector<NFA *> nfa_in_tb;
115 | 	// NOTE(review): no_cg and profile have no in-class initializer; confirm the constructor sets them.
116 | 	bool no_cg;
117 | 
118 | 	bool profile;
119 | 	// Accessed through set_num_segment_per_ss() / get_num_segment_per_ss(); main.cu sets it to 1.
120 | 	int num_segment_per_ss;
121 | }; 
122 | 
123 | 
124 | #endif
125 | 
126 | 
127 | 
128 | 
129 | 
130 | 
131 | 
132 | 
133 | 
134 | 
135 | 
136 | 
137 | 
138 | 
139 | 
140 | 
141 | 
142 | 
143 | 
144 | 
145 | 
146 | 
147 | 
148 | 
149 | 
150 | 
151 | 
152 | 
153 | 


--------------------------------------------------------------------------------
/code/src/ppopp12/ppopp12_kernels.h:
--------------------------------------------------------------------------------
 1 | #ifndef PPOPP12_KERNELS_H_
 2 | #define PPOPP12_KERNELS_H_
 3 | 
 4 | #include "gpunfautils/common.h"
 5 | #include <cuda.h>
 6 | 
 7 | // Prototype of the ppopp12 CUDA kernel; the definition is outside this header and must keep this exact signature.
 8 | __global__ void ppopp12_kernel(
 9 | 	const __restrict__  int4 *transition_table,
10 | 	const int transition_table_length,
11 | 	// auxiliary per-launch tables; NOTE(review): presumably filled by the ppopp12::prepare_*() steps - confirm
12 | 	int *state_start_position_tb, 
13 | 	//int *num_state_tb,
14 | 	int *state_compatible_group,
15 | 	int *initial_active_state_array,
16 | 	int active_state_array_size, // currently it is the same as block size
17 | 
18 | 	// per-state status flags, used for output (reporting) states and always-enabled start states
19 | 	int8_t *states_status, // original note: "00    is always enabled; is output;" - TODO confirm the exact bit encoding
20 |     
21 | 	// input
22 | 	uint8_t *input_streams,
23 | 	int input_stream_length,
24 | 	
25 | 	// output processing
26 | 	match_entry  *match_array, // fixed size for each thread block,
27 | 	const unsigned long long int match_array_capacity,
28 | 	unsigned long long int 	*match_count,
29 | 	bool report_on
30 | ) ;
31 | // Prototype with the same parameter list as ppopp12_kernel. NOTE(review): the name suggests input chunks are staged in shared memory - confirm in the definition.
32 | __global__ void ppopp12_kernel_shrreadchunk(
33 | 	const __restrict__  int4 *transition_table,
34 | 	const int transition_table_length,
35 | 	int *state_start_position_tb, 
36 | 	//int *num_state_tb,
37 | 	int *state_compatible_group,
38 | 	int *initial_active_state_array,
39 | 	int active_state_array_size, // currently it is the same as block size
40 | 
41 | 	// per-state status flags, used for output (reporting) states and always-enabled start states
42 | 	int8_t *states_status, // original note: "00    is always enabled; is output;" - TODO confirm the exact bit encoding
43 |     
44 | 	// input
45 | 	uint8_t *input_streams,
46 | 	int input_stream_length,
47 | 	
48 | 	// output processing
49 | 	match_entry  *match_array, // fixed size for each thread block,
50 | 	const unsigned long long int match_array_capacity,
51 | 	unsigned long long int 	*match_count,
52 | 	bool report_on
53 | );
54 | 
55 | 
56 | 
57 | #endif
58 | 
59 | 
60 | 
61 | 


--------------------------------------------------------------------------------
/code/src/ppopp12/ppopp12_option.h:
--------------------------------------------------------------------------------
 1 | #ifndef PPOPP12_CONFIG
 2 | #define PPOPP12_CONFIG
 3 | 
 4 | #include "commons/common_func.h"
 5 | 
 6 | // Options for the ppopp12 executable: the common GPU-NFA options plus --validation.
 7 | class ppopp12_config : public common_gpunfa_options {
 8 | public:
 9 |     // validation gets an explicit default so it is never read uninitialized when
10 |     // --validation is absent (main.cu copies it unconditionally). The default of true
11 |     // mirrors obat_config; NOTE(review): confirm this is the intended default.
12 |     ppopp12_config() : common_gpunfa_options(), validation(true) {
13 |         auto additional_parser =
14 |             Opt(validation, "validation")["--validation"]("validation");
15 |         parser = parser | additional_parser;
16 |     }
17 | 
18 |     bool validation;  // bound to --validation
19 | };
20 | 
21 | 
22 | #endif
23 | 


--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvidia/cuda:12.0.1-devel-ubuntu20.04
 2 | ENV DEBIAN_FRONTEND=noninteractive
 3 | # update and install share one RUN so a cached package index is never reused for a changed package list
 4 | RUN apt-get update \
 5 |       && apt-get install -y libtbb-dev=2020.1-2 wget git
 6 | RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.1/cmake-3.24.1-Linux-x86_64.sh \
 7 |       -q -O /tmp/cmake-install.sh \
 8 |       && chmod u+x /tmp/cmake-install.sh \
 9 |       && mkdir /opt/cmake-3.24.1 \
10 |       && /tmp/cmake-install.sh --skip-license --prefix=/opt/cmake-3.24.1 \
11 |       && rm /tmp/cmake-install.sh \
12 |       && ln -s /opt/cmake-3.24.1/bin/* /usr/local/bin
13 | # hyperscan
14 | RUN apt-get update && apt-get install -y ragel nasm libsqlite3-dev pkg-config libboost-all-dev
15 | # the xenial repositories are added so that g++-5 / gcc-5 can be installed
16 | RUN echo "deb http://dk.archive.ubuntu.com/ubuntu/ xenial main" |  tee -a /etc/apt/sources.list
17 | RUN echo "deb http://dk.archive.ubuntu.com/ubuntu/ xenial universe" |  tee -a /etc/apt/sources.list
18 | RUN apt-get update && apt-get install -y g++-5 gcc-5 \
19 |       && rm -rf /var/lib/apt/lists/*
20 | 
21 | 
22 | # install python environment
23 | ENV PATH="/root/miniconda3/bin:${PATH}"
24 | RUN wget -q \
25 |     https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
26 |     && mkdir /root/.conda \
27 |     && bash Miniconda3-latest-Linux-x86_64.sh -b \
28 |     && rm -f Miniconda3-latest-Linux-x86_64.sh 
29 | RUN conda init bash
30 | RUN /bin/bash -c "source /root/.bashrc"
31 | RUN conda install -y numpy scipy pandas seaborn -c conda-forge
32 | RUN pip install https://github.com/getianao/figurePlotter/archive/refs/tags/v0.23.9.14.tar.gz
33 | 
34 | ENV NGAP_ROOT="/ngAP"
35 | ENV PATH="/ngAP/code/build/bin:${PATH}"
36 | ENV PATH="/ngAP/hscompile/build:${PATH}"
37 | 
38 | WORKDIR /ngAP


--------------------------------------------------------------------------------
/env.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Source this file: it exports NGAP_ROOT (the directory holding this script) and prepends the build output dirs to PATH.
 3 | fullpath=$(readlink --canonicalize --no-newline "${BASH_SOURCE[0]}")
 4 | cur_dir=$(cd "$(dirname "${fullpath}")" && pwd)
 5 | # echo ${cur_dir}
 6 | 
 7 | export NGAP_ROOT="${cur_dir}"
 8 | 
 9 | export PATH="${NGAP_ROOT}/code/build/bin:${PATH}"
10 | export PATH="${NGAP_ROOT}/hscompile/build:${PATH}"
11 | 
12 | 
13 | 


--------------------------------------------------------------------------------
/ref_results/fig13_throughput.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getianao/ngAP/6fcab891ddcc1dbac79b533469f6ccbf3dd7845a/ref_results/fig13_throughput.pdf


--------------------------------------------------------------------------------
/ref_results/fig14_breakdown.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getianao/ngAP/6fcab891ddcc1dbac79b533469f6ccbf3dd7845a/ref_results/fig14_breakdown.pdf


--------------------------------------------------------------------------------
/ref_results/fig20_latency.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/getianao/ngAP/6fcab891ddcc1dbac79b533469f6ccbf3dd7845a/ref_results/fig20_latency.pdf


--------------------------------------------------------------------------------
/ref_results/raw/throughput_cpu/throughput_cpu_part1.csv:
--------------------------------------------------------------------------------
1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP
2 | before-hyperscan_,1.19,1.63,196.01,214.16,0.31,-2.0,0.4,-1.0,-1.0,-2.0,-1.0,-1.0,-2.0,-1.0,-1.0,1.89,-2.0,2.9,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,148.87
3 | before-vasim_,0.00182745,0.00227549,0.0421592,0.0414737,0.00224541,0.00102094,0.000835522,-1.0,-1.0,0.00158883,-1.0,-1.0,0.00145724,-1.0,-1.0,0.036843,0.00426582,0.0034086,-1.0,-1.0,0.000988023,1.12633,0.710085,0.689543,0.688317,0.27223
4 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_cpu/throughput_cpu_part2.csv:
--------------------------------------------------------------------------------
1 | config,Snort,FileCarving,ClamAV
2 | before-hyperscan_,36.58,-1.0,327.03
3 | before-vasim_,0.0309909,-1.0,0.00131314
4 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_cpu_oneinput/throughput_cpu_part1.csv:
--------------------------------------------------------------------------------
1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP
2 | before-hyperscan_,1.19,1.6,71.91,84.03,0.32,0.16,0.4,-1.0,-1.0,0.04,-1.0,-1.0,0.02,-1.0,-1.0,1.85,0.03,2.89,-1.0,-1.0,61.31,285.88,1177.86,195.16,138.93,57.65
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_cpu_oneinput/throughput_cpu_part2.csv:
--------------------------------------------------------------------------------
1 | config,Snort,FileCarving,ClamAV
2 | before-hyperscan_,31.54,-1.0,262.67
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_nap_best/throughput_gpu_napbest_part1.csv:
--------------------------------------------------------------------------------
1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP
2 | oa-nonblocking-all-best,9.49091,80.8701,1612.11,222.526,37.0175,8.51391,1.82253,-1.0,-1.0,7.88546,-1.0,-1.0,0.890053,-1.0,-1.0,157.991,9.37462,4.62699,-1.0,-1.0,52.752,13452.8,8947.39,11743.4,11461.3,3298.04
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_nap_best/throughput_gpu_napbest_part2.csv:
--------------------------------------------------------------------------------
1 | config,Snort,FileCarving,ClamAV
2 | oa-nonblocking-all-best,114.872,-1.0,5596.89
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_nap_best/throughput_gpu_napbest_part3.csv:
--------------------------------------------------------------------------------
1 | config,smallSnort,smallFileCarving,smallClamAV
2 | oa-nonblocking-all-best,74.16,-1.0,3295.84
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_nap_best_oneinput/throughput_gpu_napbest_oneinput_part1.csv:
--------------------------------------------------------------------------------
1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP
2 | oa-nonblocking-all-best,4.85362,36.6275,45.0426,1.15836,22.3436,6.27635,7.57419,-1.0,-1.0,6.45865,-1.0,-1.0,0.716016,-1.0,-1.0,13.6536,0.559004,0.878501,-1.0,-1.0,3.34958,111.657,133.283,117.471,126.416,51.953
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_nap_best_oneinput/throughput_gpu_napbest_oneinput_part2.csv:
--------------------------------------------------------------------------------
1 | config,Snort,FileCarving,ClamAV
2 | oa-nonblocking-all-best,1.83413,-1.0,38.9162
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_nap_best_oneinput/throughput_gpu_napbest_oneinput_part3.csv:
--------------------------------------------------------------------------------
1 | config,smallSnort,smallFileCarving,smallClamAV
2 | oa-nonblocking-all-best,0.190169,-1.0,57.1354
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_nap_breakdown/throughput_nap_breakdown_part1.csv:
--------------------------------------------------------------------------------
1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP
2 | o0-blocking_,2.36783,3.21439,31.3726,30.8519,2.69913,1.55395,1.35045,-1.0,-1.0,1.95991,-1.0,-1.0,0.39358,-1.0,-1.0,20.4302,7.87992,4.5829,-1.0,-1.0,1.67768,333.874,226.043,224.893,226.549,121.238
3 | o0-nonblocking-NAP_,2.32725,3.55705,30.2642,26.6043,3.91616,1.54513,1.32497,-1.0,-1.0,2.10141,-1.0,-1.0,0.408884,-1.0,-1.0,18.6078,9.30723,4.1884,-1.0,-1.0,1.99783,150.837,125.299,125.405,124.745,81.1907
4 | o1-nonblocking_,5.48731,5.90764,66.7353,57.1711,6.93463,3.91444,1.60105,-1.0,-1.0,5.6975,-1.0,-1.0,0.17646,-1.0,-1.0,51.2707,7.19683,4.32801,-1.0,-1.0,4.48412,887.88,636.79,641.963,605.124,222.087
5 | o3-nonblocking-p3_,7.34958,54.2517,1125.96,158.133,36.4648,9.68088,1.62427,-1.0,-1.0,8.05241,-1.0,-1.0,0.971958,-1.0,-1.0,137.836,9.2406,4.28686,-1.0,-1.0,34.3151,2618.67,2202.54,2089.1,2022.6,787.731
6 | o4-nonblocking-r1f_,5.78694,5.51977,59.9846,57.1722,6.05899,4.16819,3.13265,-1.0,-1.0,5.30185,-1.0,-1.0,0.319643,-1.0,-1.0,59.8419,8.25265,4.66233,-1.0,-1.0,4.73265,876.315,568.532,564.681,549.169,206.94
7 | oa-nonblocking-all-p3r1_,7.97462,50.6649,1157.46,222.216,33.3177,8.36063,1.80042,-1.0,-1.0,8.03973,-1.0,-1.0,0.865747,-1.0,-1.0,152.499,8.74257,4.28031,-1.0,-1.0,50.842,2685.48,2218.65,1985.01,1979.47,733.358
8 | oa-nonblocking-all-p3r1f_,8.48512,47.8711,945.408,139.876,36.5803,7.87313,1.82704,-1.0,-1.0,7.93877,-1.0,-1.0,0.542088,-1.0,-1.0,152.262,8.23096,4.66774,-1.0,-1.0,52.752,2980.67,2369.49,2092.31,2082.94,782.884
9 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_nap_breakdown/throughput_nap_breakdown_part2.csv:
--------------------------------------------------------------------------------
1 | config,Snort,FileCarving,ClamAV
2 | o0-blocking_,21.1791,-1.0,3.40173
3 | o0-nonblocking-NAP_,20.5721,-1.0,3.04677
4 | o1-nonblocking_,39.1882,-1.0,6.36849
5 | o3-nonblocking-p3_,52.1036,-1.0,1813.44
6 | o4-nonblocking-r1f_,51.2799,-1.0,3.68003
7 | oa-nonblocking-all-p3r1_,-1.0,-1.0,1811.11
8 | oa-nonblocking-all-p3r1f_,109.975,-1.0,2106.51
9 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_nap_breakdown/throughput_nap_breakdown_part3.csv:
--------------------------------------------------------------------------------
1 | config,smallSnort,smallFileCarving,smallClamAV
2 | o0-blocking_,20.6698,-1.0,3.44144
3 | o0-nonblocking-NAP_,20.759,-1.0,3.09138
4 | o1-nonblocking_,39.1118,-1.0,6.37317
5 | o3-nonblocking-p3_,51.7606,-1.0,1804.14
6 | o4-nonblocking-r1f_,51.4143,-1.0,3.68474
7 | oa-nonblocking-all-p3r1f_,108.504,-1.0,2083.47
8 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_nap_default_adp/throughput_gpu_nap_default_part1.csv:
--------------------------------------------------------------------------------
1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP
2 | oa-nonblocking-default-best,8.60939,64.2597,1452.56,208.927,33.1166,7.26808,1.84866,-1.0,-1.0,7.48813,-1.0,-1.0,0.52098,-1.0,-1.0,141.725,8.3828,4.49539,-1.0,-1.0,50.409,3897.31,3690.17,3405.51,3598.85,2037.26
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_nap_default_adp/throughput_gpu_nap_default_part2.csv:
--------------------------------------------------------------------------------
1 | config,Snort,FileCarving,ClamAV
2 | oa-nonblocking-default-best,63.7755,-1.0,2334.82
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_nap_default_adp/throughput_gpu_nap_default_part3.csv:
--------------------------------------------------------------------------------
1 | config,smallSnort,smallFileCarving,smallClamAV
2 | oa-nonblocking-default-best,74.39,-1.0,1779.0
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_nap_default_adp_oneinput/throughput_gpu_nap_default_oneinput_part1.csv:
--------------------------------------------------------------------------------
1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP
2 | oa-nonblocking-default-best,2.43953,13.1657,15.0839,1.11249,7.84374,4.21962,5.71738,-1.0,-1.0,4.26646,-1.0,-1.0,0.623945,-1.0,-1.0,8.81588,0.559004,0.883431,-1.0,-1.0,3.10914,18.78,18.8387,18.8409,18.8719,15.645
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_nap_default_adp_oneinput/throughput_gpu_nap_default_oneinput_part2.csv:
--------------------------------------------------------------------------------
1 | config,Snort,FileCarving,ClamAV
2 | oa-nonblocking-default-best,1.35838,-1.0,13.7059
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_nap_default_adp_oneinput/throughput_gpu_nap_default_oneinput_part3.csv:
--------------------------------------------------------------------------------
1 | config,smallSnort,smallFileCarving,smallClamAV
2 | oa-nonblocking-default-best,0.0790002,-1.0,10.9458
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_runahead/throughput_gpu_runahead_part1.csv:
--------------------------------------------------------------------------------
1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP
2 | before-runahead-cc4_,4.675517,6.764452,57.299115,-2.0,22.255679,14.505548,1.137026,-1.0,-1.0,25.05346,-1.0,-1.0,0.700821,-1.0,-1.0,1.309869,-2.0,-2.0,-1.0,-1.0,6.853464,732.344044,440.55617,447.934965,448.066877,198.531099
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_runahead/throughput_gpu_runahead_part2.csv:
--------------------------------------------------------------------------------
1 | config,Snort,FileCarving,ClamAV
2 | before-runahead-cc4_,20.219294,-1.0,6.599365
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_runahead/throughput_gpu_runahead_part3.csv:
--------------------------------------------------------------------------------
1 | config,smallSnort,smallFileCarving,smallClamAV
2 | before-runahead-cc4_,20.244666,-1.0,6.586612
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_runahead_oneinput/throughput_gpu_runahead_oneinput_part1.csv:
--------------------------------------------------------------------------------
1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP
2 | before-runahead-cc4_,2.777037,2.580277,10.455674,0.076473,10.557436,8.137953,0.667989,-1.0,-1.0,13.212716,-1.0,-1.0,0.685931,-1.0,-1.0,1.286162,-2.0,-2.0,-1.0,-1.0,1.518275,39.057815,35.711346,36.786173,35.720696,11.857418
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_runahead_oneinput/throughput_gpu_runahead_oneinput_part2.csv:
--------------------------------------------------------------------------------
1 | config,Snort,FileCarving,ClamAV
2 | before-runahead-cc4_,4.291639,-1.0,1.046811
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_runahead_oneinput/throughput_gpu_runahead_oneinput_part3.csv:
--------------------------------------------------------------------------------
1 | config,smallSnort,smallFileCarving,smallClamAV
2 | before-runahead-cc4_,4.152439,-1.0,1.091001
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_sota_best/throughput_gpu_sota_part1.csv:
--------------------------------------------------------------------------------
1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP
2 | before-hotstart-nt_,6.79248,9.108471,41.064581,32.970172,4.109988,1.815733,4.773921,-1.0,-1.0,5.642327,-1.0,-1.0,1.23106,-1.0,-1.0,30.495631,3.566442,7.952852,-1.0,-1.0,3.534417,347.73229,283.197842,278.927813,276.3551,179.770678
3 | before-nfacg_,1.060613,0.740955,7.346126,9.281832,5.065893,2.545247,0.325797,-1.0,-1.0,4.91524,-1.0,-1.0,1.091527,-1.0,-1.0,4.751153,8.032496,3.114229,-1.0,-1.0,0.258865,143.187204,78.476178,77.617408,79.587591,31.312482
4 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_sota_best/throughput_gpu_sota_part3.csv:
--------------------------------------------------------------------------------
1 | config,smallSnort,smallFileCarving,smallClamAV
2 | before-hotstart-nt_,17.583161,-1.0,3.654265
3 | before-nfacg_,-1.0,-1.0,-1.0
4 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_sota_best_oneinput/throughput_gpu_sota_oneinput_part1.csv:
--------------------------------------------------------------------------------
1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP
2 | before-hotstart-nt_,0.494784,1.625315,1.105979,1.081417,0.43556,0.229399,1.061616,-1.0,-1.0,0.270078,-1.0,-1.0,0.207295,-1.0,-1.0,0.525129,2.025399,1.10874,-1.0,-1.0,0.786183,1.48677,1.741122,1.659348,1.656274,1.393163
3 | before-nfacg_,0.981108,0.776644,1.374135,1.576036,1.508545,1.164417,0.393835,-1.0,-1.0,1.373664,-1.0,-1.0,0.963854,-1.0,-1.0,1.776493,1.281999,1.745596,-1.0,-1.0,0.25054,1.68832,1.65701,1.600435,1.639692,1.656128
4 | 


--------------------------------------------------------------------------------
/ref_results/raw/throughput_gpu_sota_best_oneinput/throughput_gpu_sota_oneinput_part3.csv:
--------------------------------------------------------------------------------
1 | config,smallSnort,smallFileCarving,smallClamAV
2 | before-hotstart-nt_,0.533841,-1.0,1.118965
3 | before-nfacg_,-1.0,-1.0,-1.0
4 | 


--------------------------------------------------------------------------------
/ref_results/raw/v100/throughput_gpu_nap_best/throughput_gpu_napbest_part1.csv:
--------------------------------------------------------------------------------
1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP
2 | oa-nonblocking-all-best,6.09514,41.7723,1342.99,184.988,22.2398,4.41955,1.75018,-1.0,-1.0,4.18848,-1.0,-1.0,0.535905,-1.0,-1.0,94.5517,8.92899,4.2487,-1.0,-1.0,21.4083,8880.67,7569.57,6454.69,7033.73,2520.18
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/v100/throughput_gpu_nap_best/throughput_gpu_napbest_part2.csv:
--------------------------------------------------------------------------------
1 | config,Snort,FileCarving,ClamAV
2 | oa-nonblocking-all-best,49.6262,-1.0,3861.56
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/v100/throughput_gpu_nap_best/throughput_gpu_napbest_part3.csv:
--------------------------------------------------------------------------------
1 | config,smallSnort,smallFileCarving,smallClamAV
2 | oa-nonblocking-all-best,49.6452,-1.0,3971.3
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/v100/throughput_gpu_nap_default_adp/throughput_gpu_nap_default_part1.csv:
--------------------------------------------------------------------------------
1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP
2 | oa-nonblocking-default-best,4.01389,27.0318,804.678,124.093,15.3026,2.97133,1.7636,-1.0,-1.0,3.29583,-1.0,-1.0,0.215236,-1.0,-1.0,65.0104,8.53325,3.85484,-1.0,-1.0,17.0549,2249.96,2267.74,2144.67,1963.35,1245.33
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/v100/throughput_gpu_nap_default_adp/throughput_gpu_nap_default_part2.csv:
--------------------------------------------------------------------------------
1 | config,Snort,FileCarving,ClamAV
2 | oa-nonblocking-default-best,37.4139,-1.0,1307.5
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/v100/throughput_gpu_nap_default_adp/throughput_gpu_nap_default_part3.csv:
--------------------------------------------------------------------------------
1 | config,smallSnort,smallFileCarving,smallClamAV
2 | oa-nonblocking-default-best,43.9231,-1.0,1317.7
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/v100/throughput_gpu_runahead/throughput_gpu_runahead_part1.csv:
--------------------------------------------------------------------------------
1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP
2 | before-runahead-cc4_,3.512485,5.135253,51.660751,-2.0,17.724456,11.598823,1.073728,-1.0,-1.0,20.17649,-1.0,-1.0,0.344323,-1.0,-1.0,0.750779,-2.0,-2.0,-1.0,-1.0,6.102957,618.361264,368.831971,380.604693,380.06254,170.980646
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/v100/throughput_gpu_runahead/throughput_gpu_runahead_part2.csv:
--------------------------------------------------------------------------------
1 | config,Snort,FileCarving,ClamAV
2 | before-runahead-cc4_,16.878107,-1.0,5.934625
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/v100/throughput_gpu_runahead/throughput_gpu_runahead_part3.csv:
--------------------------------------------------------------------------------
1 | config,smallSnort,smallFileCarving,smallClamAV
2 | before-runahead-cc4_,16.852375,-1.0,5.933999
3 | 


--------------------------------------------------------------------------------
/ref_results/raw/v100/throughput_gpu_sota_best/throughput_gpu_sota_part1.csv:
--------------------------------------------------------------------------------
1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP
2 | before-hotstart-nt_,6.320081,7.294038,35.837599,27.213543,3.104396,1.333548,3.569331,-1.0,-1.0,6.09973,-1.0,-1.0,1.573924,-1.0,-1.0,24.977908,4.947722,7.165793,-1.0,-1.0,3.482569,328.55615,260.383913,244.330903,246.823898,160.138904
3 | before-nfacg_,0.951604,0.628419,6.646628,9.100413,4.689425,2.041655,0.270739,-1.0,-1.0,4.215701,-1.0,-1.0,0.908289,-1.0,-1.0,4.290967,5.552233,2.898934,-1.0,-1.0,0.188535,138.858827,77.602717,75.70653,78.003779,29.367805
4 | 


--------------------------------------------------------------------------------
/ref_results/raw/v100/throughput_gpu_sota_best/throughput_gpu_sota_part3.csv:
--------------------------------------------------------------------------------
1 | config,smallSnort,smallFileCarving,smallClamAV
2 | before-hotstart-nt_,15.630824,-1.0,3.525744
3 | before-nfacg_,-1.0,-1.0,-1.0
4 | 


--------------------------------------------------------------------------------
/ref_results/tab4_throughput.csv:
--------------------------------------------------------------------------------
 1 | App,HyperScan,NFA-CG,AsyncAP,GPU-NFA,NAP,NAP-Best
 2 | APR,T,8.0,T,3.6,8.4,9.4
 3 | Brill,1.2,1.1,4.7,6.8,8.6,9.5
 4 | CRP1,0.3,5.1,22.3,4.1,33.1,37.0
 5 | CRP2,T,2.5,14.5,1.8,7.3,8.5
 6 | CAV,327.0,U,6.6,U,2334.8,5596.9
 7 | ER,1.6,0.7,6.8,9.1,64.3,80.9
 8 | HM,T,4.9,25.1,5.6,7.5,7.9
 9 | LV,T,1.1,0.7,1.2,0.5,0.9
10 | Pro,1.9,4.8,1.3,30.5,141.7,158.0
11 | RF,0.4,0.3,1.1,4.8,1.8,1.8
12 | SM,2.9,3.1,T,8.0,4.5,4.6
13 | Snort,36.6,U,20.2,U,63.8,114.9
14 | YARA,W,0.3,6.9,3.5,50.4,52.8
15 | DS,196.0,7.3,57.3,41.1,1452.6,1612.1
16 | PEN,214.2,9.3,T,33.0,208.9,222.5
17 | Bro,W,143.2,732.3,347.7,3897.3,13452.8
18 | EM,W,78.5,440.6,283.2,3690.2,8947.4
19 | Ran1,W,79.6,448.1,276.4,3598.9,11461.3
20 | Ran5,W,77.6,447.9,278.9,3405.5,11743.4
21 | TCP,148.9,31.3,198.5,179.8,2037.3,3298.0
22 | 


--------------------------------------------------------------------------------
/ref_results/tab6_latency.csv:
--------------------------------------------------------------------------------
 1 | App,HyperScan,NFA-CG,AsyncAP,GPU-NFA,NAP,NAP-Best
 2 | APR,0.03,1.28,T,2.03,0.56,0.56
 3 | Brill,1.19,0.98,2.78,0.49,2.44,4.85
 4 | CRP1,0.32,1.51,10.56,0.44,7.84,22.34
 5 | CRP2,0.16,1.16,8.14,0.23,4.22,6.28
 6 | CAV,262.67,U,1.05,U,13.71,38.92
 7 | ER,1.6,0.78,2.58,1.63,13.17,36.63
 8 | HM,0.04,1.37,13.21,0.27,4.27,6.46
 9 | LV,0.02,0.96,0.69,0.21,0.62,0.72
10 | Pro,1.85,1.78,1.29,0.53,8.82,13.65
11 | RF,0.4,0.39,0.67,1.06,5.72,7.57
12 | SM,2.89,1.75,T,1.11,0.88,0.88
13 | Snort,31.54,U,4.29,U,1.36,1.83
14 | YARA,61.31,0.25,1.52,0.79,3.11,3.35
15 | DS,71.91,1.37,10.46,1.11,15.08,45.04
16 | PEN,84.03,1.58,0.08,1.08,1.11,1.16
17 | Bro,285.88,1.69,39.06,1.49,18.78,111.66
18 | EM,1177.86,1.66,35.71,1.74,18.84,133.28
19 | Ran1,138.93,1.64,35.72,1.66,18.87,126.42
20 | Ran5,195.16,1.6,36.79,1.66,18.84,117.47
21 | TCP,57.65,1.66,11.86,1.39,15.65,51.95
22 | 


--------------------------------------------------------------------------------
/scripts/gen-breakdown-fig14.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python  ${NGAP_ROOT}/scripts/plot_throughput_gpu_nap_breakdown.py
3 | 
4 | 


--------------------------------------------------------------------------------
/scripts/gen-latency-fig20tab6.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | python ${NGAP_ROOT}/scripts/plot_throughput_gpu_sota_oneinput.py
4 | python ${NGAP_ROOT}/scripts/table_throughput_oneinput.py
5 | 
6 | 


--------------------------------------------------------------------------------
/scripts/gen-throughput-fig13tab4.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | python ${NGAP_ROOT}/scripts/plot_throughput_gpu_sota.py
4 | 
5 | python ${NGAP_ROOT}/scripts/table_throughput.py
6 | 
7 | 


--------------------------------------------------------------------------------
/scripts/run-breakdown.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | time ${NGAP_ROOT}/scripts/run_throughput_NAP_breakdown.sh
4 | 


--------------------------------------------------------------------------------
/scripts/run-latency.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | time ${NGAP_ROOT}/scripts/run_throughput_gpu_sota_best_oneinput.sh
 5 | time ${NGAP_ROOT}/scripts/run_throughput_runahead_oneinput.sh
 6 | time ${NGAP_ROOT}/scripts/run_throughput_gpu_nap_defalut_oneinput.sh
 7 | time ${NGAP_ROOT}/scripts/run_throughput_gpu_nap_best_oneinput.sh
 8 | 
 9 | time ${NGAP_ROOT}/scripts/run_throughput_cpu_oneinput.sh
10 | 
11 | 


--------------------------------------------------------------------------------
/scripts/run-throughput.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | time ${NGAP_ROOT}/scripts/run_throughput_gpu_sota_best.sh # 3.5 hrs
 4 | time ${NGAP_ROOT}/scripts/run_throughput_runahead.sh # 3.5 hrs
 5 | time ${NGAP_ROOT}/scripts/run_throughput_gpu_nap_defalut.sh # 1 hrs
 6 | time ${NGAP_ROOT}/scripts/run_throughput_gpu_nap_best.sh # 1 hrs
 7 | 
 8 | time ${NGAP_ROOT}/scripts/run_throughput_cpu.sh  # 5.5 hrs
 9 | 
10 | 


--------------------------------------------------------------------------------
/scripts/run_experiments.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Launch one experiment batch and, if the launcher succeeds, collect its raw data.
 4 | #
 5 | # Usage: run_experiments.sh APP_SPEC EXEC_CONFIG [extra options...]
 6 | #   APP_SPEC     passed as -b to launch_exps.py and collect_results.py
 7 | #   EXEC_CONFIG  passed as -f to launch_exps.py and collect_results.py
 8 | #   extra        all remaining arguments are forwarded to both Python scripts
 9 | #
10 | # Works inside ../raw_results/exp-<timestamp> (created if missing).
11 | 
12 | DIR=$(cd `dirname $0`; pwd)
13 | cd ${DIR}
14 | 
15 | FOLDER="exp-`date "+%Y%m%d-%H%M%S"`"
16 | 
17 | cd ../raw_results
18 | if [ ! -d ${FOLDER} ]; then
19 |     mkdir ${FOLDER} && cd ${FOLDER}
20 | else
21 |     cd ${FOLDER}
22 | fi
23 | 
24 | cp ../../code/scripts/configs/* .
25 | 
26 | echo "Running Experiments... This will take several hours. "
27 | 
28 | APP_SPEC=$1
29 | EXEC_CONFIG=$2
30 | 
31 | python ../../code/scripts/launch_exps.py -b ${APP_SPEC} -f ${EXEC_CONFIG} -e --clean "${@:3}"
32 | # Save the launcher's exit status right away: every later command (including
33 | # the echo below) overwrites $?.
34 | LAUNCH_STATUS=$?
35 | 
36 | echo "Experiments finished. "
37 | 
38 | if [ ${LAUNCH_STATUS} -eq 0 ]; then
39 |     echo "Collecting experiment raw data."
40 |     python ../../code/scripts/collect_results.py -b ${APP_SPEC} -f ${EXEC_CONFIG} "${@:3}"
41 | else
42 |     echo "Experiments terminate abnormally. "
43 |     exit 1
44 | fi
45 | 
46 | cd ${NGAP_ROOT}
47 | 


--------------------------------------------------------------------------------
/scripts/run_throughput.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Run every (config, app) pair through run_experiments.sh, logging each run
 4 | # to ../raw_results/log/exp-<timestamp>.log.
 5 | #
 6 | # Environment:
 7 | #   CONFIGS  comma-separated list of exec config names (required)
 8 | #   APPS     comma-separated list of app spec names (required)
 9 | # All command-line arguments are forwarded unchanged to run_experiments.sh.
10 | 
11 | DIR=$(cd `dirname $0`; pwd)
12 | cd ${DIR}
13 | 
14 | if [ -z "${CONFIGS}" ] || [ -z "${APPS}" ]; then
15 |   echo "Either CONFIGS or APPS is empty"
16 |   exit 1
17 | fi
18 | 
19 | mkdir -p ../raw_results/log/
20 | 
21 | # Split the comma-separated lists with IFS scoped to each `read`, so the
22 | # separator change does not leak into the word splitting of later commands
23 | # (e.g. forwarded arguments that themselves contain commas).
24 | IFS=',' read -r -a configs_arr <<< "${CONFIGS}"
25 | IFS=',' read -r -a apps_arr <<< "${APPS}"
26 | 
27 | for config in "${configs_arr[@]}"; do
28 |     [ -n "${config}" ] || continue  # ignore empty fields such as "a,,b"
29 |     for app in "${apps_arr[@]}"; do
30 |         [ -n "${app}" ] || continue
31 |         LOG=../raw_results/log/"exp-`date "+%Y%m%d-%H%M%S"`.log"
32 |         ./run_experiments.sh "${app}" "${config}" "$@" 2>&1 | tee "${LOG}"
33 |     done
34 | done
35 | 


--------------------------------------------------------------------------------
/scripts/run_throughput_NAP_breakdown.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | DIR=$(cd `dirname $0`; pwd)
 4 | cd ${DIR}
 5 | 
 6 | VALI=--validation
 7 | 
 8 | mkdir -p ./results/raw/throughput_gpu_nap_breakdown
 9 | 
10 | # config: [NAP-Breakdown], apps: [part 1, part 2]
11 | APPS="app_spec_ngap_new_quickvalidation_part1" \
12 | CONFIGS="exec_config_ngap_groups_design_NAP" \
13 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
14 | ${VALI}  --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_breakdown/throughput_nap_breakdown_part1.csv
15 | 
16 | APPS="app_spec_ngap_new_quickvalidation_part2" \
17 | CONFIGS="exec_config_ngap_groups_design_NAP" \
18 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
19 | ${VALI}  --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_breakdown/throughput_nap_breakdown_part2.csv
20 | 
21 | # config: [NAP-Breakdown'], apps: [part 3]
22 | APPS="app_spec_ngap_new_quickvalidation_part3" \
23 | CONFIGS="exec_config_ngap_groups_design_NAP_4degree" \
24 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
25 | ${VALI}  --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_breakdown/throughput_nap_breakdown_part3.csv
26 | 
27 | 
28 | 
29 | 


--------------------------------------------------------------------------------
/scripts/run_throughput_cpu.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | DIR=$(cd `dirname $0`; pwd)
 4 | cd ${DIR}
 5 | 
 6 | # VALI=--validation
 7 | 
 8 | mkdir -p ./results/raw/throughput_cpu
 9 | 
10 | # config: [cpu], apps: [part 1]
11 | APPS="app_spec_ngap_new_quickvalidation_part1" \
12 | CONFIGS="exec_config_ngap_groups_design_cpu" \
13 | ./run_throughput.sh  --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
14 | ${VALI}  --timeout-mins=60 \
15 | --csvdest=./results/raw/throughput_cpu/throughput_cpu_part1.csv
16 | 
17 | # config: [cpu], apps: [part 2]
18 | APPS="app_spec_ngap_new_quickvalidation_part2" \
19 | CONFIGS="exec_config_ngap_groups_design_cpu" \
20 | ./run_throughput.sh  --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
21 | ${VALI}  --timeout-mins=60 \
22 | --csvdest=./results/raw/throughput_cpu/throughput_cpu_part2.csv


--------------------------------------------------------------------------------
/scripts/run_throughput_cpu_oneinput.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # CPU runs with the "oneinput" exec config for app parts 1 and 2.
 4 | # Each run is delegated to run_throughput.sh through the APPS/CONFIGS variables.
 5 | 
 6 | DIR=$(cd `dirname $0`; pwd)
 7 | cd ${DIR}
 8 | 
 9 | # VALI=--validation
10 | 
11 | # Use the same ./results/raw prefix as the --csvdest options below (the
12 | # original created ../results/raw/... instead, unlike every sibling script).
13 | # NOTE(review): confirm where --csvdest is resolved by collect_results.py.
14 | mkdir -p ./results/raw/throughput_cpu_oneinput
15 | 
16 | # config: [cpu], apps: [part 1]
17 | APPS="app_spec_ngap_new_quickvalidation_part1" \
18 | CONFIGS="exec_config_ngap_groups_design_cpu_oneinput" \
19 | ./run_throughput.sh  --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
20 | ${VALI}  --timeout-mins=60 \
21 | --csvdest=./results/raw/throughput_cpu_oneinput/throughput_cpu_part1.csv
22 | 
23 | # config: [cpu], apps: [part 2]
24 | APPS="app_spec_ngap_new_quickvalidation_part2" \
25 | CONFIGS="exec_config_ngap_groups_design_cpu_oneinput" \
26 | ./run_throughput.sh  --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
27 | ${VALI}  --timeout-mins=60 \
28 | --csvdest=./results/raw/throughput_cpu_oneinput/throughput_cpu_part2.csv


--------------------------------------------------------------------------------
/scripts/run_throughput_gpu_nap_best.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | DIR=$(cd `dirname $0`; pwd)
 4 | cd ${DIR}
 5 | 
 6 | VALI=--validation
 7 | 
 8 | mkdir -p ./results/raw/throughput_gpu_nap_best
 9 | 
10 | # config: [NAP-Best], apps: [part 1, part 2]
11 | APPS="app_spec_ngap_new_quickvalidation_part1" \
12 | CONFIGS="exec_config_ngap_groups_best" \
13 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
14 | ${VALI}  --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_best/throughput_gpu_napbest_part1.csv
15 | 
16 | APPS="app_spec_ngap_new_quickvalidation_part2" \
17 | CONFIGS="exec_config_ngap_groups_best" \
18 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
19 | ${VALI}  --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_best/throughput_gpu_napbest_part2.csv
20 | 
21 | # config: [NAP-Best'], apps: [part 3]
22 | APPS="app_spec_ngap_new_quickvalidation_part3" \
23 | CONFIGS="exec_config_ngap_groups_best_4degree" \
24 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
25 | ${VALI}  --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_best/throughput_gpu_napbest_part3.csv


--------------------------------------------------------------------------------
/scripts/run_throughput_gpu_nap_best_oneinput.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | DIR=$(cd `dirname $0`; pwd)
 4 | cd ${DIR}
 5 | 
 6 | VALI=--validation
 7 | 
 8 | mkdir -p ./results/raw/throughput_gpu_nap_best_oneinput
 9 | 
10 | # config: [NAP-Best], apps: [part 1, part 2]
11 | APPS="app_spec_ngap_new_quickvalidation_part1" \
12 | CONFIGS="exec_config_ngap_groups_best_oneinput" \
13 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
14 | ${VALI}  --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_best_oneinput/throughput_gpu_napbest_oneinput_part1.csv
15 | 
16 | APPS="app_spec_ngap_new_quickvalidation_part2" \
17 | CONFIGS="exec_config_ngap_groups_best_oneinput" \
18 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
19 | ${VALI}  --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_best_oneinput/throughput_gpu_napbest_oneinput_part2.csv
20 | 
21 | # config: [NAP-Best'], apps: [part 3]
22 | APPS="app_spec_ngap_new_quickvalidation_part3" \
23 | CONFIGS="exec_config_ngap_groups_best_4degree_oneinput" \
24 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
25 | ${VALI}  --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_best_oneinput/throughput_gpu_napbest_oneinput_part3.csv


--------------------------------------------------------------------------------
/scripts/run_throughput_gpu_nap_defalut.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | DIR=$(cd `dirname $0`; pwd)
 4 | cd ${DIR}
 5 | 
 6 | VALI=--validation
 7 | 
 8 | mkdir -p ./results/raw/throughput_gpu_nap_default_adp
 9 | 
10 | # config: [NAP-default], apps: [part 1, part 2]
11 | APPS="app_spec_ngap_new_quickvalidation_part1" \
12 | CONFIGS="exec_config_ngap_groups_nap_default" \
13 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
14 | ${VALI}  --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_default_adp/throughput_gpu_nap_default_part1.csv
15 | 
16 | APPS="app_spec_ngap_new_quickvalidation_part2" \
17 | CONFIGS="exec_config_ngap_groups_nap_default" \
18 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
19 | ${VALI}  --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_default_adp/throughput_gpu_nap_default_part2.csv
20 | 
21 | # config: [NAP-default'], apps: [part 3]
22 | APPS="app_spec_ngap_new_quickvalidation_part3" \
23 | CONFIGS="exec_config_ngap_groups_nap_default_4degree" \
24 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
25 | ${VALI}  --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_default_adp/throughput_gpu_nap_default_part3.csv


--------------------------------------------------------------------------------
/scripts/run_throughput_gpu_nap_defalut_oneinput.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | DIR=$(cd `dirname $0`; pwd)
 4 | cd ${DIR}
 5 | 
 6 | VALI=--validation
 7 | 
 8 | mkdir -p ./results/raw/throughput_gpu_nap_default_adp_oneinput
 9 | 
10 | # config: [NAP-default], apps: [part 1, part 2]
11 | APPS="app_spec_ngap_new_quickvalidation_part1" \
12 | CONFIGS="exec_config_ngap_groups_nap_default_oneinput" \
13 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
14 | ${VALI}  --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_default_adp_oneinput/throughput_gpu_nap_default_oneinput_part1.csv
15 | 
16 | APPS="app_spec_ngap_new_quickvalidation_part2" \
17 | CONFIGS="exec_config_ngap_groups_nap_default_oneinput" \
18 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
19 | ${VALI}  --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_default_adp_oneinput/throughput_gpu_nap_default_oneinput_part2.csv
20 | 
21 | # config: [NAP-default'], apps: [part 3]
22 | APPS="app_spec_ngap_new_quickvalidation_part3" \
23 | CONFIGS="exec_config_ngap_groups_nap_default_4degree_oneinput" \
24 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
25 | ${VALI}  --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_default_adp_oneinput/throughput_gpu_nap_default_oneinput_part3.csv


--------------------------------------------------------------------------------
/scripts/run_throughput_gpu_sota_best.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | DIR=$(cd `dirname $0`; pwd)
 4 | cd ${DIR}
 5 | 
 6 | VALI=--validation
 7 | 
 8 | mkdir -p ./results/raw/throughput_gpu_sota_best
 9 | # config: [sota], apps: [part 1]
10 | APPS="app_spec_ngap_new_quickvalidation_part1" \
11 | CONFIGS="exec_config_ngap_groups_design_sota" \
12 | ./run_throughput.sh  --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
13 | ${VALI}  --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_sota_best/throughput_gpu_sota_part1.csv
14 | 
15 | # config: [sota], apps: [part 3]
16 | APPS="app_spec_ngap_new_quickvalidation_part3" \
17 | CONFIGS="exec_config_ngap_groups_design_sota_4degree" \
18 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
19 | ${VALI}  --timeout-mins=60  --csvdest=./results/raw/throughput_gpu_sota_best/throughput_gpu_sota_part3.csv


--------------------------------------------------------------------------------
/scripts/run_throughput_gpu_sota_best_oneinput.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | DIR=$(cd `dirname $0`; pwd)
 4 | cd ${DIR}
 5 | 
 6 | VALI=--validation
 7 | 
 8 | mkdir -p ./results/raw/throughput_gpu_sota_best_oneinput
 9 | # config: [sota], apps: [part 1]
10 | APPS="app_spec_ngap_new_quickvalidation_part1" \
11 | CONFIGS="exec_config_ngap_groups_design_sota_oneinput" \
12 | ./run_throughput.sh  --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
13 | ${VALI}  --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_sota_best_oneinput/throughput_gpu_sota_oneinput_part1.csv
14 | 
15 | # config: [sota], apps: [part 3]
16 | APPS="app_spec_ngap_new_quickvalidation_part3" \
17 | CONFIGS="exec_config_ngap_groups_design_sota_4degree_oneinput" \
18 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
19 | ${VALI}  --timeout-mins=60  --csvdest=./results/raw/throughput_gpu_sota_best_oneinput/throughput_gpu_sota_oneinput_part3.csv


--------------------------------------------------------------------------------
/scripts/run_throughput_runahead.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | DIR=$(cd `dirname $0`; pwd)
 4 | cd ${DIR}
 5 | 
 6 | # VALI=--validation
 7 | 
 8 | mkdir -p ./results/raw/throughput_gpu_runahead
 9 | 
10 | # config: [runahead], apps: [part 1]
11 | APPS="app_spec_ngap_new_quickvalidation_part1" \
12 | CONFIGS="exec_config_ngap_groups_design_sota_runahead" \
13 | ./run_throughput.sh  --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
14 | ${VALI}  --timeout-mins=60 \
15 | --csvdest=./results/raw/throughput_gpu_runahead/throughput_gpu_runahead_part1.csv
16 | 
17 | # config: [runahead], apps: [part 2]
18 | APPS="app_spec_ngap_new_quickvalidation_part2" \
19 | CONFIGS="exec_config_ngap_groups_design_sota_runahead" \
20 | ./run_throughput.sh  --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
21 | ${VALI}  --timeout-mins=60 \
22 | --csvdest=./results/raw/throughput_gpu_runahead/throughput_gpu_runahead_part2.csv
23 | 
24 | 
25 | # config: [runahead'], apps: [part 3]
26 | APPS="app_spec_ngap_new_quickvalidation_part3" \
27 | CONFIGS="exec_config_ngap_groups_design_sota_runahead_4degree" \
28 | ./run_throughput.sh  --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
29 | ${VALI} --timeout-mins=60 \
30 | --csvdest=./results/raw/throughput_gpu_runahead/throughput_gpu_runahead_part3.csv
31 | 


--------------------------------------------------------------------------------
/scripts/run_throughput_runahead_oneinput.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | DIR=$(cd `dirname $0`; pwd)
 4 | cd ${DIR}
 5 | 
 6 | # VALI=--validation
 7 | 
 8 | 
 9 | mkdir -p ./results/raw/throughput_gpu_runahead_oneinput
10 | 
11 | # config: [runahead], apps: [part 1]
12 | APPS="app_spec_ngap_new_quickvalidation_part1" \
13 | CONFIGS="exec_config_ngap_groups_design_sota_runahead_oneinput" \
14 | ./run_throughput.sh  --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
15 | ${VALI}  --timeout-mins=60 \
16 | --csvdest=./results/raw/throughput_gpu_runahead_oneinput/throughput_gpu_runahead_oneinput_part1.csv
17 | 
18 | # config: [runahead], apps: [part 2]
19 | APPS="app_spec_ngap_new_quickvalidation_part2" \
20 | CONFIGS="exec_config_ngap_groups_design_sota_runahead_oneinput" \
21 | ./run_throughput.sh  --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
22 | ${VALI}  --timeout-mins=60 \
23 | --csvdest=./results/raw/throughput_gpu_runahead_oneinput/throughput_gpu_runahead_oneinput_part2.csv
24 | 
25 | 
26 | # config: [runahead'], apps: [part 3]
27 | APPS="app_spec_ngap_new_quickvalidation_part3" \
28 | CONFIGS="exec_config_ngap_groups_design_sota_runahead_4degree_oneinput" \
29 | ./run_throughput.sh  --keywords=../../code/scripts/collect_keyword_list_throughput.txt \
30 | ${VALI} --timeout-mins=60 \
31 | --csvdest=./results/raw/throughput_gpu_runahead_oneinput/throughput_gpu_runahead_oneinput_part3.csv
32 | 


--------------------------------------------------------------------------------
/small_dataset/apple.anml:
--------------------------------------------------------------------------------
 1 | <anml version="1.0"  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
 2 |   <automata-network id="temp">
 3 |     <state-transition-element id="__41__" symbol-set="[a]" start="all-input">
 4 |       <activate-on-match element="__42__"/>
 5 |     </state-transition-element>
 6 | 
 7 |     <state-transition-element id="__42__" symbol-set="[p]">
 8 |       <activate-on-match element="__43__"/>
 9 |     </state-transition-element>
10 | 
11 |     <state-transition-element id="__43__" symbol-set="[p]">
12 |       <activate-on-match element="__44__"/>
13 |     </state-transition-element>
14 | 
15 |     <state-transition-element id="__44__" symbol-set="[l]">
16 |       <activate-on-match element="__45__"/>
17 |     </state-transition-element>
18 | 
19 |     <state-transition-element id="__45__" symbol-set="[e]">
20 |        <report-on-match reportcode="2019"/>
21 |     </state-transition-element>
22 | 
23 |   </automata-network>
24 | </anml>
25 | 


--------------------------------------------------------------------------------
/small_dataset/inputstream.txt:
--------------------------------------------------------------------------------
1 | aappleaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzzappletoptoplslslsappleappappapplezzzz
2 | 


--------------------------------------------------------------------------------