├── .gitignore ├── .gitmodules ├── 1_download_benchmark.sh ├── 2_build_docker.sh ├── 3_launch_docker.sh ├── 4_build_all.sh ├── 5_run_all.sh ├── 6_gen_all.sh ├── LICENSE ├── README.md ├── clean.sh ├── code ├── CMakeLists.txt ├── include │ ├── clara │ │ ├── clara.hpp │ │ └── clara_textflow.hpp │ ├── commons │ │ ├── NFA.h │ │ ├── NFALoader.h │ │ ├── SymbolStream.h │ │ ├── common_func.h │ │ ├── compatible_group_helper.h │ │ ├── device_intrinsics.h │ │ ├── graph.h │ │ ├── graph_helper.h │ │ ├── group_graph.h │ │ ├── my_bitset.h │ │ ├── nfa_utils.h │ │ ├── node.h │ │ ├── precompute_table.h │ │ ├── report_formatter.h │ │ ├── validate.h │ │ └── vasim_helper.h │ ├── gpunfautils │ │ ├── abstract_gpunfa.h │ │ ├── array2.h │ │ ├── common.h │ │ └── utils.h │ ├── moderngpu │ │ ├── context.hxx │ │ ├── cpp11.hxx │ │ ├── cta_load_balance.hxx │ │ ├── cta_merge.hxx │ │ ├── cta_mergesort.hxx │ │ ├── cta_reduce.hxx │ │ ├── cta_scan.hxx │ │ ├── cta_search.hxx │ │ ├── cta_segscan.hxx │ │ ├── cta_segsort.hxx │ │ ├── intrinsics.hxx │ │ ├── kernel_bulkinsert.hxx │ │ ├── kernel_bulkremove.hxx │ │ ├── kernel_compact.hxx │ │ ├── kernel_intervalmove.hxx │ │ ├── kernel_join.hxx │ │ ├── kernel_load_balance.hxx │ │ ├── kernel_merge.hxx │ │ ├── kernel_mergesort.hxx │ │ ├── kernel_reduce.hxx │ │ ├── kernel_scan.hxx │ │ ├── kernel_segreduce.hxx │ │ ├── kernel_segsort.hxx │ │ ├── kernel_sortedsearch.hxx │ │ ├── kernel_workcreate.hxx │ │ ├── launch_box.hxx │ │ ├── launch_params.hxx │ │ ├── loadstore.hxx │ │ ├── memory.hxx │ │ ├── meta.hxx │ │ ├── operators.hxx │ │ ├── search.hxx │ │ ├── sort_networks.hxx │ │ ├── transform.hxx │ │ ├── tuple.hxx │ │ ├── types.hxx │ │ └── util.hxx │ └── pugixml │ │ ├── pugiconfig.hpp │ │ ├── pugixml.cpp │ │ └── pugixml.hpp ├── scripts │ ├── collect_keyword_list_throughput.txt │ ├── collect_results.py │ ├── configs │ │ ├── app_spec_ngap_new │ │ ├── app_spec_ngap_new_quickvalidation │ │ ├── app_spec_ngap_new_quickvalidation_part1 │ │ ├── 
app_spec_ngap_new_quickvalidation_part2 │ │ ├── app_spec_ngap_new_quickvalidation_part3 │ │ ├── exec_config_ngap_groups_best │ │ ├── exec_config_ngap_groups_best_4degree │ │ ├── exec_config_ngap_groups_best_4degree_oneinput │ │ ├── exec_config_ngap_groups_best_oneinput │ │ ├── exec_config_ngap_groups_design_NAP │ │ ├── exec_config_ngap_groups_design_NAP_4degree │ │ ├── exec_config_ngap_groups_design_cpu │ │ ├── exec_config_ngap_groups_design_cpu_oneinput │ │ ├── exec_config_ngap_groups_design_sota │ │ ├── exec_config_ngap_groups_design_sota_4degree │ │ ├── exec_config_ngap_groups_design_sota_4degree_oneinput │ │ ├── exec_config_ngap_groups_design_sota_oneinput │ │ ├── exec_config_ngap_groups_design_sota_runahead │ │ ├── exec_config_ngap_groups_design_sota_runahead_4degree │ │ ├── exec_config_ngap_groups_design_sota_runahead_4degree_oneinput │ │ ├── exec_config_ngap_groups_design_sota_runahead_oneinput │ │ ├── exec_config_ngap_groups_nap_default │ │ ├── exec_config_ngap_groups_nap_default_4degree │ │ ├── exec_config_ngap_groups_nap_default_4degree_oneinput │ │ └── exec_config_ngap_groups_nap_default_oneinput │ ├── launch_exps.py │ └── llcommons.py └── src │ ├── asyncap │ ├── CMakeLists.txt │ ├── Makefile │ ├── include │ │ ├── run_ahead_approach.h │ │ ├── run_ahead_kernels.h │ │ └── scan_kernels.h │ └── src │ │ ├── main.cu │ │ └── run_ahead_approach.cu │ ├── commons │ ├── NFA.cpp │ ├── NFALoader.cpp │ ├── SymbolStream.cpp │ ├── common_func.cpp │ ├── compatible_group_helper.cpp │ ├── graph.cu │ ├── graph_helper.cpp │ ├── nfa_utils.cpp │ ├── node.cpp │ ├── precompute_table.cu │ ├── report_formatter.cpp │ ├── validate.cpp │ └── vasim_helper.cpp │ ├── gpunfautils │ ├── abstract_gpunfa.cu │ ├── common.cpp │ └── utils.cu │ ├── infant │ ├── device_funcs.h │ ├── infant.cu │ ├── infant.h │ ├── infant_config.h │ ├── infant_kernels.cu │ ├── infant_kernels.h │ └── main.cu │ ├── ngap │ ├── kernel.h │ ├── kernel_bap.cu │ ├── kernel_helper.h │ ├── kernel_ngap_O0.cu │ ├── 
kernel_ngap_O1.cu │ ├── kernel_ngap_O3.cu │ ├── kernel_ngap_O4.cu │ ├── kernel_ngap_OA.cu │ ├── main.cu │ ├── ngap.cu │ ├── ngap.h │ ├── ngap_buffer.cu │ ├── ngap_buffer.h │ └── ngap_option.h │ ├── obat │ ├── Makefile │ ├── main.cu │ ├── one_byte_a_time_kernels.h │ ├── one_byte_at_a_time.cu │ ├── one_byte_at_a_time.h │ └── option_config.h │ └── ppopp12 │ ├── main.cu │ ├── ppopp12.cu │ ├── ppopp12.h │ ├── ppopp12_kernels.cu │ ├── ppopp12_kernels.h │ └── ppopp12_option.h ├── docker └── Dockerfile ├── env.sh ├── ref_results ├── fig13_throughput.pdf ├── fig14_breakdown.pdf ├── fig20_latency.pdf ├── raw │ ├── throughput_cpu │ │ ├── throughput_cpu_part1.csv │ │ └── throughput_cpu_part2.csv │ ├── throughput_cpu_oneinput │ │ ├── throughput_cpu_part1.csv │ │ └── throughput_cpu_part2.csv │ ├── throughput_gpu_nap_best │ │ ├── throughput_gpu_napbest_part1.csv │ │ ├── throughput_gpu_napbest_part2.csv │ │ └── throughput_gpu_napbest_part3.csv │ ├── throughput_gpu_nap_best_oneinput │ │ ├── throughput_gpu_napbest_oneinput_part1.csv │ │ ├── throughput_gpu_napbest_oneinput_part2.csv │ │ └── throughput_gpu_napbest_oneinput_part3.csv │ ├── throughput_gpu_nap_breakdown │ │ ├── throughput_nap_breakdown_part1.csv │ │ ├── throughput_nap_breakdown_part2.csv │ │ └── throughput_nap_breakdown_part3.csv │ ├── throughput_gpu_nap_default_adp │ │ ├── throughput_gpu_nap_default_part1.csv │ │ ├── throughput_gpu_nap_default_part2.csv │ │ └── throughput_gpu_nap_default_part3.csv │ ├── throughput_gpu_nap_default_adp_oneinput │ │ ├── throughput_gpu_nap_default_oneinput_part1.csv │ │ ├── throughput_gpu_nap_default_oneinput_part2.csv │ │ └── throughput_gpu_nap_default_oneinput_part3.csv │ ├── throughput_gpu_runahead │ │ ├── throughput_gpu_runahead_part1.csv │ │ ├── throughput_gpu_runahead_part2.csv │ │ └── throughput_gpu_runahead_part3.csv │ ├── throughput_gpu_runahead_oneinput │ │ ├── throughput_gpu_runahead_oneinput_part1.csv │ │ ├── throughput_gpu_runahead_oneinput_part2.csv │ │ └── 
throughput_gpu_runahead_oneinput_part3.csv │ ├── throughput_gpu_sota_best │ │ ├── throughput_gpu_sota_part1.csv │ │ └── throughput_gpu_sota_part3.csv │ ├── throughput_gpu_sota_best_oneinput │ │ ├── throughput_gpu_sota_oneinput_part1.csv │ │ └── throughput_gpu_sota_oneinput_part3.csv │ └── v100 │ │ ├── throughput_gpu_nap_best │ │ ├── throughput_gpu_napbest_part1.csv │ │ ├── throughput_gpu_napbest_part2.csv │ │ └── throughput_gpu_napbest_part3.csv │ │ ├── throughput_gpu_nap_default_adp │ │ ├── throughput_gpu_nap_default_part1.csv │ │ ├── throughput_gpu_nap_default_part2.csv │ │ └── throughput_gpu_nap_default_part3.csv │ │ ├── throughput_gpu_runahead │ │ ├── throughput_gpu_runahead_part1.csv │ │ ├── throughput_gpu_runahead_part2.csv │ │ └── throughput_gpu_runahead_part3.csv │ │ └── throughput_gpu_sota_best │ │ ├── throughput_gpu_sota_part1.csv │ │ └── throughput_gpu_sota_part3.csv ├── tab4_throughput.csv └── tab6_latency.csv ├── scripts ├── dict_config.py ├── gen-breakdown-fig14.sh ├── gen-latency-fig20tab6.sh ├── gen-throughput-fig13tab4.sh ├── plot_throughput_gpu_nap_breakdown.py ├── plot_throughput_gpu_sota.py ├── plot_throughput_gpu_sota_oneinput.py ├── run-breakdown.sh ├── run-latency.sh ├── run-throughput.sh ├── run_experiments.sh ├── run_throughput.sh ├── run_throughput_NAP_breakdown.sh ├── run_throughput_cpu.sh ├── run_throughput_cpu_oneinput.sh ├── run_throughput_gpu_nap_best.sh ├── run_throughput_gpu_nap_best_oneinput.sh ├── run_throughput_gpu_nap_defalut.sh ├── run_throughput_gpu_nap_defalut_oneinput.sh ├── run_throughput_gpu_sota_best.sh ├── run_throughput_gpu_sota_best_oneinput.sh ├── run_throughput_runahead.sh ├── run_throughput_runahead_oneinput.sh ├── table_throughput.py └── table_throughput_oneinput.py └── small_dataset ├── apple.anml └── inputstream.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.pyc 3 | code/build 4 | code/log 5 | code/results 6 | .vscode 7 | raw_results/* 8 | results/* 
#!/bin/bash
#
# Download the original automata benchmark archive into ${NGAP_ROOT} and
# unpack it there.

# Stop at the first failing command so that a failed or partial download is
# never handed to tar, and treat unset variables as errors.
set -euo pipefail

# An unset/empty NGAP_ROOT would turn the `cd` below into a plain `cd`, which
# switches to $HOME and downloads the archive into the wrong place.
: "${NGAP_ROOT:?NGAP_ROOT is not set}"

cd "${NGAP_ROOT}"
wget https://hkustgz-my.sharepoint.com/:u:/g/personal/tge601_connect_hkust-gz_edu_cn/EbRBcgYV7Z1KrGLk56PjswsBAmdDwfen2zdXTknP5owEAg\?e\=5bWc4W\&download=1 -O automata_benchmark_original.tar.gz
tar -zxvf automata_benchmark_original.tar.gz
# Build description for the GPU automata engines (obat, ppopp12, ngap) and the
# two support libraries they link against (gpunfacommons, gpunfautils).
# asyncap is built through its own Makefile, driven by a custom target below.
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
enable_language(CXX CUDA)

project(GPUNFA2019)
find_package(CUDA REQUIRED)
find_package(OpenMP REQUIRED)
find_package(TBB REQUIRED COMPONENTS tbb)
if (OPENMP_FOUND)
    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
    set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
endif()

# Libraries go to <build>/lib, executables to <build>/bin.
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
# nvcc flags used by the cuda_add_* commands below; the target GPU architecture
# is hard-coded to sm_86 here (alternatives are kept commented out).
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++17 -lineinfo --expt-extended-lambda -O3 -Wno-deprecated-gpu-targets -arch=sm_86 --keep")
# set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++17 -lineinfo --expt-extended-lambda -O3 -Wno-deprecated-gpu-targets -arch=sm_70 --keep")
# set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++17 -lineinfo --expt-extended-lambda -g -G -Wno-deprecated-gpu-targets -arch=sm_86")
# -arch=sm_86

if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
  set(CMAKE_CUDA_ARCHITECTURES 86)
endif()

add_definitions(-DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT)

include_directories(.)
include_directories(include)
include_directories(${CUDA_INCLUDE_DIRS})

# Shared host-side library: everything under src/commons.
file(GLOB_RECURSE SOURCES RELATIVE ${CMAKE_SOURCE_DIR} "src/commons/*")
add_library(gpunfacommons ${SOURCES})
target_link_libraries(gpunfacommons tbb)
target_include_directories(gpunfacommons PRIVATE include/commons include/pugixml )


# Shared CUDA library: everything under src/gpunfautils.
file(GLOB_RECURSE SOURCES1 RELATIVE ${CMAKE_SOURCE_DIR} "src/gpunfautils/*")
cuda_add_library(gpunfautils ${SOURCES1})
target_include_directories(gpunfautils PRIVATE include/commons include/gpunfautils)
set_property(TARGET gpunfautils PROPERTY CUDA_STANDARD 17)


# CUDA Projects
# obat
SET(PROJ "obat")
file(GLOB_RECURSE SOURCES2 RELATIVE ${CMAKE_SOURCE_DIR} "src/${PROJ}/*")
cuda_add_executable(${PROJ} ${SOURCES2})
set_property(TARGET ${PROJ} PROPERTY CUDA_STANDARD 11)
target_link_libraries(${PROJ} gpunfacommons gpunfautils)
target_include_directories(${PROJ} PRIVATE include/commons;include/gpunfautils)

# # infant
# file(GLOB_RECURSE SOURCES_INFANT RELATIVE ${CMAKE_SOURCE_DIR} "src/infant/*")
# cuda_add_executable(infant ${SOURCES_INFANT})
# set_property(TARGET infant PROPERTY CUDA_STANDARD 11)
# target_link_libraries(infant gpunfacommons gpunfautils)

# ppopp12
file(GLOB_RECURSE SOURCES_PPOPP RELATIVE ${CMAKE_SOURCE_DIR} "src/ppopp12/*")
cuda_add_executable(ppopp12 ${SOURCES_PPOPP})
set_property(TARGET ppopp12 PROPERTY CUDA_STANDARD 11)
target_link_libraries(ppopp12 gpunfacommons gpunfautils)
target_include_directories(ppopp12 PRIVATE include/commons;include/gpunfautils)


# Optional buffer-size overrides, e.g. `cmake -DDATA_BUFFER_SIZE=<n> ..`.
# add_definitions() is used rather than add_compile_definitions() because the
# latter only exists since CMake 3.12, while this file declares 3.8 as its
# minimum; both end up in the directory's COMPILE_DEFINITIONS.
if(DEFINED DATA_BUFFER_SIZE)
    message("DATA_BUFFER_SIZE is defined to ${DATA_BUFFER_SIZE}")
    add_definitions(-DDATA_BUFFER_SIZE=${DATA_BUFFER_SIZE})
endif()
if(DEFINED DATA_BUFFER_SIZE_FRONTIER)
    message("DATA_BUFFER_SIZE_FRONTIER is defined to ${DATA_BUFFER_SIZE_FRONTIER}")
    add_definitions(-DDATA_BUFFER_SIZE_FRONTIER=${DATA_BUFFER_SIZE_FRONTIER})
endif()
if(DEFINED RESULTS_SIZE)
    message("RESULTS_SIZE is defined to ${RESULTS_SIZE}")
    add_definitions(-DRESULTS_SIZE=${RESULTS_SIZE})
endif()


# ngap
# Uses its own source-list variable instead of overwriting the ppopp12 one.
file(GLOB_RECURSE SOURCES_NGAP RELATIVE ${CMAKE_SOURCE_DIR} "src/ngap/*")
cuda_add_executable(ngap ${SOURCES_NGAP})
set_property(TARGET ngap PROPERTY CUDA_STANDARD 11)
target_link_libraries(ngap gpunfacommons gpunfautils)
target_include_directories(ngap PRIVATE include/commons;include/gpunfautils)

# asyncap
# Built by invoking the Makefile in src/asyncap on every build (ALL), after
# the two support libraries have been built.
add_subdirectory(src/asyncap)
add_custom_target(asyncap ALL
    COMMAND make -C ${CMAKE_CURRENT_SOURCE_DIR}/src/asyncap all
    COMMENT "Running Makefile in asyncap"
)
add_dependencies(asyncap gpunfautils gpunfacommons)
set_property(
    TARGET asyncap
    APPEND
    PROPERTY ADDITIONAL_CLEAN_FILES
    ${CMAKE_CURRENT_SOURCE_DIR}/src/asyncap/bin
    ${CMAKE_CURRENT_SOURCE_DIR}/build/bin/asyncap
)
addEdge(string from_str_id, string to_str_id); 46 | 47 | int size() const; 48 | int edge_size() const; 49 | int always_active_nodes_num; 50 | int start_active_nodes_num; 51 | 52 | void mark_cc_id(); 53 | int get_num_cc() const; 54 | 55 | Node* get_node_by_int_id(int iid) const; 56 | Node* get_node_by_str_id(string sid) const; 57 | 58 | int get_num_transitions() const; 59 | 60 | int get_int_id_by_str_id(string str_id) const; 61 | 62 | void print(); 63 | 64 | void calc_scc(); 65 | void topo_sort(); 66 | 67 | int get_num_scc() const; 68 | 69 | vector get_nodes_by_original_id(string original_id) const; 70 | 71 | vector get_adj(string str) const; 72 | vector get_from(string str_id) const; 73 | 74 | int get_indegree_of_node(string str_id) const; 75 | int get_outdegree_of_node(string str_id) const; 76 | 77 | bool has_node(string str) const; 78 | bool has_node(int int_id) const; 79 | 80 | void to_dot_file(string dotfile) const; 81 | 82 | /** return the removed node's intid 83 | 84 | This function must be called followed by an addNode(Node *n, int intid); 85 | where the intid is the previous one. 86 | or there will be an inconsistency. 
87 | 88 | 89 | **/ 90 | Node remove_node_unsafe(string str_id); 91 | 92 | void remove_edge(string from_node, string to_node); 93 | 94 | int get_num_topoorder() const; 95 | 96 | set get_alphabet_in_nfa_wo_wildcard() const; 97 | 98 | set get_alphabet_in_nodes_wo_wildcard_wo_nottype() const; 99 | 100 | int get_num_states_leq_topo(int topo); 101 | 102 | int get_dag(); 103 | 104 | bool has_self_loop(int sid) const; 105 | bool has_self_loop(string str_id) const; 106 | 107 | void remove_self_loop(int sid); 108 | void remove_self_loop(string str_id); 109 | 110 | int has_self_loop_plus_large_matchset() const; 111 | unordered_map > adj; 112 | 113 | 114 | private: 115 | 116 | // ----- for separate CCs ------------------------------- 117 | void calc_bidirected_graph(); 118 | void clear_visit_flag(); 119 | void dfs(int start_iid, int cc_id); 120 | 121 | unordered_map > from_node; 122 | 123 | unordered_map strid_to_intid; 124 | 125 | unordered_map node_pool; 126 | 127 | int V; // n nodes; 128 | int E; 129 | 130 | unordered_map> bi_directed_eq_graph; 131 | int num_cc; 132 | 133 | 134 | unordered_map > original_id_to_nodes; 135 | 136 | }; 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | #endif /* NFA_H_ */ 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /code/include/commons/NFALoader.h: -------------------------------------------------------------------------------- 1 | /* 2 | * NFALoader.h 3 | * 4 | * Created on: Apr 29, 2018 5 | * Author: hyliu 6 | */ 7 | 8 | #ifndef NFALOADER_H_ 9 | #define NFALOADER_H_ 10 | 11 | #include 12 | #include "NFA.h" 13 | #include 14 | #include "pugixml/pugixml.hpp" 15 | //#include "mnrl.hpp" 16 | 17 | using std::string; 18 | 19 | NFA *load_nfa_from_anml(string filename); 20 | 21 | //NFA *load_nfa_from_mnrl(string filename); 22 | 23 | NFA *load_nfa_from_file(string filename); 24 | 25 | 26 | #endif /* NFALOADER_H_ */ 27 | 28 | 29 | 
// A sequence of input symbols held in the `input` container, with helpers to
// append, concatenate, slice and pad it. Only the inline members below are
// defined in this header; the remaining members are declared here and
// implemented elsewhere.
class SymbolStream {
public:
	SymbolStream();

	virtual ~SymbolStream();
	// NOTE(review): presumably fills `input` from the named file and records
	// the name in `fromFile` -- implementation not visible here, confirm.
	void readFromFile(string filename);

	const set& calc_alphabet();
	uint8_t get_position(int pos) const;
	void set_position(int pos, uint8_t c);

	// Appends one symbol to the end of the stream.
	void push_back(uint8_t c) {
		input.push_back(c);
	}

	// Appends every symbol of `s` to the end of this stream; `s` is not modified.
	void concat(SymbolStream &s) {
		input.insert(input.end(), s.input.begin(), s.input.end());
	}

	int get_length() const;

	// Number of symbols currently stored, narrowed to int.
	int size() const {
		return input.size();
	}

	SymbolStream slice(int start, int len) const;

	void padding_to_base(int base);

private:
	vector input;     // the symbols, in stream order
	string fromFile;
	set alphabet;

};
// Holds one Csr per graph of a group, both as a host array (h_groups_csr) and
// as an array in memory obtained from cudaMalloc (groups_csr).
//
// The members carry no default initializers, so an instance must go through
// init() before release() is called for the first time. After init(),
// release() may be called any number of times.
class GroupCsr {
public:
  int size;          // number of graphs; set by init(), reset to 0 by release()
  Csr *groups_csr;   // `size` Csr structs in cudaMalloc'ed memory
  Csr *h_groups_csr; // `size` Csr structs allocated with new[]

  // Builds a Csr for every graph in `gs`, moves it to the device and stores a
  // copy of the resulting struct in both arrays.
  void init(std::vector<Graph *> &gs) {
    this->size = gs.size();
    if (size == 0) {
      // Nothing to allocate: avoids a zero-length new[] that release() would
      // never free, and a zero-byte cudaMalloc.
      groups_csr = nullptr;
      h_groups_csr = nullptr;
      return;
    }
    h_groups_csr = new Csr[size];
    CHECK_ERROR(cudaMalloc(&groups_csr, sizeof(Csr) * size));
    for (int i = 0; i < size; i++) {
      Graph *graph = gs[i];
      Csr csr(*graph);
      csr.fromCoo(graph->edge_pairs->get_host());
      csr.moveToDevice();
      h_groups_csr[i] = csr;
      // Byte-wise copy of the Csr struct itself into slot i of the device array.
      CHECK_ERROR(cudaMemcpy((void *)(groups_csr + i), (Csr *)&csr, sizeof(Csr),
                             cudaMemcpyHostToDevice));
    }
  }

  // Frees both arrays and resets the object to the empty state, so that a
  // second release() is a no-op instead of a double free.
  void release() {
    if (size > 0) {
      CHECK_ERROR(cudaFree((void *)groups_csr));
      delete[] h_groups_csr;
    }
    groups_csr = nullptr;
    h_groups_csr = nullptr;
    size = 0;
  }
};
// Bitset of N bits backed by a heap array of 32-bit words. Bit `offset` lives
// in word offset / 32 at bit position offset % 32. Every copy owns its own
// buffer.
struct My_bitsetN {
  uint32_t N;     // number of bits
  uint32_t size;  // number of 32-bit words in `data`
  uint32_t *data; // owned buffer of `size` words

  // Creates an all-zero bitset with room for N bits.
  __host__ __device__ My_bitsetN(int N = 256) : N(N) {
    // this->N = N;
    this->size = (N - 1) / 32 + 1;
    data = new uint32_t[size];
    memset(data, 0, sizeof(uint32_t) * size);
  }

  // Deep copy. The dimensions are taken from `other`; reading this->N here
  // would read a member that has not been initialized yet.
  __host__ __device__ My_bitsetN(const My_bitsetN &other)
      : N(other.N), size(other.size) {
    data = new uint32_t[size];
    memcpy(data, other.data, sizeof(uint32_t) * size);
  }

  __host__ __device__ ~My_bitsetN() { delete[] data; }

  // Clears every bit.
  __host__ __device__ cudaError_t reset() {
    memset(data, 0, sizeof(uint32_t) * size);
    return cudaSuccess;
  }

  // Writes the lowest bit of `value` into bit `offset`: 1 sets it, 0 clears it.
  // Always returns cudaSuccess; `offset` is not range-checked.
  __host__ __device__ cudaError_t set(int offset, int value) {
    int pos = (offset / 32);
    // Unsigned mask: shifting a signed 1 into bit 31 is not portable.
    uint32_t mask = 1u << (offset % 32);
    if (value & 1) {
      data[pos] |= mask;
    } else {
      data[pos] &= ~mask;
    }
    return cudaSuccess;
  }

  // Returns true when bit `offset` is set; `offset` is not range-checked.
  __host__ __device__ bool test(int offset) {
    int pos = (offset / 32);
    return (data[pos] & (1u << (offset % 32))) != 0;
  }

  // Deep assignment: adopts the dimensions of `other`, reallocating only when
  // the word count differs, and releases the previous buffer.
  __host__ __device__ My_bitsetN &operator=(const My_bitsetN &other) {
    if (this != &other) {
      if (size != other.size) {
        uint32_t *fresh = new uint32_t[other.size];
        delete[] data;
        data = fresh;
        size = other.size;
      }
      N = other.N;
      memcpy(data, other.data, sizeof(uint32_t) * size);
    }
    return *this;
  }

  // void fromBitset(std::bitset column) {
  //   for (int i = 0; i < N; i++) {
  //     if (column.test(i)) {
  //       set(i, 1);
  //     } else {
  //       set(i, 0);
  //     }
  //   }
  // }
};
string str_id; 41 | 42 | int sid; 43 | int cc_id; 44 | int cc_local_id; 45 | 46 | int scc_id; 47 | int topo_order; 48 | 49 | int bfs_layer; 50 | 51 | 52 | bitset<256> symbol_set; 53 | 54 | string symbol_set_str; 55 | 56 | bool complete; 57 | bool complement; 58 | int match_set_range; 59 | 60 | int start; 61 | 62 | 63 | bool report = false; 64 | 65 | // new added for mnrl 66 | string report_code; 67 | bool report_eod = false; 68 | 69 | 70 | bool visited = false; 71 | 72 | void symbol_set_to_bit(); 73 | 74 | inline bool match2(uint8_t input) const { 75 | return symbol_set.test(input); 76 | } 77 | 78 | bool is_start_always_enabled() const; 79 | bool is_start() const; 80 | bool is_report() const; 81 | 82 | bool is_wildcard() const; 83 | 84 | // if the symbol set is a reverse of one symbol, we classify this to not type. 85 | bool is_not_type_node() const; 86 | 87 | int num_of_accept_symbol() const; 88 | 89 | void remap_alphabet(const map &remap_table); 90 | 91 | int num_of_1_in_matchset() const; 92 | 93 | 94 | double hot_degree; 95 | 96 | int cg_id; 97 | 98 | }; 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | #endif /*NODE_H */ 107 | 108 | -------------------------------------------------------------------------------- /code/include/commons/precompute_table.h: -------------------------------------------------------------------------------- 1 | #ifndef PRECOMPUTE_TABLE_H 2 | #define PRECOMPUTE_TABLE_H 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | class PrecTable { 9 | public: 10 | uint64_t size = 0; 11 | int depth = 0; 12 | int cutoff; 13 | uint64_t nonzeroVerticesNum = 0; 14 | uint64_t nonzeroResultsNum = 0; 15 | bool isCompress; 16 | int maxkey = 0; 17 | 18 | std::vector vertices; 19 | std::vector vertices_length; 20 | std::vector results; 21 | std::vector results_length; 22 | 23 | std::vector nonzeroVerticesMap; // from index to vertix 24 | std::vector nonzeroResultsMap; // from index to result 25 | 26 | int *d_vertices; 27 | int *d_vertices_offsets; 28 | 
int *d_results; 29 | int *d_results_offsets; 30 | 31 | uint32_t* d_nonzeroVerticesMap; // from index to vertex 32 | uint32_t* d_nonzeroResultsMap; // from index to result 33 | 34 | PrecTable(){} 35 | 36 | PrecTable(uint64_t size); 37 | 38 | void allocate(uint64_t size, int depth, bool isCompress = true); 39 | 40 | void setVertices(uint32_t index, std::vector &v); 41 | 42 | void setResults(uint32_t index, std::vector &v); 43 | 44 | 45 | int printHistogram(); 46 | 47 | void calcCutoff(); 48 | 49 | void calcCutoffMedian(); 50 | 51 | void toDevice(bool use_uvm = false); 52 | 53 | void releaseHost(); 54 | 55 | void releaseDevice(); 56 | 57 | 58 | // template 59 | __device__ __forceinline__ int getVertexSymbolIndex(uint32_t symbol) { 60 | if(isCompress) 61 | return binary_search(d_nonzeroVerticesMap, nonzeroVerticesNum, symbol); 62 | else 63 | return (int)symbol; 64 | } 65 | 66 | // template 67 | __device__ __forceinline__ int getResultSymbolIndex(uint32_t symbol) { 68 | if(isCompress) 69 | return binary_search(d_nonzeroResultsMap, nonzeroResultsNum, symbol); 70 | else 71 | return (int)symbol; 72 | } 73 | /* Host-side lookup of symbol in nonzeroVerticesMap: returns its index, or -1 when absent. data() is used instead of &v[0] so that an empty map is never indexed. */ 74 | // template 75 | __host__ __forceinline__ int getVertexSymbolIndexHost(uint32_t symbol) { 76 | return binary_search(nonzeroVerticesMap.data(), nonzeroVerticesNum, symbol); 77 | } 78 | /* Host-side lookup of symbol in nonzeroResultsMap: returns its index, or -1 when absent. data() is used instead of &v[0] so that an empty map is never indexed. */ 79 | // template 80 | __host__ __forceinline__ int getResultSymbolIndexHost(uint32_t symbol) { 81 | return binary_search(nonzeroResultsMap.data(), nonzeroResultsNum, symbol); 82 | } 83 | /* Binary search for x in arr[0 .. n-1]. The halving logic requires arr to be sorted in ascending order (not checked). Returns the index of a matching element, or -1 if x is not present or n <= 0. */ 84 | // template 85 | __device__ __host__ __forceinline__ int binary_search(uint32_t *arr, 86 | int n, 87 | uint32_t x) { 88 | int start = 0; 89 | int end = n - 1; 90 | while (start <= end) { 91 | int mid = start + (end - start) / 2; /* cannot overflow, unlike (start + end) / 2 */ 92 | if (arr[mid] == x) 93 | return (int)mid; 94 | else if (arr[mid] < x) 95 | start = mid + 1; 96 | else 97 | end = mid - 1; 98 | } 99 | return -1; 100 | } 101 | }; 102 | 103 | #endif -------------------------------------------------------------------------------- 
/code/include/commons/report_formatter.h: -------------------------------------------------------------------------------- 1 | #ifndef REPORT_FORMATTER_H_ 2 | #define REPORT_FORMATTER_H_ 3 | 4 | #include 5 | #include 6 | 7 | using std::vector; 8 | 9 | using std::string; 10 | 11 | 12 | class report { 13 | 14 | public: 15 | int offset; 16 | string str_id; 17 | int cc; 18 | int input_stream_id; 19 | 20 | /* 21 | * Strict weak ordering used when sorting reports. 22 | * 23 | * Compares input_stream_id, then offset, then cc, then str_id, i.e. every 24 | * field that operator== tests. Two reports are therefore equivalent under 25 | * this ordering only when they are equal, so a sort places duplicates 26 | * next to each other. 27 | */ 28 | bool operator < (const report &r ) const { 29 | if (input_stream_id != r.input_stream_id) { 30 | return input_stream_id < r.input_stream_id; 31 | } 32 | 33 | if (offset != r.offset) { 34 | return offset < r.offset; 35 | } 36 | 37 | if (cc != r.cc) { 38 | return cc < r.cc; 39 | } 40 | 41 | /* All numeric keys tie: the string id is the final tie-breaker. */ 42 | return str_id < r.str_id; 43 | } 44 | 45 | 46 | bool operator == (const report &r) const { 47 | return offset == r.offset && str_id == r.str_id && cc == r.cc && input_stream_id == r.input_stream_id; 48 | } 49 | report(){}; 50 | 51 | report(int offset, string str_id, int cc, int input_stream_id); 52 | 53 | }; 54 | 55 | 56 | class report_formatter { 57 | public: 58 | report_formatter(); 59 | 60 | void print_to_file(string filename, bool unique=true); 61 | 62 | void add_report(report rp); 63 | 64 | int size() const { 65 | return reports.size(); 66 | } 67 | 68 | // private: 69 | vector reports; 70 | 71 | }; 72 | 73 | #endif -------------------------------------------------------------------------------- /code/include/commons/validate.h: -------------------------------------------------------------------------------- 1 | #ifndef VALIDATE_H_ 2 | #define VALIDATE_H_ 3 | 4 | #include "graph.h" 5 | #include "group_graph.h" 6 | 7 | namespace automata_utils { 8 | void automataGroupsReference(std::vector &gs, 9 | uint8_t *input_str, int num_seg, 10 | int input_length, 11 | std::vector 
*results, 12 | std::vector *db_results, 13 | int debug_iter, GroupCsr gcsr); 14 | void automataReference(Graph &g, uint8_t *input_str, int num_seg, 15 | int input_length, 16 | std::vector *results, 17 | std::vector *db_results, 18 | int debug_iter, Csr csr); 19 | 20 | bool automataValidation(std::vector *results, 21 | std::vector *ref_results, 22 | bool ifPrintBoth); 23 | } 24 | 25 | #endif -------------------------------------------------------------------------------- /code/include/commons/vasim_helper.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 4 | #include 5 | #include 6 | 7 | 8 | 9 | /** 10 | * helper functions 11 | * From VASim 12 | */ 13 | namespace VASim { 14 | void find_and_replace(std::string & source, std::string const & find, std::string const & replace); 15 | void setRange(std::bitset<256> &column, int start, int end, int value); 16 | void parseSymbolSet(std::bitset<256> &column, std::string symbol_set); 17 | } 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /code/include/gpunfautils/abstract_gpunfa.h: -------------------------------------------------------------------------------- 1 | #ifndef ABSTRACT_NFA_PROCESSING_ALGORITHM 2 | #define ABSTRACT_NFA_PROCESSING_ALGORITHM 3 | 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "commons/NFA.h" 13 | #include "array2.h" 14 | #include "utils.h" 15 | #include "common.h" 16 | #include "commons/SymbolStream.h" 17 | 18 | using std::map; 19 | using std::vector; 20 | using std::fill; 21 | using std::cout; 22 | using std::endl; 23 | using std::pair; 24 | using std::set; 25 | using std::make_pair; 26 | 27 | 28 | 29 | class abstract_algorithm { 30 | public: 31 | explicit abstract_algorithm(NFA *nfa); 32 | virtual ~abstract_algorithm(); 33 | 34 | virtual void preprocessing() {}; 35 | virtual void launch_kernel() = 0; 36 | virtual void 
postprocessing() {}; 37 | 38 | virtual void set_alphabet(set alphabet); 39 | virtual const SymbolStream& get_symbol_stream(int i) const; 40 | virtual void add_symbol_stream(SymbolStream ss); 41 | 42 | virtual int get_num_streams() const { 43 | return symbol_streams.size(); 44 | } 45 | 46 | void set_block_size(int block_size); 47 | 48 | void set_output_file(string output_filename) { 49 | this->output_file = output_filename; 50 | } 51 | 52 | void set_output_buffer_size(unsigned long long int ob_size) { 53 | this->output_buffer_size = ob_size; 54 | } 55 | 56 | void set_NFA(NFA *nfa) { 57 | this->nfa = nfa; 58 | } 59 | 60 | // whether we want the algorithm to generate reports. 61 | // If not, we can save time and space for the reports. 62 | void turn_off_report() { 63 | this->report_on = false; 64 | } 65 | 66 | void turn_on_report() { 67 | this->report_on = true; 68 | } 69 | 70 | void set_report_off(bool &report_off, 71 | unsigned long long int result_capacity, 72 | long long int quick_result_number) { 73 | if (quick_result_number >= 0 && 74 | result_capacity <= quick_result_number * 1.5) { 75 | if (!report_off) { 76 | printf("Warning: The number of results may exceed the " 77 | "capacity limit. 
" 78 | "Set report_off=true.\n"); 79 | report_off = true; 80 | } 81 | } 82 | this->report_on = !report_off; 83 | if (this->report_on) { 84 | printf("Report on.\n"); 85 | } else { 86 | printf("Report off.\n"); 87 | } 88 | } 89 | 90 | Array2 *concat_input_streams_to_array2(); 91 | 92 | void set_padding_input_stream(int pad) { 93 | this->padding_input_stream = pad; 94 | } 95 | 96 | 97 | void set_max_cc_size_limit(int max_cc_size_limit) { 98 | this->max_cc_size_limit = max_cc_size_limit; 99 | } 100 | 101 | void set_read_input(bool b) { 102 | this->read_input = b; 103 | } 104 | 105 | bool validation; 106 | NFA *nfa; 107 | vector ccs; 108 | 109 | protected: 110 | 111 | int max_cc_size_limit; 112 | 113 | int padding_input_stream; 114 | 115 | unsigned long long int output_buffer_size; 116 | 117 | vector symbol_streams; 118 | 119 | set alphabet; 120 | 121 | int block_size; 122 | 123 | bool report_on; // decide whether generating reports. 124 | 125 | string output_file; 126 | 127 | bool read_input; 128 | 129 | }; 130 | 131 | 132 | 133 | 134 | #endif -------------------------------------------------------------------------------- /code/include/gpunfautils/array2.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef ARRAY2_H_ 3 | #define ARRAY2_H_ 4 | 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | using std::cout; 13 | using std::cerr; 14 | using std::endl; 15 | 16 | 17 | template 18 | class Array2 { 19 | public: 20 | Array2(int arr_length): Array2(arr_length, "noname_array") { 21 | 22 | } 23 | 24 | Array2(int arr_length, std::string arr_id) : h_arr(NULL), d_arr(NULL) { 25 | this->arr_id = arr_id; 26 | 27 | // cout << "arr_length = " << arr_length << endl; 28 | 29 | this->arr_length = arr_length; 30 | this->element_bytes = sizeof(T); 31 | 32 | if (arr_length <= 0) { 33 | // cout << "arr_length = " << arr_length << " arr_id = " << arr_id << endl; 34 | assert(arr_length >= 0); 35 | return; 36 | } 37 | 
38 | h_arr = new T[arr_length]; 39 | 40 | //assert(element_bytes * arr_length > 0); 41 | 42 | auto errcode = cudaMalloc(&d_arr, 1ULL * element_bytes * arr_length); 43 | 44 | if (errcode != cudaSuccess) { 45 | cerr << "try to allocate " << 1ULL * arr_length * element_bytes << " byte of memory failed" << " arrid = " << arr_id << endl; 46 | cerr << "cannot allocate cuda memory " << errcode << endl; 47 | exit(-1); 48 | } 49 | 50 | } 51 | /* Releases the host and device buffers. A zero-length array never allocates 52 | * them, so h_arr and d_arr may be NULL here; delete[] and cudaFree accept NULL. */ 53 | virtual ~Array2() { 54 | delete [] h_arr; 55 | h_arr = NULL; 56 | 57 | cudaFree(d_arr); 58 | d_arr = NULL; 59 | 60 | } 61 | 62 | 63 | int size() const { 64 | return arr_length; 65 | } 66 | 67 | int size_of_T() const { 68 | return element_bytes; 69 | } 70 | 71 | unsigned long long num_of_byte() const { 72 | return 1ULL * element_bytes * arr_length; 73 | } 74 | 75 | T *get_dev() const { 76 | return d_arr; 77 | } 78 | 79 | T get(int idx) const { 80 | assert(idx >= 0 && idx < size()); 81 | return h_arr[idx]; 82 | } 83 | 84 | T *get_host() const { 85 | return h_arr; 86 | } 87 | 88 | void clear_to_zero() { 89 | memset(h_arr, 0, num_of_byte()); 90 | } 91 | 92 | 93 | void fill(T val) { 94 | for (int i = 0; i < arr_length; i++) { 95 | h_arr[i] = val; 96 | } 97 | } 98 | 99 | 100 | T operator[] (int idx) { 101 | assert(idx >= 0 && idx < size()); 102 | #ifdef __CUDA_ARCH__ 103 | return d_arr[idx]; 104 | #else 105 | return h_arr[idx]; 106 | #endif 107 | } 108 | 109 | void set(int idx, T v) { 110 | if (!(idx >= 0 && idx < size())) { 111 | cout << "assert(idx >= 0 && idx < size()); " << idx << endl; 112 | assert(idx >= 0 && idx < size()); 113 | } 114 | 115 | h_arr[idx] = v; 116 | 117 | } 118 | 119 | void copy_to_device() { 120 | 121 | //cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ) 122 | if (num_of_byte() == 0) { 123 | // std::cout<< "FAIL: copy_to_device !!!!!\n"; 124 | // printf("%d, %d", element_bytes ,arr_length); 125 | return; 126 | } 127 | 128 | 129 | auto errcode = cudaMemcpy(d_arr, 
h_arr, num_of_byte(), cudaMemcpyHostToDevice); 130 | 131 | if (errcode != cudaSuccess) { 132 | cout << "trying to copy " << num_of_byte() << " byte to device " << endl; 133 | cout << "cannot copy to device error code = " << errcode << endl; 134 | exit(-1); 135 | } 136 | } 137 | 138 | void copy_back() { 139 | auto errcode = cudaMemcpy(h_arr, d_arr, num_of_byte(), cudaMemcpyDeviceToHost); 140 | if (errcode != cudaSuccess) { 141 | cerr << "cannot copy back" << " " << errcode << endl; 142 | exit(-1); 143 | } 144 | } 145 | 146 | T *copy_to_host(int num_of_element) { 147 | assert(num_of_element <= arr_length); 148 | 149 | T *arr = new T[num_of_element]; 150 | auto errcode = cudaMemcpy(arr, d_arr, 1ULL * sizeof(T) * num_of_element, cudaMemcpyDeviceToHost); 151 | if (errcode != cudaSuccess) { 152 | cerr << "cannot copy_to_host " << " " << errcode << endl; 153 | exit(-1); 154 | } 155 | 156 | return arr; 157 | 158 | } 159 | 160 | void print() const { 161 | cout << "print for debug array2 length = " << arr_length << endl; 162 | for (int i = 0; i < arr_length; i++) { 163 | cout << h_arr[i] << " " ; 164 | } 165 | 166 | cout << endl; 167 | } 168 | 169 | T *copy_to_host_async(int num_of_element) { 170 | assert(num_of_element <= arr_length); 171 | 172 | T *arr = new T[num_of_element]; 173 | auto errcode = cudaMemcpyAsync(arr, d_arr, 1ULL * sizeof(T) * num_of_element, cudaMemcpyDeviceToHost); 174 | if (errcode != cudaSuccess) { 175 | cerr << "cannot copy_to_host " << " " << errcode << endl; 176 | exit(-1); 177 | } 178 | 179 | return arr; 180 | } 181 | 182 | // private: 183 | int arr_length; 184 | int element_bytes; 185 | 186 | T *h_arr; 187 | T *d_arr; 188 | 189 | std::string arr_id; 190 | 191 | }; 192 | 193 | 194 | #endif -------------------------------------------------------------------------------- /code/include/gpunfautils/common.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMON_H_ 2 | #define COMMON_H_ 3 | 4 | #include 5 | 
#include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | using std::vector; 18 | using std::string; 19 | using std::make_pair; 20 | using std::pair; 21 | 22 | const int ALPHABET_SIZE = 256; 23 | 24 | const int EMPTY_ENTRY = 56789; 25 | 26 | enum remap_node_type { 27 | NONE = 0, 28 | REPORT = 1, 29 | TOPO_ORDER = 2, 30 | BFS_LAYER = 3, 31 | OUTDEGREE = 4, 32 | COMPLETE = 5, 33 | COMPLETE_AND_TOP = 6, 34 | COMPLETE_AND_BFS = 7 35 | 36 | }; 37 | 38 | 39 | struct match_pair { 40 | int symbol_offset; 41 | int state_id; 42 | 43 | bool operator< (const match_pair& o) const { 44 | if (symbol_offset < o.symbol_offset) { 45 | return true; 46 | } else if (symbol_offset == o.symbol_offset) { 47 | if (state_id < o.state_id ) { 48 | return true; 49 | } 50 | return false; 51 | } else { 52 | return false; 53 | } 54 | } 55 | }; 56 | 57 | 58 | struct match3 { 59 | int symbol_offset; 60 | int state_id; 61 | int nfa; 62 | /* Kept for existing callers: compares against a match_pair by (symbol_offset, state_id); nfa takes no part. */ 63 | bool operator< (const match_pair& o) const { 64 | if (symbol_offset != o.symbol_offset) { 65 | return symbol_offset < o.symbol_offset; 66 | } 67 | return state_id < o.state_id; 68 | } 69 | 70 | /* Orders two match3 values by (symbol_offset, state_id, nfa), so match3 can be sorted or used as an ordered key. */ 71 | bool operator< (const match3& o) const { 72 | if (symbol_offset != o.symbol_offset) return symbol_offset < o.symbol_offset; 73 | if (state_id != o.state_id) return state_id < o.state_id; 74 | return nfa < o.nfa; 75 | } 76 | }; 77 | 78 | 79 | 80 | struct match_entry { 81 | int symbol_offset; 82 | int state_id; 83 | int cc_id; 84 | int stream_id; 85 | }; 86 | 87 | std::ostream& operator<<(std::ostream& os, const match_pair &obj); 88 | 89 | 90 | 91 | template 92 | struct STE_dev { 93 | int32_t ms[8]; // 8 * 32 = 256; local memory. 94 | 95 | int edge_dst[DEGREE_LIMIT]; 96 | 97 | char attribute; // is report? 98 | int degree; 99 | 100 | }; 101 | 102 | 103 | 104 | struct STE_dev4 { 105 | int32_t ms[8]; // 8 * 32 = 256; local memory. 106 | 107 | unsigned long long edges; 108 | 109 | char attribute; // is report? 
110 | int degree; 111 | 112 | }; 113 | 114 | 115 | struct STE_dev4_compressed_matchset { 116 | int32_t ms[8]; // 8 * 32 = 256; local memory. 117 | 118 | 119 | unsigned long long edges; 120 | 121 | char attribute; // is report? 122 | // attribute has 8 bit.. 123 | 124 | // complete; complement; 125 | 126 | //uint8_t start; 127 | //uint8_t end; 128 | 129 | unsigned int start_end; 130 | 131 | int degree; 132 | 133 | }; 134 | 135 | 136 | 137 | struct STE_dev4_compressed_matchset_allcomplete { 138 | unsigned long long edges; 139 | 140 | char attribute; // is report? 141 | 142 | unsigned int start_end; 143 | 144 | int degree; 145 | }; 146 | 147 | 148 | 149 | 150 | 151 | // Revised implementation. 20190121 152 | 153 | struct STE_nodeinfo_new_imp { 154 | unsigned long long edges; 155 | 156 | unsigned int attribute : 8; 157 | unsigned int start : 8; 158 | unsigned int end : 8; 159 | unsigned int degree : 8; 160 | }; 161 | 162 | 163 | 164 | struct STE_nodeinfo_new_imp2 { 165 | unsigned long long edges; 166 | unsigned int attribute : 8; 167 | unsigned int degree : 8; 168 | }; 169 | 170 | 171 | 172 | struct STE_matchset_new_imp { 173 | int32_t ms[8]; // 8 * 32 = 256; local memory. 174 | }; 175 | 176 | 177 | struct STE_nodeinfo_new_imp_withcg { 178 | unsigned long long edges; 179 | 180 | unsigned int attribute : 8; 181 | unsigned int start : 8; 182 | unsigned int end : 8; 183 | unsigned int degree : 8; 184 | 185 | // cg_id ---> write position in gpu kernel. 
186 | uint16_t cg_id; 187 | uint16_t cg_of_to_edges[4]; 188 | }; 189 | 190 | struct matchset_t { 191 | int32_t m[8]; 192 | }; 193 | 194 | 195 | 196 | 197 | #endif 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | -------------------------------------------------------------------------------- /code/include/moderngpu/cta_mergesort.hxx: -------------------------------------------------------------------------------- 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com 2 | #pragma once 3 | 4 | #include "cta_merge.hxx" 5 | #include "sort_networks.hxx" 6 | 7 | BEGIN_MGPU_NAMESPACE 8 | 9 | MGPU_HOST_DEVICE int out_of_range_flags(int first, int vt, int count) { 10 | int out_of_range = min(vt, first + vt - count); 11 | int head_flags = 0; 12 | if(out_of_range > 0) { 13 | const int mask = (1<< vt) - 1; 14 | head_flags = mask & (~mask>> out_of_range); 15 | } 16 | return head_flags; 17 | } 18 | 19 | MGPU_HOST_DEVICE merge_range_t compute_mergesort_frame(int partition, 20 | int coop, int spacing) { 21 | 22 | int size = spacing * (coop / 2); 23 | int start = ~(coop - 1) & partition; 24 | int a_begin = spacing * start; 25 | int b_begin = spacing * start + size; 26 | 27 | return merge_range_t { 28 | a_begin, 29 | a_begin + size, 30 | b_begin, 31 | b_begin + size 32 | }; 33 | } 34 | 35 | MGPU_HOST_DEVICE merge_range_t compute_mergesort_range(int count, 36 | int partition, int coop, int spacing) { 37 | 38 | merge_range_t frame = compute_mergesort_frame(partition, coop, spacing); 39 | 40 | return merge_range_t { 41 | frame.a_begin, 42 | min(count, frame.a_end), 43 | min(count, frame.b_begin), 44 | min(count, frame.b_end) 
45 | }; 46 | } 47 | 48 | MGPU_HOST_DEVICE merge_range_t compute_mergesort_range(int count, 49 | int partition, int coop, int spacing, int mp0, int mp1) { 50 | 51 | merge_range_t range = compute_mergesort_range(count, partition, 52 | coop, spacing); 53 | 54 | // Locate the diagonal from the start of the A sublist. 55 | int diag = spacing * partition - range.a_begin; 56 | 57 | // The end partition of the last cta for each merge operation is computed 58 | // and stored as the begin partition for the subsequent merge. i.e. it is 59 | // the same partition but in the wrong coordinate system, so its 0 when it 60 | // should be listSize. Correct that by checking if this is the last cta 61 | // in this merge operation. 62 | if(coop - 1 != ((coop - 1) & partition)) { 63 | range.a_end = range.a_begin + mp1; 64 | range.b_end = min(count, range.b_begin + diag + spacing - mp1); 65 | } 66 | 67 | range.a_begin = range.a_begin + mp0; 68 | range.b_begin = min(count, range.b_begin + diag - mp0); 69 | 70 | return range; 71 | } 72 | 73 | template 74 | struct cta_sort_t { 75 | enum { 76 | has_values = !std::is_same::value, 77 | num_passes = s_log2(nt) 78 | }; 79 | 80 | union storage_t { 81 | key_t keys[nt * vt + 1]; 82 | val_t vals[nt * vt]; 83 | }; 84 | 85 | static_assert(is_pow2(nt), "cta_sort_t requires pow2 number of threads"); 86 | 87 | template 88 | MGPU_DEVICE kv_array_t 89 | merge_pass(kv_array_t x, int tid, int count, 90 | int pass, comp_t comp, storage_t& storage) const { 91 | 92 | // Divide the CTA's keys into lists. 93 | int coop = 2<< pass; 94 | merge_range_t range = compute_mergesort_range(count, tid, coop, vt); 95 | int diag = vt * tid - range.a_begin; 96 | 97 | // Store the keys into shared memory for searching. 98 | reg_to_shared_thread(x.keys, tid, storage.keys); 99 | 100 | // Search for the merge path for this thread within its list. 101 | int mp = merge_path(storage.keys, range, diag, comp); 102 | 103 | // Run a serial merge and return. 
104 | merge_pair_t merge = serial_merge( 105 | storage.keys, range.partition(mp, diag), comp); 106 | x.keys = merge.keys; 107 | 108 | if(has_values) { 109 | // Reorder values through shared memory. 110 | reg_to_shared_thread(x.vals, tid, storage.vals); 111 | x.vals = shared_gather(storage.vals, merge.indices); 112 | } 113 | 114 | return x; 115 | } 116 | 117 | template 118 | MGPU_DEVICE kv_array_t 119 | block_sort(kv_array_t x, int tid, int count, 120 | comp_t comp, storage_t& storage) const { 121 | 122 | // Sort the inputs within each thread. If any threads have fewer than 123 | // vt items, use the segmented sort network to prevent out-of-range 124 | // elements from contaminating the sort. 125 | if(count < nt * vt) { 126 | int head_flags = out_of_range_flags(vt * tid, vt, count); 127 | x = odd_even_sort(x, comp, head_flags); 128 | } else 129 | x = odd_even_sort(x, comp); 130 | 131 | // Merge threads starting with a pair until all values are merged. 132 | for(int pass = 0; pass < num_passes; ++pass) 133 | x = merge_pass(x, tid, count, pass, comp, storage); 134 | 135 | return x; 136 | } 137 | }; 138 | 139 | 140 | END_MGPU_NAMESPACE 141 | -------------------------------------------------------------------------------- /code/include/moderngpu/cta_reduce.hxx: -------------------------------------------------------------------------------- 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com 2 | #pragma once 3 | #include "loadstore.hxx" 4 | #include "intrinsics.hxx" 5 | 6 | BEGIN_MGPU_NAMESPACE 7 | 8 | // requires __CUDA_ARCH__ >= 300. 9 | // warp_size can be any power-of-two <= warp_size. 10 | // warp_reduce_t returns the reduction only in lane 0. 
11 | template 12 | struct shfl_reduce_t { 13 | 14 | static_assert(group_size <= warp_size && is_pow2(group_size), 15 | "shfl_reduce_t must operate on a pow2 number of threads <= warp_size (32)"); 16 | enum { num_passes = s_log2(group_size) }; 17 | 18 | template > 19 | MGPU_DEVICE type_t reduce(int lane, type_t x, int count, op_t op = op_t()) { 20 | if(count == group_size) { 21 | iterate([&](int pass) { 22 | int offset = 1<< pass; 23 | x = shfl_down_op(x, offset, op, group_size); 24 | }); 25 | } else { 26 | iterate([&](int pass) { 27 | int offset = 1<< pass; 28 | type_t y = shfl_down(x, offset, group_size); 29 | if(lane + offset < count) x = op(x, y); 30 | }); 31 | } 32 | return x; 33 | } 34 | }; 35 | 36 | // cta_reduce_t returns the reduction of all inputs for thread 0, and returns 37 | // type_t() for all other threads. This behavior saves a broadcast. 38 | 39 | template 40 | struct cta_reduce_t { 41 | 42 | enum { 43 | group_size = min(nt, (int)warp_size), 44 | num_passes = s_log2(group_size), 45 | num_items = nt / group_size 46 | }; 47 | 48 | static_assert(0 == nt % warp_size, 49 | "cta_reduce_t requires num threads to be a multiple of warp_size (32)"); 50 | 51 | struct storage_t { 52 | struct { type_t data[max(nt, 2 * group_size)]; }; 53 | }; 54 | 55 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 56 | 57 | typedef shfl_reduce_t group_reduce_t; 58 | 59 | template > 60 | MGPU_DEVICE type_t reduce(int tid, type_t x, storage_t& storage, 61 | int count = nt, op_t op = op_t(), bool all_return = true) const { 62 | 63 | // Store your data into shared memory. 64 | storage.data[tid] = x; 65 | __syncthreads(); 66 | 67 | if(tid < group_size) { 68 | // Each thread scans within its lane. 69 | strided_iterate([&](int i, int j) { 70 | if(i > 0) x = op(x, storage.data[j]); 71 | }, tid, count); 72 | 73 | // Cooperative reduction. 
74 | x = group_reduce_t().reduce(tid, x, min(count, (int)group_size), op); 75 | 76 | if(all_return) storage.data[tid] = x; 77 | } 78 | __syncthreads(); 79 | 80 | if(all_return) { 81 | x = storage.data[0]; 82 | __syncthreads(); 83 | } 84 | return x; 85 | } 86 | 87 | #else 88 | 89 | template > 90 | MGPU_DEVICE type_t reduce(int tid, type_t x, storage_t& storage, 91 | int count = nt, op_t op = op_t(), bool all_return = true) const { 92 | 93 | // Store your data into shared memory. 94 | storage.data[tid] = x; 95 | __syncthreads(); 96 | 97 | if(tid < group_size) { 98 | // Each thread scans within its lane. 99 | strided_iterate([&](int i, int j) { 100 | type_t y = storage.data[j]; 101 | if(i > 0) x = op(x, y); 102 | }, tid, count); 103 | storage.data[tid] = x; 104 | } 105 | __syncthreads(); 106 | 107 | int count2 = min(count, int(group_size)); 108 | int first = (1 & num_passes) ? group_size : 0; 109 | if(tid < group_size) 110 | storage.data[first + tid] = x; 111 | __syncthreads(); 112 | 113 | iterate([&](int pass) { 114 | if(tid < group_size) { 115 | int offset = 1 << pass; 116 | if(tid + offset < count2) 117 | x = op(x, storage.data[first + offset + tid]); 118 | first = group_size - first; 119 | storage.data[first + tid] = x; 120 | } 121 | __syncthreads(); 122 | }); 123 | 124 | if(all_return) { 125 | x = storage.data[0]; 126 | __syncthreads(); 127 | } 128 | return x; 129 | } 130 | 131 | #endif 132 | }; 133 | 134 | END_MGPU_NAMESPACE 135 | -------------------------------------------------------------------------------- /code/include/moderngpu/cta_search.hxx: -------------------------------------------------------------------------------- 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com 2 | #pragma once 3 | 4 | #include "cta_merge.hxx" 5 | 6 | BEGIN_MGPU_NAMESPACE 7 | 8 | template 10 | MGPU_HOST_DEVICE int_t binary_search(keys_it keys, int_t count, key_t key, 11 | comp_t comp) { 12 | 13 | int_t begin = 0; 14 | int_t end = count; 15 | while(begin < 
end) { 16 | int_t mid = (begin + end) / 2; 17 | key_t key2 = keys[mid]; 18 | bool pred = (bounds_upper == bounds) ? 19 | !comp(key, key2) : 20 | comp(key2, key); 21 | if(pred) begin = mid + 1; 22 | else end = mid; 23 | } 24 | return begin; 25 | } 26 | 27 | //////////////////////////////////////////////////////////////////////////////// 28 | // TODO: Implement a moderngpu V1 style vectorized sorted search. 29 | 30 | template 31 | struct search_result_t { 32 | array_t keys; 33 | array_t indices; 34 | int decisions; // Set a bit if this iteration has progressed A. 35 | int matches_a; // A set flag for a match on each iteration. 36 | int matches_b; 37 | }; 38 | 39 | template 41 | MGPU_DEVICE search_result_t 42 | serial_search(const type_t* keys_shared, merge_range_t range, 43 | int a_offset, int b_offset, comp_t comp, bool sync = true) { 44 | 45 | type_t a_key = keys_shared[range.a_begin]; 46 | type_t b_key = keys_shared[range.b_begin]; 47 | type_t a_prev = type_t(), b_prev = type_t(); 48 | 49 | int a_start = 0; 50 | int b_start = range.a_end; // Assume the b_keys start right after the end 51 | // of the a_keys. 52 | if(range.a_begin > 0) a_prev = keys_shared[range.a_begin - 1]; 53 | if(range.b_begin > b_start) b_prev = keys_shared[range.b_begin - 1]; 54 | 55 | search_result_t result = search_result_t(); 56 | 57 | iterate([&](int i) { 58 | // This is almost the same body as serial_merge, except for the match 59 | // criterion below. 60 | bool p = merge_predicate(a_key, b_key, range, comp); 61 | 62 | if(p) { 63 | bool match = (bounds_upper == bounds) ? 64 | (!range_check || range.b_begin > b_start) && 65 | !comp(b_prev, a_key) : 66 | (!range_check || range.b_valid()) && 67 | !comp(a_key, b_key); 68 | 69 | result.decisions |= 1<< i; 70 | result.matches_a |= (int)match<< i; 71 | a_prev = a_key; 72 | 73 | } else { 74 | bool match = (bounds_upper == bounds) ? 
75 | (!range_check || (range.a_valid() && range.b_valid())) && 76 | !comp(b_key, a_key) : 77 | (!range_check || (range.b_valid() && range.a_begin > a_start)) && 78 | !comp(a_prev, b_key); 79 | 80 | result.matches_b |= (int)match<< i; 81 | b_prev = b_key; 82 | } 83 | 84 | // Same advancement behavior as serial_merge. 85 | int index = p ? range.a_begin : range.b_begin; 86 | 87 | result.keys[i] = p ? a_key : b_key; 88 | result.indices[i] = index + (p ? a_offset : b_offset); 89 | 90 | type_t c_key = keys_shared[++index]; 91 | if(p) a_key = c_key, range.a_begin = index; 92 | else b_key = c_key, range.b_begin = index; 93 | }); 94 | 95 | if(sync) __syncthreads(); 96 | 97 | return result; 98 | } 99 | 100 | END_MGPU_NAMESPACE 101 | -------------------------------------------------------------------------------- /code/include/moderngpu/cta_segscan.hxx: -------------------------------------------------------------------------------- 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com 2 | #pragma once 3 | 4 | #include "cta_scan.hxx" 5 | 6 | BEGIN_MGPU_NAMESPACE 7 | 8 | template 9 | struct segscan_result_t { 10 | type_t scan; 11 | type_t reduction; 12 | bool has_carry_in; 13 | int left_lane; 14 | }; 15 | 16 | template 17 | struct cta_segscan_t { 18 | enum { num_warps = nt / warp_size }; 19 | 20 | union storage_t { 21 | int delta[num_warps + nt]; 22 | struct { type_t values[2 * nt]; int packed[nt]; }; 23 | }; 24 | 25 | MGPU_DEVICE int find_left_lane(int tid, bool has_head_flag, 26 | storage_t& storage) const { 27 | 28 | int warp = tid / warp_size; 29 | int lane = (warp_size - 1) & tid; 30 | int warp_mask = 0xffffffff>> (31 - lane); // inclusive search. 31 | int cta_mask = 0x7fffffff>> (31 - lane); // exclusive search. 32 | 33 | // Build a head flag bitfield and store it into shared memory. 
34 | int warp_bits = __ballot(has_head_flag); 35 | storage.delta[warp] = warp_bits; 36 | __syncthreads(); 37 | 38 | if(tid < num_warps) { 39 | int cta_bits = __ballot(0 != storage.delta[tid]); 40 | int warp_segment = 31 - clz(cta_mask & cta_bits); 41 | int start = (-1 != warp_segment) ? 42 | (31 - clz(storage.delta[warp_segment]) + 32 * warp_segment) : 0; 43 | storage.delta[num_warps + tid] = start; 44 | } 45 | __syncthreads(); 46 | 47 | // Find the closest flag to the left of this thread within the warp. 48 | // Include the flag for this thread. 49 | int start = 31 - clz(warp_mask & warp_bits); 50 | if(-1 != start) start += ~31 & tid; 51 | else start = storage.delta[num_warps + warp]; 52 | __syncthreads(); 53 | 54 | return start; 55 | } 56 | 57 | template > 58 | MGPU_DEVICE segscan_result_t segscan(int tid, bool has_head_flag, 59 | bool has_carry_out, type_t x, storage_t& storage, type_t init = type_t(), 60 | op_t op = op_t()) const { 61 | 62 | if(!has_carry_out) x = init; 63 | 64 | int left_lane = find_left_lane(tid, has_head_flag, storage); 65 | int tid_delta = tid - left_lane; 66 | 67 | // Store the has_carry_out flag. 68 | storage.packed[tid] = (int)has_carry_out | (left_lane<< 1); 69 | 70 | // Run an inclusive scan. 71 | int first = 0; 72 | storage.values[first + tid] = x; 73 | __syncthreads(); 74 | 75 | int packed = storage.packed[left_lane]; 76 | left_lane = packed>> 1; 77 | tid_delta = tid - left_lane; 78 | if(0 == (1 & packed)) --tid_delta; 79 | 80 | iterate([&](int pass) { 81 | int offset = 1<< pass; 82 | if(tid_delta >= offset) 83 | x = op(x, storage.values[first + tid - offset]); 84 | first = nt - first; 85 | storage.values[first + tid] = x; 86 | __syncthreads(); 87 | }); 88 | 89 | // Get the exclusive scan by fetching the preceding element. Also return 90 | // the carry-out value as the total. 91 | bool has_carry_in = tid ? (0 != (1 & storage.packed[tid - 1])) : false; 92 | 93 | segscan_result_t result { 94 | (has_carry_in && tid) ? 
storage.values[first + tid - 1] : init, 95 | storage.values[first + nt - 1], 96 | has_carry_in, 97 | left_lane 98 | }; 99 | __syncthreads(); 100 | 101 | return result; 102 | } 103 | }; 104 | 105 | END_MGPU_NAMESPACE 106 | -------------------------------------------------------------------------------- /code/include/moderngpu/kernel_bulkinsert.hxx: -------------------------------------------------------------------------------- 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com 2 | #pragma once 3 | #include "kernel_merge.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | // Insert the values at a_keys before the values at b_keys identified by 8 | // insert. 9 | template 11 | void bulk_insert(a_it a, insert_it a_insert, int insert_size, b_it b, 12 | int source_size, c_it c, context_t& context) { 13 | 14 | merge(a_insert, a, insert_size, counting_iterator_t(0), b, 15 | source_size, discard_iterator_t(), c, mgpu::less_t(), context); 16 | } 17 | 18 | END_MGPU_NAMESPACE 19 | -------------------------------------------------------------------------------- /code/include/moderngpu/kernel_bulkremove.hxx: -------------------------------------------------------------------------------- 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com 2 | #pragma once 3 | #include "search.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | template 9 | void bulk_remove(input_it input, int count, indices_it indices, 10 | int num_indices, output_it output, context_t& context) { 11 | 12 | typedef typename conditional_typedef_t, 15 | arch_35_cta<128, 11>, 16 | arch_52_cta<128, 15> 17 | > 18 | >::type_t launch_t; 19 | 20 | typedef typename std::iterator_traits::value_type type_t; 21 | 22 | // Map the removal indices into tiles. 
23 | mem_t partitions = binary_search_partitions(indices, 24 | count, num_indices, launch_t::nv(context), context); 25 | const int* p_data = partitions.data(); 26 | 27 | auto k = [=]MGPU_DEVICE(int tid, int cta) { 28 | typedef typename launch_t::sm_ptx params_t; 29 | enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt }; 30 | 31 | __shared__ union { 32 | int indices[nv + 1]; 33 | } shared; 34 | 35 | range_t tile = get_tile(cta, nv, count); 36 | 37 | // Search the begin and end iterators to load. 38 | int begin = p_data[cta]; 39 | int end = p_data[cta + 1]; 40 | int b_count = end - begin; 41 | 42 | int* a_shared = shared.indices; 43 | int* b_shared = shared.indices + tile.count() - b_count; 44 | 45 | // Store the indices to shared memory. 46 | // TODO: MODIFY MEM_TO_SHARED TO UNCONDITIONALLY WRITE TO FULL SMEM. 47 | mem_to_shared(indices + begin, tid, b_count, b_shared, false); 48 | 49 | // Binary search into the remove array to prepare a range for the thread. 50 | merge_range_t range = { 51 | // a range 52 | vt * tid, 53 | tile.count(), 54 | 55 | // b range 56 | binary_search(b_shared, b_count, 57 | tile.begin + vt * tid, less_t()), 58 | b_count 59 | }; 60 | 61 | // Emit all values that aren't removed. 62 | iterate([&](int i) { 63 | bool p = range.a_valid() && (!range.b_valid() || 64 | tile.begin + range.a_begin < b_shared[range.b_begin]); 65 | if(p) 66 | a_shared[range.a_begin - range.b_begin] = tile.begin + range.a_begin; 67 | else 68 | ++range.b_begin; 69 | ++range.a_begin; 70 | }); 71 | __syncthreads(); 72 | 73 | // Pull the gather indices out of shared memory in strided order. 74 | array_t gather = shared_to_reg_strided( 75 | shared.indices, tid); 76 | 77 | // Gather the elements from input. 78 | int num_move = tile.count() - b_count; 79 | array_t values; 80 | strided_iterate([&](int i, int j) { 81 | values[i] = input[gather[i]]; 82 | }, tid, num_move); 83 | 84 | // Stream to output. 
85 | reg_to_mem_strided(values, tid, num_move, 86 | output + tile.begin - begin); 87 | }; 88 | cta_transform(k, count, context); 89 | } 90 | 91 | END_MGPU_NAMESPACE 92 | -------------------------------------------------------------------------------- /code/include/moderngpu/kernel_intervalmove.hxx: -------------------------------------------------------------------------------- 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com 2 | #pragma once 3 | #include "kernel_load_balance.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | template 9 | void interval_expand(input_it input, int count, segments_it segments, 10 | int num_segments, output_it output, context_t& context) { 11 | 12 | typedef typename std::iterator_traits::value_type type_t; 13 | transform_lbs( 14 | []MGPU_DEVICE(int index, int seg, int rank, tuple desc, 15 | output_it output) { 16 | output[index] = get<0>(desc); 17 | }, 18 | count, segments, num_segments, make_tuple(input), context, output 19 | ); 20 | } 21 | 22 | template 24 | void interval_gather(input_it input, int count, segments_it segments, 25 | int num_segments, gather_it gather, output_it output, context_t& context) { 26 | 27 | transform_lbs( 28 | []MGPU_DEVICE(int index, int seg, int rank, tuple desc, 29 | input_it input, output_it output) { 30 | output[index] = input[get<0>(desc) + rank]; 31 | }, 32 | count, segments, num_segments, make_tuple(gather), context, input, output 33 | ); 34 | } 35 | 36 | template 38 | void interval_scatter(input_it input, int count, segments_it segments, 39 | int num_segments, scatter_it scatter, output_it output, context_t& context) { 40 | 41 | transform_lbs( 42 | []MGPU_DEVICE(int index, int seg, int rank, tuple desc, 43 | input_it input, output_it output) { 44 | output[get<0>(desc) + rank] = input[index]; 45 | }, 46 | count, segments, num_segments, make_tuple(scatter), context, input, output 47 | ); 48 | } 49 | 50 | template 53 | void interval_move(input_it input, int count, segments_it segments, 
54 | int num_segments, scatter_it scatter, gather_it gather, output_it output, 55 | context_t& context) { 56 | 57 | transform_lbs( 58 | []MGPU_DEVICE(int index, int seg, int rank, tuple desc, 59 | input_it input, output_it output) { 60 | output[get<0>(desc) + rank] = input[get<1>(desc) + rank]; 61 | }, 62 | count, segments, num_segments, make_tuple(scatter, gather), context, 63 | input, output 64 | ); 65 | } 66 | 67 | END_MGPU_NAMESPACE 68 | -------------------------------------------------------------------------------- /code/include/moderngpu/kernel_join.hxx: -------------------------------------------------------------------------------- 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com 2 | #pragma once 3 | #include "kernel_sortedsearch.hxx" 4 | #include "kernel_scan.hxx" 5 | #include "kernel_load_balance.hxx" 6 | 7 | BEGIN_MGPU_NAMESPACE 8 | 9 | template 11 | mem_t inner_join(a_it a, int a_count, b_it b, int b_count, 12 | comp_t comp, context_t& context) { 13 | 14 | // Compute lower and upper bounds of a into b. 15 | mem_t lower(a_count, context); 16 | mem_t upper(a_count, context); 17 | sorted_search(a, a_count, b, b_count, 18 | lower.data(), comp, context); 19 | sorted_search(a, a_count, b, b_count, 20 | upper.data(), comp, context); 21 | 22 | // Compute output ranges by scanning upper - lower. Retrieve the reduction 23 | // of the scan, which specifies the size of the output array to allocate. 24 | mem_t scanned_sizes(a_count, context); 25 | const int* lower_data = lower.data(); 26 | const int* upper_data = upper.data(); 27 | 28 | mem_t count(1, context); 29 | transform_scan([=]MGPU_DEVICE(int index) { 30 | return upper_data[index] - lower_data[index]; 31 | }, a_count, scanned_sizes.data(), plus_t(), count.data(), context); 32 | 33 | // Allocate an int2 output array and use load-balancing search to compute 34 | // the join. 
35 | int join_count = from_mem(count)[0]; 36 | mem_t output(join_count, context); 37 | int2* output_data = output.data(); 38 | 39 | // Use load-balancing search on the segmens. The output is a pair with 40 | // a_index = seg and b_index = lower_data[seg] + rank. 41 | auto k = [=]MGPU_DEVICE(int index, int seg, int rank, tuple lower) { 42 | output_data[index] = make_int2(seg, get<0>(lower) + rank); 43 | }; 44 | transform_lbs(k, join_count, scanned_sizes.data(), a_count, 45 | make_tuple(lower_data), context); 46 | 47 | return output; 48 | } 49 | 50 | END_MGPU_NAMESPACE 51 | -------------------------------------------------------------------------------- /code/include/moderngpu/kernel_load_balance.hxx: -------------------------------------------------------------------------------- 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com 2 | #pragma once 3 | #include "cta_load_balance.hxx" 4 | #include "search.hxx" 5 | 6 | BEGIN_MGPU_NAMESPACE 7 | 8 | template 10 | void transform_lbs(func_t f, int count, segments_it segments, 11 | int num_segments, pointers_t caching_iterators, context_t& context, 12 | args_t... args) { 13 | 14 | typedef typename conditional_typedef_t, 17 | arch_35_cta<128, 7, 5>, 18 | arch_52_cta<128, 11, 9> 19 | > 20 | >::type_t launch_t; 21 | 22 | typedef typename std::iterator_traits::value_type int_t; 23 | typedef tuple_iterator_value_t value_t; 24 | 25 | mem_t mp = load_balance_partitions(count, segments, num_segments, 26 | launch_t::nv(context), context); 27 | const int_t* mp_data = mp.data(); 28 | 29 | auto k = [=]MGPU_DEVICE(int tid, int cta, args_t... 
args) { 30 | 31 | typedef typename launch_t::sm_ptx params_t; 32 | enum { nt = params_t::nt, vt = params_t::vt, vt0 = params_t::vt0 }; 33 | typedef cta_load_balance_t load_balance_t; 34 | typedef detail::cached_segment_load_t cached_load_t; 35 | 36 | __shared__ union { 37 | typename load_balance_t::storage_t lbs; 38 | typename cached_load_t::storage_t cached; 39 | } shared; 40 | 41 | // Compute the load-balancing search and materialize (index, seg, rank) 42 | // arrays. 43 | auto lbs = load_balance_t().load_balance(count, segments, num_segments, 44 | tid, cta, mp_data, shared.lbs); 45 | 46 | // Load from the cached iterators. Use the placement range, not the 47 | // merge-path range for situating the segments. 48 | array_t cached_values = cached_load_t::template load( 49 | tid, lbs.merge_range.a_count(), lbs.placement.range.b_range(), 50 | lbs.segments, shared.cached, caching_iterators); 51 | 52 | // Call the user-supplied functor f. 53 | strided_iterate([=](int i, int j) { 54 | int index = lbs.merge_range.a_begin + j; 55 | int seg = lbs.segments[i]; 56 | int rank = lbs.ranks[i]; 57 | 58 | f(index, seg, rank, cached_values[i], args...); 59 | }, tid, lbs.merge_range.a_count()); 60 | }; 61 | cta_transform(k, count + num_segments, context, args...); 62 | } 63 | 64 | // load-balancing search without caching. 65 | template 67 | void transform_lbs(func_t f, int count, segments_it segments, 68 | int num_segments, context_t& context, args_t... args) { 69 | 70 | transform_lbs( 71 | [=]MGPU_DEVICE(int index, int seg, int rank, tuple<>, args_t... args) { 72 | f(index, seg, rank, args...); // drop the cached values. 73 | }, 74 | count, segments, num_segments, tuple<>(), context, args... 
75 | ); 76 | } 77 | 78 | template 80 | void load_balance_search(int count, segments_it segments, 81 | int num_segments, output_it output, context_t& context) { 82 | 83 | transform_lbs([=]MGPU_DEVICE(int index, int seg, int rank) { 84 | output[index] = seg; 85 | }, count, segments, num_segments, context); 86 | } 87 | 88 | END_MGPU_NAMESPACE 89 | -------------------------------------------------------------------------------- /code/include/moderngpu/kernel_merge.hxx: -------------------------------------------------------------------------------- 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com 2 | #pragma once 3 | #include "cta_merge.hxx" 4 | #include "search.hxx" 5 | 6 | BEGIN_MGPU_NAMESPACE 7 | 8 | // Key-value merge. 9 | template 14 | void merge(a_keys_it a_keys, a_vals_it a_vals, int a_count, 15 | b_keys_it b_keys, b_vals_it b_vals, int b_count, 16 | c_keys_it c_keys, c_vals_it c_vals, comp_t comp, context_t& context) { 17 | 18 | typedef typename conditional_typedef_t, 21 | arch_35_cta<128, 11>, 22 | arch_52_cta<128, 15> 23 | > 24 | >::type_t launch_t; 25 | 26 | typedef typename std::iterator_traits::value_type type_t; 27 | typedef typename std::iterator_traits::value_type val_t; 28 | enum { has_values = !std::is_same::value }; 29 | 30 | mem_t partitions = merge_path_partitions(a_keys, a_count, 31 | b_keys, b_count, launch_t::nv(context), comp, context); 32 | int* mp_data = partitions.data(); 33 | 34 | auto k = [=] MGPU_DEVICE (int tid, int cta) { 35 | typedef typename launch_t::sm_ptx params_t; 36 | enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt }; 37 | 38 | __shared__ union { 39 | type_t keys[nv + 1]; 40 | int indices[nv]; 41 | } shared; 42 | 43 | // Load the range for this CTA and merge the values into register. 
44 | int mp0 = mp_data[cta + 0]; 45 | int mp1 = mp_data[cta + 1]; 46 | merge_range_t range = compute_merge_range(a_count, b_count, cta, nv, 47 | mp0, mp1); 48 | 49 | merge_pair_t merge = cta_merge_from_mem( 50 | a_keys, b_keys, range, tid, comp, shared.keys); 51 | 52 | int dest_offset = nv * cta; 53 | reg_to_mem_thread(merge.keys, tid, range.total(), c_keys + dest_offset, 54 | shared.keys); 55 | 56 | if(has_values) { 57 | // Transpose the indices from thread order to strided order. 58 | array_t indices = reg_thread_to_strided(merge.indices, tid, 59 | shared.indices); 60 | 61 | // Gather the input values and merge into the output values. 62 | transfer_two_streams_strided(a_vals + range.a_begin, range.a_count(), 63 | b_vals + range.b_begin, range.b_count(), indices, tid, 64 | c_vals + dest_offset); 65 | } 66 | }; 67 | cta_transform(k, a_count + b_count, context); 68 | } 69 | 70 | // Key-only merge. 71 | template 74 | void merge(a_keys_it a_keys, int a_count, b_keys_it b_keys, int b_count, 75 | c_keys_it c_keys, comp_t comp, context_t& context) { 76 | 77 | merge(a_keys, (const empty_t*)nullptr, a_count, b_keys, 78 | (const empty_t*)nullptr, b_count, c_keys, (empty_t*)nullptr, comp, 79 | context); 80 | } 81 | 82 | END_MGPU_NAMESPACE 83 | -------------------------------------------------------------------------------- /code/include/moderngpu/kernel_reduce.hxx: -------------------------------------------------------------------------------- 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com 2 | #pragma once 3 | 4 | #include "cta_reduce.hxx" 5 | #include "memory.hxx" 6 | #include "transform.hxx" 7 | #include "operators.hxx" 8 | 9 | BEGIN_MGPU_NAMESPACE 10 | 11 | template 13 | void reduce(input_it input, int count, output_it reduction, op_t op, 14 | context_t& context) { 15 | 16 | typedef typename conditional_typedef_t 18 | >::type_t launch_t; 19 | 20 | typedef typename std::iterator_traits::value_type type_t; 21 | 22 | int num_ctas = 
launch_t::cta_dim(context).num_ctas(count); 23 | mem_t partials(num_ctas, context); 24 | type_t* partials_data = partials.data(); 25 | 26 | auto k = [=] MGPU_DEVICE(int tid, int cta) { 27 | typedef typename launch_t::sm_ptx params_t; 28 | enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt }; 29 | typedef cta_reduce_t reduce_t; 30 | __shared__ typename reduce_t::storage_t shared_reduce; 31 | 32 | // Load the data for the first tile for each cta. 33 | range_t tile = get_tile(cta, nv, count); 34 | array_t x = mem_to_reg_strided(input + tile.begin, 35 | tid, tile.count()); 36 | 37 | // Reduce the multiple values per thread into a scalar. 38 | type_t scalar; 39 | strided_iterate([&](int i, int j) { 40 | scalar = i ? op(scalar, x[i]) : x[0]; 41 | }, tid, tile.count()); 42 | 43 | // Reduce to a scalar per CTA. 44 | scalar = reduce_t().reduce(tid, scalar, shared_reduce, 45 | min(tile.count(), (int)nt), op, false); 46 | 47 | if(!tid) { 48 | if(1 == num_ctas) *reduction = scalar; 49 | else partials_data[cta] = scalar; 50 | } 51 | }; 52 | cta_launch(k, num_ctas, context); 53 | 54 | // Recursively call reduce until there's just one scalar. 
55 | if(num_ctas > 1) 56 | reduce >(partials_data, num_ctas, reduction, op, 57 | context); 58 | } 59 | 60 | template 62 | void transform_reduce(func_t f, int count, output_it reduction, op_t op, 63 | context_t& context) { 64 | 65 | typedef typename std::iterator_traits::value_type type_t; 66 | reduce(make_load_iterator(f), count, reduction, op, 67 | context); 68 | } 69 | 70 | END_MGPU_NAMESPACE 71 | -------------------------------------------------------------------------------- /code/include/moderngpu/kernel_sortedsearch.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "cta_merge.hxx" 3 | #include "search.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | template 10 | void sorted_search(needles_it needles, int num_needles, haystack_it haystack, 11 | int num_haystack, indices_it indices, comp_it comp, context_t& context) { 12 | 13 | typedef typename conditional_typedef_t, 16 | arch_35_cta<128, 11>, 17 | arch_52_cta<128, 15> 18 | > 19 | >::type_t launch_t; 20 | 21 | typedef typename std::iterator_traits::value_type type_t; 22 | 23 | // Partition the needles and haystacks into tiles. 24 | mem_t partitions = merge_path_partitions(needles, num_needles, 25 | haystack, num_haystack, launch_t::nv(context), comp, context); 26 | const int* mp_data = partitions.data(); 27 | 28 | auto k = [=]MGPU_DEVICE(int tid, int cta) { 29 | typedef typename launch_t::sm_ptx params_t; 30 | enum { nt = params_t::nt, vt = params_t::vt, nv = nt * vt }; 31 | 32 | __shared__ union { 33 | type_t keys[nv + 1]; 34 | int indices[nv]; 35 | } shared; 36 | 37 | // Load the range for this CTA and merge the values into register. 38 | int mp0 = mp_data[cta + 0]; 39 | int mp1 = mp_data[cta + 1]; 40 | merge_range_t range = compute_merge_range(num_needles, num_haystack, cta, 41 | nv, mp0, mp1); 42 | 43 | // Merge the values needles and haystack. 
44 | merge_pair_t merge = cta_merge_from_mem( 45 | needles, haystack, range, tid, comp, shared.keys); 46 | 47 | // Store the needle indices to shared memory. 48 | iterate([&](int i) { 49 | if(merge.indices[i] < range.a_count()) { 50 | int needle = merge.indices[i]; 51 | int haystack = range.b_begin + vt * tid + i - needle; 52 | shared.indices[needle] = haystack; 53 | } 54 | }); 55 | __syncthreads(); 56 | 57 | shared_to_mem(shared.indices, tid, range.a_count(), 58 | indices + range.a_begin); 59 | }; 60 | 61 | cta_transform(k, num_needles + num_haystack, context); 62 | } 63 | 64 | END_MGPU_NAMESPACE 65 | -------------------------------------------------------------------------------- /code/include/moderngpu/launch_box.hxx: -------------------------------------------------------------------------------- 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com 2 | #pragma once 3 | 4 | #include "context.hxx" 5 | 6 | BEGIN_MGPU_NAMESPACE 7 | 8 | // Specializable launch parameters. 
9 | struct launch_box_default_t { 10 | typedef launch_cta_t<0, 0, 0> sm_00; 11 | typedef empty_t sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53; 12 | 13 | template 14 | using rebind = launch_box_default_t; 15 | }; 16 | 17 | template 18 | struct launch_box_t : inherit_t { 19 | typedef inherit_t base_t; 20 | 21 | typedef typename conditional_typedef_t< 22 | typename base_t::sm_20, typename base_t::sm_00 23 | >::type_t sm_20; 24 | 25 | #define INHERIT_LAUNCH_PARAMS(new_ver, old_ver) \ 26 | typedef typename conditional_typedef_t< \ 27 | typename base_t::sm_##new_ver, sm_##old_ver \ 28 | >::type_t sm_##new_ver; 29 | 30 | INHERIT_LAUNCH_PARAMS(21, 20) 31 | INHERIT_LAUNCH_PARAMS(30, 21) 32 | INHERIT_LAUNCH_PARAMS(32, 30) 33 | INHERIT_LAUNCH_PARAMS(35, 30) 34 | INHERIT_LAUNCH_PARAMS(37, 35) 35 | INHERIT_LAUNCH_PARAMS(50, 35) 36 | INHERIT_LAUNCH_PARAMS(52, 50) 37 | INHERIT_LAUNCH_PARAMS(53, 50) 38 | 39 | // Overwrite the params defined for sm_00 so that the host-side compiler 40 | // has all expected symbols available to it. 41 | typedef sm_53 sm_00; 42 | typedef MGPU_LAUNCH_PARAMS(launch_box_t) sm_ptx; 43 | 44 | static cta_dim_t cta_dim(int ptx_version) { 45 | // Ptx version from cudaFuncGetAttributes. 
46 | if (ptx_version == 53) return cta_dim_t { sm_53::nt, sm_53::vt }; 47 | else if(ptx_version >= 52) return cta_dim_t { sm_52::nt, sm_52::vt }; 48 | else if(ptx_version >= 50) return cta_dim_t { sm_50::nt, sm_50::vt }; 49 | else if(ptx_version == 37) return cta_dim_t { sm_37::nt, sm_37::vt }; 50 | else if(ptx_version >= 35) return cta_dim_t { sm_35::nt, sm_35::vt }; 51 | else if(ptx_version == 32) return cta_dim_t { sm_32::nt, sm_32::vt }; 52 | else if(ptx_version >= 30) return cta_dim_t { sm_30::nt, sm_30::vt }; 53 | else if(ptx_version >= 21) return cta_dim_t { sm_21::nt, sm_21::vt }; 54 | else if(ptx_version >= 20) return cta_dim_t { sm_20::nt, sm_20::vt }; 55 | else return cta_dim_t { -1, 0 }; 56 | } 57 | 58 | static cta_dim_t cta_dim(const context_t& context) { 59 | return cta_dim(context.ptx_version()); 60 | } 61 | 62 | static int nv(const context_t& context) { 63 | return cta_dim(context.ptx_version()).nv(); 64 | } 65 | }; 66 | 67 | 68 | template 69 | int occupancy(func_t f, const context_t& context, args_t... 
args) { 70 | int num_blocks; 71 | int nt = launch_box::cta_dim(context).nt; 72 | cudaError_t result = cudaOccupancyMaxActiveBlocksPerMultiprocessor( 73 | &num_blocks, 74 | &launch_box_cta_k, 75 | nt, 76 | (size_t)0 77 | ); 78 | if(cudaSuccess != result) throw cuda_exception_t(result); 79 | return context.props().multiProcessorCount * num_blocks; 80 | } 81 | 82 | END_MGPU_NAMESPACE 83 | -------------------------------------------------------------------------------- /code/include/moderngpu/launch_params.hxx: -------------------------------------------------------------------------------- 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com 2 | #pragma once 3 | 4 | #include "meta.hxx" 5 | #include "tuple.hxx" 6 | 7 | #ifdef __CUDA_ARCH__ 8 | #if __CUDA_ARCH__ == 530 9 | #define MGPU_SM_TAG sm_53 10 | #elif __CUDA_ARCH__ >= 520 11 | #define MGPU_SM_TAG sm_52 12 | #elif __CUDA_ARCH__ >= 500 13 | #define MGPU_SM_TAG sm_50 14 | #elif __CUDA_ARCH__ == 370 15 | #define MGPU_SM_TAG sm_37 16 | #elif __CUDA_ARCH__ >= 350 17 | #define MGPU_SM_TAG sm_35 18 | #elif __CUDA_ARCH__ == 320 19 | #define MGPU_SM_TAG sm_32 20 | #elif __CUDA_ARCH__ >= 300 21 | #define MGPU_SM_TAG sm_30 22 | #elif __CUDA_ARCH__ >= 210 23 | #define MGPU_SM_TAG sm_21 24 | #elif __CUDA_ARCH__ >= 200 25 | #define MGPU_SM_TAG sm_20 26 | #else 27 | #error "Modern GPU v3 does not support builds for sm_1.x" 28 | #endif 29 | #else // __CUDA_ARCH__ 30 | #define MGPU_SM_TAG sm_00 31 | #endif 32 | 33 | #define MGPU_LAUNCH_PARAMS(launch_box) \ 34 | typename launch_box::MGPU_SM_TAG 35 | #define MGPU_LAUNCH_BOUNDS(launch_box) \ 36 | __launch_bounds__(launch_box::sm_ptx::nt, launch_box::sm_ptx::occ) 37 | 38 | BEGIN_MGPU_NAMESPACE 39 | 40 | struct MGPU_ALIGN(8) cta_dim_t { 41 | int nt, vt; 42 | int nv() const { return nt * vt; } 43 | int num_ctas(int count) const { 44 | return div_up(count, nv()); 45 | } 46 | }; 47 | 48 | namespace detail { 49 | 50 | // Due to a bug in the compiler we need to expand 
make_restrict() before 51 | // branching on cta < num_ctas. 52 | template 53 | MGPU_DEVICE void restrict_forward(func_t f, int tid, int cta, int num_ctas, 54 | args_t... args) { 55 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 56 | if(cta < num_ctas) 57 | #endif 58 | f(tid, cta, args...); 59 | } 60 | 61 | } 62 | 63 | // Generic thread cta kernel. 64 | template 65 | __global__ MGPU_LAUNCH_BOUNDS(launch_box) 66 | void launch_box_cta_k(func_t f, int num_ctas, args_t... args) { 67 | // Masking threadIdx.x by (nt - 1) may help strength reduction because the 68 | // compiler now knows the range of tid: (0, nt). 69 | typedef typename launch_box::sm_ptx params_t; 70 | int tid = (int)(threadIdx.x % (unsigned)params_t::nt); 71 | int cta = blockIdx.x; 72 | 73 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 74 | cta += gridDim.x * blockIdx.y; 75 | #endif 76 | 77 | detail::restrict_forward(f, tid, cta, num_ctas, make_restrict(args)...); 78 | } 79 | 80 | // Dummy kernel for retrieving PTX version. 81 | template 82 | __global__ void dummy_k() { } 83 | 84 | template 85 | struct launch_cta_t { 86 | enum { nt = nt_, vt = vt_, vt0 = vt0_, occ = occ_ }; 87 | }; 88 | 89 | #define DEF_ARCH_STRUCT(ver) \ 90 | template \ 91 | struct arch_##ver : base_t { \ 92 | typedef params_t sm_##ver; \ 93 | \ 94 | template \ 95 | using rebind = arch_##ver; \ 96 | }; \ 97 | \ 98 | template \ 99 | using arch_##ver##_cta = arch_##ver >; 100 | 101 | DEF_ARCH_STRUCT(20) 102 | DEF_ARCH_STRUCT(21) 103 | DEF_ARCH_STRUCT(30) 104 | DEF_ARCH_STRUCT(32) 105 | DEF_ARCH_STRUCT(35) 106 | DEF_ARCH_STRUCT(37) 107 | DEF_ARCH_STRUCT(50) 108 | DEF_ARCH_STRUCT(52) 109 | DEF_ARCH_STRUCT(53) 110 | 111 | #undef DEF_ARCH_STRUCT 112 | 113 | struct context_t; 114 | 115 | // Non-specializable launch parameters. 
116 | template 117 | struct launch_params_t : launch_cta_t { 118 | typedef launch_params_t sm_ptx; 119 | 120 | static cta_dim_t cta_dim() { 121 | return cta_dim_t { nt, vt }; 122 | } 123 | 124 | static cta_dim_t cta_dim(int) { 125 | return cta_dim(); 126 | } 127 | 128 | static cta_dim_t cta_dim(const context_t& context) { 129 | return cta_dim(); 130 | } 131 | 132 | static int nv(const context_t& context) { 133 | return cta_dim().nv(); 134 | } 135 | }; 136 | 137 | END_MGPU_NAMESPACE 138 | -------------------------------------------------------------------------------- /code/include/moderngpu/memory.hxx: -------------------------------------------------------------------------------- 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com 2 | #pragma once 3 | 4 | #include "transform.hxx" 5 | #include "context.hxx" 6 | 7 | BEGIN_MGPU_NAMESPACE 8 | 9 | //////////////////////////////////////////////////////////////////////////////// 10 | // Memory functions on raw pointers. 11 | 12 | template 13 | cudaError_t htoh(type_t* dest, const type_t* source, size_t count) { 14 | if(count) 15 | memcpy(dest, source, sizeof(type_t) * count); 16 | return cudaSuccess; 17 | } 18 | 19 | template 20 | cudaError_t dtoh(type_t* dest, const type_t* source, size_t count) { 21 | cudaError_t result = count ? 22 | cudaMemcpy(dest, source, sizeof(type_t) * count, 23 | cudaMemcpyDeviceToHost) : 24 | cudaSuccess; 25 | return result; 26 | } 27 | 28 | template 29 | cudaError_t htod(type_t* dest, const type_t* source, size_t count) { 30 | cudaError_t result = count ? 31 | cudaMemcpy(dest, source, sizeof(type_t) * count, 32 | cudaMemcpyHostToDevice) : 33 | cudaSuccess; 34 | return result; 35 | } 36 | 37 | template 38 | cudaError_t dtod(type_t* dest, const type_t* source, size_t count) { 39 | cudaError_t result = count ? 
40 | cudaMemcpy(dest, source, sizeof(type_t) * count, 41 | cudaMemcpyDeviceToDevice) : 42 | cudaSuccess; 43 | return result; 44 | } 45 | 46 | template 47 | cudaError_t dtoh(std::vector& dest, const type_t* source, 48 | size_t count) { 49 | dest.resize(count); 50 | return dtoh(dest.data(), source, count); 51 | } 52 | 53 | template 54 | cudaError_t htod(type_t* dest, const std::vector& source) { 55 | return htod(dest, source.data(), source.size()); 56 | } 57 | 58 | //////////////////////////////////////////////////////////////////////////////// 59 | // Memory functions on mem_t. 60 | 61 | template 62 | mem_t to_mem(const std::vector& data, context_t& context) { 63 | mem_t mem(data.size(), context); 64 | cudaError_t result = htod(mem.data(), data); 65 | if(cudaSuccess != result) throw cuda_exception_t(result); 66 | return mem; 67 | } 68 | 69 | template 70 | std::vector from_mem(const mem_t& mem) { 71 | std::vector host; 72 | cudaError_t result = dtoh(host, mem.data(), mem.size()); 73 | if(cudaSuccess != result) throw cuda_exception_t(result); 74 | return host; 75 | } 76 | 77 | template 78 | mem_t fill_function(func_t f, size_t count, context_t& context) { 79 | mem_t mem(count, context); 80 | type_t* p = mem.data(); 81 | transform([=]MGPU_DEVICE(int index) { 82 | p[index] = f(index); 83 | }, count, context); 84 | return mem; 85 | } 86 | 87 | template 88 | mem_t fill(type_t value, size_t count, context_t& context) { 89 | // We'd prefer to call fill_function and pass a lambda that returns value, 90 | // but that can create tokens that are too long for VS2013. 
91 | mem_t mem(count, context); 92 | type_t* p = mem.data(); 93 | transform([=]MGPU_DEVICE(int index) { 94 | p[index] = value; 95 | }, count, context); 96 | return mem; 97 | } 98 | 99 | template 100 | auto copy_to_mem(it_t input, size_t count, context_t& context) -> 101 | mem_t::value_type> { 102 | 103 | typedef typename std::iterator_traits::value_type type_t; 104 | mem_t mem(count, context); 105 | type_t* p = mem.data(); 106 | transform([=]MGPU_DEVICE(int index) { 107 | p[index] = input[index]; 108 | }, count, context); 109 | return mem; 110 | } 111 | 112 | inline std::mt19937& get_mt19937() { 113 | static std::mt19937 mt19937; 114 | return mt19937; 115 | } 116 | 117 | mem_t inline fill_random(int a, int b, size_t count, bool sorted, 118 | context_t& context) { 119 | 120 | std::uniform_int_distribution d(a, b); 121 | std::vector data(count); 122 | 123 | for(int& i : data) 124 | i = d(get_mt19937()); 125 | if(sorted) 126 | std::sort(data.begin(), data.end()); 127 | 128 | return to_mem(data, context); 129 | } 130 | 131 | END_MGPU_NAMESPACE 132 | -------------------------------------------------------------------------------- /code/include/moderngpu/search.hxx: -------------------------------------------------------------------------------- 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com 2 | #pragma once 3 | 4 | #include "loadstore.hxx" 5 | #include "operators.hxx" 6 | #include "cta_search.hxx" 7 | #include "memory.hxx" 8 | #include "context.hxx" 9 | 10 | BEGIN_MGPU_NAMESPACE 11 | 12 | template 14 | mem_t merge_path_partitions(a_keys_it a, int64_t a_count, b_keys_it b, 15 | int64_t b_count, int64_t spacing, comp_t comp, context_t& context) { 16 | 17 | typedef int int_t; 18 | int num_partitions = (int)div_up(a_count + b_count, spacing) + 1; 19 | mem_t mem(num_partitions, context); 20 | int_t* p = mem.data(); 21 | transform([=]MGPU_DEVICE(int index) { 22 | int_t diag = (int_t)min(spacing * index, a_count + b_count); 23 | p[index] = 
merge_path(a, (int_t)a_count, b, (int_t)b_count, 24 | diag, comp); 25 | }, num_partitions, context); 26 | return mem; 27 | } 28 | 29 | template 30 | auto load_balance_partitions(int64_t dest_count, segments_it segments, 31 | int num_segments, int spacing, context_t& context) -> 32 | mem_t::value_type> { 33 | 34 | typedef typename std::iterator_traits::value_type int_t; 35 | return merge_path_partitions(counting_iterator_t(0), 36 | dest_count, segments, num_segments, spacing, less_t(), context); 37 | } 38 | 39 | template 40 | mem_t binary_search_partitions(keys_it keys, int count, int num_items, 41 | int spacing, context_t& context) { 42 | 43 | int num_partitions = div_up(count, spacing) + 1; 44 | mem_t mem(num_partitions, context); 45 | int* p = mem.data(); 46 | transform([=]MGPU_DEVICE(int index) { 47 | int key = min(spacing * index, count); 48 | p[index] = binary_search(keys, num_items, key, less_t()); 49 | }, num_partitions, context); 50 | return mem; 51 | } 52 | 53 | END_MGPU_NAMESPACE 54 | -------------------------------------------------------------------------------- /code/include/moderngpu/sort_networks.hxx: -------------------------------------------------------------------------------- 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com 2 | #pragma once 3 | #include "operators.hxx" 4 | 5 | BEGIN_MGPU_NAMESPACE 6 | 7 | //////////////////////////////////////////////////////////////////////////////// 8 | // Odd-even transposition sorting network. Sorts keys and values in-place in 9 | // register. 
10 | // http://en.wikipedia.org/wiki/Odd%E2%80%93even_sort 11 | 12 | template 13 | MGPU_HOST_DEVICE array_t 14 | odd_even_sort(array_t x, comp_t comp, int flags = 0) { 15 | iterate([&](int I) { 16 | PRAGMA_UNROLL 17 | for(int i = 1 & I; i < vt - 1; i += 2) { 18 | if((0 == ((2<< i) & flags)) && comp(x[i + 1], x[i])) 19 | swap(x[i], x[i + 1]); 20 | } 21 | }); 22 | return x; 23 | } 24 | 25 | template 26 | MGPU_HOST_DEVICE kv_array_t 27 | odd_even_sort(kv_array_t x, comp_t comp, int flags = 0) { 28 | iterate([&](int I) { 29 | PRAGMA_UNROLL 30 | for(int i = 1 & I; i < vt - 1; i += 2) { 31 | if((0 == ((2<< i) & flags)) && comp(x.keys[i + 1], x.keys[i])) { 32 | swap(x.keys[i], x.keys[i + 1]); 33 | swap(x.vals[i], x.vals[i + 1]); 34 | } 35 | } 36 | }); 37 | return x; 38 | } 39 | 40 | //////////////////////////////////////////////////////////////////////////////// 41 | // TODO: Batcher Odd-Even Mergesort network 42 | // Unstable but executes much faster than the transposition sort. 43 | // http://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort 44 | #if 0 45 | template 46 | struct odd_even_mergesort_t { 47 | 48 | }; 49 | 50 | template 51 | MGPU_HOST_DEVICE kv_array_t 52 | odd_even_mergesort(kv_array_t x, int flags = 0) { 53 | return kv_array_t(); 54 | } 55 | #endif 56 | 57 | END_MGPU_NAMESPACE 58 | -------------------------------------------------------------------------------- /code/include/moderngpu/transform.hxx: -------------------------------------------------------------------------------- 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com 2 | #pragma once 3 | 4 | #include 5 | #include 6 | #include 7 | #include "launch_box.hxx" 8 | 9 | BEGIN_MGPU_NAMESPACE 10 | 11 | //////////////////////////////////////////////////////////////////////////////// 12 | // Launch a grid given a number of CTAs. 13 | 14 | template 15 | void cta_launch(func_t f, int num_ctas, context_t& context, args_t... 
args) { 16 | cta_dim_t cta = launch_box::cta_dim(context.ptx_version()); 17 | dim3 grid_dim(num_ctas); 18 | if(context.ptx_version() < 30 && num_ctas > 65535) 19 | grid_dim = dim3(256, div_up(num_ctas, 256)); 20 | 21 | if(num_ctas) 22 | launch_box_cta_k 23 | <<>>(f, num_ctas, args...); 24 | } 25 | 26 | template 27 | void cta_launch(func_t f, int num_ctas, context_t& context, args_t... args) { 28 | cta_launch >(f, num_ctas, context, args...); 29 | } 30 | 31 | //////////////////////////////////////////////////////////////////////////////// 32 | // Launch a grid given a number of work-items. 33 | 34 | template 35 | void cta_transform(func_t f, int count, context_t& context, args_t... args) { 36 | cta_dim_t cta = launch_box::cta_dim(context.ptx_version()); 37 | int num_ctas = div_up(count, cta.nv()); 38 | cta_launch(f, num_ctas, context, args...); 39 | } 40 | 41 | template 42 | void cta_transform(func_t f, int count, context_t& context, args_t... args) { 43 | cta_transform >(f, count, context, args...); 44 | } 45 | 46 | //////////////////////////////////////////////////////////////////////////////// 47 | // Launch persistent CTAs and loop through num_ctas values. 48 | 49 | template 50 | void cta_launch(func_t f, const int* num_tiles, context_t& context, 51 | args_t... args) { 52 | 53 | // Over-subscribe the device by a factor of 8. 54 | // This reduces the penalty if we can't schedule all the CTAs to run 55 | // concurrently. 56 | int num_ctas = 8 * occupancy(f, context); 57 | 58 | auto k = [=] MGPU_DEVICE(int tid, int cta, args_t... args) { 59 | int count = *num_tiles; 60 | while(cta < count) { 61 | f(tid, cta, args...); 62 | cta += num_ctas; 63 | } 64 | }; 65 | cta_launch(k, num_ctas, context, args...); 66 | } 67 | 68 | //////////////////////////////////////////////////////////////////////////////// 69 | // Ordinary transform launch. This uses the standard launch box mechanism 70 | // so we can query its occupancy and other things. 
71 | 72 | namespace detail { 73 | 74 | template 75 | struct transform_f { 76 | template 77 | MGPU_DEVICE void operator()(int tid, int cta, func_t f, 78 | size_t count, args_t... args) { 79 | 80 | typedef typename launch_t::sm_ptx params_t; 81 | enum { nt = params_t::nt, vt = params_t::vt, vt0 = params_t::vt0 }; 82 | 83 | range_t range = get_tile(cta, nt * vt, count); 84 | 85 | strided_iterate([=](int i, int j) { 86 | f(range.begin + j, args...); 87 | }, tid, range.count()); 88 | } 89 | }; 90 | 91 | } 92 | 93 | template 94 | void transform(func_t f, size_t count, context_t& context, args_t... args) { 95 | cta_transform(detail::transform_f(), count, 96 | context, f, count, args...); 97 | } 98 | 99 | template 100 | void transform(func_t f, size_t count, context_t& context, args_t... args) { 101 | transform >(f, count, context, args...); 102 | } 103 | 104 | END_MGPU_NAMESPACE 105 | -------------------------------------------------------------------------------- /code/include/moderngpu/types.hxx: -------------------------------------------------------------------------------- 1 | // moderngpu copyright (c) 2016, Sean Baxter http://www.moderngpu.com 2 | #pragma once 3 | 4 | #include "meta.hxx" 5 | #include "operators.hxx" 6 | 7 | BEGIN_MGPU_NAMESPACE 8 | 9 | struct cuda_exception_t : std::exception { 10 | cudaError_t result; 11 | 12 | cuda_exception_t(cudaError_t result_) : result(result_) { } 13 | virtual const char* what() const noexcept { 14 | return cudaGetErrorString(result); 15 | } 16 | }; 17 | 18 | 19 | template 20 | struct array_t { 21 | type_t data[size]; 22 | 23 | MGPU_HOST_DEVICE type_t operator[](int i) const { return data[i]; } 24 | MGPU_HOST_DEVICE type_t& operator[](int i) { return data[i]; } 25 | 26 | array_t() = default; 27 | array_t(const array_t&) = default; 28 | array_t& operator=(const array_t&) = default; 29 | 30 | // Fill the array with x. 
31 | MGPU_HOST_DEVICE array_t(type_t x) { 32 | iterate([&](int i) { data[i] = x; }); 33 | } 34 | }; 35 | 36 | template 37 | struct array_t { 38 | MGPU_HOST_DEVICE type_t operator[](int i) const { return type_t(); } 39 | MGPU_HOST_DEVICE type_t& operator[](int i) { return *(type_t*)nullptr; } 40 | }; 41 | 42 | // Reduce on components of array_t. 43 | template > 44 | MGPU_HOST_DEVICE type_t reduce(array_t x, op_t op = op_t()) { 45 | type_t a; 46 | iterate([&](int i) { 47 | a = i ? op(a, x[i]) : x[i]; 48 | }); 49 | return a; 50 | } 51 | 52 | // Call the operator component-wise on all components. 53 | template 54 | MGPU_HOST_DEVICE array_t combine(array_t x, 55 | array_t y, op_t op) { 56 | 57 | array_t z; 58 | iterate([&](int i) { z[i] = op(x[i], y[i]); }); 59 | return z; 60 | } 61 | 62 | template 63 | MGPU_HOST_DEVICE array_t operator+( 64 | array_t a, array_t b) { 65 | return combine(a, b, plus_t()); 66 | } 67 | 68 | template 69 | MGPU_HOST_DEVICE array_t operator-( 70 | array_t a, array_t b) { 71 | return combine(a, b, minus_t()); 72 | } 73 | 74 | 75 | template 76 | struct kv_array_t { 77 | array_t keys; 78 | array_t vals; 79 | }; 80 | 81 | enum bounds_t { 82 | bounds_lower, 83 | bounds_upper 84 | }; 85 | 86 | struct MGPU_ALIGN(8) range_t { 87 | int begin, end; 88 | MGPU_HOST_DEVICE int size() const { return end - begin; } 89 | MGPU_HOST_DEVICE int count() const { return size(); } 90 | MGPU_HOST_DEVICE bool valid() const { return end > begin; } 91 | }; 92 | 93 | MGPU_HOST_DEVICE range_t get_tile(int cta, int nv, int count) { 94 | return range_t { nv * cta, min(count, nv * (cta + 1)) }; 95 | } 96 | 97 | 98 | struct MGPU_ALIGN(16) merge_range_t { 99 | int a_begin, a_end, b_begin, b_end; 100 | 101 | MGPU_HOST_DEVICE int a_count() const { return a_end - a_begin; } 102 | MGPU_HOST_DEVICE int b_count() const { return b_end - b_begin; } 103 | MGPU_HOST_DEVICE int total() const { return a_count() + b_count(); } 104 | 105 | MGPU_HOST_DEVICE range_t a_range() const { 106 | 
return range_t { a_begin, a_end }; 107 | } 108 | MGPU_HOST_DEVICE range_t b_range() const { 109 | return range_t { b_begin, b_end }; 110 | } 111 | 112 | MGPU_HOST_DEVICE merge_range_t to_local() const { 113 | return merge_range_t { 0, a_count(), a_count(), total() }; 114 | } 115 | 116 | // Partition from mp to the end. 117 | MGPU_HOST_DEVICE merge_range_t partition(int mp0, int diag) const { 118 | return merge_range_t { a_begin + mp0, a_end, b_begin + diag - mp0, b_end }; 119 | } 120 | 121 | // Partition from mp0 to mp1. 122 | MGPU_HOST_DEVICE merge_range_t partition(int mp0, int diag0, 123 | int mp1, int diag1) const { 124 | return merge_range_t { 125 | a_begin + mp0, 126 | a_begin + mp1, 127 | b_begin + diag0 - mp0, 128 | b_begin + diag1 - mp1 129 | }; 130 | } 131 | 132 | MGPU_HOST_DEVICE bool a_valid() const { 133 | return a_begin < a_end; 134 | } 135 | MGPU_HOST_DEVICE bool b_valid() const { 136 | return b_begin < b_end; 137 | } 138 | }; 139 | 140 | template 141 | struct merge_pair_t { 142 | array_t keys; 143 | array_t indices; 144 | }; 145 | 146 | 147 | END_MGPU_NAMESPACE 148 | -------------------------------------------------------------------------------- /code/include/moderngpu/util.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "types.hxx" 3 | #include 4 | #include 5 | 6 | BEGIN_MGPU_NAMESPACE 7 | 8 | namespace detail { 9 | 10 | inline std::string stringprintf(const char* format, ...) { 11 | va_list args; 12 | va_start(args, format); 13 | int len = vsnprintf(0, 0, format, args); 14 | va_end(args); 15 | 16 | // allocate space. 
17 | std::string text; 18 | text.resize(len); 19 | 20 | va_start(args, format); 21 | vsnprintf(&text[0], len + 1, format, args); 22 | va_end(args); 23 | 24 | return text; 25 | } 26 | 27 | } // namespace detail 28 | 29 | END_MGPU_NAMESPACE 30 | 31 | -------------------------------------------------------------------------------- /code/include/pugixml/pugiconfig.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * pugixml parser - version 1.9 3 | * -------------------------------------------------------- 4 | * Copyright (C) 2006-2019, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) 5 | * Report bugs and download new versions at https://pugixml.org/ 6 | * 7 | * This library is distributed under the MIT License. See notice at the end 8 | * of this file. 9 | * 10 | * This work is based on the pugxml parser, which is: 11 | * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) 12 | */ 13 | 14 | #ifndef HEADER_PUGICONFIG_HPP 15 | #define HEADER_PUGICONFIG_HPP 16 | 17 | // Uncomment this to enable wchar_t mode 18 | // #define PUGIXML_WCHAR_MODE 19 | 20 | // Uncomment this to enable compact mode 21 | // #define PUGIXML_COMPACT 22 | 23 | // Uncomment this to disable XPath 24 | // #define PUGIXML_NO_XPATH 25 | 26 | // Uncomment this to disable STL 27 | // #define PUGIXML_NO_STL 28 | 29 | // Uncomment this to disable exceptions 30 | // #define PUGIXML_NO_EXCEPTIONS 31 | 32 | // Set this to control attributes for public classes/functions, i.e.: 33 | // #define PUGIXML_API __declspec(dllexport) // to export all public symbols from DLL 34 | // #define PUGIXML_CLASS __declspec(dllimport) // to import all classes from DLL 35 | // #define PUGIXML_FUNCTION __fastcall // to set calling conventions to all public functions to fastcall 36 | // In absence of PUGIXML_CLASS/PUGIXML_FUNCTION definitions PUGIXML_API is used instead 37 | 38 | // Tune these constants to adjust memory-related behavior 39 | // #define PUGIXML_MEMORY_PAGE_SIZE 
32768 40 | // #define PUGIXML_MEMORY_OUTPUT_STACK 10240 41 | // #define PUGIXML_MEMORY_XPATH_PAGE_SIZE 4096 42 | 43 | // Uncomment this to switch to header-only version 44 | 45 | #define PUGIXML_HEADER_ONLY 46 | 47 | // Uncomment this to enable long long support 48 | // #define PUGIXML_HAS_LONG_LONG 49 | 50 | #endif 51 | 52 | /** 53 | * Copyright (c) 2006-2019 Arseny Kapoulkine 54 | * 55 | * Permission is hereby granted, free of charge, to any person 56 | * obtaining a copy of this software and associated documentation 57 | * files (the "Software"), to deal in the Software without 58 | * restriction, including without limitation the rights to use, 59 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 60 | * copies of the Software, and to permit persons to whom the 61 | * Software is furnished to do so, subject to the following 62 | * conditions: 63 | * 64 | * The above copyright notice and this permission notice shall be 65 | * included in all copies or substantial portions of the Software. 66 | * 67 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 68 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 69 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 70 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 71 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 72 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 73 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 74 | * OTHER DEALINGS IN THE SOFTWARE. 
75 | */ 76 | -------------------------------------------------------------------------------- /code/scripts/collect_keyword_list_throughput.txt: -------------------------------------------------------------------------------- 1 | [ 2 | ('throughput', 'float'), 3 | ] -------------------------------------------------------------------------------- /code/scripts/configs/app_spec_ngap_new_quickvalidation_part2: -------------------------------------------------------------------------------- 1 | { 2 | "root": "/ngAP/automata_benchmark_original", 3 | "apps": [ 4 | { 5 | "name": "Snort", 6 | "input": "AutomataZoo/Snort/benchmarks/inputs/wrccdc2012.pcap", 7 | "mnrl": "AutomataZoo/Snort/benchmarks/automata/snort.mnrl", 8 | "anml_no_ORs": "AutomataZoo/Snort/benchmarks/anml_remove_or/automata_0.anml", 9 | "optimized_ANML": "AutomataZoo/Snort/benchmarks/optimized_ANML/automata_0.anml", 10 | "hs": "AutomataZoo/Snort/benchmarks/hs/automata.hs", 11 | "automata": "AutomataZoo/Snort/benchmarks/anml_remove_or/automata_0.anml", 12 | "quick_validation": "128259", 13 | }, 14 | { 15 | "name": "FileCarving", 16 | "input": "AutomataZoo/FileCarving/benchmarks/inputs/fat32_files.input", 17 | "anml": "AutomataZoo/FileCarving/benchmarks/automata/file_carver.anml", 18 | "anml_no_ORs": "AutomataZoo/FileCarving/benchmarks/anml_remove_or/automata_0.anml", 19 | "optimized_ANML": "AutomataZoo/FileCarving/benchmarks/optimized_ANML/automata_0.anml", 20 | "mnrl": "AutomataZoo/FileCarving/benchmarks/mnrl/automata_0.mnrl", 21 | "hs": "AutomataZoo/FileCarving/benchmarks/hs/automata.hs", 22 | "automata": "AutomataZoo/FileCarving/benchmarks/anml_remove_or/automata_0.anml", 23 | "quick_validation": "0", 24 | }, 25 | { 26 | "name": "ClamAV", 27 | "input": "AutomataZoo/ClamAV/benchmarks/inputs/clamav.input", 28 | "anml": "AutomataZoo/ClamAV/benchmarks/automata/clamav.anml", 29 | "anml_no_ORs": "AutomataZoo/ClamAV/benchmarks/anml_remove_or/automata_0.anml", 30 | "optimized_ANML": 
"AutomataZoo/ClamAV/benchmarks/optimized_ANML/automata_0.anml", 31 | "mnrl": "AutomataZoo/ClamAV/benchmarks/mnrl/automata_0.mnrl", 32 | "hs": "AutomataZoo/ClamAV/benchmarks/hs/automata.hs", 33 | "automata": "AutomataZoo/ClamAV/benchmarks/anml_remove_or/automata_0.anml", 34 | "quick_validation": "1", 35 | }, 36 | ], 37 | 38 | "exclude_apps": [ 39 | "Hamming_N1000_l22_d5", 40 | "Hamming_N1000_l31_d10", 41 | "Levenshtein_l24d5", 42 | "Levenshtein_l37d10", 43 | "RandomForest_20_400_270", 44 | "RandomForest_20_800_200", 45 | "SeqMatch_BIBLE_w6_p10", 46 | "Fermi", 47 | "FileCarving", 48 | "smallFileCarving", 49 | ], 50 | } -------------------------------------------------------------------------------- /code/scripts/configs/app_spec_ngap_new_quickvalidation_part3: -------------------------------------------------------------------------------- 1 | { 2 | "root": "/ngAP/automata_benchmark_original", 3 | "apps": [ 4 | { 5 | "name": "smallSnort", 6 | "input": "AutomataZoo/Snort/benchmarks/inputs/wrccdc2012.pcap", 7 | "mnrl": "AutomataZoo/Snort/benchmarks/automata/snort.mnrl", 8 | "anml_no_ORs": "AutomataZoo/Snort/benchmarks/anml_remove_or/automata_0.anml", 9 | "optimized_ANML": "AutomataZoo/Snort/benchmarks/optimized_ANML/automata_0.anml", 10 | "hs": "AutomataZoo/Snort/benchmarks/hs/automata.hs", 11 | "automata": "AutomataZoo/Snort/benchmarks/anml_remove_or/automata_0.anml", 12 | "quick_validation": "128259", 13 | "validation": 0, 14 | }, 15 | { 16 | "name": "smallFileCarving", 17 | "input": "AutomataZoo/FileCarving/benchmarks/inputs/fat32_files.input", 18 | "anml": "AutomataZoo/FileCarving/benchmarks/automata/file_carver.anml", 19 | "anml_no_ORs": "AutomataZoo/FileCarving/benchmarks/anml_remove_or/automata_0.anml", 20 | "optimized_ANML": "AutomataZoo/FileCarving/benchmarks/optimized_ANML/automata_0.anml", 21 | "mnrl": "AutomataZoo/FileCarving/benchmarks/mnrl/automata_0.mnrl", 22 | "hs": "AutomataZoo/FileCarving/benchmarks/hs/automata.hs", 23 | "automata": 
"AutomataZoo/FileCarving/benchmarks/anml_remove_or/automata_0.anml", 24 | "quick_validation": "0", 25 | }, 26 | { 27 | "name": "smallClamAV", 28 | "input": "AutomataZoo/ClamAV/benchmarks/inputs/clamav.input", 29 | "anml": "AutomataZoo/ClamAV/benchmarks/automata/clamav.anml", 30 | "anml_no_ORs": "AutomataZoo/ClamAV/benchmarks/anml_remove_or/automata_0.anml", 31 | "optimized_ANML": "AutomataZoo/ClamAV/benchmarks/optimized_ANML/automata_0.anml", 32 | "mnrl": "AutomataZoo/ClamAV/benchmarks/mnrl/automata_0.mnrl", 33 | "hs": "AutomataZoo/ClamAV/benchmarks/hs/automata.hs", 34 | "automata": "AutomataZoo/ClamAV/benchmarks/anml_remove_or/automata_0.anml", 35 | "quick_validation": "1", 36 | }, 37 | ], 38 | 39 | "exclude_apps": [ 40 | "Hamming_N1000_l22_d5", 41 | "Hamming_N1000_l31_d10", 42 | "Levenshtein_l24d5", 43 | "Levenshtein_l37d10", 44 | "RandomForest_20_400_270", 45 | "RandomForest_20_800_200", 46 | "SeqMatch_BIBLE_w6_p10", 47 | "Fermi", 48 | "FileCarving", 49 | "smallFileCarving", 50 | ], 51 | } -------------------------------------------------------------------------------- /code/scripts/configs/exec_config_ngap_groups_design_cpu: -------------------------------------------------------------------------------- 1 | { 2 | "exp_parameters" : { 3 | "before-hyperscan" : [ 4 | ("exec", 'hsrun', 'nocombination'), 5 | ("output-name", "before-hyperscan_", 'nocombination'), 6 | ("t", ["12"]), 7 | ("i", ["1000000"]), 8 | ("d", ["600"]), 9 | ], 10 | }, 11 | "exp_times" : 1, 12 | "out_prefix" : "output", 13 | "input_suffix" : "1MB", 14 | "exclude_configs" : ["o1-nonblocking-aas", "o1-nonblocking-unique"], 15 | } -------------------------------------------------------------------------------- /code/scripts/configs/exec_config_ngap_groups_design_cpu_oneinput: -------------------------------------------------------------------------------- 1 | { 2 | "exp_parameters" : { 3 | "before-hyperscan" : [ 4 | ("exec", 'hsrun', 'nocombination'), 5 | ("output-name", "before-hyperscan_", 
'nocombination'), 6 | ("t", ["12"]), 7 | ("i", ["1000000"]), 8 | ("d", ["1"]), 9 | ], 10 | }, 11 | "exp_times" : 1, 12 | "out_prefix" : "output", 13 | "input_suffix" : "1MB", 14 | "exclude_configs" : ["o1-nonblocking-aas", "o1-nonblocking-unique"], 15 | } -------------------------------------------------------------------------------- /code/scripts/configs/exec_config_ngap_groups_design_sota_runahead: -------------------------------------------------------------------------------- 1 | { 2 | "exp_parameters" : { 3 | "before-runahead-cc4" : [ 4 | ("exec", 'asyncap', 'nocombination'), 5 | ("output-name", "before-runahead-cc4_", 'nocombination'), 6 | ("algorithm", ["runahead"]), 7 | ("input-len", ["1000000"]), 8 | ("report-off", ['true']), 9 | ("duplicate-input-stream", ['600']), 10 | ("one-output-capacity", ['104619400']), 11 | ("scanning-R", ['999999999']), 12 | ("block-size", ['128']), 13 | ("record-ir", ['0']), 14 | ("blockDimX", ['-1']), 15 | ("num-streams", ['4']), 16 | ("merge-cc", ['4']), 17 | ("shrmem-wl", ['1']), 18 | ("shr_wl_len", ['4']), 19 | ("remove-degree", ['false']), 20 | ("quit-degree", ['false']), 21 | ], 22 | 23 | }, 24 | 25 | 'exp_times' : 1, 26 | 'out_prefix' : 'output', 27 | 'input_suffix' : '1MB' 28 | } 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /code/scripts/configs/exec_config_ngap_groups_design_sota_runahead_4degree: -------------------------------------------------------------------------------- 1 | { 2 | "exp_parameters" : { 3 | "before-runahead-cc4" : [ 4 | ("exec", 'asyncap', 'nocombination'), 5 | ("output-name", "before-runahead-cc4_", 'nocombination'), 6 | ("algorithm", ["runahead"]), 7 | ("input-len", ["1000000"]), 8 | ("report-off", ['true']), 9 | ("duplicate-input-stream", ['600']), 10 | ("one-output-capacity", ['104619400']), 11 | ("scanning-R", ['999999999']), 12 | ("block-size", ['128']), 13 | ("record-ir", ['0']), 14 | ("blockDimX", ['-1']), 15 | ("num-streams", 
['4']), 16 | ("merge-cc", ['4']), 17 | ("shrmem-wl", ['1']), 18 | ("shr_wl_len", ['4']), 19 | ("remove-degree", ['true']), 20 | ("quit-degree", ['false']), 21 | ], 22 | 23 | }, 24 | 25 | 'exp_times' : 1, 26 | 'out_prefix' : 'output', 27 | 'input_suffix' : '1MB' 28 | } 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /code/scripts/configs/exec_config_ngap_groups_design_sota_runahead_4degree_oneinput: -------------------------------------------------------------------------------- 1 | { 2 | "exp_parameters" : { 3 | "before-runahead-cc4" : [ 4 | ("exec", 'asyncap', 'nocombination'), 5 | ("output-name", "before-runahead-cc4_", 'nocombination'), 6 | ("algorithm", ["runahead"]), 7 | ("input-len", ["1000000"]), 8 | ("report-off", ['true']), 9 | ("duplicate-input-stream", ['1']), 10 | ("one-output-capacity", ['104619400']), 11 | ("scanning-R", ['999999999']), 12 | ("block-size", ['128']), 13 | ("record-ir", ['0']), 14 | ("blockDimX", ['-1']), 15 | ("num-streams", ['4']), 16 | ("merge-cc", ['4']), 17 | ("shrmem-wl", ['1']), 18 | ("shr_wl_len", ['4']), 19 | ("remove-degree", ['true']), 20 | ("quit-degree", ['false']), 21 | ], 22 | 23 | }, 24 | 25 | 'exp_times' : 1, 26 | 'out_prefix' : 'output', 27 | 'input_suffix' : '1MB' 28 | } 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /code/scripts/configs/exec_config_ngap_groups_design_sota_runahead_oneinput: -------------------------------------------------------------------------------- 1 | { 2 | "exp_parameters" : { 3 | "before-runahead-cc4" : [ 4 | ("exec", 'asyncap', 'nocombination'), 5 | ("output-name", "before-runahead-cc4_", 'nocombination'), 6 | ("algorithm", ["runahead"]), 7 | ("input-len", ["1000000"]), 8 | ("report-off", ['true']), 9 | ("duplicate-input-stream", ['1']), 10 | ("one-output-capacity", ['104619400']), 11 | ("scanning-R", ['999999999']), 12 | ("block-size", ['128']), 13 | ("record-ir", ['0']), 14 
| ("blockDimX", ['-1']), 15 | ("num-streams", ['4']), 16 | ("merge-cc", ['4']), 17 | ("shrmem-wl", ['1']), 18 | ("shr_wl_len", ['4']), 19 | ("remove-degree", ['false']), 20 | ("quit-degree", ['false']), 21 | ], 22 | 23 | }, 24 | 25 | 'exp_times' : 1, 26 | 'out_prefix' : 'output', 27 | 'input_suffix' : '1MB' 28 | } 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /code/scripts/llcommons.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import os, errno 3 | import collections 4 | import scipy 5 | import scipy.stats 6 | import pandas as pd 7 | import argparse 8 | import math 9 | 10 | # begin from Prof. Sree ----------------------------- 11 | def critlevel(n, level_perc): 12 | import scipy.stats 13 | # not the same alpha as in the eqns ... 14 | alpha = level_perc / 100.0 15 | 16 | if n > 32: 17 | return scipy.stats.norm.interval(alpha)[1] 18 | else: 19 | return scipy.stats.t.interval(alpha, n - 1)[1] 20 | 21 | def calc_ci(stdev, n, level_perc=95): 22 | t1 = critlevel(n, level_perc) 23 | se = stdev / math.sqrt(n) 24 | zt = t1*se 25 | return zt 26 | # end ------------------------------------------ 27 | 28 | 29 | nested_dict = lambda: collections.defaultdict(nested_dict) 30 | 31 | def get_layer1_folders(path): 32 | d = path 33 | print(path) 34 | return filter(lambda x: os.path.isdir(os.path.join(d, x)), os.listdir(d)) 35 | 36 | 37 | def get_layer1_files(mypath): 38 | print(mypath) 39 | onlyfiles = [f for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))] 40 | return onlyfiles 41 | 42 | 43 | def create_dirs_on_path(filepath): 44 | if not os.path.exists(os.path.dirname(filepath)): 45 | try: 46 | os.makedirs(os.path.dirname(filepath)) 47 | except OSError as exc: # Guard against race condition 48 | if exc.errno != errno.EEXIST: 49 | raise 50 | 51 | 52 | def replace_string_based_on_map(ss, mp): 53 | ss1 = ss 54 | for kw in mp: 55 | ss1 = 
ss1.replace(kw, mp[kw]) 56 | 57 | return ss1 58 | 59 | 60 | def read_file_to_string(filepath): 61 | #print os.getcwd() 62 | with open(filepath, 'r') as myfile: 63 | data=myfile.read() 64 | 65 | return data 66 | 67 | 68 | def get1Minput(path): 69 | for subdir, dirs, files in os.walk(path): 70 | #print files 71 | for ff in files: 72 | if ff.find('1MB') != -1: 73 | return ff 74 | 75 | 76 | def get_anml(path): 77 | for subdir, dirs, files in os.walk(path): 78 | #print files 79 | for ff in files: 80 | if ff.endswith('.anml'): 81 | return os.path.abspath(os.path.join(subdir, ff)) 82 | 83 | def get_hs(path): 84 | for subdir, dirs, files in os.walk(path): 85 | #print files 86 | for ff in files: 87 | if ff.endswith('.hs'): 88 | return os.path.abspath(os.path.join(subdir, ff)) 89 | 90 | 91 | def get_file_path(path, suffix): 92 | files = get_layer1_files(path) 93 | 94 | res = [] 95 | for f in files: 96 | assert(os.path.isfile(os.path.join(path, f))) 97 | 98 | filename_wo_ext = os.path.splitext(f)[0] 99 | if filename_wo_ext.endswith(suffix): 100 | res.append(os.path.abspath(os.path.join(path, f))) 101 | 102 | assert(len(res) == 1) 103 | 104 | return res[0] 105 | 106 | 107 | 108 | 109 | def mean_confidence_interval(data, confidence=0.95): 110 | import numpy as np; a = 1.0 * np.array(data) 111 | n = len(a) 112 | m, se = np.mean(a), scipy.stats.sem(a) 113 | h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1) 114 | return m, m-h, m+h 115 | 116 | 117 | 118 | if __name__ == '__main__': 119 | # just a few tests. 
120 | print(get_layer1_folders('../benchmarks')) 121 | print(get_layer1_files('../benchmarks/Brill/inputs')) 122 | for app in get_layer1_folders('../benchmarks'): 123 | print(get_file_path('../benchmarks/%s/inputs' % app, '1MB')) 124 | 125 | -------------------------------------------------------------------------------- /code/src/asyncap/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # add_custom_target(runahead 2 | # COMMAND make -C ${CMAKE_CURRENT_SOURCE_DIR} 3 | # COMMENT "Running Makefile in Runahead" 4 | # ) 5 | 6 | -------------------------------------------------------------------------------- /code/src/asyncap/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: directories 2 | 3 | 4 | #ifeq ($(CXXFLAGS),) 5 | CC = g++ 6 | AR = ar 7 | NVCC=/usr/local/cuda/bin/nvcc 8 | SM=sm_80 9 | 10 | # -I../mnrl/include 11 | CXXFLAGS= -std=c++14 -O3 -Iinclude -I../../include -I../../include/commons -I../../include/pugixml -I../../include/gpunfautils -w 12 | CUDA_INCLUDE=-I/usr/local/cuda/include 13 | NVCCFLAGS+= $(CUDA_INCLUDE) -O3 -D_FORCE_INLINES -arch ${SM} --default-stream per-thread --compiler-options -Wno-deprecated -lineinfo -cudart shared -rdc=true -use_fast_math -extra-device-vectorization -restrict 14 | # -Xptxas -v 15 | #endif 16 | 17 | #NVCCFLAGS += -I../cub #-I../obat/include 18 | 19 | objects := obj/run_ahead_approach.o obj/main.o 20 | libs_objects = obj/run_ahead_approach.o 21 | 22 | LDFLAGS= -L../../build/lib -lgpunfacommons -lgpunfautils -lpthread -ltbb 23 | 24 | 25 | all: directories bin/asyncap 26 | 27 | directories: 28 | mkdir -p obj 29 | mkdir -p export_lib 30 | mkdir -p bin 31 | 32 | export_lib/libgpunfa_runahead.so: $(libs_objects) 33 | $(NVCC) ${NVCCFLAGS} ${CXXFLAGS} -shared --compiler-options '-fPIC' ${libs_objects} ${LDFLAGS} -o $@ 34 | 35 | 36 | # bin/asyncap: ${objects} export_lib/libgpunfa_runahead.so 37 | bin/asyncap: ${objects} 38 | 
$(NVCC) ${NVCCFLAGS} ${CXXFLAGS} ${objects} ${LDFLAGS} -o $@ 39 | cp -r bin/* ../../build/bin/ 40 | 41 | obj/run_ahead_approach.o: 42 | $(NVCC) -c ${CXXFLAGS} ${NVCCFLAGS} --shared --compiler-options '-fPIC' src/run_ahead_approach.cu ${LDFLAGS} -o $@ 43 | 44 | obj/main.o: 45 | $(NVCC) -c ${CXXFLAGS} ${NVCCFLAGS} src/main.cu ${LDFLAGS} -o $@ 46 | 47 | clean: 48 | rm -rf obj 49 | rm -rf export_lib 50 | 51 | -------------------------------------------------------------------------------- /code/src/commons/SymbolStream.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * SymbolStream.cpp 3 | * 4 | * Created on: May 1, 2018 5 | * Author: hyliu 6 | */ 7 | 8 | #include "SymbolStream.h" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | using std::string; 22 | using std::set; 23 | using std::vector; 24 | using std::endl; 25 | using std::cout; 26 | using std::ios; 27 | 28 | 29 | SymbolStream::SymbolStream() { 30 | } 31 | 32 | SymbolStream::~SymbolStream() { 33 | } 34 | 35 | 36 | const set& SymbolStream::calc_alphabet() { 37 | this->alphabet.clear(); 38 | for (int i = 0; i < input.size(); i++) { 39 | alphabet.insert(input[i]); 40 | } 41 | 42 | //cout << "size_of_alphabet = " << alphabet.size() << endl; 43 | return alphabet; 44 | } 45 | 46 | 47 | int SymbolStream::get_length() const { 48 | return input.size(); 49 | } 50 | 51 | 52 | /** 53 | * From VASim 54 | */ 55 | static vector file2CharVector(string fn) { 56 | 57 | // open the file: 58 | std::ifstream file(fn, std::ios::binary); 59 | if(file.fail()){ 60 | if(errno == ENOENT) { 61 | cout<< " Error: no such input file." << endl; 62 | exit(-1); 63 | } 64 | } 65 | 66 | // get its size: 67 | std::streampos fileSize; 68 | 69 | file.seekg(0, std::ios::end); 70 | fileSize = file.tellg(); 71 | file.seekg(0, ios::beg); 72 | 73 | // Stop eating new lines in binary mode!!! 
74 | file.unsetf(std::ios::skipws); 75 | 76 | // reserve capacity 77 | std::vector vec; 78 | vec.reserve(fileSize); 79 | 80 | // read the data: 81 | vec.insert(vec.begin(), 82 | std::istream_iterator(file), 83 | std::istream_iterator()); 84 | 85 | return vec; 86 | 87 | } 88 | 89 | static vector generateRandomCharVector() { 90 | std::string path = "./random.txt"; 91 | std::ofstream randomFile(path); 92 | std::vector vec; 93 | std::srand(time(0)); 94 | 95 | printf("Save random file to %s\n", path.c_str()); 96 | if (randomFile.is_open()) { 97 | for (int i = 0; i < 1000050; i++) { 98 | char c = rand() % 256; 99 | vec.push_back(c); 100 | randomFile << c; 101 | } 102 | randomFile.close(); 103 | } 104 | return vec; 105 | } 106 | 107 | void SymbolStream::readFromFile(string filename) { 108 | string input_fn = filename; 109 | bool randomInput = false; 110 | vector input2; 111 | 112 | auto hasEnding = [](std::string const &fullString, 113 | std::string const &ending) -> bool { 114 | if (fullString.length() >= ending.length()) { 115 | return (0 == fullString.compare(fullString.length() - ending.length(), 116 | ending.length(), ending)); 117 | } else { 118 | return false; 119 | } 120 | }; 121 | 122 | if (hasEnding(input_fn, "random")) { 123 | randomInput = true; 124 | std::cout << "generate random input\n"; 125 | input2 = generateRandomCharVector(); 126 | } else { 127 | cout << "read input stream from file = " << input_fn << endl; 128 | input2 = file2CharVector(input_fn); 129 | } 130 | 131 | input.clear(); 132 | 133 | // copy bytes to unsigned ints 134 | uint32_t counter = 0; 135 | 136 | for(uint8_t val : input2){ 137 | input.push_back(val); 138 | } 139 | 140 | cout << "input_stream_size = " << input.size() << endl; 141 | 142 | this->fromFile = filename; 143 | } 144 | 145 | 146 | uint8_t SymbolStream::get_position(int pos) const { 147 | return this->input[pos]; 148 | } 149 | 150 | void SymbolStream::set_position(int pos, uint8_t c) { 151 | assert(pos >= 0 && pos < size()); 152 
| this->input[pos] = c; 153 | } 154 | 155 | SymbolStream SymbolStream::slice(int start, int len) const { 156 | SymbolStream res; 157 | 158 | assert(start >= 0); 159 | assert(len >= 0); 160 | assert(start < this->input.size()); 161 | 162 | if (start + len > this->input.size()) { 163 | len = this->input.size() - start; 164 | cout << "the input is shorter than the length specified, just slice to end" << endl; 165 | } 166 | 167 | assert(start + len <= this->input.size()); 168 | 169 | for (int i = start; i < start + len; i++) { 170 | res.input.push_back(this->input[i]); 171 | } 172 | 173 | return res; 174 | } 175 | 176 | 177 | 178 | 179 | void SymbolStream::padding_to_base(int base) { 180 | if (base <= 0) { 181 | return; 182 | } 183 | 184 | while (this->size() % base != 0) { 185 | this->input.push_back( (uint8_t) 0); 186 | } 187 | } -------------------------------------------------------------------------------- /code/src/commons/common_func.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "common_func.h" 3 | 4 | #include "NFA.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include "nfa_utils.h" 19 | 20 | #include 21 | #include 22 | 23 | 24 | 25 | using std::ifstream; 26 | using std::string; 27 | using std::endl; 28 | using std::cout; 29 | using std::pair; 30 | 31 | 32 | 33 | 34 | 35 | void tools::create_path_if_not_exists(string path) { 36 | struct stat info; 37 | 38 | if( stat( path.c_str(), &info ) != 0 ) 39 | { 40 | printf( "cannot access %s\n", path.c_str() ); 41 | 42 | const int dir_err = mkdir(path.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); 43 | if (-1 == dir_err) 44 | { 45 | printf("Error creating directory!\n"); 46 | exit(1); 47 | } else { 48 | puts("success create dir "); 49 | } 50 | 51 | } 52 | 53 | else if( info.st_mode & S_IFDIR ) // S_ISDIR() doesn't exist on my 
windows 54 | { 55 | printf( "%s is a directory\n", path.c_str() ); 56 | 57 | } 58 | else 59 | { 60 | printf( "%s is no directory\n", path.c_str() ); 61 | exit(-1); 62 | } 63 | 64 | } -------------------------------------------------------------------------------- /code/src/commons/node.cpp: -------------------------------------------------------------------------------- 1 | #include "node.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "vasim_helper.h" 12 | 13 | using std::cout; 14 | using std::endl; 15 | 16 | using namespace VASim; 17 | 18 | using std::set; 19 | using std::unique_ptr; 20 | using std::bitset; 21 | using std::string; 22 | using std::map; 23 | using std::vector; 24 | using std::list; 25 | using std::unordered_map; 26 | using std::pair; 27 | using std::make_pair; 28 | 29 | 30 | 31 | Node::Node() { 32 | 33 | str_id = ""; 34 | sid = -1; 35 | symbol_set.reset(); 36 | symbol_set_str = ""; 37 | start = 0; 38 | str_id = ""; 39 | cc_id = 0; 40 | 41 | scc_id = -1; 42 | topo_order = -1; 43 | 44 | this->original_id = "undefined"; 45 | 46 | report = false; 47 | 48 | complete = false; 49 | complement = false; 50 | 51 | 52 | this->hot_degree = 0.0; 53 | 54 | 55 | cg_id = -1; 56 | 57 | visited = false; 58 | } 59 | 60 | Node::~Node() { 61 | } 62 | 63 | 64 | bool Node::is_start_always_enabled() const { 65 | return (start == NODE_START_ENUM::START_ALWAYS_ENABLED); 66 | } 67 | 68 | bool Node::is_start() const { 69 | return (start == NODE_START_ENUM::START || start == NODE_START_ENUM::START_ALWAYS_ENABLED); 70 | } 71 | 72 | bool Node::is_report() const { 73 | return report; 74 | } 75 | 76 | 77 | void Node::symbol_set_to_bit() { 78 | parseSymbolSet(this->symbol_set, this->symbol_set_str); 79 | } 80 | 81 | 82 | 83 | bool Node::is_wildcard() const { 84 | return symbol_set.all(); 85 | } 86 | 87 | 88 | 89 | // if the symbol set is a reverse of one symbol, we classify this to not type. 
90 | bool Node::is_not_type_node() const { 91 | return (symbol_set.count() == 1); 92 | } 93 | 94 | 95 | 96 | int Node::num_of_accept_symbol() const { 97 | return (symbol_set.count()); 98 | } 99 | 100 | 101 | 102 | 103 | void Node::remap_alphabet(const map &remap_table) { 104 | bitset<256> remapped_symbol_set; 105 | remapped_symbol_set.reset(); 106 | for (auto it : remap_table) { 107 | int k = it.first; 108 | int v = it.second; 109 | //cout << "remap_alphabet_node " << k << " " << v << endl; 110 | 111 | if (this->symbol_set.test(k)) { 112 | remapped_symbol_set.set(v); 113 | } 114 | } 115 | 116 | this->symbol_set = remapped_symbol_set; 117 | 118 | } 119 | 120 | 121 | 122 | int Node::num_of_1_in_matchset() const { 123 | int n = 0; 124 | for (int i = 0; i < 256; i++) { 125 | auto symbol = (uint8_t) i; 126 | if (this->match2(i)) { 127 | n++; 128 | } 129 | } 130 | return n; 131 | } -------------------------------------------------------------------------------- /code/src/commons/report_formatter.cpp: -------------------------------------------------------------------------------- 1 | #include "report_formatter.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using std::cout; 11 | using std::endl; 12 | using std::vector; 13 | using std::string; 14 | 15 | report::report(int offset, string str_id, int cc, int input_stream_id): 16 | offset(offset), 17 | str_id(str_id), 18 | cc(cc), 19 | input_stream_id(input_stream_id) 20 | { 21 | } 22 | 23 | report_formatter::report_formatter() {} 24 | 25 | void report_formatter::add_report(report rp) { 26 | this->reports.push_back(rp); 27 | } 28 | 29 | 30 | void report_formatter::print_to_file(string filename, bool unique1) { 31 | cout << "report_fomatter_print_to_file_num_report = " << reports.size() << endl; 32 | std::sort(std::execution::par_unseq, reports.begin(), reports.end()); 33 | 34 | if (unique1) { 35 | reports.erase( std::unique(std::execution::par_unseq, reports.begin(), 
reports.end() ), reports.end() ); 36 | cout << "report_fomatter_print_to_file_unique_num_report = " << reports.size() << endl; 37 | } 38 | 39 | // std::ofstream out(filename); 40 | // cout<report_on = true; 19 | 20 | // set default alphabet 21 | this->alphabet.clear(); 22 | 23 | for (int i = 0; i < 256; i++) { 24 | this->alphabet.insert( (uint8_t) i ); 25 | } 26 | 27 | } 28 | 29 | 30 | abstract_algorithm::~abstract_algorithm() { 31 | } 32 | 33 | void abstract_algorithm::set_alphabet(set alphabet) { 34 | this->alphabet = alphabet; 35 | } 36 | 37 | const SymbolStream& abstract_algorithm::get_symbol_stream(int i) const { 38 | assert(i >= 0 && i < symbol_streams.size() ); 39 | 40 | return symbol_streams[i]; 41 | } 42 | 43 | 44 | void abstract_algorithm::add_symbol_stream(SymbolStream ss) { 45 | symbol_streams.push_back(ss); 46 | } 47 | 48 | 49 | void abstract_algorithm::set_block_size(int block_size) { 50 | this->block_size = block_size; 51 | 52 | } 53 | 54 | 55 | Array2 *abstract_algorithm::concat_input_streams_to_array2() { 56 | assert(symbol_streams.size() > 0); 57 | 58 | // cout << "padding_input_stream = " << this->padding_input_stream << endl; 59 | 60 | // for (int i = 0; i < this->symbol_streams.size(); i++) { 61 | // symbol_streams[i].padding_to_base(this->padding_input_stream); 62 | // } 63 | 64 | int length = symbol_streams[0].get_length(); 65 | 66 | for (auto ss : symbol_streams) { 67 | assert(length == ss.get_length()); 68 | } 69 | 70 | auto input = new Array2(symbol_streams.size() * length); 71 | 72 | int t = 0; 73 | 74 | for (auto ss : symbol_streams) { 75 | for (int p = 0; p < ss.get_length(); p++) { 76 | input->set(t++, ss.get_position(p)); 77 | } 78 | } 79 | 80 | return input; 81 | } 82 | 83 | 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /code/src/gpunfautils/common.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | #include 4 
| #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | 17 | using std::string; 18 | 19 | 20 | 21 | std::ostream& operator<<(std::ostream& os, const match_pair &obj) { 22 | os << obj.symbol_offset << " " << obj.state_id << ' '; 23 | return os; 24 | } 25 | 26 | -------------------------------------------------------------------------------- /code/src/infant/device_funcs.h: -------------------------------------------------------------------------------- 1 | #ifndef INFANT_KERNELS_DEV_FUNC_H_ 2 | #define INFANT_KERNELS_DEV_FUNC_H_ 3 | 4 | 5 | 6 | #define OUTPUT_BUFFER_TB 256 7 | 8 | __device__ inline bool get_bit(int *arr, int len, int n_bit) { 9 | int n_cell = n_bit / (sizeof(int) * 8); 10 | int offset = n_bit % (sizeof(int) * 8); 11 | 12 | return arr[n_cell] & (1 << offset); 13 | } 14 | 15 | 16 | 17 | __device__ inline void set_bit(int *arr, int len, int n_bit) { 18 | int n_cell = n_bit / (sizeof(int) * 8); 19 | int offset = n_bit % (sizeof(int) * 8); 20 | 21 | atomicOr(&arr[n_cell], (1 << offset)); 22 | 23 | } 24 | // Returns whether bit `n_bit` (0 = least significant) of `ele` is set. 25 | // The operand is shifted instead of an int-typed 1, so the test stays correct when T is wider than int. 26 | template 27 | __device__ inline bool get_bit_single(T ele, int n_bit) { 28 | return (ele >> n_bit) & 1; 29 | } 30 | 31 | 32 | #endif -------------------------------------------------------------------------------- /code/src/infant/infant.h: -------------------------------------------------------------------------------- 1 | /* 2 | * infant.h 3 | * 4 | * Created on: May 16, 2018 5 | * Author: hyliu 6 | */ 7 | 8 | #ifndef INFANT_H_ 9 | #define INFANT_H_ 10 | 11 | #include 12 | #include 13 | #include "commons/NFA.h" 14 | #include "commons/SymbolStream.h" 15 | #include 16 | 17 | #include "gpunfautils/array2.h" 18 | #include 19 | #include "infant_kernels.h" 20 | #include "infant_config.h" 21 | #include "gpunfautils/abstract_gpunfa.h" 22 | 23 | using std::string; 24 | using std::unique_ptr; 25 | using std::pair; 26 | 27 | 28 | class
AlphabetBasedTransitionTable { 29 | public: 30 | AlphabetBasedTransitionTable(const NFA& nfa, const set& alphabet); 31 | ~AlphabetBasedTransitionTable(); 32 | 33 | void print_basic_stats(); 34 | 35 | void init_state_vector(); 36 | 37 | int get_transition_table_length() const; 38 | 39 | const pair *get_transitions() const; 40 | 41 | const int *get_len() const; 42 | const int *get_index() const; 43 | 44 | const NFA& get_according_NFA() const; 45 | 46 | int get_length_of_state_bitvec() const; 47 | const int *get_enabled_bitvec() const; 48 | 49 | int get_max_edge_list_length_of_symbol() const { 50 | return max_edge_list_length_of_symbol; 51 | } 52 | 53 | double get_avg_edge_list_length_of_symbol() const { 54 | assert(alphabet.size() > 0); 55 | return this->sum_edge_list_length_of_symbol / alphabet.size(); 56 | } 57 | 58 | private: 59 | const NFA& nfa; 60 | const set &alphabet; 61 | 62 | pair * transitions; 63 | int len[256]; //symbol_trans_len 64 | int index[256]; //symbol_trans_len 65 | 66 | int num_transitions; 67 | int transition_table_length; 68 | 69 | int V; 70 | 71 | int *enabled_bitvec; 72 | int state_bitvec_length; 73 | 74 | bool *always_enabled; 75 | 76 | 77 | //statistics 78 | int max_edge_list_length_of_symbol; 79 | double sum_edge_list_length_of_symbol; 80 | 81 | }; 82 | 83 | 84 | 85 | 86 | class iNFAnt : public abstract_algorithm { 87 | public: 88 | vector old_ccs; 89 | infant_config* opt; 90 | 91 | iNFAnt(NFA *nfa); 92 | ~iNFAnt(); 93 | 94 | int get_num_nfa() const; 95 | 96 | const AlphabetBasedTransitionTable& get_trans_table(int i) const; 97 | //void add_transition_table(AlphabetBasedTransitionTable *tt); 98 | void add_NFA(NFA *nfa); 99 | const NFA *get_NFA(int index) const; 100 | 101 | 102 | const SymbolStream& get_symbol_stream(int i) const; 103 | void add_symbol_stream(SymbolStream ss); 104 | 105 | void init_host_transition_tables(); 106 | void prepare_host_state_info(); 107 | void prepare_host_input_streams(); 108 | 109 | void 
prepare_state_vector(); 110 | 111 | void allocate_device_data_structures(); 112 | 113 | void calc_state_bitvec_length(); 114 | 115 | const AlphabetBasedTransitionTable *get_transition_table(int k) const; 116 | 117 | 118 | void set_alphabet(set alphabet); 119 | 120 | void copy_to_device(); 121 | 122 | void launch_kernel() override; 123 | 124 | void to_reports() const; 125 | 126 | void set_num_state_per_group(int num_state_per_group) { 127 | this->num_state_per_group = num_state_per_group; 128 | } 129 | void set_option(infant_config &opt){ 130 | this->opt = &opt; 131 | } 132 | 133 | private: 134 | int num_state_per_group; 135 | 136 | vector nfas; 137 | vector transition_tables; // equals to num_nfa 138 | 139 | // -------------------------------------------------------------------- 140 | Array2 *arr_src_table; 141 | Array2 *arr_dst_table; 142 | 143 | 144 | Array2 *arr_start_position_transition_tables; // length equals to num_nfa 145 | 146 | Array2 *arr_symbol_trans_len; 147 | Array2 *arr_symbol_trans_index; 148 | 149 | Array2 *arr_states_status; // 00 is always enabled; is output; 150 | Array2 *arr_state_start_position; 151 | Array2 *num_of_state_per_tb; 152 | 153 | // input streams 154 | Array2 *arr_input_streams; 155 | Array2 *arr_input_streams2; 156 | //int input_stream_length, 157 | 158 | // state vector 159 | Array2 *arr_enabled_bitvec; 160 | int state_bitvec_length; // num of int per block 161 | 162 | // output processing 163 | Array2 *arr_match_count; 164 | 165 | 166 | }; 167 | 168 | 169 | #endif /* INFANT_H_ */ 170 | 171 | 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /code/src/infant/infant_config.h: -------------------------------------------------------------------------------- 1 | #ifndef INFANT_CONFIG 2 | #define INFANT_CONFIG 3 | 4 | #include "commons/common_func.h" 5 | 6 | using namespace clara; 7 | 8 | class infant_config : public common_gpunfa_options { 9 | public: 10 | infant_config() : 
common_gpunfa_options() 11 | { 12 | this->num_state_per_group = this->block_size; 13 | 14 | auto additional_parser = 15 | Opt(num_state_per_group, 16 | "num_state_per_group")["--num-state-per-group"]( 17 | "number of state per group in infant. ") | 18 | Opt(validation, 19 | "validation")["--validation"]("fake validation (do nothing)"); 20 | 21 | parser = parser | additional_parser; 22 | } 23 | 24 | int num_state_per_group; 25 | bool validation; 26 | }; 27 | 28 | 29 | #endif 30 | 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /code/src/infant/infant_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef INFANT_KERNELS_H_ 2 | #define INFANT_KERNELS_H_ 3 | 4 | #include "gpunfautils/common.h" 5 | #include 6 | 7 | 8 | /** 9 | * 10 | * stores to one output array 11 | * thread blocks have to compete on matc 12 | * 13 | * 14 | */ 15 | __global__ void infant_kernel_one_output( 16 | int *src_table, 17 | int *dst_table, 18 | int *trans_table_start_position, 19 | int *symbol_trans_len, 20 | int *symbol_trans_index, 21 | 22 | char *states_status, // 00 is always enabled; is output; 23 | int *state_start_position, 24 | int *num_of_states_per_tb, 25 | 26 | // input streams 27 | uint8_t *input_streams, 28 | int input_stream_length, 29 | 30 | // state vector 31 | int *enabled_bitvec, 32 | //int *active_bitvec, 33 | int state_bitvec_length, // num of int per block 34 | 35 | // output processing 36 | match_entry *match_array, // fixed size for each thread block, 37 | unsigned long long int match_array_capacity, 38 | unsigned long long int *match_count, 39 | bool report_on 40 | 41 | 42 | ); 43 | 44 | 45 | #endif /* INFANT_KERNELS_H_ */ 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /code/src/ngap/kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef NGAP_KERNELS_H_ 2 
| #define NGAP_KERNELS_H_ 3 | 4 | #include "graph.h" 5 | #include "group_graph.h" 6 | #include "my_bitset.h" 7 | #include "ngap_buffer.h" 8 | 9 | // O0 10 | __global__ void advanceAndFilterBlockingGroups( 11 | BlockingBuffer blb, uint8_t *arr_input_streams, int arr_input_streams_size, 12 | GroupMatchset gms, GroupNodeAttrs gna, GroupAAS gaas, GroupCsr gcsr); 13 | 14 | // NAP 15 | template 16 | __global__ void advanceAndFilterNonBlockingNAPGroups( 17 | NonBlockingBuffer nblb, uint8_t *arr_input_streams, 18 | int arr_input_streams_size, GroupMatchset gms, GroupNodeAttrs gna, 19 | GroupAAS gaas, GroupCsr gcsr); 20 | 21 | // O1 22 | template 23 | __global__ void advanceAndFilterNonBlockingGroups(NonBlockingBuffer nblb, 24 | uint8_t *arr_input_streams, 25 | int arr_input_streams_size, 26 | GroupMatchset gms, 27 | GroupNodeAttrs gna, 28 | GroupAAS gaas, GroupCsr gcsr); 29 | 30 | // O3 31 | template 32 | __global__ void advanceAndFilterNonBlockingPrecGroups( 33 | NonBlockingBuffer nblb, uint8_t *arr_input_streams, 34 | int arr_input_streams_size, GroupMatchset gms, GroupNodeAttrs gna, 35 | GroupAAS gaas, GroupCsr gcsr); 36 | 37 | // O4 38 | template 39 | __global__ void advanceAndFilterNonBlockingR1Groups( 40 | NonBlockingBuffer nblb, uint8_t *arr_input_streams, 41 | int arr_input_streams_size, GroupMatchset gms, GroupNodeAttrs gna, 42 | GroupAAS gaas, GroupCsr gcsr); 43 | 44 | template 45 | __global__ void advanceAndFilterNonBlockingR2Groups( 46 | NonBlockingBuffer nblb, uint8_t *arr_input_streams, 47 | int arr_input_streams_size, GroupMatchset gms, GroupNodeAttrs gna, 48 | GroupAAS gaas, GroupCsr gcsr); 49 | 50 | // OA 51 | template 52 | __global__ void advanceAndFilterNonBlockingAllGroups( 53 | NonBlockingBuffer nblb, uint8_t *arr_input_streams, 54 | int arr_input_streams_size, GroupMatchset gms, GroupNodeAttrs gna, 55 | GroupAAS gaas, GroupCsr gcsr); 56 | 57 | #endif -------------------------------------------------------------------------------- 
/code/src/ngap/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "NFA.h" 4 | #include "NFALoader.h" 5 | #include "SymbolStream.h" 6 | #include "graph.h" 7 | #include "kernel.h" 8 | #include "nfa_utils.h" 9 | #include "ngap.h" 10 | #include "ngap_buffer.h" 11 | 12 | #include "ngap_option.h" 13 | #include "node.h" 14 | #include "utils.h" 15 | 16 | int main(int argc, char *argv[]) { 17 | printf("Command: "); 18 | for (int i = 0; i < argc; i++) 19 | printf("%s ", argv[i]); 20 | printf("\n"); 21 | 22 | ngap_option opt; 23 | 24 | auto result = opt.parse(argc, argv); 25 | 26 | if (!result) { 27 | std::cerr << "Error in command line: " << result.errorMessage() 28 | << std::endl; 29 | exit(1); 30 | } 31 | 32 | if (opt.showHelp) { 33 | cout << opt.getHelp(); 34 | } 35 | 36 | std::string automata_filename = opt.nfa_filename; 37 | std::string input_filename = opt.input_filename; 38 | int start_pos = opt.input_start_pos, input_length = opt.input_len; 39 | std::string algo = opt.algorithm; 40 | std::string output_file_name = opt.report_filename; 41 | int dup_input_stream = opt.duplicate_input_stream; 42 | unsigned long long int one_output_capacity = opt.output_capacity; 43 | int block_size = opt.block_size; 44 | int max_size_of_cc = opt.max_nfa_size; 45 | int split_entire_inputstream_to_chunk_size = opt.split_chunk_size; 46 | 47 | SymbolStream ss, old_ss; 48 | old_ss.readFromFile(input_filename); 49 | if (start_pos != -1 && input_length != -1) { 50 | assert(start_pos >= 0); 51 | old_ss = old_ss.slice(start_pos, input_length); 52 | } 53 | // cout << "input_stream_size = " << ss.size() << endl; 54 | auto ab = old_ss.calc_alphabet(); 55 | 56 | auto nfa = load_nfa_from_file(automata_filename); 57 | nfa_utils::print_nfa_info(nfa); 58 | 59 | Graph g; 60 | // g.ReadANML(automata_filename); 61 | g.ReadNFA(nfa); 62 | printf("ReadANML finish \n"); 63 | g.copyToDevice(); 64 | 65 | ngap pl(nfa, g); 66 | 
pl.set_ngap_option(&opt); 67 | pl.set_max_cc_size_limit(max_size_of_cc); 68 | pl.preprocessing(); 69 | auto grouped_nfas = nfa_utils::group_nfas_by_num(opt.group_num, pl.ccs); 70 | printf("grouped_nfas.size = %zu pl.num_seg=%d\n", grouped_nfas.size(), 71 | pl.num_seg); 72 | std::vector gs; 73 | for (auto nfa : grouped_nfas) { 74 | Graph *g = new Graph(); 75 | g->ReadNFA(nfa); 76 | g->copyToDevice(); 77 | gs.push_back(g); 78 | } 79 | assert(gs.size() == opt.group_num); 80 | 81 | cout << "Input Stream Info:\n"; 82 | cout << " input_start_pos = " << start_pos << endl; 83 | cout << " input_length = " << input_length << endl; 84 | cout << " split_entire_inputstream_to_chunk_size = " 85 | << split_entire_inputstream_to_chunk_size << endl; 86 | cout << " dup_input_stream = " << dup_input_stream << endl; 87 | 88 | for (int i = 0; i < dup_input_stream; i++) { 89 | ss.concat(old_ss); 90 | } 91 | if (split_entire_inputstream_to_chunk_size > 0) { 92 | int sslen = ss.size(); 93 | int num_seg = sslen / split_entire_inputstream_to_chunk_size; 94 | pl.num_seg = num_seg; 95 | // cout << "num_seg_" << i << " = " << num_seg << endl; 96 | for (int j = 0; j < num_seg; j++) { 97 | int start_pos1 = j * split_entire_inputstream_to_chunk_size; 98 | auto ss_seg = 99 | ss.slice(start_pos1, split_entire_inputstream_to_chunk_size); 100 | pl.add_symbol_stream(ss_seg); 101 | } 102 | } 103 | 104 | pl.set_nfa_group(gs); 105 | pl.set_report_off(opt.report_off, opt.output_capacity, 106 | opt.duplicate_input_stream * opt.quick_validation); 107 | pl.set_output_file(output_file_name); 108 | pl.set_num_segment_per_ss(1); 109 | pl.set_output_buffer_size(one_output_capacity); 110 | pl.set_block_size(block_size); 111 | pl.set_alphabet(ab); 112 | pl.prepare_original_input_streams(ss); 113 | 114 | if (algo == "blockinggroups") { 115 | pl.launch_blocking_groups(); // BAP 116 | } else if (algo == "NAPgroups") { 117 | pl.launch_non_blocking_nap_groups(); // NAP 118 | } else if (algo == "nonblockinggroups") { 
119 | pl.launch_non_blocking_groups(); // O1 120 | } else if (algo == "nonblockingr1groups") { 121 | pl.launch_non_blocking_r1_groups(); // O4 122 | } else if (algo == "nonblockingr2groups") { 123 | pl.launch_non_blocking_r2_groups(); // O4 124 | } else if (algo == "nonblockingpcgroups") { 125 | pl.launch_non_blocking_prec_groups(); // O3 126 | } else if (algo == "nonblockingallgroups") { 127 | pl.launch_non_blocking_all_groups(); // OA 128 | } else { 129 | cout << "not supported algoritm " << algo << endl; 130 | } 131 | 132 | delete nfa; 133 | for (auto g : gs) 134 | delete g; 135 | printf("FINISHED!\n"); 136 | return 0; 137 | } 138 | -------------------------------------------------------------------------------- /code/src/ngap/ngap_buffer.h: -------------------------------------------------------------------------------- 1 | #ifndef NGAP_BUFFER_H_ 2 | #define NGAP_BUFFER_H_ 3 | 4 | #include "graph.h" 5 | #include "my_bitset.h" 6 | 7 | #include "ngap_option.h" 8 | #include "precompute_table.h" 9 | #include "utils.h" 10 | 11 | // #define DEBUG_PL_FILTER 12 | // #define DEBUG_PL_ADVANCE 13 | // #define DEBUG_SHADOW_BUFFER 14 | // #define DEBUG_PL_FILTER_ITER 15 | #define DEBUG_ITER 10000 16 | // #define DEBUG_MAX_BUFFER_SIZE // set data_buffer_stream_size large enough 17 | // #define DEBUG_PL_KERNEL_LAUNCH 18 | 19 | #define DEBUG_PL_ADVANCE_CONCAT // do not comment it out 20 | 21 | // #define DEBUG_FRONTIER_SIZE 22 | // #define MULTI_BLOCKS 23 | #define USE_CSR 24 | // #define USE_PRECOMP_ONCE 25 | // #define USE_PRECOMP_TWICE 26 | 27 | #ifndef DATA_BUFFER_SIZE 28 | // #define DATA_BUFFER_SIZE 300000000 29 | #define DATA_BUFFER_SIZE 1000000000LL 30 | #endif 31 | 32 | #ifndef DATA_BUFFER_SIZE_FRONTIER 33 | #define DATA_BUFFER_SIZE_FRONTIER 2000000000 34 | #endif 35 | 36 | #ifndef RESULTS_SIZE 37 | #define RESULTS_SIZE 80000000 38 | #endif 39 | 40 | #define MAX_THREADS_PER_BLOCK 256 41 | #define MIN_BLOCKS_PER_MP 16 42 | 43 | #define BLOCK_SIZE 256 44 | 45 | // 
#define PRINT_INDEX_QUEUE 46 | 47 | class BlockingBuffer { 48 | public: 49 | int buffer_capacity; 50 | int buffer_capacity_per_block; 51 | unsigned long long int results_capacity; 52 | bool unique; 53 | 54 | int *d_buffer; 55 | int *d_buffer_idx; 56 | int *d_buffer_size; 57 | uint64_t *d_results; 58 | unsigned long long int *d_results_size; 59 | 60 | int *d_froniter_length; 61 | int *d_froniter_end; 62 | 63 | int *d_froniter_divergence_end; 64 | int *d_froniter_divergence_advance; 65 | int *d_froniter_divergence_filter; 66 | int *d_froniter_workload_end; 67 | int *d_froniter_workload; 68 | 69 | int group_num; 70 | int num_seg; 71 | 72 | bool motivate_worklist_length; 73 | 74 | bool report_off; 75 | 76 | __host__ void init(Array2 *input_stream, int input_total_size, 77 | int input_num, int multi_ss_size, Graph &graph, 78 | ngap_option *plo); 79 | __host__ void init_nfagroups(Array2 *input_stream, 80 | int input_total_size, int input_num, 81 | int multi_ss_size, std::vector gs, 82 | ngap_option *plo); 83 | 84 | __host__ void release(); 85 | }; 86 | 87 | class NonBlockingBuffer { 88 | public: 89 | long long int buffer_capacity; 90 | long long int buffer_capacity_per_block; 91 | unsigned long long int results_capacity; 92 | int data_buffer_fetch_size = 64; 93 | 94 | int add_aas_start = 1000; 95 | int add_aas_interval; 96 | 97 | int active_threshold; 98 | 99 | bool unique; 100 | int unique_frequency; 101 | 102 | int *d_buffer; 103 | int *d_buffer_idx; 104 | int *d_buffer2; 105 | int *d_buffer_idx2; 106 | 107 | int *d_buffer_test; 108 | int *d_buffer_idx_test; 109 | uint *d_buffer_end_tmp_test; 110 | 111 | uint *d_buffer_start; 112 | uint *d_buffer_end; 113 | uint *d_buffer_end_tmp; 114 | uint64_t *d_results; 115 | uint32_t *d_results_v; 116 | uint32_t *d_results_i; 117 | unsigned long long int *d_results_size; 118 | int *d_symbol_table; 119 | int *d_newest_idx; 120 | 121 | int *prec_once_offset; 122 | int *prec_once; 123 | int *prec_twice_offset; 124 | int *prec_twice; 
125 | int *prec_once_report_offset; 126 | int *prec_once_report; 127 | int *prec_twice_report_offset; 128 | int *prec_twice_report; 129 | 130 | int *preresult; 131 | int *preresult_iter; 132 | int *preresult_size; 133 | int *preresult_end; 134 | 135 | int *d_fakeiter; 136 | int *d_fakeiter_size; 137 | int *d_fakeiter2; 138 | int *d_fakeiter_size2; 139 | int d_fakeiter_capacity; 140 | int *cutoffnum; 141 | 142 | int *d_froniter_length; 143 | int *d_froniter_end; 144 | 145 | // O3 146 | PrecTable *h_pts; 147 | PrecTable *d_pts; 148 | int precompute_depth = 0; 149 | int precompute_cutoff; 150 | 151 | int group_num; 152 | int num_seg; 153 | 154 | bool report_off; 155 | 156 | __host__ void init(Array2 *input_stream, int input_total_size, 157 | int input_num, int multi_ss_size, Graph &graph, 158 | ngap_option *plo); 159 | __host__ void init_nfagroups(Array2 *input_stream, 160 | int input_total_size, int input_num, 161 | int multi_ss_size, std::vector gs, 162 | ngap_option *plo); 163 | 164 | __host__ void release(bool isGroup = false); 165 | 166 | __host__ void reset(Array2 *input_stream, int input_total_size, 167 | int multi_ss_size, int group_num, std::vector gs, 168 | ngap_option *plo); 169 | }; 170 | 171 | #endif -------------------------------------------------------------------------------- /code/src/ngap/ngap_option.h: -------------------------------------------------------------------------------- 1 | #ifndef NGAP_OPTION_H_ 2 | #define NGAP_OPTION_H_ 3 | 4 | #include "commons/common_func.h" 5 | 6 | class ngap_option : public common_gpunfa_options { 7 | public: 8 | ngap_option() : common_gpunfa_options() { 9 | this->algorithm = "graph"; 10 | this->num_state_per_group = this->block_size; 11 | 12 | auto additional_parser = 13 | Opt(add_aas_start, "start number")["--add-aan-start"]( 14 | "the number of iteration to added always active state before " 15 | "execution") | 16 | Opt(add_aas_interval, "interval number")["--add-aas-interval"]( 17 | "the number of iteration 
to added always active state during " 18 | "execution") | 19 | Opt(unique, "true/false")["--unique"]("unique during execution") | 20 | Opt(active_threshold, "active-threshold")["--active-threshold"]( 21 | "the active thread number to enable work privatization") | 22 | Opt(validation, "true/false")["--validation"]("enable validation") | 23 | Opt(use_soa, "true/false")["--use-soa"]( 24 | "change the data layout of NFA topology") | 25 | Opt(precompute_cutoff, "precompute-cutoff")["--precompute-cutoff"]( 26 | "the threshold for table load balance") | 27 | Opt(precompute_depth, "precompute-depth")["--precompute-depth"]( 28 | "the prefix length for the memiozation table") | 29 | Opt(data_buffer_fetch_size, 30 | "data-buffer-fetch-size")["--data-buffer-fetch-size"]( 31 | "the number of states taken from the buffer in each iteration") | 32 | Opt(motivate_worklist_length, 33 | "true/false")["--motivate-worklist-length"]( 34 | "record worklist length") | 35 | Opt(num_state_per_group, 36 | "num_state_per_group")["--num-state-per-group"]( 37 | "number of state per group.") | 38 | Opt(group_num, "group_num")["--group-num"]("the group number for CCs") | 39 | Opt(tuning, "true/false")["--tuning"]("enable tuning") | 40 | Opt(pc_use_uvm, "true/false")["--pc-use-uvm"]( 41 | "use uvm to store memiozation tables") | 42 | Opt(adaptive_aas, "true/false")["--adaptive-aas"]( 43 | "use adaptive strategy for interval number") | 44 | Opt(try_adaptive_aas, "true/false")["--try-adaptive-aas"]( 45 | "retry when adaptive strategy failed") | 46 | Opt(compress_prec_table, "true/false")["--compress-prec-table"]( 47 | "compress memiozation tables"); 48 | 49 | parser = parser | additional_parser; 50 | } 51 | 52 | uint32_t data_buffer_fetch_size = 128; 53 | int add_aas_start = 0; 54 | int add_aas_interval = 1; 55 | bool unique = false; 56 | bool validation = true; 57 | int active_threshold = 20; 58 | bool use_soa = false; 59 | int precompute_cutoff = -1; 60 | int precompute_depth = 0; 61 | bool 
motivate_worklist_length = false; 62 | int num_state_per_group; 63 | int group_num = 10; 64 | bool compress_prec_table = true; 65 | bool tuning = false; 66 | bool pc_use_uvm = false; 67 | bool adaptive_aas = false; 68 | bool try_adaptive_aas = false; 69 | }; 70 | 71 | #endif 72 | -------------------------------------------------------------------------------- /code/src/obat/Makefile: -------------------------------------------------------------------------------- 1 | all: nvcc clang 2 | 3 | nvcc: 4 | nvcc -O0 --std=c++11 -Xptxas='-v' --source-in-ptx -m64 main.cu one_byte_at_a_time.cu -arch=sm_50 -I../../include -L../../build/lib -lgpunfacommons -lgpunfautils -keep -o obat1_nvcc 5 | 6 | clang: 7 | clang++ -O0 --std=c++11 main.cu one_byte_at_a_time.cu --cuda-path=${CUDA_ROOT} --cuda-gpu-arch=sm_50 -lcudart_static -ldl -lrt -pthread -L/home/hyliu/gcc65/install/lib64 -lstdc++ -D_GLIBCXX_USE_CXX11_ABI=0 -L${CUDA_ROOT}/lib64 -I../../include -L../../build/lib -lgpunfacommons -lgpunfautils -save-temps -o obat1_clang 8 | 9 | clean: 10 | rm -f *.o 11 | rm -f *.ii 12 | rm -f *.i 13 | rm -f *.ptx 14 | rm -f *fatbin* 15 | rm -f *cubin* 16 | rm -f *stub* 17 | rm -f *sm_* 18 | rm -f *cudafe* 19 | rm -f *module_id 20 | rm -f *dlink* 21 | rm -f a.out 22 | rm -f *.png 23 | rm -f *.txt 24 | rm -f *.ll 25 | rm -f *.bc 26 | rm -f *.s 27 | rm -f *.cui 28 | rm -f obat1* 29 | -------------------------------------------------------------------------------- /code/src/obat/one_byte_at_a_time.h: -------------------------------------------------------------------------------- 1 | #ifndef ONEBYTE_AT_A_TIME 2 | #define ONEBYTE_AT_A_TIME 3 | 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "commons/NFA.h" 13 | #include 14 | #include 15 | #include 16 | #include "commons/SymbolStream.h" 17 | #include 18 | #include 19 | #include "commons/report_formatter.h" 20 | #include "option_config.h" 21 | 22 | 23 | using std::unordered_map; 24 | 25 | 
26 | 27 | class one_byte_at_a_time : public abstract_algorithm { 28 | public: 29 | obat_config* opt; 30 | vector old_ccs; 31 | one_byte_at_a_time(NFA *nfa); 32 | virtual ~one_byte_at_a_time(); 33 | 34 | void preprocessing_enable_active(); 35 | 36 | void check_grouped_nfa_sizes(); 37 | 38 | void preprocessing_active_active(); 39 | 40 | void launch_kernel() override; 41 | 42 | void prepare_output_buffer(); 43 | 44 | void print_reports(string filename, report_formatter &rf); 45 | 46 | void organize_reports2(Array2 *output_buffer, int buffer_size, const vector &grouped_nfas1, report_formatter& rf); 47 | 48 | void remap_intid_of_nodes(remap_node_type tp); 49 | 50 | void remap_intid_of_nodes_with_boudary(remap_node_type tp, vector &grouped_nfa, const vector &boundaries); 51 | 52 | void hotstart_aa(); 53 | 54 | //OBAT series 55 | void OBAT_baseline_2(); 56 | void obat_MC(); 57 | 58 | // important. Activity based hot cold approach. 59 | void test_hotcold_nodup_queue_mc_CaH(); 60 | 61 | // hotstart 62 | void hotstart_ea(); 63 | void hotstart_ea_without_MC2(); 64 | 65 | void set_node_active_freq_map(map freq_map); 66 | void set_hot_limit_by_bfs_layer(int hot_limit_by_bfs_layer); 67 | void set_active_queue_size(int queuesize); 68 | 69 | void set_cold_threshold(double cold_threshold) { 70 | this->cold_thres = cold_threshold; 71 | } 72 | void set_option(obat_config &opt){ 73 | this->opt = &opt; 74 | } 75 | 76 | void print_node_matchset_complete_info(const vector &ccs); 77 | 78 | void test_data_movement_read_input_stream_only(int num_tb_x); 79 | 80 | void test_data_movement_read_input_stream_only2(int num_tb_x); 81 | 82 | bool hot_stage_only; 83 | bool remap_input_stream; 84 | 85 | int packing; 86 | string packing_filename; 87 | 88 | private: 89 | bool record_cold_warp_active_array; 90 | 91 | int history_queue_capacity; 92 | 93 | int active_queue_size; 94 | 95 | int hot_limit_by_bfs_layer; 96 | 97 | int max_indegree_of_cold_states; 98 | 99 | double cold_thres; 100 | 101 | int 
profile_length; 102 | 103 | map freq_map; 104 | 105 | vector grouped_nfas; 106 | 107 | Array2 *real_output_array; 108 | Array2 *tail_of_real_output_array; 109 | 110 | remap_node_type remap_node_id; 111 | 112 | 113 | }; 114 | 115 | 116 | 117 | #endif 118 | -------------------------------------------------------------------------------- /code/src/obat/option_config.h: -------------------------------------------------------------------------------- 1 | #ifndef OBAT_CONFIG 2 | #define OBAT_CONFIG 3 | 4 | #include "commons/common_func.h" 5 | 6 | using namespace clara; 7 | 8 | class obat_config : public common_gpunfa_options { 9 | public: 10 | obat_config() : common_gpunfa_options(), 11 | bfs_hot_ratio(0.0), 12 | hotcold_filter_filename(""), 13 | hot_n_state_limit(-1), 14 | hot_stage_only(false), 15 | remap_input_stream(false), 16 | active_queue_size(1024), 17 | packing(0), 18 | cold_threshold(0.001), 19 | num_of_blocks_read_input_only(1), 20 | validation{true} 21 | { 22 | 23 | auto additional_parser = 24 | Opt(bfs_hot_ratio, "bfs_hot_ratio")["--hot-limit-by-bfs-ratio"] 25 | ("The ratio of states to be fixed mapped to threads that offloaed by bfs-layer") 26 | | Opt(hotcold_filter_filename, "hot cold filter filename") 27 | ["--hot-cold-filter"] 28 | ("The file specifies which states are hot and thereby fixed mapped to threads.") 29 | 30 | | Opt(active_queue_size, "active_queue_size")["--active-queue-size"]["-q"]("worklist size in shared memory") 31 | 32 | | Opt(hot_n_state_limit, "hot_n_state_limit")["--hot-limit-by-bfs"]("hot-limit-by-bfs") 33 | | Opt(hot_stage_only, "hot_stage_only")["--hot-stage-only"]("only execute hot stage. " 34 | "Only works in hotstart_ea and hotstart_aa") 35 | | Opt(remap_input_stream, "remap_input_stream")["--remap-input-stream"] 36 | ("remap input stream to thread block. (Testing now only applicable to hotstart ea)") 37 | | Opt(packing, "packing")["--packing"]("The way of packing NFAs to thread blocks. Default: 0. Random 1. 
") 38 | | Opt(packing_activation_file, "packing_activation_file")["--packing-file"]("packing activation ratio file") 39 | | Opt(cold_threshold, "cold_threshold")["--cold-threshold"]("If we use profiling, what ratio can be considered to be cold") 40 | | Opt(num_of_blocks_read_input_only, "num_of_blocks_read_input_only")["--num-of-blocks-read-input-only"]("only for characterization. ") 41 | | Opt(validation, "validation")["--validation"]("fake validation (do nothing)"); 42 | parser = parser | additional_parser; 43 | 44 | } 45 | 46 | bool hot_stage_only; 47 | bool remap_input_stream; 48 | double bfs_hot_ratio; 49 | int hot_n_state_limit; 50 | string hotcold_filter_filename; 51 | int active_queue_size; 52 | 53 | int packing; 54 | string packing_activation_file; 55 | 56 | double cold_threshold; 57 | 58 | int num_of_blocks_read_input_only; 59 | 60 | bool validation; 61 | }; 62 | 63 | 64 | #endif 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /code/src/ppopp12/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "NFA.h" 6 | #include "NFALoader.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "SymbolStream.h" 12 | #include "ppopp12.h" 13 | #include "utils.h" 14 | #include "node.h" 15 | #include "nfa_utils.h" 16 | #include 17 | #include "moderngpu/context.hxx" 18 | #include "moderngpu/util.hxx" 19 | #include "ppopp12_option.h" 20 | 21 | 22 | using namespace clara; 23 | 24 | using std::set; 25 | using std::unique_ptr; 26 | using std::string; 27 | using std::cout; 28 | using std::endl; 29 | 30 | 31 | int main(int argc, char *argv[]) 32 | { 33 | printf("Command: "); 34 | for (int i = 0; i < argc; i++) 35 | printf("%s ", argv[i]); 36 | printf("\n"); 37 | 38 | ppopp12_config cfg; 39 | 40 | auto result = cfg.parse( argc, argv ); 41 | 42 | if( !result ) 43 | { 44 | std::cerr << "Error in command 
line: " << result.errorMessage() << std::endl; 45 | exit(1); 46 | } 47 | 48 | if (cfg.showHelp) { 49 | cout << cfg.getHelp(); 50 | } 51 | 52 | string automata_filename = cfg.nfa_filename; 53 | string input_filename = cfg.input_filename; 54 | int start_pos = cfg.input_start_pos, input_length = cfg.input_len; 55 | string algo = cfg.algorithm; 56 | string output_file_name = cfg.report_filename; 57 | int dup_input_stream = cfg.duplicate_input_stream; 58 | unsigned long long int one_output_capacity = cfg.output_capacity; 59 | int block_size = cfg.block_size; 60 | int max_size_of_cc = cfg.max_nfa_size; 61 | int split_entire_inputstream_to_chunk_size = cfg.split_chunk_size; 62 | 63 | SymbolStream ss; 64 | ss.readFromFile(input_filename); 65 | 66 | if (start_pos != -1 && input_length != -1) { 67 | assert(start_pos >= 0); 68 | ss = ss.slice(start_pos, input_length); 69 | } 70 | 71 | //cout << "input_stream_size = " << ss.size() << endl; 72 | auto ab = ss.calc_alphabet(); 73 | 74 | auto nfa = load_nfa_from_file(automata_filename); 75 | 76 | cout << "nfa_size_original = " << nfa->size() << endl; 77 | 78 | nfa_utils::print_starting_node_info(nfa); 79 | 80 | int active_state_array_size = block_size; 81 | 82 | ppopp12 p12(nfa); 83 | 84 | cout << "dup_input_stream = " << dup_input_stream << endl; 85 | cout << "split_entire_inputstream_to_chunk_size = " << split_entire_inputstream_to_chunk_size << endl; 86 | 87 | p12.set_max_cc_size_limit(max_size_of_cc); 88 | 89 | for (int i = 0; i < dup_input_stream; i++) { 90 | if (split_entire_inputstream_to_chunk_size == -1) { 91 | p12.add_symbol_stream(ss); 92 | } else { 93 | assert(split_entire_inputstream_to_chunk_size > 0); 94 | int sslen = ss.size(); 95 | int num_seg = sslen / split_entire_inputstream_to_chunk_size; 96 | 97 | cout << "num_seg_" << i << " = " << num_seg << endl; 98 | 99 | for (int j = 0; j < num_seg; j++) { 100 | int start_pos1 = j * split_entire_inputstream_to_chunk_size; 101 | auto ss_seg = ss.slice(start_pos1, 
split_entire_inputstream_to_chunk_size); 102 | p12.add_symbol_stream(ss_seg); 103 | } 104 | } 105 | } 106 | 107 | p12.set_report_off(cfg.report_off, cfg.output_capacity, 108 | cfg.duplicate_input_stream * cfg.quick_validation); 109 | p12.set_output_file(output_file_name); 110 | p12.set_num_segment_per_ss(1); 111 | p12.set_output_buffer_size(one_output_capacity); 112 | p12.set_block_size(block_size); 113 | p12.set_active_state_array_size(active_state_array_size); 114 | p12.set_alphabet(ab); 115 | p12.set_option(cfg); 116 | p12.validation = cfg.validation; 117 | 118 | 119 | p12.preprocessing(); 120 | 121 | 122 | if (algo == "ppopp12") { 123 | p12.launch_kernel(); 124 | } else if (algo == "ppopp12_inputshropt") { 125 | p12.launch_kernel_readinputchunk(); 126 | } 127 | else { 128 | cout <<"not supported algoritm " << algo << endl; 129 | 130 | } 131 | 132 | delete nfa; 133 | cout<< "FINISHED\n"; 134 | return 0; 135 | } 136 | 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /code/src/ppopp12/ppopp12.h: -------------------------------------------------------------------------------- 1 | #ifndef PPOPP12_H_ 2 | #define PPOPP12_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "NFA.h" 12 | #include "array2.h" 13 | #include "utils.h" 14 | #include "common.h" 15 | #include "SymbolStream.h" 16 | #include 17 | #include "abstract_gpunfa.h" 18 | #include "ppopp12_option.h" 19 | #include "compatible_group_helper.h" 20 | 21 | using std::map; 22 | using std::vector; 23 | using std::fill; 24 | using std::cout; 25 | using std::endl; 26 | using std::pair; 27 | using std::set; 28 | using std::make_pair; 29 | 30 | 31 | 32 | 33 | class ppopp12 : public abstract_algorithm { 34 | public: 35 | ppopp12_config* opt; 36 | vector old_ccs; 37 | ppopp12(NFA *nfa); 38 | ~ppopp12(); 39 | 40 | void set_block_size(int blocksize); 41 | void set_active_state_array_size(int 
active_state_array_size); 42 | void set_alphabet(set alphabet); 43 | 44 | void group_nfas(); 45 | 46 | virtual void preprocessing() override; 47 | 48 | int get_num_states_gpu() const; 49 | 50 | void prepare_transition_table(); 51 | 52 | void prepare_states_status(); 53 | 54 | void prepare_initial_active_state_array(); 55 | 56 | void prepare_state_start_position_tb(); 57 | void prepare_compatible_grps(); 58 | 59 | void prepare_input_streams(); 60 | 61 | void prepare_outputs(); 62 | 63 | void launch_kernel(); 64 | 65 | void launch_kernel_readinputchunk(); 66 | 67 | void print_reports(string filename); 68 | 69 | void set_num_segment_per_ss(int nn) { 70 | this->num_segment_per_ss = nn; 71 | } 72 | void set_option(ppopp12_config &opt){ 73 | this->opt = &opt; 74 | } 75 | 76 | 77 | int get_num_segment_per_ss() const { 78 | return num_segment_per_ss; 79 | } 80 | 81 | private: 82 | // for debug 83 | NFA* select_one_nfa_by_id(string str_id); 84 | 85 | void calc_str_id_to_compatible_group_per_block(); 86 | 87 | int active_state_array_size; 88 | 89 | map > nfa_group_tb; 90 | int num_nfa_chunk; 91 | map num_compatible_groups_cc; 92 | 93 | Array2 *state_start_position_tb; 94 | 95 | Array2 *num_state_tb; 96 | Array2 *array_compatible_group; 97 | Array2 *trans_table; 98 | 99 | Array2 *states_status; 100 | Array2 *initial_active_state_array; 101 | 102 | // input 103 | Array2 *arr_input_streams; 104 | 105 | // output 106 | Array2 *match_array; 107 | Array2 *match_count; 108 | 109 | map str_id_to_compatible_group; 110 | // per cc 111 | map str_id_to_compatible_group_per_block; 112 | // per block 113 | 114 | vector nfa_in_tb; 115 | 116 | bool no_cg; 117 | 118 | bool profile; 119 | 120 | int num_segment_per_ss; 121 | }; 122 | 123 | 124 | #endif 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 
-------------------------------------------------------------------------------- /code/src/ppopp12/ppopp12_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef PPOPP12_KERNELS_H_ 2 | #define PPOPP12_KERNELS_H_ 3 | 4 | #include "gpunfautils/common.h" 5 | #include 6 | 7 | 8 | __global__ void ppopp12_kernel( 9 | const __restrict__ int4 *transition_table, 10 | const int transition_table_length, 11 | 12 | int *state_start_position_tb, 13 | //int *num_state_tb, 14 | int *state_compatible_group, 15 | int *initial_active_state_array, 16 | int active_state_array_size, // currently it is the same as block size 17 | 18 | // for output / and start always enabled. 19 | int8_t *states_status, // 00 is always enabled; is output; 20 | 21 | // input 22 | uint8_t *input_streams, 23 | int input_stream_length, 24 | 25 | // output processing 26 | match_entry *match_array, // fixed size for each thread block, 27 | const unsigned long long int match_array_capacity, 28 | unsigned long long int *match_count, 29 | bool report_on 30 | ) ; 31 | 32 | __global__ void ppopp12_kernel_shrreadchunk( 33 | const __restrict__ int4 *transition_table, 34 | const int transition_table_length, 35 | int *state_start_position_tb, 36 | //int *num_state_tb, 37 | int *state_compatible_group, 38 | int *initial_active_state_array, 39 | int active_state_array_size, // currently it is the same as block size 40 | 41 | // for output / and start always enabled. 
42 | int8_t *states_status, // 00 is always enabled; is output; 43 | 44 | // input 45 | uint8_t *input_streams, 46 | int input_stream_length, 47 | 48 | // output processing 49 | match_entry *match_array, // fixed size for each thread block, 50 | const unsigned long long int match_array_capacity, 51 | unsigned long long int *match_count, 52 | bool report_on 53 | ); 54 | 55 | 56 | 57 | #endif 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /code/src/ppopp12/ppopp12_option.h: -------------------------------------------------------------------------------- 1 | #ifndef PPOPP12_CONFIG 2 | #define PPOPP12_CONFIG 3 | 4 | #include "commons/common_func.h" 5 | 6 | class ppopp12_config : public common_gpunfa_options { 7 | public: 8 | ppopp12_config() : common_gpunfa_options() { 9 | 10 | auto additional_parser = 11 | Opt(validation, 12 | "validation")["--validation"]("validation"); 13 | parser = parser | additional_parser; 14 | } 15 | 16 | bool validation; 17 | 18 | 19 | }; 20 | 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.0.1-devel-ubuntu20.04 2 | ENV DEBIAN_FRONTEND=noninteractive 3 | 4 | RUN apt-get update 5 | RUN apt-get install -y libtbb-dev=2020.1-2 wget git 6 | RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.1/cmake-3.24.1-Linux-x86_64.sh \ 7 | -q -O /tmp/cmake-install.sh \ 8 | && chmod u+x /tmp/cmake-install.sh \ 9 | && mkdir /opt/cmake-3.24.1 \ 10 | && /tmp/cmake-install.sh --skip-license --prefix=/opt/cmake-3.24.1 \ 11 | && rm /tmp/cmake-install.sh \ 12 | && ln -s /opt/cmake-3.24.1/bin/* /usr/local/bin 13 | # hyperscan 14 | RUN apt-get install -y ragel nasm libsqlite3-dev pkg-config 15 | RUN apt-get install -y libboost-all-dev 16 | RUN echo "deb http://dk.archive.ubuntu.com/ubuntu/ xenial main" | tee -a 
/etc/apt/sources.list 17 | RUN echo "deb http://dk.archive.ubuntu.com/ubuntu/ xenial universe" | tee -a /etc/apt/sources.list 18 | RUN apt-get update && apt-get install -y g++-5 gcc-5 19 | RUN rm -rf /var/lib/apt/lists/* 20 | 21 | 22 | # install python environment 23 | ENV PATH="/root/miniconda3/bin:${PATH}" 24 | RUN wget -q \ 25 | https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ 26 | && mkdir /root/.conda \ 27 | && bash Miniconda3-latest-Linux-x86_64.sh -b \ 28 | && rm -f Miniconda3-latest-Linux-x86_64.sh 29 | RUN conda init bash 30 | RUN /bin/bash -c "source /root/.bashrc" 31 | RUN conda install -y numpy scipy pandas seaborn -c conda-forge 32 | RUN pip install https://github.com/getianao/figurePlotter/archive/refs/tags/v0.23.9.14.tar.gz 33 | 34 | ENV NGAP_ROOT="/ngAP" 35 | ENV PATH="/ngAP/code/build/bin:${PATH}" 36 | ENV PATH="/ngAP/hscompile/build:${PATH}" 37 | 38 | WORKDIR /ngAP -------------------------------------------------------------------------------- /env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | fullpath=$(readlink --canonicalize --no-newline $BASH_SOURCE) 4 | cur_dir=$(cd `dirname ${fullpath}`; pwd) 5 | # echo ${cur_dir} 6 | 7 | export NGAP_ROOT=${cur_dir} 8 | 9 | export PATH="${NGAP_ROOT}/code/build/bin:${PATH}" 10 | export PATH="${NGAP_ROOT}/hscompile/build:${PATH}" 11 | 12 | 13 | -------------------------------------------------------------------------------- /ref_results/fig13_throughput.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getianao/ngAP/6fcab891ddcc1dbac79b533469f6ccbf3dd7845a/ref_results/fig13_throughput.pdf -------------------------------------------------------------------------------- /ref_results/fig14_breakdown.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/getianao/ngAP/6fcab891ddcc1dbac79b533469f6ccbf3dd7845a/ref_results/fig14_breakdown.pdf -------------------------------------------------------------------------------- /ref_results/fig20_latency.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getianao/ngAP/6fcab891ddcc1dbac79b533469f6ccbf3dd7845a/ref_results/fig20_latency.pdf -------------------------------------------------------------------------------- /ref_results/raw/throughput_cpu/throughput_cpu_part1.csv: -------------------------------------------------------------------------------- 1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP 2 | before-hyperscan_,1.19,1.63,196.01,214.16,0.31,-2.0,0.4,-1.0,-1.0,-2.0,-1.0,-1.0,-2.0,-1.0,-1.0,1.89,-2.0,2.9,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,148.87 3 | before-vasim_,0.00182745,0.00227549,0.0421592,0.0414737,0.00224541,0.00102094,0.000835522,-1.0,-1.0,0.00158883,-1.0,-1.0,0.00145724,-1.0,-1.0,0.036843,0.00426582,0.0034086,-1.0,-1.0,0.000988023,1.12633,0.710085,0.689543,0.688317,0.27223 4 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_cpu/throughput_cpu_part2.csv: -------------------------------------------------------------------------------- 1 | config,Snort,FileCarving,ClamAV 2 | before-hyperscan_,36.58,-1.0,327.03 3 | before-vasim_,0.0309909,-1.0,0.00131314 4 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_cpu_oneinput/throughput_cpu_part1.csv: 
-------------------------------------------------------------------------------- 1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP 2 | before-hyperscan_,1.19,1.6,71.91,84.03,0.32,0.16,0.4,-1.0,-1.0,0.04,-1.0,-1.0,0.02,-1.0,-1.0,1.85,0.03,2.89,-1.0,-1.0,61.31,285.88,1177.86,195.16,138.93,57.65 3 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_cpu_oneinput/throughput_cpu_part2.csv: -------------------------------------------------------------------------------- 1 | config,Snort,FileCarving,ClamAV 2 | before-hyperscan_,31.54,-1.0,262.67 3 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_nap_best/throughput_gpu_napbest_part1.csv: -------------------------------------------------------------------------------- 1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP 2 | oa-nonblocking-all-best,9.49091,80.8701,1612.11,222.526,37.0175,8.51391,1.82253,-1.0,-1.0,7.88546,-1.0,-1.0,0.890053,-1.0,-1.0,157.991,9.37462,4.62699,-1.0,-1.0,52.752,13452.8,8947.39,11743.4,11461.3,3298.04 3 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_nap_best/throughput_gpu_napbest_part2.csv: 
-------------------------------------------------------------------------------- 1 | config,Snort,FileCarving,ClamAV 2 | oa-nonblocking-all-best,114.872,-1.0,5596.89 3 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_nap_best/throughput_gpu_napbest_part3.csv: -------------------------------------------------------------------------------- 1 | config,smallSnort,smallFileCarving,smallClamAV 2 | oa-nonblocking-all-best,74.16,-1.0,3295.84 3 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_nap_best_oneinput/throughput_gpu_napbest_oneinput_part1.csv: -------------------------------------------------------------------------------- 1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP 2 | oa-nonblocking-all-best,4.85362,36.6275,45.0426,1.15836,22.3436,6.27635,7.57419,-1.0,-1.0,6.45865,-1.0,-1.0,0.716016,-1.0,-1.0,13.6536,0.559004,0.878501,-1.0,-1.0,3.34958,111.657,133.283,117.471,126.416,51.953 3 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_nap_best_oneinput/throughput_gpu_napbest_oneinput_part2.csv: -------------------------------------------------------------------------------- 1 | config,Snort,FileCarving,ClamAV 2 | oa-nonblocking-all-best,1.83413,-1.0,38.9162 3 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_nap_best_oneinput/throughput_gpu_napbest_oneinput_part3.csv: -------------------------------------------------------------------------------- 1 | 
config,smallSnort,smallFileCarving,smallClamAV 2 | oa-nonblocking-all-best,0.190169,-1.0,57.1354 3 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_nap_breakdown/throughput_nap_breakdown_part1.csv: -------------------------------------------------------------------------------- 1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP 2 | o0-blocking_,2.36783,3.21439,31.3726,30.8519,2.69913,1.55395,1.35045,-1.0,-1.0,1.95991,-1.0,-1.0,0.39358,-1.0,-1.0,20.4302,7.87992,4.5829,-1.0,-1.0,1.67768,333.874,226.043,224.893,226.549,121.238 3 | o0-nonblocking-NAP_,2.32725,3.55705,30.2642,26.6043,3.91616,1.54513,1.32497,-1.0,-1.0,2.10141,-1.0,-1.0,0.408884,-1.0,-1.0,18.6078,9.30723,4.1884,-1.0,-1.0,1.99783,150.837,125.299,125.405,124.745,81.1907 4 | o1-nonblocking_,5.48731,5.90764,66.7353,57.1711,6.93463,3.91444,1.60105,-1.0,-1.0,5.6975,-1.0,-1.0,0.17646,-1.0,-1.0,51.2707,7.19683,4.32801,-1.0,-1.0,4.48412,887.88,636.79,641.963,605.124,222.087 5 | o3-nonblocking-p3_,7.34958,54.2517,1125.96,158.133,36.4648,9.68088,1.62427,-1.0,-1.0,8.05241,-1.0,-1.0,0.971958,-1.0,-1.0,137.836,9.2406,4.28686,-1.0,-1.0,34.3151,2618.67,2202.54,2089.1,2022.6,787.731 6 | o4-nonblocking-r1f_,5.78694,5.51977,59.9846,57.1722,6.05899,4.16819,3.13265,-1.0,-1.0,5.30185,-1.0,-1.0,0.319643,-1.0,-1.0,59.8419,8.25265,4.66233,-1.0,-1.0,4.73265,876.315,568.532,564.681,549.169,206.94 7 | oa-nonblocking-all-p3r1_,7.97462,50.6649,1157.46,222.216,33.3177,8.36063,1.80042,-1.0,-1.0,8.03973,-1.0,-1.0,0.865747,-1.0,-1.0,152.499,8.74257,4.28031,-1.0,-1.0,50.842,2685.48,2218.65,1985.01,1979.47,733.358 8 | 
oa-nonblocking-all-p3r1f_,8.48512,47.8711,945.408,139.876,36.5803,7.87313,1.82704,-1.0,-1.0,7.93877,-1.0,-1.0,0.542088,-1.0,-1.0,152.262,8.23096,4.66774,-1.0,-1.0,52.752,2980.67,2369.49,2092.31,2082.94,782.884 9 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_nap_breakdown/throughput_nap_breakdown_part2.csv: -------------------------------------------------------------------------------- 1 | config,Snort,FileCarving,ClamAV 2 | o0-blocking_,21.1791,-1.0,3.40173 3 | o0-nonblocking-NAP_,20.5721,-1.0,3.04677 4 | o1-nonblocking_,39.1882,-1.0,6.36849 5 | o3-nonblocking-p3_,52.1036,-1.0,1813.44 6 | o4-nonblocking-r1f_,51.2799,-1.0,3.68003 7 | oa-nonblocking-all-p3r1_,-1.0,-1.0,1811.11 8 | oa-nonblocking-all-p3r1f_,109.975,-1.0,2106.51 9 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_nap_breakdown/throughput_nap_breakdown_part3.csv: -------------------------------------------------------------------------------- 1 | config,smallSnort,smallFileCarving,smallClamAV 2 | o0-blocking_,20.6698,-1.0,3.44144 3 | o0-nonblocking-NAP_,20.759,-1.0,3.09138 4 | o1-nonblocking_,39.1118,-1.0,6.37317 5 | o3-nonblocking-p3_,51.7606,-1.0,1804.14 6 | o4-nonblocking-r1f_,51.4143,-1.0,3.68474 7 | oa-nonblocking-all-p3r1f_,108.504,-1.0,2083.47 8 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_nap_default_adp/throughput_gpu_nap_default_part1.csv: -------------------------------------------------------------------------------- 1 | 
config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP 2 | oa-nonblocking-default-best,8.60939,64.2597,1452.56,208.927,33.1166,7.26808,1.84866,-1.0,-1.0,7.48813,-1.0,-1.0,0.52098,-1.0,-1.0,141.725,8.3828,4.49539,-1.0,-1.0,50.409,3897.31,3690.17,3405.51,3598.85,2037.26 3 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_nap_default_adp/throughput_gpu_nap_default_part2.csv: -------------------------------------------------------------------------------- 1 | config,Snort,FileCarving,ClamAV 2 | oa-nonblocking-default-best,63.7755,-1.0,2334.82 3 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_nap_default_adp/throughput_gpu_nap_default_part3.csv: -------------------------------------------------------------------------------- 1 | config,smallSnort,smallFileCarving,smallClamAV 2 | oa-nonblocking-default-best,74.39,-1.0,1779.0 3 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_nap_default_adp_oneinput/throughput_gpu_nap_default_oneinput_part1.csv: -------------------------------------------------------------------------------- 1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP 2 | 
oa-nonblocking-default-best,2.43953,13.1657,15.0839,1.11249,7.84374,4.21962,5.71738,-1.0,-1.0,4.26646,-1.0,-1.0,0.623945,-1.0,-1.0,8.81588,0.559004,0.883431,-1.0,-1.0,3.10914,18.78,18.8387,18.8409,18.8719,15.645 3 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_nap_default_adp_oneinput/throughput_gpu_nap_default_oneinput_part2.csv: -------------------------------------------------------------------------------- 1 | config,Snort,FileCarving,ClamAV 2 | oa-nonblocking-default-best,1.35838,-1.0,13.7059 3 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_nap_default_adp_oneinput/throughput_gpu_nap_default_oneinput_part3.csv: -------------------------------------------------------------------------------- 1 | config,smallSnort,smallFileCarving,smallClamAV 2 | oa-nonblocking-default-best,0.0790002,-1.0,10.9458 3 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_runahead/throughput_gpu_runahead_part1.csv: -------------------------------------------------------------------------------- 1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP 2 | before-runahead-cc4_,4.675517,6.764452,57.299115,-2.0,22.255679,14.505548,1.137026,-1.0,-1.0,25.05346,-1.0,-1.0,0.700821,-1.0,-1.0,1.309869,-2.0,-2.0,-1.0,-1.0,6.853464,732.344044,440.55617,447.934965,448.066877,198.531099 3 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_runahead/throughput_gpu_runahead_part2.csv: 
-------------------------------------------------------------------------------- 1 | config,Snort,FileCarving,ClamAV 2 | before-runahead-cc4_,20.219294,-1.0,6.599365 3 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_runahead/throughput_gpu_runahead_part3.csv: -------------------------------------------------------------------------------- 1 | config,smallSnort,smallFileCarving,smallClamAV 2 | before-runahead-cc4_,20.244666,-1.0,6.586612 3 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_runahead_oneinput/throughput_gpu_runahead_oneinput_part1.csv: -------------------------------------------------------------------------------- 1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP 2 | before-runahead-cc4_,2.777037,2.580277,10.455674,0.076473,10.557436,8.137953,0.667989,-1.0,-1.0,13.212716,-1.0,-1.0,0.685931,-1.0,-1.0,1.286162,-2.0,-2.0,-1.0,-1.0,1.518275,39.057815,35.711346,36.786173,35.720696,11.857418 3 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_runahead_oneinput/throughput_gpu_runahead_oneinput_part2.csv: -------------------------------------------------------------------------------- 1 | config,Snort,FileCarving,ClamAV 2 | before-runahead-cc4_,4.291639,-1.0,1.046811 3 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_runahead_oneinput/throughput_gpu_runahead_oneinput_part3.csv: -------------------------------------------------------------------------------- 1 | 
config,smallSnort,smallFileCarving,smallClamAV 2 | before-runahead-cc4_,4.152439,-1.0,1.091001 3 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_sota_best/throughput_gpu_sota_part1.csv: -------------------------------------------------------------------------------- 1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP 2 | before-hotstart-nt_,6.79248,9.108471,41.064581,32.970172,4.109988,1.815733,4.773921,-1.0,-1.0,5.642327,-1.0,-1.0,1.23106,-1.0,-1.0,30.495631,3.566442,7.952852,-1.0,-1.0,3.534417,347.73229,283.197842,278.927813,276.3551,179.770678 3 | before-nfacg_,1.060613,0.740955,7.346126,9.281832,5.065893,2.545247,0.325797,-1.0,-1.0,4.91524,-1.0,-1.0,1.091527,-1.0,-1.0,4.751153,8.032496,3.114229,-1.0,-1.0,0.258865,143.187204,78.476178,77.617408,79.587591,31.312482 4 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_sota_best/throughput_gpu_sota_part3.csv: -------------------------------------------------------------------------------- 1 | config,smallSnort,smallFileCarving,smallClamAV 2 | before-hotstart-nt_,17.583161,-1.0,3.654265 3 | before-nfacg_,-1.0,-1.0,-1.0 4 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_sota_best_oneinput/throughput_gpu_sota_oneinput_part1.csv: -------------------------------------------------------------------------------- 1 | 
config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP 2 | before-hotstart-nt_,0.494784,1.625315,1.105979,1.081417,0.43556,0.229399,1.061616,-1.0,-1.0,0.270078,-1.0,-1.0,0.207295,-1.0,-1.0,0.525129,2.025399,1.10874,-1.0,-1.0,0.786183,1.48677,1.741122,1.659348,1.656274,1.393163 3 | before-nfacg_,0.981108,0.776644,1.374135,1.576036,1.508545,1.164417,0.393835,-1.0,-1.0,1.373664,-1.0,-1.0,0.963854,-1.0,-1.0,1.776493,1.281999,1.745596,-1.0,-1.0,0.25054,1.68832,1.65701,1.600435,1.639692,1.656128 4 | -------------------------------------------------------------------------------- /ref_results/raw/throughput_gpu_sota_best_oneinput/throughput_gpu_sota_oneinput_part3.csv: -------------------------------------------------------------------------------- 1 | config,smallSnort,smallFileCarving,smallClamAV 2 | before-hotstart-nt_,0.533841,-1.0,1.118965 3 | before-nfacg_,-1.0,-1.0,-1.0 4 | -------------------------------------------------------------------------------- /ref_results/raw/v100/throughput_gpu_nap_best/throughput_gpu_napbest_part1.csv: -------------------------------------------------------------------------------- 1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP 2 | 
oa-nonblocking-all-best,6.09514,41.7723,1342.99,184.988,22.2398,4.41955,1.75018,-1.0,-1.0,4.18848,-1.0,-1.0,0.535905,-1.0,-1.0,94.5517,8.92899,4.2487,-1.0,-1.0,21.4083,8880.67,7569.57,6454.69,7033.73,2520.18 3 | -------------------------------------------------------------------------------- /ref_results/raw/v100/throughput_gpu_nap_best/throughput_gpu_napbest_part2.csv: -------------------------------------------------------------------------------- 1 | config,Snort,FileCarving,ClamAV 2 | oa-nonblocking-all-best,49.6262,-1.0,3861.56 3 | -------------------------------------------------------------------------------- /ref_results/raw/v100/throughput_gpu_nap_best/throughput_gpu_napbest_part3.csv: -------------------------------------------------------------------------------- 1 | config,smallSnort,smallFileCarving,smallClamAV 2 | oa-nonblocking-all-best,49.6452,-1.0,3971.3 3 | -------------------------------------------------------------------------------- /ref_results/raw/v100/throughput_gpu_nap_default_adp/throughput_gpu_nap_default_part1.csv: -------------------------------------------------------------------------------- 1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP 2 | oa-nonblocking-default-best,4.01389,27.0318,804.678,124.093,15.3026,2.97133,1.7636,-1.0,-1.0,3.29583,-1.0,-1.0,0.215236,-1.0,-1.0,65.0104,8.53325,3.85484,-1.0,-1.0,17.0549,2249.96,2267.74,2144.67,1963.35,1245.33 3 | -------------------------------------------------------------------------------- /ref_results/raw/v100/throughput_gpu_nap_default_adp/throughput_gpu_nap_default_part2.csv: -------------------------------------------------------------------------------- 1 
| config,Snort,FileCarving,ClamAV 2 | oa-nonblocking-default-best,37.4139,-1.0,1307.5 3 | -------------------------------------------------------------------------------- /ref_results/raw/v100/throughput_gpu_nap_default_adp/throughput_gpu_nap_default_part3.csv: -------------------------------------------------------------------------------- 1 | config,smallSnort,smallFileCarving,smallClamAV 2 | oa-nonblocking-default-best,43.9231,-1.0,1317.7 3 | -------------------------------------------------------------------------------- /ref_results/raw/v100/throughput_gpu_runahead/throughput_gpu_runahead_part1.csv: -------------------------------------------------------------------------------- 1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP 2 | before-runahead-cc4_,3.512485,5.135253,51.660751,-2.0,17.724456,11.598823,1.073728,-1.0,-1.0,20.17649,-1.0,-1.0,0.344323,-1.0,-1.0,0.750779,-2.0,-2.0,-1.0,-1.0,6.102957,618.361264,368.831971,380.604693,380.06254,170.980646 3 | -------------------------------------------------------------------------------- /ref_results/raw/v100/throughput_gpu_runahead/throughput_gpu_runahead_part2.csv: -------------------------------------------------------------------------------- 1 | config,Snort,FileCarving,ClamAV 2 | before-runahead-cc4_,16.878107,-1.0,5.934625 3 | -------------------------------------------------------------------------------- /ref_results/raw/v100/throughput_gpu_runahead/throughput_gpu_runahead_part3.csv: -------------------------------------------------------------------------------- 1 | config,smallSnort,smallFileCarving,smallClamAV 2 | before-runahead-cc4_,16.852375,-1.0,5.933999 3 | 
-------------------------------------------------------------------------------- /ref_results/raw/v100/throughput_gpu_sota_best/throughput_gpu_sota_part1.csv: -------------------------------------------------------------------------------- 1 | config,Brill,EntityResolution,Dotstar,PowerEN,CRISPR_CasOFFinder,CRISPR_CasOT,RandomForest_20_400_200,RandomForest_20_400_270,RandomForest_20_800_200,Hamming_N1000_l18_d3,Hamming_N1000_l22_d5,Hamming_N1000_l31_d10,Levenshtein_l19d3,Levenshtein_l24d5,Levenshtein_l37d10,Protomata,APPRNG4,SeqMatch_BIBLE_w6_p6,SeqMatch_BIBLE_w6_p10,Fermi,YARA,Bro217,ExactMath,Ranges05,Ranges1,TCP 2 | before-hotstart-nt_,6.320081,7.294038,35.837599,27.213543,3.104396,1.333548,3.569331,-1.0,-1.0,6.09973,-1.0,-1.0,1.573924,-1.0,-1.0,24.977908,4.947722,7.165793,-1.0,-1.0,3.482569,328.55615,260.383913,244.330903,246.823898,160.138904 3 | before-nfacg_,0.951604,0.628419,6.646628,9.100413,4.689425,2.041655,0.270739,-1.0,-1.0,4.215701,-1.0,-1.0,0.908289,-1.0,-1.0,4.290967,5.552233,2.898934,-1.0,-1.0,0.188535,138.858827,77.602717,75.70653,78.003779,29.367805 4 | -------------------------------------------------------------------------------- /ref_results/raw/v100/throughput_gpu_sota_best/throughput_gpu_sota_part3.csv: -------------------------------------------------------------------------------- 1 | config,smallSnort,smallFileCarving,smallClamAV 2 | before-hotstart-nt_,15.630824,-1.0,3.525744 3 | before-nfacg_,-1.0,-1.0,-1.0 4 | -------------------------------------------------------------------------------- /ref_results/tab4_throughput.csv: -------------------------------------------------------------------------------- 1 | App,HyperScan,NFA-CG,AsyncAP,GPU-NFA,NAP,NAP-Best 2 | APR,T,8.0,T,3.6,8.4,9.4 3 | Brill,1.2,1.1,4.7,6.8,8.6,9.5 4 | CRP1,0.3,5.1,22.3,4.1,33.1,37.0 5 | CRP2,T,2.5,14.5,1.8,7.3,8.5 6 | CAV,327.0,U,6.6,U,2334.8,5596.9 7 | ER,1.6,0.7,6.8,9.1,64.3,80.9 8 | HM,T,4.9,25.1,5.6,7.5,7.9 9 | LV,T,1.1,0.7,1.2,0.5,0.9 10 | 
Pro,1.9,4.8,1.3,30.5,141.7,158.0 11 | RF,0.4,0.3,1.1,4.8,1.8,1.8 12 | SM,2.9,3.1,T,8.0,4.5,4.6 13 | Snort,36.6,U,20.2,U,63.8,114.9 14 | YARA,W,0.3,6.9,3.5,50.4,52.8 15 | DS,196.0,7.3,57.3,41.1,1452.6,1612.1 16 | PEN,214.2,9.3,T,33.0,208.9,222.5 17 | Bro,W,143.2,732.3,347.7,3897.3,13452.8 18 | EM,W,78.5,440.6,283.2,3690.2,8947.4 19 | Ran1,W,79.6,448.1,276.4,3598.9,11461.3 20 | Ran5,W,77.6,447.9,278.9,3405.5,11743.4 21 | TCP,148.9,31.3,198.5,179.8,2037.3,3298.0 22 | -------------------------------------------------------------------------------- /ref_results/tab6_latency.csv: -------------------------------------------------------------------------------- 1 | App,HyperScan,NFA-CG,AsyncAP,GPU-NFA,NAP,NAP-Best 2 | APR,0.03,1.28,T,2.03,0.56,0.56 3 | Brill,1.19,0.98,2.78,0.49,2.44,4.85 4 | CRP1,0.32,1.51,10.56,0.44,7.84,22.34 5 | CRP2,0.16,1.16,8.14,0.23,4.22,6.28 6 | CAV,262.67,U,1.05,U,13.71,38.92 7 | ER,1.6,0.78,2.58,1.63,13.17,36.63 8 | HM,0.04,1.37,13.21,0.27,4.27,6.46 9 | LV,0.02,0.96,0.69,0.21,0.62,0.72 10 | Pro,1.85,1.78,1.29,0.53,8.82,13.65 11 | RF,0.4,0.39,0.67,1.06,5.72,7.57 12 | SM,2.89,1.75,T,1.11,0.88,0.88 13 | Snort,31.54,U,4.29,U,1.36,1.83 14 | YARA,61.31,0.25,1.52,0.79,3.11,3.35 15 | DS,71.91,1.37,10.46,1.11,15.08,45.04 16 | PEN,84.03,1.58,0.08,1.08,1.11,1.16 17 | Bro,285.88,1.69,39.06,1.49,18.78,111.66 18 | EM,1177.86,1.66,35.71,1.74,18.84,133.28 19 | Ran1,138.93,1.64,35.72,1.66,18.87,126.42 20 | Ran5,195.16,1.6,36.79,1.66,18.84,117.47 21 | TCP,57.65,1.66,11.86,1.39,15.65,51.95 22 | -------------------------------------------------------------------------------- /scripts/gen-breakdown-fig14.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python ${NGAP_ROOT}/scripts/plot_throughput_gpu_nap_breakdown.py 3 | 4 | -------------------------------------------------------------------------------- /scripts/gen-latency-fig20tab6.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Runs the one-input plot script and then the one-input table script; NGAP_ROOT must be set. 3 | python ${NGAP_ROOT}/scripts/plot_throughput_gpu_sota_oneinput.py 4 | python ${NGAP_ROOT}/scripts/table_throughput_oneinput.py 5 | 6 | -------------------------------------------------------------------------------- /scripts/gen-throughput-fig13tab4.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Runs the throughput plot script and then the throughput table script; NGAP_ROOT must be set. 3 | python ${NGAP_ROOT}/scripts/plot_throughput_gpu_sota.py 4 | 5 | python ${NGAP_ROOT}/scripts/table_throughput.py 6 | 7 | -------------------------------------------------------------------------------- /scripts/run-breakdown.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Times a single run of run_throughput_NAP_breakdown.sh; NGAP_ROOT must be set. 3 | time ${NGAP_ROOT}/scripts/run_throughput_NAP_breakdown.sh 4 | -------------------------------------------------------------------------------- /scripts/run-latency.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Times each *_oneinput experiment script in turn: the four GPU scripts first, then the CPU one. 3 | # The scripts run sequentially; a failure in one does not stop the following ones. 4 | time ${NGAP_ROOT}/scripts/run_throughput_gpu_sota_best_oneinput.sh 5 | time ${NGAP_ROOT}/scripts/run_throughput_runahead_oneinput.sh 6 | time ${NGAP_ROOT}/scripts/run_throughput_gpu_nap_defalut_oneinput.sh 7 | time ${NGAP_ROOT}/scripts/run_throughput_gpu_nap_best_oneinput.sh 8 | 9 | time ${NGAP_ROOT}/scripts/run_throughput_cpu_oneinput.sh 10 | 11 | -------------------------------------------------------------------------------- /scripts/run-throughput.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Times each full throughput experiment script in turn; the trailing comments appear to be the authors' approximate run times. 3 | time ${NGAP_ROOT}/scripts/run_throughput_gpu_sota_best.sh # 3.5 hrs 4 | time ${NGAP_ROOT}/scripts/run_throughput_runahead.sh # 3.5 hrs 5 | time ${NGAP_ROOT}/scripts/run_throughput_gpu_nap_defalut.sh # 1 hr 6 | time ${NGAP_ROOT}/scripts/run_throughput_gpu_nap_best.sh # 1 hr 7 | 8 | time ${NGAP_ROOT}/scripts/run_throughput_cpu.sh # 5.5 hrs 9 | 10 | 
-------------------------------------------------------------------------------- /scripts/run_experiments.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Usage: run_experiments.sh APP_SPEC EXEC_CONFIG [extra args...]; extra args are forwarded to launch_exps.py and collect_results.py. 3 | DIR=$(cd `dirname $0`; pwd) 4 | cd ${DIR} 5 | 6 | # Work in a fresh timestamped folder under ../raw_results (this script does not create ../raw_results itself). 7 | 8 | FOLDER="exp-`date "+%Y%m%d-%H%M%S"`" 9 | 10 | cd ../raw_results 11 | if [ ! -d ${FOLDER} ]; then 12 | mkdir ${FOLDER} && cd ${FOLDER} 13 | else 14 | cd ${FOLDER} 15 | fi 16 | 17 | cp ../../code/scripts/configs/* . 18 | 19 | echo "Running Experiments... This will take several hours. " 20 | 21 | # $1 is passed to both Python scripts as -b and $2 as -f; arguments from $3 on are forwarded to both. 22 | 23 | APP_SPEC=$1 24 | EXEC_CONFIG=$2 25 | 26 | # Save the launcher's exit status right away: the echo that follows would otherwise reset $? to 0 and the failure branch could never run. 27 | python ../../code/scripts/launch_exps.py -b ${APP_SPEC} -f ${EXEC_CONFIG} -e --clean ${@:3} 28 | LAUNCH_STATUS=$? 29 | echo "Experiments finished. " 30 | 31 | 32 | if [ ${LAUNCH_STATUS} -eq 0 ]; then 33 | echo "Collecting experiment raw data." 34 | python ../../code/scripts/collect_results.py -b ${APP_SPEC} -f ${EXEC_CONFIG} ${@:3} 35 | else 36 | echo "Experiments terminate abnormally. " 37 | exit 1 38 | fi 39 | 40 | cd ${NGAP_ROOT} 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /scripts/run_throughput.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Runs run_experiments.sh for every (config, app) pair taken from the comma-separated CONFIGS and APPS environment variables; all arguments are forwarded. 3 | DIR=$(cd `dirname $0`; pwd) 4 | cd ${DIR} 5 | 6 | if [ -z "${CONFIGS}" ] || [ -z "${APPS}" ]; then 7 | echo "Either CONFIGS or APPS is empty" 8 | exit 1 9 | fi 10 | 11 | mkdir -p ../raw_results/log/ 12 | # Each run's stdout and stderr are shown and also saved to a timestamped file in ../raw_results/log/. 13 | 14 | IFS=',' # Use the IFS (Internal Field Separator) variable to set the delimiter 15 | configs_arr=(${CONFIGS}) 16 | apps_arr=(${APPS}) 17 | 18 | # "$@" is quoted so that IFS=',' above cannot split forwarded arguments that contain commas. 19 | for config in ${configs_arr[@]}; do 20 | for app in ${apps_arr[@]}; do 21 | LOG=../raw_results/log/"exp-`date "+%Y%m%d-%H%M%S"`.log" 22 | ./run_experiments.sh ${app} ${config} "$@" 2>&1 | tee ${LOG} 23 | done 24 | done 25 | -------------------------------------------------------------------------------- /scripts/run_throughput_NAP_breakdown.sh:
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Runs run_throughput.sh three times (app parts 1-3) with the design_NAP exec configs, one --csvdest CSV per part. 3 | DIR=$(cd `dirname $0`; pwd) 4 | cd ${DIR} 5 | 6 | VALI=--validation 7 | 8 | mkdir -p ./results/raw/throughput_gpu_nap_breakdown 9 | 10 | # config: [NAP-Breakdown], apps: [part 1, part 2] 11 | APPS="app_spec_ngap_new_quickvalidation_part1" \ 12 | CONFIGS="exec_config_ngap_groups_design_NAP" \ 13 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 14 | ${VALI} --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_breakdown/throughput_nap_breakdown_part1.csv 15 | 16 | APPS="app_spec_ngap_new_quickvalidation_part2" \ 17 | CONFIGS="exec_config_ngap_groups_design_NAP" \ 18 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 19 | ${VALI} --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_breakdown/throughput_nap_breakdown_part2.csv 20 | 21 | # config: [NAP-Breakdown'], apps: [part 3] 22 | APPS="app_spec_ngap_new_quickvalidation_part3" \ 23 | CONFIGS="exec_config_ngap_groups_design_NAP_4degree" \ 24 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 25 | ${VALI} --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_breakdown/throughput_nap_breakdown_part3.csv 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /scripts/run_throughput_cpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Runs run_throughput.sh for app parts 1 and 2 with the design_cpu exec config, one --csvdest CSV per part. 3 | DIR=$(cd `dirname $0`; pwd) 4 | cd ${DIR} 5 | 6 | # VALI=--validation 7 | # VALI is not assigned here, so ${VALI} below expands to nothing unless it is inherited from the environment. 8 | mkdir -p ./results/raw/throughput_cpu 9 | 10 | # config: [cpu], apps: [part 1] 11 | APPS="app_spec_ngap_new_quickvalidation_part1" \ 12 | CONFIGS="exec_config_ngap_groups_design_cpu" \ 13 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 14 | ${VALI} --timeout-mins=60 \ 15 | 
--csvdest=./results/raw/throughput_cpu/throughput_cpu_part1.csv 16 | 17 | # config: [cpu], apps: [part 2] 18 | APPS="app_spec_ngap_new_quickvalidation_part2" \ 19 | CONFIGS="exec_config_ngap_groups_design_cpu" \ 20 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 21 | ${VALI} --timeout-mins=60 \ 22 | --csvdest=./results/raw/throughput_cpu/throughput_cpu_part2.csv -------------------------------------------------------------------------------- /scripts/run_throughput_cpu_oneinput.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # One-input variant of the CPU run: app parts 1 and 2 with the design_cpu_oneinput exec config. 3 | DIR=$(cd `dirname $0`; pwd) 4 | cd ${DIR} 5 | 6 | # VALI=--validation 7 | 8 | mkdir -p ../results/raw/throughput_cpu_oneinput 9 | # NOTE(review): this mkdir uses ../results while --csvdest below and the sibling scripts use ./results; confirm which is intended. 10 | # config: [cpu], apps: [part 1] 11 | APPS="app_spec_ngap_new_quickvalidation_part1" \ 12 | CONFIGS="exec_config_ngap_groups_design_cpu_oneinput" \ 13 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 14 | ${VALI} --timeout-mins=60 \ 15 | --csvdest=./results/raw/throughput_cpu_oneinput/throughput_cpu_part1.csv 16 | 17 | # config: [cpu], apps: [part 2] 18 | APPS="app_spec_ngap_new_quickvalidation_part2" \ 19 | CONFIGS="exec_config_ngap_groups_design_cpu_oneinput" \ 20 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 21 | ${VALI} --timeout-mins=60 \ 22 | --csvdest=./results/raw/throughput_cpu_oneinput/throughput_cpu_part2.csv -------------------------------------------------------------------------------- /scripts/run_throughput_gpu_nap_best.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Runs run_throughput.sh for app parts 1-3 with the groups_best exec configs (part 3 uses the 4degree config). 3 | DIR=$(cd `dirname $0`; pwd) 4 | cd ${DIR} 5 | 6 | VALI=--validation 7 | 8 | mkdir -p ./results/raw/throughput_gpu_nap_best 9 | 10 | # config: [NAP-Best], apps: [part 1, part 2] 11 | APPS="app_spec_ngap_new_quickvalidation_part1" \ 12 | CONFIGS="exec_config_ngap_groups_best" \ 13 | 
./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 14 | ${VALI} --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_best/throughput_gpu_napbest_part1.csv 15 | 16 | APPS="app_spec_ngap_new_quickvalidation_part2" \ 17 | CONFIGS="exec_config_ngap_groups_best" \ 18 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 19 | ${VALI} --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_best/throughput_gpu_napbest_part2.csv 20 | 21 | # config: [NAP-Best'], apps: [part 3] 22 | APPS="app_spec_ngap_new_quickvalidation_part3" \ 23 | CONFIGS="exec_config_ngap_groups_best_4degree" \ 24 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 25 | ${VALI} --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_best/throughput_gpu_napbest_part3.csv -------------------------------------------------------------------------------- /scripts/run_throughput_gpu_nap_best_oneinput.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # One-input variant: app parts 1-3 with the groups_best *_oneinput exec configs (part 3 uses the 4degree config). 3 | DIR=$(cd `dirname $0`; pwd) 4 | cd ${DIR} 5 | 6 | VALI=--validation 7 | 8 | mkdir -p ./results/raw/throughput_gpu_nap_best_oneinput 9 | 10 | # config: [NAP-Best], apps: [part 1, part 2] 11 | APPS="app_spec_ngap_new_quickvalidation_part1" \ 12 | CONFIGS="exec_config_ngap_groups_best_oneinput" \ 13 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 14 | ${VALI} --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_best_oneinput/throughput_gpu_napbest_oneinput_part1.csv 15 | 16 | APPS="app_spec_ngap_new_quickvalidation_part2" \ 17 | CONFIGS="exec_config_ngap_groups_best_oneinput" \ 18 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 19 | ${VALI} --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_best_oneinput/throughput_gpu_napbest_oneinput_part2.csv 20 | 21 | # config: [NAP-Best'], 
apps: [part 3] 22 | APPS="app_spec_ngap_new_quickvalidation_part3" \ 23 | CONFIGS="exec_config_ngap_groups_best_4degree_oneinput" \ 24 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 25 | ${VALI} --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_best_oneinput/throughput_gpu_napbest_oneinput_part3.csv -------------------------------------------------------------------------------- /scripts/run_throughput_gpu_nap_defalut.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Runs run_throughput.sh for app parts 1-3 with the nap_default exec configs (part 3 uses the 4degree config). 3 | DIR=$(cd `dirname $0`; pwd) 4 | cd ${DIR} 5 | 6 | VALI=--validation 7 | 8 | mkdir -p ./results/raw/throughput_gpu_nap_default_adp 9 | 10 | # config: [NAP-default], apps: [part 1, part 2] 11 | APPS="app_spec_ngap_new_quickvalidation_part1" \ 12 | CONFIGS="exec_config_ngap_groups_nap_default" \ 13 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 14 | ${VALI} --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_default_adp/throughput_gpu_nap_default_part1.csv 15 | 16 | APPS="app_spec_ngap_new_quickvalidation_part2" \ 17 | CONFIGS="exec_config_ngap_groups_nap_default" \ 18 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 19 | ${VALI} --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_default_adp/throughput_gpu_nap_default_part2.csv 20 | 21 | # config: [NAP-default'], apps: [part 3] 22 | APPS="app_spec_ngap_new_quickvalidation_part3" \ 23 | CONFIGS="exec_config_ngap_groups_nap_default_4degree" \ 24 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 25 | ${VALI} --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_default_adp/throughput_gpu_nap_default_part3.csv -------------------------------------------------------------------------------- /scripts/run_throughput_gpu_nap_defalut_oneinput.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # One-input variant: app parts 1-3 with the nap_default *_oneinput exec configs (part 3 uses the 4degree config). 3 | DIR=$(cd `dirname $0`; pwd) 4 | cd ${DIR} 5 | 6 | VALI=--validation 7 | 8 | mkdir -p ./results/raw/throughput_gpu_nap_default_adp_oneinput 9 | 10 | # config: [NAP-default], apps: [part 1, part 2] 11 | APPS="app_spec_ngap_new_quickvalidation_part1" \ 12 | CONFIGS="exec_config_ngap_groups_nap_default_oneinput" \ 13 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 14 | ${VALI} --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_default_adp_oneinput/throughput_gpu_nap_default_oneinput_part1.csv 15 | 16 | APPS="app_spec_ngap_new_quickvalidation_part2" \ 17 | CONFIGS="exec_config_ngap_groups_nap_default_oneinput" \ 18 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 19 | ${VALI} --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_default_adp_oneinput/throughput_gpu_nap_default_oneinput_part2.csv 20 | 21 | # config: [NAP-default'], apps: [part 3] 22 | APPS="app_spec_ngap_new_quickvalidation_part3" \ 23 | CONFIGS="exec_config_ngap_groups_nap_default_4degree_oneinput" \ 24 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 25 | ${VALI} --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_nap_default_adp_oneinput/throughput_gpu_nap_default_oneinput_part3.csv -------------------------------------------------------------------------------- /scripts/run_throughput_gpu_sota_best.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Runs run_throughput.sh for app parts 1 and 3 with the design_sota exec configs; this script has no part 2 invocation. 3 | DIR=$(cd `dirname $0`; pwd) 4 | cd ${DIR} 5 | 6 | VALI=--validation 7 | 8 | mkdir -p ./results/raw/throughput_gpu_sota_best 9 | # config: [sota], apps: [part 1] 10 | APPS="app_spec_ngap_new_quickvalidation_part1" \ 11 | CONFIGS="exec_config_ngap_groups_design_sota" \ 12 | ./run_throughput.sh 
--keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 13 | ${VALI} --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_sota_best/throughput_gpu_sota_part1.csv 14 | 15 | # config: [sota], apps: [part 3] 16 | APPS="app_spec_ngap_new_quickvalidation_part3" \ 17 | CONFIGS="exec_config_ngap_groups_design_sota_4degree" \ 18 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 19 | ${VALI} --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_sota_best/throughput_gpu_sota_part3.csv -------------------------------------------------------------------------------- /scripts/run_throughput_gpu_sota_best_oneinput.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # One-input variant: app parts 1 and 3 with the design_sota *_oneinput exec configs; this script has no part 2 invocation. 3 | DIR=$(cd `dirname $0`; pwd) 4 | cd ${DIR} 5 | 6 | VALI=--validation 7 | 8 | mkdir -p ./results/raw/throughput_gpu_sota_best_oneinput 9 | # config: [sota], apps: [part 1] 10 | APPS="app_spec_ngap_new_quickvalidation_part1" \ 11 | CONFIGS="exec_config_ngap_groups_design_sota_oneinput" \ 12 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 13 | ${VALI} --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_sota_best_oneinput/throughput_gpu_sota_oneinput_part1.csv 14 | 15 | # config: [sota], apps: [part 3] 16 | APPS="app_spec_ngap_new_quickvalidation_part3" \ 17 | CONFIGS="exec_config_ngap_groups_design_sota_4degree_oneinput" \ 18 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 19 | ${VALI} --timeout-mins=60 --csvdest=./results/raw/throughput_gpu_sota_best_oneinput/throughput_gpu_sota_oneinput_part3.csv -------------------------------------------------------------------------------- /scripts/run_throughput_runahead.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Runs run_throughput.sh for app parts 1-3 with the design_sota_runahead exec configs (part 3 uses the 4degree config). 3 | DIR=$(cd `dirname $0`; pwd) 4 | cd ${DIR} 5 | 6 | # VALI=--validation 7 | 8 | mkdir -p 
./results/raw/throughput_gpu_runahead 9 | 10 | # config: [runahead], apps: [part 1] 11 | APPS="app_spec_ngap_new_quickvalidation_part1" \ 12 | CONFIGS="exec_config_ngap_groups_design_sota_runahead" \ 13 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 14 | ${VALI} --timeout-mins=60 \ 15 | --csvdest=./results/raw/throughput_gpu_runahead/throughput_gpu_runahead_part1.csv 16 | 17 | # config: [runahead], apps: [part 2] 18 | APPS="app_spec_ngap_new_quickvalidation_part2" \ 19 | CONFIGS="exec_config_ngap_groups_design_sota_runahead" \ 20 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 21 | ${VALI} --timeout-mins=60 \ 22 | --csvdest=./results/raw/throughput_gpu_runahead/throughput_gpu_runahead_part2.csv 23 | 24 | 25 | # config: [runahead'], apps: [part 3] 26 | APPS="app_spec_ngap_new_quickvalidation_part3" \ 27 | CONFIGS="exec_config_ngap_groups_design_sota_runahead_4degree" \ 28 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 29 | ${VALI} --timeout-mins=60 \ 30 | --csvdest=./results/raw/throughput_gpu_runahead/throughput_gpu_runahead_part3.csv 31 | -------------------------------------------------------------------------------- /scripts/run_throughput_runahead_oneinput.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # One-input variant: app parts 1-3 with the design_sota_runahead *_oneinput exec configs (part 3 uses the 4degree config). 3 | DIR=$(cd `dirname $0`; pwd) 4 | cd ${DIR} 5 | 6 | # VALI=--validation 7 | 8 | 9 | mkdir -p ./results/raw/throughput_gpu_runahead_oneinput 10 | 11 | # config: [runahead], apps: [part 1] 12 | APPS="app_spec_ngap_new_quickvalidation_part1" \ 13 | CONFIGS="exec_config_ngap_groups_design_sota_runahead_oneinput" \ 14 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 15 | ${VALI} --timeout-mins=60 \ 16 | --csvdest=./results/raw/throughput_gpu_runahead_oneinput/throughput_gpu_runahead_oneinput_part1.csv 17 | 18 | # config: [runahead], apps: 
[part 2] 19 | APPS="app_spec_ngap_new_quickvalidation_part2" \ 20 | CONFIGS="exec_config_ngap_groups_design_sota_runahead_oneinput" \ 21 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 22 | ${VALI} --timeout-mins=60 \ 23 | --csvdest=./results/raw/throughput_gpu_runahead_oneinput/throughput_gpu_runahead_oneinput_part2.csv 24 | 25 | 26 | # config: [runahead'], apps: [part 3] 27 | APPS="app_spec_ngap_new_quickvalidation_part3" \ 28 | CONFIGS="exec_config_ngap_groups_design_sota_runahead_4degree_oneinput" \ 29 | ./run_throughput.sh --keywords=../../code/scripts/collect_keyword_list_throughput.txt \ 30 | ${VALI} --timeout-mins=60 \ 31 | --csvdest=./results/raw/throughput_gpu_runahead_oneinput/throughput_gpu_runahead_oneinput_part3.csv 32 | -------------------------------------------------------------------------------- /small_dataset/apple.anml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /small_dataset/inputstream.txt: -------------------------------------------------------------------------------- 1 | aappleaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzzappletoptoplslslsappleappappapplezzzz 2 | --------------------------------------------------------------------------------