├── afl_llvm_allowlist.txt
├── llvm-isel-afl
│   ├── .gitignore
│   ├── CMakeLists.txt
│   ├── llvm-isel-fuzzer.cpp
│   └── afl-driver.cpp
├── mutator
│   ├── scripts
│   │   ├── .gitignore
│   │   └── validate.sh
│   ├── include
│   │   └── mutator.h
│   ├── CMakeLists.txt
│   └── src
│       ├── fuzzmutate.cpp
│       ├── main.cpp
│       ├── mutator.cpp
│       └── afl-mutator.c
├── seeds
│   └── seed.bc
├── scripts
│   ├── requirements.txt
│   ├── lib
│   │   ├── time_parser.py
│   │   ├── fs.py
│   │   ├── matcher_table_sizes.py
│   │   ├── plot_data.py
│   │   ├── __init__.py
│   │   ├── arch.py
│   │   ├── process_concurrency.py
│   │   ├── experiment.py
│   │   ├── triple.py
│   │   ├── target_lists.py
│   │   ├── llc_command.py
│   │   ├── target.py
│   │   └── llc_test.py
│   ├── stat_experiments.py
│   ├── combine_crash_data.py
│   ├── summarize_crash_data.py
│   ├── classify_llc_tests.py
│   ├── collect_combined_mt_coverage.py
│   ├── collect_bad_inputs.py
│   ├── combine_fuzzing_results.py
│   ├── collect_matcher_table_size.py
│   ├── batch_classify.py
│   ├── collect_seeds.py
│   ├── batch_compile.py
│   ├── process_data.py
│   ├── compare_experiments.py
│   ├── classify.py
│   └── fuzz.py
├── .gitignore
├── .dockerignore
├── .gitmodules
├── Dockerfile
├── LICENSE
└── README.md

/afl_llvm_allowlist.txt:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/llvm-isel-afl/.gitignore:
--------------------------------------------------------------------------------
1 | afl-compiler-rt.o
2 | 
--------------------------------------------------------------------------------
/mutator/scripts/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !validate.sh
3 | !.gitignore
--------------------------------------------------------------------------------
/seeds/seed.bc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SecurityLab-UCD/IRFuzzer/HEAD/seeds/seed.bc
--------------------------------------------------------------------------------
/scripts/requirements.txt:
--------------------------------------------------------------------------------
1 | bitarray==2.8.0
2 | docker_py==1.10.6
3 | matplotlib==3.5.1
4 | numpy==1.21.5
5 | pandas==2.0.3
6 | tqdm==4.65.0
7 | typed_argument_parser==1.8.1
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | llvm-aie/
3 | llvm-fix/
4 | llvm-project/
5 | .vscode/
6 | AFLplusplus/
7 | O
8 | fuzzing*
9 | *-w*
10 | tmp
11 | seed
12 | *.bc
13 | *.ll
14 | analysis
15 | __pycache__
16 | tmpfs
17 | docker*
18 | output
19 | archive*
20 | 
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | **/build/
2 | llvm-aie/
3 | llvm-fix/
4 | llvm-project/
5 | .vscode/
6 | AFLplusplus/
7 | O
8 | fuzzing*
9 | *-w*
10 | tmp
11 | seed
12 | *.bc
13 | *.ll
14 | analysis
15 | */__pycache__/
16 | tmpfs
17 | docker*
18 | output
19 | archive*
20 | 
--------------------------------------------------------------------------------
/scripts/lib/time_parser.py:
--------------------------------------------------------------------------------
1 | SECONDS_PER_UNIT: dict[str, int] = {
2 |     "s": 1,
3 |     "m": 60,
4 |     "h": 3600,
5 |     "d": 86400,
6 |     "w": 604800,
7 | }
8 | 
9 | 
10 | def get_time_in_seconds(s: str) -> int:
11 |     return int(s[:-1]) * SECONDS_PER_UNIT[s[-1]]
12 | 
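(Editorial addition: a minimal usage sketch for scripts/lib/time_parser.py above. The results follow directly from SECONDS_PER_UNIT; the snippet itself is not part of the repository.)

# Hypothetical usage of get_time_in_seconds, with duration strings in the same
# format the fuzzing scripts accept for their --time options:
from lib.time_parser import get_time_in_seconds

assert get_time_in_seconds("45s") == 45      # 45 * 1
assert get_time_in_seconds("30m") == 1800    # 30 * 60
assert get_time_in_seconds("2h") == 7200     # 2 * 3600
assert get_time_in_seconds("1d") == 86400    # 1 * 86400
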
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "llvm-project"]
2 | 	path = llvm-project
3 | 	url = https://github.com/SecurityLab-UCD/llvm-project.git
4 | 	branch = irfuzzer-0.2
5 | [submodule "AFLplusplus"]
6 | 	path = AFLplusplus
7 | 	url = https://github.com/SecurityLab-UCD/AFLplusplus.git
8 | 	branch = irfuzzer-0.2
--------------------------------------------------------------------------------
/mutator/include/mutator.h:
--------------------------------------------------------------------------------
1 | #include <stddef.h>
2 | #include <stdint.h>
3 | 
4 | #ifdef __cplusplus
5 | extern "C" {
6 | #endif
7 | void createISelMutator();
8 | size_t LLVMFuzzerCustomMutator(uint8_t *Data, size_t Size, size_t MaxSize,
9 |                                unsigned int Seed);
10 | 
11 | #ifdef __cplusplus
12 | }
13 | #endif
--------------------------------------------------------------------------------
/scripts/lib/fs.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 | from typing import Iterator
4 | 
5 | 
6 | def subdirs_of(dir: Path | str) -> Iterator[os.DirEntry]:
7 |     return (f for f in os.scandir(dir) if f.is_dir())
8 | 
9 | 
10 | def count_files(dir: Path) -> int:
11 |     """
12 |     count the number of files in the specified directory (not including sub-directories)
13 |     """
14 |     return len(next(os.walk(dir))[2])
15 | 
--------------------------------------------------------------------------------
/mutator/scripts/validate.sh:
--------------------------------------------------------------------------------
1 | INPUT=$1
2 | 
3 | if [[ -z $INPUT ]]
4 | then
5 |     INPUT=$FUZZING_HOME/seeds/seed.bc
6 | fi
7 | 
8 | export NPROC=`nproc --all`
9 | export NPROC=1
10 | for J in $(seq $NPROC)
11 | do
12 |     rm O$J; touch O$J
13 |     for I in $(seq 1000)
14 |     do
15 |         SEED=$(shuf -i 0-4294967295 -n 1)
16 |         $FUZZING_HOME/mutator/build/MutatorDriver $INPUT $SEED -v
17 |         if [[ $? -ne 0 ]]
18 |         then
19 |             echo $SEED &>> O$J
20 |         fi
21 |     done &
22 | done
23 | wait
24 | # Try to match O* that is not empty.
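# (Editorial note, not part of the original script: the `ls -la | grep` below
# matches rows of the form "<user> <user> <size> ... <HH:MM> O<j>" whose size
# field starts with a non-zero digit, i.e. it lists the O<j> files that are
# non-empty and therefore recorded at least one seed for which MutatorDriver
# exited with a non-zero status.)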
25 | ls -la | grep "$USER $USER [[:space:]]*[1-9].* [0-9]*:[0-9]* O[0-9]*" 26 | -------------------------------------------------------------------------------- /scripts/lib/matcher_table_sizes.py: -------------------------------------------------------------------------------- 1 | DAGISEL_MATCHER_TABLE_SIZES: dict[str, int] = { 2 | "AArch64": 486171, 3 | "AMDGPU": 493660, 4 | "ARC": 1998, 5 | "ARM": 201172, 6 | "AVR": 2973, 7 | "BPF": 3586, 8 | "CSKY": 19076, 9 | "Hexagon": 178301, 10 | "Lanai": 2337, 11 | "LoongArch": 28486, 12 | "M68k": 18850, 13 | "MSP430": 9103, 14 | "Mips": 54044, 15 | "NVPTX": 185247, 16 | "PowerPC": 190304, 17 | "RISCV": 2692926, 18 | "Sparc": 6607, 19 | "SystemZ": 53271, 20 | "VE": 71577, 21 | "WebAssembly": 25991, 22 | "X86": 685990, 23 | "XCore": 3854, 24 | } 25 | 26 | GISEL_MATCHER_TABLE_SIZES: dict[str, int] = { 27 | "AArch64": 277753, 28 | "AMDGPU": 338644, 29 | "ARM": 130029, 30 | "M68k": 2388, 31 | "Mips": 60449, 32 | "PowerPC": 83201, 33 | "RISCV": 183021, 34 | "X86": 62522, 35 | } 36 | -------------------------------------------------------------------------------- /scripts/lib/plot_data.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pandas as pd 3 | 4 | 5 | def __convert_percentage_to_float(s: str) -> float: 6 | return float(s.strip("%")) / 100 7 | 8 | 9 | def read_plot_data(file_path: Path) -> pd.DataFrame: 10 | # the table header is not consistent, so we don't want pandas to detect and process the header (1st row of csv) 11 | # but let it use the hard-coded column names below. 12 | return pd.read_csv( 13 | file_path, 14 | index_col=False, 15 | header=None, 16 | skiprows=1, 17 | names=[ 18 | "# relative_time", 19 | "cycles_done", 20 | "cur_item", 21 | "corpus_count", 22 | "pending_total", 23 | "pending_favs", 24 | "bit_cvg", 25 | "shw_cvg", 26 | "saved_crashes", 27 | "saved_hangs", 28 | "max_depth", 29 | "execs_per_sec", 30 | "total_execs", 31 | "edges_found", 32 | ], 33 | converters={ 34 | "bit_cvg": __convert_percentage_to_float, 35 | "shw_cvg": __convert_percentage_to_float, 36 | }, 37 | ) 38 | -------------------------------------------------------------------------------- /scripts/stat_experiments.py: -------------------------------------------------------------------------------- 1 | from tap import Tap 2 | from lib.experiment import get_all_experiments 3 | from process_data import iterate_over_all_experiments 4 | 5 | 6 | class Args(Tap): 7 | input: str 8 | """root directory containing fuzzing output""" 9 | 10 | def configure(self) -> None: 11 | self.add_argument("input") 12 | 13 | 14 | def print_experiment_statuses(root_dir: str) -> None: 15 | for expr in get_all_experiments(root_dir): 16 | print( 17 | expr.isel.ljust(8), 18 | str(expr.target).ljust(40), 19 | str(expr.replicate_id).ljust(2), 20 | end=" ", 21 | ) 22 | 23 | df = expr.read_plot_data() 24 | 25 | if df.shape[0] > 0: 26 | print( 27 | f"{df.iloc[-1]['# relative_time'] / 3600 :.1f}h".ljust(6), 28 | f"{df.iloc[0]['shw_cvg']:.3%}".ljust(7), 29 | "->", 30 | f"{df.iloc[-1]['shw_cvg']:.3%}".ljust(8), 31 | ) 32 | else: 33 | print() 34 | 35 | 36 | def main(): 37 | args = Args().parse_args() 38 | print_experiment_statuses(args.input) 39 | 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /llvm-isel-afl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 
3.4) 2 | project(llvm-isel-fuzzer) 3 | set(AFLplusplus AFLplusplus) 4 | 5 | find_package(LLVM REQUIRED PATHS $ENV{FUZZING_HOME}/$ENV{LLVM}/build-afl/lib/cmake/llvm NO_DEFAULT_PATH) 6 | 7 | message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") 8 | message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") 9 | 10 | set(CMAKE_CXX_FLAGS "-std=c++17 -fno-rtti -Wall -pthread") 11 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DDEBUG") 12 | 13 | llvm_map_components_to_libnames(LLVM_LIBS 14 | AllTargetsAsmParsers 15 | AllTargetsCodeGens 16 | AllTargetsDescs 17 | AllTargetsInfos 18 | Analysis 19 | AsmPrinter 20 | BitReader 21 | BitWriter 22 | CodeGen 23 | Core 24 | FuzzMutate 25 | IRReader 26 | MC 27 | ScalarOpts 28 | SelectionDAG 29 | Support 30 | Target 31 | ) 32 | 33 | include_directories( 34 | ${LLVM_INCLUDE_DIRS} 35 | ) 36 | add_executable(isel-fuzzing 37 | afl-driver.cpp 38 | llvm-isel-fuzzer.cpp 39 | ) 40 | target_compile_options(isel-fuzzing PRIVATE -fno-rtti) 41 | 42 | target_link_libraries(isel-fuzzing 43 | $ENV{FUZZING_HOME}/${AFLplusplus}/afl-compiler-rt.o 44 | ${LLVM_LIBS} 45 | ) -------------------------------------------------------------------------------- /scripts/lib/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from pathlib import Path 4 | import subprocess 5 | 6 | 7 | FUZZING_HOME = os.getenv(key="FUZZING_HOME") 8 | LLVM = os.getenv(key="LLVM", default="llvm-project") 9 | 10 | LLVM_BIN_PATH = Path(LLVM, "build-release/bin") 11 | LLC = Path(LLVM_BIN_PATH, "llc") 12 | LLVM_AS = Path(LLVM_BIN_PATH, "llvm-as") 13 | LLVM_DIS = Path(LLVM_BIN_PATH, "llvm-dis") 14 | 15 | IRFUZZER_DATA_ENV = "IRFUZZER_DATA" 16 | 17 | 18 | def __verify_working_dir(): 19 | if FUZZING_HOME is None: 20 | logging.error( 21 | "$FUZZING_HOME not set, why am I running? Did you install correctly?" 22 | ) 23 | exit(1) 24 | 25 | if not os.path.samefile(os.getcwd(), FUZZING_HOME): 26 | logging.warning("I am not in $FUZZING_HOME now.") 27 | 28 | 29 | def __verify_llvm_version(): 30 | expected_commit = "bcb8a9450388" 31 | 32 | actual_commit = ( 33 | subprocess.check_output( 34 | ["git", "-C", LLVM, "rev-parse", "--short", "HEAD"] 35 | ) 36 | .decode("ascii") 37 | .strip() 38 | ) 39 | 40 | if actual_commit != expected_commit: 41 | logging.warn( 42 | f"Your LLVM version {actual_commit} is not {expected_commit}." 43 | " Matcher table sizes may be incorrect." 
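        # (Editorial note, not part of the original file: this check pins the
        # hard-coded tables in scripts/lib/matcher_table_sizes.py to a specific
        # llvm-project commit; if a different revision is checked out, the sizes
        # can be regenerated with scripts/collect_matcher_table_size.py.)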
44 | ) 45 | 46 | 47 | __verify_working_dir() 48 | __verify_llvm_version() 49 | -------------------------------------------------------------------------------- /scripts/lib/arch.py: -------------------------------------------------------------------------------- 1 | ARCH_TO_BACKEND_MAP: dict[str, str] = { 2 | "aarch64": "AArch64", 3 | "aarch64_32": "AArch64", 4 | "aarch64_be": "AArch64", 5 | "amdgcn": "AMDGPU", 6 | "arc": "ARC", 7 | "arm": "ARM", 8 | "armeb": "ARM", 9 | "avr": "AVR", 10 | "bpf": "BPF", 11 | "bpfeb": "BPF", 12 | "bpfel": "BPF", 13 | "csky": "CSKY", 14 | "hexagon": "Hexagon", 15 | "lanai": "Lanai", 16 | "loongarch32": "LoongArch", 17 | "loongarch64": "LoongArch", 18 | "m68k": "M68k", 19 | "mips": "Mips", 20 | "mips64": "Mips", 21 | "mips64el": "Mips", 22 | "mipsel": "Mips", 23 | "msp430": "MSP430", 24 | "nvptx": "NVPTX", 25 | "nvptx64": "NVPTX", 26 | "ppc": "PowerPC", 27 | "ppcle": "PowerPC", 28 | "ppc64": "PowerPC", 29 | "ppc64le": "PowerPC", 30 | "r600": "R600", 31 | "riscv32": "RISCV", 32 | "riscv64": "RISCV", 33 | "sparc": "Sparc", 34 | "sparcel": "Sparc", 35 | "sparcv9": "Sparc", 36 | "systemz": "SystemZ", 37 | "thumb": "ARM", 38 | "thumbeb": "ARM", 39 | "ve": "VE", 40 | "wasm32": "WebAssembly", 41 | "wasm64": "WebAssembly", 42 | "i686": "X86", 43 | "x86_64": "X86", 44 | "xcore": "XCore", 45 | } 46 | 47 | 48 | def normalize_arch(arch: str) -> str: 49 | match arch: 50 | case "aarch64" | "arm64": 51 | return "aarch64" 52 | case "aarch64_32" | "arm64_32": 53 | return "aarch64_32" 54 | case "powerpc" | "ppc" | "ppc32": 55 | return "ppc" 56 | case "powerpcle" | "ppcle" | "ppc32le": 57 | return "ppcle" 58 | case "powerpc64" | "ppc64": 59 | return "ppc64" 60 | case "powerpc64le" | "ppc64le": 61 | return "ppc64le" 62 | case "s390x" | "systemz": 63 | return "systemz" 64 | case _: 65 | return arch 66 | -------------------------------------------------------------------------------- /mutator/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.4) 2 | project(afl-ir-custom-mutator) 3 | 4 | 5 | if(NOT CMAKE_BUILD_TYPE) 6 | set(CMAKE_BUILD_TYPE "Release") 7 | message(STATUS "Build type not specified: Use Release by default") 8 | endif(NOT CMAKE_BUILD_TYPE) 9 | 10 | IF(CMAKE_BUILD_TYPE MATCHES Release) 11 | find_package(LLVM REQUIRED PATHS $ENV{FUZZING_HOME}/$ENV{LLVM}/build-release/lib/cmake/llvm NO_DEFAULT_PATH) 12 | ENDIF(CMAKE_BUILD_TYPE MATCHES Release) 13 | 14 | 15 | IF(CMAKE_BUILD_TYPE MATCHES Debug) 16 | find_package(LLVM REQUIRED PATHS $ENV{FUZZING_HOME}/$ENV{LLVM}/build-debug/lib/cmake/llvm NO_DEFAULT_PATH) 17 | ENDIF(CMAKE_BUILD_TYPE MATCHES Debug) 18 | 19 | message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") 20 | message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") 21 | 22 | set(CMAKE_CXX_FLAGS "-std=c++17 -fno-rtti -Wall -pthread") 23 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DDEBUG") 24 | 25 | llvm_map_components_to_libnames(LLVM_LIBS 26 | AllTargetsAsmParsers 27 | AllTargetsCodeGens 28 | AllTargetsDescs 29 | AllTargetsInfos 30 | Analysis 31 | AsmPrinter 32 | BitReader 33 | BitWriter 34 | CodeGen 35 | Core 36 | FuzzMutate 37 | IRReader 38 | MC 39 | ScalarOpts 40 | SelectionDAG 41 | Support 42 | Target 43 | ) 44 | 45 | include_directories( 46 | $ENV{FUZZING_HOME}/$ENV{AFL}/include/ 47 | ${LLVM_INCLUDE_DIRS} 48 | ./include/ 49 | ) 50 | add_definitions(${LLVM_DEFINITIONS}) 51 | 52 | add_library(AFLCustomIRMutator SHARED 53 | src/afl-mutator.c 54 | src/mutator.cpp 55 | ) 56 | 
target_link_libraries(AFLCustomIRMutator
57 |     ${LLVM_LIBS}
58 | )
59 | 
60 | add_library(AFLFuzzMutate SHARED
61 |     src/afl-mutator.c
62 |     src/fuzzmutate.cpp
63 | )
64 | target_link_libraries(AFLFuzzMutate
65 |     ${LLVM_LIBS}
66 | )
67 | 
68 | 
69 | add_executable(MutatorDriver
70 |     src/main.cpp
71 | )
72 | target_link_libraries(MutatorDriver
73 |     AFLCustomIRMutator
74 |     ${LLVM_LIBS}
75 | )
76 | 
--------------------------------------------------------------------------------
/scripts/combine_crash_data.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from itertools import groupby
3 | from typing import Generator, Set, Tuple
4 | 
5 | import pandas as pd
6 | 
7 | from lib.experiment import Experiment, get_all_experiments
8 | from lib.fs import subdirs_of
9 | 
10 | 
11 | def iterate_over_all_experiments(
12 |     dir: str,
13 | ) -> Generator[Tuple[Experiment, Set[str]], None, None]:
14 |     for expr in get_all_experiments(dir):
15 |         crashes = set()
16 | 
17 |         for crash_type_dir in subdirs_of(expr.path):
18 |             for subdir in subdirs_of(crash_type_dir.path):
19 |                 if subdir.name.startswith("tracedepth_"):
20 |                     crashes.add(subdir.name)
21 |                 else:
22 |                     for subsubdir in subdirs_of(subdir.path):
23 |                         assert subsubdir.name.startswith("tracedepth_")
24 |                         crashes.add(subsubdir.name)
25 | 
26 |         yield (expr, crashes)
27 | 
28 | 
29 | def main() -> None:
30 |     parser = argparse.ArgumentParser(
31 |         description="Combine crash data from different experiments to count unique crashes"
32 |     )
33 | 
34 |     parser.add_argument(
35 |         "-i",
36 |         "--input",
37 |         type=str,
38 |         required=True,
39 |         help="The input directory (the output directory for batch classification script)",
40 |     )
41 | 
42 |     args = parser.parse_args()
43 | 
44 |     groups = groupby(
45 |         iterate_over_all_experiments(args.input),
46 |         key=lambda tuple: ([tuple[0].fuzzer, tuple[0].isel, str(tuple[0].target)]),
47 |     )
48 | 
49 |     df = pd.DataFrame(
50 |         columns=["fuzzer", "isel", "target", "n_unique_crashes"],
51 |         data=(
52 |             [
53 |                 *k,
54 |                 len(set((crash for (_, crashes) in v for crash in crashes))),
55 |             ]
56 |             for (k, v) in groups
57 |         ),
58 |     )
59 | 
60 |     df.to_csv("combined-crash-counts.csv")
61 | 
62 | 
63 | if __name__ == "__main__":
64 |     main()
65 | 
--------------------------------------------------------------------------------
/scripts/lib/process_concurrency.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import multiprocessing
3 | import os
4 | import subprocess
5 | from typing import Callable, Iterable, Optional, Tuple, TypeVar
6 | 
7 | from tqdm import tqdm
8 | 
9 | 
10 | MAX_SUBPROCESSES = max(multiprocessing.cpu_count() - 2, 1)
11 | 
12 | __T = TypeVar("__T")
13 | __R = TypeVar("__R")
14 | 
15 | 
16 | def run_concurrent_subprocesses(
17 |     iter: Iterable[__T],
18 |     subprocess_creator: Callable[[__T], subprocess.Popen],
19 |     on_exit: Optional[Callable[[__T, Optional[int], subprocess.Popen], __R]] = None,
20 |     max_jobs: int = MAX_SUBPROCESSES,
21 | ) -> dict[__T, __R]:
22 |     """
23 |     Creates up to `max_jobs` subprocesses that run concurrently.
24 |     `iter` contains the inputs that are used to start each subprocess.
25 |     `subprocess_creator` creates the subprocess and returns a `Popen`.
26 |     After each subprocess exits, `on_exit` is called to collect a user-defined result for that input.
27 |     The return value is a dictionary mapping each input to its `on_exit` result.
28 | 
29 |     The caller has to guarantee that the elements in `iter` are unique, or the output may be incorrect.
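    Illustrative example (editorial addition, not from the repository; the `llc`
    invocation and input directory are hypothetical):

        results = run_concurrent_subprocesses(
            iter=Path("inputs").glob("*.bc"),
            subprocess_creator=lambda p: subprocess.Popen(["llc", str(p)]),
            on_exit=lambda p, exit_code, popen: exit_code,
        )
        # `results` maps each input path to the exit code of its subprocess.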
30 | """ 31 | ret: dict[__T, __R] = {} 32 | processes: dict[int, Tuple[subprocess.Popen, __T]] = dict() 33 | 34 | def wait_next() -> None: 35 | pid, status = os.wait() 36 | p, i = processes.pop(pid) 37 | 38 | exit_code: Optional[int] = None 39 | 40 | if os.WIFEXITED(status): 41 | exit_code = os.WEXITSTATUS(status) 42 | logging.debug(f"Child process {pid} exited with code {exit_code}.") 43 | else: 44 | logging.debug(f"Child process {pid} exited abnormally.") 45 | 46 | if on_exit is not None: 47 | ret[i] = on_exit(i, exit_code, p) 48 | 49 | for input in tqdm(iter): 50 | p = subprocess_creator(input) 51 | processes[p.pid] = (p, input) 52 | 53 | if len(processes) >= max_jobs: 54 | wait_next() 55 | 56 | # wait for remaining processes to exit 57 | while len(processes) > 0: 58 | wait_next() 59 | 60 | return ret 61 | -------------------------------------------------------------------------------- /scripts/summarize_crash_data.py: -------------------------------------------------------------------------------- 1 | from typing import Generator, Tuple 2 | import pandas as pd 3 | import argparse 4 | 5 | from lib.experiment import Experiment, get_all_experiments 6 | 7 | 8 | def iterate_over_all_experiments( 9 | dir: str, 10 | ) -> Generator[Tuple[Experiment, int], None, None]: 11 | for expr_info in get_all_experiments(dir): 12 | with open( 13 | expr_info.path.joinpath("unique_crashes"), "r" 14 | ) as file: 15 | yield (expr_info, int(file.readline())) 16 | 17 | 18 | def collect_crash_data(dir: str) -> pd.DataFrame: 19 | return pd.DataFrame( 20 | columns=["fuzzer", "isel", "target", "replicate", "n_unique_crashes"], 21 | data=( 22 | [ 23 | exp.fuzzer, 24 | exp.isel, 25 | str(exp.target), 26 | exp.replicate_id, 27 | n_unique_crashes, 28 | ] 29 | for (exp, n_unique_crashes) in iterate_over_all_experiments(dir) 30 | ), 31 | ) 32 | 33 | 34 | def main() -> None: 35 | parser = argparse.ArgumentParser(description="Summerize crash data") 36 | 37 | parser.add_argument( 38 | "-i", 39 | "--input", 40 | type=str, 41 | required=True, 42 | help="The input directory (the output directory for batch classification script)", 43 | ) 44 | 45 | args = parser.parse_args() 46 | 47 | df = collect_crash_data(args.input) 48 | 49 | df_summary = ( 50 | df.drop(columns=["replicate"]) 51 | .groupby(["fuzzer", "isel", "target"]) 52 | .agg(["min", "max", "count", "mean", "std"]) 53 | ) 54 | 55 | df_summary.to_csv("crash-counts.csv") 56 | 57 | df_irfuzzer = df[df["fuzzer"] == "irfuzzer"].drop(columns=["fuzzer"]) 58 | df_libfuzzer = df[df["fuzzer"] == "libfuzzer"].drop(columns=["fuzzer"]) 59 | 60 | df_comparison = df_irfuzzer.merge( 61 | df_libfuzzer, 62 | on=["isel", "target", "replicate"], 63 | suffixes=("_irfuzzer", "_libfuzzer"), 64 | ) 65 | 66 | df_comparison.to_csv("crash-data-comparison.csv") 67 | 68 | 69 | if __name__ == "__main__": 70 | main() 71 | -------------------------------------------------------------------------------- /scripts/classify_llc_tests.py: -------------------------------------------------------------------------------- 1 | from itertools import groupby 2 | from pathlib import Path 3 | import pandas as pd 4 | from tap import Tap 5 | 6 | from lib.llc_test import LLCTest, parse_llc_tests 7 | 8 | 9 | class Args(Tap): 10 | output: str 11 | """directory for storing summary (will create if not exist)""" 12 | 13 | def configure(self): 14 | self.add_argument("-o", "--output") 15 | 16 | 17 | def classify( 18 | backend: str, 19 | tests: list[LLCTest], 20 | summary_out: Path, 21 | ) -> None: 22 | commands = (cmd for test 
in tests for cmd in test.runnable_llc_commands) 23 | 24 | df = pd.DataFrame( 25 | columns=["arch", "gisel", "triple", "cpu", "attrs"], 26 | data=( 27 | [ 28 | cmd.target.triple.arch, 29 | cmd.global_isel, 30 | str(cmd.target.triple), 31 | cmd.target.cpu, 32 | ",".join(sorted(cmd.target.attrs)), 33 | ] 34 | for cmd in commands 35 | ), 36 | ) 37 | 38 | df.to_csv(summary_out.joinpath(f"{backend}-raw.csv")) 39 | 40 | df.groupby(["arch", "gisel", "triple", "cpu", "attrs"], dropna=False).size().to_csv( 41 | summary_out.joinpath(f"{backend}-summary.csv") 42 | ) 43 | 44 | for arch in df["arch"].unique(): 45 | arch_df = df[df["arch"] == arch] 46 | 47 | pd.crosstab( 48 | index=arch_df["cpu"].fillna(""), 49 | columns=arch_df["attrs"], 50 | dropna=False, 51 | ).to_csv(summary_out.joinpath(f"{arch}-crosstab.csv")) 52 | 53 | 54 | def main() -> None: 55 | args = Args(underscores_to_dashes=True).parse_args() 56 | 57 | summary_out = Path(args.output) 58 | summary_out.mkdir(exist_ok=True) 59 | 60 | tests = parse_llc_tests() 61 | 62 | for key, group in groupby(tests, key=lambda test: test.backend): 63 | arch_summary_out = summary_out.joinpath(key) 64 | arch_summary_out.mkdir(exist_ok=True) 65 | 66 | classify( 67 | backend=key, 68 | tests=list(group), 69 | summary_out=arch_summary_out, 70 | ) 71 | 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /scripts/lib/experiment.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Iterable, NamedTuple, Optional 3 | 4 | import pandas as pd 5 | from lib.fs import subdirs_of 6 | from lib.plot_data import read_plot_data 7 | 8 | from lib.target import Target 9 | 10 | 11 | class Experiment(NamedTuple): 12 | path: Path 13 | fuzzer: str 14 | isel: str 15 | target: Target 16 | replicate_id: int 17 | 18 | @property 19 | def name(self) -> str: 20 | return f"{self.fuzzer}:{self.isel}:{self.target}:{self.replicate_id}" 21 | 22 | @property 23 | def plot_data_path(self) -> Path: 24 | return self.path.joinpath("default", "plot_data") 25 | 26 | @property 27 | def fuzzer_stats_path(self) -> Path: 28 | return self.path.joinpath("default", "fuzzer_stats") 29 | 30 | @property 31 | def cur_input_path(self) -> Path: 32 | return self.path.joinpath("default", ".cur_input") 33 | 34 | @property 35 | def run_time(self) -> int: 36 | s = self['run_time'] 37 | return -1 if s is None else int(s) 38 | 39 | def __getitem__(self, key: str) -> Optional[str]: 40 | if not self.fuzzer_stats_path.exists(): 41 | return None 42 | 43 | with open(self.fuzzer_stats_path) as f: 44 | for line in f: 45 | if line.startswith(key): 46 | return line.split(" : ")[1] 47 | 48 | return None 49 | 50 | def read_plot_data(self) -> pd.DataFrame: 51 | return read_plot_data(self.plot_data_path) 52 | 53 | 54 | def get_all_experiments(root_dir: Path | str) -> Iterable[Experiment]: 55 | for fuzzer_dir in subdirs_of(root_dir): 56 | for isel_dir in subdirs_of(fuzzer_dir.path): 57 | for target_dir in sorted( 58 | subdirs_of(isel_dir.path), key=lambda dir: dir.name 59 | ): 60 | for replicate_dir in subdirs_of(target_dir.path): 61 | yield Experiment( 62 | path=Path(replicate_dir.path), 63 | fuzzer=fuzzer_dir.name.split(".")[0], 64 | isel=isel_dir.name, 65 | target=Target.parse(target_dir.name), 66 | replicate_id=int(replicate_dir.name), 67 | ) 68 | -------------------------------------------------------------------------------- /Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM ubuntu:22.04 2 | 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | RUN apt-get update && \ 5 | apt-get -y upgrade && \ 6 | apt-get install -y -q git build-essential wget zlib1g-dev cmake python3 python3-pip ninja-build ccache && \ 7 | apt-get clean 8 | 9 | ENV FUZZING_HOME=/IRFuzzer 10 | WORKDIR $FUZZING_HOME 11 | COPY . $FUZZING_HOME 12 | 13 | ENV LLVM=llvm-project 14 | ENV AFL=AFLplusplus 15 | ENV PATH="${PATH}:/clang+llvm/bin" 16 | ENV AFL_LLVM_INSTRUMENT=CLASSIC 17 | 18 | RUN CLANG_LLVM=clang+llvm-14.0.0-x86_64-linux-gnu-ubuntu-18.04 && \ 19 | wget --no-verbose --show-progress --progress=dot:mega https://github.com/llvm/llvm-project/releases/download/llvmorg-14.0.0/$CLANG_LLVM.tar.xz && \ 20 | tar -xvf $CLANG_LLVM.tar.xz -C / && \ 21 | mv /$CLANG_LLVM /clang+llvm && \ 22 | rm $CLANG_LLVM.tar.xz 23 | 24 | RUN git clone https://github.com/SecurityLab-UCD/AFLplusplus.git --branch=irfuzzer-0.2 --depth=1 $AFL && \ 25 | cd $AFL && \ 26 | make -j 27 | 28 | RUN git clone https://github.com/SecurityLab-UCD/llvm-project.git --branch=irfuzzer-0.2 --depth=1 $LLVM 29 | 30 | RUN mkdir -p $LLVM/build-afl && \ 31 | cd $LLVM/build-afl && \ 32 | cmake \ 33 | -GNinja \ 34 | -DBUILD_SHARED_LIBS=OFF \ 35 | -DLLVM_BUILD_TOOLS=ON \ 36 | -DLLVM_CCACHE_BUILD=ON \ 37 | -DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD="ARC;CSKY;LoongArch;M68k" \ 38 | -DCMAKE_C_COMPILER=$FUZZING_HOME/$AFL/afl-clang-fast \ 39 | -DCMAKE_CXX_COMPILER=$FUZZING_HOME/$AFL/afl-clang-fast++ \ 40 | -DCMAKE_BUILD_TYPE=Release \ 41 | -DLLVM_APPEND_VC_REV=OFF \ 42 | -DLLVM_BUILD_EXAMPLES=OFF \ 43 | -DLLVM_BUILD_RUNTIME=OFF \ 44 | -DLLVM_INCLUDE_EXAMPLES=OFF \ 45 | -DLLVM_USE_SANITIZE_COVERAGE=OFF \ 46 | -DLLVM_USE_SANITIZER="" \ 47 | ../llvm && \ 48 | ninja -j $(nproc --all) 49 | 50 | RUN mkdir -p $LLVM/build-release && \ 51 | cd $LLVM/build-release && \ 52 | cmake \ 53 | -GNinja \ 54 | -DBUILD_SHARED_LIBS=ON \ 55 | -DLLVM_CCACHE_BUILD=ON \ 56 | -DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD="ARC;CSKY;LoongArch;M68k" \ 57 | -DCMAKE_C_COMPILER=clang \ 58 | -DCMAKE_CXX_COMPILER=clang++ \ 59 | -DCMAKE_BUILD_TYPE=Release \ 60 | ../llvm && \ 61 | ninja -j $(nproc --all) 62 | 63 | RUN mkdir -p llvm-isel-afl/build && \ 64 | cd llvm-isel-afl/build && \ 65 | cmake \ 66 | -GNinja \ 67 | -DCMAKE_C_COMPILER=$FUZZING_HOME/$AFL/afl-clang-fast \ 68 | -DCMAKE_CXX_COMPILER=$FUZZING_HOME/$AFL/afl-clang-fast++ \ 69 | .. && \ 70 | ninja -j $(nproc --all) 71 | 72 | RUN mkdir -p mutator/build && \ 73 | cd mutator/build && \ 74 | cmake -GNinja .. 
&& \ 75 | ninja -j $(nproc --all) 76 | -------------------------------------------------------------------------------- /scripts/lib/triple.py: -------------------------------------------------------------------------------- 1 | from ctypes import CDLL, c_char_p, cdll 2 | from typing import ClassVar, Optional 3 | from lib import LLVM 4 | 5 | from lib.arch import ARCH_TO_BACKEND_MAP, normalize_arch 6 | 7 | 8 | LIB_LLVM_TARGET_PATH = LLVM + "/build-release/lib/libLLVMTarget.so" 9 | 10 | class Triple: 11 | llvm_lib: ClassVar[Optional[CDLL]] = None 12 | 13 | arch: str 14 | vendor: Optional[str] 15 | os: Optional[str] 16 | abi: Optional[str] 17 | 18 | @property 19 | def backend(self) -> str: 20 | return ARCH_TO_BACKEND_MAP[self.arch] 21 | 22 | def __init__( 23 | self, 24 | arch: str, 25 | vendor: Optional[str] = None, 26 | os: Optional[str] = None, 27 | abi: Optional[str] = None, 28 | ) -> None: 29 | assert len(arch) > 0 30 | self.arch = normalize_arch(arch) 31 | self.vendor = self.normalize_component(vendor) 32 | self.os = self.normalize_component(os) 33 | self.abi = self.normalize_component(abi) 34 | 35 | def __eq__(self, __o: object) -> bool: 36 | if not isinstance(__o, Triple): 37 | return False 38 | 39 | return ( 40 | self.arch == __o.arch 41 | and self.vendor == __o.vendor 42 | and self.os == __o.os 43 | and self.abi == __o.abi 44 | ) 45 | 46 | def __hash__(self) -> int: 47 | return hash(str(self)) 48 | 49 | def __repr__(self) -> str: 50 | s = "-".join( 51 | (component if component else "") 52 | for component in [self.arch, self.vendor, self.os, self.abi] 53 | ) 54 | 55 | return s.rstrip("-") 56 | 57 | @classmethod 58 | def normalize_component(cls, s: Optional[str]) -> Optional[str]: 59 | return None if s in [None, "", "none", "unknown"] else s 60 | 61 | @classmethod 62 | def normalize(cls, s: str) -> str: 63 | if cls.llvm_lib is None: 64 | cls.llvm_lib = cdll.LoadLibrary(LIB_LLVM_TARGET_PATH) 65 | cls.llvm_lib.LLVMNormalizeTargetTriple.restype = c_char_p 66 | 67 | c_arg = c_char_p(s.encode("ascii")) 68 | c_ret = cls.llvm_lib.LLVMNormalizeTargetTriple(c_arg) 69 | return c_ret.decode("ascii") 70 | 71 | @classmethod 72 | def parse(cls, s: str) -> "Triple": 73 | assert len(s) > 0 74 | 75 | parts = cls.normalize(s).split("-") 76 | n = len(parts) 77 | 78 | assert n > 0 and n <= 4 79 | 80 | return Triple( 81 | arch=parts[0], 82 | vendor=parts[1] if n >= 2 else None, 83 | os=parts[2] if n >= 3 else None, 84 | abi=parts[3] if n == 4 else None, 85 | ) 86 | -------------------------------------------------------------------------------- /scripts/lib/target_lists.py: -------------------------------------------------------------------------------- 1 | from lib.target import Target 2 | 3 | TARGET_LISTS: dict[str, list[Target]] = { 4 | "1": [ 5 | Target("arm"), 6 | Target("aarch64"), 7 | Target("i686"), 8 | Target("x86_64"), 9 | Target("riscv32"), 10 | Target("riscv64"), 11 | Target("wasm32"), 12 | Target("wasm64"), 13 | ], 14 | "1a": [ 15 | Target("arm", None, "+neon"), 16 | Target("aarch64", None, "+neon"), 17 | Target("i686", None, "+avx512f"), 18 | Target("x86_64", None, "+avx512f"), 19 | Target("riscv32", None, "+v"), 20 | Target("riscv64", None, "+v"), 21 | Target("wasm32", None, "+simd128"), 22 | Target("wasm64", None, "+simd128"), 23 | ], 24 | "2": [ 25 | Target("mips"), 26 | Target("mips64"), 27 | Target("ppc"), 28 | Target("ppc64"), 29 | Target("amdgcn"), 30 | Target("nvptx64"), 31 | Target("hexagon"), 32 | ], 33 | "3": [ 34 | Target("aarch64_32"), 35 | Target("aarch64_be"), 36 | 
Target("armeb"), 37 | Target("avr"), 38 | Target("bpf"), 39 | Target("bpfeb"), 40 | Target("bpfel"), 41 | Target("lanai"), 42 | Target("mips64el"), 43 | Target("mipsel"), 44 | Target("msp430"), 45 | Target("nvptx"), 46 | Target("ppcle"), 47 | Target("ppc64le"), 48 | Target("r600"), 49 | Target("sparc"), 50 | Target("sparcel"), 51 | Target("sparcv9"), 52 | Target("systemz"), 53 | Target("thumb"), 54 | Target("thumbeb"), 55 | Target("ve"), 56 | Target("xcore"), 57 | ], 58 | "cpu": [ 59 | # Intel 60 | Target("x86_64", "alderlake"), 61 | Target("x86_64", "sapphirerapids"), 62 | # AMD 63 | Target("x86_64", "znver3"), 64 | # Apple 65 | Target("aarch64", "apple-a16"), 66 | Target("aarch64", "apple-m2"), 67 | # Samsung 68 | Target("aarch64", "exynos-m5"), 69 | # ARM 70 | Target("aarch64", "cortex-a710"), 71 | Target("aarch64", "cortex-x2"), 72 | Target("aarch64", "cortex-r82"), 73 | # Target("aarch64", "neoverse-v2"), 74 | # AMD 75 | Target("amdgcn", "gfx1100"), 76 | Target("amdgcn", "gfx1036"), 77 | Target("amdgcn", "gfx1010"), 78 | # Qualcomm 79 | Target("hexagon", "hexagonv69"), 80 | # Nvidia 81 | Target("nvptx64", "sm_90"), 82 | # SiFive 83 | Target("riscv64", "sifive-u74"), 84 | # WASM 85 | Target("wasm64", "bleeding-edge"), 86 | ], 87 | } 88 | -------------------------------------------------------------------------------- /scripts/lib/llc_command.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import re 3 | from typing import Iterable, NamedTuple, Optional 4 | 5 | from lib import LLC 6 | from lib.triple import Triple 7 | from lib.target import Target 8 | 9 | 10 | class LLCCommand(NamedTuple): 11 | target: Target 12 | global_isel: bool 13 | 14 | def get_options(self, output: Optional[str | Path] = None) -> Iterable[str]: 15 | yield f"-mtriple={self.target.triple}" 16 | 17 | if self.target.cpu: 18 | yield f"-mcpu={self.target.cpu}" 19 | 20 | if len(self.target.attrs) > 0: 21 | yield f"-mattr={','.join(self.target.attrs)}" 22 | 23 | if self.global_isel: 24 | yield "-global-isel" 25 | 26 | if output: 27 | yield f"-o" 28 | yield str(output) 29 | 30 | def get_args( 31 | self, input: str | Path, output: Optional[str | Path] = None 32 | ) -> list[str]: 33 | return [ 34 | str(LLC), 35 | *self.get_options(output), 36 | str(input), 37 | ] 38 | 39 | @classmethod 40 | def parse( 41 | cls, command: str, default_triple: Optional[Triple] = None 42 | ) -> "LLCCommand": 43 | assert "llc" in command 44 | 45 | triple = cls.__get_triple_from_command(command) 46 | 47 | if triple is None: 48 | triple = default_triple 49 | 50 | assert triple is not None, f"Cannot determine triple" 51 | 52 | return LLCCommand( 53 | target=Target( 54 | triple=triple, 55 | cpu=cls.__get_cpu_from_command(command), 56 | attrs=cls.__get_attrs_from_command(command), 57 | ), 58 | global_isel=re.match(r".*-global-isel", command) is not None, 59 | ) 60 | 61 | @staticmethod 62 | def __get_triple_from_command(command: str) -> Optional[Triple]: 63 | if (match := re.match(r".*-mtriple[= ]\"?([a-z0-9_-]+)", command)) is not None: 64 | return Triple.parse(match.group(1)) 65 | 66 | if (match := re.match(r".*-march[= ]\"?([a-z0-9_-]+)", command)) is not None: 67 | return Triple(arch=match.group(1)) 68 | 69 | return None 70 | 71 | @staticmethod 72 | def __get_cpu_from_command(command: str) -> Optional[str]: 73 | if (match := re.match(r".*-mcpu[= ]\"?([a-z0-9-]+)", command)) is not None: 74 | return match.group(1) 75 | else: 76 | return None 77 | 78 | @staticmethod 79 | def 
__get_attrs_from_command(command: str) -> Iterable[str]: 80 | return ( 81 | attr 82 | for arg_val in re.findall(r"-mattr[= ]\"?([A-Za-z0-9,\+-]+)", command) 83 | for attr in arg_val.split(",") 84 | ) 85 | -------------------------------------------------------------------------------- /mutator/src/fuzzmutate.cpp: -------------------------------------------------------------------------------- 1 | #include "mutator.h" 2 | 3 | #include "llvm/ADT/StringRef.h" 4 | #include "llvm/Analysis/TargetLibraryInfo.h" 5 | #include "llvm/Bitcode/BitcodeReader.h" 6 | #include "llvm/Bitcode/BitcodeWriter.h" 7 | #include "llvm/CodeGen/CommandFlags.h" 8 | #include "llvm/FuzzMutate/FuzzerCLI.h" 9 | #include "llvm/FuzzMutate/IRMutator.h" 10 | #include "llvm/FuzzMutate/Operations.h" 11 | #include "llvm/IR/Constants.h" 12 | #include "llvm/IR/LLVMContext.h" 13 | #include "llvm/IR/LegacyPassManager.h" 14 | #include "llvm/IR/Module.h" 15 | #include "llvm/IR/Verifier.h" 16 | #include "llvm/IRReader/IRReader.h" 17 | #include "llvm/Support/CommandLine.h" 18 | #include "llvm/Support/DataTypes.h" 19 | #include "llvm/Support/Debug.h" 20 | #include "llvm/Support/SourceMgr.h" 21 | #include "llvm/Support/TargetSelect.h" 22 | #include "llvm/Target/TargetMachine.h" 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | using namespace llvm; 31 | 32 | static std::unique_ptr Mutator; 33 | 34 | extern "C" { 35 | 36 | // Original IRMutator setting. 37 | void createISelMutator() { 38 | std::vector Types{ 39 | Type::getInt1Ty, Type::getInt8Ty, Type::getInt16Ty, Type::getInt32Ty, 40 | Type::getInt64Ty, Type::getFloatTy, Type::getDoubleTy}; 41 | 42 | std::vector> Strategies; 43 | std::vector Ops; 44 | 45 | describeFuzzerIntOps(Ops); 46 | describeFuzzerFloatOps(Ops); 47 | describeFuzzerControlFlowOps(Ops); 48 | describeFuzzerPointerOps(Ops); 49 | describeFuzzerAggregateOps(Ops); 50 | describeFuzzerVectorOps(Ops); 51 | 52 | Strategies.emplace_back(new InjectorIRStrategy(std::move(Ops))); 53 | Strategies.emplace_back(new InstDeleterIRStrategy()); 54 | 55 | Mutator = 56 | std::make_unique(std::move(Types), std::move(Strategies)); 57 | } 58 | 59 | size_t LLVMFuzzerCustomMutator(uint8_t *Data, size_t Size, size_t MaxSize, 60 | unsigned int Seed) { 61 | LLVMContext Context; 62 | std::unique_ptr M; 63 | if (Size <= 1) 64 | // We get bogus data given an empty corpus - just create a new module. 65 | M.reset(new Module("M", Context)); 66 | else 67 | M = parseModule(Data, Size, Context); 68 | if (!M) { 69 | errs() << "Parse module error. No mutation is done. Data size: " << Size 70 | << ". 
Given data wrote to err.bc\n"; 71 | std::ofstream outfile = 72 | std::ofstream("err.bc", std::ios::out | std::ios::binary); 73 | outfile.write((char *)Data, Size); 74 | outfile.close(); 75 | exit(1); 76 | } 77 | 78 | Mutator->mutateModule(*M, Seed, MaxSize); 79 | 80 | return writeModule(*M, Data, MaxSize); 81 | } 82 | } -------------------------------------------------------------------------------- /scripts/collect_combined_mt_coverage.py: -------------------------------------------------------------------------------- 1 | from itertools import groupby 2 | from pathlib import Path 3 | from typing import Iterable 4 | from bitarray import bitarray 5 | from tap import Tap 6 | from math import ceil 7 | 8 | from lib.arch import ARCH_TO_BACKEND_MAP 9 | from lib.experiment import Experiment, get_all_experiments 10 | from lib.matcher_table_sizes import ( 11 | DAGISEL_MATCHER_TABLE_SIZES, 12 | GISEL_MATCHER_TABLE_SIZES, 13 | ) 14 | 15 | 16 | class Args(Tap): 17 | input: str 18 | """root directory containing fuzzing output""" 19 | 20 | def configure(self) -> None: 21 | self.add_argument("input") 22 | 23 | 24 | def read_coverage_map(path: Path, matcher_table_size: int) -> bitarray: 25 | cvg_map = bitarray() 26 | 27 | with open(path, "rb") as file: 28 | cvg_map.fromfile(file) 29 | assert ceil(matcher_table_size / 64) * 64 == len(cvg_map) 30 | cvg_map = cvg_map[:matcher_table_size] 31 | 32 | return cvg_map 33 | 34 | 35 | def get_combined_coverage_map( 36 | experiments: Iterable[Experiment], map_size: int, map_rel_path: str 37 | ) -> bitarray: 38 | combined_cvg_map = bitarray(map_size) 39 | combined_cvg_map.setall(1) 40 | 41 | for expr in experiments: 42 | cvg_map_path = expr.path.joinpath(map_rel_path) 43 | 44 | if not cvg_map_path.exists(): 45 | print(f"WARNING: {cvg_map_path} does not exist. 
Skipped.") 46 | continue 47 | 48 | combined_cvg_map &= read_coverage_map(cvg_map_path, map_size) 49 | 50 | return combined_cvg_map 51 | 52 | 53 | def get_matcher_table_size(backend: str, isel: str) -> int: 54 | backend = ARCH_TO_BACKEND_MAP[backend] 55 | 56 | if isel == "dagisel": 57 | return DAGISEL_MATCHER_TABLE_SIZES[backend] 58 | elif isel == "gisel": 59 | return GISEL_MATCHER_TABLE_SIZES[backend] 60 | else: 61 | raise Exception("Invalid ISel") 62 | 63 | 64 | def main(): 65 | args = Args().parse_args() 66 | 67 | for (arch, isel), exprs in groupby( 68 | get_all_experiments(args.input), 69 | lambda expr: (expr.target.triple.arch, expr.isel), 70 | ): 71 | matcher_table_size = get_matcher_table_size(arch, isel) 72 | exprs = list(exprs) 73 | 74 | initial_cvg_map = get_combined_coverage_map( 75 | exprs, 76 | matcher_table_size, 77 | "default/fuzz_initial_shadowmap", 78 | ) 79 | 80 | current_cvg_map = get_combined_coverage_map( 81 | exprs, 82 | matcher_table_size, 83 | "default/fuzz_shadowmap", 84 | ) 85 | 86 | assert len(initial_cvg_map) == len(current_cvg_map) 87 | 88 | print( 89 | arch.ljust(10), 90 | isel.ljust(8), 91 | f"{matcher_table_size}".ljust(8), 92 | f"{initial_cvg_map.count(0) / matcher_table_size :.3%}".ljust(6), 93 | "->", 94 | f"{current_cvg_map.count(0) / matcher_table_size :.3%}".ljust(6), 95 | ) 96 | 97 | 98 | if __name__ == "__main__": 99 | main() 100 | -------------------------------------------------------------------------------- /mutator/src/main.cpp: -------------------------------------------------------------------------------- 1 | #include "mutator.h" 2 | 3 | #include "llvm/FuzzMutate/FuzzerCLI.h" 4 | #include "llvm/FuzzMutate/IRMutator.h" 5 | #include "llvm/FuzzMutate/Operations.h" 6 | #include "llvm/IR/LLVMContext.h" 7 | #include "llvm/IR/Module.h" 8 | #include "llvm/IR/Verifier.h" 9 | #include "llvm/Support/raw_ostream.h" 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #define MAX_SIZE 1048576 20 | 21 | // https://stackoverflow.com/questions/322938/recommended-way-to-initialize-srand 22 | unsigned long mix(unsigned long a, unsigned long b, unsigned long c) { 23 | a = a - b; 24 | a = a - c; 25 | a = a ^ (c >> 13); 26 | b = b - c; 27 | b = b - a; 28 | b = b ^ (a << 8); 29 | c = c - a; 30 | c = c - b; 31 | c = c ^ (b >> 13); 32 | a = a - b; 33 | a = a - c; 34 | a = a ^ (c >> 12); 35 | b = b - c; 36 | b = b - a; 37 | b = b ^ (a << 16); 38 | c = c - a; 39 | c = c - b; 40 | c = c ^ (b >> 5); 41 | a = a - b; 42 | a = a - c; 43 | a = a ^ (c >> 3); 44 | b = b - c; 45 | b = b - a; 46 | b = b ^ (a << 10); 47 | c = c - a; 48 | c = c - b; 49 | c = c ^ (b >> 15); 50 | return c; 51 | } 52 | int main(int argc, char **argv) { 53 | if (argc < 2) { 54 | fprintf(stderr, "I need a file to mutate on"); 55 | exit(1); 56 | } 57 | std::ifstream infile(argv[1], std::ios::binary | std::ios::ate); 58 | std::streamsize size = infile.tellg(); 59 | infile.seekg(0, std::ios::beg); 60 | 61 | std::vector buffer(MAX_SIZE); 62 | if (infile.read(buffer.data(), size)) { 63 | srand(mix(clock(), time(NULL), getpid())); 64 | createISelMutator(); 65 | unsigned int Seed = rand(); 66 | if (argc > 2) { 67 | Seed = atoi(argv[2]); 68 | } 69 | llvm::errs() << Seed << "\n"; 70 | bool validateMode = false; 71 | if (argc > 3 && argv[3][1] == 'v') { 72 | validateMode = true; 73 | } 74 | size_t newSize = 75 | LLVMFuzzerCustomMutator((uint8_t *)buffer.data(), size, MAX_SIZE, Seed); 76 | if (!validateMode) { 77 | std::ofstream outbc = 78 | 
std::ofstream("out.bc", std::ios::out | std::ios::binary); 79 | outbc.write(buffer.data(), newSize); 80 | outbc.close(); 81 | } 82 | llvm::LLVMContext Context; 83 | std::unique_ptr M = 84 | llvm::parseModule((uint8_t *)buffer.data(), newSize, Context); 85 | #ifdef DEBUG 86 | if (!validateMode) 87 | M->dump(); 88 | #endif 89 | /* 90 | std::error_code EC; 91 | llvm::raw_fd_ostream outll("out.ll", EC); 92 | M->print(outll, nullptr); 93 | */ 94 | // llvm::errs() << "Verifing Module..."; 95 | if (verifyModule(*M, &llvm::errs(), nullptr)) { 96 | llvm::errs() << "Verifier failed. Seed: " << Seed << "\n"; 97 | // llvm::errs() << *M << "\n"; 98 | } else { 99 | // llvm::errs() << "Good.\n"; 100 | } 101 | } else { 102 | fprintf(stderr, "I can't read the file."); 103 | } 104 | infile.close(); 105 | return 0; 106 | } -------------------------------------------------------------------------------- /scripts/collect_bad_inputs.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import random 3 | import shutil 4 | import subprocess 5 | from typing import Iterable 6 | from tap import Tap 7 | 8 | from lib.experiment import get_all_experiments 9 | from lib.time_parser import get_time_in_seconds 10 | 11 | 12 | class Args(Tap): 13 | input: str 14 | """Path to the fuzzing output directory""" 15 | 16 | output: str 17 | """Path to the directory to write bad inputs and seeds to""" 18 | 19 | time: str 20 | """ 21 | the threshold duration for an experiment to be considered failed. 22 | Current input that cause an experiment to fail in less than this time 23 | is considered a bad input. 24 | (e.g. '100s', '30m', '2h', '1d') 25 | """ 26 | 27 | n: int = 256 28 | """Number of random seeds to test for each bad input""" 29 | 30 | driver: str = "mutator/build/MutatorDriver" 31 | """Path to the mutator driver executable""" 32 | 33 | def configure(self) -> None: 34 | self.add_argument("input") 35 | self.add_argument("-o", "--output") 36 | self.add_argument("-t", "--time") 37 | 38 | def get_time_in_seconds(self) -> int: 39 | return get_time_in_seconds(self.time) 40 | 41 | 42 | def copy_bad_inputs( 43 | fuzzing_out_dir: Path, out_dir: Path, time_secs: int 44 | ) -> Iterable[Path]: 45 | for expr in get_all_experiments(fuzzing_out_dir): 46 | if expr.run_time < time_secs: 47 | dest_path = out_dir.joinpath(expr.name + ".bc") 48 | 49 | if not expr.cur_input_path.exists(): 50 | print(f"Warning: {expr.cur_input_path} does not exist!") 51 | continue 52 | 53 | shutil.copy(expr.cur_input_path, dest_path) 54 | yield dest_path 55 | 56 | 57 | def mutate(mutator_driver: Path, input_bc: Path, seed: int) -> int: 58 | return subprocess.run( 59 | [mutator_driver, input_bc, str(seed)], 60 | stdout=subprocess.DEVNULL, 61 | stderr=subprocess.DEVNULL, 62 | ).returncode 63 | 64 | 65 | def collect_bad_seeds(mutator_driver: Path, input_bc: Path, n: int) -> Iterable[int]: 66 | for _ in range(n): 67 | seed = random.randint(0, 4294967295) 68 | ret_code = mutate(mutator_driver, input_bc, seed) 69 | if ret_code != 0: 70 | yield seed 71 | 72 | 73 | def main() -> None: 74 | args = Args().parse_args() 75 | 76 | out_dir = Path(args.output) 77 | out_dir.mkdir(exist_ok=True) 78 | 79 | bad_input_paths = list( 80 | copy_bad_inputs( 81 | fuzzing_out_dir=Path(args.input), 82 | out_dir=out_dir, 83 | time_secs=args.get_time_in_seconds(), 84 | ) 85 | ) 86 | 87 | print(f"{len(bad_input_paths)} bad inputs written to dir {out_dir}.") 88 | 89 | for bad_input_path in bad_input_paths: 90 | print(f"Collecting 
bad seeds for {bad_input_path}...") 91 | 92 | bad_input_path.parent.joinpath( 93 | bad_input_path.name.removesuffix(".bc") + ".seeds.txt" 94 | ).write_text( 95 | "\n".join( 96 | str(seed) 97 | for seed in collect_bad_seeds( 98 | mutator_driver=Path(args.driver), 99 | input_bc=bad_input_path, 100 | n=args.n, 101 | ) 102 | ) 103 | ) 104 | 105 | print("Done.") 106 | 107 | 108 | if __name__ == "__main__": 109 | main() 110 | -------------------------------------------------------------------------------- /scripts/combine_fuzzing_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | from functools import reduce 5 | from typing import Callable 6 | 7 | from lib import IRFUZZER_DATA_ENV 8 | from lib.experiment import Experiment, get_all_experiments 9 | from lib.fs import subdirs_of 10 | 11 | 12 | class BlackList: 13 | name: str 14 | func: Callable[[Experiment, int], bool] 15 | 16 | def __init__(self, name, func): 17 | self.name = name 18 | self.func = func 19 | 20 | def ignore(self, expr: Experiment, mapped_id: int): 21 | do_ignore = self.func(expr, mapped_id) 22 | if do_ignore: 23 | print("\t", self.name, "Failed") 24 | return do_ignore 25 | 26 | 27 | use_xcore_makeup = BlackList( 28 | "use_xcore_makeup", 29 | lambda expr_info, _: "xcore" == expr_info.arch 30 | and "xcore-makeup" not in expr_info.expr_path, 31 | ) 32 | max_five_expr = BlackList("max_five_expr", lambda _, mapped_id: mapped_id > 4) 33 | fuzzed_long_enough = BlackList( 34 | "fuzzed_long_enough", lambda expr_info, _: expr_info.run_time < 259000 35 | ) 36 | ignore_arm64 = BlackList( 37 | "ignore_arm64", lambda expr_info, mapped_id: "arm64" in expr_info.arch 38 | ) 39 | 40 | blacklists = [use_xcore_makeup, max_five_expr, fuzzed_long_enough, ignore_arm64] 41 | 42 | 43 | def merge_subdirs_by_symlink(src: str, dest: str) -> None: 44 | for archive_dir in subdirs_of(src): 45 | for expr in get_all_experiments(archive_dir.path): 46 | symlink_dest_dir = os.path.join( 47 | dest, expr.fuzzer, expr.isel, str(expr.target) 48 | ) 49 | os.makedirs(symlink_dest_dir, exist_ok=True) 50 | mapped_id = 1 + max( 51 | [ 52 | -1, 53 | *( 54 | int(dir_entry.name) 55 | for dir_entry in subdirs_of(symlink_dest_dir) 56 | ), 57 | ] 58 | ) 59 | symlink_src = expr.path 60 | symlink_dest = os.path.join(symlink_dest_dir, str(mapped_id)) 61 | print( 62 | symlink_dest, 63 | " -> ", 64 | symlink_src, 65 | flush=True, 66 | ) 67 | 68 | if reduce( 69 | lambda a, b: a or b, 70 | [bl.ignore(expr, mapped_id) for bl in blacklists], 71 | ): 72 | print("NOT USED", flush=True) 73 | else: 74 | os.symlink(symlink_src, symlink_dest) 75 | print("DONE", flush=True) 76 | 77 | 78 | def main() -> None: 79 | parser = argparse.ArgumentParser(description="Combine experiments into one root.") 80 | parser.add_argument( 81 | "-i", 82 | "--input", 83 | type=str, 84 | default="", 85 | help=f"The directory containing all inputs. 
Default to ${IRFUZZER_DATA_ENV}", 86 | ) 87 | args = parser.parse_args() 88 | if args.input == "": 89 | args.input = os.getenv(IRFUZZER_DATA_ENV) 90 | if args.input == None: 91 | logging.error( 92 | f"Input directory not set, set --input or {IRFUZZER_DATA_ENV}" 93 | ) 94 | exit(1) 95 | # make sure current working directory is archive before running this 96 | merge_subdirs_by_symlink(args.input, os.path.join(args.input, "../combined")) 97 | 98 | 99 | if __name__ == "__main__": 100 | logging.basicConfig() 101 | logging.getLogger().setLevel(logging.INFO) 102 | main() 103 | -------------------------------------------------------------------------------- /scripts/collect_matcher_table_size.py: -------------------------------------------------------------------------------- 1 | from io import TextIOWrapper 2 | import multiprocessing 3 | import re 4 | import subprocess 5 | from pathlib import Path 6 | from typing import Optional 7 | 8 | from tap import Tap 9 | from lib import LLVM 10 | from lib.fs import subdirs_of 11 | 12 | 13 | LLVM_AFL_BUILD_PATH = Path(LLVM, "build-afl") 14 | 15 | 16 | class Args(Tap): 17 | jobs: int = multiprocessing.cpu_count() 18 | output: Optional[str] = "scripts/lib/matcher_table_sizes.py" 19 | 20 | def configure(self) -> None: 21 | self.add_argument("-o", "--output") 22 | 23 | 24 | def get_obj_file_suffix(global_isel: bool) -> str: 25 | return "InstructionSelector" if global_isel else "ISelDAGToDAG" 26 | 27 | 28 | def remove_matcher_table_build_files(global_isel: bool) -> None: 29 | suffix = get_obj_file_suffix(global_isel) 30 | 31 | for target_dir in subdirs_of(Path(LLVM_AFL_BUILD_PATH, "lib/Target")): 32 | if not target_dir.is_dir() or target_dir.name == "CMakeFiles": 33 | continue 34 | 35 | backend = target_dir.name 36 | paths = list( 37 | Path(target_dir).glob( 38 | f"CMakeFiles/LLVM{backend}CodeGen.dir/**/*{suffix}.cpp.o" 39 | ) 40 | ) 41 | 42 | for path in paths: 43 | print(f"Removing {path}...") 44 | path.unlink() 45 | 46 | 47 | def build_llvm_afl(jobs: int) -> str: 48 | args = ["ninja", "-j", str(jobs)] 49 | print(" ".join(args)) 50 | 51 | p = subprocess.run( 52 | args=args, 53 | cwd=LLVM_AFL_BUILD_PATH, 54 | stdout=subprocess.PIPE, 55 | stderr=subprocess.PIPE, 56 | ) 57 | 58 | stdout = p.stdout.decode("utf-8") 59 | stderr = p.stderr.decode("utf-8") 60 | 61 | if len(stderr) > 0: 62 | print(stderr) 63 | exit(1) 64 | 65 | p.check_returncode() 66 | 67 | ## Filter out compiler outputs, mostly warnings. 
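    # (Editorial note, not part of the original file: the filter below keeps only
    # lines starting with "[", i.e. ninja progress lines such as
    # "[123/456] Building CXX object ..." and the instrumentation's
    # "[+] MatcherTable size: N" lines, which is exactly the line pairing that
    # get_output_pattern() matches later on.)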
68 | stdout = "\n".join(filter(lambda l: len(l) > 1 and l[0] == "[", stdout.split("\n"))) 69 | 70 | print(stdout) 71 | 72 | return stdout 73 | 74 | 75 | def get_output_pattern(global_isel: bool) -> str: 76 | suffix = get_obj_file_suffix(global_isel) 77 | line1 = rf"\[(\d+)/\d+\] Building CXX object lib/Target/.+/CMakeFiles/.+/(.+){suffix}\.cpp\.o" 78 | line2 = r"\[\+\] MatcherTable size: (\d+)" 79 | return rf"{line1}\n{line2}" 80 | 81 | 82 | def extract_matcher_table_size(stdout: str, global_isel: bool) -> dict[str, int]: 83 | matches = re.findall(get_output_pattern(global_isel), stdout) 84 | 85 | table_sizes = {} 86 | 87 | for match in matches: 88 | backend = match[1] 89 | table_size = int(match[2]) 90 | table_sizes[backend] = table_size 91 | 92 | return table_sizes 93 | 94 | 95 | def dump_py( 96 | name: str, dict: dict[str, int], file: Optional[TextIOWrapper] = None 97 | ) -> None: 98 | print(name + ": dict[str, int] = {", file=file) 99 | for key in sorted(dict.keys()): 100 | print(f' "{key}": {dict[key]},', file=file) 101 | print("}", file=file) 102 | 103 | 104 | def main() -> None: 105 | args = Args().parse_args() 106 | 107 | remove_matcher_table_build_files(global_isel=False) 108 | remove_matcher_table_build_files(global_isel=True) 109 | 110 | stdout = build_llvm_afl(jobs=args.jobs) 111 | 112 | dag_isel_table_sizes = extract_matcher_table_size(stdout, global_isel=False) 113 | global_isel_table_sizes = extract_matcher_table_size(stdout, global_isel=True) 114 | 115 | f = open(args.output, "w") if args.output and args.output != "-" else None 116 | 117 | dump_py("DAGISEL_MATCHER_TABLE_SIZES", dag_isel_table_sizes, file=f) 118 | print(file=f) 119 | dump_py("GISEL_MATCHER_TABLE_SIZES", global_isel_table_sizes, file=f) 120 | 121 | if f: 122 | f.close() 123 | 124 | 125 | if __name__ == "__main__": 126 | main() 127 | -------------------------------------------------------------------------------- /scripts/batch_classify.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import subprocess 4 | 5 | from classify import classify 6 | from pathlib import Path 7 | 8 | from lib import LLC, LLVM_DIS 9 | from lib.fs import subdirs_of 10 | from lib.llc_command import LLCCommand 11 | from lib.process_concurrency import run_concurrent_subprocesses 12 | from lib.target import Target, TargetFilter 13 | 14 | 15 | def classify_wrapper( 16 | input_dir: Path, 17 | output_dir: Path, 18 | target: Target, 19 | global_isel: bool = False, 20 | generate_ll_files: bool = True, 21 | ) -> None: 22 | llc_command = LLCCommand(target=target, global_isel=global_isel) 23 | args = [str(LLC), *llc_command.get_options(output="-")] 24 | 25 | print(f"Start classifying {input_dir} using '{(' '.join(args))}'...") 26 | 27 | classify( 28 | args, 29 | input_dir, 30 | output_dir, 31 | force=True, 32 | verbose=False, 33 | create_symlink_to_source=True, 34 | hash_stacktrace_only=True, 35 | hash_op_code_only_for_isel_crash=True, 36 | remove_addr_in_stacktrace=True, 37 | ignore_undefined_external_symbol=True, 38 | ) 39 | 40 | print(f"Done classifying {input_dir} using '{(' '.join(args))}'.") 41 | 42 | if generate_ll_files: 43 | print(f"Generating human-readable IR files for {output_dir}...") 44 | 45 | run_concurrent_subprocesses( 46 | Path(output_dir).rglob("*.bc"), 47 | lambda ir_bc_path: subprocess.Popen( 48 | args=[LLVM_DIS, ir_bc_path], 49 | stdout=subprocess.DEVNULL, 50 | stderr=subprocess.DEVNULL, 51 | ), 52 | ) 53 | 54 | print(f"Done generating 
human-readable IR files for {output_dir}.") 55 | 56 | 57 | def batch_classify( 58 | input_root_dir: Path, 59 | output_root_dir: Path, 60 | global_isel: bool = False, 61 | generate_ll_files: bool = True, 62 | target_filter: TargetFilter = lambda _: True, 63 | ) -> None: 64 | for target_dir in subdirs_of(input_root_dir): 65 | target = Target.parse(target_dir.name) 66 | 67 | if not target_filter(target): 68 | continue 69 | 70 | for replicate_dir in subdirs_of(target_dir.path): 71 | try: 72 | classify_wrapper( 73 | input_dir=Path(replicate_dir.path, "default", "crashes"), 74 | output_dir=output_root_dir.joinpath( 75 | target_dir.name, replicate_dir.name 76 | ), 77 | target=target, 78 | global_isel=global_isel, 79 | generate_ll_files=generate_ll_files, 80 | ) 81 | except Exception: 82 | logging.exception( 83 | f"Something went wrong when processing {target_dir.path}" 84 | ) 85 | 86 | 87 | def main() -> None: 88 | parser = argparse.ArgumentParser( 89 | description="Batch classify LLVM crashes", 90 | ) 91 | 92 | parser.add_argument( 93 | "-i", 94 | "--input", 95 | type=str, 96 | required=True, 97 | help="The input directory containing all fuzzer directories", 98 | ) 99 | 100 | parser.add_argument( 101 | "-o", 102 | "--output", 103 | type=str, 104 | required=True, 105 | help="The output directory", 106 | ) 107 | 108 | args = parser.parse_args() 109 | 110 | for fuzzer_dir in subdirs_of(args.input): 111 | for isel_dir in subdirs_of(fuzzer_dir.path): 112 | batch_classify( 113 | input_root_dir=Path(isel_dir.path), 114 | output_root_dir=Path(args.output, fuzzer_dir.name, isel_dir.name), 115 | global_isel=isel_dir.name == "gisel", 116 | ) 117 | 118 | 119 | if __name__ == "__main__": 120 | main() 121 | -------------------------------------------------------------------------------- /scripts/lib/target.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | import re 3 | from typing import Callable, Iterable, Literal, Optional 4 | 5 | from lib.triple import Triple 6 | 7 | 8 | class Target: 9 | triple: Triple 10 | cpu: Optional[str] 11 | attrs: set[str] 12 | 13 | @property 14 | def backend(self) -> str: 15 | return self.triple.backend 16 | 17 | def __init__( 18 | self, 19 | triple: Triple | str, 20 | cpu: Optional[str] = None, 21 | attrs: Iterable[str] | str | None = None, 22 | ) -> None: 23 | self.triple = triple if isinstance(triple, Triple) else Triple.parse(triple) 24 | self.cpu = None if cpu is None or cpu == "" else cpu 25 | 26 | if isinstance(attrs, str): 27 | attrs = attrs.split(",") 28 | 29 | self.attrs = ( 30 | set( 31 | (("+" + attr) if not attr.startswith(("+", "-")) else attr) 32 | for attr in attrs 33 | ) 34 | if attrs 35 | else set() 36 | ) 37 | 38 | def __repr__(self) -> str: 39 | def get_parts() -> Iterable[str]: 40 | yield str(self.triple) 41 | 42 | if self.cpu: 43 | yield self.cpu 44 | 45 | for attr in sorted(self.attrs): 46 | yield attr 47 | 48 | return ",".join(get_parts()) 49 | 50 | def __eq__(self, __o: object) -> bool: 51 | if not isinstance(__o, Target): 52 | return False 53 | 54 | return ( 55 | self.triple == __o.triple 56 | and self.cpu == __o.cpu 57 | and self.attrs == __o.attrs 58 | ) 59 | 60 | def __hash__(self) -> int: 61 | return hash(str(self)) 62 | 63 | @staticmethod 64 | def parse(s: str) -> "Target": 65 | """ 66 | Acceptable formats: 67 | " [] [ ...]", 68 | " [] [,,...]", or 69 | "[,][,,,...]". 70 | An attribute must start with '+' or '-' to avoid ambiguity. 
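        Illustrative examples (editorial addition, derived from the parsing logic
        below; the CPU and attribute names are taken from scripts/lib/target_lists.py):
            "x86_64"                     -> triple only
            "aarch64 cortex-a710"        -> triple + cpu
            "riscv64 +v"                 -> triple + attribute, no cpu
            "x86_64,alderlake,+avx512f"  -> triple + cpu + attribute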
71 | """ 72 | 73 | parts = [part for part in re.split(r" |,", s) if part != ""] 74 | n = len(parts) 75 | 76 | assert n > 0 77 | 78 | # triple only 79 | if n == 1: 80 | return Target(triple=parts[0]) 81 | 82 | # triple with attributes 83 | if parts[1].startswith(("+", "-")): 84 | return Target( 85 | triple=parts[0], 86 | cpu=None, 87 | attrs=parts[1:], 88 | ) 89 | 90 | # triple with cpu 91 | if n == 2: 92 | return Target( 93 | triple=parts[0], 94 | cpu=parts[1], 95 | ) 96 | 97 | # triple with cpu and attributes 98 | return Target(triple=parts[0], cpu=parts[1], attrs=parts[2:]) 99 | 100 | 101 | TargetFilter = Callable[[Target], bool] 102 | TargetProp = Literal["triple", "arch", "vendor", "os", "abi", "cpu", "attrs"] 103 | 104 | 105 | def get_target_prop_selector( 106 | prop: TargetProp, 107 | ) -> Callable[[Target], Triple | str | set[str] | None]: 108 | match prop: 109 | case "triple": 110 | return lambda target: target.triple 111 | case "arch": 112 | return lambda target: target.triple.arch 113 | case "vendor": 114 | return lambda target: target.triple.vendor 115 | case "os": 116 | return lambda target: target.triple.os 117 | case "abi": 118 | return lambda target: target.triple.abi 119 | case "cpu": 120 | return lambda target: target.cpu 121 | case "attrs": 122 | return lambda target: target.attrs 123 | 124 | 125 | def get_target_prop_equality_checker( 126 | target: Target, prop: TargetProp 127 | ) -> Callable[[Target], bool]: 128 | prop_selector = get_target_prop_selector(prop) 129 | return lambda candidate: prop_selector(candidate) == prop_selector(target) 130 | 131 | 132 | def create_target_filter( 133 | target: Target, props_to_match: Iterable[TargetProp] 134 | ) -> TargetFilter: 135 | return reduce( 136 | lambda curr_filter, prop: ( 137 | lambda candidate: ( 138 | curr_filter(candidate) 139 | and get_target_prop_equality_checker(target, prop)(candidate) 140 | ) 141 | ), 142 | props_to_match, 143 | lambda _: True, 144 | ) 145 | -------------------------------------------------------------------------------- /scripts/collect_seeds.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | import subprocess 4 | from typing import Iterable, Literal, Optional 5 | 6 | from tap import Tap 7 | 8 | from lib.fs import count_files 9 | from lib.llc_command import LLCCommand 10 | from lib.llc_test import LLCTest, parse_llc_tests 11 | from lib.target import Target, TargetFilter, TargetProp, create_target_filter 12 | from lib.triple import Triple 13 | 14 | 15 | class Args(Tap): 16 | triple: str 17 | cpu: Optional[str] = None 18 | attrs: list[str] = [] 19 | global_isel: bool = False 20 | 21 | props_to_match: list[TargetProp] = ["triple", "cpu", "attrs"] 22 | """ 23 | the properties of a test target to match those of the fuzzing target, 24 | used to determine which tests should be included as seeds. 25 | """ 26 | 27 | seed_format: Literal["bc", "ll"] = "bc" 28 | """ 29 | whether to create symlinks to the tests, or assemble to bitcode (*.bc) files. 30 | """ 31 | 32 | timeout: Optional[float] = None 33 | """ 34 | only include test cases that can be compiled within the specified in seconds. 
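    In other words, a seed candidate is kept only if `llc` finishes compiling
    it within this many seconds; candidates that time out are dropped.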
35 | """ 36 | 37 | output: str 38 | """directory for storing seeds (will create if not exist)""" 39 | 40 | def configure(self) -> None: 41 | self.add_argument("-o", "--output") 42 | 43 | 44 | def get_runnable_llc_tests( 45 | backend: str, 46 | global_isel: bool, 47 | target_filter: TargetFilter = lambda _: True, 48 | ) -> Iterable[LLCTest]: 49 | return ( 50 | test 51 | for test in parse_llc_tests(backend_filter=lambda a: a == backend) 52 | if any( 53 | cmd.global_isel == global_isel and target_filter(cmd.target) 54 | for cmd in test.runnable_llc_commands 55 | ) 56 | ) 57 | 58 | 59 | def validate_seed( 60 | seed_path: Path, llc_command: LLCCommand, timeout_secs: Optional[float] = None 61 | ) -> bool: 62 | try: 63 | subprocess.run( 64 | llc_command.get_args(input=seed_path, output="-"), 65 | timeout=timeout_secs, 66 | check=True, 67 | stdout=subprocess.DEVNULL, 68 | stderr=subprocess.DEVNULL, 69 | ) 70 | 71 | return True 72 | except subprocess.CalledProcessError: 73 | logging.warning(f"Seed candidate {seed_path} does not compile.") 74 | except subprocess.TimeoutExpired: 75 | logging.warning(f"Seed candidate {seed_path} timed out when compiling.") 76 | 77 | return False 78 | 79 | 80 | def collect_seeds_from_tests( 81 | target: Target, 82 | global_isel: bool, 83 | out_dir_parent: Path, 84 | props_to_match: list[TargetProp] = ["triple", "cpu", "attrs"], 85 | dump_bc: bool = True, 86 | symlink_to_ll: bool = False, 87 | timeout_secs: Optional[float] = None, 88 | ) -> Path: 89 | print(f"Collecting seeds for target {target}...") 90 | 91 | out_dir = out_dir_parent.joinpath( 92 | "gisel" if global_isel else "dagisel", str(target) 93 | ) 94 | out_dir.mkdir(parents=True) 95 | 96 | llc_command = LLCCommand(target=target, global_isel=global_isel) 97 | 98 | for test in get_runnable_llc_tests( 99 | backend=target.backend, 100 | global_isel=global_isel, 101 | target_filter=create_target_filter(target, props_to_match), 102 | ): 103 | if symlink_to_ll and validate_seed(test.path, llc_command, timeout_secs): 104 | out_dir.joinpath(test.path.name).symlink_to(test.path.absolute()) 105 | 106 | if dump_bc: 107 | bc_path = test.dump_bc(out_dir) 108 | 109 | if not validate_seed(bc_path, llc_command, timeout_secs): 110 | bc_path.unlink(missing_ok=True) 111 | 112 | print(f"{count_files(out_dir)} seeds written to {out_dir}.") 113 | 114 | return out_dir 115 | 116 | 117 | def main() -> None: 118 | args = Args(underscores_to_dashes=True).parse_args() 119 | 120 | target = Target( 121 | triple=Triple.parse(args.triple), 122 | cpu=args.cpu, 123 | attrs=args.attrs[0] if len(args.attrs) == 1 else args.attrs, 124 | ) 125 | 126 | collect_seeds_from_tests( 127 | target=target, 128 | global_isel=args.global_isel, 129 | out_dir_parent=Path(args.output), 130 | props_to_match=args.props_to_match, 131 | dump_bc=args.seed_format == "bc", 132 | symlink_to_ll=args.seed_format == "ll", 133 | timeout_secs=args.timeout, 134 | ) 135 | 136 | 137 | if __name__ == "__main__": 138 | main() 139 | -------------------------------------------------------------------------------- /llvm-isel-afl/llvm-isel-fuzzer.cpp: -------------------------------------------------------------------------------- 1 | //===--- llvm-isel-fuzzer.cpp - Fuzzer for instruction selection ----------===// 2 | // 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 | // See https://llvm.org/LICENSE.txt for license information. 
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 | // 7 | //===----------------------------------------------------------------------===// 8 | // 9 | // Tool to fuzz instruction selection using libFuzzer. 10 | // 11 | //===----------------------------------------------------------------------===// 12 | 13 | #include "llvm/ADT/StringRef.h" 14 | #include "llvm/Analysis/TargetLibraryInfo.h" 15 | #include "llvm/Bitcode/BitcodeReader.h" 16 | #include "llvm/Bitcode/BitcodeWriter.h" 17 | #include "llvm/CodeGen/CommandFlags.h" 18 | #include "llvm/FuzzMutate/FuzzerCLI.h" 19 | #include "llvm/FuzzMutate/IRMutator.h" 20 | #include "llvm/FuzzMutate/Operations.h" 21 | #include "llvm/IR/Constants.h" 22 | #include "llvm/IR/LLVMContext.h" 23 | #include "llvm/IR/LegacyPassManager.h" 24 | #include "llvm/IR/Module.h" 25 | #include "llvm/IR/Verifier.h" 26 | #include "llvm/IRReader/IRReader.h" 27 | #if LLVM_VERSION_MAJOR >= 14 28 | #include "llvm/MC/TargetRegistry.h" 29 | #else 30 | #include "llvm/Support/TargetRegistry.h" 31 | #endif 32 | #include "llvm/Support/CommandLine.h" 33 | #include "llvm/Support/DataTypes.h" 34 | #include "llvm/Support/Debug.h" 35 | #include "llvm/Support/SourceMgr.h" 36 | #include "llvm/Support/TargetSelect.h" 37 | #include "llvm/Target/TargetMachine.h" 38 | 39 | #define DEBUG_TYPE "isel-fuzzer" 40 | 41 | using namespace llvm; 42 | 43 | static codegen::RegisterCodeGenFlags CGF; 44 | 45 | static cl::opt 46 | OptLevel("O", 47 | cl::desc("Optimization level. [-O0, -O1, -O2, or -O3] " 48 | "(default = '-O2')"), 49 | cl::Prefix, cl::ZeroOrMore, cl::init('2')); 50 | 51 | static cl::opt 52 | TargetTriple("mtriple", cl::desc("Override target triple for module")); 53 | 54 | static std::unique_ptr TM; 55 | 56 | extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { 57 | if (Size <= 1) 58 | // We get bogus data given an empty corpus - ignore it. 59 | return 0; 60 | 61 | LLVMContext Context; 62 | auto M = parseAndVerify(Data, Size, Context); 63 | if (!M) { 64 | errs() << "error: input module is broken!\n"; 65 | return 0; 66 | } 67 | 68 | // Set up the module to build for our target. 69 | M->setTargetTriple(TM->getTargetTriple().normalize()); 70 | M->setDataLayout(TM->createDataLayout()); 71 | 72 | // Build up a PM to do instruction selection. 73 | legacy::PassManager PM; 74 | TargetLibraryInfoImpl TLII(TM->getTargetTriple()); 75 | PM.add(new TargetLibraryInfoWrapperPass(TLII)); 76 | raw_null_ostream OS; 77 | TM->addPassesToEmitFile(PM, OS, nullptr, CGFT_Null); 78 | PM.run(*M); 79 | 80 | return 0; 81 | } 82 | 83 | extern "C" LLVM_ATTRIBUTE_USED int LLVMFuzzerInitialize(int *argc, 84 | char ***argv) { 85 | EnableDebugBuffering = true; 86 | 87 | /// TODO: Only init the one we are fuzzing, would that meke it faster? 88 | InitializeAllTargets(); 89 | InitializeAllTargetMCs(); 90 | InitializeAllAsmPrinters(); 91 | InitializeAllAsmParsers(); 92 | 93 | // handleExecNameEncodedBEOpts(*argv[0]); 94 | cl::ParseCommandLineOptions(*argc, *argv); 95 | 96 | if (TargetTriple.empty()) { 97 | errs() << *argv[0] << ": -mtriple must be specified\n"; 98 | exit(1); 99 | } 100 | 101 | Triple TheTriple = Triple(Triple::normalize(TargetTriple)); 102 | 103 | // Get the target specific parser. 104 | std::string Error; 105 | const Target *TheTarget = 106 | TargetRegistry::lookupTarget(codegen::getMArch(), TheTriple, Error); 107 | if (!TheTarget) { 108 | errs() << argv[0] << ": " << Error; 109 | return 1; 110 | } 111 | 112 | // Set up the pipeline like llc does. 
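  // For context (assumed invocation; the AFL driver in afl-driver.cpp builds
  // the argument list from environment variables rather than the command line):
  //   TRIPLE=aarch64 CPU=cortex-a75 ATTR=+fullfp16 afl-fuzz ... ./fuzzer
  // ends up passing "-mtriple=aarch64 -mcpu=cortex-a75 -mattr=+fullfp16",
  // which is consumed by the codegen flags queried below.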
113 | std::string CPUStr = codegen::getCPUStr(), 114 | FeaturesStr = codegen::getFeaturesStr(); 115 | 116 | CodeGenOpt::Level OLvl = CodeGenOpt::Default; 117 | switch (OptLevel) { 118 | default: 119 | errs() << argv[0] << ": invalid optimization level.\n"; 120 | return 1; 121 | case ' ': 122 | break; 123 | case '0': 124 | OLvl = CodeGenOpt::None; 125 | break; 126 | case '1': 127 | OLvl = CodeGenOpt::Less; 128 | break; 129 | case '2': 130 | OLvl = CodeGenOpt::Default; 131 | break; 132 | case '3': 133 | OLvl = CodeGenOpt::Aggressive; 134 | break; 135 | } 136 | 137 | TargetOptions Options = codegen::InitTargetOptionsFromCodeGenFlags(TheTriple); 138 | TM.reset(TheTarget->createTargetMachine( 139 | TheTriple.getTriple(), CPUStr, FeaturesStr, Options, 140 | codegen::getExplicitRelocModel(), codegen::getExplicitCodeModel(), OLvl)); 141 | assert(TM && "Could not allocate target machine!"); 142 | 143 | return 0; 144 | } 145 | -------------------------------------------------------------------------------- /scripts/lib/llc_test.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import re 3 | import subprocess 4 | from typing import Callable, Iterable, Optional 5 | from lib import LLVM, LLVM_AS 6 | 7 | from lib.llc_command import LLCCommand 8 | from lib.triple import Triple 9 | 10 | 11 | class LLCTest: 12 | path: Path 13 | 14 | backend: str 15 | 16 | test_commands: list[str] 17 | 18 | runnable_llc_commands: list[LLCCommand] 19 | """ 20 | llc commands that can be directly executed without crashing using the test case as an input 21 | without going through `opt`, `sed`, etc. first. 22 | """ 23 | 24 | code_lines: list[str] 25 | 26 | def __init__(self, backend: str, file_path: Path) -> None: 27 | assert file_path.name.endswith(".ll") 28 | 29 | self.backend = backend 30 | self.path = file_path 31 | self.test_commands = [] 32 | self.code_lines = [] 33 | 34 | with open(file_path) as file: 35 | multiline_command = False # whether last RUN header ends with '\' 36 | while line := file.readline(): 37 | if re.match(r".*;.+NOTE:(.+)", line): 38 | continue 39 | 40 | match = re.match(r".*;.*RUN:(.+)", line) 41 | 42 | if match is not None: 43 | command = match.group(1).strip() 44 | 45 | if multiline_command: 46 | last_command_prev_part = ( 47 | self.test_commands[-1].removesuffix("\\").strip() 48 | ) 49 | self.test_commands[-1] = f"{last_command_prev_part} {command}" 50 | else: 51 | self.test_commands.append(command) 52 | 53 | multiline_command = command.endswith("\\") 54 | else: 55 | assert ( 56 | not multiline_command 57 | ), f"ERROR: something unexpected happened when parsing commands for {file_path}" 58 | self.code_lines.append(line) 59 | 60 | assert ( 61 | len(self.test_commands) > 0 62 | ), f"WARNING: {file_path} does not contain any test command." 63 | 64 | assert ( 65 | len(self.code_lines) > 0 66 | ), f"WARNING: {file_path} does not contain any test code." 67 | 68 | llc_commands = [cmd for cmd in self.test_commands if "llc " in cmd] 69 | assert ( 70 | len(llc_commands) > 0 71 | ), f"WARNING: {file_path} does not contain any `llc` command." 
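        # For example, a RUN line such as
        #   "llc -mtriple=aarch64 -verify-machineinstrs < %s | FileCheck %s"
        # counts as runnable: only the part before the first '|' is kept, and
        # it must start with "llc" (commands piped through `opt`, `sed`, etc.
        # first are filtered out below).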
72 | 73 | default_triple = self.get_default_triple() 74 | runnable_llc_commands_raw = filter( 75 | lambda cmd: cmd.startswith("llc"), 76 | (cmd.split("|")[0] for cmd in llc_commands), 77 | ) 78 | 79 | try: 80 | self.runnable_llc_commands = [ 81 | LLCCommand.parse(cmd, default_triple) 82 | for cmd in runnable_llc_commands_raw 83 | ] 84 | except Exception as e: 85 | raise Exception( 86 | f"ERROR: Failed to parse llc command(s) in {file_path}." 87 | ) from e 88 | 89 | assert ( 90 | len(self.runnable_llc_commands) > 0 91 | ), f"WARNING: {file_path} does not contain any runnable `llc` command." 92 | 93 | def get_default_triple(self) -> Optional[Triple]: 94 | lines_with_triple = [ 95 | line for line in self.code_lines if line.startswith("target triple") 96 | ] 97 | 98 | if len(lines_with_triple) == 0: 99 | return None 100 | 101 | assert ( 102 | len(lines_with_triple) == 1 103 | ), f"UNEXPECTED: {self.path} has more than one triple specified in code" 104 | 105 | match = re.match(r'target triple ?= ?"([a-z0-9_\.-]+)"', lines_with_triple[0]) 106 | assert ( 107 | match is not None 108 | ), f"UNEXPECTED: failed to extract triple from '{lines_with_triple[0]}'" 109 | 110 | return Triple.parse(match.group(1)) 111 | 112 | def dump_bc(self, out_dir: Path) -> Path: 113 | out_path = out_dir.joinpath(self.path.name.removesuffix(".ll") + ".bc") 114 | 115 | process = subprocess.run( 116 | [ 117 | LLVM_AS, 118 | self.path, 119 | "-o", 120 | out_path, 121 | ] 122 | ) 123 | 124 | if process.returncode != 0: 125 | print(f"WARNING: failed to convert {self.path} to {out_path}") 126 | 127 | return out_path 128 | 129 | 130 | def parse_llc_tests( 131 | backend_filter: Callable[[str], bool] = lambda _: True, 132 | verbose: bool = False, 133 | ) -> Iterable[LLCTest]: 134 | total = 0 135 | success = 0 136 | 137 | for backend_dir in Path(LLVM, "llvm/test/CodeGen").iterdir(): 138 | if not backend_dir.is_dir() or not backend_filter(backend_dir.name): 139 | continue 140 | 141 | for file_path in backend_dir.rglob("*.ll"): 142 | try: 143 | yield LLCTest(backend_dir.name, file_path) 144 | success += 1 145 | except Exception as e: 146 | if verbose: 147 | print(e) 148 | total += 1 149 | 150 | print(f"Successfully parsed {success}/{total} LLC tests.") 151 | -------------------------------------------------------------------------------- /scripts/batch_compile.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import subprocess 4 | from typing import Iterable, Optional 5 | 6 | from lib.process_concurrency import MAX_SUBPROCESSES, run_concurrent_subprocesses 7 | 8 | 9 | def build_clang_flags( 10 | target: str, 11 | sysroot: Optional[str] = None, 12 | include_paths: list[str] = [], 13 | opt_level: str = "0", 14 | ) -> Iterable[str]: 15 | yield f"--target={target}" 16 | yield "-O" + opt_level 17 | 18 | if sysroot is not None: 19 | yield f"--sysroot={sysroot}" 20 | 21 | for include_path in include_paths: 22 | yield f"-I{include_path}" 23 | 24 | yield "-emit-llvm" 25 | yield "-c" 26 | 27 | 28 | def batch_compile( 29 | src_dir: str, out_dir: str, clang_flags: list[str], n_jobs: Optional[int] = None 30 | ) -> None: 31 | print( 32 | f'Compiling source code in {src_dir} to {out_dir} using "clang {" ".join(clang_flags)}"...' 
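        # For a hypothetical 'foo.c' compiled for target 'aarch64', the command
        # run below looks roughly like:
        #   clang --target=aarch64 -O2 -I../csmith/runtime \
        #         -I/usr/aarch64-linux-gnu/include -emit-llvm -c foo.c -o foo.bc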
33 | ) 34 | 35 | os.makedirs(out_dir, exist_ok=True) 36 | 37 | run_concurrent_subprocesses( 38 | iter=[ 39 | file_name for file_name in os.listdir(src_dir) if file_name.endswith(".c") 40 | ], 41 | subprocess_creator=lambda file_name: subprocess.Popen( 42 | args=[ 43 | "clang", 44 | *clang_flags, 45 | os.path.join(src_dir, file_name), 46 | "-o", 47 | os.path.join(out_dir, file_name.replace(".c", ".bc")), 48 | ], 49 | stderr=subprocess.DEVNULL, 50 | stdout=subprocess.DEVNULL, 51 | ), 52 | max_jobs=MAX_SUBPROCESSES if n_jobs is None else n_jobs, 53 | ) 54 | 55 | 56 | def main() -> None: 57 | parser = argparse.ArgumentParser(description="Batch compiling C code to LLVM IR") 58 | 59 | parser.add_argument( 60 | "-i", 61 | "--input", 62 | type=str, 63 | required=True, 64 | help="The input directory containing C source code files", 65 | ) 66 | 67 | parser.add_argument( 68 | "-o", 69 | "--output", 70 | type=str, 71 | required=True, 72 | help="The output directory for LLVM IR bytecode files", 73 | ) 74 | 75 | parser.add_argument( 76 | "-j", 77 | "--jobs", 78 | type=int, 79 | help="The number of concurrent subprocesses", 80 | ) 81 | 82 | parser.add_argument( 83 | "--csmith-root", 84 | type=str, 85 | default="../csmith", 86 | help="The root directory for CSmith repo", 87 | ) 88 | 89 | args = parser.parse_args() 90 | 91 | def batch_compile_wrapper( 92 | target: str, 93 | sysroot: Optional[str] = None, 94 | include: Optional[str] = None, 95 | apt_package: Optional[str] = None, 96 | link: Optional[str] = None, 97 | ) -> None: 98 | if not os.path.exists(args.csmith_root): 99 | print(f"ERROR: missing CSmith in {args.csmith_root}.") 100 | print(f"Run `git clone https://github.com/csmith-project/csmith.git {args.csmith_root}`") 101 | return 102 | 103 | if (include is not None and not os.path.exists(include)) or ( 104 | sysroot is not None and not os.path.exists(sysroot) 105 | ): 106 | print(f"ERROR: missing headers for target {target}.") 107 | if apt_package is not None: 108 | print(f"Run `sudo apt install {apt_package}`.") 109 | if link is not None: 110 | print(f"See {link} for how to get the required headers.") 111 | return 112 | 113 | batch_compile( 114 | src_dir=args.input, 115 | out_dir=os.path.join(args.output, target), 116 | clang_flags=list( 117 | build_clang_flags( 118 | target=target, 119 | sysroot=sysroot, 120 | include_paths=[os.path.join(args.csmith_root, "runtime")] 121 | + ([] if include is None else [include]), 122 | opt_level="2", 123 | ) 124 | ), 125 | n_jobs=args.jobs, 126 | ) 127 | 128 | batch_compile_wrapper( 129 | "i686", 130 | include="/usr/i686-linux-gnu/include", 131 | apt_package="libc6-dev-i386-cross", 132 | ) 133 | batch_compile_wrapper( 134 | "x86_64", 135 | include="/usr/x86_64-linux-gnu/include", 136 | apt_package="libc6-dev-amd64-cross", 137 | ) 138 | batch_compile_wrapper( 139 | "arm", 140 | include="/usr/arm-linux-gnueabi/include", 141 | apt_package="libc6-dev-armel-cross", 142 | ) 143 | batch_compile_wrapper( 144 | "aarch64", 145 | include="/usr/aarch64-linux-gnu/include", 146 | apt_package="libc6-dev-arm64-cross", 147 | ) 148 | batch_compile_wrapper( 149 | "riscv32", 150 | include="./riscv32/sysroot/usr/include", 151 | link="https://github.com/riscv-collab/riscv-gnu-toolchain", 152 | ) 153 | batch_compile_wrapper( 154 | "riscv64", 155 | include="/usr/riscv64-linux-gnu/include", 156 | apt_package="libc6-dev-riscv64-cross", 157 | ) 158 | batch_compile_wrapper( 159 | "wasm32-wasi", 160 | sysroot="./wasi-sdk-14.0/share/wasi-sysroot", 161 | 
link="https://github.com/WebAssembly/wasi-sdk", 162 | ) 163 | 164 | 165 | if __name__ == "__main__": 166 | main() 167 | -------------------------------------------------------------------------------- /mutator/src/mutator.cpp: -------------------------------------------------------------------------------- 1 | #include "mutator.h" 2 | 3 | #include "llvm/ADT/StringRef.h" 4 | #include "llvm/Analysis/TargetLibraryInfo.h" 5 | #include "llvm/Bitcode/BitcodeReader.h" 6 | #include "llvm/Bitcode/BitcodeWriter.h" 7 | #include "llvm/CodeGen/CommandFlags.h" 8 | #include "llvm/FuzzMutate/FuzzerCLI.h" 9 | #include "llvm/FuzzMutate/IRMutator.h" 10 | #include "llvm/FuzzMutate/Operations.h" 11 | #include "llvm/IR/Constants.h" 12 | #include "llvm/IR/DerivedTypes.h" 13 | #include "llvm/IR/LLVMContext.h" 14 | #include "llvm/IR/LegacyPassManager.h" 15 | #include "llvm/IR/Module.h" 16 | #include "llvm/IR/Verifier.h" 17 | #include "llvm/IRReader/IRReader.h" 18 | #include "llvm/Support/CommandLine.h" 19 | #include "llvm/Support/DataTypes.h" 20 | #include "llvm/Support/Debug.h" 21 | #include "llvm/Support/SourceMgr.h" 22 | #include "llvm/Support/TargetSelect.h" 23 | #include "llvm/Target/TargetMachine.h" 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | #ifdef DEBUG 33 | #include "llvm/IR/Verifier.h" 34 | #include "llvm/Transforms/Utils/Cloning.h" 35 | #endif 36 | using namespace llvm; 37 | 38 | static std::unique_ptr Mutator; 39 | 40 | extern "C" { 41 | 42 | void dumpOnFailure(unsigned int Seed, uint8_t *Data, size_t Size, 43 | size_t MaxSize) { 44 | time_t seconds = time(NULL); 45 | errs() << "Mutation failed, seed: " << Seed << "\n"; 46 | char oldname[256]; 47 | memset(oldname, 0, 256); 48 | sprintf(oldname, "%u-%zu-%zu.old.bc", Seed, MaxSize, seconds); 49 | std::ofstream oldoutfile = 50 | std::ofstream(oldname, std::ios::out | std::ios::binary); 51 | oldoutfile.write((char *)Data, Size); 52 | oldoutfile.close(); 53 | } 54 | 55 | void addVectorTypeGetters(std::vector &Types) { 56 | int VectorLength[] = {1, 2, 4, 8, 16, 32}; 57 | std::vector BasicTypeGetters(Types); 58 | for (auto typeGetter : BasicTypeGetters) { 59 | for (int length : VectorLength) { 60 | Types.push_back([typeGetter, length](LLVMContext &C) { 61 | return VectorType::get(typeGetter(C), length, false); 62 | }); 63 | } 64 | } 65 | } 66 | 67 | /// TODO: 68 | /// Type* getStructType(Context& C); 69 | 70 | void createISelMutator() { 71 | std::vector Types{ 72 | Type::getInt1Ty, Type::getInt8Ty, Type::getInt16Ty, Type::getInt32Ty, 73 | Type::getInt64Ty, Type::getFloatTy, Type::getDoubleTy}; 74 | std::vector ScalarTypes = Types; 75 | 76 | addVectorTypeGetters(Types); 77 | 78 | TypeGetter OpaquePtrGetter = [](LLVMContext &C) { 79 | return PointerType::get(Type::getInt32Ty(C), 0); 80 | }; 81 | Types.push_back(OpaquePtrGetter); 82 | 83 | // Copy scalar types to change distribution. 
84 | for (int i = 0; i < 5; i++) 85 | Types.insert(Types.end(), ScalarTypes.begin(), ScalarTypes.end()); 86 | 87 | std::vector> Strategies; 88 | std::vector Ops = InjectorIRStrategy::getDefaultOps(); 89 | 90 | Strategies.push_back(std::make_unique( 91 | InjectorIRStrategy::getDefaultOps())); 92 | Strategies.push_back(std::make_unique()); 93 | Strategies.push_back(std::make_unique()); 94 | Strategies.push_back(std::make_unique()); 95 | Strategies.push_back(std::make_unique()); 96 | Strategies.push_back(std::make_unique()); 97 | Strategies.push_back(std::make_unique()); 98 | Strategies.push_back(std::make_unique()); 99 | 100 | Mutator = 101 | std::make_unique(std::move(Types), std::move(Strategies)); 102 | } 103 | 104 | size_t LLVMFuzzerCustomMutator(uint8_t *Data, size_t Size, size_t MaxSize, 105 | unsigned int Seed) { 106 | LLVMContext Context; 107 | std::unique_ptr M; 108 | if (Size <= 1) 109 | // We get bogus data given an empty corpus - just create a new module. 110 | M.reset(new Module("M", Context)); 111 | else 112 | M = parseModule(Data, Size, Context); 113 | if (!M) { 114 | errs() << "Parse module error. No mutation is done. Data size: " << Size 115 | << ". Given data wrote to err.bc\n"; 116 | std::ofstream outfile = 117 | std::ofstream("err.bc", std::ios::out | std::ios::binary); 118 | outfile.write((char *)Data, Size); 119 | outfile.close(); 120 | #ifdef DEBUG 121 | exit(1); 122 | #else 123 | // We don't do any change. 124 | return Size; 125 | #endif 126 | } 127 | #ifdef DEBUG 128 | std::unique_ptr OldM = CloneModule(*M); 129 | #endif 130 | 131 | #ifdef DEBUG 132 | try { 133 | #endif 134 | srand(Seed); 135 | Seed = rand(); 136 | // for (int i = 0; i < 4; i++) { 137 | Mutator->mutateModule(*M, Seed, MaxSize); 138 | // } 139 | #ifdef DEBUG 140 | } catch (...) 
{ 141 | dumpOnFailure(Seed, Data, Size, MaxSize); 142 | return Size; 143 | } 144 | #endif 145 | 146 | #ifdef DEBUG 147 | uint8_t NewData[MaxSize]; 148 | size_t NewSize = writeModule(*M, NewData, MaxSize); 149 | LLVMContext NewC; 150 | auto NewM = parseModule(NewData, NewSize, NewC); 151 | if (NewM == nullptr) { 152 | dumpOnFailure(Seed, Data, Size, MaxSize); 153 | return Size; 154 | } else { 155 | memset(Data, 0, MaxSize); 156 | memcpy(Data, NewData, NewSize); 157 | return NewSize; 158 | } 159 | #else 160 | return writeModule(*M, Data, MaxSize); 161 | #endif 162 | } 163 | } -------------------------------------------------------------------------------- /scripts/process_data.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Iterable, Tuple 3 | import pandas as pd 4 | from matplotlib import pyplot 5 | import os 6 | import argparse 7 | import logging 8 | import subprocess 9 | 10 | from lib import IRFUZZER_DATA_ENV 11 | from lib.experiment import Experiment, get_all_experiments 12 | 13 | 14 | def iterate_over_all_experiments( 15 | dir: Path | str, allow_missing_data: bool = False 16 | ) -> Iterable[Tuple[Experiment, pd.DataFrame]]: 17 | for expr in get_all_experiments(dir): 18 | try: 19 | yield (expr, expr.read_plot_data()) 20 | except FileNotFoundError: 21 | if not allow_missing_data: 22 | raise 23 | 24 | 25 | def combine_last_row_of_each_experiment_data( 26 | experiments: Iterable[Tuple[Experiment, pd.DataFrame]], columns: list[str] 27 | ) -> pd.DataFrame: 28 | return pd.DataFrame( 29 | columns=["fuzzer", "isel", "target", "replicate", *columns], 30 | data=( 31 | [ 32 | exp.fuzzer, 33 | exp.isel, 34 | str(exp.target), 35 | exp.replicate_id, 36 | *df.tail(1)[columns].values.flatten().tolist(), 37 | ] 38 | for (exp, df) in experiments 39 | ), 40 | ) 41 | 42 | 43 | def generate_plots( 44 | experiments: Iterable[Tuple[Experiment, pd.DataFrame]], dir_out: str 45 | ) -> None: 46 | pyplot.ioff() 47 | 48 | for (experiment, df) in experiments: 49 | figure_path = os.path.join( 50 | dir_out, 51 | experiment.fuzzer, 52 | experiment.isel, 53 | str(experiment.target), 54 | str(experiment.replicate_id), 55 | ) 56 | os.makedirs(figure_path, exist_ok=True) 57 | 58 | try: 59 | df.plot(x="total_execs", y="saved_crashes").figure.savefig( 60 | os.path.join(figure_path, "crashes-vs-execs.png") 61 | ) 62 | df.plot(x="total_execs", y="shw_cvg").figure.savefig( 63 | os.path.join(figure_path, "shwcvg-vs-execs.png") 64 | ) 65 | df.plot(x="# relative_time", y="saved_crashes").figure.savefig( 66 | os.path.join(figure_path, "crashes-vs-time.png") 67 | ) 68 | df.plot(x="# relative_time", y="shw_cvg").figure.savefig( 69 | os.path.join(figure_path, "shwcvg-vs-time.png") 70 | ) 71 | except: 72 | print( 73 | f"ERROR: Cannot plot {experiment.fuzzer}/{experiment.isel}/{experiment.target}/{experiment.replicate_id}" 74 | ) 75 | 76 | pyplot.close() 77 | 78 | 79 | def get_last_col(args): 80 | df = combine_last_row_of_each_experiment_data( 81 | iterate_over_all_experiments(args.input, allow_missing_data=True), 82 | columns=[ 83 | "# relative_time", 84 | "total_execs", 85 | "bit_cvg", 86 | "shw_cvg", 87 | "corpus_count", 88 | ], 89 | ) 90 | outpath = os.path.join(args.output, "last_row_of_each_experiment.csv") 91 | df.to_csv(outpath, index=False) 92 | 93 | 94 | def get_summary(args): 95 | df = combine_last_row_of_each_experiment_data( 96 | iterate_over_all_experiments(args.input, allow_missing_data=True), 97 | columns=[ 98 | "# relative_time", 99 
| "total_execs", 100 | "bit_cvg", 101 | "shw_cvg", 102 | "corpus_count", 103 | ], 104 | ) 105 | outpath = os.path.join(args.output, "summary.csv") 106 | df_summary = ( 107 | df.drop(columns=["replicate"]) 108 | .groupby(["fuzzer", "isel", "target"]) 109 | .agg(["min", "max", "count", "mean", "std"]) 110 | ) 111 | 112 | df_summary.to_csv(outpath) 113 | 114 | 115 | def main() -> None: 116 | 117 | parser = argparse.ArgumentParser(description="Process fuzzing output") 118 | parser.add_argument( 119 | "-i", 120 | "--input", 121 | type=str, 122 | default="", 123 | help=f"The directory containing all inputs. Default to ${IRFUZZER_DATA_ENV}", 124 | ) 125 | parser.add_argument( 126 | "-o", 127 | "--output", 128 | type=str, 129 | default="./output", 130 | help="The directory containing processed results, will force removal if it exists.", 131 | ) 132 | parser.add_argument( 133 | "-t", 134 | "--type", 135 | type=str, 136 | choices=["LastCol", "Summary", "Plot", "Mann", "Data"], 137 | required=True, 138 | help="Type of the job you want me to do.", 139 | ) 140 | args = parser.parse_args() 141 | if args.input == "": 142 | args.input = os.getenv(IRFUZZER_DATA_ENV) 143 | if args.input == None: 144 | logging.error( 145 | f"Input directory not set, set --input or {IRFUZZER_DATA_ENV}" 146 | ) 147 | exit(1) 148 | if args.type != "Data": 149 | if os.path.exists(args.output): 150 | logging.warning(f"{args.output} exists, removing.") 151 | subprocess.run(["rm", "-rf", args.output]) 152 | os.mkdir(args.output) 153 | 154 | if args.type == "LastCol": 155 | get_last_col(args) 156 | elif args.type == "Summary": 157 | # TODO: All data required by summary can be found in expr_info now 158 | # maybe stop reading the whole csv as it is slow. 159 | get_summary(args) 160 | elif args.type == "Plot": 161 | generate_plots( 162 | experiments=iterate_over_all_experiments( 163 | args.input, allow_missing_data=True 164 | ), 165 | dir_out=args.output, 166 | ) 167 | elif args.type == "Mann": 168 | # Mann Whitney U Test to tell if we are statically significant. 
169 | pass 170 | 171 | 172 | if __name__ == "__main__": 173 | logging.basicConfig() 174 | logging.getLogger().setLevel(logging.INFO) 175 | main() 176 | -------------------------------------------------------------------------------- /scripts/compare_experiments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from pathlib import Path 4 | from typing import Iterable, Iterator, Tuple 5 | from matplotlib import pyplot 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from lib.plot_data import read_plot_data 10 | 11 | 12 | def interpolate_data( 13 | df: pd.DataFrame, x_col: str, y_col: str, desired_xs: Iterator[int] 14 | ) -> Iterable[Tuple[int, int]]: 15 | desired_x = next(desired_xs) 16 | prev_row = None 17 | 18 | for _, curr_row in df.iterrows(): 19 | curr_x, curr_y = curr_row[x_col], curr_row[y_col] 20 | 21 | if desired_x is None: 22 | return 23 | 24 | if curr_x == desired_x: 25 | # no need for interpolation 26 | yield (desired_x, curr_y) 27 | desired_x = next(desired_xs, None) 28 | elif curr_x > desired_x: 29 | # linear interpolation 30 | if prev_row is None: 31 | raise Exception("Not supported yet") 32 | 33 | prev_x, prev_y = prev_row[x_col], prev_row[y_col] 34 | slope = (curr_y - prev_y) / (curr_x - prev_x) 35 | yield (desired_x, prev_y + (desired_x - prev_x) * slope) 36 | desired_x = next(desired_xs, None) 37 | 38 | prev_row = curr_row 39 | 40 | 41 | def interpolate_data_multiple( 42 | dfs: Iterable[pd.DataFrame], x_col: str, y_col: str, desired_xs: range 43 | ): 44 | return pd.DataFrame( 45 | { 46 | x_col: desired_xs, 47 | **dict( 48 | ( 49 | f"{y_col}_{idx}", 50 | [ 51 | y 52 | for _, y in interpolate_data(df, x_col, y_col, iter(desired_xs)) 53 | ], 54 | ) 55 | for idx, df in enumerate(dfs) 56 | ), 57 | } 58 | ) 59 | 60 | 61 | def get_confidence_intervals( 62 | df: pd.DataFrame, x_col: str, summary_col_prefix: str, t: float 63 | ) -> pd.DataFrame: 64 | df_temp = df.drop(columns=[x_col]) 65 | 66 | n = df_temp.count(axis=1) 67 | mean = df_temp.mean(axis=1) 68 | std_dev = df_temp.std(axis=1) 69 | std_err = std_dev / np.sqrt(n) 70 | 71 | return pd.DataFrame( 72 | { 73 | x_col: df[x_col], 74 | f"{summary_col_prefix}_ci_lower": mean - t * std_err, 75 | f"{summary_col_prefix}_mean": mean, 76 | f"{summary_col_prefix}_ci_upper": mean + t * std_err, 77 | } 78 | ) 79 | 80 | 81 | def iterate_plot_data_for_replicates( 82 | dir: str, n_replicate: int 83 | ) -> Iterable[pd.DataFrame]: 84 | return ( 85 | read_plot_data( 86 | Path( 87 | dir, 88 | str(i), 89 | "default/plot_data", 90 | ) 91 | ) 92 | for i in range(n_replicate) 93 | ) 94 | 95 | 96 | def compare( 97 | dir_mt_off: str, 98 | dir_mt_on: str, 99 | n_replicate: int, 100 | x_col: str, 101 | y_col: str, 102 | desired_xs: range, 103 | t: float, 104 | ) -> pd.DataFrame: 105 | df_off = interpolate_data_multiple( 106 | dfs=iterate_plot_data_for_replicates(dir_mt_off, n_replicate), 107 | x_col=x_col, 108 | y_col=y_col, 109 | desired_xs=desired_xs, 110 | ) 111 | 112 | df_on = interpolate_data_multiple( 113 | dfs=iterate_plot_data_for_replicates(dir_mt_on, n_replicate), 114 | x_col=x_col, 115 | y_col=y_col, 116 | desired_xs=desired_xs, 117 | ) 118 | 119 | df_off_ci = get_confidence_intervals(df_off, x_col, y_col, t) 120 | df_on_ci = get_confidence_intervals(df_on, x_col, y_col, t) 121 | 122 | return pd.merge(left=df_off_ci, right=df_on_ci, on=x_col, suffixes=("_off", "_on")) 123 | 124 | 125 | def main(): 126 | parser = argparse.ArgumentParser( 127 | description="Compare 
matcher table coverage of experiments", 128 | ) 129 | 130 | parser.add_argument( 131 | "-off", 132 | "--dir-mt-off", 133 | type=str, 134 | required=True, 135 | help="The dir of fuzzing results with matcher table on", 136 | ) 137 | 138 | parser.add_argument( 139 | "-on", 140 | "--dir-mt-on", 141 | type=str, 142 | required=True, 143 | help="The dir of fuzzing results with matcher table on", 144 | ) 145 | 146 | parser.add_argument( 147 | "-o", 148 | "--out", 149 | type=str, 150 | default="compare-all.png", 151 | help="The path to the figure to be saved", 152 | ) 153 | 154 | args = parser.parse_args() 155 | 156 | x_col = "# relative_time" 157 | y_col = "shw_cvg" 158 | desired_xs = range(800, 80000 + 1, 200) 159 | t = 2.776 # t(df=4, two-tail alpha=0.05) 160 | dir_mt_off = os.path.join(args.dir_mt_off, "irfuzzer/dagisel") 161 | dir_mt_on = os.path.join(args.dir_mt_on, "irfuzzer/dagisel") 162 | archs = ["aarch64", "arm", "nvptx", "riscv64", "x86_64"] 163 | 164 | fig, axs = pyplot.subplots( 165 | nrows=1, ncols=len(archs), layout="constrained", figsize=(12, 2.4) 166 | ) 167 | 168 | for i, arch in enumerate(archs): 169 | df_ci = compare( 170 | dir_mt_off=os.path.join(dir_mt_off, arch), 171 | dir_mt_on=os.path.join(dir_mt_on, arch), 172 | n_replicate=5, 173 | x_col=x_col, 174 | y_col=y_col, 175 | desired_xs=desired_xs, 176 | t=t, 177 | ) 178 | 179 | axs[i].set_title(arch) 180 | 181 | axs[i].plot(x_col, f"{y_col}_mean_off", data=df_ci, color="#4899dc") 182 | axs[i].fill_between( 183 | x=x_col, 184 | y1=f"{y_col}_ci_lower_off", 185 | y2=f"{y_col}_ci_upper_off", 186 | data=df_ci, 187 | color="#a2ccee", 188 | alpha=0.5, 189 | ) 190 | 191 | axs[i].plot(x_col, f"{y_col}_mean_on", data=df_ci, color="#f89d49") 192 | axs[i].fill_between( 193 | x=x_col, 194 | y1=f"{y_col}_ci_lower_on", 195 | y2=f"{y_col}_ci_upper_on", 196 | data=df_ci, 197 | color="#fccea7", 198 | alpha=0.5, 199 | ) 200 | 201 | axs[0].set_ylabel("Matcher Table Coverage") 202 | axs[len(archs) // 2].set_xlabel("Time (sec)") 203 | axs[0].legend( 204 | [ 205 | "Matcher Table Off (Mean)", 206 | "Matcher Table Off (95% CI)", 207 | "Matcher Table On (Mean)", 208 | "Matcher Table On (95% CI)", 209 | ], 210 | bbox_to_anchor=(0, 1.25, 6, 0.2), 211 | loc="lower left", 212 | mode="expand", 213 | ncol=4, 214 | ) 215 | 216 | fig.savefig(args.out) 217 | 218 | 219 | if __name__ == "__main__": 220 | main() 221 | -------------------------------------------------------------------------------- /mutator/src/afl-mutator.c: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include "afl-fuzz.h" 4 | #include "mutator.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #define DATA_SIZE (4096) 12 | typedef struct custom_ir_mutator { 13 | afl_state_t *afl; 14 | uint8_t *mutator_buf; 15 | } CustomIRMutator; 16 | 17 | /** 18 | * Initialize this custom mutator 19 | * 20 | * @param[in] afl a pointer to the internal state object. Can be ignored for 21 | * now. 22 | * @param[in] seed A seed for this mutator - the same seed should always mutate 23 | * in the same way. 24 | * @return Pointer to the data object this custom mutator instance should use. 25 | * There may be multiple instances of this mutator in one afl-fuzz run! 26 | * Return NULL on error. 
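 *
 * Typically this library is loaded by pointing AFL_CUSTOM_MUTATOR_LIBRARY at
 * the built shared object before launching afl-fuzz, e.g.
 *   AFL_CUSTOM_MUTATOR_LIBRARY=/path/to/libmutator.so afl-fuzz -i IN -o OUT ./target
 * (the path shown is illustrative).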
27 | */ 28 | CustomIRMutator *afl_custom_init(afl_state_t *afl, unsigned int seed) { 29 | 30 | CustomIRMutator *mutator = 31 | (CustomIRMutator *)calloc(1, sizeof(CustomIRMutator)); 32 | if (!mutator) { 33 | perror("afl_custom_init alloc"); 34 | return NULL; 35 | } 36 | 37 | mutator->afl = afl; 38 | // The mutator can be think of as a deterministic function where 39 | // new_M = Mutate(M, seed); 40 | srand(seed); 41 | 42 | if ((mutator->mutator_buf = (u8 *)malloc(MAX_FILE)) == NULL) { 43 | 44 | free(mutator); 45 | perror("mutator_buf alloc"); 46 | return NULL; 47 | } 48 | createISelMutator(); 49 | return mutator; 50 | } 51 | 52 | /** 53 | * Perform custom mutations on a given input 54 | * 55 | * (Optional for now. Required in the future) 56 | * 57 | * @param[in] data pointer returned in afl_custom_init for this fuzz case 58 | * @param[in] buf Pointer to input data to be mutated 59 | * @param[in] buf_size Size of input data 60 | * @param[out] out_buf the buffer we will work on. we can reuse *buf. NULL on 61 | * error. 62 | * @param[in] add_buf Buffer containing the additional test case 63 | * @param[in] add_buf_size Size of the additional test case 64 | * @param[in] max_size Maximum size of the mutated output. The mutation must not 65 | * produce data larger than max_size. 66 | * @return Size of the mutated output. 67 | */ 68 | size_t afl_custom_fuzz(CustomIRMutator *mutator, uint8_t *buf, size_t buf_size, 69 | u8 **out_buf, uint8_t *add_buf, 70 | size_t add_buf_size, // add_buf can be NULL 71 | size_t max_size) { 72 | 73 | memcpy(mutator->mutator_buf, buf, buf_size); 74 | size_t out_size = 75 | LLVMFuzzerCustomMutator(mutator->mutator_buf, buf_size, max_size, rand()); 76 | 77 | /* return size of mutated data */ 78 | *out_buf = mutator->mutator_buf; 79 | return out_size; 80 | } 81 | 82 | /** 83 | * A post-processing function to use right before AFL writes the test case to 84 | * disk in order to execute the target. 85 | * 86 | * (Optional) If this functionality is not needed, simply don't define this 87 | * function. 88 | * 89 | * @param[in] data pointer returned in afl_custom_init for this fuzz case 90 | * @param[in] buf Buffer containing the test case to be executed 91 | * @param[in] buf_size Size of the test case 92 | * @param[out] out_buf Pointer to the buffer containing the test case after 93 | * processing. External library should allocate memory for out_buf. 94 | * The buf pointer may be reused (up to the given buf_size); 95 | * @return Size of the output buffer after processing or the needed amount. 96 | * A return of 0 indicates an error. 97 | */ 98 | /* 99 | size_t afl_custom_post_process(CustomIRMutator *data, uint8_t *buf, 100 | size_t buf_size, uint8_t **out_buf) { 101 | 102 | uint8_t *post_process_buf = 103 | maybe_grow(BUF_PARAMS(data, post_process), buf_size + 5); 104 | if (!post_process_buf) { 105 | 106 | perror("custom mutator realloc failed."); 107 | *out_buf = NULL; 108 | return 0; 109 | } 110 | 111 | memcpy(post_process_buf + 5, buf, buf_size); 112 | post_process_buf[0] = 'A'; 113 | post_process_buf[1] = 'F'; 114 | post_process_buf[2] = 'L'; 115 | post_process_buf[3] = '+'; 116 | post_process_buf[4] = '+'; 117 | 118 | *out_buf = post_process_buf; 119 | 120 | return buf_size + 5; 121 | } 122 | */ 123 | 124 | /** 125 | * This method is called at the start of each trimming operation and receives 126 | * the initial buffer. It should return the amount of iteration steps possible 127 | * on this input (e.g. 
if your input has n elements and you want to remove 128 | * them one by one, return n, if you do a binary search, return log(n), 129 | * and so on...). 130 | * 131 | * If your trimming algorithm doesn't allow you to determine the amount of 132 | * (remaining) steps easily (esp. while running), then you can alternatively 133 | * return 1 here and always return 0 in post_trim until you are finished and 134 | * no steps remain. In that case, returning 1 in post_trim will end the 135 | * trimming routine. The whole current index/max iterations stuff is only used 136 | * to show progress. 137 | * 138 | * (Optional) 139 | * 140 | * @param data pointer returned in afl_custom_init for this fuzz case 141 | * @param buf Buffer containing the test case 142 | * @param buf_size Size of the test case 143 | * @return The amount of possible iteration steps to trim the input. 144 | * negative on error. 145 | */ 146 | /* 147 | int32_t afl_custom_init_trim(CustomIRMutator *data, uint8_t *buf, 148 | size_t buf_size) { 149 | 150 | // We simply trim once 151 | data->trimmming_steps = 1; 152 | 153 | data->cur_step = 0; 154 | 155 | if (!maybe_grow(BUF_PARAMS(data, trim), buf_size)) { 156 | 157 | perror("init_trim grow"); 158 | return -1; 159 | } 160 | 161 | memcpy(data->trim_buf, buf, buf_size); 162 | 163 | data->trim_size_current = buf_size; 164 | 165 | return data->trimmming_steps; 166 | } 167 | */ 168 | 169 | /** 170 | * This method is called for each trimming operation. It doesn't have any 171 | * arguments because we already have the initial buffer from init_trim and we 172 | * can memorize the current state in *data. This can also save 173 | * reparsing steps for each iteration. It should return the trimmed input 174 | * buffer, where the returned data must not exceed the initial input data in 175 | * length. Returning anything that is larger than the original data (passed 176 | * to init_trim) will result in a fatal abort of AFLFuzz. 177 | * 178 | * (Optional) 179 | * 180 | * @param[in] data pointer returned in afl_custom_init for this fuzz case 181 | * @param[out] out_buf Pointer to the buffer containing the trimmed test case. 182 | * External library should allocate memory for out_buf. 183 | * AFL++ will not release the memory after saving the test case. 184 | * Keep a ref in *data. 185 | * *out_buf = NULL is treated as error. 186 | * @return Pointer to the size of the trimmed test case 187 | */ 188 | /* 189 | size_t afl_custom_trim(CustomIRMutator *data, uint8_t **out_buf) { 190 | 191 | *out_buf = data->trim_buf; 192 | 193 | // Remove the last byte of the trimming input 194 | return data->trim_size_current - 1; 195 | } 196 | */ 197 | 198 | /** 199 | * This method is called after each trim operation to inform you if your 200 | * trimming step was successful or not (in terms of coverage). If you receive 201 | * a failure here, you should reset your input to the last known good state. 202 | * 203 | * (Optional) 204 | * 205 | * @param[in] data pointer returned in afl_custom_init for this fuzz case 206 | * @param success Indicates if the last trim operation was successful. 207 | * @return The next trim iteration index (from 0 to the maximum amount of 208 | * steps returned in init_trim). negative ret on failure. 
209 | */ 210 | /* 211 | int32_t afl_custom_post_trim(CustomIRMutator *data, int success) { 212 | 213 | if (success) { 214 | 215 | ++data->cur_step; 216 | return data->cur_step; 217 | } 218 | 219 | return data->trimmming_steps; 220 | } 221 | */ 222 | 223 | /** 224 | * Perform a single custom mutation on a given input. 225 | * This mutation is stacked with the other muatations in havoc. 226 | * 227 | * (Optional) 228 | * 229 | * @param[in] data pointer returned in afl_custom_init for this fuzz case 230 | * @param[in] buf Pointer to the input data to be mutated and the mutated 231 | * output 232 | * @param[in] buf_size Size of input data 233 | * @param[out] out_buf The output buffer. buf can be reused, if the content 234 | * fits. *out_buf = NULL is treated as error. 235 | * @param[in] max_size Maximum size of the mutated output. The mutation must 236 | * not produce data larger than max_size. 237 | * @return Size of the mutated output. 238 | */ 239 | /* 240 | size_t afl_custom_havoc_mutation(CustomIRMutator *mutator, u8 *buf, 241 | size_t buf_size, u8 **out_buf, 242 | size_t max_size) { 243 | memcpy(mutator->mutator_buf, buf, buf_size); 244 | size_t out_size = LLVMFuzzerCustomMutator(mutator->mutator_buf, buf_size, 245 | max_size, mutator->seed); 246 | 247 | // return size of mutated data 248 | *out_buf = mutator->mutator_buf; 249 | return out_size; 250 | } 251 | */ 252 | 253 | /** 254 | * Return the probability (in percentage) that afl_custom_havoc_mutation 255 | * is called in havoc. By default it is 6 %. 256 | * 257 | * (Optional) 258 | * 259 | * @param[in] data pointer returned in afl_custom_init for this fuzz case 260 | * @return The probability (0-100). 261 | */ 262 | /* 263 | uint8_t afl_custom_havoc_mutation_probability(CustomIRMutator *data) { 264 | return 100; // 100 % 265 | } 266 | */ 267 | 268 | /** 269 | * Determine whether the fuzzer should fuzz the queue entry or not. 270 | * 271 | * (Optional) 272 | * 273 | * @param[in] data pointer returned in afl_custom_init for this fuzz case 274 | * @param filename File name of the test case in the queue entry 275 | * @return Return True(1) if the fuzzer will fuzz the queue entry, and 276 | * False(0) otherwise. 277 | */ 278 | /* 279 | uint8_t afl_custom_queue_get(CustomIRMutator *data, const uint8_t *filename) { 280 | 281 | return 1; 282 | } 283 | */ 284 | 285 | /** 286 | * Allow for additional analysis (e.g. calling a different tool that does a 287 | * different kind of coverage and saves this for the custom mutator). 
288 | * 289 | * (Optional) 290 | * 291 | * @param data pointer returned in afl_custom_init for this fuzz case 292 | * @param filename_new_queue File name of the new queue entry 293 | * @param filename_orig_queue File name of the original queue entry 294 | * @return if the file contents was modified return 1 (True), 0 (False) 295 | * otherwise 296 | */ 297 | /* 298 | uint8_t afl_custom_queue_new_entry(CustomIRMutator *data, 299 | const uint8_t *filename_new_queue, 300 | const uint8_t *filename_orig_queue) { 301 | 302 | // Additional analysis on the original or new test case 303 | return 0; 304 | } 305 | */ 306 | 307 | /** 308 | * Deinitialize everything 309 | * 310 | * @param data The data ptr from afl_custom_init 311 | */ 312 | void afl_custom_deinit(CustomIRMutator *mutator) { 313 | free(mutator->mutator_buf); 314 | free(mutator); 315 | } 316 | -------------------------------------------------------------------------------- /llvm-isel-afl/afl-driver.cpp: -------------------------------------------------------------------------------- 1 | //===- afl_driver.cpp - a glue between AFL and libFuzzer --------*- C++ -* ===// 2 | // 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 | // See https://llvm.org/LICENSE.txt for license information. 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 | //===----------------------------------------------------------------------===// 7 | 8 | /* This file allows to fuzz libFuzzer-style target functions 9 | (LLVMFuzzerTestOneInput) with AFL using AFL's persistent (in-process) mode. 10 | 11 | Usage: 12 | ################################################################################ 13 | cat << EOF > test_fuzzer.cc 14 | #include 15 | #include 16 | extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { 17 | if (size > 0 && data[0] == 'H') 18 | if (size > 1 && data[1] == 'I') 19 | if (size > 2 && data[2] == '!') 20 | __builtin_trap(); 21 | return 0; 22 | } 23 | EOF 24 | # Build your target with -fsanitize-coverage=trace-pc-guard using fresh clang. 25 | clang -g -fsanitize-coverage=trace-pc-guard test_fuzzer.cc -c 26 | # Build afl-llvm-rt.o.c from the AFL distribution. 27 | clang -c -w $AFL_HOME/llvm_mode/afl-llvm-rt.o.c 28 | # Build this file, link it with afl-llvm-rt.o.o and the target code. 29 | clang++ afl_driver.cpp test_fuzzer.o afl-llvm-rt.o.o 30 | # Run AFL: 31 | rm -rf IN OUT; mkdir IN OUT; echo z > IN/z; 32 | $AFL_HOME/afl-fuzz -i IN -o OUT ./a.out 33 | ################################################################################ 34 | AFL_DRIVER_STDERR_DUPLICATE_FILENAME: Setting this *appends* stderr to the file 35 | specified. If the file does not exist, it is created. This is useful for getting 36 | stack traces (when using ASAN for example) or original error messages on hard 37 | to reproduce bugs. Note that any content written to stderr will be written to 38 | this file instead of stderr's usual location. 39 | 40 | AFL_DRIVER_CLOSE_FD_MASK: Similar to libFuzzer's -close_fd_mask behavior option. 41 | If 1, close stdout at startup. If 2 close stderr; if 3 close both. 42 | 43 | */ 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | 53 | #include 54 | #include 55 | #include 56 | 57 | // Platform detection. 
Copied from FuzzerInternal.h 58 | #ifdef __linux__ 59 | #define LIBFUZZER_LINUX 1 60 | #define LIBFUZZER_APPLE 0 61 | #define LIBFUZZER_NETBSD 0 62 | #define LIBFUZZER_FREEBSD 0 63 | #elif __APPLE__ 64 | #define LIBFUZZER_LINUX 0 65 | #define LIBFUZZER_APPLE 1 66 | #define LIBFUZZER_NETBSD 0 67 | #define LIBFUZZER_FREEBSD 0 68 | #elif __NetBSD__ 69 | #define LIBFUZZER_LINUX 0 70 | #define LIBFUZZER_APPLE 0 71 | #define LIBFUZZER_NETBSD 1 72 | #define LIBFUZZER_FREEBSD 0 73 | #elif __FreeBSD__ 74 | #define LIBFUZZER_LINUX 0 75 | #define LIBFUZZER_APPLE 0 76 | #define LIBFUZZER_NETBSD 0 77 | #define LIBFUZZER_FREEBSD 1 78 | #else 79 | #error "Support for your platform has not been implemented" 80 | #endif 81 | 82 | // libFuzzer interface is thin, so we don't include any libFuzzer headers. 83 | extern "C" { 84 | int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size); 85 | __attribute__((weak)) int LLVMFuzzerInitialize(int *argc, char ***argv); 86 | } 87 | 88 | // Notify AFL about persistent mode. 89 | static volatile char AFL_PERSISTENT[] = "##SIG_AFL_PERSISTENT##"; 90 | extern "C" int __afl_persistent_loop(unsigned int); 91 | static volatile char suppress_warning2 = AFL_PERSISTENT[0]; 92 | 93 | // Notify AFL about deferred forkserver. 94 | static volatile char AFL_DEFER_FORKSVR[] = "##SIG_AFL_DEFER_FORKSRV##"; 95 | extern "C" void __afl_manual_init(); 96 | static volatile char suppress_warning1 = AFL_DEFER_FORKSVR[0]; 97 | 98 | // Input buffer. 99 | static const size_t kMaxAflInputSize = 1 << 20; 100 | static uint8_t AflInputBuf[kMaxAflInputSize]; 101 | 102 | // Use this optionally defined function to output sanitizer messages even if 103 | // user asks to close stderr. 104 | extern "C" __attribute__((weak)) void __sanitizer_set_report_fd(void *); 105 | 106 | // Keep track of where stderr content is being written to, so that 107 | // dup_and_close_stderr can use the correct one. 108 | static FILE *output_file = stderr; 109 | 110 | // Experimental feature to use afl_driver without AFL's deferred mode. 111 | // Needs to run before __afl_auto_init. 112 | __attribute__((constructor(0))) static void __decide_deferred_forkserver(void) { 113 | if (getenv("AFL_DRIVER_DONT_DEFER")) { 114 | if (unsetenv("__AFL_DEFER_FORKSRV")) { 115 | perror("Failed to unset __AFL_DEFER_FORKSRV"); 116 | abort(); 117 | } 118 | } 119 | } 120 | 121 | // If the user asks us to duplicate stderr, then do it. 122 | static void maybe_duplicate_stderr() { 123 | char *stderr_duplicate_filename = 124 | getenv("AFL_DRIVER_STDERR_DUPLICATE_FILENAME"); 125 | 126 | if (!stderr_duplicate_filename) 127 | return; 128 | 129 | FILE *stderr_duplicate_stream = 130 | freopen(stderr_duplicate_filename, "a+", stderr); 131 | 132 | if (!stderr_duplicate_stream) { 133 | fprintf( 134 | stderr, 135 | "Failed to duplicate stderr to AFL_DRIVER_STDERR_DUPLICATE_FILENAME"); 136 | abort(); 137 | } 138 | output_file = stderr_duplicate_stream; 139 | } 140 | 141 | // Most of these I/O functions were inspired by/copied from libFuzzer's code. 142 | static void discard_output(int fd) { 143 | FILE *temp = fopen("/dev/null", "w"); 144 | if (!temp) 145 | abort(); 146 | dup2(fileno(temp), fd); 147 | fclose(temp); 148 | } 149 | 150 | static void close_stdout() { discard_output(STDOUT_FILENO); } 151 | 152 | // Prevent the targeted code from writing to "stderr" but allow sanitizers and 153 | // this driver to do so. 
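// For example, with AFL_DRIVER_CLOSE_FD_MASK=2 the target's own stderr writes
// are discarded, while sanitizer reports are redirected to a duplicate of the
// original stderr (or of the AFL_DRIVER_STDERR_DUPLICATE_FILENAME file).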
154 | static void dup_and_close_stderr() { 155 | int output_fileno = fileno(output_file); 156 | int output_fd = dup(output_fileno); 157 | if (output_fd <= 0) 158 | abort(); 159 | FILE *new_output_file = fdopen(output_fd, "w"); 160 | if (!new_output_file) 161 | abort(); 162 | if (!__sanitizer_set_report_fd) 163 | return; 164 | __sanitizer_set_report_fd(reinterpret_cast(output_fd)); 165 | discard_output(output_fileno); 166 | } 167 | 168 | static void Printf(const char *Fmt, ...) { 169 | va_list ap; 170 | va_start(ap, Fmt); 171 | vfprintf(output_file, Fmt, ap); 172 | va_end(ap); 173 | fflush(output_file); 174 | } 175 | 176 | // Close stdout and/or stderr if user asks for it. 177 | static void maybe_close_fd_mask() { 178 | char *fd_mask_str = getenv("AFL_DRIVER_CLOSE_FD_MASK"); 179 | if (!fd_mask_str) 180 | return; 181 | int fd_mask = atoi(fd_mask_str); 182 | if (fd_mask & 2) 183 | dup_and_close_stderr(); 184 | if (fd_mask & 1) 185 | close_stdout(); 186 | } 187 | 188 | // Define LLVMFuzzerMutate to avoid link failures for targets that use it 189 | // with libFuzzer's LLVMFuzzerCustomMutator. 190 | extern "C" size_t LLVMFuzzerMutate(uint8_t *Data, size_t Size, size_t MaxSize) { 191 | assert(false && "LLVMFuzzerMutate should not be called from afl_driver"); 192 | return 0; 193 | } 194 | 195 | // Execute any files provided as parameters. 196 | static int ExecuteFilesOnyByOne(int argc, char **argv) { 197 | for (int i = 1; i < argc; i++) { 198 | std::ifstream in(argv[i], std::ios::binary); 199 | in.seekg(0, in.end); 200 | size_t length = in.tellg(); 201 | in.seekg(0, in.beg); 202 | std::cout << "Reading " << length << " bytes from " << argv[i] << std::endl; 203 | // Allocate exactly length bytes so that we reliably catch buffer overflows. 204 | std::vector bytes(length); 205 | in.read(bytes.data(), bytes.size()); 206 | assert(in); 207 | LLVMFuzzerTestOneInput(reinterpret_cast(bytes.data()), 208 | bytes.size()); 209 | std::cout << "Execution successful" << std::endl; 210 | } 211 | return 0; 212 | } 213 | 214 | #define GET_TARGET_INFO_FROM_ENV_OR_EXIT(ENV_NAME, ARG_NAME, ARGV) \ 215 | { \ 216 | char *ARG_NAME = getenv(#ENV_NAME); \ 217 | if (ARG_NAME) { \ 218 | static char arg_##ARG_NAME[256]; \ 219 | memset(arg_##ARG_NAME, 0, 256); \ 220 | sprintf(arg_##ARG_NAME, "-m%s=%s", #ARG_NAME, ARG_NAME); \ 221 | Printf("%s: %s\n", #ENV_NAME, arg_##ARG_NAME); \ 222 | ARGV.push_back(arg_##ARG_NAME); \ 223 | } else { \ 224 | Printf("%s not found, abort.\n", #ENV_NAME); \ 225 | exit(1); \ 226 | } \ 227 | } 228 | 229 | int main(int argc, char **argv) { 230 | Printf("======================= INFO =========================\n" 231 | "This binary is built for AFL-fuzz.\n" 232 | "To run the target function on individual input(s) execute this:\n" 233 | " %s < INPUT_FILE\n" 234 | "or\n" 235 | " %s INPUT_FILE1 [INPUT_FILE2 ... 
]\n" 236 | "To fuzz with afl-fuzz execute this:\n" 237 | " afl-fuzz [afl-flags] %s [-N]\n" 238 | "afl-fuzz will run N iterations before " 239 | "re-spawning the process (default: 1000)\n" 240 | "======================================================\n", 241 | argv[0], argv[0], argv[0]); 242 | 243 | maybe_duplicate_stderr(); 244 | maybe_close_fd_mask(); 245 | if (LLVMFuzzerInitialize) { 246 | std::vector Argv({argv[0]}); 247 | char *g = getenv("GLOBAL_ISEL"); 248 | if (g && g[0] == '1') { 249 | Printf("Fuzzing GlobalISel\n"); 250 | Argv.push_back((char *)"-global-isel"); 251 | } else { 252 | Printf("Fuzzing DAGISel\n"); 253 | } 254 | 255 | GET_TARGET_INFO_FROM_ENV_OR_EXIT(TRIPLE, triple, Argv); 256 | GET_TARGET_INFO_FROM_ENV_OR_EXIT(CPU, cpu, Argv); 257 | GET_TARGET_INFO_FROM_ENV_OR_EXIT(ATTR, attr, Argv); 258 | 259 | char *tbl_size = getenv("MATCHER_TABLE_SIZE"); 260 | if (tbl_size) { 261 | Printf("MATCHER_TABLE_SIZE set to %s", tbl_size); 262 | } else { 263 | Printf("MATCHER_TABLE_SIZE not found, abort.\n"); 264 | exit(1); 265 | } 266 | char **AArgv = Argv.data(); 267 | int AArgc = Argv.size(); 268 | LLVMFuzzerInitialize(&AArgc, &AArgv); 269 | } 270 | // Do any other expensive one-time initialization here. 271 | 272 | if (!getenv("AFL_DRIVER_DONT_DEFER")) 273 | __afl_manual_init(); 274 | 275 | int N = 1000; 276 | if (argc == 2 && argv[1][0] == '-') 277 | N = atoi(argv[1] + 1); 278 | else if (argc == 2 && (N = atoi(argv[1])) > 0) 279 | Printf("WARNING: using the deprecated call style `%s %d`\n", argv[0], N); 280 | else if (argc > 1) 281 | return ExecuteFilesOnyByOne(argc, argv); 282 | 283 | assert(N > 0); 284 | 285 | // Call LLVMFuzzerTestOneInput here so that coverage caused by initialization 286 | // on the first execution of LLVMFuzzerTestOneInput is ignored. 287 | uint8_t dummy_input[1] = {0}; 288 | LLVMFuzzerTestOneInput(dummy_input, 1); 289 | 290 | int num_runs = 0; 291 | while (__afl_persistent_loop(N)) { 292 | ssize_t n_read = read(0, AflInputBuf, kMaxAflInputSize); 293 | if (n_read > 0) { 294 | // Copy AflInputBuf into a separate buffer to let asan find buffer 295 | // overflows. Don't use unique_ptr/etc to avoid extra dependencies. 296 | uint8_t *copy = new uint8_t[n_read]; 297 | memcpy(copy, AflInputBuf, n_read); 298 | num_runs++; 299 | LLVMFuzzerTestOneInput(copy, n_read); 300 | delete[] copy; 301 | } 302 | } 303 | Printf("%s: successfully executed %d input(s)\n", argv[0], num_runs); 304 | } 305 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2023 Yuyang (Peter) Rong 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /scripts/classify.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import subprocess 3 | import os 4 | import re 5 | from typing import Iterable, Iterator, List, Optional, Set, Tuple 6 | import shutil 7 | from pathlib import Path 8 | import tempfile 9 | 10 | from lib.process_concurrency import run_concurrent_subprocesses 11 | 12 | 13 | class StackTrace: 14 | # using tuple instead of list for easier equality check 15 | stack_frames: Tuple[Tuple[str, str], ...] 16 | 17 | def __init__(self, stacktrace: Iterable[str], remove_addr: bool = False): 18 | stack_frames: List[Tuple[str, str]] = [] 19 | 20 | for line in stacktrace: 21 | words = line.strip().split(" ") 22 | assert words[0].startswith("#") 23 | function = " ".join(words[2:-1]) 24 | location = words[-1] 25 | if remove_addr: 26 | location = re.sub(r"0x[0-9a-f]+", "0x_", location) 27 | stack_frames.append((function, location)) 28 | 29 | self.stack_frames = tuple(stack_frames) 30 | 31 | def __str__(self) -> str: 32 | ret = "" 33 | for (f, l) in self.stack_frames: 34 | ret += f"\t{f} {l}\n" 35 | return ret 36 | 37 | def __len__(self) -> int: 38 | return len(self.stack_frames) 39 | 40 | def __eq__(self, other) -> bool: 41 | return self.stack_frames == other.stack_frames 42 | 43 | def __hash__(self) -> int: 44 | return hash(self.stack_frames) 45 | 46 | 47 | class CrashError: 48 | return_code: int 49 | failed_pass: Optional[str] 50 | message_raw: str 51 | message_minimized: str 52 | type: str 53 | subtype: Optional[str] 54 | undefined_external_symbol: bool 55 | stack_trace: StackTrace 56 | hash_stacktrace_only: bool 57 | hash_op_code_only_for_isel_crash: bool 58 | 59 | def __init__( 60 | self, 61 | args: List[str], 62 | return_code: int, 63 | stderr_iter: Iterator[str], 64 | hash_stacktrace_only: bool = False, 65 | hash_op_code_only_for_isel_crash: bool = False, 66 | remove_addr_in_stacktrace: bool = False, 67 | ): 68 | self.return_code = return_code 69 | self.hash_stacktrace_only = hash_stacktrace_only 70 | self.hash_op_code_only_for_isel_crash = hash_op_code_only_for_isel_crash 71 | self.undefined_external_symbol = False 72 | 73 | # extract and minimize error message 74 | message_lines = [] 75 | while ( 76 | (curr_line := next(stderr_iter, None)) 77 | and curr_line 78 | != "PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.\n" 79 | ): 80 | # do not include the entire DAG in the error message 81 | if curr_line == "\n" or re.match(r"^ +0x[0-9a-f]+: .+ = .+\n$", curr_line): 82 | continue 83 | 84 | if re.match(r'LLVM ERROR: Undefined external symbol ".+"\n', curr_line): 85 | self.undefined_external_symbol = True 86 | 87 | message_lines.append(curr_line) 88 | 89 | self.message_raw = "".join(message_lines) 90 | 91 | self.message_minimized = ( 92 | re.sub(r"%[0-9]+", "%_", self.message_raw) 93 | .replace(args[0], os.path.basename(args[0])) 94 | .replace(args[-1], "ir.bc") 95 | ) 96 | 97 | self.message_minimized = 
re.sub(r"0x[0-9a-f]+", "0x_", self.message_minimized) 98 | self.message_minimized = re.sub( 99 | r"(unable to allocate function argument #)[0-9]+", 100 | r"\1_", 101 | self.message_minimized, 102 | ) 103 | self.message_minimized = re.sub( 104 | r"(Error while trying to spill )(.+)( from class )(.+)(: Cannot scavenge register without an emergency spill slot!)", 105 | r"\1_\3\4\5", 106 | self.message_minimized, 107 | ) 108 | 109 | # extract failed pass and stack trace 110 | self.failed_pass = None 111 | if (curr_line := next(stderr_iter, None)) and curr_line == "Stack dump:\n": 112 | # extract failed pass 113 | while ( 114 | curr_line := next(stderr_iter, None) 115 | ) and "llvm::sys::PrintStackTrace" not in curr_line: 116 | if ( 117 | match := re.match( 118 | r" *[0-9]+\.\tRunning pass \'([A-Za-z0-9 ]+)\'", curr_line 119 | ) 120 | ) is not None: 121 | self.failed_pass = match.group(1) 122 | 123 | # extract stack trace 124 | try: 125 | self.stack_trace = StackTrace(stderr_iter, remove_addr_in_stacktrace) 126 | except: 127 | print(f"WARNING: Unable to parse stack trace for {args[-1]}") 128 | self.stack_trace = StackTrace([]) 129 | else: 130 | self.stack_trace = StackTrace([]) 131 | 132 | # determine error type 133 | if self.message_raw.startswith("LLVM ERROR: unable to legalize instruction:"): 134 | self.type = "instruction-legalization" 135 | matches = re.findall(r"G_[A-Z_]+", message_lines[0]) 136 | assert len(matches) == 1 137 | self.subtype = matches[0] 138 | elif self.message_raw.startswith("LLVM ERROR: cannot select:"): 139 | self.type = "global-instruction-selection" 140 | matches = re.findall(r"G_[A-Z_]+", message_lines[0]) 141 | assert len(matches) == 1 142 | self.subtype = matches[0] 143 | elif self.message_raw.startswith("LLVM ERROR: Cannot select:"): 144 | self.type = "dag-instruction-selection" 145 | match = re.match( 146 | r"LLVM ERROR: Cannot select:.+ = ([a-zA-Z0-9_:]+(<.+>)?)", 147 | message_lines[0], 148 | ) 149 | if match is None: 150 | print(f'ERROR: failed to extract instruction from "{message_lines[0]}"') 151 | self.subtype = "Unknown" 152 | else: 153 | self.subtype = match.group(1).split("<")[0] 154 | else: 155 | if self.failed_pass is None: 156 | self.type = "other" 157 | else: 158 | self.type = self.failed_pass.lower().replace(" ", "-") 159 | self.subtype = None 160 | 161 | def __str__(self) -> str: 162 | return "\n".join( 163 | [ 164 | f"Return Code: {self.return_code}", 165 | f"Error Type: {self.type}", 166 | f"Failed Pass: {self.failed_pass}", 167 | "Minimized Message:", 168 | self.message_minimized, 169 | "Stack Trace:", 170 | str(self.stack_trace), 171 | ] 172 | ) 173 | 174 | def get_folder_name(self) -> str: 175 | return os.path.join( 176 | self.type, 177 | self.subtype if self.subtype is not None else "", 178 | f"tracedepth_{len(self.stack_trace)}__hash_0x{hash(self):08x}", 179 | ) 180 | 181 | def __hash__(self): 182 | if self.hash_op_code_only_for_isel_crash and ( 183 | self.type == "dag-instruction-selection" 184 | or self.type == "global-instruction-selection" 185 | ): 186 | return hash(self.subtype) 187 | 188 | if self.hash_stacktrace_only: 189 | return hash(self.stack_trace) 190 | 191 | return hash(self.stack_trace) ^ hash(self.message_minimized) 192 | 193 | 194 | def classify( 195 | cmd: List[str], 196 | input_dir: str | Path, 197 | output_dir: str | Path, 198 | force: bool, 199 | verbose: bool = False, 200 | create_symlink_to_source: bool = True, 201 | hash_stacktrace_only: bool = False, 202 | hash_op_code_only_for_isel_crash: bool = False, 203 | 
remove_addr_in_stacktrace: bool = False, 204 | ignore_undefined_external_symbol: bool = False, 205 | ) -> None: 206 | output_dir = os.path.abspath(output_dir) 207 | input_dir = os.path.abspath(input_dir) 208 | temp_dir = tempfile.gettempdir() 209 | 210 | if os.path.exists(output_dir): 211 | if force: 212 | shutil.rmtree(output_dir) 213 | else: 214 | print(f"{output_dir} already exists, use -f to remove it. Abort.") 215 | exit(1) 216 | 217 | Path(output_dir).mkdir(parents=True) 218 | 219 | crash_hashes: Set[int] = set() 220 | false_alarms: List[str] = [] 221 | 222 | def on_process_exit(file_name: str, exit_code: Optional[int], p: subprocess.Popen) -> None: 223 | ir_bc_path: str = p.args[-1] # type: ignore 224 | stderr_dump_path = os.path.join(temp_dir, file_name + ".stderr") 225 | stderr_dump_file = open(stderr_dump_path) 226 | 227 | if os.stat(stderr_dump_path).st_size == 0: 228 | false_alarms.append(ir_bc_path) 229 | return 230 | 231 | crash = CrashError( 232 | p.args, # type: ignore 233 | p.returncode, 234 | stderr_dump_file, 235 | hash_stacktrace_only, 236 | hash_op_code_only_for_isel_crash, 237 | remove_addr_in_stacktrace, 238 | ) 239 | 240 | stderr_dump_file.close() 241 | os.remove(stderr_dump_path) 242 | 243 | if ignore_undefined_external_symbol and crash.undefined_external_symbol: 244 | return 245 | 246 | folder_name = crash.get_folder_name() 247 | folder_path = os.path.join(output_dir, folder_name) 248 | Path(folder_path).mkdir(parents=True, exist_ok=True) 249 | 250 | if hash(crash) not in crash_hashes: 251 | crash_hashes.add(hash(crash)) 252 | with open( 253 | os.path.join(output_dir, folder_name + ".log"), "w+" 254 | ) as report_path: 255 | print(crash, file=report_path) 256 | 257 | if verbose: 258 | print("New crash type:", folder_name) 259 | 260 | if create_symlink_to_source: 261 | os.symlink( 262 | ir_bc_path, 263 | os.path.join(folder_path, os.path.basename(ir_bc_path) + ".bc"), 264 | ) 265 | 266 | run_concurrent_subprocesses( 267 | iter=list( 268 | filter( 269 | lambda file_name: file_name.split(".")[-1] not in ["md", "txt", "s"], 270 | os.listdir(input_dir), 271 | ) 272 | ), 273 | subprocess_creator=lambda file_name: subprocess.Popen( 274 | cmd + [os.path.join(input_dir, file_name)], 275 | stdout=subprocess.DEVNULL, 276 | stderr=open(os.path.join(temp_dir, file_name + ".stderr"), "w"), 277 | ), 278 | on_exit=on_process_exit, 279 | ) 280 | 281 | print(f"{len(false_alarms)} false positives, {len(crash_hashes)} unique crashes") 282 | with open(os.path.join(output_dir, "false_positives.txt"), "a+") as file: 283 | file.writelines(line + "\n" for line in false_alarms) 284 | 285 | with open(os.path.join(output_dir, "unique_crashes"), "w+") as file: 286 | file.write(str(len(crash_hashes))) 287 | 288 | 289 | def main() -> None: 290 | parser = argparse.ArgumentParser( 291 | description="Run all crashed cases and classify them" 292 | ) 293 | parser.add_argument( 294 | "--cmd", 295 | type=str, 296 | required=True, 297 | help="The command to run on all files in the input dir", 298 | ) 299 | parser.add_argument( 300 | "--input", type=str, required=True, help="The directory containing input files" 301 | ) 302 | parser.add_argument( 303 | "--output", 304 | type=str, 305 | required=False, 306 | default="output", 307 | help="The directory to store all organized output", 308 | ) 309 | parser.add_argument( 310 | "-f", 311 | "--force", 312 | action="store_true", 313 | help="force delete the output directory if it already exists.", 314 | ) 315 | args = parser.parse_args() 316 | 
classify(args.cmd.split(" "), args.input, args.output, args.force, verbose=True) 317 | 318 | 319 | if __name__ == "__main__": 320 | main() 321 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IR Fuzzer 2 | 3 | You can find the camera-ready paper [here](https://github.com/user-attachments/files/19461085/IRFuzzer.pdf). 4 | 5 | This repo hasn't been actively maintained, but I will review and accept any PR if that helps. 6 | 7 | # Quick start 8 | 9 | ## Compile 10 | 11 | You should be able to prepare everything by running `./build.sh`. 12 | It should compile everything for you. 13 | If it fails for any reason, please file an issue on this repo. 14 | 15 | The script will set some environment variables. 16 | You may want to leave these in your `.bashrc` for further fuzzing: 17 | 18 | ```sh 19 | # Path to this directory 20 | export FUZZING_HOME=$(pwd) 21 | # The LLVM you want to fuzz 22 | export LLVM= 23 | export AFL=AFLplusplus 24 | export PATH=$PATH:$HOME/clang+llvm/bin 25 | # Tell AFL++ to only use our mutator 26 | export AFL_CUSTOM_MUTATOR_ONLY=1 27 | # Tell AFL++ where our mutator is 28 | export AFL_CUSTOM_MUTATOR_LIBRARY=$FUZZING_HOME/mutator/build/libAFLCustomIRMutator.so 29 | # AFL instrumentation method 30 | export AFL_LLVM_INSTRUMENT=CLASSIC 31 | ``` 32 | 33 | If you want to use a dockerized environment, you can also run 34 | 35 | ```sh 36 | docker build . -t irfuzzer 37 | ``` 38 | 39 | ## Seed selection 40 | 41 | Seeds are the initial inputs we give the fuzzer; they have a direct impact on fuzzing performance. 42 | `seeds` provides a default seed to start fuzzing; it is an empty module with some function signatures. 43 | For better fuzzing performance, you are more than welcome to move modules from `$LLVM/llvm/test/CodeGen/` into `seeds`. 44 | Notice that `seeds` only accepts bitcode, not textual LLVM IR. 45 | 46 | ## Run 47 | 48 | ### Env vars 49 | 50 | You can specify different arguments for the driver using environment variables. 51 | 52 | **Required** 53 | 54 | ```sh 55 | export TRIPLE= 56 | export CPU= 57 | export ATTR= 58 | ``` 59 | You can specify triples like `x86_64`, `aarch64`, `aie`, etc. 60 | If you don't know what triples you have, try `llc --version`; it will list all the targets you have. 61 | `CPU` and `ATTR` can be left empty, but they must still be exported. 62 | They are equivalent to the `-mcpu` and `-mattr` options you would normally pass to `llc`. 63 | 64 | ```sh 65 | export MATCHER_TABLE_SIZE=13780 66 | ``` 67 | Matcher table size refers to the size of the matcher table generated by TableGen. 68 | The table is automatically generated as a static variable in `SelectCode(SDNode *N)` in `GenDAGISel.inc` (for SelectionDAG) and in `InstructionSelector::getMatchTable()` in `GenGlobalISel.inc` (for GlobalISel). You have three ways to find its length: 69 | 70 | 1. Every time AFL's compiler compiles the project, it counts the table size and prints a line like `[+] MatcherTable size: 22660`. You can look out for that. 71 | 2. If you missed it, you can delete the object file (`ISelDAGToDAG.cpp.o` or `InstructionSelector.cpp.o`) and force a re-compilation. 72 | ```sh 73 | $ cd build-afl 74 | $ rm lib/Target/AIE/CMakeFiles/LLVMAIECodeGen.dir/AIEISelDAGToDAG.cpp.o 75 | $ ninja 76 | 77 | [6/27] Building CXX object lib/Target/AIE/CMakeFiles/LLVMAIECodeGen.dir/AIEISelDAGToDAG.cpp.o 78 | [+] MatcherTable size: 22660 79 | ``` 80 | 3. You can also find this data in [`scripts/common.py`](./scripts/common.py). 
81 | It may not be 100% accurate as the code gets updated. 82 | 83 | 84 | **Optional** 85 | 86 | ```sh 87 | export GLOBAL_ISEL=1; 88 | ``` 89 | By default, we are fuzzing SelectionDAG. If you want to fuzz GlobalISel, set this environment variable. Please make sure `MATCHER_TABLE_SIZE` matches GlobalISel's table size. 90 | 91 | ### Command line 92 | 93 | **Once the environment variables are set**, the easiest way to start fuzzing is to run 94 | ```sh 95 | ./AFLplusplus/afl-fuzz -i seeds -o fuzzing llvm-isel-afl/build/isel-fuzzing 96 | ``` 97 | This starts a fuzzing instance to fuzz SelectionDAG. 98 | Some useful arguments you might give `afl-fuzz` include: 99 | - `-E <n>`: execute/mutate the input `n` times and quit 100 | - `-V <t>`: run the fuzzer for `t` seconds and quit 101 | 102 | Fuzzing can take days, if not weeks. 103 | I recommend using [`screen`](https://www.gnu.org/software/screen/) to run the fuzzing in the background. 104 | 105 | AFL++ will give you a fancy UI describing what's happening. 106 | You may check [this](https://github.com/mirrorer/afl/blob/master/docs/status_screen.txt) page to help you understand the stats. 107 | 108 | ### Archs and table size 109 | 110 | Check `./scripts/common.py`. 111 | 112 | ## Scripts 113 | 114 | ### Dependencies 115 | 116 | We prepared many scripts to automate the fuzzing process. 117 | These scripts run on Python 3.10+, as it supports the type hints that keep the code less messy. 118 | Use `python3.10` explicitly to avoid conflicts with `python3.6`... supposing you are still using Ubuntu 18.04 or older. 119 | To install the dependencies you may want to run: 120 | ```sh 121 | # If your ubuntu is so old you don't have python3.10 in your apt I can't help you... 122 | # `apt install -y python3.10 python3-pip wget` 123 | wget https://bootstrap.pypa.io/get-pip.py 124 | python3.10 get-pip.py 125 | 126 | # You can install all the dependencies of the scripts with: 127 | pip3.10 install -r scripts/requirements.txt 128 | ``` 129 | 130 | ### Description and usage 131 | 132 | - `common.py`: this is not intended to be called directly, but it has a lot of metadata inside; you are welcome to take a look. 133 | - `fuzz.py`: this fuzzes a lot of triples using `docker` or `screen`. 134 | - `batch_classify.py`: this script runs all the crashed inputs and clusters the same ones together using the stack trace. You may want to run this after a fuzzing run. 135 | - `combine_fuzzing_results.py`: this script combines multiple fuzzing directories into one. Unless you are writing a paper and need massive amounts of data, you probably don't need it. 136 | - `process_data.py`: summarizes the fuzzing results. 137 | 138 | Using `fuzz.py` doesn't require you to set any environment variables; the script will take care of them. 
139 | You would most likely use `fuzz.py` like this: 140 | 141 | ```sh 142 | python3.10 scripts/fuzz.py -i seeds -o fuzzing -r 5 --set=" aie" --type=screen --isel=dagisel --fuzzer=irfuzzer --time=1w -j 80 --on_exist=force 143 | ``` 144 | 145 | It means: start fuzzing using input from `seeds` (`-i seeds`), put the result in `fuzzing` (`-o fuzzing`), repeat the experiment five times (`-r 5`), test aie without attribute and cpu settings (`--set=" aie"`), use screen to monitor the fuzzing (`--type=screen`), test SelectionDAG (`--isel=dagisel`), use our fuzzer (`--fuzzer=irfuzzer`), test for a week (`--time=1w`), start at most 80 jobs in parallel (`-j 80`), and if the output directory already exists, force remove it (`--on_exist=force`). 146 | 147 | # How do we fuzz 148 | 149 | See the details in our paper. 150 | 151 | # Trophies & Findings 152 | 153 | (I think I will attach more links to keep track of these later) 154 | 155 | ## AI Engine 156 | - AIE1 GlobalISel lacks floating point support 157 | - G_FCONSTANT [fixed.](https://gitenterprise.xilinx.com/XRLabs/llvm-aie/pull/194) 158 | - AIE1 GlobalISel lacks vector support. 159 | - AIE1 SelectionDAG has bugs in the memory store. 160 | - AIE1 SelectionDAG has truncation errors. [Fixed.](https://gitenterprise.xilinx.com/XRLabs/llvm-aie/pull/161/) 161 | - AIE1 `vst.spil` generates two stores to the same address. [PoC.](https://gitenterprise.xilinx.com/XRLabs/peano_usage/pull/15) [Fixed.](https://gitenterprise.xilinx.com/XRLabs/llvm-aie/pull/203) 162 | 163 | ## Open sourced architecture 164 | 165 | See our [trophies repo](https://github.com/DataCorrupted/LLVM-fuzzing-trophies). 166 | 167 | # FAQ 168 | 169 | __Why build two versions of LLVM?__ 170 | 171 | One version is built by AFL's compiler, and another is built by LLVM 14 and contains a new mutator we designed. 172 | AFL needs to inject some code into the AIE compiler to keep track of runtime info (edge coverage, MatcherTable coverage, etc.). 173 | Besides, the driver also depends on it. 174 | The other version is the dependency for the mutator. You __can__ use an AFL-instrumented mutator, but it would slow down mutation and is thus not recommended. 175 | 176 | __Why fuzz a fork of AIE that is not up-to-date?__ 177 | 178 | Mainly because the mutator also needs to understand the architecture we are fuzzing, although it only generates mid-end IR. 179 | Therefore, until we merge the mutator's code into AIE, all you can do is keep merging the code you want to test into the mutator branch and compile everything. 180 | 181 | __Are we fuzzing AIE2?__ 182 | 183 | Currently we are only fuzzing AIE1 since it is more complete than AIE2. 184 | But you can fuzz AIE2 if you want to; in principle fuzzing AIE2 is no different from fuzzing AIE1. 185 | All you need to do is set `TRIPLE=aie2` and set `MATCHER_TABLE_SIZE` correctly. 186 | 187 | __AIE compilation hangs__ 188 | 189 | It's a known issue that `Target/AIE/MCTargetDesc/AIEMCFormats.cpp` will take a long time (~10 minutes) to compile. A function in it, `__cxx_global_var_init()`, causes the optimizer to run for a really long time. It is an interesting bug, but we haven't had time to fix it. 190 | 191 | __What is a seed and what should I use?__ 192 | 193 | A seed is the initial file you give the fuzzer to work on. 194 | Unfortunately, this is required for AFL (libFuzzer can cold-start without a seed). 195 | In this repo, we included a minimal seed in `seeds/` so you can start fuzzing without really worrying about it. 
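If you would rather craft a seed of your own, a minimal sketch looks like this (hypothetical file name; any small module that exercises the code you care about will do — see the notes on `llvm-as`/`llvm-dis` below):

```sh
# Hand-write a tiny module that stresses floating point, then assemble it
# into bitcode, since `seeds` only accepts bitcode.
cat > fadd_seed.ll << 'EOF'
define float @fadd_seed(float %a, float %b) {
  %sum = fadd float %a, %b
  ret float %sum
}
EOF
llvm-as fadd_seed.ll -o seeds/fadd_seed.bc
```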
196 | 197 | However, academic research and industry practice have shown that a better seed can lead to better results. You may reach the same results faster or find previously unseen behavior with different seeds. 198 | So it helps to manually craft seeds that cover the code you want to test; for example, if you want to focus on floating point, you can create seeds with floating-point calculations in them. 199 | 200 | To create a seed, you can write LLVM IR manually and convert it to bitcode using `llvm-as`. Or you can cast bitcode to IR using `llvm-dis` and change some of the instructions. 201 | 202 | __Matcher table coverage is 0.0%__ 203 | 204 | Table coverage may be low but never 0.0% in any case. Please make sure the matcher table is correctly instrumented. 205 | 206 | 1. Make sure your binary is linked against the library compiled by AFL. 207 | 2. Make sure AFL instrumented it. During compilation, there should be a line telling you `[+] Instrumenting matcher table.` 208 | 209 | __What do the stats in AFL's UI mean?__ 210 | 211 | You may check [this](https://github.com/mirrorer/afl/blob/master/docs/status_screen.txt) page to help you understand the stats. 212 | 213 | We introduced a new coverage metric, so `map density` shows two stats. The first one is edge coverage, which should reach 70~80% in a day or two, meaning that (almost) all control flow has been tested. 214 | The second stat is matcher table coverage. It shows how much of the table has been referenced. The higher, the better. 215 | 216 | __My fuzzer is running slow__ 217 | 218 | There are two reasons this could happen. 219 | AFL interacts heavily with the file system, so make sure your directory is not on NFS or any remotely mounted hard drive. If you want even faster speed, you can mount a tmpfs and do the fuzzing in memory. 220 | 221 | Another reason is that your seeds are taking a long time to execute. You may either choose smaller initial seeds or use shorter timeouts by adding `-t <timeout>` to AFL's arguments. 222 | 223 | __Where are the crashes located?__ 224 | 225 | `$FUZZING_HOME/fuzzing_output/default/crashes` 226 | 227 | __How to reproduce errors?__ 228 | 229 | One upside of fuzzing is that it always gives you a reproducible PoC. 230 | You can run `build-release/bin/llc <crashing-input>`. 231 | 232 | We have also found cases where `llc` won't reproduce the crash. 233 | In that case, try 234 | ```sh 235 | export CPU= 236 | export ATTR= 237 | export TRIPLE= 238 | export MATCHER_TABLE_SIZE= 239 | ./llvm-isel-afl/build/isel-fuzzing < <crashing-input> 240 | ``` 241 | 242 | We have noticed some settings differ between `llc` and our driver `isel-fuzzing`. 243 | We haven't had time to deal with it. Will update this later. 244 | 245 | If there is any input that can't be reproduced even using `isel-fuzzing`, there are two possibilities: 246 | - Your matcher table size is set wrong. 247 | - It may be a bug; please send us an issue. 248 | 249 | __What if `MatcherTable` is not set or set incorrectly?__ 250 | 251 | To pass compilation and AFL's self-testing, `MATCHER_TABLE_SIZE` defaults to a small value. You would most likely see `Shadow table size: 32 too small. Did you set it properly?`, which means it is not set. 252 | If `MATCHER_TABLE_SIZE` is not set correctly, you will have false positives where the seed is stored in `crashes` (indicating the fuzzer found the seed crashing), but you can't reproduce it with `llc`. 253 | That means the runtime code we injected is crashing, not LLVM itself. 
Most likely, it's because `MATCHER_TABLE_SIZE` is set too small, and an OOB write happened. 254 | 255 | __My mutator aborted during fuzzing?__ 256 | 257 | This is a common issue; it's not a bug in the mutator. 258 | Most likely you didn't set the types correctly. 259 | If the mutator can't find a typed value to complete an instruction generation, it aborts. 260 | Therefore, it is important to provide all the types when creating the mutator. 261 | 262 | The mutator is non-deterministic, so debugging is hard. 263 | But here's a trick: the mutator is deterministic if the seed is the same. 264 | If your fuzzer crashed, go find the `.cur_input` in your repo; this is the last input the mutator worked on before it crashed. 265 | Use `./mutator/scripts/validate.sh .cur_input` to verify the mutator with this input. 266 | The script will (hopefully) give you the seed that crashed the mutator. 267 | You can then debug the mutator by providing it with the deterministic seed that the validator just popped out: `./mutator/build/MutatorDriver .cur_input <seed>`. 268 | If you can confirm that the last frame in the stack trace is `SourcePred.generate`, that's it: you didn't provide all the types required. 269 | If you see any other reasons for crashing, contact me. 270 | 271 | Also, when the mutator dies, the fuzzer becomes a zombie process; don't forget to clean it up :) -------------------------------------------------------------------------------- /scripts/fuzz.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | import subprocess 4 | from typing import Iterable, Literal, NamedTuple, Optional 5 | import typing 6 | import os 7 | from tap import Tap 8 | import docker 9 | from time import sleep 10 | 11 | from collect_seeds import TargetProp, collect_seeds_from_tests 12 | from lib.process_concurrency import MAX_SUBPROCESSES, run_concurrent_subprocesses 13 | from lib.target import Target 14 | from lib.matcher_table_sizes import ( 15 | DAGISEL_MATCHER_TABLE_SIZES, 16 | GISEL_MATCHER_TABLE_SIZES, 17 | ) 18 | from lib.target_lists import TARGET_LISTS 19 | from lib.time_parser import get_time_in_seconds 20 | 21 | 22 | class FuzzerConfig(NamedTuple): 23 | extra_env: dict[str, str] 24 | extra_cmd: list[str] = [] 25 | 26 | def getIRFuzzer(other_env: dict[str, str] = {}, other_cmd: list[str] = []): 27 | extra_env = { 28 | "AFL_CUSTOM_MUTATOR_ONLY": "1", 29 | "AFL_CUSTOM_MUTATOR_LIBRARY": "mutator/build/libAFLCustomIRMutator.so", 30 | } 31 | extra_env.update(other_env) 32 | return FuzzerConfig( 33 | extra_env=extra_env, 34 | extra_cmd=other_cmd, 35 | ) 36 | 37 | 38 | Fuzzer = Literal["aflplusplus", "libfuzzer", "irfuzzer"] 39 | ISel = Literal["dagisel", "gisel"] 40 | ClutserType = Literal["screen", "docker", "stdout"] 41 | 42 | 43 | DOCKER_IMAGE = "irfuzzer" 44 | FUZZERS: dict[str, FuzzerConfig] = { 45 | "aflplusplus": FuzzerConfig(extra_env={"AFL_CUSTOM_MUTATOR_ONLY": "0"}), 46 | "libfuzzer": FuzzerConfig( 47 | extra_env={ 48 | "AFL_CUSTOM_MUTATOR_ONLY": "1", 49 | "AFL_CUSTOM_MUTATOR_LIBRARY": "mutator/build/libAFLFuzzMutate.so", 50 | }, 51 | ), 52 | "irfuzzer": FuzzerConfig.getIRFuzzer(other_cmd=[" -w"]), 53 | } 54 | # Check Fuzzer and FUZZERS match. 
55 | assert list(FUZZERS.keys()) == list( 56 | typing.get_args(Fuzzer) 57 | ), "FUZZERS and Fuzzer don't match" 58 | 59 | 60 | class ExperimentConfig(NamedTuple): 61 | fuzzer: Fuzzer 62 | target: Target 63 | isel: ISel 64 | seed_dir: Path 65 | expr_root: Path 66 | time: int 67 | replicate_id: int 68 | 69 | @property 70 | def name(self) -> str: 71 | return f"{self.fuzzer}:{self.isel}:{self.target}:{self.replicate_id}" 72 | 73 | @property 74 | def matcher_table_size(self) -> Optional[int]: 75 | matcher_table_sizes = ( 76 | GISEL_MATCHER_TABLE_SIZES 77 | if self.isel == "gisel" 78 | else DAGISEL_MATCHER_TABLE_SIZES 79 | ) 80 | 81 | backend = self.target.backend 82 | 83 | if backend not in matcher_table_sizes: 84 | return None 85 | 86 | return matcher_table_sizes[backend] 87 | 88 | def get_fuzzing_env(self) -> dict[str, str]: 89 | envs = { 90 | "TRIPLE": str(self.target.triple), 91 | "CPU": self.target.cpu if self.target.cpu else "", 92 | "ATTR": ",".join(self.target.attrs), 93 | "GLOBAL_ISEL": "1" if self.isel == "gisel" else "0", 94 | "MATCHER_TABLE_SIZE": str(self.matcher_table_size), 95 | } 96 | envs.update(FUZZERS[self.fuzzer].extra_env) 97 | return envs 98 | 99 | def get_fuzzing_command(self, output_dir: str | Path) -> str: 100 | cmd = [ 101 | "$AFL/afl-fuzz", 102 | "-V", 103 | str(self.time), 104 | "-i", 105 | str(self.seed_dir), 106 | "-o", 107 | str(output_dir), 108 | ] 109 | 110 | cmd += FUZZERS[self.fuzzer].extra_cmd 111 | 112 | cmd.append("llvm-isel-afl/build/isel-fuzzing") 113 | 114 | return " ".join(cmd) 115 | 116 | def get_output_dir(self) -> Path: 117 | return self.expr_root.joinpath( 118 | self.fuzzer, 119 | self.isel, 120 | str(self.target), 121 | str(self.replicate_id), 122 | ) 123 | 124 | 125 | class Args(Tap): 126 | """ 127 | Command-line Arguments 128 | (Reference: https://github.com/swansonk14/typed-argument-parser) 129 | """ 130 | 131 | fuzzers: list[Fuzzer] = ["irfuzzer"] 132 | """the fuzzer used for fuzzing""" 133 | 134 | seeds: str 135 | """ 136 | the directory containing input seeds for fuzzing (if 'seeding-from-tests' flag is not set) 137 | or the directory to store the seeds collected from tests (if 'seeding-from-tests' flag is set) 138 | """ 139 | 140 | seeding_from_tests: bool = False 141 | """whether to use tests as seeds for fuzzing""" 142 | 143 | props_to_match: list[TargetProp] = ["triple", "cpu", "attrs"] 144 | """ 145 | the properties of a test target to match those of the fuzzing target, 146 | used to determine which tests should be included as seeds. 147 | (if 'seeding_from_tests' flag is not set, this option as no effect) 148 | """ 149 | 150 | timeout: Optional[float] = 0.1 151 | """ 152 | only include test cases that can be compiled within the specified in seconds. 153 | (if 'seeding_from_tests' flag is not set, this option has no effect) 154 | """ 155 | 156 | output: str = "./fuzzing" 157 | """the output directory""" 158 | 159 | on_exist: Literal["abort", "force", "ignore"] = "abort" 160 | """the action to take if the output directory already exists""" 161 | 162 | isel: ISel = "dagisel" 163 | """the LLVM instruction selection method to fuzz""" 164 | 165 | target_lists: Optional[list[str]] = None 166 | """ 167 | the name(s) of pre-defined list(s) of targets 168 | (see 'lib/target_lists.py' for details) 169 | (can be overriden by `--targets`) 170 | """ 171 | 172 | targets: Optional[list[str]] = None 173 | """ 174 | manually specify targets to fuzz ('tier' will be ignored). 
175 | Format for each target can be 176 | " [] [ ...]", 177 | " [] [,,...]", or 178 | "[,][,,,...]". 179 | (An attribute must start with '+' or '-' to avoid ambiguity.) 180 | """ 181 | 182 | time: str = "5m" 183 | """duration for each experiment (e.g. '100s', '30m', '2h', '1d')""" 184 | 185 | repeat: int = 1 186 | """how many times each experiemt should run""" 187 | 188 | offset: int = 0 189 | """the offset to start counting experiments""" 190 | 191 | jobs: int = MAX_SUBPROCESSES 192 | """the max number of concurrent subprocesses""" 193 | 194 | type: Optional[ClutserType] = None 195 | """the method to start fuzzing cluster""" 196 | 197 | def configure(self): 198 | self.add_argument("-j", "--jobs") 199 | self.add_argument("-o", "--output") 200 | self.add_argument("-r", "--repeat") 201 | self.add_argument("-t", "--time") 202 | 203 | def get_fuzzing_targets(self) -> list[Target]: 204 | if self.target_lists is not None: 205 | return [target for key in self.target_lists for target in TARGET_LISTS[key]] 206 | elif self.targets is not None: 207 | return [Target.parse(s) for s in self.targets] 208 | else: 209 | logging.error("Either '--tier' or '--set' has to be specified.") 210 | exit(1) 211 | 212 | def get_time_in_seconds(self) -> int: 213 | return get_time_in_seconds(self.time) 214 | 215 | 216 | def get_experiment_configs( 217 | fuzzers: list[Fuzzer], 218 | isel: ISel, 219 | targets: list[Target], 220 | time: int, 221 | repeat: int, 222 | offset: int, 223 | seed_dir: Path, 224 | expr_root: Path, 225 | seeding_from_tests: bool, 226 | props_to_match: list[TargetProp], 227 | compilation_timout_secs: Optional[float], 228 | ) -> Iterable[ExperimentConfig]: 229 | for fuzzer in fuzzers: 230 | for target in targets: 231 | expr_seed_dir = seed_dir 232 | 233 | if seeding_from_tests: 234 | expr_seed_dir = collect_seeds_from_tests( 235 | target=target, 236 | global_isel=isel == "gisel", 237 | out_dir_parent=seed_dir, 238 | props_to_match=props_to_match, 239 | dump_bc=True, 240 | symlink_to_ll=False, 241 | timeout_secs=compilation_timout_secs, 242 | ) 243 | 244 | for r in range(repeat): 245 | expr_config = ExperimentConfig( 246 | fuzzer=fuzzer, 247 | target=target, 248 | isel=isel, 249 | seed_dir=expr_seed_dir, 250 | expr_root=expr_root, 251 | time=time, 252 | replicate_id=r + offset, 253 | ) 254 | 255 | if expr_config.matcher_table_size is None: 256 | logging.warn( 257 | f"Can't find matcher table size for target '{expr_config.target}', not fuzzing" 258 | ) 259 | continue 260 | 261 | yield expr_config 262 | 263 | 264 | def combine_commands(*commands: str) -> str: 265 | return " && ".join(commands) 266 | 267 | 268 | def batch_fuzz_using_docker( 269 | experiment_configs: list[ExperimentConfig], 270 | jobs: int, 271 | ) -> None: 272 | """ 273 | Run each experiment inside a dedicated Docker container. 
274 | (Docker Python SDK Reference: https://docker-py.readthedocs.io/en/stable/) 275 | """ 276 | 277 | client = docker.client.from_env() 278 | container_queue = [] 279 | 280 | def dequeue_and_wait(): 281 | dequeued_container = container_queue.pop(0) # FIFO 282 | if dequeued_container.status != "exited": 283 | dequeued_container.wait() 284 | 285 | for i, experiment in enumerate(experiment_configs): 286 | if len(container_queue) == jobs: 287 | dequeue_and_wait() 288 | 289 | logging.info(f"Starting experiment {experiment.name}...") 290 | 291 | seed_dir = experiment.seed_dir 292 | out_dir = experiment.get_output_dir() 293 | out_dir.mkdir(parents=True) 294 | 295 | container = client.containers.run( 296 | image=DOCKER_IMAGE, 297 | command=[ 298 | "bash", 299 | "-c", 300 | combine_commands( 301 | # Docker is responsible for core binding, 302 | # if AFL_NO_AFFINITY is not set, fuzzer will fail to start 303 | "export AFL_NO_AFFINITY=1", 304 | experiment.get_fuzzing_command("/fuzzing"), 305 | f"chown -R {os.getuid()} /fuzzing/default", 306 | "mv /fuzzing/default /output/default", 307 | ), 308 | ], 309 | remove=True, 310 | detach=True, 311 | name=experiment.name.replace("+", "").replace(",", "-").replace(":", "-"), 312 | environment=experiment.get_fuzzing_env(), 313 | cpuset_cpus=str(i % jobs), # core binding 314 | tmpfs={"/fuzzing": "size=1G"}, 315 | volumes=[ 316 | f"{seed_dir.absolute()}:{seed_dir.absolute()}", 317 | f"{out_dir.absolute()}:/output", 318 | ], 319 | ) 320 | 321 | container_queue.append(container) 322 | 323 | # wait for all running containers to exit 324 | while len(container_queue) > 0: 325 | dequeue_and_wait() 326 | 327 | 328 | def batch_fuzz( 329 | experiment_configs: list[ExperimentConfig], 330 | type: ClutserType, 331 | jobs: int, 332 | ) -> None: 333 | if type == "docker": 334 | batch_fuzz_using_docker(experiment_configs, jobs) 335 | return 336 | 337 | def start_subprocess(experiment: ExperimentConfig) -> subprocess.Popen: 338 | logging.info(f"Starting experiment {experiment.name}...") 339 | 340 | out_dir = experiment.get_output_dir() 341 | out_dir.mkdir(parents=True) 342 | 343 | env = experiment.get_fuzzing_env() 344 | 345 | if type == "stdout": 346 | env["AFL_NO_UI"] = "1" 347 | 348 | fuzzing_command = experiment.get_fuzzing_command(out_dir) 349 | 350 | if type == "screen": 351 | # If using screen, this script will not be able to detect whether the fuzzing process fails early or did not 352 | # complete within the estimated time. 353 | fuzzing_command = f'screen -S "{experiment.name}" -dm bash -c "{fuzzing_command}" && sleep {experiment.time + 180}' 354 | 355 | process = subprocess.Popen( 356 | fuzzing_command, 357 | env={**os.environ, **env}, 358 | shell=True, 359 | stdin=subprocess.PIPE, 360 | stdout=subprocess.DEVNULL, 361 | ) 362 | 363 | # Sleep for 1s so aflplusplus has time to bind core. Otherwise two fuzzers may bind to the same core. 
364 | sleep(1) 365 | 366 | return process 367 | 368 | run_concurrent_subprocesses( 369 | iter=experiment_configs, 370 | subprocess_creator=start_subprocess, 371 | on_exit=lambda expr_cfg, exit_code, p: print( 372 | f"Experiment {expr_cfg.name} exited with code {exit_code}" 373 | ), 374 | max_jobs=jobs, 375 | ) 376 | 377 | 378 | def fuzz(expr_config: ExperimentConfig) -> int: 379 | out_dir = expr_config.get_output_dir() 380 | out_dir.mkdir(parents=True) 381 | 382 | process = subprocess.run( 383 | expr_config.get_fuzzing_command(out_dir), 384 | env={**os.environ, **expr_config.get_fuzzing_env()}, 385 | shell=True, 386 | ) 387 | 388 | print(f"Fuzzing process exited with code {process.returncode}.") 389 | return process.returncode 390 | 391 | 392 | def main() -> None: 393 | args = Args(underscores_to_dashes=True).parse_args() 394 | 395 | out_root = Path(args.output) 396 | if out_root.exists(): 397 | logging.info(f"{args.output} already exists.") 398 | if args.on_exist == "force": 399 | logging.info(f"'on-exist' set to {args.on_exist}, will force remove") 400 | subprocess.run(["rm", "-rf", out_root]) 401 | elif args.on_exist == "abort": 402 | logging.error(f"'on-exist' set to {args.on_exist}, won't work on it.") 403 | exit(1) 404 | 405 | expr_configs = list( 406 | get_experiment_configs( 407 | fuzzers=args.fuzzers, 408 | isel=args.isel, 409 | targets=args.get_fuzzing_targets(), 410 | time=args.get_time_in_seconds(), 411 | repeat=args.repeat, 412 | offset=args.offset, 413 | seed_dir=Path(args.seeds), 414 | expr_root=out_root, 415 | seeding_from_tests=args.seeding_from_tests, 416 | props_to_match=args.props_to_match, 417 | compilation_timout_secs=args.timeout, 418 | ) 419 | ) 420 | 421 | # Pause for some seconds before starting. 422 | start_pause = 5 423 | print( 424 | f"\nThe following {len(expr_configs)} experiment(s) will start in {start_pause} seconds:\n" 425 | ) 426 | for expr in expr_configs: 427 | print(f" - {expr.name}") 428 | print() 429 | 430 | sleep(start_pause) 431 | 432 | if len(expr_configs) == 1 and args.type is None: 433 | exit(fuzz(expr_config=expr_configs[0])) 434 | elif args.type is None: 435 | logging.error( 436 | "'--type' must be specified when running multiple fuzzing experiments" 437 | ) 438 | else: 439 | batch_fuzz( 440 | experiment_configs=expr_configs, 441 | type=args.type, 442 | jobs=args.jobs, 443 | ) 444 | 445 | 446 | if __name__ == "__main__": 447 | logging.basicConfig() 448 | logging.getLogger().setLevel(logging.INFO) 449 | main() 450 | --------------------------------------------------------------------------------