├── .gitignore ├── utils └── app │ ├── app │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── api.py │ │ ├── status.py │ │ ├── delete.py │ │ ├── add.py │ │ ├── compare.py │ │ └── search.py │ ├── core │ │ ├── __init__.py │ │ ├── schema.py │ │ └── genome_service.py │ ├── defaults.py │ └── main.py │ └── uwsgi.ini ├── codegenome ├── __init__.py ├── ir │ ├── __init__.py │ ├── canon.py │ └── ir.py ├── genes │ ├── __init__.py │ ├── base.py │ ├── utils.py │ └── sigmal.py ├── kg │ └── __init__.py ├── lifters │ ├── __init__.py │ ├── base.py │ └── retdec.py ├── pipelines │ ├── base.py │ ├── __init__.py │ └── retdecsigmal.py ├── utils.py ├── _file_format.py └── _defaults.py ├── docs └── _static │ ├── overview.png │ └── code_genome.png ├── .gitmodules ├── .env.defaults ├── docker ├── install_cleanup.sh ├── install_llvm_pass.sh ├── install_pyleargist.sh ├── llvm-gcc-fix.patch ├── llvmlite-settypename.patch ├── install_retdec.sh ├── install_all_local.sh ├── install_llvmlite.sh ├── Dockerfile.dev ├── Dockerfile └── decompiler-config.json ├── requirements.txt ├── tests ├── p │ ├── build_elf.sh │ ├── g.c │ ├── build.sh │ └── p.c ├── unit_tests.py ├── test_sigmal.py ├── test_data.py ├── test_ir.py ├── test_lifters.py ├── test_api_core.py └── test_kg.py ├── CODEOWNERS ├── .pre-commit-config.yaml ├── scripts ├── bin2bc_dir ├── fmt.sh ├── build_hash_map.py ├── run_service.py ├── build_gkg.py ├── bin2bc └── cg ├── setup.py ├── README.md ├── Makefile ├── CONTRIBUTING.md └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /utils/app/app/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /utils/app/app/api/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /utils/app/app/core/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /codegenome/__init__.py: -------------------------------------------------------------------------------- 1 | from .kg import GenomeKG 2 | -------------------------------------------------------------------------------- /codegenome/ir/__init__.py: -------------------------------------------------------------------------------- 1 | from .ir import IRBinary 2 | -------------------------------------------------------------------------------- /codegenome/genes/__init__.py: -------------------------------------------------------------------------------- 1 | from .sigmal import SigmalGene 2 | -------------------------------------------------------------------------------- /codegenome/kg/__init__.py: -------------------------------------------------------------------------------- 1 | from .kg import BinGene, GenomeKG 2 | -------------------------------------------------------------------------------- /codegenome/lifters/__init__.py: -------------------------------------------------------------------------------- 1 | from .retdec import CGRetdec 2 | -------------------------------------------------------------------------------- /docs/_static/overview.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/code-genome/codegenome/HEAD/docs/_static/overview.png
--------------------------------------------------------------------------------
/docs/_static/code_genome.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code-genome/codegenome/HEAD/docs/_static/code_genome.png
--------------------------------------------------------------------------------
/utils/app/uwsgi.ini:
--------------------------------------------------------------------------------
1 | [uwsgi]
2 | module = app.main
3 | callable = app
4 | enable-threads = true
5 | master = true
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "deps/canon_pass"]
2 | path = deps/canon_pass
3 | url = https://github.com/code-genome/canon_pass.git
--------------------------------------------------------------------------------
/codegenome/pipelines/base.py:
--------------------------------------------------------------------------------
1 | class CGPipeline(object):
2 |     def process_file(self, file_path):
3 |         raise NotImplementedError()
--------------------------------------------------------------------------------
/codegenome/lifters/base.py:
--------------------------------------------------------------------------------
1 | class CGLifterBase(object):
2 |     def process_file(self, file_path, output_path=None):
3 |         raise NotImplementedError()
--------------------------------------------------------------------------------
/codegenome/genes/base.py:
--------------------------------------------------------------------------------
1 | class CGGeneBase(object):
2 |     def from_data(self, data):
3 |         raise NotImplementedError()
4 | 
5 |     def from_bitcode(self, data):
6 |         raise NotImplementedError()
--------------------------------------------------------------------------------
/.env.defaults:
--------------------------------------------------------------------------------
1 | CG_DATA_ROOT_DIR="~/.cg"
2 | TMP_UPLOAD_DIR="/tmp/"
3 | GC_SERVICE_LOG_PATH="/tmp/cg.rest.log"
4 | CG_DEBUG=40 # python logging level int (CRITICAL = 50, ERROR = 40, WARNING = 30, INFO = 20, DEBUG = 10)
--------------------------------------------------------------------------------
/codegenome/pipelines/__init__.py:
--------------------------------------------------------------------------------
1 | def get_pipeline_by_version(gene_version, **kwargs):
2 |     if gene_version == "genes_v0_0_1":
3 |         from .retdecsigmal import RetdecSigmalV1
4 | 
5 |         return RetdecSigmalV1(**kwargs)
--------------------------------------------------------------------------------
/docker/install_cleanup.sh:
--------------------------------------------------------------------------------
1 | # cleanup
2 | rm -f /tmp/llvmlite-settypename.patch
3 | rm -rf /tmp/llvm_pass
4 | rm -rf /tmp/tmp_src
5 | rm -rf /tmp/*
6 | apt-get remove -y \
7 |     git g++ make cmake libcurl4-openssl-dev \
8 |     python3-dev
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.23.4
2 | scikit-learn
3 | Pillow==9.3.0
4 | scipy==1.9.3
5 | pefile
6 | enum34
7 | joblib
8 | flask==2.1.3
9 | flask-restx==0.5.1
10 | werkzeug==2.1.2
11 | sqlitedict
12 | jsonlines
13 | python-dotenv
--------------------------------------------------------------------------------
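The base classes and the version-dispatch factory above are the framework's extension points: a lifter (`CGLifterBase`) turns a binary into LLVM bitcode, a gene extractor (`CGGeneBase`) turns bitcode into gene vectors, and a pipeline (`CGPipeline`) chains them. A minimal usage sketch, assuming only the interfaces shown above; the input path is illustrative and the exact return value of `process_file` depends on the concrete pipeline:

```python
# Hedged sketch, not a file from this repo: wiring the pieces above together.
from codegenome.pipelines import get_pipeline_by_version

pipeline = get_pipeline_by_version("genes_v0_0_1")  # -> RetdecSigmalV1
result = pipeline.process_file("/bin/ls")  # lift -> canonicalize -> extract genes
```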
/tests/p/build_elf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | P=p
3 | A='elf'
4 | gcc -O0 -o $P'.gcc.0.'$A $P'.c'
5 | gcc -O3 -o $P'.gcc.3.'$A $P'.c'
6 | clang -O0 -o $P'.clang.0.'$A $P'.c'
7 | clang -O3 -o $P'.clang.3.'$A $P'.c'
8 | clang -Oz -o $P'.clang.z.'$A $P'.c'
9 | 
10 | 
--------------------------------------------------------------------------------
/docker/install_llvm_pass.sh:
--------------------------------------------------------------------------------
1 | TMP=/tmp/llvm_pass
2 | [ ! -e "$TMP" ] && cp -r ../deps/canon_pass "$TMP"/llvm_pass
3 | 
4 | cd "$TMP" && \
5 | PATH=/opt/llvm/bin/:$PATH make && \
6 | cp build/libcanonicalization-pass.so /opt/llvm/lib/libcanonicalization-pass.so && \
7 | cd /tmp && \
8 | rm -rf "$TMP"
--------------------------------------------------------------------------------
/utils/app/app/defaults.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | # overrides for the REST service
4 | DEFAULT_CALCULATION_METHOD = "jaccard_distance"
5 | DEFAULT_COMPARE_METHOD = "genes_v1_3_1.jaccard_distance_w"
6 | DEFAULT_OUTPUT_DETAIL = "simple"
7 | TMP_DIR_PREFIX = "cg_temp_upload_"
8 | TMP_UPLOAD_DIR = os.environ.get("TMP_UPLOAD_DIR", "/tmp/")
--------------------------------------------------------------------------------
/docker/install_pyleargist.sh:
--------------------------------------------------------------------------------
1 | # install pyleargist
2 | apt-get install -y \
3 |     git \
4 |     python3-dev \
5 |     libfftw3-dev ; \
6 | git clone https://github.com/vertexcover-io/pyleargist.git; \
7 | cd pyleargist; \
8 | pip3 install . ; \
9 | apt-get remove -y \
10 |     libfftw3-dev; \
11 | cd .. ; \
12 | rm -rf pyleargist
--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | #####################################################
2 | #
3 | # List of approvers for codegenome repository
4 | #
5 | #####################################################
6 | #
7 | # Learn about CODEOWNERS file format:
8 | # https://help.github.com/en/articles/about-code-owners
9 | #
10 | 
11 | * @dhilung @souljang
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 |   - repo: https://github.com/pre-commit/mirrors-prettier
3 |     rev: v2.1.2
4 |     hooks:
5 |       - id: prettier
6 |   - repo: https://github.com/psf/black
7 |     rev: 22.3.0
8 |     hooks:
9 |       - id: black
10 |   - repo: https://github.com/PyCQA/isort
11 |     rev: 5.11.5
12 |     hooks:
13 |       - id: isort
--------------------------------------------------------------------------------
/scripts/bin2bc_dir:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # usage: bin2bc_dir <input_dir> <output_dir>
3 | 
4 | SRC="$1"
5 | DST="$2"
6 | if [ "$SRC" = "" ]; then
7 |     echo Usage: $0 input_dir output_dir
8 |     exit 1
9 | fi
10 | if [ "$DST" = "" ]; then
11 |     DST="."
12 | else 13 | mkdir -p $DST 14 | fi 15 | 16 | find "$SRC"|xargs -n 1 -P $(nproc --all) ./bin2bc --output "$DST" --keep_dsm --keep_ll 17 | -------------------------------------------------------------------------------- /codegenome/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | class ProfileLog: 5 | def __init__(self, logger, name=""): 6 | self.name = name 7 | self.logger = logger 8 | 9 | def __enter__(self): 10 | self.start = time.time() 11 | 12 | def __exit__(self, type, value, traceback): 13 | self.t = time.time() - self.start 14 | self.logger.info(self.name + " time: %f" % self.t) 15 | -------------------------------------------------------------------------------- /tests/unit_tests.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | import unittest 5 | 6 | logging.basicConfig( 7 | filename="/tmp/cg_unit_tests.log", 8 | level=logging.DEBUG, 9 | format="%(asctime)s, %(name)s, %(levelname)s, %(message)s", 10 | datefmt="%m/%d/%Y %H:%M:%S", 11 | ) 12 | 13 | from test_ir import * 14 | from test_kg import * 15 | from test_lifters import * 16 | from test_sigmal import * 17 | 18 | if __name__ == "__main__": 19 | unittest.main() 20 | -------------------------------------------------------------------------------- /utils/app/app/api/api.py: -------------------------------------------------------------------------------- 1 | from flask_restx import Api 2 | 3 | from ..main import app 4 | 5 | 6 | def check_event_loop(): 7 | pass 8 | 9 | 10 | api = Api( 11 | app, 12 | version="0.0.1", 13 | title="Code Genome", 14 | description="Code Genome APIs", 15 | ) 16 | 17 | from . import add # noqa 18 | from . import compare # noqa 19 | from . import delete # noqa 20 | from . import search # noqa 21 | from . 
import status # noqa
22 | 
23 | # import config
--------------------------------------------------------------------------------
/tests/p/g.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | int g;
3 | int* gp = &g;
4 | int vec[]= {1,2,3};
5 | int* intptr;
6 | 
7 | int f0(int a){
8 |     int tmp = a;
9 |     tmp = a + *gp;
10 |     return tmp;
11 | }
12 | 
13 | int f1(int a)
14 | {
15 |     int tmp;
16 |     tmp = a + vec[0];
17 |     return tmp;
18 | }
19 | 
20 | void f2(int a){
21 |     a = f0(a);
22 |     printf("%d\n", a);
23 | }
24 | 
25 | int main(int argc, char* argv[])
26 | {
27 |     int a = atoi(argv[1]);
28 |     int x = f0(a);
29 |     int y = f1(x);
30 |     f2(y);
31 | }
--------------------------------------------------------------------------------
/tests/p/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | P=$1
4 | A='unknown'
5 | if [ "$(uname)" == "Linux" ]; then
6 |     A='elf'
7 |     llvm_dis=$(which llvm-dis)
8 |     if [ "$llvm_dis" == "" ]; then
9 |         llvm_dis='/opt/llvm/bin/llvm-dis'
10 |     fi
11 | fi
12 | 
13 | if [ "$(uname)" == "Darwin" ]; then
14 |     A='mac'
15 |     llvm_dis='/usr/local/Cellar/llvm/9.0.0/bin/llvm-dis'
16 | fi
17 | 
18 | #echo gcc -O0 -o $P'.gcc.0.'$A $P'.c'
19 | #echo gcc -O3 -o $P'.gcc.3.'$A $P'.c'
20 | echo clang -O0 -o $P'.clang.0.'$A $P'.c'
21 | echo clang -O3 -o $P'.clang.3.'$A $P'.c'
22 | echo clang -O0 -emit-llvm -o $P'.clang.0.bc' -c $P'.c'
23 | echo $llvm_dis $P'.clang.0.bc'
24 | echo clang -O3 -emit-llvm -o $P'.clang.3.bc' -c $P'.c'
25 | echo $llvm_dis $P'.clang.3.bc'
26 | 
27 | 
28 | 
--------------------------------------------------------------------------------
/scripts/fmt.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # pip install pre-commit
3 | pre-commit install-hooks
4 | pre-commit run --all-files
5 | RETURN_CODE=$?
6 | 
7 | function echoWarning() {
8 |     LIGHT_YELLOW='\033[1;33m'
9 |     NC='\033[0m' # No Color
10 |     echo -e "${LIGHT_YELLOW}${1}${NC}"
11 | }
12 | 
13 | if [ "$RETURN_CODE" -ne 0 ]; then
14 |     if [ "${CI}" != "true" ]; then
15 |         echoWarning "☝️ This appears to have failed, but actually your files have been formatted."
16 |         echoWarning "Make a new commit with these changes before making a pull request."
17 |     else
18 |         echoWarning "This test failed because your code isn't formatted correctly."
19 |         echoWarning 'Locally, run `make run fmt`; it will appear to fail, but it changes files.'
20 |         echoWarning "Add the changed files to your commit and this stage will pass."
21 |     fi
22 | 
23 |     exit $RETURN_CODE
24 | fi
25 | 
--------------------------------------------------------------------------------
/docker/llvm-gcc-fix.patch:
--------------------------------------------------------------------------------
1 | diff --git a/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h b/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h
2 | index 9e3478e9fd29..efd55339418b 100644
3 | --- a/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h
4 | +++ b/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h
5 | @@ -4,6 +4,8 @@
6 |  #include "llvm/Demangle/Compiler.h"
7 |  #include "llvm/Demangle/StringView.h"
8 |  #include <array>
9 | +#include <cstdint>
10 | +#include <string>
11 | 
12 |  class OutputStream;
13 | 
14 | diff --git a/llvm/utils/benchmark/src/benchmark_register.h b/llvm/utils/benchmark/src/benchmark_register.h
15 | index 0705e219f2fa..6001fb8e0e48 100644
16 | --- a/llvm/utils/benchmark/src/benchmark_register.h
17 | +++ b/llvm/utils/benchmark/src/benchmark_register.h
18 | @@ -2,6 +2,7 @@
19 |  #define BENCHMARK_REGISTER_H
20 | 
21 |  #include <vector>
22 | +#include <limits>
23 | 
24 |  #include "check.h"
25 | 
26 | 
--------------------------------------------------------------------------------
/scripts/build_hash_map.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import os
3 | import pickle
4 | import re
5 | import shutil
6 | import subprocess
7 | import sys
8 | 
9 | """
10 | Build sha256 hashmap
11 | 
12 | usage:
13 |     python build_hash_map.py src_path output_path
14 | """
15 | 
16 | PAT_EXECS = "ELF|PE32|DLL"
17 | 
18 | 
19 | def hashmap(srcd, output=None):
20 |     hmap = {}
21 |     # check_output() returns bytes in Python 3; decode before splitting
22 |     flist = subprocess.check_output(["find", srcd]).decode().split("\n")
23 |     for fn in flist:
24 |         if not fn:
25 |             continue
26 |         ft = subprocess.check_output(["file", "-b", fn]).decode().strip()
27 |         m = re.findall(PAT_EXECS, ft, re.IGNORECASE)
28 |         if len(m) > 0:
29 |             bin_id = hashlib.sha256(open(fn, "rb").read()).hexdigest()
30 |             if bin_id in hmap:
31 |                 hmap[bin_id].append(fn)
32 |             else:
33 |                 hmap[bin_id] = [fn]
34 |     if output:
35 |         # pickle requires a binary-mode file handle
36 |         with open(output, "wb") as f:
37 |             pickle.dump(hmap, f)
38 |     return hmap
39 | 
40 | 
41 | hashmap(sys.argv[1], sys.argv[2])
--------------------------------------------------------------------------------
/scripts/run_service.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import json
3 | import logging
4 | import os
5 | import sys
6 | import dotenv
7 | dotenv.load_dotenv()
8 | 
9 | logfn = os.environ.get("GC_SERVICE_LOG_PATH", "/tmp/cg_dev_run.log")
10 | if not os.path.exists(os.path.dirname(logfn)):
11 |     os.makedirs(os.path.dirname(logfn))
12 | 
13 | logging.basicConfig(
14 |     filename=logfn,
15 |     format="%(asctime)s, %(name)s, %(levelname)s, %(message)s",
16 |     datefmt="%m/%d/%Y %H:%M:%S",
17 |     level=logging.DEBUG,
18 | )
19 | 
20 | 
21 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../"))
22 | sys.path.insert(
23 |     0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils/app/")
24 | )
25 | from app.main import *  # noqa
26 | 
27 | sgkg_log = logging.getLogger("codegenome")
28 | sgkg_log.setLevel(logging.DEBUG)
29 | 
30 | host = "127.0.0.1"
31 | port = 5001
32 | 
33 | if len(sys.argv) > 1:
34 |     host = sys.argv[1]
35 | if len(sys.argv) > 2:
36 |     port = int(sys.argv[2])
37 | app.run(host=host, debug=True, port=port)
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from setuptools import find_namespace_packages, setup
3 | from setuptools.command.develop import develop
4 | from setuptools.command.install import install
5 | 
6 | with open("README.md", "r") as fh:
7 |     long_description = fh.read()
8 | 
9 | with open("requirements.txt") as f:
10 |     requirements = f.read().splitlines()
11 | 
12 | 
13 | class PostInstallDependencies(install):
14 |     def run(self):
15 |         install.run(self)
16 | 
17 | 
18 | setup(
19 |     name="codegenome",
20 |     version="0.0.1",
21 |     description="Code Genome framework",
22 |     long_description=long_description,
23 |     url="https://research.ibm.com/",
24 |     author="IBM Research",
25 |     author_email="dkirat@us.ibm.com",
26 |     classifiers=[
27 |         "Programming Language :: Python :: 3",
28 |         "Operating System :: OS Independent",
29 |     ],
30 |     packages=find_namespace_packages(include=["codegenome*"]),
31 |     scripts=["scripts/cg"],
32 |     python_requires=">=3.8",
33 |     install_requires=requirements,
34 |     cmdclass={"install": PostInstallDependencies},
35 | )
36 | 
--------------------------------------------------------------------------------
/utils/app/app/main.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import logging.config
3 | import os
4 | 
5 | from flask import Flask
6 | from flask_restx import Api, Resource, fields
7 | from werkzeug.middleware.proxy_fix import ProxyFix
8 | import dotenv
9 | dotenv.load_dotenv()
10 | 
11 | # logging.config.fileConfig('logging.conf')
12 | logging.basicConfig(filename=os.environ.get("GC_SERVICE_LOG_PATH", "/tmp/cg.rest.log"))
13 | log = logging.getLogger("codegenome.rest")
14 | log.setLevel(int(os.environ.get("CG_DEBUG", logging.ERROR)))
15 | ch = logging.StreamHandler()
16 | ch.setFormatter(logging.Formatter("%(asctime)s, %(name)s, %(levelname)s, %(message)s"))
17 | log.addHandler(ch)
18 | 
19 | 
20 | app = Flask(__name__)
21 | app.wsgi_app = ProxyFix(app.wsgi_app)
22 | 
23 | from .core.genome_service import create_genome_service  # noqa
24 | 
25 | kgs = create_genome_service()
26 | from .api import api  # noqa
27 | 
28 | if __name__ == "__main__":
29 |     # Only for debugging while developing
30 |     # app.run(host="127.0.0.1", debug=True, port=5000)
31 |     app.run(host="0.0.0.0", debug=True)
--------------------------------------------------------------------------------
/docker/llvmlite-settypename.patch:
--------------------------------------------------------------------------------
1 | diff --git a/ffi/value.cpp b/ffi/value.cpp
2 | index 05c67b6..9327faa 100644
3 | --- a/ffi/value.cpp
4 | +++ b/ffi/value.cpp
5 | @@ -408,6 +408,18 @@ LLVMPY_GetTypeName(LLVMTypeRef type)
6 |      return LLVMPY_CreateString("");
7 |  }
8 | 
9 | +API_EXPORT(void)
10 | +LLVMPY_SetTypeName(LLVMTypeRef type, const char *Name)
11 | +{
12 | +    // try to convert to a struct type, works for other derived
13 | +    // types too
14 | +    llvm::Type* unwrapped = llvm::unwrap(type);
15 | +    llvm::StructType* ty = llvm::dyn_cast<llvm::StructType>(unwrapped);
16 | +    if (ty && !ty->isLiteral()) {
17 | +        ty->setName(Name);
18 | +    }
19 | +}
20 | +
21 |  API_EXPORT(bool)
22 |  LLVMPY_TypeIsPointer(LLVMTypeRef type)
23 |  {
24 | diff --git a/llvmlite/binding/value.py b/llvmlite/binding/value.py
25 | index 4e21b3e..b13cdba 100644
26 | --- a/llvmlite/binding/value.py
27 | +++ b/llvmlite/binding/value.py
28 | @@ -53,6 +53,10 @@ class TypeRef(ffi.ObjectRef):
29 |      """
30 |      return ffi.ret_string(ffi.lib.LLVMPY_GetTypeName(self))
31 | 
32 | +    @name.setter
33 | +    def name(self, val):
34 | +        ffi.lib.LLVMPY_SetTypeName(self, _encode_string(val))
35 | +
36 | 
@property 37 | def is_pointer(self): 38 | """ 39 | -------------------------------------------------------------------------------- /docker/install_retdec.sh: -------------------------------------------------------------------------------- 1 | # install retdec 2 | pushd $(pwd) 3 | PREFIX=/opt/cg/retdec 4 | mkdir -p $PREFIX 5 | VER=$(cat /etc/issue|cut -d' ' -f2) 6 | 7 | if [[ $VER < "22" ]]; then 8 | #ubuntu version < 22 9 | #BIN_URL=https://github.com/avast/retdec/releases/download/v4.0/retdec-v4.0-ubuntu-64b.tar.xz does not work 10 | 11 | DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential cmake git openssl libssl-dev python3 autoconf automake libtool pkg-config m4 zlib1g-dev upx doxygen graphviz 12 | mkdir -p /tmp/retdec 13 | cd /tmp/retdec 14 | git clone https://github.com/avast/retdec.git && \ 15 | cd retdec && \ 16 | git checkout 3435bc827d2c2c5da91dfb84509af0c034ee22b5 && \ 17 | mkdir build && \ 18 | cd build && \ 19 | cmake .. -DCMAKE_INSTALL_PREFIX=$PREFIX -DCMAKE_LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/7/ && \ 20 | make -j 8 && \ 21 | make install 22 | 23 | rm -rf /tmp/retdec 24 | 25 | else 26 | BIN_URL=https://github.com/avast/retdec/releases/download/v5.0/RetDec-v5.0-Linux-Release.tar.xz 27 | wget $BIN_URL -O /tmp/retdec.tar.xz && \ 28 | tar -xJf /tmp/retdec.tar.xz -C $PREFIX/ ;\ 29 | rm /tmp/retdec.tar.xz 30 | fi 31 | 32 | # replace with our config 33 | popd 34 | cp decompiler-config.json $PREFIX'/share/retdec/decompiler-config.json' 35 | echo "Retdec installed. Please do: export RETDEC_PATH=$PREFIX" 36 | -------------------------------------------------------------------------------- /tests/test_sigmal.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | import unittest 5 | 6 | import numpy as np 7 | 8 | logging.basicConfig( 9 | filename="/tmp/cg-test-sigmal.log", 10 | level=logging.DEBUG, 11 | format="%(asctime)s, %(name)s, %(levelname)s, %(message)s", 12 | datefmt="%m/%d/%Y %H:%M:%S", 13 | ) 14 | 15 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../")) 16 | 17 | import test_data as data 18 | 19 | 20 | class TestSigmal(unittest.TestCase): 21 | def setUp(self): 22 | from codegenome.genes import SigmalGene 23 | from codegenome.ir import IRBinary 24 | 25 | self.irb = IRBinary(data.global_ir, ll=True) 26 | self.sm = SigmalGene() 27 | pass 28 | 29 | def test_gene(self): 30 | bc = self.irb.fs["f0"].get_bc() 31 | g1 = self.sm.from_bitcode(bc) 32 | g2 = self.sm.from_bitcode(bc, gene_type="sigmal") 33 | g3 = self.sm.from_bitcode(bc, gene_type="sigmal2") 34 | g4 = self.sm.from_bitcode(bc, gene_type="sigmal2b") 35 | g5 = self.sm.from_bitcode(bc, gene_type="func_only") 36 | g6 = self.sm.from_data(bc) 37 | 38 | self.assertTrue(np.array_equal(g1, g2)) 39 | self.assertTrue(np.array_equal(g1, g6)) 40 | self.assertFalse(np.array_equal(g1, g3)) 41 | self.assertFalse(np.array_equal(g3, g4)) 42 | self.assertFalse(np.array_equal(g4, g5)) 43 | 44 | 45 | if __name__ == "__main__": 46 | unittest.main(verbosity=2) 47 | -------------------------------------------------------------------------------- /tests/test_data.py: -------------------------------------------------------------------------------- 1 | global_c = """ 2 | int g; 3 | int* gp = &g; 4 | int vec[]= {1,2,3}; 5 | int* intptr; 6 | 7 | int f0(int a){ 8 | int tmp = a; 9 | tmp = a + *gp; 10 | return tmp; 11 | } 12 | 13 | int f1(int a) 14 | { 15 | int tmp; 16 | tmp = a + vec[0]; 17 | f0(a); 18 | return tmp; 19 | } 
20 | """ 21 | 22 | global_ir = """ 23 | @g = global i32 123, align 4 24 | @gp = global i32* @g, align 8 25 | @vec = global [3 x i32] [i32 1, i32 2, i32 3], align 4 26 | @intptr = common global i32* null, align 8 27 | 28 | define i32 @f0(i32 %a) { 29 | %1 = load i32*, i32** @gp, align 8 30 | %2 = load i32, i32* %1, align 4 31 | %3 = add nsw i32 %2, %a 32 | ret i32 %3 33 | } 34 | 35 | define i32 @f1(i32) { 36 | %2 = load i32, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @vec, i64 0, i64 0), align 4 37 | %3 = add nsw i32 %2, %0 38 | ret i32 %3 39 | } 40 | """ 41 | 42 | type_ir = """ 43 | %type1 = type { 44 | i32, 45 | i32, 46 | double 47 | } 48 | %type2 = type { 49 | i32, 50 | i32, 51 | %type1 52 | } 53 | 54 | define i32* @f0(i32 %a) { 55 | 56 | %1 = alloca %type2 57 | %2 = getelementptr %type2, %type2* %1, i32 0, i32 1 58 | ret i32* %2 59 | } 60 | """ 61 | 62 | externs_ir = """ 63 | declare i32 @printf(i8*, ...) local_unnamed_addr 64 | 65 | define i32 @local_func(i32 %x) { 66 | ret i32 %x 67 | } 68 | 69 | define i32 @f0(i32 %a, i8* %format) { 70 | %1 = call i32 @local_func(i32 %a) 71 | %2 = tail call i32 (i8*, ...) @printf(i8* %format) 72 | ret i32 %1 73 | } 74 | """ 75 | -------------------------------------------------------------------------------- /utils/app/app/api/status.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from flask_restx import Resource, fields 4 | 5 | from ..core.genome_service import (API_STATE_EMPTY_RESULT, 6 | API_STATE_RESULT_NOT_READY) 7 | from ..main import kgs 8 | from .api import api 9 | 10 | ns = api.namespace("api/v1/status", description="Service status") 11 | 12 | logger = logging.getLogger("codegenome.rest") 13 | 14 | 15 | @ns.route("/") 16 | class ConfigStatus(Resource): 17 | """Service status.""" 18 | 19 | def get(self): 20 | """Service status""" 21 | try: 22 | return kgs.status() 23 | except KeyError as e: 24 | api.abort(404, f"Failed to retrieve service status.") 25 | 26 | 27 | @ns.route("/job/") 28 | @ns.response(200, "Final result") 29 | @ns.response(202, "Request received. Result not ready. Must retry.") 30 | @ns.response(204, "Result empty") 31 | @ns.response(404, "Job id not found") 32 | class Job(Resource): 33 | """Job status""" 34 | 35 | def get(self, job_id): 36 | """Get Job status.""" 37 | 38 | try: 39 | ret = kgs.check_job(job_id) 40 | if ret.get("status") == API_STATE_RESULT_NOT_READY: 41 | return ret, 202 42 | elif ret.get("status") == API_STATE_EMPTY_RESULT: 43 | if ret.get("query") is None: # root search node not found. 
44 | return ret, 404 45 | 46 | return ret 47 | except Exception as e: 48 | api.abort(500, f"Exception: {e}") 49 | -------------------------------------------------------------------------------- /docker/install_all_local.sh: -------------------------------------------------------------------------------- 1 | # local install 2 | apt-get update && \ 3 | apt-get install -y \ 4 | python3 \ 5 | python3-pip \ 6 | python-is-python3 \ 7 | wget 8 | 9 | cp -f decompiler-config.json /tmp/ 10 | cp -f install_retdec.sh /tmp/ 11 | cp -f install_pyleargist.sh /tmp/ 12 | 13 | cp -f llvmlite-settypename.patch /tmp/ 14 | cp -f llvm-gcc-fix.patch /tmp/ 15 | cp -f install_llvmlite.sh /tmp/ 16 | 17 | cp -rf ../llvm_pass /tmp/llvm_pass 18 | cp -f install_llvm_pass.sh /tmp/ 19 | 20 | export PATH="/opt/cg/retdec/bin:$PATH" 21 | export RETDEC_PATH="/opt/cg/retdec" 22 | 23 | cleanup() { 24 | cd /tmp 25 | rm -f /tmp/llvmlite-settypename.patch 26 | rm -f /tmp/llvm-gcc-fix.patch 27 | rm -rf /tmp/llvm_pass 28 | rm -rf /tmp/tmp_src 29 | rm -f /tmp/decompiler-config.json 30 | rm -f /tmp/install_retdec.sh 31 | rm -f /tmp/install_pyleargist.sh 32 | rm -f /tmp/install_llvmlite.sh 33 | rm -f /tmp/install_llvm_pass.sh 34 | } 35 | 36 | cd /tmp && bash install_retdec.sh && \ 37 | cd /tmp && bash install_pyleargist.sh && \ 38 | cd /tmp && bash install_llvmlite.sh && \ 39 | cd /tmp && bash install_llvm_pass.sh 40 | 41 | # Check the exit status of the last command 42 | if [ $? -ne 0 ]; then 43 | echo "Installation failed. Do you want to cleanup files from /tmp? (y/n)" 44 | read answer 45 | 46 | case ${answer:0:1} in 47 | y|Y ) 48 | echo "Cleaning up.." 49 | cleanup 50 | ;; 51 | * ) 52 | echo "Exiting..." 53 | exit 1 54 | ;; 55 | esac 56 | else 57 | cleanup 58 | fi 59 | 60 | -------------------------------------------------------------------------------- /utils/app/app/api/delete.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from flask_restx import Resource, fields 4 | from flask import request 5 | from flask_restx import Resource 6 | from werkzeug.datastructures import FileStorage 7 | import copy 8 | import traceback 9 | import logging 10 | import tempfile 11 | 12 | from .api import api 13 | from ..main import kgs 14 | from ..core.genome_service import API_STATE_RESULT_NOT_READY, API_STATE_EMPTY_RESULT, API_STATE_ERROR 15 | from ..defaults import * 16 | 17 | logger = logging.getLogger('codegenome.rest') 18 | ns = api.namespace("api/v1/delete", description="Delete from KG.") 19 | 20 | upload_parser = api.parser() 21 | 22 | file_args = api.model( 23 | "file_args", 24 | {"file_id": fields.String( 25 | required=True, default='', 26 | description="The identifier of the file")}) 27 | 28 | 29 | @ns.route("/file") 30 | @ns.response(200, "Success") 31 | @ns.response(404, "Object id not found") 32 | class DeleteFile(Resource): 33 | """Delete by `file_id`.""" 34 | 35 | @ns.expect(file_args) 36 | def post(self): 37 | """Delete by `file_id`.""" 38 | try: 39 | args = dict(api.payload) 40 | ret = kgs.delete_file(args.get('file_id')) 41 | if ret.get('status') == API_STATE_RESULT_NOT_READY: 42 | return ret, 202 43 | elif ret.get('status') == API_STATE_EMPTY_RESULT: 44 | return ret, 404 45 | elif ret.get('status') == API_STATE_ERROR: 46 | return ret, 500 47 | return ret 48 | except Exception as e: 49 | api.abort(500, f"Exception: {e}") 50 | -------------------------------------------------------------------------------- /codegenome/_file_format.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | import joblib 4 | 5 | _GKG_FILE_VERSION = "0.3" 6 | _CANON_FILE_VERSION_ = "0.3" 7 | _GENE_FILE_VERSION_ = "0.3" 8 | 9 | 10 | def get_file_meta(file_path, file_size=None): 11 | if file_size is None: 12 | file_size = os.path.getsize(file_path) 13 | 14 | return {"file_path": file_path, "file_size": file_size} 15 | 16 | 17 | def prep_gkg_file(gkg): 18 | file_content = { 19 | "type": "gkg", 20 | "version": _GKG_FILE_VERSION, 21 | "data": gkg.serialize(), 22 | } 23 | return file_content 24 | 25 | 26 | def read_gkg_file(path): 27 | data = joblib.load(path) 28 | assert data["type"] == "gkg" 29 | assert data["version"] == _GKG_FILE_VERSION 30 | return data 31 | 32 | 33 | def prep_gene_file(genes, binid, file_meta): 34 | file_content = { 35 | "type": "gene", 36 | "version": _GENE_FILE_VERSION_, 37 | "binid": binid, 38 | "genes": genes, 39 | "file_meta": file_meta, 40 | } 41 | return file_content 42 | 43 | 44 | def read_gene_file(path): 45 | data = joblib.load(path) 46 | assert data["type"] == "gene" 47 | assert data["version"] == _GENE_FILE_VERSION_ 48 | return data 49 | 50 | 51 | def prep_canon_file(ir_bin, file_meta): 52 | file_content = { 53 | "type": "canon", 54 | "version": _CANON_FILE_VERSION_, 55 | "binid": ir_bin._bin_id, 56 | "funcs": ir_bin.serialize(), 57 | "file_meta": file_meta, 58 | } 59 | return file_content 60 | 61 | 62 | def read_canon_file(path): 63 | data = joblib.load(path) 64 | assert data["type"] == "canon" 65 | assert data["version"] == _CANON_FILE_VERSION_ 66 | return data 67 | -------------------------------------------------------------------------------- /utils/app/app/core/schema.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import uuid 3 | 4 | import numpy as np 5 | 6 | DEFAULT_ID_DELIMITER = ":" # TODO move to a separate common module 7 | 8 | 9 | def get_type_from_id(id_): 10 | if type(id_) == str: 11 | id_comp = id_.split(DEFAULT_ID_DELIMITER) 12 | if len(id_comp) > 1: 13 | return id_comp[0] 14 | return None 15 | 16 | 17 | class KGNodeTypes: 18 | file = "file" 19 | gene = "gene" 20 | cache = "cache" 21 | stat = "stat" 22 | 23 | 24 | class KGNodeID: 25 | @staticmethod 26 | def _mk_id(_type, _hash): 27 | # return f"{_type}:{_hash}" 28 | return _hash 29 | 30 | @staticmethod 31 | def split(_id): 32 | return str(_id).split(":") 33 | 34 | @staticmethod 35 | def id(_type, data=None, hash=None, file_path=None, return_hash=False): 36 | if hash is None: 37 | if data is not None: 38 | if type(data) != bytes: 39 | # forced conversion! 
40 | data = str(data).strip().lower().encode("utf-8") 41 | hash = hashlib.sha256(data).hexdigest() 42 | 43 | elif file_path is not None: 44 | with open(file_path, "rb") as f: 45 | hash = hashlib.sha256(f.read()).hexdigest() 46 | else: 47 | raise Exception("parameter missing") 48 | 49 | if return_hash: 50 | return KGNodeID._mk_id(_type, hash), hash 51 | return KGNodeID._mk_id(_type, hash) 52 | 53 | @staticmethod 54 | def file_id(file_data=None, file_hash=None, file_path=None, return_hash=False): 55 | return KGNodeID.id( 56 | KGNodeTypes.file, file_data, file_hash, file_path, return_hash 57 | ) 58 | -------------------------------------------------------------------------------- /docker/install_llvmlite.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # install modified llvmlite 3 | # Tested on Ubuntu 22.04 4 | 5 | cp llvmlite-settypename.patch llvm-gcc-fix.patch /tmp/ 6 | 7 | apt-get update && \ 8 | DEBIAN_FRONTEND=noninteractive apt-get install -y \ 9 | python3 python3-pip python-is-python3 \ 10 | git g++ make cmake vim unzip libcurl4-openssl-dev wget ;\ 11 | 12 | TMP=/tmp/tmp_src 13 | mkdir -p $TMP 14 | wget -O $TMP/llvm.tar.gz https://github.com/llvm/llvm-project/archive/llvmorg-8.0.1.tar.gz ;\ 15 | cd $TMP && tar xf $TMP/llvm.tar.gz; \ 16 | cd $TMP && git clone https://github.com/numba/llvmlite.git && \ 17 | cd llvmlite && \ 18 | git checkout aa11b129c0b55973067422397821ae6d44fa5e70 && \ 19 | git apply --whitespace=nowarn /tmp/llvmlite-settypename.patch && \ 20 | mv $TMP/llvmlite/conda-recipes/twine_cfg_undefined_behavior.patch $TMP/llvmlite/conda-recipes/twine_cfg_undefined_behavior.patch.bak;\ 21 | cd $TMP/llvm-project-llvmorg-8.0.1/llvm && \ 22 | for f in $TMP/llvmlite/conda-recipes/*.patch; do patch -fN -p1 -i $f; done ;\ 23 | cd $TMP/llvm-project-llvmorg-8.0.1/ && \ 24 | patch -fN -p1 -i /tmp/llvm-gcc-fix.patch ;\ 25 | 26 | # fix recipes -------- 27 | LLVMLITESRC=$TMP/llvmlite 28 | 29 | BUILD=$LLVMLITESRC/conda-recipes/llvmdev/build.sh 30 | 31 | if grep -q '^RECIPE_DIR' $BUILD; then 32 | true; 33 | else 34 | ex $BUILD </dev/null 21 | docker rm $(image-ui) &>/dev/null 22 | docker image rm $(image) &>/dev/null 23 | docker image rm $(image-ui) &>/dev/null 24 | 25 | start_local : 26 | mkdir -p $(shell echo ~)/.cg 27 | sudo chown -R 1001:1001 $(shell echo ~)/.cg 28 | 29 | #run worker 30 | docker run --rm -d -u 1001:1001 -p 5001:5001 -v $(shell echo ~)/.cg:/home/cguser/.cg --name $(image) $(image) 31 | 32 | #run ui 33 | docker run --rm -d -p 5000:5000 --add-host host.docker.internal:host-gateway -e CG_HOST="http://host.docker.internal:5001" --name $(image-ui) $(image-ui) 34 | 35 | start_worker : 36 | mkdir -p $(shell echo ~)/.cg 37 | sudo chown -R 1001:1001 $(shell echo ~)/.cg 38 | 39 | #run worker 40 | docker run --rm -d -u 1001:1001 -p 5001:5001 -v $(shell echo ~)/.cg:/home/cguser/.cg --name $(image) ghcr.io/code-genome/cg-worker:latest 41 | 42 | start_ui : 43 | #run ui 44 | docker run --rm -d -p 5000:5000 --add-host host.docker.internal:host-gateway -e CG_HOST="http://host.docker.internal:5001" --name $(image-ui) ghcr.io/code-genome/cg-ui:latest 45 | 46 | start : start_worker start_ui 47 | 48 | stop : 49 | docker stop $(image) 50 | docker stop $(image-ui) 51 | 52 | deps : 53 | cd docker 54 | sudo bash install_all_local.sh 55 | 56 | docker-build-dev : docker/Dockerfile.dev 57 | docker build -f docker/Dockerfile.dev --build-arg HOST_UID=$(shell id -u) -t $(image-dev) . 
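# Example (illustrative, not an original comment): build the dev image, then
# open a shell inside it with the repo mounted at /cg:
#   make docker-build-dev && make dev-cli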
58 | 
59 | dev-cli :
60 | 	docker run --rm -v $(shell pwd):/cg -t -i --entrypoint /bin/bash $(image-dev)
61 | 
62 | pre-commit :
63 | 	pre-commit run --all-files
--------------------------------------------------------------------------------
/scripts/build_gkg.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import os
4 | import sys
5 | 
6 | 
7 | def main(args):
8 | 
9 |     logging.basicConfig(
10 |         filename="/tmp/build_gkg.log",
11 |         level=logging.DEBUG,
12 |         format="%(asctime)s, %(name)s, %(levelname)s, %(message)s",
13 |         datefmt="%m/%d/%Y %H:%M:%S",
14 |         force=True,
15 |     )
16 | 
17 |     log = logging.getLogger("gkg")
18 |     if args.verbose:
19 |         h = logging.StreamHandler(sys.stdout)
20 |         h.setLevel(logging.DEBUG)
21 |         log.addHandler(h)
22 | 
23 |     log.info("starting build_gkg")
24 | 
25 |     sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))
26 |     from codegenome.kg import GenomeKG
27 | 
28 |     gkg = GenomeKG()
29 |     log.info("creating GenomeKG from %s" % (args.input_dir))
30 |     gkg.create(args.input_dir)
31 |     log.info("OK: GenomeKG created.")
32 |     if args.compute_tree:
33 |         log.info(
34 |             "computing BallTree.. using distance metric %s" % (args.distance_metric)
35 |         )
36 |         gkg.compute_tree(metric=args.distance_metric)
37 |         log.info("OK: BallTree computed.")
38 |     log.info("saving GenomeKG..")
39 |     r = gkg.save(args.output_file)
40 |     log.info("OK: GenomeKG saved to %s" % (r))
41 | 
42 | 
43 | if __name__ == "__main__":
44 |     ap = argparse.ArgumentParser()
45 |     ap.add_argument("-v", "--verbose", default=False, action="store_true")
46 |     ap.add_argument(
47 |         "-c",
48 |         "--compute_tree",
49 |         default=True,
50 |         action="store_true",
51 |         help="Compute balltree.",
52 |     )
53 |     ap.add_argument(
54 |         "--distance_metric",
55 |         default="minkowski",
56 |         help="Distance metric for computing the balltree.",
57 |     )
58 |     ap.add_argument(
59 |         "-o",
60 |         "--output_file",
61 |         default=None,
62 |         help="Optional output GenomeKG file path. Defaults to {input_dir}.gkg.",
63 |     )
64 | 
65 |     ap.add_argument("input_dir")
66 | 
67 |     args = ap.parse_args()
68 | 
69 |     exit(main(args))
--------------------------------------------------------------------------------
/utils/app/app/api/add.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import json
3 | import logging
4 | import os
5 | import tempfile
6 | import traceback
7 | 
8 | from flask import request
9 | from flask_restx import Resource, fields
10 | from werkzeug.datastructures import FileStorage
11 | 
12 | from ..core.genome_service import (API_STATE_EMPTY_RESULT, API_STATE_ERROR,
13 |                                    API_STATE_RESULT_NOT_READY)
14 | from ..defaults import *
15 | from ..main import kgs
16 | from .api import api
17 | 
18 | logger = logging.getLogger("codegenome.rest")
19 | ns = api.namespace("api/v1/add", description="Add to KG.")
20 | 
21 | upload_parser = api.parser()
22 | upload_parser.add_argument("file", location="files", type=FileStorage, required=True)
23 | 
24 | 
25 | @ns.route("/file")
26 | @ns.response(200, "Final result")
27 | @ns.response(
28 |     202, "Request received. Result not ready. Must check using `status/job/`."
29 | )
30 | @ns.response(204, "Result empty")
31 | @ns.response(404, "Submission id not found")
32 | @ns.expect(upload_parser)
33 | class Add(Resource):
34 |     def post(self):
35 |         args = upload_parser.parse_args(request)
36 |         uploaded_file = args["file"]  # This is a FileStorage instance
37 |         # We can get the filename, stream, mimetype, etc. from it
38 |         logger.info("Received a file %s" % uploaded_file)
39 |         try:
40 |             if not os.path.exists(TMP_UPLOAD_DIR):
41 |                 os.makedirs(TMP_UPLOAD_DIR)
42 | 
43 |             tmpdir = tempfile.mkdtemp(prefix=TMP_DIR_PREFIX, dir=TMP_UPLOAD_DIR)
44 |             tmpfn = os.path.join(tmpdir, os.path.basename(uploaded_file.filename))
45 |             uploaded_file.save(tmpfn)
46 | 
47 |             ret = kgs.api_add_file(tmpfn)
48 |             if ret.get("status") == API_STATE_RESULT_NOT_READY:
49 |                 return ret, 202
50 |             elif ret.get("status") == API_STATE_EMPTY_RESULT:
51 |                 if ret.get("query") is None:  # root search node not found.
52 |                     return ret, 206
53 |             elif ret.get("status") == API_STATE_ERROR:
54 |                 return ret, 404
55 |             return ret
56 |         except Exception as e:
57 |             api.abort(404, f"Exception: {e}")
--------------------------------------------------------------------------------
/tests/test_lifters.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import json
3 | import logging
4 | import os
5 | import sys
6 | import time
7 | import unittest
8 | 
9 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../"))
10 | from codegenome.lifters.retdec import CGRetdec  # noqa
11 | 
12 | os.environ.setdefault("RETDEC_PATH", "/opt/cg/retdec/")
13 | 
14 | TEST_D = "/tmp/cg_lifter_test"
15 | TEST_FN = "p/p.c"
16 | FUNC = "f1"
17 | 
18 | LOG_FN = os.path.join(TEST_D, "cg-test.log")
19 | 
20 | GENE_PATH = os.path.join(TEST_D, "sigmal")
21 | GKG_PATH = GENE_PATH + ".gkg"
22 | bin_id = None  # populated by test_compile()
23 | 
24 | test_gene_id = None  # populated by test_bingene()
25 | 
26 | FN = os.path.splitext(os.path.basename(TEST_FN))[0]
27 | DEST_FN = os.path.join(TEST_D, FN)
28 | 
29 | 
30 | def prepare():
31 |     os.system("rm -rf " + TEST_D)
32 |     os.system("mkdir -p " + TEST_D)
33 |     logging.basicConfig(
34 |         filename=LOG_FN,
35 |         level=logging.DEBUG,
36 |         format="%(asctime)s, %(name)s, %(levelname)s, %(message)s",
37 |         datefmt="%m/%d/%Y %H:%M:%S",
38 |         force=True,
39 |     )
40 | 
41 | 
42 | def clear():
43 |     print(f"clearing {TEST_D}")
44 |     # os.system('rm -rf '+TEST_D)
45 | 
46 | 
47 | class TestLifter(unittest.TestCase):
48 |     @classmethod
49 |     def setUpClass(cls):
50 |         prepare()
51 | 
52 |     @classmethod
53 |     def tearDownClass(cls):
54 |         clear()
55 | 
56 |     # test methods run in sorted name order, hence the numeric prefixes
57 | 58 | def test_01_to_bc(self): 59 | global bin_id 60 | cmd = "clang -O0 -o %s %s" % (DEST_FN, TEST_FN) 61 | os.system(cmd) 62 | self.assertTrue(os.path.exists(DEST_FN)) 63 | with open(DEST_FN, "rb") as f: 64 | bin_id = hashlib.sha256(f.read()).hexdigest() 65 | 66 | retdec = CGRetdec() 67 | 68 | retdec.process_file(DEST_FN) 69 | 70 | out_fn = os.path.join(TEST_D, FN + ".bc") 71 | self.assertTrue(os.path.exists(out_fn)) 72 | 73 | out_dir = os.path.join(TEST_D, "tmp") 74 | os.makedirs(out_dir) 75 | 76 | retdec.process_file(DEST_FN, output_dir=out_dir, output_fname=bin_id) 77 | 78 | out_fn = os.path.join(out_dir, bin_id + ".bc") 79 | self.assertTrue(os.path.exists(out_fn)) 80 | 81 | 82 | if __name__ == "__main__": 83 | unittest.main(verbosity=2) 84 | -------------------------------------------------------------------------------- /codegenome/_defaults.py: -------------------------------------------------------------------------------- 1 | """ 2 | Global default values 3 | """ 4 | import os 5 | import dotenv 6 | import logging 7 | 8 | #not configurable defaults 9 | UNIVERSAL_FUNC_NAME = "_F" 10 | KNOWN_CALCULATION_METHODS = ["jaccard_distance", "jaccard_distance_w", "all"] 11 | VALID_OUTPUT_DETAILS = ["simple", "complete"] 12 | 13 | logger = logging.getLogger("cg.defaults") 14 | dotenv.load_dotenv() 15 | 16 | CG_DATA_ROOT_DIR = os.path.expanduser(os.environ.get('CG_DATA_ROOT_DIR',"~/.cg")) 17 | CG_CACHE_DIR = os.path.expanduser(os.environ.get('CG_CACHE_DIR', os.path.join(CG_DATA_ROOT_DIR, 'cache'))) 18 | 19 | if not os.path.exists(CG_DATA_ROOT_DIR): 20 | os.makedirs(CG_DATA_ROOT_DIR) 21 | 22 | if not os.path.exists(CG_CACHE_DIR): 23 | os.makedirs(CG_CACHE_DIR) 24 | 25 | DEFAULT_GENE_VERSION = os.environ.get("DEFAULT_GENE_VERSION", "genes_v0_0_1") 26 | DEFAULT_EXEC_GENE_VERSION = os.environ.get("DEFAULT_EXEC_GENE_VERSION", DEFAULT_GENE_VERSION) 27 | 28 | DEFAULT_CALCULATION_METHOD = os.environ.get("DEFAULT_CALCULATION_METHOD", "jaccard_distance_w") 29 | if DEFAULT_CALCULATION_METHOD not in KNOWN_CALCULATION_METHODS: 30 | logger.error(f"Invalid DEFAULT_CALCULATION_METHOD={DEFAULT_CALCULATION_METHOD}") 31 | DEFAULT_CALCULATION_METHOD = "jaccard_distance_w" 32 | 33 | # function compare 34 | # minimum canonicalized function size (canon_bc_size). Smaller than this size will be 35 | # skipped during comparison. 36 | # Ref: 928 is the bc size of the following code 37 | # 'source_filename = ""\n\ndeclare i64 @gf1() local_unnamed_addr\n\ndefine i64 @_F() local_unnamed_addr {\nb1:\n %v1 = tail call i64 @gf1()\n ret i64 %v1\n}\n' 38 | # 39 | MIN_GENE_SIZE_FILE_COMPARE = int(os.environ.get( "MIN_GENE_SIZE_FILE_COMPARE" ,1000)) 40 | 41 | # max genes allowed per file during comparison. 
42 | MAX_GENES_PER_FILE_COMPARE = int(os.environ.get("MAX_GENES_PER_FILE_COMPARE", 50000))
43 | 
44 | # during a pairwise file compare, similarity greater than or equal to this threshold is considered a match `~`
45 | FILE_COMPARE_FUNC_MATCH_SIM_THRESHOLD = float(os.environ.get("FILE_COMPARE_FUNC_MATCH_SIM_THRESHOLD", 0.99))
46 | 
47 | # for the same function names, similarity greater than or equal to this threshold is considered a mismatch `!`,
48 | # smaller is considered a delete `-`
49 | FILE_COMPARE_FUNC_MISMATCH_SIM_THRESHOLD = float(os.environ.get("FILE_COMPARE_FUNC_MISMATCH_SIM_THRESHOLD", 0.80))
--------------------------------------------------------------------------------
/tests/p/p.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | 
4 | int g=1;
5 | int* gp = &g;
6 | int vec[]= {1,2,3};
7 | int* intptr;
8 | 
9 | int gf1(int a){
10 |     int tmp = a;
11 |     tmp = a + *gp;
12 |     intptr = &g;
13 |     return tmp;
14 | }
15 | 
16 | int gf2(int a)
17 | {
18 |     int tmp;
19 |     tmp = a + vec[0];
20 |     return tmp;
21 | }
22 | 
23 | int f1(int a){
24 |     int x;
25 |     x = a + 32;
26 |     return x;
27 | }
28 | 
29 | int f2(int a){
30 |     int local=31;
31 |     local +=1;
32 |     local = a + local;
33 |     return local;
34 | }
35 | 
36 | int f3(int a){
37 |     int local = a;
38 |     a = local;
39 |     local = 30;
40 |     a = a + local;
41 |     a+=2;
42 |     __asm__("xor %eax, %eax");
43 |     __asm__("xor %eax, %eax");
44 |     __asm__("xor %eax, %eax");
45 |     return a;
46 | }
47 | 
48 | 
49 | int f4(int a){
50 |     int l; int m; l = 30;
51 |     m = l - 10;
52 |     if (a>10){
53 |         a = a+l;
54 |         a+=2;
55 |         return a;
56 |     }
57 |     else{
58 |         __asm__("xor %eax, %eax");
59 |         return a+m+12;
60 |     }
61 | }
62 | 
63 | int f5(int a){
64 |     int l; int m; l = 30; m = 20;
65 |     if (a>100){
66 |         l = l + a;
67 |         if(a>501){
68 |             int tmp = 30-a;tmp = a+tmp;
69 |             a = a + tmp+2;
70 |             return a;
71 |         }
72 |         a = l + 2; l = m = a;
73 |         return l;
74 |     }
75 |     else if(a>10)
76 |     {
77 |         int tmp = a; a +=a; a = a - tmp;
78 |         a = a + 32; return a;
79 |     }else {
80 |         int tmp = 30-a; tmp = tmp + a;
81 |         tmp = tmp +2; a = a + tmp;
82 |         return a;
83 |     }
84 | }
85 | 
86 | int f5_(int a){
87 |     int l;
88 |     int m;
89 |     l = 30;
90 |     m = 20;
91 |     if (a>100){
92 |         l = l + a;
93 |         if(a>501){
94 |             int tmp = 30-a;
95 |             tmp = a+tmp;
96 |             a = a + tmp+2;
97 |             return a;
98 |         }
99 |         a = l + 2;
100 |         l = m = a;
101 |         return l;
102 |     }
103 |     else if(a>10)
104 |     {
105 |         int tmp = a;
106 |         a +=a;
107 |         a = a - tmp;
108 |         a = a + 32;
109 |         return a;
110 |     }else {
111 |         int tmp = 30-a;
112 |         tmp = tmp + a;
113 |         tmp = tmp +2;
114 |         a = a + tmp;
115 |         return a;
116 |     }
117 | }
118 | 
119 | int main(int argc, char* argv[])
120 | {
121 |     int a = atoi(argv[1]);
122 |     printf("%d\n%d\n%d",f1(a), f2(a),f3(a));
123 |     a = f4(0);
124 |     a = f5(0);
125 |     a = gf1(0);
126 |     a = gf2(0);
127 |     return 0;
128 | }
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing Guide
2 | 
3 | ## Setting up the project
4 | 
5 | Build is currently supported only on Debian-based distributions (e.g. Ubuntu).
6 | 
7 | To clone this repo, run
8 | 
9 | ```
10 | git clone https://github.com/code-genome/codegenome.git
11 | cd codegenome
12 | git submodule update --init --recursive
13 | ```
14 | 
15 | Create a virtual environment.
16 | 
17 | ```
18 | python -m venv .venv
19 | . .venv/bin/activate
20 | ```
21 | 
22 | Install dependencies.
23 | 
24 | ```
25 | make deps
26 | ```
27 | 
28 | Install requirements.
29 | 
30 | ```
31 | pip install -r requirements.txt
32 | ```
33 | 
34 | Test run the CLI tool.
35 | 
36 | ```
37 | python scripts/cg genediff /bin/chmod /bin/chown
38 | ```
39 | 
40 | 
41 | ## Running pre-commit before committing
42 | 
43 | First, install the pre-commit hooks:
44 | 
45 | ```bash
46 | pip install pre-commit
47 | pre-commit install
48 | ```
49 | 
50 | To run pre-commit before committing:
51 | 
52 | ```bash
53 | pre-commit run --all-files
54 | ```
55 | 
56 | Or simply run:
57 | 
58 | ```bash
59 | make pre-commit
60 | ```
61 | 
62 | This will run the pre-commit hooks on all files.
63 | 
64 | The pre-commit hooks will:
65 | 1. Check for any linting errors
66 | 2. Check for any formatting errors
67 | 3. Check for any security vulnerabilities
68 | 4. Check for spelling errors
69 | 5. Verify you used relative imports inside the src/ directory
70 | 6. Verify you used library imports outside the src/ directory
71 | 
72 | ## Running Tests
73 | 
74 | 
75 | ```
76 | cd tests
77 | python unit_tests.py
78 | ```
79 | 
80 | # Repo principles:
81 | 
82 | ## Git
83 | 
84 | ## Legal
85 | 
86 | We have tried to make it as easy as possible to make contributions. This applies to how we handle the legal aspects of contribution. We use the same approach - the Developer's Certificate of Origin 1.1 (DCO) - that the Linux® Kernel community uses to manage code contributions.
87 | 
88 | We simply ask that when submitting a patch for review, the developer must include a sign-off statement in the commit message.
89 | 
90 | Here is an example Signed-off-by line, which indicates that the submitter accepts the DCO:
91 | 
92 | Signed-off-by: John Doe <john.doe@example.com>
93 | 
94 | You can include this automatically when you commit a change to your local git repository using the following command:
95 | 
96 | git commit -s
97 | 
98 | ### Commit
99 | Always commit with a [good commit message](https://cbea.ms/git-commit/) and sign off:
100 | 
101 | Example:
102 | 
103 | ```bash
104 | git commit -s
105 | ```
106 | 
107 | ### Push
108 | Push into a new branch and open a PR.
109 | 
110 | Example:
111 | 
112 | ```bash
113 | git push origin main:<new-branch-name>
114 | ```
--------------------------------------------------------------------------------
/utils/app/app/api/compare.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import traceback
3 | 
4 | from flask_restx import Resource, fields
5 | 
6 | from ..core.genome_service import (API_STATE_EMPTY_RESULT,
7 |                                    API_STATE_RESULT_NOT_READY)
8 | from ..defaults import *
9 | from ..main import kgs
10 | from .api import api, check_event_loop
11 | 
12 | logger = logging.getLogger("codegenome.rest")
13 | 
14 | ns = api.namespace("api/v1/compare", description="Compare binaries.")
15 | 
16 | compare_file_id_args = api.model(
17 |     "compare_file_id_args",
18 |     {
19 |         "id1": fields.String(
20 |             required=True, description="The file identifier (file sha256 hash)"
21 |         ),
22 |         "id2": fields.String(
23 |             required=True, description="The file identifier (file sha256 hash)"
24 |         ),
25 |         "method": fields.String(
26 |             required=False,
27 |             default=DEFAULT_COMPARE_METHOD,
28 |             description="Internal query method to be used.
\ 29 | Currently supported values: [`gene_v0`, `genes_v1_3_0`, `genes_v1_3_0.jaccard_distance`, `genes_v1_3_0.jaccard_distance_w`,\ 30 | `genes_v1_3_0.composition_ratio`, `genes_v1_3_0.composition_ratio_w`,\ 31 | `genes_v1_3_0.containment_ratio`]", 32 | ), 33 | "output_detail": fields.String( 34 | required=False, 35 | default=DEFAULT_OUTPUT_DETAIL, 36 | description="Output format. \ 37 | Supported values: ['simple','complete']", 38 | ), 39 | }, 40 | ) 41 | 42 | 43 | @ns.route("/files/by_file_ids") 44 | @ns.response(200, "Final result") 45 | @ns.response(202, "Request received. Result not ready. Must retry.") 46 | @ns.response(204, "Result empty") 47 | @ns.response(404, "File id not found") 48 | class KGCompareFileIDs(Resource): 49 | """Compare binaries using genes.""" 50 | 51 | @ns.expect(compare_file_id_args) 52 | def post(self): 53 | """Compare binaries using genes.""" 54 | args = api.payload 55 | check_event_loop() 56 | try: 57 | ret = kgs.api_files_compare_kg( 58 | file_id1=args["id1"], 59 | file_id2=args["id2"], 60 | method=args.get("method", DEFAULT_COMPARE_METHOD), 61 | output_detail=args.get("output_detail", DEFAULT_OUTPUT_DETAIL), 62 | ) 63 | if ret.get("status") == API_STATE_RESULT_NOT_READY: 64 | return ret, 202 65 | elif ret.get("status") == API_STATE_EMPTY_RESULT: 66 | if ret.get("query") is None: # root search node not found. 67 | return ret, 404 68 | else: 69 | return ret, 204 70 | 71 | return ret 72 | except Exception as e: 73 | api.abort(405, f"Exception: {e}") 74 | 75 | 76 | # TODO 77 | # @ns.route("/packages/by_package_ids") 78 | # @ns.route("/genes/by_gene_ids") 79 | -------------------------------------------------------------------------------- /codegenome/lifters/retdec.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import shutil 4 | import subprocess 5 | import tempfile 6 | import time 7 | 8 | from .base import CGLifterBase 9 | 10 | logger = logging.getLogger("codegenome.lifter.retdec") 11 | 12 | DEFAULT_RETDEC_PATH = "/opt/retdec" 13 | 14 | 15 | class CGRetdec(CGLifterBase): 16 | def __init__(self, retdec_path=None, logger=logger): 17 | self.retdec_path = ( 18 | os.environ.get("RETDEC_PATH", DEFAULT_RETDEC_PATH) 19 | if retdec_path is None 20 | else retdec_path 21 | ) 22 | self.logger = logger 23 | 24 | def process_file( 25 | self, 26 | file_path, 27 | output_dir=None, 28 | output_fname=None, 29 | retdec_logfile_path=None, 30 | retdec_path=None, 31 | keep_aux_files=False, 32 | overwrite=True, 33 | ): 34 | self.logger.debug( 35 | f"process_file. 
{file_path, output_dir, retdec_logfile_path, retdec_path}" 36 | ) 37 | final_output_path = None 38 | fn = os.path.basename(file_path) 39 | 40 | if output_dir is None: 41 | output_dir = os.path.dirname(file_path) 42 | if output_dir == "": 43 | output_dir = "./" 44 | 45 | final_output_dir = output_dir 46 | output_dir = tempfile.mkdtemp(prefix="cgtmp__", dir="/tmp/") 47 | 48 | try: 49 | 50 | if output_fname is None: 51 | output_fname = os.path.basename(file_path) 52 | else: 53 | output_fname = os.path.basename(output_fname) 54 | 55 | if retdec_path is None: 56 | retdec_path = self.retdec_path 57 | 58 | if retdec_logfile_path is None: 59 | retdec_logfile_path = os.path.join( 60 | output_dir, output_fname + ".retdec.log" 61 | ) 62 | 63 | args = [ 64 | os.path.join(retdec_path, "bin/retdec-decompiler"), 65 | "-o", 66 | os.path.join(output_dir, output_fname), 67 | file_path, 68 | ] 69 | 70 | self.logger.info(f"running {args}") 71 | 72 | t = time.time() 73 | with open(retdec_logfile_path, "w") as fout: 74 | ret = subprocess.call(args, stdout=fout, stderr=fout) 75 | 76 | if not keep_aux_files: 77 | # output debug logs 78 | with open(retdec_logfile_path, "r") as f: 79 | logger.debug(f"RETDEC_LOG:\n{f.read()}\n") 80 | 81 | if ret == 0: 82 | logger.debug( 83 | f"RETDEC_OK. Time: {time.time()-t} secs. {[file_path, '->', output_dir]}" 84 | ) 85 | 86 | else: 87 | logger.debug( 88 | f"RETDEC_ERROR. Time: {time.time()-t} secs. {[file_path, '->', output_dir]}" 89 | ) 90 | # move 91 | 92 | for fn in os.listdir(output_dir): 93 | ext = os.path.splitext(fn)[-1].lower() 94 | 95 | if not keep_aux_files: 96 | if ext != ".bc": 97 | continue 98 | if ext in [".bc", ".dsm", ".ll", ".log"]: 99 | src = os.path.join(output_dir, fn) 100 | 101 | if os.path.isfile(src): 102 | dst = os.path.join(final_output_dir, fn) 103 | if fn.endswith(".bc"): 104 | final_output_path = dst 105 | if os.path.exists(dst) and (not overwrite): 106 | continue 107 | # copy 108 | shutil.copy2(src, dst) 109 | os.remove(src) 110 | 111 | finally: 112 | shutil.rmtree(output_dir) 113 | return final_output_path 114 | -------------------------------------------------------------------------------- /tests/test_api_core.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import json 3 | import logging 4 | import os 5 | import sys 6 | import time 7 | import unittest 8 | 9 | import test_data as data 10 | 11 | sys.path.insert( 12 | 0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils/app") 13 | ) 14 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../")) 15 | from app.core.genome_service import * # noqa 16 | 17 | from codegenome._defaults import UNIVERSAL_FUNC_NAME # noqa 18 | 19 | TEST_D = "/tmp/cg_test_api" 20 | GENE_D = os.path.join("/tmp/cg_test_api", "local.kg") 21 | TEST_FN = "p/p.c" 22 | FUNC = "f1" 23 | FN = os.path.splitext(os.path.basename(TEST_FN))[0] 24 | DEST_FN = os.path.join(TEST_D, FN) 25 | 26 | LOG_FN = os.path.join(TEST_D, "cg-test-api-core.log") 27 | 28 | bin_id = None 29 | 30 | CG_CONFIG = {"cache_dir": TEST_D, "gene_dir": GENE_D, "keep_aux_files": True} 31 | 32 | 33 | def prepare(): 34 | os.system("rm -rf " + TEST_D) 35 | os.system("mkdir -p " + TEST_D) 36 | logging.basicConfig( 37 | filename=LOG_FN, 38 | level=logging.DEBUG, 39 | format="%(asctime)s, %(name)s, %(levelname)s, %(message)s", 40 | datefmt="%m/%d/%Y %H:%M:%S", 41 | force=True, 42 | ) 43 | 44 | 45 | def clear(): 46 | print(f"clearing {TEST_D}") 47 | # os.system('rm -rf 
'+TEST_D) 48 | 49 | 50 | class TestAPI(unittest.TestCase): 51 | @classmethod 52 | def setUpClass(cls): 53 | prepare() 54 | 55 | @classmethod 56 | def tearDownClass(cls): 57 | clear() 58 | 59 | def setUp(self): 60 | pass 61 | 62 | def test_01_compile(self): 63 | global bin_id 64 | import shutil 65 | 66 | dest_s = os.path.join(TEST_D, os.path.basename(TEST_FN)) 67 | shutil.copy(TEST_FN, dest_s) 68 | cmd = "clang -O0 -o %s %s" % (DEST_FN, dest_s) 69 | os.system(cmd) 70 | self.assertTrue(os.path.exists(DEST_FN)) 71 | bin_id = hashlib.sha256(open(DEST_FN, "rb").read()).hexdigest() 72 | 73 | def test_02_genome_service(self): 74 | global bin_id 75 | gs = GenomeService(CG_CONFIG) 76 | self.assertTrue(os.path.exists(TEST_D)) 77 | 78 | ret = gs.check_job("abcd") 79 | self.assertEqual(ret["status"], API_STATE_ERROR) 80 | 81 | ret = gs.api_add_file(DEST_FN) 82 | 83 | self.assertEqual(ret["status"], API_STATE_RESULT_NOT_READY) 84 | job_id = ret.get("job_id") 85 | # check immediately, should fail 86 | ret = gs.check_job(job_id) 87 | self.assertEqual(ret["status"], API_STATE_RESULT_NOT_READY) 88 | 89 | for _ in range(5): 90 | time.sleep(1) 91 | # print(_) 92 | ret = gs.check_job(job_id) 93 | if ret.get("status") == API_STATE_SUCCESS: 94 | break 95 | # print(ret) 96 | self.assertEqual(ret["status"], API_STATE_SUCCESS) 97 | self.assertEqual(ret["ret_status"], "new_file") 98 | 99 | # check job, should not exit (self clean after successful completion) 100 | ret = gs.check_job(job_id) 101 | # print(ret) 102 | # only enable in prod 103 | # self.assertEqual(ret['status'], API_STATE_ERROR) 104 | 105 | # try on existing file 106 | ret = gs.api_add_file(DEST_FN) 107 | 108 | self.assertEqual(ret["status"], API_STATE_SUCCESS) 109 | self.assertEqual(ret["ret_status"], "existing_file") 110 | 111 | ret = gs.api_files_compare_kg(bin_id, bin_id) 112 | 113 | # print(ret) 114 | self.assertEqual(ret["status"], API_STATE_SUCCESS) 115 | self.assertTrue(ret.get("query") is not None) 116 | 117 | # get ll 118 | ret = gs.api_get_ir("main", bin_id) 119 | # print(ret) 120 | self.assertEqual(ret["status"], API_STATE_SUCCESS) 121 | ret = ret.get("data") 122 | self.assertTrue("llvm_ir" in ret) 123 | self.assertTrue("gene_id" in ret) 124 | ir = ret.get("llvm_ir") 125 | # print(ir) 126 | self.assertEqual(type(ir), str) 127 | self.assertTrue(UNIVERSAL_FUNC_NAME in ir) 128 | 129 | 130 | if __name__ == "__main__": 131 | unittest.main(verbosity=2) 132 | -------------------------------------------------------------------------------- /codegenome/genes/utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import pickle 3 | import zlib 4 | 5 | import numpy as np 6 | 7 | from .._defaults import * 8 | 9 | 10 | def encode_gene(gene_data): 11 | if type(gene_data) == list: 12 | gene_data = np.array(gene_data).astype("float32").tobytes() 13 | if type(gene_data) == np.ndarray: 14 | gene_data = gene_data.astype("float32").tobytes() 15 | if type(gene_data) != bytes: 16 | raise Exception("gene data can not be converted to bytes.") 17 | return base64.b64encode(zlib.compress(gene_data)).decode("ascii") 18 | 19 | 20 | def decode_gene(data_str): 21 | return np.frombuffer(zlib.decompress(base64.b64decode(data_str)), dtype="float32") 22 | 23 | 24 | def decode_gene_by_ver(gene): 25 | # Implement version specific decoding if needed 26 | # gene.get('version') 27 | 28 | raw_gene = gene["value"] 29 | if type(raw_gene) == str: 30 | raw_gene = decode_gene(gene["value"]) 31 | return raw_gene 32 | 33 | 34 | 
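# --- Added illustrative example (not part of the original module) ---
# Round-tripping a gene through the helpers above and scoring it with the
# distance/similarity functions defined just below; the vector values are
# made up for illustration.
#
#   >>> import numpy as np
#   >>> enc = encode_gene(np.array([0.1, 0.2, 0.3], dtype="float32"))
#   >>> decode_gene(enc)
#   array([0.1, 0.2, 0.3], dtype=float32)
#   >>> gene_distance(decode_gene(enc), decode_gene(enc))  # identical genes
#   0.0
# ---------------------------------------------------------------------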
def gene_distance(raw_gene1, raw_gene2, normalized=True): 35 | x = np.linalg.norm(raw_gene1 - raw_gene2) 36 | if normalized: 37 | x /= np.sqrt(len(raw_gene1)) 38 | return x 39 | 40 | 41 | def gene_similarity_score_adjusted(sim): 42 | return np.power(float(sim), 2) # tone down similarity score 43 | 44 | 45 | def gene_similarity(raw_gene1, raw_gene2, adjusted=False, normalized=True): 46 | sim = 1.0 - gene_distance(raw_gene1, raw_gene2, normalized) 47 | if adjusted: 48 | sim = gene_similarity_score_adjusted(sim) 49 | return sim 50 | 51 | 52 | def gene_distance_by_ver(gene1, gene2, normalized=True): 53 | raw_gene1, raw_gene2 = decode_gene_by_ver(gene1), decode_gene_by_ver(gene2) 54 | x = np.linalg.norm(raw_gene1 - raw_gene2) 55 | if normalized: 56 | x /= np.sqrt(len(raw_gene1)) 57 | return x 58 | 59 | 60 | def gene_similarity_by_ver(gene1, gene2, adjusted=False, normalized=True): 61 | sim = 1.0 - gene_distance_by_ver(gene1, gene2, normalized) 62 | if adjusted: 63 | sim = gene_similarity_score_adjusted(sim) 64 | return sim 65 | 66 | 67 | class GeneIterator(object): 68 | def __init__(self, data): 69 | self.idx = 0 70 | self.data = data 71 | 72 | def __iter__(self): 73 | return self 74 | 75 | def __next__(self): 76 | self.idx += 1 77 | try: 78 | return self.__getitem__(self.idx - 1) 79 | except IndexError: 80 | self.idx = 0 81 | raise StopIteration 82 | 83 | def __getitem__(self, ii): 84 | # override according to file version 85 | return self.data[ii] 86 | 87 | 88 | class GeneFile(object): 89 | def __init__(self, data=None, file_path=None): 90 | if data is not None: 91 | self.data = pickle.loads(data) 92 | elif file_path is not None: 93 | self.data = pickle.load(open(file_path, "rb")) 94 | else: 95 | raise Exception("invalid argument.") 96 | 97 | if self.data["type"] != "gene": 98 | raise Exception(f"invalid file type {self.data['type']}") 99 | 100 | self.version = self.data["version"] 101 | 102 | if self.version == "0.3": 103 | self.init_v0_3() 104 | else: 105 | raise Exception("Unknown file version.") 106 | 107 | def init_v0_3(self): 108 | self.binid = self.data["binid"] 109 | self._genes = self.data["genes"] 110 | self._meta = self.data["file_meta"] 111 | 112 | class GeneIteratorEx(GeneIterator): 113 | def __getitem__(self, ii): 114 | cid, funcs, gene, gene_meta = self.data[ii] 115 | bc_size, file_offset = gene_meta 116 | return { 117 | "canon_bc_id": cid, 118 | "func_names": funcs, 119 | "gene": gene, 120 | "canon_bc_size": bc_size, 121 | "file_offset": file_offset, 122 | } 123 | 124 | self.genes = GeneIteratorEx(self._genes) 125 | 126 | @classmethod 127 | def load(cls, file_path): 128 | return cls(file_path=file_path) 129 | 130 | @classmethod 131 | def loads(cls, data): 132 | return cls(data=data) 133 | -------------------------------------------------------------------------------- /codegenome/ir/canon.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import json 4 | import jsonlines 5 | import logging 6 | import datetime 7 | import subprocess 8 | import hashlib 9 | 10 | DEFAULT_LLVM_PATH = '/opt/llvm' 11 | logger = logging.getLogger('codegenome.canon') 12 | 13 | class IRCanonPassBinary(object): 14 | def __init__(self, input_data, output='canon.jsonl', bin_id='', pass_file='libcanonicalization-pass.so', llvm_path=None): 15 | self.input_data = input_data 16 | self._bin_id = bin_id 17 | self.llvm_path = os.environ.get( 18 | 'LLVM_PATH', DEFAULT_LLVM_PATH) if llvm_path is None else llvm_path 19 | self.pass_file = 
os.path.join(self.llvm_path, 'lib', pass_file)
20 |         self.opt_bin = os.path.join(self.llvm_path, 'bin', 'opt')
21 |         self.output = output
22 |         self.stat = {}
23 | 
24 |     def canon_pass(self):
25 |         args = [self.opt_bin, '--load', self.pass_file, '--canonicalization', '--canon-out',
26 |                 self.output]
27 |         #print(' '.join(args))
28 |         logger.info(f'running {args}')
29 |         try:
30 |             t = time.time()
31 |             ret = subprocess.run(args, input=self.input_data, stdout=subprocess.DEVNULL,
32 |                                  stderr=subprocess.DEVNULL)
33 |             if ret.returncode == 0:
34 |                 logger.debug(
35 |                     f"CANON_PASS_OK. Time: {time.time()-t} secs. {['->', self.output]}")
36 |                 return self.output
37 | 
38 |             else:
39 |                 logger.debug(
40 |                     f"CANON_PASS_ERROR. Time: {time.time()-t} secs. {['->', self.output]}")
42 |         except Exception as ex:
43 |             logger.error(f"Exception: {ex}")
44 | 
45 |         return None
46 | 
47 |     def serialize(self, statf=None):
48 |         import llvmlite.binding as llvm  # lazy loading
49 |         fns = []
50 |         i = 0
51 |         tot = 0
52 |         err = 0
53 |         st = time.time()
54 | 
55 |         jsonl = self.canon_pass()
56 |         if jsonl is None:
57 |             return None
58 |         t1 = time.time()
59 | 
60 |         with jsonlines.open(jsonl) as reader:
61 |             for i, func in enumerate(reader):
62 |                 # keys: code, data, extern, name
63 |                 try:
64 |                     if func['extern']:
65 |                         continue
66 |                     s = time.time()
67 |                     # sort data
68 |                     data = func['data'].split('\n')
69 |                     data.sort()
70 |                     data = '\n'.join([x for x in data if x != ''])
71 | 
72 |                     m = llvm.parse_assembly(data + '\n' + func['code'])
73 |                     bc = m.as_bitcode()
74 |                     s = time.time() - s
75 |                     tot += 1
76 | 
77 |                     gid = hashlib.sha256(bc).hexdigest()
78 |                     # TODO get file_offset
79 |                     bc_size = len(bc)
80 |                     file_offset = 0
81 |                     meta = (bc_size, file_offset)
82 | 
83 |                     # format (gene_id, func_name, bitcode, meta)
84 |                     func_name = func['name']
85 |                     fns.append((gid, func_name, bc, meta))
86 | 
87 |                     if statf:
88 |                         txt = '{"type": "OK", "i": %d, "ts": "%s", "func": "%s", "time": %f, "size": %d}' % (
89 |                             i, str(datetime.datetime.now()), func_name, s, len(bc))
90 |                         statf.write(txt + '\n')
91 |                 except Exception as e:
92 |                     err += 1
93 |                     txt = '{"type": "ERR", "i": %d, "ts": "%s", "func": "%s", "e": "%s", "bin_id": "%s"}' % (
94 |                         i, str(datetime.datetime.now()), func.get('name', ''), str(e), self._bin_id)
95 |                     logger.warning(txt)
96 |                     if statf:
97 |                         statf.write(txt + '\n')
98 |                 else:
99 |                     pass
100 |         t2 = time.time()
101 |         if statf:
102 |             stat = {"type": "stat", "bin_id": self._bin_id, "total": tot,
103 |                     "errors": err, "func_count": len(fns), 'pass_time': t1-st, 'time': t2-t1}
104 |             for k, v in self.stat.items():
105 |                 stat[k] = v
106 |             statf.write(json.dumps(stat) + '\n')
107 | 
108 |         return fns
109 | -------------------------------------------------------------------------------- /utils/app/app/api/search.py: --------------------------------------------------------------------------------
1 | import logging
2 | import traceback
3 | 
4 | from flask_restx import Resource, fields
5 | 
6 | from ..core.genome_service import (API_STATE_EMPTY_RESULT, API_STATE_ERROR,
7 |                                    API_STATE_RESULT_NOT_READY)
8 | from ..defaults import *
9 | from ..main import kgs
10 | from .api import api, check_event_loop
11 | 
12 | logger = logging.getLogger("codegenome.rest")
13 | 
14 | ns = api.namespace("api/v1/search", description="Search for info")
15 | 
16 | gene_info_args = api.model(
17 |     "gene_info_args",
18 |     {
19 |         "gene_id": fields.String(
20 |             required=False,
21 |             default="",
22 |             description="The gene identifier (sha256 hash). 
If this is not passed, `file_id` and `function_name` must be passed ", 23 | ), 24 | "file_id": fields.String( 25 | required=False, 26 | default="", 27 | description="The file identifier if known (file sha256 hash)", 28 | ), 29 | "function_name": fields.String( 30 | required=False, 31 | default="", 32 | description="For searching by function name if known.", 33 | ), 34 | "include_llvm_ir": fields.Boolean( 35 | required=False, default=False, description="Include LLVM IR in output." 36 | ), 37 | "include_asm": fields.Boolean( 38 | required=False, default=False, description="Include disassembly output." 39 | ), 40 | "include_gene_value": fields.Boolean( 41 | required=False, 42 | default=False, 43 | description="Include raw gene value in output.", 44 | ), 45 | "include_function_names": fields.Boolean( 46 | required=False, 47 | default=False, 48 | description="Include all function names in output.", 49 | ), 50 | }, 51 | ) 52 | 53 | obj_info_args = api.model( 54 | "obj_info_args", 55 | { 56 | "obj_id": fields.String( 57 | required=False, default="", description="The identifier of gene or file" 58 | ), 59 | "output_detail": fields.String( 60 | required=False, 61 | default=DEFAULT_OUTPUT_DETAIL, 62 | description="Output format. \ 63 | Supported values: ['simple','complete']", 64 | ), 65 | }, 66 | ) 67 | 68 | 69 | @ns.route("/gene") 70 | @ns.response(200, "Final result") 71 | @ns.response(202, "Request received. Result not ready. Must retry.") 72 | @ns.response(404, "Object id not found") 73 | class SearchGene(Resource): 74 | """Search by id""" 75 | 76 | @ns.expect(gene_info_args) 77 | def post(self): 78 | """Search either by `gene_id` or (`file_id` and `function_name`) combination.""" 79 | try: 80 | args = api.payload 81 | ret = kgs.api_get_gene_info(**args) 82 | if ret.get("status") == API_STATE_RESULT_NOT_READY: 83 | return ret, 202 84 | elif ret.get("status") == API_STATE_EMPTY_RESULT: 85 | return ret, 404 86 | elif ret.get("status") == API_STATE_ERROR: 87 | return ret, 500 88 | 89 | return ret 90 | except Exception as e: 91 | api.abort(500, f"Exception: {e}") 92 | 93 | 94 | @ns.route("/by_id") 95 | @ns.response(200, "Final result") 96 | @ns.response(202, "Request received. Result not ready. 
Must retry.") 97 | @ns.response(404, "Object id not found") 98 | class SearchID(Resource): 99 | """Search by id""" 100 | 101 | @ns.expect(obj_info_args) 102 | def post(self): 103 | """Search either by `gene_id` or (`file_id` and `function_name`) combination.""" 104 | try: 105 | args = dict(api.payload) 106 | output = args.pop("output_detail") 107 | flag = False 108 | if output == "complete": 109 | flag = True 110 | 111 | args.update( 112 | { 113 | "include_genes": flag, 114 | "include_llvm_ir": flag, 115 | "include_asm": flag, 116 | "include_gene_value": flag, 117 | "include_function_names": flag, 118 | } 119 | ) 120 | ret = kgs.api_get_node_info(**args) 121 | if ret.get("status") == API_STATE_RESULT_NOT_READY: 122 | return ret, 202 123 | elif ret.get("status") == API_STATE_EMPTY_RESULT: 124 | return ret, 404 125 | elif ret.get("status") == API_STATE_ERROR: 126 | return ret, 500 127 | 128 | return ret 129 | except Exception as e: 130 | api.abort(500, f"Exception: {e}") 131 | -------------------------------------------------------------------------------- /scripts/bin2bc: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ -x /run/bin2bc ]; then 4 | TIMEOUT= 5 | [ -x /usr/bin/timeout ] && TIMEOUT='/usr/bin/timeout -k 1000 900' 6 | $TIMEOUT /opt/cg/retdec/bin/retdec-decompiler "$1" 7 | exit $? 8 | fi 9 | 10 | usage() 11 | { 12 | { 13 | echo "Usage: $0 " 14 | echo "" 15 | echo " --output Directory to store output files in. Defaults" 16 | echo " to the current directory." 17 | echo "" 18 | echo " --retdec Directory where retdec is installed. Defaults" 19 | echo " to /opt/retdec." 20 | echo "" 21 | echo " --config Specify the retdec-decompiler config file to" 22 | echo " use. Default is retdec-decompiler default file." 23 | echo "" 24 | echo " --force Force rebuild even if binary is older than existing" 25 | echo " byte code file." 26 | echo "" 27 | echo " --keep_filename Keep output filename same as input binary. Output" 28 | echo " maybe overwritten." 29 | echo "" 30 | echo " --verbose Be verbose about what is happening." 31 | echo "" 32 | } 1>&2 33 | exit 1 34 | } 35 | 36 | RETDECDIR=/opt/retdec 37 | OUTPUT=. 38 | CF= 39 | FORCE=0 40 | VERBOSE=0 41 | KEEP_FILENAME=0 42 | KEEP_DSM=0 43 | KEEP_LL=0 44 | 45 | while [ $# -ne 0 ] 46 | do 47 | case "$1" in 48 | -h) usage;; 49 | --help) usage;; 50 | --output) OUTPUT="$2"; shift; shift;; 51 | --retdec) RETDECDIR="$2"; shift; shift;; 52 | --config) CF="--config $2"; shift; shift;; 53 | --force) FORCE=1; shift;; 54 | --keep_filename) KEEP_FILENAME=1; shift;; 55 | --keep_dsm) KEEP_DSM=1; shift;; 56 | --keep_ll) KEEP_LL=1; shift;; 57 | --verbose) VERBOSE=1; shift;; 58 | *) break; 59 | esac 60 | done 61 | 62 | USEDOCKER=0 63 | 64 | [ -x $RETDECDIR/bin/retdec-decompiler ] && USEDOCKER=0 65 | 66 | if [ $USEDOCKER -eq 1 ]; then 67 | SIGMAL="`docker images | awk '$1 == "sigmal" {print $1}'`" 68 | [ "x$SIGMAL" = 'x' ] && { 69 | echo "$0: Unable to find retdec-decompiler installation" 1>&2 70 | echo " locally or from a sigmal docker image." 
62 | USEDOCKER=0
63 | 
64 | [ -x $RETDECDIR/bin/retdec-decompiler ] || USEDOCKER=1
65 | 
66 | if [ $USEDOCKER -eq 1 ]; then
67 |     SIGMAL="`docker images | awk '$1 == "sigmal" {print $1}'`"
68 |     [ "x$SIGMAL" = 'x' ] && {
69 |         echo "$0: Unable to find retdec-decompiler installation" 1>&2
70 |         echo "    locally or from a sigmal docker image." 1>&2
71 |         exit 1
72 |     }
73 | fi
74 | 
75 | TMPDIR=/tmp/bin2bc.$$
76 | mkdir $TMPDIR
77 | 
78 | if [ $USEDOCKER -eq 1 ]; then
79 |     BIN=`basename $0`
80 |     cp $0 $TMPDIR/ && chmod 755 $TMPDIR/$BIN
81 | else
82 |     [ "x$CF" = 'x' -a -f $RETDECDIR/share/retdec/llvmir-only.json ] && {
83 |         CF="--config $RETDECDIR/share/retdec/llvmir-only.json"
84 |     }
85 | fi
86 | 
87 | for bin
88 | do
89 |     FT=`file "$bin"`
90 |     case "$FT" in
91 |     *ELF*) ;;
92 |     *)
93 |         [ $VERBOSE -eq 1 ] && {
94 |             echo "`date` Skipping $bin; is not an ELF executable." 1>&2
95 |         }
96 |         continue;;
97 |     esac
98 | 
99 |     input="`echo \"$bin\" | tr / _ | sed -e 's/_\.\._/_/g' -e 's/^\.\._/_/' -e 's/^_//'`"
100 | 
101 |     [ $FORCE -eq 0 -a -f "$OUTPUT/$input.bc" -a "$bin" -ot "$OUTPUT/$input".bc ] && {
102 |         [ $VERBOSE -eq 1 ] && {
103 |             echo "`date` Skipping $bin; byte code is current." 1>&2
104 |         }
105 |         continue
106 |     }
107 | 
108 |     rm -f "$TMPDIR/$input"*
109 |     cp "$bin" "$TMPDIR/$input"
110 | 
111 |     LOG="$TMPDIR/retdec.log"
112 | 
113 |     if [ $USEDOCKER -eq 0 ]; then
114 |         [ $VERBOSE -eq 1 ] && {
115 |             echo "`date` Decompiling $bin to bytecode." 1>&2
116 |         }
117 |         $RETDECDIR/bin/retdec-decompiler $CF "$TMPDIR/$input" > $LOG 2>&1
118 |     else
119 |         [ $VERBOSE -eq 1 ] && {
120 |             echo "`date` Decompiling $bin to bytecode via docker." 1>&2
121 |         }
122 |         docker run --rm -tt --entrypoint /run/bin2bc -v "$TMPDIR":/run sigmal /run/"$input" > $LOG 2>&1
123 |     fi
124 | 
125 |     rm -f "$TMPDIR/$input"
126 |     rm -f "$TMPDIR/$input.config.json"
127 | 
128 |     if [ -f "$TMPDIR/$input.bc" ]; then
129 |         [ $VERBOSE -eq 1 ] && {
130 |             echo "`date` Finished decompiling $bin." 1>&2
131 |         }
132 |         output=$input
133 | 
134 |         if [ $KEEP_FILENAME -eq 1 ]; then
135 |             output="`basename \"$bin\"`"
136 |         fi
137 |         cp "$TMPDIR/$input.bc" "$OUTPUT/$output".bc
138 |         if [ $KEEP_DSM -eq 1 ]; then
139 |             cp "$TMPDIR/$input.dsm" "$OUTPUT/$output".dsm
140 |         fi
141 |         if [ $KEEP_LL -eq 1 ]; then
142 |             cp "$TMPDIR/$input.ll" "$OUTPUT/$output".ll
143 |         fi
144 | 
145 |         rm -f "$TMPDIR/$input.bc"
146 |     else
147 |         [ $VERBOSE -eq 1 ] && {
148 |             echo "`date` Error decompiling $bin." 
1>&2 149 | } 150 | tail -5 $LOG | sed -e 's/^/ /' 151 | fi 152 | rm $LOG 153 | done 154 | 155 | rm -rf $TMPDIR 156 | -------------------------------------------------------------------------------- /tests/test_kg.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import json 3 | import logging 4 | import os 5 | import shutil 6 | import sys 7 | import time 8 | import unittest 9 | 10 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../")) 11 | 12 | os.environ.setdefault("RETDEC_PATH", "/opt/cg/retdec/") 13 | 14 | from codegenome._defaults import (DEFAULT_GENE_VERSION, # noqa 15 | UNIVERSAL_FUNC_NAME) 16 | from codegenome.kg import GenomeKG # noqa 17 | 18 | TEST_D = "/tmp/cg_kg_test" 19 | TEST_FN = "p/p.c" 20 | FUNC = "f1" 21 | KG_REPO = os.path.join(TEST_D, "testkg.gkg") 22 | LOG_FN = os.path.join(TEST_D, "cg-test.log") 23 | 24 | 25 | FN = os.path.splitext(os.path.basename(TEST_FN))[0] 26 | DEST_FN = os.path.join(TEST_D, FN) 27 | DEST_FN2 = DEST_FN + "_2" 28 | 29 | 30 | def prepare(): 31 | if os.path.exists(TEST_D): 32 | shutil.rmtree(TEST_D) 33 | os.makedirs(TEST_D) 34 | logging.basicConfig( 35 | filename=LOG_FN, 36 | level=logging.DEBUG, 37 | format="%(asctime)s, %(name)s, %(levelname)s, %(message)s", 38 | datefmt="%m/%d/%Y %H:%M:%S", 39 | force=True, 40 | ) 41 | 42 | 43 | def clear(): 44 | print(f"clearing {TEST_D}") 45 | # shutil.rmtree(TEST_D) 46 | 47 | 48 | class TestKG(unittest.TestCase): 49 | @classmethod 50 | def setUpClass(cls): 51 | prepare() 52 | 53 | @classmethod 54 | def tearDownClass(cls): 55 | clear() 56 | 57 | # test order is sorted test function names! 58 | 59 | def test_01_add_file(self): 60 | global bin_id, bin_id2 61 | cmd = "clang -O0 -o %s %s" % (DEST_FN, TEST_FN) 62 | os.system(cmd) 63 | cmd = "clang -O1 -o %s %s" % (DEST_FN2, TEST_FN) 64 | os.system(cmd) 65 | self.assertTrue(os.path.exists(DEST_FN)) 66 | self.assertTrue(os.path.exists(DEST_FN2)) 67 | with open(DEST_FN, "rb") as f: 68 | bin_id = hashlib.sha256(f.read()).hexdigest() 69 | 70 | with open(DEST_FN2, "rb") as f: 71 | bin_id2 = hashlib.sha256(f.read()).hexdigest() 72 | 73 | kg = GenomeKG(KG_REPO) 74 | kg.add_file(DEST_FN, keep_aux_files=False) 75 | self.assertTrue(os.path.exists(KG_REPO)) 76 | self.assertFalse(os.path.exists(os.path.join(KG_REPO, ".auxs", bin_id + ".bc"))) 77 | self.assertFalse( 78 | os.path.exists(os.path.join(KG_REPO, ".auxs", bin_id + ".canon")) 79 | ) 80 | self.assertTrue( 81 | os.path.exists( 82 | os.path.join(KG_REPO, "genes", DEFAULT_GENE_VERSION, bin_id + ".gene") 83 | ) 84 | ) 85 | 86 | shutil.rmtree(KG_REPO) 87 | kg = GenomeKG(KG_REPO) 88 | kg.add_file(DEST_FN) 89 | self.assertTrue(os.path.exists(KG_REPO)) 90 | self.assertTrue(os.path.exists(os.path.join(KG_REPO, ".auxs", bin_id + ".bc"))) 91 | self.assertTrue( 92 | os.path.exists(os.path.join(KG_REPO, ".auxs", bin_id + ".canon")) 93 | ) 94 | self.assertTrue( 95 | os.path.exists( 96 | os.path.join(KG_REPO, "genes", DEFAULT_GENE_VERSION, bin_id + ".gene") 97 | ) 98 | ) 99 | 100 | self.assertEqual(len(kg.bins), 1) 101 | self.assertTrue(len(kg.gene_ids) >= 14) 102 | ll = kg.get_ll(kg.gene_ids[0]) 103 | # print(ll) 104 | self.assertTrue(ll is not None) 105 | 106 | bg = kg.get_bin(bin_id) 107 | self.assertTrue(bg is not None) 108 | self.assertTrue(len(bg.gene_ids) >= 14) 109 | 110 | ll = bg.get_ll(bg.gene_ids[0]) 111 | # print(ll) 112 | self.assertTrue(ll is not None) 113 | 114 | # re add 115 | t1 = time.time() 116 | kg.add_file(DEST_FN) 117 | t2 = 
time.time()
118 |         self.assertTrue(t2 - t1 < 0.1)  # should be fast with no reprocessing
119 | 
120 |         kg.add_file(DEST_FN2)
121 | 
122 |         self.assertEqual(len(kg.bins), 2)
123 |         self.assertTrue(len(kg.gene_ids) >= 20)
124 | 
125 |     def test_02_load(self):
126 |         kg = GenomeKG(KG_REPO)
127 |         self.assertEqual(len(kg.bins), 0)
128 | 
129 |         kg.load()
130 |         self.assertEqual(len(kg.bins), 2)
131 |         self.assertTrue(len(kg.gene_ids) >= 20)
132 |         ll = kg.get_ll(kg.gene_ids[0])
133 |         # print(ll)
134 |         self.assertTrue(ll is not None)
135 | 
137 | 
138 |     def test_03_bindiff_old(self):
139 |         kg = GenomeKG(KG_REPO)
140 |         kg.load()
141 |         a, b = list(kg.bins.keys())[:2]
142 |         diff = kg.bindiff_old(a, b)
143 |         self.assertTrue(diff < 0.4)
144 | 
145 |     def test_04_bindiff(self):
146 |         kg = GenomeKG(KG_REPO)
147 |         kg.load()
148 |         a, b = list(kg.bins.keys())[:2]
149 |         ret, stat = kg.bindiff(a, b)
150 |         self.assertEqual(ret.get("similarity"), 100)
151 |         self.assertEqual(len(ret.get("diff_details")), 21)
152 | 
153 |     def test_05_bc(self):
154 |         kg = GenomeKG(KG_REPO)
155 |         kg.load()
156 |         gids = kg.get_gene_ids("main")
157 |         ll = kg.get_ll(gids[0])
158 |         self.assertTrue("@gv1" in ll)
159 |         self.assertTrue("main" not in ll)
160 | 
161 |     def test_06_local_apis(self):
162 |         kg = GenomeKG(KG_REPO)
163 |         kg.load()
164 |         bg = kg.get_bin(bin_id)
165 |         gid = bg.get_gene_id("main")
166 |         gid2 = kg.get_gene_ids("main", bin_id)
167 |         gid3 = kg.get_gene_ids("main", bin_id2)
168 | 
169 |         self.assertEqual(len(gid2), 1)
170 |         self.assertEqual(len(gid3), 1)
171 |         self.assertEqual(gid, gid2[0])
172 |         self.assertNotEqual(gid, gid3[0])
173 | 
174 |         gids = kg.get_gene_ids("main")
175 |         self.assertEqual(len(gids), 2)
176 | 
177 |         ir = kg.get_ll(gid)
178 |         # print(ir)
179 |         self.assertIsInstance(ir, str)
180 |         self.assertTrue(UNIVERSAL_FUNC_NAME in ir)
181 | 
182 |         ginfo = kg.get_gene_info(gid)
183 |         # print(ginfo)
184 |         self.assertEqual("gene", ginfo.get("type"))
185 |         self.assertTrue("llvm_ir" in ginfo)
186 |         self.assertEqual(["main"], list(ginfo.get("function_names", {}).values())[0])
190 | 
191 | 
192 | if __name__ == "__main__":
193 |     unittest.main(verbosity=2)
194 | -------------------------------------------------------------------------------- /scripts/cg: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | ##
3 | ## This code is part of the Code Genome Framework.
4 | ##
5 | ## (C) Copyright IBM 2023.
6 | ##
7 | ## This code is licensed under the Apache License, Version 2.0. You may
8 | ## obtain a copy of this license in the LICENSE.txt file in the root directory
9 | ## of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
10 | ##
11 | ## Any modifications or derivative works of this code must retain this
12 | ## copyright notice, and modified files need to carry a notice indicating
13 | ## that they have been altered from the originals.
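##
## Illustrative usage (added note, not part of the original script header);
## the flags shown correspond to the argparse options defined further below,
## and the file paths are placeholders:
##
##   cg genediff -f json ./app_v1 ./app_v2
##   cg genediff --no_color --verbose ./app_v1 ./app_v2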
14 | ## 15 | import argparse 16 | import json 17 | import logging 18 | import os 19 | import shutil 20 | import subprocess 21 | import sys 22 | 23 | logging.basicConfig(level=logging.ERROR) 24 | 25 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../")) 26 | 27 | import codegenome._defaults as defaults # noqa 28 | 29 | CG_CACHE_DIR = os.path.expanduser(os.environ.get("CG_CACHE_DIR", "~/.cg/cache")) 30 | CG_DOCKER_IMAGE_NAME = os.environ.get("CG_DOCKER_IMAGE_NAME", "cg-worker") 31 | 32 | 33 | def genediff_docker(args): 34 | avars = vars(args) 35 | avars.pop("func") 36 | avars.pop("docker") 37 | cach_dir = avars.pop("cache_dir") 38 | file1, file2 = avars.pop("file1"), avars.pop("file2") 39 | file1, file2 = os.path.abspath(file1), os.path.abspath(file2) 40 | 41 | assert os.path.isfile(file1) 42 | assert os.path.isfile(file2) 43 | 44 | opts = [] 45 | for k, v in avars.items(): 46 | if v is not False: 47 | opts.append("--" + k) 48 | if v is not True: 49 | opts.append(str(v)) 50 | tmp_cache_dir = "/tmp/cache" 51 | tmp_file1 = os.path.join("/tmp/file1", os.path.basename(file1)) 52 | tmp_file2 = os.path.join("/tmp/file2", os.path.basename(file2)) 53 | 54 | cg_opts = ["--cache_dir", tmp_cache_dir] 55 | 56 | proc_args = ( 57 | [ 58 | "docker", 59 | "run", 60 | "--rm", 61 | "-v", 62 | os.path.abspath(cach_dir) + ":" + tmp_cache_dir, 63 | "-v", 64 | os.path.dirname(file1) + ":" + os.path.dirname(tmp_file1), 65 | "-v", 66 | os.path.dirname(file2) + ":" + os.path.dirname(tmp_file2), 67 | "-it", 68 | CG_DOCKER_IMAGE_NAME, 69 | ] 70 | + ["cg"] 71 | + cg_opts 72 | + ["genediff"] 73 | + opts 74 | + [tmp_file1, tmp_file2] 75 | ) 76 | 77 | proc = subprocess.Popen(proc_args, stdout=subprocess.PIPE) 78 | while True: 79 | line = proc.stdout.readline() 80 | if not line: 81 | break 82 | sys.stdout.buffer.write(line) 83 | 84 | 85 | def genediff(args): 86 | import codegenome as cg # noqa 87 | 88 | logger = logging.getLogger("codegenome") 89 | logger.setLevel(logging.ERROR) 90 | repo_path = os.path.join(args.cache_dir, "local.kg") 91 | if not os.path.exists(repo_path): 92 | os.makedirs(repo_path) 93 | 94 | if args.docker: 95 | return genediff_docker(args) 96 | 97 | kg = cg.GenomeKG(repo_path) 98 | if args.verbose: 99 | ch = logging.StreamHandler() 100 | logger.addHandler(ch) 101 | logger.setLevel(logging.WARNING) 102 | kg.logger = logger 103 | 104 | args.match_sim_thr /= 100 105 | args.mismatch_sim_thr /= 100 106 | 107 | b1 = kg.add_file(args.file1, keep_aux_files=(not args.remove_aux_files)) 108 | b2 = kg.add_file(args.file2, keep_aux_files=(not args.remove_aux_files)) 109 | ret, stat = kg.bindiff( 110 | b1, 111 | b2, 112 | match_sim_thr=args.match_sim_thr, 113 | mismatch_sim_thr=args.mismatch_sim_thr, 114 | method=args.method, 115 | output_detail=args.output_detail, 116 | ) 117 | if args.format == "json": 118 | print(json.dumps(ret)) 119 | else: 120 | print_output(ret, args.no_color) 121 | 122 | 123 | def print_output(r, no_color=False): 124 | color_code = { 125 | "=": "\033[;32m", 126 | "!": "\033[;31m", 127 | "~": "\033[;36m", 128 | "+": "\033[;35m", 129 | "-": "\033[;33m", 130 | } 131 | print(f"similarity:\t{r.get('similarity')}") 132 | try: 133 | for l in r.get("diff_details"): 134 | if not no_color: 135 | sys.stdout.write(color_code.get(l["op"], "")) 136 | 137 | print(f"{l['op']}, {l['f1']}, \t{l['f2']}, \t{l['score']}") 138 | 139 | if not no_color: 140 | sys.stdout.write("\033[0m") 141 | finally: 142 | if not no_color: 143 | sys.stdout.write("\033[0m") 144 | 145 | 146 | def 
clear_cache(args):
147 |     if os.path.isdir(args.cache_dir):
148 |         x = input(
149 |             f"Clearing Code Genome cache at {args.cache_dir}? Press [y] to continue. "
150 |         )
151 |         if x.lower().strip() == "y":
152 |             shutil.rmtree(args.cache_dir)
153 |     else:
154 |         sys.stderr.write(f"Invalid cache directory ({args.cache_dir}).\n")
155 |         exit(2)
156 | 
157 | 
158 | if __name__ == "__main__":
159 |     parser = argparse.ArgumentParser()
160 | 
161 |     parser.add_argument(
162 |         "--cache_dir",
163 |         type=str,
164 |         default=CG_CACHE_DIR,
165 |         help="Cache directory. Defaults to `~/.cg/cache`",
166 |     )
167 |     parser.add_argument(
168 |         "--clear_cache",
169 |         action="store_true",
170 |         default=False,
171 |         help="Clear the cache directory. Default dir is `~/.cg/cache`",
172 |     )
173 | 
174 |     subparsers = parser.add_subparsers(help="commands")
175 | 
176 |     diff_parser = subparsers.add_parser(
177 |         "genediff", help="Binary diff using function level genes."
178 |     )
179 |     diff_parser.add_argument(
180 |         "-v", "--verbose", action="store_true", default=False, help="Verbose output."
181 |     )
182 |     diff_parser.add_argument(
183 |         "-d", "--docker", action="store_true", default=False, help="Use docker."
184 |     )
185 |     diff_parser.add_argument(
186 |         "--remove_aux_files",
187 |         action="store_true",
188 |         default=False,
189 |         help="If enabled, removes auxiliary files to save storage. Function details such as machine code and LLVM IR will be unavailable.",
190 |     )
191 |     diff_parser.add_argument(
192 |         "-f", "--format", default="default", help="Output format. Options: default|json"
193 |     )
194 |     diff_parser.add_argument(
195 |         "--no_color",
196 |         action="store_true",
197 |         default=False,
198 |         help="No color in default output.",
199 |     )
200 |     diff_parser.add_argument(
201 |         "-gv",
202 |         "--gene_version",
203 |         type=str,
204 |         default=defaults.DEFAULT_GENE_VERSION,
205 |         help="Code Genome version.",
206 |     )
207 |     diff_parser.add_argument(
208 |         "--match_sim_thr",
209 |         type=float,
210 |         default=defaults.FILE_COMPARE_FUNC_MATCH_SIM_THRESHOLD * 100,
211 |         help="Function match similarity threshold. Greater than or equal to this threshold will be considered a match `~`.",
212 |     )
213 |     diff_parser.add_argument(
214 |         "--mismatch_sim_thr",
215 |         type=float,
216 |         default=defaults.FILE_COMPARE_FUNC_MISMATCH_SIM_THRESHOLD * 100,
217 |         help="Function mismatch similarity threshold. Greater than or equal to this threshold will be considered a mismatch `!`; smaller will be considered a delete `-`.",
218 |     )
219 |     diff_parser.add_argument(
220 |         "--output_detail",
221 |         type=str,
222 |         default=defaults.VALID_OUTPUT_DETAILS[0],
223 |         help=f"Output details. 
Valid values: {str(defaults.VALID_OUTPUT_DETAILS)}", 231 | ) 232 | 233 | diff_parser.add_argument("file1", type=str, help="First filepath") 234 | diff_parser.add_argument("file2", type=str, help="Second filepath") 235 | diff_parser.set_defaults(func=genediff) 236 | 237 | parser.set_defaults(func=lambda x: parser.print_help()) 238 | 239 | try: 240 | args = parser.parse_args() 241 | if args.clear_cache: 242 | exit(clear_cache(args)) 243 | 244 | except IOError as msg: 245 | parser.error(str(msg)) 246 | exit(2) 247 | 248 | args.func(args) 249 | -------------------------------------------------------------------------------- /codegenome/genes/sigmal.py: -------------------------------------------------------------------------------- 1 | ## 2 | ## This code is part of the Code Genome Framework. 3 | ## 4 | ## (C) Copyright IBM 2023. 5 | ## 6 | ## This code is licensed under the Apache License, Version 2.0. You may 7 | ## obtain a copy of this license in the LICENSE.txt file in the root directory 8 | ## of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. 9 | ## 10 | ## Any modifications or derivative works of this code must retain this 11 | ## copyright notice, and modified files need to carry a notice indicating 12 | ## that they have been altered from the originals. 13 | ## 14 | 15 | import array 16 | import hashlib 17 | import logging 18 | import os 19 | import sys 20 | from collections import deque 21 | from datetime import datetime 22 | from threading import Lock, Thread 23 | 24 | import numpy as np 25 | # import matplotlib.pylab as plt 26 | import scipy 27 | from PIL import Image 28 | from sklearn.neighbors import BallTree 29 | 30 | from .base import CGGeneBase 31 | 32 | logger = logging.getLogger("codegenome.gene.sigmal") 33 | 34 | MAX_SIZE_KB = 10000 35 | FEATURE_UNIT = 128 36 | FEATURE_SHAPE = (FEATURE_UNIT, FEATURE_UNIT) 37 | FEATURE_SIZE = 320 38 | COL_SIZE = 256 39 | SIZE_MAP = [ 40 | (10, 32), 41 | (30, 64), 42 | (60, 128), 43 | (100, 256), 44 | (500, 512), 45 | (1000, 768), 46 | (MAX_SIZE_KB, 1024), 47 | ] 48 | 49 | GENE_TYPE_CONFIG = { 50 | "sigmal2": {"resample": Image.Resampling.NEAREST, "weights": [0.8, 0.2]}, 51 | "sigmal2b": {"resample": Image.Resampling.BICUBIC, "weights": [0.8, 0.2]}, 52 | } 53 | 54 | 55 | def prep_data_sigmal2(bc): 56 | """ 57 | Split IR to function and auxiliary data 58 | """ 59 | import llvmlite.binding as llvm 60 | 61 | from codegenome._defaults import UNIVERSAL_FUNC_NAME 62 | 63 | obj = llvm.parse_bitcode(bc) 64 | fns = {f.name: f for f in obj.functions} 65 | func_str = str(fns[UNIVERSAL_FUNC_NAME]) 66 | 67 | struc_str = [str(x) for x in obj.struct_types] 68 | gvs_str = [str(x) for x in obj.global_variables] 69 | other_funcdef_str = [str(v) for k, v in fns.items() if k != UNIVERSAL_FUNC_NAME] 70 | 71 | aux_str = "\n".join(struc_str + gvs_str + other_funcdef_str) 72 | if aux_str == "": 73 | aux_str = " " 74 | 75 | return func_str, aux_str 76 | 77 | 78 | class SigmalGene(CGGeneBase): 79 | def from_data(self, data): 80 | return self.feats_from_binary(data) 81 | 82 | def from_bitcode(self, data, gene_type="sigmal"): 83 | """ 84 | gene_type can be sigmal|sigmal2|sigmal2b|func_only 85 | 86 | """ 87 | if gene_type == "sigmal": 88 | raw_gene = self.feats_from_binary(data) 89 | else: 90 | if gene_type in GENE_TYPE_CONFIG: 91 | func, aux = prep_data_sigmal2(data) 92 | raw_gene = self.feats_from_binary_list( 93 | [func, aux], 94 | weights=GENE_TYPE_CONFIG[gene_type]["weights"], 95 | resample=GENE_TYPE_CONFIG[gene_type]["resample"], 96 | ) 97 | 
elif gene_type == "func_only": 98 | func, aux = prep_data_sigmal2(data) 99 | raw_gene = self.feats_from_binary_list([func], weights=[1.0]) 100 | return raw_gene 101 | 102 | def feats_from_file(self, fn, only_desc=False): 103 | with open(fn, "rb") as f: 104 | fdata = f.read() 105 | md5 = hashlib.md5(fdata).hexdigest() 106 | dsize = os.path.getsize(fn) 107 | if only_desc: 108 | return md5, dsize, None 109 | else: 110 | return md5, dsize, self.feats_from_binary(fdata) 111 | return None 112 | 113 | def feats_from_buff(self, data, only_desc=False): 114 | md5 = hashlib.md5(data).hexdigest() 115 | dsize = len(data) 116 | if only_desc: 117 | return md5, dsize, None 118 | else: 119 | return md5, dsize, self.feats_from_binary(data) 120 | 121 | def binary_to_img_old(self, data): 122 | dsize = len(data) 123 | dsize_kb = dsize / 1024 124 | col_size = 32 125 | for fs, sz in SIZE_MAP: 126 | if dsize_kb < fs: 127 | col_size = sz 128 | 129 | return self.array_to_img(np.frombuffer(data, dtype="B"), col_size) 130 | 131 | def array_to_img( 132 | self, data, col_size=COL_SIZE, return_array=False, auto_resize_col_size=True 133 | ): 134 | dsize = len(data) 135 | if auto_resize_col_size: 136 | if dsize < (col_size * col_size): 137 | # resize col_size to form a square image 138 | col_size = int(np.sqrt(dsize)) 139 | 140 | rows = int(dsize / col_size) 141 | rem = dsize % col_size 142 | # print((dsize, col_size, rem)) 143 | if rem != 0: 144 | a = np.append(data, np.zeros(col_size - rem, dtype="B")).reshape( 145 | (rows + 1, col_size) 146 | ) 147 | else: 148 | a = data.reshape((rows, col_size)) 149 | 150 | if return_array: 151 | return a 152 | 153 | im = Image.fromarray(a) 154 | return im 155 | 156 | def binary_to_img( 157 | self, data, col_size=COL_SIZE, return_array=False, auto_resize_col_size=True 158 | ): 159 | return self.array_to_img( 160 | np.frombuffer(data, dtype="B"), col_size, return_array, auto_resize_col_size 161 | ) 162 | 163 | def feats_from_binary(self, data): 164 | import leargist # lazy loading 165 | 166 | im = self.binary_to_img(data) 167 | im = im.resize(FEATURE_SHAPE, resample=Image.BICUBIC) 168 | des = leargist.color_gist(im) 169 | des = des[0:FEATURE_SIZE] 170 | return des 171 | 172 | def feats_from_binary_list(self, data_list, weights, resample=Image.NEAREST): 173 | import leargist # lazy loading 174 | 175 | N = len(data_list) 176 | assert N == len(weights) 177 | assert sum(weights) == 1.0 178 | w, h = FEATURE_SHAPE 179 | shapes = [(w, int(float(x) * h)) for x in weights] 180 | # print(shapes) 181 | 182 | ims = [] 183 | for i, data in enumerate(data_list): 184 | if type(data) == str: 185 | data = bytes(data, "utf8") 186 | w, h = shapes[i] 187 | # single pixel hight img 188 | im = Image.frombytes("L", (len(data), 1), data) 189 | im = im.resize((w * h, 1), resample=resample) 190 | im = np.asarray(im).reshape((h, w)) 191 | ims.append(im) 192 | 193 | # plt.imshow(im,cmap='gray',vmin=0,vmax=255) 194 | # plt.show() 195 | 196 | im = np.vstack(ims) 197 | 198 | # plt.imshow(im,cmap='gray',vmin=0,vmax=255) 199 | # plt.show() 200 | 201 | des = leargist.bw_gist(im) 202 | des = des[0:FEATURE_SIZE] 203 | return des 204 | 205 | def show(self, img, dpi=72): 206 | if type(img) == np.ndarray: 207 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 208 | h, w, c = img.shape 209 | else: 210 | h, w = img.size 211 | 212 | fh = h / dpi 213 | fw = w / dpi 214 | 215 | if fh <= 0: 216 | fh = 1 217 | if fw <= 0: 218 | fw = 1 219 | 220 | # plt.figure(figsize=(fh, fw)) 221 | # plt.imshow(img, 'viridis') 222 | 223 | def 
dist(self, fn1, fn2): 224 | h1, l1, f1 = self.feats_from_file(fn1) 225 | h2, l2, f2 = self.feats_from_file(fn2) 226 | return np.linalg.norm(f1 - f2) 227 | 228 | def dist_buff(self, d1, d2): 229 | h1, l1, f1 = self.feats_from_buff(d1) 230 | h2, l2, f2 = self.feats_from_buff(d2) 231 | return np.linalg.norm(f1 - f2) 232 | 233 | def _debug_feats_from_file(self, fn): 234 | with open(fn, "rb") as f: 235 | data = f.read() 236 | self._debug_feats_from_buff(data, fn) 237 | 238 | def _debug_feats_from_buff(self, data, fn=""): 239 | import leargist # lazy loading 240 | 241 | im = self.binary_to_img(data) 242 | dpi = 30 243 | 244 | self.show(im, dpi) 245 | # plt.title("binary data (%d bytes)\n%s"%(len(data),os.path.basename(fn))) 246 | 247 | im = im.resize(FEATURE_SHAPE) 248 | 249 | self.show(im, dpi) 250 | # plt.title("resize (shape:%s)"%(str(FEATURE_SHAPE))) 251 | 252 | des = leargist.color_gist(im)[0:FEATURE_SIZE] 253 | im = self.array_to_img(des, 32) 254 | 255 | self.show(im, 5) 256 | # plt.title("features (len:%d)"%(FEATURE_SIZE)) 257 | # plt.imshow(im) 258 | -------------------------------------------------------------------------------- /codegenome/pipelines/retdecsigmal.py: -------------------------------------------------------------------------------- 1 | ## 2 | ## This code is part of the Code Genome Framework. 3 | ## 4 | ## (C) Copyright IBM 2023. 5 | ## 6 | ## This code is licensed under the Apache License, Version 2.0. You may 7 | ## obtain a copy of this license in the LICENSE.txt file in the root directory 8 | ## of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. 9 | ## 10 | ## Any modifications or derivative works of this code must retain this 11 | ## copyright notice, and modified files need to carry a notice indicating 12 | ## that they have been altered from the originals. 
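##
## Minimal usage sketch (added comment, not in the original file): running the
## pipeline defined below on a single binary. The input path is a placeholder;
## retdec and the canonicalization pass are assumed to be installed as in the
## docker setup elsewhere in this repo.
##
##   from codegenome.pipelines import get_pipeline_by_version
##   pipe = get_pipeline_by_version("genes_v0_0_1")
##   genes = pipe.process_file("/tmp/sample.elf", return_genes=True)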
13 | ## 14 | 15 | import hashlib 16 | import logging 17 | import os 18 | import pickle 19 | import tempfile 20 | import time 21 | import traceback 22 | 23 | from .._file_format import * 24 | from ..genes.sigmal import GENE_TYPE_CONFIG, SigmalGene, prep_data_sigmal2 25 | from ..ir import IRBinary 26 | from ..ir.canon import IRCanonPassBinary 27 | from ..lifters.retdec import CGRetdec 28 | from .base import CGPipeline 29 | 30 | DB_GENE_DIR = "genes" 31 | DB_AUX_DIR = ".auxs" 32 | DB_LOG_DIR = ".logs" 33 | DB_INDEX_NAME = "index.gkg" 34 | DEFAULT_GENE_TYPE = "sigmal2" 35 | 36 | _logger = logging.getLogger("codegenome.pipelines.RetdecSigmal") 37 | 38 | 39 | def _retdec_bin_to_ir( 40 | file_path, 41 | output_dir=None, 42 | output_fname=None, 43 | keep_aux_files=False, 44 | overwrite=True, 45 | logger=None, 46 | ): 47 | retdec = CGRetdec(logger=logger) 48 | bc_path = retdec.process_file( 49 | file_path, 50 | output_dir=output_dir, 51 | output_fname=output_fname, 52 | keep_aux_files=keep_aux_files, 53 | overwrite=overwrite, 54 | ) 55 | with open(bc_path, "rb") as f: 56 | out = f.read() 57 | if not keep_aux_files: 58 | os.remove(bc_path) 59 | return out 60 | 61 | 62 | def _ir_to_canon( 63 | ir_data, output_path=None, opt_level=None, bin_id=None, metadata=None, logger=None 64 | ): 65 | logger = _logger if logger is None else logger 66 | logger.debug(f"Creating IRBinary") 67 | irb = IRBinary(ir_data, opt_level=opt_level, bin_id=bin_id) 68 | canon = prep_canon_file(irb, metadata) 69 | 70 | if output_path: 71 | with open(output_path, "wb") as cf: 72 | pickle.dump(canon, cf, protocol=pickle.HIGHEST_PROTOCOL) 73 | 74 | return canon 75 | 76 | 77 | def _ir_to_canon_using_pass( 78 | ir_data, output_path=None, bin_id=None, metadata=None, logger=None 79 | ): 80 | logger = _logger if logger is None else logger 81 | if output_path: 82 | jsonl_output = os.path.splitext(output_path)[0] + ".canon.jsonl" 83 | else: 84 | fd, jsonl_output = tempfile.mkstemp() 85 | os.close(fd) 86 | 87 | logger.debug(f"Creating IRCanonPassBinary") 88 | irb = IRCanonPassBinary(ir_data, output=jsonl_output, bin_id=bin_id) 89 | canon = prep_canon_file(irb, metadata) 90 | 91 | if output_path: 92 | with open(output_path, "wb") as cf: 93 | pickle.dump(canon, cf, protocol=pickle.HIGHEST_PROTOCOL) 94 | return canon 95 | 96 | 97 | def _canon_to_sigmal_gene( 98 | canon, output_path=None, gene_type=DEFAULT_GENE_TYPE, logger=None 99 | ): 100 | logger = _logger if logger is None else logger 101 | logger.debug(f"Creating Sigmal gene") 102 | t = time.time() 103 | sg = SigmalGene() 104 | sg_genes = [] 105 | # find unique genes 106 | gid_funcs = {} 107 | for gid, func, bc, meta in canon["funcs"]: 108 | if gid not in gid_funcs: 109 | gid_funcs[gid] = [func] 110 | else: 111 | gid_funcs[gid].append(func) 112 | 113 | done = set() 114 | 115 | for gid, func, bc, meta in canon["funcs"]: 116 | if gid not in done: 117 | raw_gene = sg.from_bitcode(bc, gene_type) 118 | # format 119 | gene_data = (gid, gid_funcs[gid], raw_gene, meta) 120 | sg_genes.append(gene_data) 121 | done.add(gid) 122 | t = time.time() - t 123 | out = prep_gene_file(sg_genes, canon["binid"], canon["file_meta"]) 124 | logger.info("process_canon_to_gene time: %f" % (t)) 125 | 126 | if output_path: 127 | with open(output_path, "wb") as f: 128 | pickle.dump(out, f, protocol=pickle.HIGHEST_PROTOCOL) 129 | return out 130 | 131 | 132 | class RetdecSigmal(CGPipeline): 133 | def __init__(self): 134 | self.logger = logging.getLogger("codegenome.pipelines.RetdecSigmal") 135 | 136 | def process_file( 
137 |         self,
138 |         file_path,
139 |         sigmal_gene_type=DEFAULT_GENE_TYPE,
140 |         output_dir=None,
141 |         output_fname=None,
142 |         keep_aux_files=True,
143 |         overwrite=True,
144 |         bin_id=None,
145 |         logger=None,
146 |         return_genes=False,
147 |         keep_gene_file=True,
148 |     ):
149 |         metadata = get_file_meta(file_path)
150 |         if bin_id is None:
151 |             with open(file_path, "rb") as f:
152 |                 bin_id = hashlib.sha256(f.read()).hexdigest()
153 |         output_dir = os.path.dirname(file_path) if output_dir is None else output_dir
154 |         output_fname = (
155 |             os.path.basename(file_path) if output_fname is None else output_fname
156 |         )
157 | 
158 |         logger = self.logger if logger is None else logger
159 | 
160 |         logger.debug("Lifting to IR.")
161 |         try:
162 |             ir_path = (
163 |                 None
164 |                 if keep_aux_files is False
165 |                 else os.path.join(output_dir, output_fname + ".bc")
166 |             )
167 |             ir_data = _retdec_bin_to_ir(
168 |                 file_path,
169 |                 output_dir=output_dir,
170 |                 output_fname=output_fname,
171 |                 keep_aux_files=keep_aux_files,
172 |                 overwrite=overwrite,
173 |                 logger=logger,
174 |             )
175 |             if not ir_data:
176 |                 logger.error("_retdec_bin_to_ir failed.")
177 |                 return False
178 |         except Exception as ex:
179 |             logger.error(f"Exception: {ex}. {repr(traceback.format_exc())}")
180 |             return False
181 | 
182 |         logger.debug("IR to canonical IR")
183 | 
184 |         try:
185 |             canon_path = (
186 |                 None
187 |                 if keep_aux_files is False
188 |                 else os.path.join(output_dir, output_fname + ".canon")
189 |             )
190 |             canon = _ir_to_canon_using_pass(
191 |                 ir_data,
192 |                 output_path=canon_path,
193 |                 bin_id=bin_id,
194 |                 metadata=metadata,
195 |                 logger=logger,
196 |             )
197 |             if canon is None:
198 |                 logger.error("_ir_to_canon_using_pass failed.")
199 |                 return False
200 |         except Exception as ex:
201 |             logger.error(f"Exception: {ex}. {repr(traceback.format_exc())}")
202 |             return False
203 | 
204 |         logger.debug("Canonical IR to Sigmal gene")
205 | 
206 |         try:
207 |             gene_path = os.path.join(output_dir, output_fname + ".gene")
208 |             if (not keep_aux_files) and (not keep_gene_file):
209 |                 gene_path = None
210 | 
211 |             genes = _canon_to_sigmal_gene(
212 |                 canon, output_path=gene_path, gene_type=sigmal_gene_type, logger=logger
213 |             )
214 |             if genes is None:
215 |                 logger.error("_canon_to_sigmal_gene failed.")
216 |                 return False
217 |         except Exception as ex:
218 |             logger.error(f"Exception: {ex}. 
{repr(traceback.format_exc())}") 219 | return False 220 | if return_genes: 221 | return genes 222 | return True 223 | 224 | 225 | class RetdecSigmalV1(RetdecSigmal): 226 | def __init__(self): 227 | super().__init__() 228 | self.gene_version = "genes_v0_0_1" 229 | 230 | def process_file( 231 | self, 232 | file_path, 233 | sigmal_gene_type=DEFAULT_GENE_TYPE, 234 | output_dir=None, 235 | output_fname=None, 236 | keep_aux_files=True, 237 | overwrite=True, 238 | bin_id=None, 239 | logger=None, 240 | return_genes=False, 241 | keep_gene_file=True, 242 | ): 243 | 244 | return super().process_file( 245 | file_path, 246 | "sigmal2", 247 | output_dir=output_dir, 248 | output_fname=output_fname, 249 | keep_aux_files=keep_aux_files, 250 | overwrite=overwrite, 251 | bin_id=bin_id, 252 | logger=logger, 253 | return_genes=return_genes, 254 | keep_gene_file=keep_gene_file, 255 | ) 256 | -------------------------------------------------------------------------------- /docker/decompiler-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "decompParams": { 3 | "verboseOut": true, 4 | "outputFormat": "plain", 5 | "keepAllFuncs": false, 6 | "selectedDecodeOnly": false, 7 | "detectStaticCode": true, 8 | "backendDisabledOpts": "", 9 | "backendEnabledOpts": "", 10 | "backendCallInfoObtainer": "optim", 11 | "backendVarRenamer": "readable", 12 | "backendNoOpts": false, 13 | "backendEmitCfg": false, 14 | "backendEmitCg": false, 15 | "backendAggressiveOpts": false, 16 | "backendKeepAllBrackets": false, 17 | "backendKeepLibraryFuncs": false, 18 | "backendNoTimeVaryingInfo": false, 19 | "backendNoVarRenaming": false, 20 | "backendNoCompoundOperators": false, 21 | "backendNoSymbolicNames": false, 22 | "timeout": 0, 23 | "maxMemoryLimit": 0, 24 | "maxMemoryLimitHalfRam": true, 25 | "ordinalNumDirectory": "./support/ordinals/", 26 | "staticSignPaths": ["./support/generic/yara_patterns/static-code/"], 27 | "libraryTypeInfoPaths": [ 28 | "./support/generic/types/arm.json", 29 | "./support/generic/types/cstdlib.json", 30 | "./support/generic/types/linux.json", 31 | "./support/generic/types/windows.json", 32 | "./support/generic/types/windrivers.json" 33 | ], 34 | "cryptoPatternPaths": [ 35 | "./support/generic/yara_patterns/signsrch/signsrch.yarac", 36 | "./support/generic/yara_patterns/signsrch/signsrch_regex.yarac" 37 | ], 38 | "llvmPasses": [ 39 | "retdec-provider-init", 40 | "retdec-decoder", 41 | "verify", 42 | "retdec-x86-addr-spaces", 43 | "retdec-x87-fpu", 44 | "retdec-main-detection", 45 | "retdec-idioms-libgcc", 46 | "retdec-inst-opt", 47 | "retdec-cond-branch-opt", 48 | "retdec-syscalls", 49 | "retdec-stack", 50 | "retdec-constants", 51 | "retdec-param-return", 52 | "retdec-inst-opt-rda", 53 | "retdec-inst-opt", 54 | "retdec-simple-types", 55 | "retdec-write-dsm", 56 | "retdec-remove-asm-instrs", 57 | "retdec-class-hierarchy", 58 | "retdec-select-fncs", 59 | "retdec-inst-opt", 60 | "retdec-register-localization", 61 | "retdec-value-protect", 62 | "instcombine", 63 | "tbaa", 64 | "basicaa", 65 | "simplifycfg", 66 | "early-cse", 67 | "tbaa", 68 | "basicaa", 69 | "scoped-noalias", 70 | "assumption-cache-tracker", 71 | "profile-summary-info", 72 | "forceattrs", 73 | "inferattrs", 74 | "domtree", 75 | "callsite-splitting", 76 | "ipsccp", 77 | "called-value-propagation", 78 | "globalopt", 79 | "domtree", 80 | "mem2reg", 81 | "deadargelim", 82 | "domtree", 83 | "aa", 84 | "loops", 85 | "lazy-branch-prob", 86 | "lazy-block-freq", 87 | "opt-remark-emitter", 88 | 
"instcombine", 89 | "simplifycfg", 90 | "early-cse", 91 | "basiccg", 92 | "globals-aa", 93 | "prune-eh", 94 | "functionattrs", 95 | "argpromotion", 96 | "domtree", 97 | "sroa", 98 | "aa", 99 | "memoryssa", 100 | "early-cse-memssa", 101 | "speculative-execution", 102 | "aa", 103 | "lazy-value-info", 104 | "jump-threading", 105 | "correlated-propagation", 106 | "simplifycfg", 107 | "domtree", 108 | "aa", 109 | "loops", 110 | "lazy-branch-prob", 111 | "lazy-block-freq", 112 | "opt-remark-emitter", 113 | "instcombine", 114 | "libcalls-shrinkwrap", 115 | "loops", 116 | "branch-prob", 117 | "block-freq", 118 | "lazy-branch-prob", 119 | "lazy-block-freq", 120 | "opt-remark-emitter", 121 | "aa", 122 | "loops", 123 | "lazy-branch-prob", 124 | "lazy-block-freq", 125 | "opt-remark-emitter", 126 | "tailcallelim", 127 | "simplifycfg", 128 | "reassociate", 129 | "domtree", 130 | "loops", 131 | "loop-simplify", 132 | "lcssa-verification", 133 | "lcssa", 134 | "aa", 135 | "scalar-evolution", 136 | "loop-rotate", 137 | "memoryssa", 138 | "licm", 139 | "lcssa", 140 | "loop-unswitch", 141 | "simplifycfg", 142 | "domtree", 143 | "aa", 144 | "loops", 145 | "lazy-branch-prob", 146 | "lazy-block-freq", 147 | "opt-remark-emitter", 148 | "instcombine", 149 | "loop-simplifycfg", 150 | "loop-simplify", 151 | "lcssa-verification", 152 | "aa", 153 | "loop-accesses", 154 | "loop-load-elim", 155 | "lcssa", 156 | "scalar-evolution", 157 | "indvars", 158 | "loop-idiom", 159 | "loop-deletion", 160 | "loop-unroll", 161 | "mldst-motion", 162 | "phi-values", 163 | "aa", 164 | "memdep", 165 | "lazy-branch-prob", 166 | "lazy-block-freq", 167 | "opt-remark-emitter", 168 | "gvn", 169 | "phi-values", 170 | "aa", 171 | "memdep", 172 | "memcpyopt", 173 | "sccp", 174 | "demanded-bits", 175 | "bdce", 176 | "aa", 177 | "lazy-branch-prob", 178 | "lazy-block-freq", 179 | "opt-remark-emitter", 180 | "instcombine", 181 | "lazy-value-info", 182 | "jump-threading", 183 | "correlated-propagation", 184 | "aa", 185 | "phi-values", 186 | "memdep", 187 | "dse", 188 | "bdce", 189 | "aa", 190 | "memoryssa", 191 | "loops", 192 | "loop-simplify", 193 | "lcssa-verification", 194 | "lcssa", 195 | "scalar-evolution", 196 | "licm", 197 | "postdomtree", 198 | "adce", 199 | "simplifycfg", 200 | "domtree", 201 | "aa", 202 | "loops", 203 | "lazy-branch-prob", 204 | "lazy-block-freq", 205 | "opt-remark-emitter", 206 | "instcombine", 207 | "barrier", 208 | "elim-avail-extern", 209 | "basiccg", 210 | "rpo-functionattrs", 211 | "strip-dead-prototypes", 212 | "globaldce", 213 | "constmerge", 214 | "constprop", 215 | "instcombine", 216 | "instcombine", 217 | "tbaa", 218 | "basicaa", 219 | "simplifycfg", 220 | "early-cse", 221 | "tbaa", 222 | "basicaa", 223 | "globalopt", 224 | "globaldce", 225 | "basiccg", 226 | "globals-aa", 227 | "domtree", 228 | "float2int", 229 | "domtree", 230 | "mem2reg", 231 | "instcombine", 232 | "simplifycfg", 233 | "early-cse", 234 | "lazy-value-info", 235 | "jump-threading", 236 | "correlated-propagation", 237 | "simplifycfg", 238 | "instcombine", 239 | "simplifycfg", 240 | "reassociate", 241 | "loops", 242 | "loop-simplify", 243 | "lcssa-verification", 244 | "lcssa", 245 | "aa", 246 | "scalar-evolution", 247 | "loop-rotate", 248 | "licm", 249 | "lcssa", 250 | "instcombine", 251 | "loop-simplifycfg", 252 | "loop-accesses", 253 | "lazy-branch-prob", 254 | "lazy-block-freq", 255 | "opt-remark-emitter", 256 | "loop-distribute", 257 | "branch-prob", 258 | "block-freq", 259 | "scalar-evolution", 260 | "aa", 261 | "loop-accesses", 262 | 
"demanded-bits", 263 | "lazy-branch-prob", 264 | "lazy-block-freq", 265 | "opt-remark-emitter", 266 | "loop-simplify", 267 | "scalar-evolution", 268 | "aa", 269 | "loop-accesses", 270 | "lazy-branch-prob", 271 | "lazy-block-freq", 272 | "loop-load-elim", 273 | "aa", 274 | "lazy-branch-prob", 275 | "lazy-block-freq", 276 | "opt-remark-emitter", 277 | "lcssa", 278 | "indvars", 279 | "loop-idiom", 280 | "loop-deletion", 281 | "gvn", 282 | "sccp", 283 | "instcombine", 284 | "lazy-value-info", 285 | "jump-threading", 286 | "correlated-propagation", 287 | "dse", 288 | "bdce", 289 | "adce", 290 | "simplifycfg", 291 | "domtree", 292 | "loops", 293 | "scalar-evolution", 294 | "aa", 295 | "demanded-bits", 296 | "lazy-branch-prob", 297 | "lazy-block-freq", 298 | "opt-remark-emitter", 299 | "opt-remark-emitter", 300 | "instcombine", 301 | "loop-simplify", 302 | "lcssa-verification", 303 | "lcssa", 304 | "scalar-evolution", 305 | "loop-unroll", 306 | "lazy-branch-prob", 307 | "lazy-block-freq", 308 | "opt-remark-emitter", 309 | "instcombine", 310 | "loop-simplify", 311 | "lcssa-verification", 312 | "lcssa", 313 | "scalar-evolution", 314 | "licm", 315 | "lazy-branch-prob", 316 | "lazy-block-freq", 317 | "opt-remark-emitter", 318 | "transform-warning", 319 | "alignment-from-assumptions", 320 | "strip-dead-prototypes", 321 | "globaldce", 322 | "constmerge", 323 | "domtree", 324 | "constprop", 325 | "instcombine", 326 | "retdec-inst-opt", 327 | "retdec-simple-types", 328 | "retdec-stack-ptr-op-remove", 329 | "retdec-idioms", 330 | "instcombine", 331 | "retdec-inst-opt", 332 | "retdec-idioms", 333 | "retdec-remove-phi", 334 | "loops", 335 | "branch-prob", 336 | "block-freq", 337 | "loop-simplify", 338 | "lcssa-verification", 339 | "lcssa", 340 | "aa", 341 | "scalar-evolution", 342 | "block-freq", 343 | "loop-sink", 344 | "lazy-branch-prob", 345 | "lazy-block-freq", 346 | "opt-remark-emitter", 347 | "instsimplify", 348 | "div-rem-pairs", 349 | "simplifycfg", 350 | "verify", 351 | "retdec-value-protect", 352 | "retdec-write-bc" 353 | ] 354 | } 355 | } 356 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/codegenome/ir/ir.py:
--------------------------------------------------------------------------------
1 | ##
2 | ## This code is part of the Code Genome Framework.
3 | ##
4 | ## (C) Copyright IBM 2023.
5 | ##
6 | ## This code is licensed under the Apache License, Version 2.0. You may
7 | ## obtain a copy of this license in the LICENSE.txt file in the root directory
8 | ## of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
9 | ##
10 | ## Any modifications or derivative works of this code must retain this
11 | ## copyright notice, and modified files need to carry a notice indicating
12 | ## that they have been altered from the originals.
13 | ##
14 | 
15 | # Set the environment variables SG_IR_OPTIMIZE_EXTERNAL and LLVM_OPT_PATH to run an external IR optimizer.
16 | # E.g. with:
17 | # export SG_IR_OPTIMIZE_EXTERNAL=1
18 | # export LLVM_OPT_PATH=/usr/bin/opt-8
19 | 
20 | import collections
21 | import datetime
22 | import hashlib
23 | import json
24 | import logging
25 | import os
26 | import random
27 | import re
   | import string
28 | import subprocess
29 | import sys
30 | import tempfile
31 | import time
32 | 
33 | import llvmlite.binding as llvm
34 | 
35 | from .._defaults import UNIVERSAL_FUNC_NAME
36 | from .._file_format import _CANON_FILE_VERSION_
37 | 
38 | logger = logging.getLogger("codegenome.ir")
39 | 
40 | 
41 | class SigmalEx(object):
42 |     def normalize_func(self, f):
43 |         pass
44 | 
45 | 
46 | # Challenges:
47 | # - type names are immutable (they should not be part of the genome, since type names can be arbitrary; hopefully retdec names them consistently)
48 | # - recursive identifier dependencies
49 | # - registers cannot be renamed (?)
50 | 
51 | 
52 | class Function(object):
53 |     def __init__(self, obj, parent, dbg=False):
54 |         """
55 |         Object for referencing an LLVM IR function.
56 |         """
57 | 
58 |         self._obj = obj
59 |         self._parent = parent
60 |         self.name = str(obj.name)
61 | 
62 |         # explicitly preserve the order of the items
63 |         self.args = collections.OrderedDict()
64 |         self.blocks = collections.OrderedDict()
65 |         self.insts = collections.OrderedDict()
66 |         self.types = collections.OrderedDict()
67 |         self.attrs = []
68 | 
69 |         st = time.time()
70 | 
71 |         self.opnames = set()
72 |         self.gvars = set()
73 |         self.dbg = dbg
74 |         self.done_set = set()
75 | 
76 |         # recursive call!
77 |         self.add_types(obj.type)
78 | 
79 |         self._init(obj, parent, dbg)
80 | 
81 |     def _init(self, obj, parent, dbg):
82 |         """
83 |         Populate all the components of the function.
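   | 
   |         Collects, in order: function attributes, arguments, basic blocks,
   |         instructions (unnamed value-producing instructions are given a
   |         temporary "tvN" name), and all referenced types. Operand names
   |         that are not arguments, blocks, instructions, or the function
   |         itself end up in self.externs, together with referenced globals.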
84 | """ 85 | st = time.time() 86 | for a in obj.attributes: 87 | self.attrs.append(a) 88 | 89 | for a in obj.arguments: 90 | self.args[a.name] = a 91 | self.add_types(a.type) 92 | 93 | idx = 0 94 | for b in obj.blocks: 95 | self.blocks[b.name] = b 96 | 97 | for i in b.instructions: 98 | if i.name != "": 99 | self.insts[i.name] = i 100 | elif str(i.type) != "void": 101 | # not a function call 102 | i.name = "tv" + str(idx) 103 | idx += 1 104 | self.insts[i.name] = i 105 | 106 | self.add_types(i.type) 107 | 108 | # inspect instruction ops 109 | ops = [x for x in i.operands] 110 | for op in ops: 111 | if op.name != "": 112 | if not op.is_global: # TODO does not always work 113 | self.opnames.add(op.name) 114 | 115 | self.add_with_global_deps(op) 116 | 117 | self.externs = self.opnames - set( 118 | list(self.args.keys()) 119 | + list(self.blocks.keys()) 120 | + list(self.insts.keys()) 121 | + [self.name] 122 | ) 123 | self.externs = self.externs.union(self.gvars) - set([self.name]) 124 | if dbg: 125 | print(self.externs) 126 | 127 | st = time.time() - st 128 | self.stat = {"init": st} 129 | 130 | def __str__(self): 131 | raise Exception("str Function not supported.") 132 | 133 | @property 134 | def name(self): 135 | return self._obj.name 136 | 137 | @name.setter 138 | def name(self, x): 139 | self._obj.name = x 140 | 141 | def add_with_global_deps(self, obj, depth=0): 142 | """ 143 | Recursively add referenced variables. Only adds global variables or types. 144 | """ 145 | if obj not in self.done_set: 146 | self.done_set.add(obj) 147 | else: 148 | return 149 | if isinstance(obj, llvm.ValueRef): 150 | # value 151 | ids = self._parent.get_identifiers(obj) 152 | for x in ids: 153 | # only handle if it's a global variable or a type object 154 | 155 | xn = x[1:] 156 | if x[:1] == "@": 157 | # global variable id 158 | if xn not in self.gvars: 159 | self.gvars.add(xn) 160 | 161 | # try extracting additional references 162 | gv = self._parent.get_gv_by_name(xn) 163 | if gv is not None: 164 | if gv != obj: 165 | self.add_with_global_deps(gv, depth + 1) 166 | else: 167 | # function may not have been parsed yet! so pass 168 | # print("Warning: undefined global var! %s, %s (%s)"%(gv, xn, str(x))) 169 | pass 170 | 171 | else: 172 | xnt = self._parent.get_gtype_by_name(xn) 173 | if xnt is not None: 174 | # add type 175 | self.add_types(xnt) 176 | 177 | def add_types(self, tp): 178 | """ 179 | Recursively add type definitions. 180 | """ 181 | while tp.is_pointer: 182 | tp = tp.element_type 183 | 184 | ids = [x[1:] for x in self._parent.get_identifiers(tp)] 185 | for x in ids: 186 | if x not in self.types: 187 | if x in self._parent.gtypes: 188 | _tp = self._parent.gtypes[x] 189 | self.types[x] = _tp 190 | if tp != _tp: 191 | self.add_types(_tp) 192 | else: 193 | raise Exception("undefined type! " + str(x)) 194 | 195 | def get_ll(self): 196 | """ 197 | Main method for generating the final text for ll file. 
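   | 
   |         Before serializing, identifiers are canonicalized: the function
   |         itself becomes UNIVERSAL_FUNC_NAME, arguments a1, a2, ...,
   |         basic blocks b1, b2, ..., instruction values v1, v2, ..., types
   |         t1, t2, ..., and sorted external functions/globals gf1/gv1, ...,
   |         so that semantically identical functions serialize to identical
   |         text. Illustrative sketch (hypothetical input): a function
   |         "define i32 @foo(i32 %count)" would be emitted roughly as
   |         "define i32 @<universal name>(i32 %a1)". All original names are
   |         restored before returning.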
198 | """ 199 | _name = self._obj.name 200 | self._obj.name = UNIVERSAL_FUNC_NAME 201 | i = 1 202 | for a in self.args.values(): 203 | a.name = "a%d" % (i) 204 | i += 1 205 | i = 1 206 | for b in self.blocks.values(): 207 | b.name = "b%d" % (i) 208 | i += 1 209 | 210 | i = 1 211 | for b in self.insts.values(): 212 | b.name = "v%d" % (i) 213 | i += 1 214 | 215 | i = 1 216 | for b in self.types.values(): 217 | b.name = "t%d" % (i) 218 | i += 1 219 | 220 | # --- rename externs 221 | 222 | externs = [] 223 | # sort externs 224 | for ex in self.externs: 225 | obj = self._parent.get_gv_by_name(ex) 226 | if obj is not None: 227 | if isinstance(obj, Function): 228 | ln = self._parent.str_external_funcs(obj._obj) 229 | else: 230 | # global var 231 | ln = self._parent.str_globals(obj) 232 | ln = ln.split("=") 233 | ln = "=".join(ln[1:]).strip() 234 | externs.append([ln, ex, obj]) 235 | 236 | externs.sort() 237 | 238 | i = 1 239 | j = 1 240 | k = 1 241 | for _, ex, obj in externs: 242 | 243 | if obj is not None: 244 | if isinstance(obj, Function): 245 | # TODO intelligent rename. e.g.; don't rename *printf* 246 | # TODO rename function arg names as well 247 | # print "processing "+ex+str(obj._obj) 248 | # do not rename external 249 | if not obj._obj.is_declaration: 250 | obj.name = "gf%d" % i 251 | i += 1 252 | else: 253 | pass 254 | # obj.name = 'gef%d'%k 255 | # k+=1 256 | else: 257 | # global var 258 | try: 259 | obj.name = "gv%d" % j 260 | except Exception as e: 261 | print(_name, "Error ", e, obj) 262 | 263 | j += 1 264 | # externs[ex] = obj 265 | 266 | # if self.dbg: 267 | # print '-------', ex, ':', obj 268 | # print str(self._obj) 269 | # return 270 | 271 | # ------------------ 272 | # types 273 | 274 | tps = "\n".join([str(x) for x in self.types.values()]) 275 | 276 | # ----------------- 277 | # globals 278 | 279 | # append externs (hack) 280 | gefs = gfs = gvs = "" 281 | 282 | for _, ex, obj in externs: 283 | if obj is not None: 284 | if isinstance(obj, Function): 285 | ln = self._parent.str_external_funcs(obj._obj) 286 | if not obj._obj.is_declaration: 287 | gfs += "\n" + ln 288 | else: 289 | gefs += "\n" + ln 290 | else: 291 | # global var 292 | ln = self._parent.str_globals(obj) 293 | gvs += "\n" + ln 294 | 295 | # combine 296 | gvs = "\n".join([gefs, gfs, gvs]) 297 | 298 | # ------------------ 299 | # generate main code 300 | 301 | body = str(self._obj) 302 | body = self._parent.str_rm_meta(body) 303 | # ------------------ 304 | 305 | # reset externs 306 | for _, ex, obj in externs: 307 | if obj is not None: 308 | obj.name = ex 309 | 310 | for k, v in self.args.items(): 311 | v.name = k 312 | for k, v in self.blocks.items(): 313 | v.name = k 314 | for k, v in self.insts.items(): 315 | v.name = k 316 | for k, v in self.types.items(): 317 | v.name = k 318 | self._obj.name = _name 319 | 320 | return "\n".join([tps, gvs, body]) 321 | 322 | def get_bc(self): 323 | m = llvm.parse_assembly(self.get_ll()) 324 | return m.as_bitcode() 325 | 326 | 327 | class IRBinary(object): 328 | def __init__(self, data, ll=False, opt_level=3, bin_id=""): 329 | global logger 330 | self.logger = logger 331 | self._re_p = re.compile(r"[%@]\"?[-a-zA-Z$._0-9][-a-zA-Z$._0-9@]*\"?") 332 | self._re_gv = re.compile(r"^[%@]\"?[-a-zA-Z$._0-9][-a-zA-Z$._0-9@]*\"?") 333 | self._re_meta = re.compile(", *!.+\n") 334 | self._bin_id = bin_id 335 | self._opt_level = opt_level 336 | 337 | st = time.time() 338 | if ll: 339 | self._m = llvm.parse_assembly(data) 340 | else: 341 | self._m = llvm.parse_bitcode(data) 342 | st = 
time.time() - st
343 |         self.stat = {"parse": st}
344 |         self.logger.info("stat:" + json.dumps(self.stat))
345 | 
346 |         self.fs = collections.OrderedDict()
347 |         self.fs_objs = {}
348 |         self.gv = {}
349 |         self.gtypes = {}
350 |         self._ids = {}
351 | 
352 |         self._init()
353 | 
354 |     def _init(self):
355 |         st = time.time()
356 |         if self._opt_level > 0:
357 |             if os.environ.get("SG_IR_OPTIMIZE_EXTERNAL") is not None:
358 |                 self._optimize_external(self._opt_level)
359 |             else:
360 |                 self._optimize(self._opt_level)
361 |         st = time.time() - st
362 |         self.stat["optimize"] = st
363 |         self.logger.info("stat:" + json.dumps(self.stat))
364 | 
365 |         st = time.time()
366 | 
367 |         for g in self._m.global_variables:
368 |             if g.name == "":
369 |                 gid = self.get_gv_identifier(g)
370 |                 for gn in gid:
371 |                     self.gv[gn[1:]] = g
372 |             else:
373 |                 self.gv[g.name] = g
374 | 
375 |         i = 0
376 |         for tp in self._m.struct_types:
377 |             if tp.name != "":
378 |                 self.gtypes[tp.name] = tp
379 |             else:
380 |                 tp.name = "_ANON_TYPE_%d" % (i)
381 |                 i += 1
382 |                 self.gtypes[tp.name] = tp
383 | 
384 |         self.stat["globals_init"] = time.time() - st
385 |         st = time.time()
386 | 
387 |         # function llvmobj_dict
388 |         for f in self._m.functions:
389 |             self.fs_objs[f.name] = f
390 | 
391 |         self.logger.info("Creating function objects.")
392 |         for f in self._m.functions:
393 |             t1 = time.time()
394 |             self.fs[f.name] = Function(f, self)
395 |             self.logger.info(f"{f.name} took {time.time()-t1}secs.")
396 |         st = time.time() - st
397 |         self.stat["func_init"] = st
398 | 
399 |         st = time.time()
400 |         self._collision_correction(self.fs)
401 |         self._collision_correction(self.gv)
402 |         st = time.time() - st
403 |         self.stat["collision_correction"] = st
404 | 
405 |         self.logger.info("stat:" + json.dumps(self.stat))
406 | 
407 |     def _optimize_external(self, opt_level):
408 |         opt_path = os.environ.get("LLVM_OPT_PATH", "opt-8")
409 | 
410 |         tmp = tempfile.NamedTemporaryFile("w+b", delete=True)
411 |         output_filename = tmp.name + ".bc.tmp"
412 | 
413 |         try:
414 |             tmp.write(self._m.as_bitcode())
415 |             tmp.flush()
416 |             self.logger.debug(f"created a tmp bc file {tmp.name}")
417 |             args = [opt_path, f"--O{opt_level}", "-o", output_filename, tmp.name]
418 |             self.logger.debug(f"Running {' '.join(args)}")
419 |             ret = subprocess.run(args)
420 |             if ret.returncode != 0:
421 |                 err = (
422 |                     f"optimization step failed while running command: {' '.join(args)}"
423 |                 )
424 |                 raise Exception(err)
425 | 
426 |             with open(output_filename, "rb") as bcf:
   |                 self._m = llvm.parse_bitcode(bcf.read())
427 |             os.remove(output_filename)
428 |             self.logger.debug("optimization completed.")
429 |         except Exception as err:
430 |             self.logger.error(err)
431 |         finally:
432 |             tmp.close()
433 |             if os.path.exists(tmp.name):
434 |                 os.remove(tmp.name)
435 |             if os.path.exists(output_filename):
436 |                 os.remove(output_filename)
437 |         return self._m
438 | 
439 |     def _optimize(self, opt_level):
440 |         llvm.initialize()
441 |         llvm.initialize_native_target()
442 |         llvm.initialize_native_asmprinter()
443 | 
444 |         self._m.verify()
445 | 
446 |         with llvm.create_module_pass_manager() as pm:
447 |             with llvm.create_pass_manager_builder() as pmb:
448 |                 pmb.opt_level = opt_level
449 |                 pmb.populate(pm)
450 |             pm.run(self._m)
451 | 
452 |     def _collision_correction(self, d):
   |         # make sure no user symbol collides with the reserved UNIVERSAL_FUNC_NAME
453 |         if UNIVERSAL_FUNC_NAME in d:
454 |             rname = "_x_"
455 |             while rname in d:
456 |                 rname = "".join(random.sample(string.ascii_letters, 8))
457 |             tfs = d.pop(UNIVERSAL_FUNC_NAME)
458 |             tfs.name = rname
459 |             d[rname] = tfs
460 | 
461 |     def get_identifiers(self, obj):
462 |         """
463 |         Returns a list
of all object identifiers (function names,
464 |         variable names, type names, etc.) referenced by obj.
465 |         Builds the self._ids map on demand to avoid redundant processing.
466 |         """
467 |         out = self._ids.get(obj)
468 |         if out is None:
469 |             s, obj_str = self.str_external_funcs(obj, return_obj_str=True)
470 |             if s == "":
471 |                 # not a function
472 |                 s = obj_str
473 |             out = [x.replace('"', "") for x in self._re_p.findall(s)]
474 |             self._ids[obj] = out
475 |         return out
476 | 
477 |     def get_gv_identifier(self, obj):
478 |         return [x.replace('"', "") for x in self._re_gv.findall(str(obj))]
479 | 
480 |     def str_globals(self, g):
481 |         return str(g)
482 | 
483 |     def str_external_funcs(self, obj, return_obj_str=False):
484 |         """
485 |         Returns the function declaration line if obj is a function,
486 |         otherwise an empty string.
487 |         """
488 |         obj_str = str(obj)  # str(obj) is an expensive call!
489 |         out_ln = ""
490 |         for ln in obj_str.split("\n"):
491 |             if ln.startswith("declare"):
492 |                 out_ln = ln
493 |                 break
494 |             elif ln.startswith("define"):
495 |                 ln = ln.strip()
496 |                 ln = "declare" + ln[6:-1]
497 |                 out_ln = ln
498 |                 break
499 |         # ln = self.str_globals(ln)
500 |         if return_obj_str:
501 |             return out_ln, obj_str
502 |         return out_ln
503 | 
504 |     def str_rm_meta(self, s):
505 |         return self._re_meta.sub("\n", s)
506 | 
507 |     def get_gtype_by_name(self, name):
508 |         if name in self.gtypes:
509 |             return self.gtypes[name]
510 |         return None
511 | 
512 |     def get_gv_by_name(self, name):
513 |         out = self.gv.get(name)
514 |         if out is None:
515 |             out = self.fs.get(name)
516 |         return out
517 | 
518 |     def p_inst(self, x):
519 |         print(str(x).strip())
520 |         print("opcode:" + x.opcode)
521 |         print("operands:")
522 |         for op in x.operands:
523 |             print("type:%s, name:%s, value:%s" % (op.type, op.name, str(op)))
524 | 
525 |     def serialize(self, statf=None):
   |         """
   |         Serialize every defined function to bitcode. Returns a list of
   |         (gene_id, func_name, bitcode, meta) tuples, where gene_id is the
   |         sha256 hex digest of the per-function bitcode.
   |         """
526 |         fns = []
527 |         i = 0
528 |         tot = 0
529 |         err = 0
530 |         st = time.time()
531 | 
532 |         for k, v in self.fs.items():
533 |             # skip declare
534 |             if v._obj.is_declaration:
535 |                 continue
536 |             i += 1
537 |             try:
538 |                 s = time.time()
539 |                 bc = v.get_bc()
540 |                 s = time.time() - s
541 |                 tot += 1
542 | 
543 |                 gid = hashlib.sha256(bc).hexdigest()
544 |                 # TODO get file_offset
545 |                 bc_size = len(bc)
546 |                 file_offset = 0
547 |                 meta = (bc_size, file_offset)
548 | 
549 |                 # format (gene_id, func_name, bitcode, meta)
550 |                 fns.append((gid, k, bc, meta))
551 | 
552 |                 if statf:
553 |                     txt = (
554 |                         '{"type": "OK", "i": %d, "ts": "%s", "func": "%s", "time": %f, "size": %d}'
555 |                         % (i, str(datetime.datetime.now()), k, s, len(bc))
556 |                     )
557 |                     statf.write(txt + "\n")
558 |             except Exception as e:
559 |                 err += 1
560 |                 txt = (
561 |                     '{"type": "ERR", "i": %d, "ts": "%s", "func": "%s", "e": "%s", "bin_id": "%s"}'
562 |                     % (i, str(datetime.datetime.now()), k, str(e), self._bin_id)
563 |                 )
564 |                 self.logger.error(txt)
565 |                 if statf:
566 |                     statf.write(txt + "\n")
569 |         st = time.time() - st
570 |         if statf:
571 |             stat = {
572 |                 "type": "stat",
573 |                 "bin_id": self._bin_id,
574 |                 "total": tot,
575 |                 "errors": err,
576 |                 "func_count": len(self.fs),
577 |                 "time": st,
578 |             }
579 |             for k, v in self.stat.items():
580 |                 stat[k] = v
581 |             statf.write(json.dumps(stat) + "\n")
582 | 
583 |         return fns
584 | 
--------------------------------------------------------------------------------
/utils/app/app/core/genome_service.py:
--------------------------------------------------------------------------------
1 | import binascii
2 | import datetime
3 | import hashlib
4 | import json
5 | import logging
6 | import os
7 | import shutil
8 | import sys
9 | import threading
10 | import time
11 | import traceback
12 | from textwrap import indent
13 | 
14 | import numpy as np
15 | from sqlitedict import SqliteDict
16 | 
17 | import codegenome as cg
18 | import codegenome._defaults as defaults
19 | 
20 | from ..defaults import *
21 | from .schema import KGNodeID
22 | 
23 | CG_CACHE_DIR = defaults.CG_CACHE_DIR
24 | 
25 | if not os.path.exists(CG_CACHE_DIR):
26 |     os.makedirs(CG_CACHE_DIR)
27 | 
28 | CG_GENE_DIR = os.path.expanduser(
29 |     os.environ.get("CG_GENE_DIR", os.path.join(CG_CACHE_DIR, "local.kg"))
30 | )
31 | CG_DOCKER_IMAGE_NAME = os.environ.get("CG_DOCKER_IMAGE_NAME", "cg-worker")
32 | 
33 | 
34 | DEFAULT_API_CACHE_TTL_SECS = -1  # negative value: never expire
35 | # Node-age check for cache invalidation: always refresh the cache for young nodes (default: nodes less than 1 hour old).
36 | DEFAULT_API_CACHE_NODE_AGE_THRESHOLD_SECS = 60 * 60
37 | DEFAULT_RECORD_API_STATS = 1  # record API stats in the cache db
38 | DEFAULT_API_COMPUTE_TIMEOUT_SECS = 24 * 60 * 60  # 1 day
39 | DEFAULT_KEEP_AUX_FILES = 0
40 | 
41 | API_STATE_SUCCESS = "Success"
42 | API_STATE_RESULT_NOT_READY = "ResultNotReady"
43 | API_STATE_EMPTY_RESULT = "ResultEmpty"
44 | API_STATE_ERROR = "Error"
45 | 
46 | 
47 | log = logging.getLogger("codegenome.rest.kg_service")
48 | 
49 | 
50 | def crc32(obj):
51 |     return str(
52 |         binascii.crc32(json.dumps(obj, default=lambda x: str(x)).encode("utf-8"))
53 |     )
54 | 
55 | 
56 | def is_exec(obj):
57 |     # TODO implement proper test
58 |     return True
59 | 
60 | 
61 | class JobDBDict(SqliteDict):
   |     """
   |     SqliteDict wrapper that serializes writes with a lock and commits
   |     after every mutation, so job state survives process restarts.
   |     """
62 |     def __init__(self, *args, **kwargs):
63 |         self._lock = threading.Lock()
64 |         super().__init__(*args, **kwargs)
65 | 
66 |     def __setitem__(self, key, value):
67 |         with self._lock:
68 |             super().__setitem__(key, value)
69 |             super().commit()
70 | 
71 |     def __delitem__(self, key):
72 |         with self._lock:
73 |             super().__delitem__(key)
74 |             super().commit()
75 | 
76 | 
77 | class GenomeService(object):
78 |     def __init__(self, config=None):
   |         # avoid a shared mutable default argument
   |         if config is None:
   |             config = {}
79 |         self.start_ts = time.time()
80 |         self.config = config
81 |         self.config.setdefault("cache_dir", CG_CACHE_DIR)
82 |         self.config.setdefault("gene_dir", CG_GENE_DIR)
83 | 
84 |         self.config.setdefault(
85 |             "keep_aux_files",
86 |             int(os.environ.get("CG_KEEP_AUX_FILES", DEFAULT_KEEP_AUX_FILES)),
87 |         )
88 |         self._disable_index_cache = int(config.get("disable_index_cache", 0))
89 |         self.api_cache_ttl = self.config.get(
90 |             "api_cache_ttl_secs",
91 |             int(os.environ.get("API_CACHE_TTL_SECS", DEFAULT_API_CACHE_TTL_SECS)),
92 |         )
93 |         self.record_stats = self.config.get(
94 |             "api_record_stats",
95 |             int(os.environ.get("RECORD_API_STATS", DEFAULT_RECORD_API_STATS)),
96 |         )
97 |         self.api_compute_timeout = self.config.get(
98 |             "api_compute_timeout_secs",
99 |             int(
100 |                 os.environ.get(
101 |                     "API_COMPUTE_TIMEOUT_SECS", DEFAULT_API_COMPUTE_TIMEOUT_SECS
102 |                 )
103 |             ),
104 |         )
105 |         self.api_cache_node_age_threshold = self.config.get(
106 |             "api_cache_node_age_threshold",
107 |             int(
108 |                 os.environ.get(
109 |                     "API_CACHE_NODE_AGE_THRESHOLD_SECS",
110 |                     DEFAULT_API_CACHE_NODE_AGE_THRESHOLD_SECS,
111 |                 )
112 |             ),
113 |         )
114 | 
115 |         self.kg = cg.GenomeKG(db_dir=config.get("gene_dir"))
116 |         self._jobs = JobDBDict(
117 |             os.path.join(self.config.get("cache_dir"), "jobs.sqlite")
118 |         )
119 |         self._threads = {}
120 | 
121 |         self._update_status()
122 | 
123 |     def _update_status(self):
124 |         updates = {}
125 |         for k, v in self._jobs.items():
126 |             if (v.get("end_ts") is None) and (k not in self._threads):
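   |                 # no end_ts and no live thread: the job died with a
   |                 # previous process, so flag it as errored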
127 | v["status"] = "error" 128 | updates[k] = v 129 | for k, v in updates.items(): 130 | self._jobs[k] = v 131 | 132 | def status(self): 133 | incomplete = [] 134 | for k, v in self._jobs.items(): 135 | if "end_ts" not in v: 136 | incomplete.append( 137 | { 138 | "job_id": str(k), 139 | "job": v, 140 | "duration_secs": int(time.time() - v.get("start_ts")), 141 | } 142 | ) 143 | 144 | return { 145 | "start_time": str(datetime.datetime.fromtimestamp(int(self.start_ts))), 146 | "uptime_secs": int(time.time() - self.start_ts), 147 | "total_genes": len(self.kg.gene_ids), 148 | "total_binaries": len(self.kg.bins), 149 | "gene_version": self.kg.gene_version, 150 | "jobs": {"total": len(self._jobs), "incomplete": incomplete}, 151 | } 152 | 153 | def make_fileid(self, data): 154 | h = hashlib.sha256() 155 | h.update(data) 156 | return h.digest().hex() 157 | 158 | def _add_file(self, file_id, file_path, cleanup=True): 159 | log.debug(f"add_file(file_path={file_path})") 160 | qkey = ["add_file", file_path, cleanup] 161 | try: 162 | fid = self.kg.add_file( 163 | file_path=file_path, keep_aux_files=self.config.get("keep_aux_files") 164 | ) 165 | if fid is None: 166 | out = { 167 | "status": API_STATE_ERROR, 168 | "file_id": file_id, 169 | "status_msg": "File processing failed.", 170 | } 171 | elif fid != file_id: 172 | out = { 173 | "status": API_STATE_ERROR, 174 | "file_id": file_id, 175 | "status_msg": f"file id mismatch {fid} != {file_id}", 176 | } 177 | else: 178 | out = { 179 | "status": API_STATE_SUCCESS, 180 | "file_id": file_id, 181 | "ret_status": "new_file", 182 | } 183 | self._api_thread_final(file_id, qkey, out) 184 | if cleanup: 185 | tdir = os.path.dirname(file_path) 186 | if tdir.startswith(TMP_DIR_PREFIX): 187 | log.debug(f"removing directory {tdir}") 188 | shutil.rmtree(tdir) 189 | 190 | return out 191 | except Exception as err: 192 | log.error( 193 | f"Exception at add_file({file_path}). {err}. {repr(traceback.format_exc())}." 
194 | ) 195 | out = {"status": API_STATE_ERROR, "status_msg": str(err)} 196 | self._api_thread_final(file_id, qkey, out) 197 | return out 198 | 199 | def api_add_file(self, file_path): 200 | log.debug(f"api_add_file({file_path})") 201 | 202 | with open(file_path, "rb") as f: 203 | file_id = self.make_fileid(f.read()) 204 | 205 | n = self.kg.get_node(file_id) 206 | 207 | if n: 208 | return { 209 | "status": API_STATE_SUCCESS, 210 | "file_id": file_id, 211 | "ret_status": "existing_file", 212 | } 213 | 214 | qkey = ["add_file", file_path, True] 215 | 216 | ret = self._api_thread_enter(file_id, qkey, target=self._add_file) 217 | ret["file_id"] = file_id 218 | return ret 219 | 220 | def _update_output( 221 | self, fnode, results, fnode2=None, filename=True, filetypes=True 222 | ): 223 | # filename 224 | def _update_node_filename(n): 225 | if type(n) == dict: 226 | for k in ["name", "metadata.name"]: 227 | fn = n.get(k) 228 | if fn: 229 | n[k] = os.path.basename(fn) 230 | 231 | # filetypes 232 | def _update_node_filetypes(n): 233 | if type(n) == dict: 234 | ftypes = [] 235 | dels = [] 236 | for k in n.keys(): 237 | if k.startswith("filetype."): 238 | ftypes.append(k.split(".")[1]) 239 | dels.append(k) 240 | for k in dels: 241 | n.pop(k) 242 | 243 | n["filetypes"] = ftypes 244 | 245 | if filename: 246 | _update_node_filename(fnode) 247 | _update_node_filename(fnode2) 248 | if filetypes: 249 | _update_node_filetypes(fnode) 250 | _update_node_filetypes(fnode2) 251 | 252 | def _prep_output(self, output, output_detail): 253 | if output_detail == "simple": 254 | for r in output.get("results", []): 255 | if type(r) == dict: 256 | r.setdefault( 257 | "sha256", 258 | r.get("object", {}).get( 259 | "sha256", r.get("id", "").split(":")[-1] 260 | ), 261 | ) 262 | if "object" in r: 263 | r.pop("object") 264 | 265 | return output 266 | 267 | def _cleanup_jobs(self, job_id): 268 | # TODO implement 269 | pass 270 | 271 | def check_job(self, job_id): 272 | job = self._jobs.get(job_id) 273 | if job: 274 | ret = job.get("result") 275 | if ret: 276 | # Clean job 277 | # self._jobs.pop(job_id) 278 | return ret 279 | ret = { 280 | "status": API_STATE_RESULT_NOT_READY, 281 | "start_ts": job.get("start_ts"), 282 | "job_id": job_id, 283 | } 284 | file_id = job.get("file_id") 285 | if file_id: 286 | ret["file_id"] = file_id 287 | 288 | return ret 289 | 290 | return {"status": API_STATE_ERROR, "status_msg": f"Job {job_id} not found"} 291 | 292 | def del_job(self, job_id): 293 | if job_id in self._jobs: 294 | log.warning(f"Deleting job({job_id}).") 295 | self._jobs.pop(job_id) 296 | return {"status": API_STATE_SUCCESS} 297 | 298 | return {"status": API_STATE_ERROR, "status_msg": f"Job {job_id} not found"} 299 | 300 | def delete_file(self, file_id): 301 | log.warning(f"Deleting file({file_id}).") 302 | # try removing jobs 303 | dels = [] 304 | for k, v in self._jobs.items(): 305 | if file_id == v.get("result", {}).get("file_id"): 306 | dels.append(k) 307 | for k in dels: 308 | self._jobs.pop(k) 309 | 310 | ret = self.kg.delete_file(file_id=file_id) 311 | if ret: 312 | return {"status": API_STATE_SUCCESS} 313 | 314 | return { 315 | "status": API_STATE_ERROR, 316 | "status_msg": f"Error deleting file {file_id}.", 317 | } 318 | 319 | def _create_job_id(self, obj_id, qkey): 320 | if qkey[0] == "add_file": 321 | # file path will be random 322 | return crc32([obj_id]) 323 | return crc32([obj_id, qkey]) 324 | 325 | def _api_thread_enter(self, obj_id, qkey, target): 326 | if not callable(target): 327 | raise Exception(f"target 
[{target}] argument must be callable.")
328 |         try:
329 |             # check if running
330 |             job_id = self._create_job_id(obj_id, qkey)
331 |             job = self._jobs.get(job_id)
332 |             prev_out = None
333 |             if job:
334 |                 ret = job.get("result")
335 |                 if ret:
336 |                     self._cleanup_jobs(job_id)
337 |                     dt = time.time() - job["end_ts"]
338 | 
339 |                     cache_ok = False
340 |                     if self.api_cache_ttl < 0:
341 |                         # always use cache
342 |                         cache_ok = True
343 |                     elif dt < self.api_cache_ttl:
344 |                         cache_ok = True
345 |                     # always return last result if available, but trigger new update if needed
346 |                     prev_out = ret
347 |                     if type(prev_out) == dict and prev_out.get("status") in [
348 |                         API_STATE_ERROR,
349 |                         API_STATE_EMPTY_RESULT,
350 |                     ]:
351 |                         cache_ok = False
352 | 
353 |                     if cache_ok:
354 |                         log.info(f"Returning cached result for {(obj_id, qkey)}")
355 |                         return ret
356 |                 else:
357 |                     dt = time.time() - job.get("start_ts")
358 |                     # is the thread live?
359 |                     running = False
360 |                     try:
361 |                         if job_id in self._threads and self._threads[job_id].is_alive():
362 |                             running = True
363 |                     except Exception:
364 |                         pass
365 |                     if running:
366 |                         if dt < self.api_compute_timeout:
367 |                             # still computing
368 |                             return {
369 |                                 "status": API_STATE_RESULT_NOT_READY,
370 |                                 "start_ts": job["start_ts"],
371 |                                 "job_id": job_id,
372 |                             }
373 |                         else:
374 |                             # the thread cannot be killed from here; signal it
375 |                             # to stop (the thread function must cooperate)
376 |                             job["stop"] = True
377 |                             job["status"] = "stopping"
378 |                             if job_id in self._threads:
379 |                                 self._threads.pop(job_id)
380 |                     else:
381 |                         # job crashed
382 |                         job["status"] = "error"
383 | 
384 |             try:
385 |                 args = [obj_id] + qkey[1:]
386 |                 th = threading.Thread(target=target, args=args)
387 |                 th.start()
388 |                 sts = int(time.time())
389 |                 self._threads[job_id] = th
390 |                 self._jobs[job_id] = {"start_ts": sts, "status": "running"}
391 |                 if prev_out:
392 |                     return prev_out
393 |                 else:
394 |                     return {
395 |                         "status": API_STATE_RESULT_NOT_READY,
396 |                         "start_ts": sts,
397 |                         "job_id": job_id,
398 |                     }
399 |             except Exception as err:
400 |                 log.error(
401 |                     f"Exception creating job thread. {err} Using blocking call for {target.__name__}{(obj_id, qkey)}"
402 |                 )
403 |                 return target(*args)
404 | 
405 |         except Exception as err:
406 |             log.error(
407 |                 f"Exception at _api_thread_enter({(obj_id, qkey, target.__name__)}). {err}. {repr(traceback.format_exc())}"
408 |             )
   |             return {"status": API_STATE_ERROR, "status_msg": str(err)}
409 | 
410 |     def _api_thread_final(self, obj_id, qkey, out):
411 |         try:
412 |             job_id = self._create_job_id(obj_id, qkey)
413 |             job = self._jobs.get(job_id, {})
414 |             job["result"] = out
415 |             job["end_ts"] = time.time()
416 |             job["status"] = "completed"
417 |             self._jobs[job_id] = job
418 |             if job_id in self._threads:
419 |                 self._threads.pop(job_id)
420 |         except Exception as err:
421 |             log.error(
422 |                 f"Exception at _api_thread_final({(obj_id, qkey)}). {err}. {repr(traceback.format_exc())}"
423 |             )
424 | 
425 |     def api_files_compare_kg(
426 |         self,
427 |         file_id1,
428 |         file_id2,
429 |         method=DEFAULT_COMPARE_METHOD,
430 |         output_detail=DEFAULT_OUTPUT_DETAIL,
431 |     ):
432 |         """
433 |         Main API exposed to the external UI REST API.
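   | 
   |         Returns the comparison result directly when it is available; when
   |         routed through the job queue it instead returns
   |         {"status": "ResultNotReady", "job_id": ...} and the caller is
   |         expected to poll check_job(job_id) for the final result.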
434 |         """
435 |         log.debug(
436 |             f"api_files_compare_kg(file_id1={file_id1}, file_id2={file_id2}, method={method}, output_detail={output_detail})"
437 |         )
438 |         file_id1 = KGNodeID.file_id(file_hash=file_id1)
439 |         file_id2 = KGNodeID.file_id(file_hash=file_id2)
440 | 
441 |         obj_id = file_id1
442 |         qkey = ["files_compare_kg", file_id2, method, output_detail]
443 | 
444 |         # NOTE: the threaded job path below is bypassed for now; the
   |         # comparison is computed with a direct (blocking) call.
445 |         return self._files_compare_kg(obj_id, file_id2, method, output_detail)
446 | 
447 |         # return self._api_thread_enter(obj_id, qkey, target=self._files_compare_kg)
448 | 
449 |     def _files_compare_kg(
450 |         self, file_id1, file_id2, method="gene_v0", output_detail=DEFAULT_OUTPUT_DETAIL
451 |     ):
452 |         log.debug(
453 |             f"_files_compare_kg(file_id1={file_id1}, file_id2={file_id2}, method={method}, output_detail={output_detail})"
454 |         )
455 |         qkey = ["files_compare_kg", file_id2, method, output_detail]
456 |         try:
457 |             t1 = time.time()
458 |             fnode1 = self.kg.get_node(file_id1)
459 |             fnode2 = self.kg.get_node(file_id2)
460 |             t2 = time.time()
461 |             if fnode1 is None or fnode2 is None:
462 |                 msg = ""
463 |                 if fnode1 is None:
464 |                     msg = f"file_id:{file_id1} could not be found."
465 |                 if fnode2 is None:
466 |                     msg += f" file_id:{file_id2} could not be found."
467 | 
468 |                 out = {
469 |                     "status": API_STATE_EMPTY_RESULT,
470 |                     "results": [],
471 |                     "stats": {"init_prep_time": t2 - t1},
472 |                     "status_msg": msg,
473 |                 }
474 |                 self._api_thread_final(file_id1, qkey, out)
475 |                 log.warning(f"_files_compare_kg returning: {out}")
476 |                 return out
477 | 
478 |             if (not is_exec(fnode1)) or (not is_exec(fnode2)):
479 |                 # not an executable file; fall back to gene_v0
480 |                 log.info("not an executable file; falling back to gene_v0")
481 |                 method = "gene_v0"
482 | 
483 |             flags = method.split(".")
484 |             version = flags[0]
485 |             if len(flags) > 1:
486 |                 method = flags[1]
487 |             else:
488 |                 method = DEFAULT_CALCULATION_METHOD
489 | 
490 |             if version == "gene_v0":
491 |                 results, stats = self.file_compare_gene_v0(
492 |                     fnode1, fnode2, output_detail=output_detail
493 |                 )
494 |                 self._update_output(fnode1, results, fnode2)
495 |             elif version in ["genes_v1_3_0", "genes_v1_3_1"]:
496 |                 # TODO pass match/mismatch thresholds
497 |                 results, stats = self.kg.bindiff(
498 |                     fnode1, fnode2, method=method, output_detail=output_detail
499 |                 )
500 |                 self._update_output(fnode1, results, fnode2)
501 |             else:
502 |                 log.error(
503 |                     f"_files_compare_kg error for fileids: {file_id1, file_id2}, version: {version}, method: {method}"
504 |                 )
505 |                 results, stats = {
506 |                     "error": f"version: {version}, method: {method} not supported."
507 |                 }, {}
508 | 
509 |             stats["init_prep_time"] = stats.get("init_prep_time", 0.0) + (t2 - t1)
510 |             out = {
511 |                 "query": [fnode1, fnode2],
512 |                 "results": results,
513 |                 "stats": stats,
514 |                 "status": API_STATE_SUCCESS,
515 |             }
516 |             if "error" in results:
517 |                 out["status"] = API_STATE_ERROR
518 |                 out["status_msg"] = results["error"]
519 |             elif len(results) == 0:
520 |                 out["status"] = API_STATE_EMPTY_RESULT
521 |             out = self._prep_output(out, output_detail)
522 |             self._api_thread_final(file_id1, qkey, out)
523 |             return out
524 |         except Exception as err:
525 |             log.error(
526 |                 f"Exception at _files_compare_kg(). {err}. {repr(traceback.format_exc())}."
527 |             )
528 |             out = {"status": API_STATE_ERROR, "status_msg": str(err)}
529 |             self._api_thread_final(file_id1, qkey, out)
530 |             return out
531 | 
532 |     def api_get_gene_info(
533 |         self,
534 |         gene_id=None,
535 |         file_id=None,
536 |         function_name=None,
537 |         include_llvm_ir=False,
538 |         include_asm=False,
539 |         include_gene_value=False,
540 |         include_function_names=False,
541 |     ):
542 |         log.debug(
543 |             f"api_get_gene_info({gene_id=}, {file_id=}, {function_name=}, {include_llvm_ir=}, {include_asm=}...)"
544 |         )
545 |         # not threaded
546 |         try:
547 |             out = {"status": API_STATE_ERROR, "status_msg": "Unknown error"}
548 |             if gene_id:
549 |                 data = self.kg.get_gene_info(
550 |                     gene_id,
551 |                     function_name=function_name,
552 |                     llvm_ir=include_llvm_ir,
553 |                     include_asm=include_asm,
554 |                     gene_value=include_gene_value,
555 |                     func_names=include_function_names,
556 |                 )
557 |                 if data:
558 |                     out = {"status": API_STATE_SUCCESS, "data": data}
559 |                 else:
560 |                     out = {
561 |                         "status": API_STATE_EMPTY_RESULT,
562 |                         "status_msg": "Cannot get gene info",
563 |                     }
564 |             elif file_id and function_name:
565 |                 gids = self.kg.get_gene_ids(function_name, file_id, include_bin_id=True)
566 |                 # a binary may have multiple functions with the same name; take the last
567 |                 if len(gids) > 0:
568 |                     gene_id, file_id = gids[-1]
569 |                     data = self.kg.get_gene_info(
570 |                         gene_id,
571 |                         bin_id=file_id,
572 |                         function_name=function_name,
573 |                         llvm_ir=include_llvm_ir,
574 |                         include_asm=include_asm,
575 |                         gene_value=include_gene_value,
576 |                         func_names=include_function_names,
577 |                     )
578 |                     if data:
579 |                         out = {"status": API_STATE_SUCCESS, "data": data}
580 |                     else:
581 |                         out = {
582 |                             "status": API_STATE_EMPTY_RESULT,
583 |                             "status_msg": "Cannot get gene info",
584 |                         }
585 |                 else:
586 |                     out = {
587 |                         "status": API_STATE_EMPTY_RESULT,
588 |                         "status_msg": f"Function gene not found for {file_id} -> {function_name}.",
589 |                     }
590 |             else:
591 |                 out = {
592 |                     "status": API_STATE_ERROR,
593 |                     "status_msg": "Not enough arguments. gene_id or (file_id and function_name) must be passed.",
594 |                 }
595 |             return out
596 |         except Exception as err:
597 |             log.error(
598 |                 f"Exception at api_get_gene_info(). {err}. {repr(traceback.format_exc())}."
599 |             )
600 |             out = {"status": API_STATE_ERROR, "status_msg": str(err)}
601 |             return out
602 | 
603 |     def api_get_node_info(
604 |         self,
605 |         obj_id=None,
606 |         include_genes=False,
607 |         include_llvm_ir=False,
608 |         include_asm=False,
609 |         include_gene_value=False,
610 |         include_function_names=False,
611 |     ):
612 |         # not threaded
613 |         try:
614 |             if obj_id in self.kg.bins:
615 |                 data = self.kg.get_node(obj_id)
616 |                 if data:
617 |                     if include_genes:
618 |                         data["genes"] = self.kg.bins.get(obj_id, {})
619 |                     out = {"status": API_STATE_SUCCESS, "data": data}
620 |                 else:
621 |                     out = {
622 |                         "status": API_STATE_EMPTY_RESULT,
623 |                         "status_msg": f"Object id {obj_id} not found",
624 |                     }
625 | 
626 |             else:
627 |                 data = self.kg.get_gene_info(
628 |                     obj_id,
629 |                     llvm_ir=include_llvm_ir,
630 |                     include_asm=include_asm,
631 |                     gene_value=include_gene_value,
632 |                     func_names=include_function_names,
633 |                 )
634 |                 if data:
635 |                     out = {"status": API_STATE_SUCCESS, "data": data}
636 |                 else:
637 |                     out = {
638 |                         "status": API_STATE_EMPTY_RESULT,
639 |                         "status_msg": f"Object id {obj_id} not found",
640 |                     }
641 |             return out
642 | 
643 |         except Exception as err:
644 |             log.error(
645 |                 f"Exception at api_get_node_info(). {err}. {repr(traceback.format_exc())}."
646 |             )
647 |             out = {"status": API_STATE_ERROR, "status_msg": str(err)}
648 |             return out
649 | 
650 | 
651 | def read_config():
652 |     config = {
653 |         "gene_dir": CG_GENE_DIR,
654 |         "cache_dir": CG_CACHE_DIR,
655 |         "keep_aux_files": True,
656 |     }
657 |     return config
658 | 
659 | 
660 | def create_genome_service():
661 |     config = read_config()
662 |     log.debug(f"read_config(). {config}")
663 |     kgs = GenomeService(config)
664 |     log.debug("GenomeService object created.")
665 |     # TODO: trigger this from the API instead, to reduce service startup time
666 |     log.debug("updating index.")
667 |     t1 = time.time()
668 |     kgs.kg.load()
669 |     t2 = time.time()
670 |     log.debug(f"updating index completed in {t2 - t1:.2f}s.")
671 |     return kgs
672 | 
--------------------------------------------------------------------------------