├── .gitignore
├── utils
    └── app
    │   ├── app
    │       ├── __init__.py
    │       ├── api
    │       │   ├── __init__.py
    │       │   ├── api.py
    │       │   ├── status.py
    │       │   ├── delete.py
    │       │   ├── add.py
    │       │   ├── compare.py
    │       │   └── search.py
    │       ├── core
    │       │   ├── __init__.py
    │       │   ├── schema.py
    │       │   └── genome_service.py
    │       ├── defaults.py
    │       └── main.py
    │   └── uwsgi.ini
├── codegenome
    ├── __init__.py
    ├── ir
    │   ├── __init__.py
    │   ├── canon.py
    │   └── ir.py
    ├── genes
    │   ├── __init__.py
    │   ├── base.py
    │   ├── utils.py
    │   └── sigmal.py
    ├── kg
    │   └── __init__.py
    ├── lifters
    │   ├── __init__.py
    │   ├── base.py
    │   └── retdec.py
    ├── pipelines
    │   ├── base.py
    │   ├── __init__.py
    │   └── retdecsigmal.py
    ├── utils.py
    ├── _file_format.py
    └── _defaults.py
├── docs
    └── _static
    │   ├── overview.png
    │   └── code_genome.png
├── .gitmodules
├── .env.defaults
├── docker
    ├── install_cleanup.sh
    ├── install_llvm_pass.sh
    ├── install_pyleargist.sh
    ├── llvm-gcc-fix.patch
    ├── llvmlite-settypename.patch
    ├── install_retdec.sh
    ├── install_all_local.sh
    ├── install_llvmlite.sh
    ├── Dockerfile.dev
    ├── Dockerfile
    └── decompiler-config.json
├── requirements.txt
├── tests
    ├── p
    │   ├── build_elf.sh
    │   ├── g.c
    │   ├── build.sh
    │   └── p.c
    ├── unit_tests.py
    ├── test_sigmal.py
    ├── test_data.py
    ├── test_ir.py
    ├── test_lifters.py
    ├── test_api_core.py
    └── test_kg.py
├── CODEOWNERS
├── .pre-commit-config.yaml
├── scripts
    ├── bin2bc_dir
    ├── fmt.sh
    ├── build_hash_map.py
    ├── run_service.py
    ├── build_gkg.py
    ├── bin2bc
    └── cg
├── setup.py
├── README.md
├── Makefile
├── CONTRIBUTING.md
└── LICENSE


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | 


--------------------------------------------------------------------------------
/utils/app/app/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/utils/app/app/api/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/utils/app/app/core/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/codegenome/__init__.py:
--------------------------------------------------------------------------------
1 | from .kg import GenomeKG
2 | 


--------------------------------------------------------------------------------
/codegenome/ir/__init__.py:
--------------------------------------------------------------------------------
1 | from .ir import IRBinary
2 | 


--------------------------------------------------------------------------------
/codegenome/genes/__init__.py:
--------------------------------------------------------------------------------
1 | from .sigmal import SigmalGene
2 | 


--------------------------------------------------------------------------------
/codegenome/kg/__init__.py:
--------------------------------------------------------------------------------
1 | from .kg import BinGene, GenomeKG
2 | 


--------------------------------------------------------------------------------
/codegenome/lifters/__init__.py:
--------------------------------------------------------------------------------
1 | from .retdec import CGRetdec
2 | 


--------------------------------------------------------------------------------
/docs/_static/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code-genome/codegenome/HEAD/docs/_static/overview.png


--------------------------------------------------------------------------------
/docs/_static/code_genome.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/code-genome/codegenome/HEAD/docs/_static/code_genome.png


--------------------------------------------------------------------------------
/utils/app/uwsgi.ini:
--------------------------------------------------------------------------------
1 | [uwsgi]
2 | module = app.main
3 | callable = app
4 | enable-threads = true
5 | master = true
6 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "deps/canon_pass"]
2 | 	path = deps/canon_pass
3 | 	url = https://github.com/code-genome/canon_pass.git
4 | 


--------------------------------------------------------------------------------
/codegenome/pipelines/base.py:
--------------------------------------------------------------------------------
1 | class CGPipeline(object):
2 |     def process_file(self, file_path):
3 |         raise NotImplemented()
4 | 


--------------------------------------------------------------------------------
/codegenome/lifters/base.py:
--------------------------------------------------------------------------------
1 | class CGLifterBase(object):
2 |     def process_file(self, file_path, output_path=None):
3 |         raise NotImplemented()
4 | 


--------------------------------------------------------------------------------
/codegenome/genes/base.py:
--------------------------------------------------------------------------------
1 | class CGGeneBase(object):
2 |     def from_data(self, data):
3 |         raise NotImplemented()
4 | 
5 |     def from_bitcode(self, data):
6 |         raise NotImplemented()
7 | 


--------------------------------------------------------------------------------
/.env.defaults:
--------------------------------------------------------------------------------
1 | CG_DATA_ROOT_DIR="~/.cg"
2 | TMP_UPLOAD_DIR="/tmp/"
3 | GC_SERVICE_LOG_PATH="/tmp/cg.rest.log"
4 | CG_DEBUG=40 #python logger int (CRITICAL = 50, ERROR = 40, WARNING = 30, INFO = 20, DEBUG = 10)
5 | 


--------------------------------------------------------------------------------
/codegenome/pipelines/__init__.py:
--------------------------------------------------------------------------------
1 | def get_pipeline_by_version(gene_version, **kwargs):
2 |     if gene_version == "genes_v0_0_1":
3 |         from .retdecsigmal import RetdecSigmalV1
4 | 
5 |         return RetdecSigmalV1(**kwargs)
6 | 


--------------------------------------------------------------------------------
/docker/install_cleanup.sh:
--------------------------------------------------------------------------------
1 | #cleanup
2 | rm -f /tmp/llvmlite-settypename.patch
3 | rm -rf /tmp/llvm_pass
4 | rm -rf /tmp/tmp_src
5 | rm -rf /tmp/*
6 | apt-get remove -y \
7 |     git g++ make cmake libcurl4-openssl-dev\
8 |     python3-dev
9 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | numpy==1.23.4
 2 | sklearn==0.0
 3 | Pillow==9.3.0
 4 | scipy==1.9.3
 5 | pefile
 6 | enum34
 7 | joblib
 8 | flask==2.1.3
 9 | flask-restx==0.5.1
10 | werkzeug==2.1.2
11 | sqlitedict
12 | jsonlines
13 | python-dotenv
14 | 


--------------------------------------------------------------------------------
/tests/p/build_elf.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | P=p
 3 | A='elf'
 4 | gcc -O0 -o $P'.gcc.0.'$A $P'.c'
 5 | gcc -O3 -o $P'.gcc.3.'$A $P'.c'
 6 | clang -O0 -o $P'.clang.0.'$A $P'.c'
 7 | clang -O3 -o $P'.clang.3.'$A $P'.c'
 8 | clang -Oz -o $P'.clang.z.'$A $P'.c'
 9 | 
10 | 


--------------------------------------------------------------------------------
/docker/install_llvm_pass.sh:
--------------------------------------------------------------------------------
1 | TMP=/tmp/llvm_pass
2 | [ ! -e "$TMP" ] && cp -r ../deps/canon_pass "$TMP"/llvm_pass
3 | 
4 | cd "$TMP" && \
5 |   PATH=/opt/llvm/bin/:$PATH make &&\
6 |   cp build/libcanonicalization-pass.so /opt/llvm/lib/libcanonicalization-pass.so && \
7 |   cd /tmp && \
8 |   rm -rf "$TMP"
9 | 


--------------------------------------------------------------------------------
/utils/app/app/defaults.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | #override for REST service
4 | DEFAULT_CALCULATION_METHOD = "jaccard_distance"
5 | DEFAULT_COMPARE_METHOD = "genes_v1_3_1.jaccard_distance_w"
6 | DEFAULT_OUTPUT_DETAIL = "simple"
7 | TMP_DIR_PREFIX = "cg_temp_upload_"
8 | TMP_UPLOAD_DIR = os.environ.get("TMP_UPLOAD_DIR", "/tmp/")
9 | 


--------------------------------------------------------------------------------
/docker/install_pyleargist.sh:
--------------------------------------------------------------------------------
 1 | #install pyleargist
 2 | apt-get install -y \
 3 |   git \
 4 |   python3-dev \
 5 |   libfftw3-dev ; \
 6 |   git clone https://github.com/vertexcover-io/pyleargist.git; \
 7 |   cd pyleargist; \
 8 |   pip3 install . ;\
 9 |   apt-get remove -y \
10 |   libfftw3-dev; \
11 |   cd .. ;\
12 |   rm -rf pyleargist


--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
 1 | #####################################################
 2 | #
 3 | # List of approvers for codegenome repository
 4 | #
 5 | #####################################################
 6 | #
 7 | # Learn about CODEOWNERS file format:
 8 | #  https://help.github.com/en/articles/about-code-owners
 9 | #
10 | 
11 | *  @dhilung @souljang
12 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 |   - repo: https://github.com/pre-commit/mirrors-prettier
 3 |     rev: v2.1.2
 4 |     hooks:
 5 |       - id: prettier
 6 |   - repo: https://github.com/psf/black
 7 |     rev: 22.3.0
 8 |     hooks:
 9 |       - id: black
10 |   - repo: https://github.com/PyCQA/isort
11 |     rev: 5.11.5
12 |     hooks:
13 |       - id: isort
14 | 


--------------------------------------------------------------------------------
/scripts/bin2bc_dir:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # usages: bin2bc_dir <input_dir> <output_dir>
 3 | 
 4 | SRC="$1"
 5 | DST="$2"
 6 | if [ "$SRC" == "" ];then
 7 |     echo Usage: $0 input_dir output_dir
 8 |     exit 1
 9 | fi
10 | if [ "$DST" == "" ];then
11 |     DST="."
12 | else
13 |     mkdir -p $DST
14 | fi
15 | 
16 | find "$SRC"|xargs -n 1 -P $(nproc --all) ./bin2bc --output "$DST" --keep_dsm --keep_ll
17 | 


--------------------------------------------------------------------------------
/codegenome/utils.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | 
 3 | 
 4 | class ProfileLog:
 5 |     def __init__(self, logger, name=""):
 6 |         self.name = name
 7 |         self.logger = logger
 8 | 
 9 |     def __enter__(self):
10 |         self.start = time.time()
11 | 
12 |     def __exit__(self, type, value, traceback):
13 |         self.t = time.time() - self.start
14 |         self.logger.info(self.name + " time: %f" % self.t)
15 | 


--------------------------------------------------------------------------------
/tests/unit_tests.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | import sys
 4 | import unittest
 5 | 
 6 | logging.basicConfig(
 7 |     filename="/tmp/cg_unit_tests.log",
 8 |     level=logging.DEBUG,
 9 |     format="%(asctime)s, %(name)s, %(levelname)s, %(message)s",
10 |     datefmt="%m/%d/%Y %H:%M:%S",
11 | )
12 | 
13 | from test_ir import *
14 | from test_kg import *
15 | from test_lifters import *
16 | from test_sigmal import *
17 | 
18 | if __name__ == "__main__":
19 |     unittest.main()
20 | 


--------------------------------------------------------------------------------
/utils/app/app/api/api.py:
--------------------------------------------------------------------------------
 1 | from flask_restx import Api
 2 | 
 3 | from ..main import app
 4 | 
 5 | 
 6 | def check_event_loop():
 7 |     pass
 8 | 
 9 | 
10 | api = Api(
11 |     app,
12 |     version="0.0.1",
13 |     title="Code Genome",
14 |     description="Code Genome APIs",
15 | )
16 | 
17 | from . import add  # noqa
18 | from . import compare  # noqa
19 | from . import delete  # noqa
20 | from . import search  # noqa
21 | from . import status  # noqa
22 | 
23 | # import config
24 | 


--------------------------------------------------------------------------------
/tests/p/g.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | int g;
 3 | int* gp = &g;
 4 | int vec[]= {1,2,3};
 5 | int* intptr;
 6 | 
 7 | int f0(int a){
 8 |     int tmp = a;
 9 |     tmp = a + *gp;
10 |     return tmp;
11 | }
12 | 
13 | int f1(int a)
14 | {
15 |   int tmp;
16 |   tmp = a + vec[0];
17 |   return tmp;
18 | }
19 | 
20 | void f2(int a){
21 |   a = f0(a);
22 |   printf("%d\n", a);
23 | }
24 | 
25 | int main(int argc, char* argv[])
26 | {
27 |   int a = atoi(argv[1]);
28 |   int x = f0(a);
29 |   int y = f1(x);
30 |   f2(y);
31 | }
32 | 


--------------------------------------------------------------------------------
/tests/p/build.sh:
--------------------------------------------------------------------------------
 1 | #/bin/bash
 2 | 
 3 | P=$1
 4 | A='unknown'
 5 | if [ "$(uname)" == "Linux" ];then
 6 |     A='elf'
 7 |     llvm_dis=$(which llvm-dis)
 8 |     if [ "$llvm_dis" == "" ];then
 9 |         llvm_dis='/opt/llvm/bin/llvm-dis'
10 |     fi
11 | fi
12 | 
13 | if [ "$(uname)" == "Darwin" ];then
14 |     A='mac'
15 |     llvm_dis='/usr/local/Cellar/llvm/9.0.0/bin/llvm-dis'
16 | fi
17 | 
18 | #echo gcc -O0 -o $P'.gcc.0.'$A $P'.c'
19 | #echo gcc -O3 -o $P'.gcc.3.'$A $P'.c'
20 | echo clang -O0 -o $P'.clang.0.'$A $P'.c'
21 | echo clang -O3 -o $P'.clang.3.'$A $P'.c'
22 | echo clang -O0 -emit-llvm -o $P'.clang.0.bc' -c $P'.c'
23 | echo $llvm_dis $P'.clang.0.bc'
24 | echo clang -O3 -emit-llvm -o $P'.clang.3.bc' -c $P'.c'
25 | echo $llvm_dis $P'.clang.3.bc'
26 | 
27 | 
28 | 


--------------------------------------------------------------------------------
/scripts/fmt.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #pip install pre-commit
 3 | pre-commit install-hooks
 4 | pre-commit run --all-files
 5 | RETURN_CODE=$?
 6 | 
 7 | function echoWarning() {
 8 |   LIGHT_YELLOW='\033[1;33m'
 9 |   NC='\033[0m' # No Color
10 |   echo -e "${LIGHT_YELLOW}${1}${NC}"
11 | }
12 | 
13 | if [ "$RETURN_CODE" -ne 0 ]; then
14 |   if [ "${CI}" != "true" ]; then
15 |     echoWarning "☝️ This appears to have failed, but actually your files have been formatted."
16 |     echoWarning "Make a new commit with these changes before making a pull request."
17 |   else
18 |     echoWarning "This test failed because your code isn't formatted correctly."
19 |     echoWarning 'Locally, run `make run fmt`, it will appear to fail, but change files.'
20 |     echoWarning "Add the changed files to your commit and this stage will pass."
21 |   fi
22 | 
23 |   exit $RETURN_CODE
24 | fi
25 | 


--------------------------------------------------------------------------------
/docker/llvm-gcc-fix.patch:
--------------------------------------------------------------------------------
 1 | diff --git a/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h b/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h
 2 | index 9e3478e9fd29..efd55339418b 100644
 3 | --- a/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h
 4 | +++ b/llvm/include/llvm/Demangle/MicrosoftDemangleNodes.h
 5 | @@ -4,6 +4,8 @@
 6 |  #include "llvm/Demangle/Compiler.h"
 7 |  #include "llvm/Demangle/StringView.h"
 8 |  #include <array>
 9 | +#include <cstdint>
10 | +#include <string>
11 |  
12 |  class OutputStream;
13 |  
14 | diff --git a/llvm/utils/benchmark/src/benchmark_register.h b/llvm/utils/benchmark/src/benchmark_register.h
15 | index 0705e219f2fa..6001fb8e0e48 100644
16 | --- a/llvm/utils/benchmark/src/benchmark_register.h
17 | +++ b/llvm/utils/benchmark/src/benchmark_register.h
18 | @@ -2,6 +2,7 @@
19 |  #define BENCHMARK_REGISTER_H
20 |  
21 |  #include <vector>
22 | +#include <limits>
23 |  
24 |  #include "check.h"
25 |  
26 | 


--------------------------------------------------------------------------------
/scripts/build_hash_map.py:
--------------------------------------------------------------------------------
 1 | import hashlib
 2 | import os
 3 | import pickle
 4 | import re
 5 | import shutil
 6 | import subprocess
 7 | import sys
 8 | 
 9 | """
10 | Build sha256 hashmap
11 | 
12 | usage:
13 | python build_hash_map.py src_path output_path
14 | """
15 | 
16 | PAT_EXECS = "ELF|PE32|DLL"
17 | 
18 | 
19 | def hashmap(srcd, output=None):
20 |     hmap = {}
21 |     flist = subprocess.check_output(["find", srcd]).split("\n")
22 |     for fn in flist:
23 |         ft = subprocess.check_output(["file", "-b", fn]).strip()
24 |         m = re.findall(PAT_EXECS, ft, re.IGNORECASE)
25 |         if len(m) > 0:
26 |             bin_id = hashlib.sha256(open(fn, "rb").read()).hexdigest()
27 |             if bin_id in hmap:
28 |                 hmap[bin_id].append(fn)
29 |             else:
30 |                 hmap[bin_id] = [fn]
31 |     if output:
32 |         with open(output, "w") as f:
33 |             pickle.dump(hmap, f)
34 |     return hmap
35 | 
36 | 
37 | hashmap(sys.argv[1], sys.argv[2])
38 | 


--------------------------------------------------------------------------------
/scripts/run_service.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import json
 3 | import logging
 4 | import os
 5 | import sys
 6 | import dotenv
 7 | dotenv.load_dotenv()
 8 | 
 9 | logfn = os.environ.get("GC_SERVICE_LOG_PATH", "/tmp/cg_dev_run.log")
10 | if not os.path.exists(os.path.dirname(logfn)):
11 |     os.makedirs(os.path.dirname(logfn))
12 |     
13 | logging.basicConfig(
14 |     filename=logfn,
15 |     format="%(asctime)s, %(name)s, %(levelname)s, %(message)s",
16 |     datefmt="%m/%d/%Y %H:%M:%S",
17 |     level=logging.DEBUG,
18 | )
19 | 
20 | 
21 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../"))
22 | sys.path.insert(
23 |     0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils/app/")
24 | )
25 | from app.main import *  # noqa
26 | 
27 | sgkg_log = logging.getLogger("codegenome")
28 | sgkg_log.setLevel(logging.DEBUG)
29 | 
30 | host = "127.0.0.1"
31 | port = 5001
32 | 
33 | if len(sys.argv) > 1:
34 |     host = sys.argv[1]
35 | if len(sys.argv) > 2:
36 |     port = int(sys.argv[2])
37 | app.run(host=host, debug=True, port=port)
38 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from setuptools import find_namespace_packages, setup
 3 | from setuptools.command.develop import develop
 4 | from setuptools.command.install import install
 5 | 
 6 | with open("README.md", "r") as fh:
 7 |     long_description = fh.read()
 8 | 
 9 | with open("requirements.txt") as f:
10 |     requirements = f.read().splitlines()
11 | 
12 | 
13 | class PostInstallDependencies(install):
14 |     def run(self):
15 |         install.run(self)
16 | 
17 | 
18 | setup(
19 |     name="codegenome",
20 |     version="0.0.1",
21 |     description="Code Genome framework",
22 |     url="https://research.ibm.com/",
23 |     author="IBM Research",
24 |     author_email="dkirat@us.ibm.com",
25 |     classifiers=[
26 |         "Programming Language :: Python :: 3",
27 |         "Operating System :: OS Independent",
28 |     ],
29 |     packages=find_namespace_packages(include=["codegenome*"]),
30 |     scripts=["scripts/cg"],
31 |     python_requires=">=3.8",
32 |     install_requires=requirements,
33 |     mdclass={"install": PostInstallDependencies},
34 | )
35 | 


--------------------------------------------------------------------------------
/utils/app/app/main.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import logging.config
 3 | import os
 4 | 
 5 | from flask import Flask
 6 | from flask_restx import Api, Resource, fields
 7 | from werkzeug.middleware.proxy_fix import ProxyFix
 8 | import dotenv
 9 | dotenv.load_dotenv()
10 | 
11 | # logging.config.fileConfig('logging.conf')
12 | logging.basicConfig(filename= os.environ.get("GC_SERVICE_LOG_PATH", "/tmp/cg.rest.log"))
13 | log = logging.getLogger("codegenome.rest")
14 | log.setLevel(int(os.environ.get("CG_DEBUG", logging.ERROR)))
15 | ch = logging.StreamHandler()
16 | ch.setFormatter(logging.Formatter("%(asctime)s, %(name)s, %(levelname)s, %(message)s"))
17 | log.addHandler(ch)
18 | 
19 | 
20 | app = Flask(__name__)
21 | app.wsgi_app = ProxyFix(app.wsgi_app)
22 | 
23 | from .core.genome_service import create_genome_service  # noqa
24 | 
25 | kgs = create_genome_service()
26 | from .api import api  # noqa
27 | 
28 | if __name__ == "__main__":
29 |     # Only for debugging while developing
30 |     # app.run(host="127.0.0.1", debug=True, port=5000)
31 |     app.run(host="0.0.0.0", debug=True)
32 | 


--------------------------------------------------------------------------------
/docker/llvmlite-settypename.patch:
--------------------------------------------------------------------------------
 1 | diff --git a/ffi/value.cpp b/ffi/value.cpp
 2 | index 05c67b6..9327faa 100644
 3 | --- a/ffi/value.cpp
 4 | +++ b/ffi/value.cpp
 5 | @@ -408,6 +408,18 @@ LLVMPY_GetTypeName(LLVMTypeRef type)
 6 |      return LLVMPY_CreateString("");
 7 |  }
 8 |  
 9 | +API_EXPORT(void)
10 | +LLVMPY_SetTypeName(LLVMTypeRef type, const char *Name)
11 | +{
12 | +    // try to convert to a struct type, works for other derived
13 | +    // types too
14 | +    llvm::Type* unwrapped = llvm::unwrap(type);
15 | +    llvm::StructType* ty = llvm::dyn_cast<llvm::StructType>(unwrapped);
16 | +    if (ty && !ty->isLiteral()) {
17 | +        ty->setName(Name);
18 | +    }
19 | +}
20 | +
21 |  API_EXPORT(bool)
22 |  LLVMPY_TypeIsPointer(LLVMTypeRef type)
23 |  {
24 | diff --git a/llvmlite/binding/value.py b/llvmlite/binding/value.py
25 | index 4e21b3e..b13cdba 100644
26 | --- a/llvmlite/binding/value.py
27 | +++ b/llvmlite/binding/value.py
28 | @@ -53,6 +53,10 @@ class TypeRef(ffi.ObjectRef):
29 |          """
30 |          return ffi.ret_string(ffi.lib.LLVMPY_GetTypeName(self))
31 |  
32 | +    @name.setter
33 | +    def name(self, val):
34 | +        ffi.lib.LLVMPY_SetTypeName(self, _encode_string(val))
35 | +
36 |      @property
37 |      def is_pointer(self):
38 |          """
39 | 


--------------------------------------------------------------------------------
/docker/install_retdec.sh:
--------------------------------------------------------------------------------
 1 | # install retdec 
 2 | pushd $(pwd)
 3 | PREFIX=/opt/cg/retdec
 4 | mkdir -p $PREFIX
 5 | VER=$(cat /etc/issue|cut -d' ' -f2)
 6 | 
 7 | if [[ $VER < "22" ]]; then
 8 |     #ubuntu version < 22
 9 |     #BIN_URL=https://github.com/avast/retdec/releases/download/v4.0/retdec-v4.0-ubuntu-64b.tar.xz does not work
10 |     
11 |     DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential cmake git openssl libssl-dev python3 autoconf automake libtool pkg-config m4 zlib1g-dev upx doxygen graphviz
12 |     mkdir -p /tmp/retdec
13 |     cd /tmp/retdec
14 |     git clone https://github.com/avast/retdec.git && \
15 | 	cd retdec && \
16 | 	git checkout 3435bc827d2c2c5da91dfb84509af0c034ee22b5 && \
17 | 	mkdir build && \
18 | 	cd build && \
19 | 	cmake .. -DCMAKE_INSTALL_PREFIX=$PREFIX -DCMAKE_LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/7/ && \
20 | 	make -j 8 && \
21 | 	make install
22 | 
23 |     rm -rf /tmp/retdec
24 | 
25 | else
26 |     BIN_URL=https://github.com/avast/retdec/releases/download/v5.0/RetDec-v5.0-Linux-Release.tar.xz
27 |     wget $BIN_URL -O /tmp/retdec.tar.xz && \
28 |     tar -xJf /tmp/retdec.tar.xz -C $PREFIX/ ;\
29 |     rm /tmp/retdec.tar.xz
30 | fi
31 | 
32 | # replace with our config
33 | popd
34 | cp decompiler-config.json $PREFIX'/share/retdec/decompiler-config.json'
35 | echo "Retdec installed. Please do: export RETDEC_PATH=$PREFIX"
36 | 


--------------------------------------------------------------------------------
/tests/test_sigmal.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | import sys
 4 | import unittest
 5 | 
 6 | import numpy as np
 7 | 
 8 | logging.basicConfig(
 9 |     filename="/tmp/cg-test-sigmal.log",
10 |     level=logging.DEBUG,
11 |     format="%(asctime)s, %(name)s, %(levelname)s, %(message)s",
12 |     datefmt="%m/%d/%Y %H:%M:%S",
13 | )
14 | 
15 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../"))
16 | 
17 | import test_data as data
18 | 
19 | 
20 | class TestSigmal(unittest.TestCase):
21 |     def setUp(self):
22 |         from codegenome.genes import SigmalGene
23 |         from codegenome.ir import IRBinary
24 | 
25 |         self.irb = IRBinary(data.global_ir, ll=True)
26 |         self.sm = SigmalGene()
27 |         pass
28 | 
29 |     def test_gene(self):
30 |         bc = self.irb.fs["f0"].get_bc()
31 |         g1 = self.sm.from_bitcode(bc)
32 |         g2 = self.sm.from_bitcode(bc, gene_type="sigmal")
33 |         g3 = self.sm.from_bitcode(bc, gene_type="sigmal2")
34 |         g4 = self.sm.from_bitcode(bc, gene_type="sigmal2b")
35 |         g5 = self.sm.from_bitcode(bc, gene_type="func_only")
36 |         g6 = self.sm.from_data(bc)
37 | 
38 |         self.assertTrue(np.array_equal(g1, g2))
39 |         self.assertTrue(np.array_equal(g1, g6))
40 |         self.assertFalse(np.array_equal(g1, g3))
41 |         self.assertFalse(np.array_equal(g3, g4))
42 |         self.assertFalse(np.array_equal(g4, g5))
43 | 
44 | 
45 | if __name__ == "__main__":
46 |     unittest.main(verbosity=2)
47 | 


--------------------------------------------------------------------------------
/tests/test_data.py:
--------------------------------------------------------------------------------
 1 | global_c = """
 2 | int g;
 3 | int* gp = &g;
 4 | int vec[]= {1,2,3};
 5 | int* intptr;
 6 | 
 7 | int f0(int a){
 8 |     int tmp = a;
 9 |     tmp = a + *gp;
10 |     return tmp;
11 | }
12 | 
13 | int f1(int a)
14 | {
15 |   int tmp;
16 |   tmp = a + vec[0];
17 |   f0(a);
18 |   return tmp;
19 | }
20 | """
21 | 
22 | global_ir = """
23 | @g = global i32 123, align 4
24 | @gp = global i32* @g, align 8
25 | @vec = global [3 x i32] [i32 1, i32 2, i32 3], align 4
26 | @intptr = common global i32* null, align 8
27 | 
28 | define i32 @f0(i32 %a) {
29 |   %1 = load i32*, i32** @gp, align 8
30 |   %2 = load i32, i32* %1, align 4
31 |   %3 = add nsw i32 %2, %a
32 |   ret i32 %3
33 | }
34 | 
35 | define i32 @f1(i32) {
36 |   %2 = load i32, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @vec, i64 0, i64 0), align 4
37 |   %3 = add nsw i32 %2, %0
38 |   ret i32 %3
39 | }
40 | """
41 | 
42 | type_ir = """
43 | %type1 =  type {
44 |     i32,
45 |     i32,
46 |     double
47 | }
48 | %type2 = type {
49 |     i32,
50 |     i32,
51 |     %type1
52 | }
53 | 
54 | define i32* @f0(i32 %a) {
55 | 
56 |   %1 = alloca %type2
57 |   %2 = getelementptr %type2, %type2* %1, i32 0, i32 1
58 |   ret i32* %2
59 | }
60 | """
61 | 
62 | externs_ir = """
63 | declare i32 @printf(i8*, ...) local_unnamed_addr
64 | 
65 | define i32 @local_func(i32 %x) {
66 |   ret i32 %x
67 |   }
68 | 
69 | define i32 @f0(i32 %a, i8* %format) {
70 |   %1 = call i32 @local_func(i32 %a)
71 |   %2 = tail call i32 (i8*, ...) @printf(i8* %format)
72 |   ret i32 %1
73 | }
74 | """
75 | 


--------------------------------------------------------------------------------
/utils/app/app/api/status.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | from flask_restx import Resource, fields
 4 | 
 5 | from ..core.genome_service import (API_STATE_EMPTY_RESULT,
 6 |                                    API_STATE_RESULT_NOT_READY)
 7 | from ..main import kgs
 8 | from .api import api
 9 | 
10 | ns = api.namespace("api/v1/status", description="Service status")
11 | 
12 | logger = logging.getLogger("codegenome.rest")
13 | 
14 | 
15 | @ns.route("/")
16 | class ConfigStatus(Resource):
17 |     """Service status."""
18 | 
19 |     def get(self):
20 |         """Service status"""
21 |         try:
22 |             return kgs.status()
23 |         except KeyError as e:
24 |             api.abort(404, f"Failed to retrieve service status.")
25 | 
26 | 
27 | @ns.route("/job/<job_id>")
28 | @ns.response(200, "Final result")
29 | @ns.response(202, "Request received. Result not ready. Must retry.")
30 | @ns.response(204, "Result empty")
31 | @ns.response(404, "Job id not found")
32 | class Job(Resource):
33 |     """Job status"""
34 | 
35 |     def get(self, job_id):
36 |         """Get Job status."""
37 | 
38 |         try:
39 |             ret = kgs.check_job(job_id)
40 |             if ret.get("status") == API_STATE_RESULT_NOT_READY:
41 |                 return ret, 202
42 |             elif ret.get("status") == API_STATE_EMPTY_RESULT:
43 |                 if ret.get("query") is None:  # root search node not found.
44 |                     return ret, 404
45 | 
46 |             return ret
47 |         except Exception as e:
48 |             api.abort(500, f"Exception: {e}")
49 | 


--------------------------------------------------------------------------------
/docker/install_all_local.sh:
--------------------------------------------------------------------------------
 1 | # local install
 2 | apt-get update && \
 3 |   apt-get install -y \
 4 |   python3 \
 5 |   python3-pip \
 6 |   python-is-python3 \
 7 |   wget
 8 | 
 9 | cp -f decompiler-config.json /tmp/
10 | cp -f install_retdec.sh /tmp/
11 | cp -f install_pyleargist.sh /tmp/
12 | 
13 | cp -f llvmlite-settypename.patch /tmp/
14 | cp -f llvm-gcc-fix.patch /tmp/
15 | cp -f install_llvmlite.sh /tmp/
16 | 
17 | cp -rf ../llvm_pass /tmp/llvm_pass
18 | cp -f install_llvm_pass.sh /tmp/
19 | 
20 | export PATH="/opt/cg/retdec/bin:$PATH"
21 | export RETDEC_PATH="/opt/cg/retdec"
22 | 
23 | cleanup() {
24 |     cd /tmp
25 |     rm -f /tmp/llvmlite-settypename.patch
26 |     rm -f /tmp/llvm-gcc-fix.patch
27 |     rm -rf /tmp/llvm_pass
28 |     rm -rf /tmp/tmp_src
29 |     rm -f /tmp/decompiler-config.json
30 |     rm -f /tmp/install_retdec.sh
31 |     rm -f /tmp/install_pyleargist.sh
32 |     rm -f /tmp/install_llvmlite.sh
33 |     rm -f /tmp/install_llvm_pass.sh
34 | }
35 | 
36 | cd /tmp && bash install_retdec.sh && \
37 | cd /tmp && bash install_pyleargist.sh && \
38 | cd /tmp && bash install_llvmlite.sh && \
39 | cd /tmp && bash install_llvm_pass.sh
40 | 
41 | # Check the exit status of the last command
42 | if [ $? -ne 0 ]; then
43 |     echo "Installation failed. Do you want to cleanup files from /tmp? (y/n)"
44 |     read answer
45 | 
46 |     case ${answer:0:1} in
47 |         y|Y )
48 |             echo "Cleaning up.."
49 |             cleanup
50 |             ;;
51 |         * )
52 |             echo "Exiting..."
53 |             exit 1
54 |             ;;
55 |     esac
56 | else
57 |     cleanup
58 | fi
59 | 
60 | 


--------------------------------------------------------------------------------
/utils/app/app/api/delete.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | from flask_restx import Resource, fields
 4 | from flask import request
 5 | from flask_restx import Resource
 6 | from werkzeug.datastructures import FileStorage
 7 | import copy
 8 | import traceback
 9 | import logging
10 | import tempfile
11 | 
12 | from .api import api
13 | from ..main import kgs
14 | from ..core.genome_service import API_STATE_RESULT_NOT_READY, API_STATE_EMPTY_RESULT, API_STATE_ERROR
15 | from ..defaults import *
16 | 
17 | logger = logging.getLogger('codegenome.rest')
18 | ns = api.namespace("api/v1/delete", description="Delete from KG.")
19 | 
20 | upload_parser = api.parser()
21 | 
22 | file_args = api.model(
23 |     "file_args",
24 |     {"file_id": fields.String(
25 |         required=True, default='',
26 |         description="The identifier of the file")})
27 | 
28 | 
29 | @ns.route("/file")
30 | @ns.response(200, "Success")
31 | @ns.response(404, "Object id not found")
32 | class DeleteFile(Resource):
33 |     """Delete by `file_id`."""
34 | 
35 |     @ns.expect(file_args)
36 |     def post(self):
37 |         """Delete by `file_id`."""
38 |         try:
39 |             args = dict(api.payload)
40 |             ret = kgs.delete_file(args.get('file_id'))
41 |             if ret.get('status') == API_STATE_RESULT_NOT_READY:
42 |                 return ret, 202
43 |             elif ret.get('status') == API_STATE_EMPTY_RESULT:
44 |                 return ret, 404
45 |             elif ret.get('status') == API_STATE_ERROR:
46 |                 return ret, 500
47 |             return ret
48 |         except Exception as e:
49 |             api.abort(500, f"Exception: {e}")
50 | 


--------------------------------------------------------------------------------
/codegenome/_file_format.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import joblib
 4 | 
 5 | _GKG_FILE_VERSION = "0.3"
 6 | _CANON_FILE_VERSION_ = "0.3"
 7 | _GENE_FILE_VERSION_ = "0.3"
 8 | 
 9 | 
10 | def get_file_meta(file_path, file_size=None):
11 |     if file_size is None:
12 |         file_size = os.path.getsize(file_path)
13 | 
14 |     return {"file_path": file_path, "file_size": file_size}
15 | 
16 | 
17 | def prep_gkg_file(gkg):
18 |     file_content = {
19 |         "type": "gkg",
20 |         "version": _GKG_FILE_VERSION,
21 |         "data": gkg.serialize(),
22 |     }
23 |     return file_content
24 | 
25 | 
26 | def read_gkg_file(path):
27 |     data = joblib.load(path)
28 |     assert data["type"] == "gkg"
29 |     assert data["version"] == _GKG_FILE_VERSION
30 |     return data
31 | 
32 | 
33 | def prep_gene_file(genes, binid, file_meta):
34 |     file_content = {
35 |         "type": "gene",
36 |         "version": _GENE_FILE_VERSION_,
37 |         "binid": binid,
38 |         "genes": genes,
39 |         "file_meta": file_meta,
40 |     }
41 |     return file_content
42 | 
43 | 
44 | def read_gene_file(path):
45 |     data = joblib.load(path)
46 |     assert data["type"] == "gene"
47 |     assert data["version"] == _GENE_FILE_VERSION_
48 |     return data
49 | 
50 | 
51 | def prep_canon_file(ir_bin, file_meta):
52 |     file_content = {
53 |         "type": "canon",
54 |         "version": _CANON_FILE_VERSION_,
55 |         "binid": ir_bin._bin_id,
56 |         "funcs": ir_bin.serialize(),
57 |         "file_meta": file_meta,
58 |     }
59 |     return file_content
60 | 
61 | 
62 | def read_canon_file(path):
63 |     data = joblib.load(path)
64 |     assert data["type"] == "canon"
65 |     assert data["version"] == _CANON_FILE_VERSION_
66 |     return data
67 | 


--------------------------------------------------------------------------------
/utils/app/app/core/schema.py:
--------------------------------------------------------------------------------
 1 | import hashlib
 2 | import uuid
 3 | 
 4 | import numpy as np
 5 | 
 6 | DEFAULT_ID_DELIMITER = ":"  # TODO move to a separate common module
 7 | 
 8 | 
 9 | def get_type_from_id(id_):
10 |     if type(id_) == str:
11 |         id_comp = id_.split(DEFAULT_ID_DELIMITER)
12 |         if len(id_comp) > 1:
13 |             return id_comp[0]
14 |     return None
15 | 
16 | 
17 | class KGNodeTypes:
18 |     file = "file"
19 |     gene = "gene"
20 |     cache = "cache"
21 |     stat = "stat"
22 | 
23 | 
24 | class KGNodeID:
25 |     @staticmethod
26 |     def _mk_id(_type, _hash):
27 |         # return f"{_type}:{_hash}"
28 |         return _hash
29 | 
30 |     @staticmethod
31 |     def split(_id):
32 |         return str(_id).split(":")
33 | 
34 |     @staticmethod
35 |     def id(_type, data=None, hash=None, file_path=None, return_hash=False):
36 |         if hash is None:
37 |             if data is not None:
38 |                 if type(data) != bytes:
39 |                     # forced conversion!
40 |                     data = str(data).strip().lower().encode("utf-8")
41 |                 hash = hashlib.sha256(data).hexdigest()
42 | 
43 |             elif file_path is not None:
44 |                 with open(file_path, "rb") as f:
45 |                     hash = hashlib.sha256(f.read()).hexdigest()
46 |             else:
47 |                 raise Exception("parameter missing")
48 | 
49 |         if return_hash:
50 |             return KGNodeID._mk_id(_type, hash), hash
51 |         return KGNodeID._mk_id(_type, hash)
52 | 
53 |     @staticmethod
54 |     def file_id(file_data=None, file_hash=None, file_path=None, return_hash=False):
55 |         return KGNodeID.id(
56 |             KGNodeTypes.file, file_data, file_hash, file_path, return_hash
57 |         )
58 | 


--------------------------------------------------------------------------------
/docker/install_llvmlite.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # install modified llvmlite
 3 | # Tested on Ubuntu 22.04
 4 | 
 5 | cp llvmlite-settypename.patch llvm-gcc-fix.patch /tmp/
 6 | 
 7 | apt-get update && \
 8 |     DEBIAN_FRONTEND=noninteractive apt-get install -y \
 9 |     python3 python3-pip python-is-python3 \
10 |     git g++ make cmake vim unzip libcurl4-openssl-dev wget  ;\
11 | 
12 | TMP=/tmp/tmp_src
13 | mkdir -p $TMP
14 | wget -O $TMP/llvm.tar.gz https://github.com/llvm/llvm-project/archive/llvmorg-8.0.1.tar.gz ;\
15 | cd $TMP && tar xf $TMP/llvm.tar.gz; \
16 | cd $TMP && git clone https://github.com/numba/llvmlite.git && \
17 | cd llvmlite && \
18 | git checkout aa11b129c0b55973067422397821ae6d44fa5e70 && \
19 | git apply --whitespace=nowarn /tmp/llvmlite-settypename.patch && \
20 | mv $TMP/llvmlite/conda-recipes/twine_cfg_undefined_behavior.patch $TMP/llvmlite/conda-recipes/twine_cfg_undefined_behavior.patch.bak;\
21 | cd $TMP/llvm-project-llvmorg-8.0.1/llvm && \
22 | for f in $TMP/llvmlite/conda-recipes/*.patch; do  patch -fN -p1 -i $f; done ;\
23 | cd $TMP/llvm-project-llvmorg-8.0.1/ && \
24 | patch -fN -p1 -i /tmp/llvm-gcc-fix.patch ;\
25 | 
26 | # fix recipes --------
27 | LLVMLITESRC=$TMP/llvmlite
28 | 
29 | BUILD=$LLVMLITESRC/conda-recipes/llvmdev/build.sh
30 | 
31 | if grep -q '^RECIPE_DIR' $BUILD; then
32 |    true;
33 | else
34 | ex $BUILD <<EX1
35 | /^# SVML tests on x86_64/
36 | a
37 | RECIPE_DIR=$LLVMLITESRC/conda-recipes/llvmdev
38 | .
39 | wq!
40 | EX1
41 | fi
42 | # fix --------
43 | 
44 | cd $TMP/llvm-project-llvmorg-8.0.1/llvm && \
45 | chmod +x $TMP/llvmlite/conda-recipes/llvmdev/build.sh && \
46 | PREFIX=/opt/llvm CPU_COUNT=12 CMAKE_BUILD_PARALLEL_LEVEL=2 $TMP/llvmlite/conda-recipes/llvmdev/build.sh ;\
47 | cd $TMP/llvmlite && LLVM_CONFIG=/opt/llvm/bin/llvm-config python3 setup.py install
48 | 
49 | 


--------------------------------------------------------------------------------
/docker/Dockerfile.dev:
--------------------------------------------------------------------------------
 1 | FROM --platform=linux/amd64 ubuntu:22.04
 2 | 
 3 | RUN apt-get update && \
 4 |   apt-get install -y \
 5 |   python3 \
 6 |   python3-pip \
 7 |   python-is-python3 \
 8 |   wget && \
 9 |   apt-get clean
10 | 
11 | WORKDIR "/tmp"
12 | 
13 | # retdec 
14 | COPY docker/decompiler-config.json /tmp/
15 | COPY docker/install_retdec.sh /tmp/
16 | 
17 | ENV PATH="$PATH:/opt/cg/retdec/bin"
18 | ENV RETDEC_PATH="/opt/cg/retdec"
19 | 
20 | # leargist
21 | COPY docker/install_pyleargist.sh /tmp/
22 | 
23 | #install modified llvmlite
24 | COPY docker/llvmlite-settypename.patch docker/llvm-gcc-fix.patch docker/install_llvmlite.sh /tmp/
25 | 
26 | #llvm pass
27 | COPY deps/canon_pass /tmp/llvm_pass
28 | COPY docker/install_llvm_pass.sh /tmp/
29 | 
30 | #cleanup
31 | COPY docker/install_cleanup.sh /tmp/
32 | 
33 | #install and cleanup at the same time for a thin docker layer
34 | RUN cd /tmp && bash install_retdec.sh && \
35 |   cd /tmp && bash install_pyleargist.sh && \
36 |   cd /tmp && bash install_llvmlite.sh && \
37 |   cd /tmp && bash install_llvm_pass.sh
38 |   #cd /tmp && bash install_cleanup.sh do not cleanup for dev
39 | 
40 | ARG APP_DIR="/cg"
41 | RUN mkdir -p ${APP_DIR}
42 | WORKDIR ${APP_DIR}
43 | 
44 | COPY codegenome ./codegenome
45 | COPY scripts ./scripts
46 | COPY requirements.txt setup.py README.md ./
47 | COPY scripts/run_service.py ./
48 | COPY utils/app/ ./
49 | 
50 | ARG HOST_UID=1000
51 | # Create a non-root user
52 | ARG USERNAME=cguser
53 | ARG USER_UID=$HOST_UID
54 | ARG USER_GID=$USER_UID
55 | RUN groupadd --gid $USER_GID $USERNAME \
56 |     && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME
57 | 
58 | RUN python -mpip install .
59 | RUN chmod -x run_service.py
60 | COPY .env.defaults .env
61 | 
62 | # dev
63 | COPY utils/app/ ./
64 | 
65 | #keep user as root
66 | #USER $USERNAME
67 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Code Genome
 2 | 
 3 | Code Genome is a framework for extracting semantic code fingerprints or "genes" from executable binaries or source code.
 4 | 
 5 | Core functionalities of the framework are gene extraction from binaries, gene-level binary diffing (`genediff`), and gene search.
 6 | 
 7 | ![overview](./docs/_static/overview.png)
 8 | 
 9 | ## Quick Start
10 | 
11 | To clone this repo, run
12 | 
13 | ```
14 | git clone https://github.com/code-genome/codegenome.git
15 | cd codegenome
16 | git submodule update --init --recursive
17 | ```
18 | 
19 | To start a docker based instance, run
20 | 
21 | ```
22 | make start
23 | ```
24 | 
25 | This will start a CodeGenome UI instance at http://localhost:5000
26 | 
27 | ![codegenome](./docs/_static/code_genome.png)
28 | 
29 | To stop the instance, run
30 | 
31 | ```
32 | make stop
33 | ```
34 | 
35 | Cache data for processed files are located by default at `~/.cg/cache` folder.
36 | 
37 | ## Build and Installation
38 | 
39 | ### Local docker build
40 | 
41 | To build a docker image locally, run
42 | 
43 | ```
44 | make docker-builds
45 | make start_local
46 | ```
47 | 
48 | ### Local build
49 | 
50 | A local build and installation will require [RetDec](https://github.com/avast/retdec) and [LLVM](https://github.com/llvm).
51 | 
52 | Build is currently supported only on Debian-based distributions (e.g. Ubuntu).
53 | 
54 | Create a virtual environment.
55 | 
56 | ```
57 | python -mvenv .venv
58 | . .venv/bin/activate
59 | ```
60 | 
61 | Install dependencies.
62 | 
63 | ```
64 | make deps
65 | ```
66 | 
67 | Install `codegenome`.
68 | 
69 | ```
70 | pip install .
71 | ```
72 | 
73 | Run a CLI tool.
74 | 
75 | ```
76 | cg genediff /path/to/binary1 /path/to/binary2
77 | ```
78 | 
79 | ## Contributing
80 | 
81 | Check out our [contributing](./CONTRIBUTING.md) guide to learn how to contribute.
82 | 


--------------------------------------------------------------------------------
/tests/test_ir.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import logging
 3 | import os
 4 | import sys
 5 | import time
 6 | import unittest
 7 | 
 8 | import test_data as data
 9 | 
10 | logging.basicConfig(
11 |     filename="/tmp/cg-test-ir.log",
12 |     level=logging.DEBUG,
13 |     format="%(asctime)s, %(name)s, %(levelname)s, %(message)s",
14 |     datefmt="%m/%d/%Y %H:%M:%S",
15 | )
16 | 
17 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../"))
18 | 
19 | 
20 | class TestIR(unittest.TestCase):
21 |     def setUp(self):
22 |         pass
23 | 
24 |     def test_global(self):
25 |         from codegenome.ir import IRBinary
26 | 
27 |         irb = IRBinary(data.global_ir, ll=True)
28 |         self.assertTrue(len(irb.fs), 2)
29 |         f0_str = irb.fs["f0"].get_ll()
30 |         self.assertTrue("@gv1" in f0_str)
31 |         self.assertTrue("@gv2" in f0_str)
32 | 
33 |     def test_types(self):
34 |         from codegenome.ir import IRBinary
35 | 
36 |         irb = IRBinary(data.type_ir, ll=True)
37 |         self.assertTrue(len(irb.fs), 2)
38 |         f0_str = irb.fs["f0"].get_ll()
39 |         # print(f0_str)
40 |         # import ipdb; ipdb.set_trace()
41 |         self.assertTrue("%t1" in f0_str)
42 |         self.assertTrue("%t2" in f0_str)
43 |         self.assertFalse("type1" in f0_str)
44 |         self.assertFalse("type2" in f0_str)
45 | 
46 |     def test_externs(self):
47 |         from codegenome.ir import IRBinary
48 | 
49 |         irb = IRBinary(data.externs_ir, ll=True)
50 |         self.assertTrue(len(irb.fs), 2)
51 |         f0_str = irb.fs["f0"].get_ll()
52 |         # print(f0_str)
53 |         # import ipdb
54 |         # ipdb.set_trace()
55 |         self.assertTrue("@printf" in f0_str)
56 |         self.assertTrue("@gf1" in f0_str)
57 |         self.assertFalse("local_func" in f0_str)
58 | 
59 | 
60 | if __name__ == "__main__":
61 |     unittest.main(verbosity=2)
62 | 


--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM --platform=linux/amd64 ubuntu:22.04
 2 | 
 3 | RUN apt-get update && \
 4 |   apt-get install -y \
 5 |   python3 \
 6 |   python3-pip \
 7 |   python-is-python3 \
 8 |   wget && \
 9 |   apt-get clean
10 | 
11 | WORKDIR "/tmp"
12 | 
13 | # retdec 
14 | COPY docker/decompiler-config.json /tmp/
15 | COPY docker/install_retdec.sh /tmp/
16 | 
17 | ENV PATH="$PATH:/opt/cg/retdec/bin"
18 | ENV RETDEC_PATH="/opt/cg/retdec"
19 | 
20 | # leargist
21 | COPY docker/install_pyleargist.sh /tmp/
22 | 
23 | #install modified llvmlite
24 | COPY docker/llvmlite-settypename.patch docker/llvm-gcc-fix.patch docker/install_llvmlite.sh /tmp/
25 | 
26 | #llvm pass
27 | COPY deps/canon_pass /tmp/llvm_pass
28 | COPY docker/install_llvm_pass.sh /tmp/
29 | 
30 | #cleanup
31 | COPY docker/install_cleanup.sh /tmp/
32 | 
33 | #install and cleanup at the same time for a thin docker layer
34 | RUN cd /tmp && bash install_retdec.sh && \
35 |   cd /tmp && bash install_pyleargist.sh && \
36 |   cd /tmp && bash install_llvmlite.sh && \
37 |   cd /tmp && bash install_llvm_pass.sh && \
38 |   cd /tmp && bash install_cleanup.sh
39 | 
40 | ARG APP_DIR="/cg"
41 | RUN mkdir -p ${APP_DIR}
42 | WORKDIR ${APP_DIR}
43 | 
44 | COPY codegenome ./codegenome
45 | COPY scripts ./scripts
46 | COPY requirements.txt setup.py README.md ./
47 | COPY scripts/run_service.py ./
48 | COPY utils/app/ ./
49 | 
50 | ARG HOST_UID=1000
51 | # Create a non-root user
52 | ARG USERNAME=cguser
53 | ARG USER_UID=$HOST_UID
54 | ARG USER_GID=$USER_UID
55 | RUN groupadd --gid $USER_GID $USERNAME \
56 |     && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME
57 | 
58 | RUN python -mpip install .
59 | RUN chmod -x run_service.py
60 | COPY .env.defaults .env
61 | # dev
62 | #install clang for unit testing
63 | # RUN apt-get install -y clang 
64 | 
65 | USER $USERNAME
66 | 
67 | CMD ["python3", "/cg/run_service.py", "0.0.0.0", "5001"]
68 | LABEL org.opencontainers.image.source=https://github.com/code-genome/codegenome
69 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | image = cg-worker
 2 | image-dev = cg-dev
 3 | image-ui = cg-ui
 4 | 
 5 | docker-build-worker : docker/Dockerfile
 6 | 	# git pull
 7 | 	docker build -f docker/Dockerfile --build-arg HOST_UID=1001 -t $(image) .
 8 | 
 9 | docker-build-ui :
10 | 	cd /tmp/ && rm -rf /tmp/codegenome_ui && \
11 | 	git clone https://github.com/code-genome/codegenome_ui.git && \
12 | 	cd codegenome_ui && \
13 | 	docker build -t $(image-ui) . &&\
14 | 	cd /tmp/ && rm -rf /tmp/codegenome_ui
15 | 
16 | docker-builds : docker-build-worker docker-build-ui
17 | 	#none
18 | 
19 | docker-builds-remove:
20 | 	docker rm $(image) &>/dev/null
21 | 	docker rm $(image-ui) &>/dev/null
22 | 	docker image rm $(image) &>/dev/null
23 | 	docker image rm $(image-ui) &>/dev/null
24 | 
25 | start_local :
26 | 	mkdir -p $(shell echo ~)/.cg
27 | 	sudo chown -R 1001:1001 $(shell echo ~)/.cg
28 | 
29 | 	#run worker 
30 | 	docker run --rm -d -u 1001:1001 -p 5001:5001 -v  $(shell echo ~)/.cg:/home/cguser/.cg --name $(image) $(image)
31 | 
32 | 	#run ui
33 | 	docker run --rm -d -p 5000:5000 --add-host host.docker.internal:host-gateway -e CG_HOST="http://host.docker.internal:5001" --name $(image-ui) $(image-ui)
34 | 
35 | start_worker :
36 | 	mkdir -p $(shell echo ~)/.cg
37 | 	sudo chown -R 1001:1001 $(shell echo ~)/.cg
38 | 
39 | 	#run worker 
40 | 	docker run --rm -d -u 1001:1001 -p 5001:5001 -v  $(shell echo ~)/.cg:/home/cguser/.cg --name $(image) ghcr.io/code-genome/cg-worker:latest
41 | 
42 | start_ui :
43 | 	#run ui
44 | 	docker run --rm -d -p 5000:5000 --add-host host.docker.internal:host-gateway -e CG_HOST="http://host.docker.internal:5001" --name $(image-ui) ghcr.io/code-genome/cg-ui:latest
45 | 
46 | start : start_worker start_ui
47 | 
48 | stop : 
49 | 	docker stop $(image)
50 | 	docker stop $(image-ui)
51 | 
52 | deps :
53 | 	cd docker
54 | 	sudo bash install_all_local.sh
55 | 
56 | docker-build-dev : docker/Dockerfile.dev
57 | 	docker build -f docker/Dockerfile.dev --build-arg HOST_UID=$(shell id -u) -t $(image-dev) .
58 | 
59 | dev-cli : 
60 | 	docker run --rm -v $(shell pwd):/cg -t -i --entrypoint /bin/bash $(image-dev)
61 | 
62 | pre-commit :
63 | 	pre-commit run --all-files
64 | 


--------------------------------------------------------------------------------
/scripts/build_gkg.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import logging
 3 | import os
 4 | import sys
 5 | 
 6 | 
 7 | def main(args):
 8 | 
 9 |     logging.basicConfig(
10 |         filename="/tmp/build_gkg.log",
11 |         level=logging.DEBUG,
12 |         format="%(asctime)s, %(name)s, %(levelname)s, %(message)s",
13 |         datefmt="%m/%d/%Y %H:%M:%S",
14 |         force=True,
15 |     )
16 | 
17 |     log = logging.getLogger("gkg")
18 |     if args.verbose:
19 |         h = logging.StreamHandler(sys.stdout)
20 |         h.setLevel(logging.DEBUG)
21 |         log.addHandler(h)
22 | 
23 |     log.info("starting build_gkg")
24 | 
25 |     sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))
26 |     from sigmal.gkg import GenomeKG
27 | 
28 |     gkg = GenomeKG()
29 |     log.info("creating GenomeKG from %s" % (args.input_dir))
30 |     gkg.create(args.input_dir)
31 |     log.info("OK: GenomeKG created.")
32 |     if args.compute_tree:
33 |         log.info(
34 |             "computing BallTree.. using distance metric %s" % (args.distance_metric)
35 |         )
36 |         gkg.compute_tree(metric=args.distance_metric)
37 |         log.info("OK: BallTree computed.")
38 |     log.info("saving GenomeKG..")
39 |     r = gkg.save(args.output_file)
40 |     log.info("OK: GenomeKG save to %s" % (r))
41 | 
42 | 
43 | if __name__ == "__main__":
44 |     ap = argparse.ArgumentParser()
45 |     ap.add_argument("-v", "--verbose", default=False, action="store_true")
46 |     ap.add_argument(
47 |         "-c",
48 |         "--compute_tree",
49 |         default=True,
50 |         action="store_true",
51 |         help="Compute balltree.",
52 |     )
53 |     ap.add_argument(
54 |         "--distance_metric",
55 |         default="minkowski",
56 |         action="store_true",
57 |         help="Distance metric for compute balltree.",
58 |     )
59 |     ap.add_argument(
60 |         "-o",
61 |         "--output_file",
62 |         default=None,
63 |         help="Optional output GenomeKG file path. Defaults to {input_dir}.gkg.",
64 |     )
65 | 
66 |     ap.add_argument("input_dir")
67 | 
68 |     args = ap.parse_args()
69 | 
70 |     exit(main(args))
71 | 


--------------------------------------------------------------------------------
/utils/app/app/api/add.py:
--------------------------------------------------------------------------------
 1 | import copy
 2 | import json
 3 | import logging
 4 | import os
 5 | import tempfile
 6 | import traceback
 7 | 
 8 | from flask import request
 9 | from flask_restx import Resource, fields
10 | from werkzeug.datastructures import FileStorage
11 | 
12 | from ..core.genome_service import (API_STATE_EMPTY_RESULT, API_STATE_ERROR,
13 |                                    API_STATE_RESULT_NOT_READY)
14 | from ..defaults import *
15 | from ..main import kgs
16 | from .api import api
17 | 
18 | logger = logging.getLogger("codegenome.rest")
19 | ns = api.namespace("api/v1/add", description="Add to KG.")
20 | 
21 | upload_parser = api.parser()
22 | upload_parser.add_argument("file", location="files", type=FileStorage, required=True)
23 | 
24 | 
25 | @ns.route("/file")
26 | @ns.response(200, "Final result")
27 | @ns.response(
28 |     202, "Request received. Result not ready. Must check using `status/job/<job_id>`."
29 | )
30 | @ns.response(204, "Result empty")
31 | @ns.response(404, "Submissions id not found")
32 | @ns.expect(upload_parser)
33 | class Add(Resource):
34 |     def post(self):
35 |         args = upload_parser.parse_args(request)
36 |         uploaded_file = args["file"]  # This is FileStorage instance
37 |         # We can get the filename, stream, mimetype, etc. from it
38 |         logger.info("Received a file %s" % uploaded_file)
39 |         try:
40 |             if not os.path.exists(TMP_UPLOAD_DIR):
41 |                 os.makedirs(TMP_UPLOAD_DIR)
42 | 
43 |             tmpdir = tempfile.mkdtemp(prefix=TMP_DIR_PREFIX, dir=TMP_UPLOAD_DIR)
44 |             tmpfn = os.path.join(tmpdir, os.path.basename(uploaded_file.filename))
45 |             uploaded_file.save(tmpfn)
46 | 
47 |             ret = kgs.api_add_file(tmpfn)
48 |             if ret.get("status") == API_STATE_RESULT_NOT_READY:
49 |                 return ret, 202
50 |             elif ret.get("status") == API_STATE_EMPTY_RESULT:
51 |                 if ret.get("query") is None:  # root search node not found.
52 |                     return ret, 206
53 |             elif ret.get("status") == API_STATE_ERROR:
54 |                 return ret, 404
55 |             return ret
56 |         except Exception as e:
57 |             api.abort(404, f"Exception: {e}")
58 | 


--------------------------------------------------------------------------------
/tests/test_lifters.py:
--------------------------------------------------------------------------------
 1 | import hashlib
 2 | import json
 3 | import logging
 4 | import os
 5 | import sys
 6 | import time
 7 | import unittest
 8 | 
 9 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../"))
10 | from codegenome.lifters.retdec import CGRetdec  # noqa
11 | 
12 | os.environ.setdefault("RETDEC_PATH", "/opt/cg/retdec/")
13 | 
14 | TEST_D = "/tmp/cg_lifter_test"
15 | TEST_FN = "p/p.c"
16 | FUNC = "f1"
17 | 
18 | LOG_FN = os.path.join(TEST_D, "cg-test.log")
19 | 
20 | GENE_PATH = os.path.join(TEST_D, "sigmal")
21 | GKG_PATH = GENE_PATH + ".gkg"
22 | bin_id = None  # populated by test_compile()
23 | 
24 | test_gene_id = None  # populated by test_bingene()
25 | 
26 | FN = os.path.splitext(os.path.basename(TEST_FN))[0]
27 | DEST_FN = os.path.join(TEST_D, FN)
28 | 
29 | 
30 | def prepare():
31 |     os.system("rm -rf " + TEST_D)
32 |     os.system("mkdir -p " + TEST_D)
33 |     logging.basicConfig(
34 |         filename=LOG_FN,
35 |         level=logging.DEBUG,
36 |         format="%(asctime)s, %(name)s, %(levelname)s, %(message)s",
37 |         datefmt="%m/%d/%Y %H:%M:%S",
38 |         force=True,
39 |     )
40 | 
41 | 
42 | def clear():
43 |     print(f"clearing {TEST_D}")
44 |     # os.system('rm -rf '+TEST_D)
45 | 
46 | 
47 | class TestLifter(unittest.TestCase):
48 |     @classmethod
49 |     def setUpClass(cls):
50 |         prepare()
51 | 
52 |     @classmethod
53 |     def tearDownClass(cls):
54 |         clear()
55 | 
56 |     # test order is sorted test function names!
57 | 
58 |     def test_01_to_bc(self):
59 |         global bin_id
60 |         cmd = "clang -O0 -o %s %s" % (DEST_FN, TEST_FN)
61 |         os.system(cmd)
62 |         self.assertTrue(os.path.exists(DEST_FN))
63 |         with open(DEST_FN, "rb") as f:
64 |             bin_id = hashlib.sha256(f.read()).hexdigest()
65 | 
66 |         retdec = CGRetdec()
67 | 
68 |         retdec.process_file(DEST_FN)
69 | 
70 |         out_fn = os.path.join(TEST_D, FN + ".bc")
71 |         self.assertTrue(os.path.exists(out_fn))
72 | 
73 |         out_dir = os.path.join(TEST_D, "tmp")
74 |         os.makedirs(out_dir)
75 | 
76 |         retdec.process_file(DEST_FN, output_dir=out_dir, output_fname=bin_id)
77 | 
78 |         out_fn = os.path.join(out_dir, bin_id + ".bc")
79 |         self.assertTrue(os.path.exists(out_fn))
80 | 
81 | 
82 | if __name__ == "__main__":
83 |     unittest.main(verbosity=2)
84 | 


--------------------------------------------------------------------------------
/codegenome/_defaults.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Global default values
 3 | """
 4 | import os
 5 | import dotenv
 6 | import logging
 7 | 
 8 | #not configurable defaults
 9 | UNIVERSAL_FUNC_NAME = "_F" 
10 | KNOWN_CALCULATION_METHODS = ["jaccard_distance", "jaccard_distance_w", "all"]
11 | VALID_OUTPUT_DETAILS = ["simple", "complete"]
12 | 
13 | logger = logging.getLogger("cg.defaults")
14 | dotenv.load_dotenv()
15 | 
16 | CG_DATA_ROOT_DIR = os.path.expanduser(os.environ.get('CG_DATA_ROOT_DIR',"~/.cg"))
17 | CG_CACHE_DIR = os.path.expanduser(os.environ.get('CG_CACHE_DIR', os.path.join(CG_DATA_ROOT_DIR, 'cache')))
18 | 
19 | if not os.path.exists(CG_DATA_ROOT_DIR):
20 |     os.makedirs(CG_DATA_ROOT_DIR)
21 |     
22 | if not os.path.exists(CG_CACHE_DIR):
23 |     os.makedirs(CG_CACHE_DIR)
24 | 
25 | DEFAULT_GENE_VERSION = os.environ.get("DEFAULT_GENE_VERSION", "genes_v0_0_1")
26 | DEFAULT_EXEC_GENE_VERSION = os.environ.get("DEFAULT_EXEC_GENE_VERSION", DEFAULT_GENE_VERSION)
27 | 
28 | DEFAULT_CALCULATION_METHOD = os.environ.get("DEFAULT_CALCULATION_METHOD", "jaccard_distance_w")
29 | if DEFAULT_CALCULATION_METHOD not in KNOWN_CALCULATION_METHODS:
30 |     logger.error(f"Invalid DEFAULT_CALCULATION_METHOD={DEFAULT_CALCULATION_METHOD}")
31 |     DEFAULT_CALCULATION_METHOD = "jaccard_distance_w"
32 |     
33 | # function compare
34 | # minimum canonicalized function size (canon_bc_size). Smaller than this size will be
35 | # skipped during comparison.
36 | # Ref: 928 is the bc size of the following code
37 | # 'source_filename = "<string>"\n\ndeclare i64 @gf1() local_unnamed_addr\n\ndefine i64 @_F() local_unnamed_addr {\nb1:\n  %v1 = tail call i64 @gf1()\n  ret i64 %v1\n}\n'
38 | #
39 | MIN_GENE_SIZE_FILE_COMPARE = int(os.environ.get( "MIN_GENE_SIZE_FILE_COMPARE" ,1000))
40 | 
41 | # max genes allowed per file during comparison.
42 | MAX_GENES_PER_FILE_COMPARE = int(os.environ.get( "MAX_GENES_PER_FILE_COMPARE" ,50000))
43 | 
44 | # during pairwise file compare greater than or equal to this threshold will be considered as a match `~`
45 | FILE_COMPARE_FUNC_MATCH_SIM_THRESHOLD = float(os.environ.get( "FILE_COMPARE_FUNC_MATCH_SIM_THRESHOLD" ,0.99))
46 | 
47 | # for the same function names, greater than or equal to this threshold will be considered as a mismatch `!`,
48 | # smaller wil be considered delete `-`
49 | FILE_COMPARE_FUNC_MISMATCH_SIM_THRESHOLD = float(os.environ.get("FILE_COMPARE_FUNC_MISMATCH_SIM_THRESHOLD",0.80))
50 | 


--------------------------------------------------------------------------------
/tests/p/p.c:
--------------------------------------------------------------------------------
  1 | #include<stdio.h>
  2 | #include <stdlib.h>
  3 | 
  4 | int g=1;
  5 | int* gp = &g;
  6 | int vec[]= {1,2,3};
  7 | int* intptr;
  8 | 
  9 | int gf1(int a){
 10 |     int tmp = a;
 11 |     tmp = a + *gp;
 12 |     intptr = &g;
 13 |     return tmp;
 14 | }
 15 | 
 16 | int gf2(int a)
 17 | {
 18 |   int tmp;
 19 |   tmp = a + vec[0];
 20 |   return tmp;
 21 | }
 22 | 
 23 | int f1(int a){
 24 |   int x;
 25 |   x = a + 32;
 26 |   return x;
 27 | }
 28 | 
 29 | int f2(int a){
 30 |   int local=31;
 31 |   local +=1;  
 32 |   local = a + local;
 33 |   return local;
 34 | }
 35 | 
 36 | int f3(int a){
 37 |   int local = a;  
 38 |   a = local;
 39 |   local = 30;
 40 |   a = a + local;
 41 |   a+=2;
 42 |   __asm__("xor %eax, %eax");
 43 |   __asm__("xor %eax, %eax");
 44 |   __asm__("xor %eax, %eax");
 45 |   return a;
 46 | }
 47 | 
 48 | 
 49 | int f4(int a){
 50 |   int l; int m; l = 30;
 51 |   m = l - 10;
 52 |   if (a>10){
 53 |     a = a+l;
 54 |     a+=2;
 55 |     return a;
 56 |   }
 57 |   else{
 58 |     __asm__("xor %eax, %eax");
 59 |     return a+m+12;
 60 |   }
 61 | }
 62 | 
 63 | int f5(int a){
 64 |   int l;  int m;  l = 30; m = 20;
 65 |   if (a>100){
 66 |     l = l + a;
 67 |     if(a>501){
 68 |       int tmp = 30-a;tmp = a+tmp;
 69 |       a = a + tmp+2;
 70 |       return a;
 71 |     }
 72 |     a = l + 2; l = m = a;
 73 |     return l;
 74 |   }
 75 |   else  if(a>10)
 76 |   {
 77 |     int tmp = a; a +=a; a = a - tmp;
 78 |     a = a + 32; return a;
 79 |   }else {
 80 |     int tmp = 30-a; tmp = tmp + a;
 81 |     tmp = tmp +2; a = a + tmp;
 82 |     return a;
 83 |   }
 84 | }
 85 | 
 86 | int f5_(int a){
 87 |   int l;
 88 |   int m;
 89 |   l = 30;
 90 |   m = 20;
 91 |   if (a>100){
 92 |     l = l + a;
 93 |     if(a>501){
 94 |       int tmp = 30-a;
 95 |       tmp = a+tmp;
 96 |       a = a + tmp+2;
 97 |       return a;
 98 |     }
 99 |     a = l + 2;
100 |     l = m = a;
101 |     return l;
102 |   }
103 |   else  if(a>10)
104 |   {
105 |     int tmp = a;
106 |     a +=a;
107 |     a = a - tmp;
108 |     a = a + 32;
109 |     return a;
110 |   }else {
111 |     int tmp = 30-a;
112 |     tmp = tmp + a;
113 |     tmp = tmp +2;
114 |     a = a + tmp;
115 |     return a;
116 |   }
117 | }
118 | 
119 | int main(int argc, char* argv[])
120 | {
121 |   int a = atoi(argv[1]);
122 |   printf("%d\n%d\n%d",f1(a), f2(a),f3(a));
123 |   a = f4(0);
124 |   a = f5(0);
125 |   a = gf1(0);
126 |   a = gf2(0);
127 |   return 0;
128 | }
129 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | # Contributing Guide
  2 | 
  3 | ## Setting up the project
  4 | 
  5 | Build is currently supported only on Debian-based distributions (e.g. Ubuntu).
  6 | 
  7 | To clone this repo, run
  8 | 
  9 | ```
 10 | git clone https://github.com/code-genome/codegenome.git
 11 | cd codegenome
 12 | git submodule update --init --recursive
 13 | ```
 14 | 
 15 | Create a virtual environment.
 16 | 
 17 | ```
 18 | python -mvenv .venv
 19 | . .venv/bin/activate
 20 | ```
 21 | 
 22 | Install dependencies.
 23 | 
 24 | ```
 25 | make deps
 26 | ```
 27 | 
 28 | Install requirements.
 29 | 
 30 | ```
 31 | pip install -r requirements.txt
 32 | ```
 33 | 
 34 | Test run the CLI tool.
 35 | 
 36 | ```
 37 | python script/cg genediff /bin/chmod /bin/chown
 38 | ```
 39 | 
 40 | 
 41 | ## Running pre-commit before committing
 42 | 
 43 | First, install the pre-commit hooks:
 44 | 
 45 | ```bash
 46 | pip install pre-commit
 47 | pre-commit install
 48 | ```
 49 | 
 50 | To run pre-commit before committing:
 51 | 
 52 | ```bash
 53 | pre-commit run --all-files
 54 | ```
 55 | 
 56 | Or simply run:
 57 | 
 58 | ```bash
 59 | make pre-commit
 60 | ```
 61 | 
 62 | This will run the pre-commit hooks on all files.
 63 | 
 64 | The pre-commit hooks will:
 65 | 1. Check for any linting errors
 66 | 2. Check for any formatting errors
 67 | 3. Check for any security vulnerabilities
 68 | 4. Check for spelling errors
 69 | 4. Verify you used relative imports inside src/ directory
 70 | 5. Verify you used library imports outside src/ directory
 71 | 
 72 | ## Running Tests
 73 | 
 74 | 
 75 | ```
 76 | cd tests
 77 | python unit_tests.py
 78 | ```
 79 | 
 80 | # Repo principles:
 81 | 
 82 | ## Git
 83 | 
 84 | ## Legal
 85 | 
 86 | We have tried to make it as easy as possible to make contributions. This applies to how we handle the legal aspects of contribution. We use the same approach - the Developer's Certificate of Origin 1.1 (DCO) - that the Linux® Kernel community uses to manage code contributions.
 87 | 
 88 | We simply ask that when submitting a patch for review, the developer must include a sign-off statement in the commit message.
 89 | 
 90 | Here is an example Signed-off-by line, which indicates that the submitter accepts the DCO:
 91 | 
 92 | Signed-off-by: John Doe <john.doe@example.com>
 93 | You can include this automatically when you commit a change to your local git repository using the following command:
 94 | 
 95 | git commit -s
 96 | 
 97 | ### Commit
 98 | Always commit with a [good commit message](https://cbea.ms/git-commit/) and sign off:
 99 | 
100 | Example:
101 | 
102 | ```bash
103 | git commit -s
104 | ```
105 | 
106 | ### Push
107 | Push into a new branch and open a PR.
108 | 
109 | Example:
110 | 
111 | ```bash
112 | git push origin main:<my-new-branch-name>
113 | ```
114 | 


--------------------------------------------------------------------------------
/utils/app/app/api/compare.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import traceback
 3 | 
 4 | from flask_restx import Resource, fields
 5 | 
 6 | from ..core.genome_service import (API_STATE_EMPTY_RESULT,
 7 |                                    API_STATE_RESULT_NOT_READY)
 8 | from ..defaults import *
 9 | from ..main import kgs
10 | from .api import api, check_event_loop
11 | 
12 | logger = logging.getLogger("codegenome.rest")
13 | 
14 | ns = api.namespace("api/v1/compare", description="Compare binaries.")
15 | 
16 | compare_file_id_args = api.model(
17 |     "compare_file_id_args",
18 |     {
19 |         "id1": fields.String(
20 |             required=True, description="The file identifier (file sha256 hash)"
21 |         ),
22 |         "id2": fields.String(
23 |             required=True, description="The file identifier (file sha256 hash)"
24 |         ),
25 |         "method": fields.String(
26 |             required=False,
27 |             default=DEFAULT_COMPARE_METHOD,
28 |             description="Internal query method to be used. \
29 |     Currently supported values: [`gene_v0`, `genes_v1_3_0`, `genes_v1_3_0.jaccard_distance`, `genes_v1_3_0.jaccard_distance_w`,\
30 |          `genes_v1_3_0.composition_ratio`, `genes_v1_3_0.composition_ratio_w`,\
31 |          `genes_v1_3_0.containment_ratio`]",
32 |         ),
33 |         "output_detail": fields.String(
34 |             required=False,
35 |             default=DEFAULT_OUTPUT_DETAIL,
36 |             description="Output format. \
37 |     Supported values: ['simple','complete']",
38 |         ),
39 |     },
40 | )
41 | 
42 | 
43 | @ns.route("/files/by_file_ids")
44 | @ns.response(200, "Final result")
45 | @ns.response(202, "Request received. Result not ready. Must retry.")
46 | @ns.response(204, "Result empty")
47 | @ns.response(404, "File id not found")
48 | class KGCompareFileIDs(Resource):
49 |     """Compare binaries using genes."""
50 | 
51 |     @ns.expect(compare_file_id_args)
52 |     def post(self):
53 |         """Compare binaries using genes."""
54 |         args = api.payload
55 |         check_event_loop()
56 |         try:
57 |             ret = kgs.api_files_compare_kg(
58 |                 file_id1=args["id1"],
59 |                 file_id2=args["id2"],
60 |                 method=args.get("method", DEFAULT_COMPARE_METHOD),
61 |                 output_detail=args.get("output_detail", DEFAULT_OUTPUT_DETAIL),
62 |             )
63 |             if ret.get("status") == API_STATE_RESULT_NOT_READY:
64 |                 return ret, 202
65 |             elif ret.get("status") == API_STATE_EMPTY_RESULT:
66 |                 if ret.get("query") is None:  # root search node not found.
67 |                     return ret, 404
68 |                 else:
69 |                     return ret, 204
70 | 
71 |             return ret
72 |         except Exception as e:
73 |             api.abort(405, f"Exception: {e}")
74 | 
75 | 
76 | # TODO
77 | # @ns.route("/packages/by_package_ids")
78 | # @ns.route("/genes/by_gene_ids")
79 | 


--------------------------------------------------------------------------------
/codegenome/lifters/retdec.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import os
  3 | import shutil
  4 | import subprocess
  5 | import tempfile
  6 | import time
  7 | 
  8 | from .base import CGLifterBase
  9 | 
 10 | logger = logging.getLogger("codegenome.lifter.retdec")
 11 | 
 12 | DEFAULT_RETDEC_PATH = "/opt/retdec"
 13 | 
 14 | 
 15 | class CGRetdec(CGLifterBase):
 16 |     def __init__(self, retdec_path=None, logger=logger):
 17 |         self.retdec_path = (
 18 |             os.environ.get("RETDEC_PATH", DEFAULT_RETDEC_PATH)
 19 |             if retdec_path is None
 20 |             else retdec_path
 21 |         )
 22 |         self.logger = logger
 23 | 
 24 |     def process_file(
 25 |         self,
 26 |         file_path,
 27 |         output_dir=None,
 28 |         output_fname=None,
 29 |         retdec_logfile_path=None,
 30 |         retdec_path=None,
 31 |         keep_aux_files=False,
 32 |         overwrite=True,
 33 |     ):
 34 |         self.logger.debug(
 35 |             f"process_file. {file_path, output_dir, retdec_logfile_path, retdec_path}"
 36 |         )
 37 |         final_output_path = None
 38 |         fn = os.path.basename(file_path)
 39 | 
 40 |         if output_dir is None:
 41 |             output_dir = os.path.dirname(file_path)
 42 |         if output_dir == "":
 43 |             output_dir = "./"
 44 | 
 45 |         final_output_dir = output_dir
 46 |         output_dir = tempfile.mkdtemp(prefix="cgtmp__", dir="/tmp/")
 47 | 
 48 |         try:
 49 | 
 50 |             if output_fname is None:
 51 |                 output_fname = os.path.basename(file_path)
 52 |             else:
 53 |                 output_fname = os.path.basename(output_fname)
 54 | 
 55 |             if retdec_path is None:
 56 |                 retdec_path = self.retdec_path
 57 | 
 58 |             if retdec_logfile_path is None:
 59 |                 retdec_logfile_path = os.path.join(
 60 |                     output_dir, output_fname + ".retdec.log"
 61 |                 )
 62 | 
 63 |             args = [
 64 |                 os.path.join(retdec_path, "bin/retdec-decompiler"),
 65 |                 "-o",
 66 |                 os.path.join(output_dir, output_fname),
 67 |                 file_path,
 68 |             ]
 69 | 
 70 |             self.logger.info(f"running {args}")
 71 | 
 72 |             t = time.time()
 73 |             with open(retdec_logfile_path, "w") as fout:
 74 |                 ret = subprocess.call(args, stdout=fout, stderr=fout)
 75 | 
 76 |             if not keep_aux_files:
 77 |                 # output debug logs
 78 |                 with open(retdec_logfile_path, "r") as f:
 79 |                     logger.debug(f"RETDEC_LOG:\n{f.read()}\n")
 80 | 
 81 |             if ret == 0:
 82 |                 logger.debug(
 83 |                     f"RETDEC_OK. Time: {time.time()-t} secs. {[file_path, '->', output_dir]}"
 84 |                 )
 85 | 
 86 |             else:
 87 |                 logger.debug(
 88 |                     f"RETDEC_ERROR. Time: {time.time()-t} secs. {[file_path, '->', output_dir]}"
 89 |                 )
 90 |             # move
 91 | 
 92 |             for fn in os.listdir(output_dir):
 93 |                 ext = os.path.splitext(fn)[-1].lower()
 94 | 
 95 |                 if not keep_aux_files:
 96 |                     if ext != ".bc":
 97 |                         continue
 98 |                 if ext in [".bc", ".dsm", ".ll", ".log"]:
 99 |                     src = os.path.join(output_dir, fn)
100 | 
101 |                     if os.path.isfile(src):
102 |                         dst = os.path.join(final_output_dir, fn)
103 |                         if fn.endswith(".bc"):
104 |                             final_output_path = dst
105 |                         if os.path.exists(dst) and (not overwrite):
106 |                             continue
107 |                         # copy
108 |                         shutil.copy2(src, dst)
109 |                         os.remove(src)
110 | 
111 |         finally:
112 |             shutil.rmtree(output_dir)
113 |         return final_output_path
114 | 


--------------------------------------------------------------------------------
/tests/test_api_core.py:
--------------------------------------------------------------------------------
  1 | import hashlib
  2 | import json
  3 | import logging
  4 | import os
  5 | import sys
  6 | import time
  7 | import unittest
  8 | 
  9 | import test_data as data
 10 | 
 11 | sys.path.insert(
 12 |     0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils/app")
 13 | )
 14 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../"))
 15 | from app.core.genome_service import *  # noqa
 16 | 
 17 | from codegenome._defaults import UNIVERSAL_FUNC_NAME  # noqa
 18 | 
 19 | TEST_D = "/tmp/cg_test_api"
 20 | GENE_D = os.path.join("/tmp/cg_test_api", "local.kg")
 21 | TEST_FN = "p/p.c"
 22 | FUNC = "f1"
 23 | FN = os.path.splitext(os.path.basename(TEST_FN))[0]
 24 | DEST_FN = os.path.join(TEST_D, FN)
 25 | 
 26 | LOG_FN = os.path.join(TEST_D, "cg-test-api-core.log")
 27 | 
 28 | bin_id = None
 29 | 
 30 | CG_CONFIG = {"cache_dir": TEST_D, "gene_dir": GENE_D, "keep_aux_files": True}
 31 | 
 32 | 
 33 | def prepare():
 34 |     os.system("rm -rf " + TEST_D)
 35 |     os.system("mkdir -p " + TEST_D)
 36 |     logging.basicConfig(
 37 |         filename=LOG_FN,
 38 |         level=logging.DEBUG,
 39 |         format="%(asctime)s, %(name)s, %(levelname)s, %(message)s",
 40 |         datefmt="%m/%d/%Y %H:%M:%S",
 41 |         force=True,
 42 |     )
 43 | 
 44 | 
 45 | def clear():
 46 |     print(f"clearing {TEST_D}")
 47 |     # os.system('rm -rf '+TEST_D)
 48 | 
 49 | 
 50 | class TestAPI(unittest.TestCase):
 51 |     @classmethod
 52 |     def setUpClass(cls):
 53 |         prepare()
 54 | 
 55 |     @classmethod
 56 |     def tearDownClass(cls):
 57 |         clear()
 58 | 
 59 |     def setUp(self):
 60 |         pass
 61 | 
 62 |     def test_01_compile(self):
 63 |         global bin_id
 64 |         import shutil
 65 | 
 66 |         dest_s = os.path.join(TEST_D, os.path.basename(TEST_FN))
 67 |         shutil.copy(TEST_FN, dest_s)
 68 |         cmd = "clang -O0 -o %s %s" % (DEST_FN, dest_s)
 69 |         os.system(cmd)
 70 |         self.assertTrue(os.path.exists(DEST_FN))
 71 |         bin_id = hashlib.sha256(open(DEST_FN, "rb").read()).hexdigest()
 72 | 
 73 |     def test_02_genome_service(self):
 74 |         global bin_id
 75 |         gs = GenomeService(CG_CONFIG)
 76 |         self.assertTrue(os.path.exists(TEST_D))
 77 | 
 78 |         ret = gs.check_job("abcd")
 79 |         self.assertEqual(ret["status"], API_STATE_ERROR)
 80 | 
 81 |         ret = gs.api_add_file(DEST_FN)
 82 | 
 83 |         self.assertEqual(ret["status"], API_STATE_RESULT_NOT_READY)
 84 |         job_id = ret.get("job_id")
 85 |         # check immediately, should fail
 86 |         ret = gs.check_job(job_id)
 87 |         self.assertEqual(ret["status"], API_STATE_RESULT_NOT_READY)
 88 | 
 89 |         for _ in range(5):
 90 |             time.sleep(1)
 91 |             # print(_)
 92 |             ret = gs.check_job(job_id)
 93 |             if ret.get("status") == API_STATE_SUCCESS:
 94 |                 break
 95 |         # print(ret)
 96 |         self.assertEqual(ret["status"], API_STATE_SUCCESS)
 97 |         self.assertEqual(ret["ret_status"], "new_file")
 98 | 
 99 |         # check job, should not exit (self clean after successful completion)
100 |         ret = gs.check_job(job_id)
101 |         # print(ret)
102 |         # only enable in prod
103 |         # self.assertEqual(ret['status'],  API_STATE_ERROR)
104 | 
105 |         # try on existing file
106 |         ret = gs.api_add_file(DEST_FN)
107 | 
108 |         self.assertEqual(ret["status"], API_STATE_SUCCESS)
109 |         self.assertEqual(ret["ret_status"], "existing_file")
110 | 
111 |         ret = gs.api_files_compare_kg(bin_id, bin_id)
112 | 
113 |         # print(ret)
114 |         self.assertEqual(ret["status"], API_STATE_SUCCESS)
115 |         self.assertTrue(ret.get("query") is not None)
116 | 
117 |         # get ll
118 |         ret = gs.api_get_ir("main", bin_id)
119 |         # print(ret)
120 |         self.assertEqual(ret["status"], API_STATE_SUCCESS)
121 |         ret = ret.get("data")
122 |         self.assertTrue("llvm_ir" in ret)
123 |         self.assertTrue("gene_id" in ret)
124 |         ir = ret.get("llvm_ir")
125 |         # print(ir)
126 |         self.assertEqual(type(ir), str)
127 |         self.assertTrue(UNIVERSAL_FUNC_NAME in ir)
128 | 
129 | 
130 | if __name__ == "__main__":
131 |     unittest.main(verbosity=2)
132 | 


--------------------------------------------------------------------------------
/codegenome/genes/utils.py:
--------------------------------------------------------------------------------
  1 | import base64
  2 | import pickle
  3 | import zlib
  4 | 
  5 | import numpy as np
  6 | 
  7 | from .._defaults import *
  8 | 
  9 | 
 10 | def encode_gene(gene_data):
 11 |     if type(gene_data) == list:
 12 |         gene_data = np.array(gene_data).astype("float32").tobytes()
 13 |     if type(gene_data) == np.ndarray:
 14 |         gene_data = gene_data.astype("float32").tobytes()
 15 |     if type(gene_data) != bytes:
 16 |         raise Exception("gene data can not be converted to bytes.")
 17 |     return base64.b64encode(zlib.compress(gene_data)).decode("ascii")
 18 | 
 19 | 
 20 | def decode_gene(data_str):
 21 |     return np.frombuffer(zlib.decompress(base64.b64decode(data_str)), dtype="float32")
 22 | 
 23 | 
 24 | def decode_gene_by_ver(gene):
 25 |     # Implement version specific decoding if needed
 26 |     # gene.get('version')
 27 | 
 28 |     raw_gene = gene["value"]
 29 |     if type(raw_gene) == str:
 30 |         raw_gene = decode_gene(gene["value"])
 31 |     return raw_gene
 32 | 
 33 | 
 34 | def gene_distance(raw_gene1, raw_gene2, normalized=True):
 35 |     x = np.linalg.norm(raw_gene1 - raw_gene2)
 36 |     if normalized:
 37 |         x /= np.sqrt(len(raw_gene1))
 38 |     return x
 39 | 
 40 | 
 41 | def gene_similarity_score_adjusted(sim):
 42 |     return np.power(float(sim), 2)  # tone down similarity score
 43 | 
 44 | 
 45 | def gene_similarity(raw_gene1, raw_gene2, adjusted=False, normalized=True):
 46 |     sim = 1.0 - gene_distance(raw_gene1, raw_gene2, normalized)
 47 |     if adjusted:
 48 |         sim = gene_similarity_score_adjusted(sim)
 49 |     return sim
 50 | 
 51 | 
 52 | def gene_distance_by_ver(gene1, gene2, normalized=True):
 53 |     raw_gene1, raw_gene2 = decode_gene_by_ver(gene1), decode_gene_by_ver(gene2)
 54 |     x = np.linalg.norm(raw_gene1 - raw_gene2)
 55 |     if normalized:
 56 |         x /= np.sqrt(len(raw_gene1))
 57 |     return x
 58 | 
 59 | 
 60 | def gene_similarity_by_ver(gene1, gene2, adjusted=False, normalized=True):
 61 |     sim = 1.0 - gene_distance_by_ver(gene1, gene2, normalized)
 62 |     if adjusted:
 63 |         sim = gene_similarity_score_adjusted(sim)
 64 |     return sim
 65 | 
 66 | 
 67 | class GeneIterator(object):
 68 |     def __init__(self, data):
 69 |         self.idx = 0
 70 |         self.data = data
 71 | 
 72 |     def __iter__(self):
 73 |         return self
 74 | 
 75 |     def __next__(self):
 76 |         self.idx += 1
 77 |         try:
 78 |             return self.__getitem__(self.idx - 1)
 79 |         except IndexError:
 80 |             self.idx = 0
 81 |             raise StopIteration
 82 | 
 83 |     def __getitem__(self, ii):
 84 |         # override according to file version
 85 |         return self.data[ii]
 86 | 
 87 | 
 88 | class GeneFile(object):
 89 |     def __init__(self, data=None, file_path=None):
 90 |         if data is not None:
 91 |             self.data = pickle.loads(data)
 92 |         elif file_path is not None:
 93 |             self.data = pickle.load(open(file_path, "rb"))
 94 |         else:
 95 |             raise Exception("invalid argument.")
 96 | 
 97 |         if self.data["type"] != "gene":
 98 |             raise Exception(f"invalid file type {self.data['type']}")
 99 | 
100 |         self.version = self.data["version"]
101 | 
102 |         if self.version == "0.3":
103 |             self.init_v0_3()
104 |         else:
105 |             raise Exception("Unknown file version.")
106 | 
107 |     def init_v0_3(self):
108 |         self.binid = self.data["binid"]
109 |         self._genes = self.data["genes"]
110 |         self._meta = self.data["file_meta"]
111 | 
112 |         class GeneIteratorEx(GeneIterator):
113 |             def __getitem__(self, ii):
114 |                 cid, funcs, gene, gene_meta = self.data[ii]
115 |                 bc_size, file_offset = gene_meta
116 |                 return {
117 |                     "canon_bc_id": cid,
118 |                     "func_names": funcs,
119 |                     "gene": gene,
120 |                     "canon_bc_size": bc_size,
121 |                     "file_offset": file_offset,
122 |                 }
123 | 
124 |         self.genes = GeneIteratorEx(self._genes)
125 | 
126 |     @classmethod
127 |     def load(cls, file_path):
128 |         return cls(file_path=file_path)
129 | 
130 |     @classmethod
131 |     def loads(cls, data):
132 |         return cls(data=data)
133 | 


--------------------------------------------------------------------------------
/codegenome/ir/canon.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | import json
  4 | import jsonlines
  5 | import logging
  6 | import datetime
  7 | import subprocess
  8 | import hashlib
  9 | 
 10 | DEFAULT_LLVM_PATH = '/opt/llvm'
 11 | logger = logging.getLogger('codegenome.canon')
 12 | 
 13 | class IRCanonPassBinary(object):
 14 |     def __init__(self, input_data, output='canon.jsonl', bin_id='', pass_file='libcanonicalization-pass.so', llvm_path=None):
 15 |         self.input_data = input_data
 16 |         self._bin_id = bin_id
 17 |         self.llvm_path = os.environ.get(
 18 |             'LLVM_PATH', DEFAULT_LLVM_PATH) if llvm_path is None else llvm_path
 19 |         self.pass_file = os.path.join(self.llvm_path, 'lib',pass_file )
 20 |         self.opt_bin = os.path.join(self.llvm_path, 'bin', 'opt')
 21 |         self.output = output
 22 |         self.stat = {}
 23 |         
 24 |     def canon_pass(self):
 25 |         args = [self.opt_bin, '--load', self.pass_file,'--canonicalization', '--canon-out',
 26 |                  self.output]
 27 |         #print(' '.join(args))
 28 |         logger.info(f'running {args}')
 29 |         try:
 30 |             t = time.time()
 31 |             ret = subprocess.run(args, input=self.input_data, stdout=subprocess.DEVNULL,
 32 |                         stderr=subprocess.DEVNULL)
 33 |             if ret.returncode == 0:
 34 |                     logger.debug(
 35 |                         f"CANON_PASS_OK. Time: {time.time()-t} secs. {['->', self.output]}")
 36 |                     return self.output
 37 | 
 38 |             else:
 39 |                 logger.debug(
 40 |                     f"CANON_PASS_ERROR. Time: {time.time()-t} secs. {['->', self.output]}")
 41 |                 # move
 42 |         except Exception as ex:
 43 |             logger.error(f"Exception: {ex}")
 44 | 
 45 |         return None
 46 |     
 47 |     def serialize(self, statf=None):
 48 |         import llvmlite.binding as llvm #lazy loading
 49 |         fns = []
 50 |         i = 0
 51 |         tot = 0
 52 |         err = 0
 53 |         st = time.time()
 54 |         
 55 |         jsonl = self.canon_pass()
 56 |         if jsonl is None:
 57 |             return None
 58 |         t1 = time.time()
 59 |         
 60 |         with jsonlines.open(jsonl) as reader:
 61 |             for func in reader:
 62 |                 #code, data, extern, name
 63 |                 try:
 64 |                     if func['extern']:
 65 |                         continue
 66 |                     s = time.time()
 67 |                     #sort data
 68 |                     data = func['data'].split('\n')
 69 |                     data.sort()
 70 |                     data = '\n'.join([x for x in data if x!=''])
 71 |                     
 72 |                     m = llvm.parse_assembly( data + '\n' + func['code'] )
 73 |                     bc =  m.as_bitcode()
 74 |                     s = time.time() - s
 75 |                     tot += 1
 76 | 
 77 |                     gid = hashlib.sha256(bc).hexdigest()
 78 |                     # TODO get file_offset
 79 |                     bc_size = len(bc)
 80 |                     file_offset = 0
 81 |                     meta = (bc_size, file_offset)
 82 | 
 83 |                     # format (gene_id, func_name, bitcode, meta)
 84 |                     func_name = func['name']
 85 |                     fns.append((gid, func_name, bc, meta))
 86 |                     
 87 |                     if statf:
 88 |                         txt = '{"type": "OK", "i": %d, "ts": "%s", "func": "%s", "time": %f, "size": %d}' % (
 89 |                             i, str(datetime.datetime.now()), func_name, s, len(bc))
 90 |                         statf.write(txt + '\n')
 91 |                 except Exception as e:
 92 |                     err += 1
 93 |                     txt = '{"type": "ERR", "i": %d, "ts": "%s", "func": "%s", "e": "%s", "bin_id": "%s"}' % (
 94 |                         i, str(datetime.datetime.now()), func_name, str(e), self._bin_id)
 95 |                     logger.warning(txt)
 96 |                     if statf:
 97 |                         statf.write(txt + '\n')
 98 |                     else:
 99 |                         pass
100 |         t2 = time.time()
101 |         if statf:
102 |             stat = {"type": "stat", "bin_id": self._bin_id, "total": tot,
103 |                     "errors": err, "func_count": len(self.fs), 'pass_time': t1-st, 'time': t2-t1}
104 |             for k, v in self.stat.items():
105 |                 stat[k] = v
106 |             statf.write(json.dumps(stat) + '\n')
107 |         
108 |         return fns
109 | 


--------------------------------------------------------------------------------
/utils/app/app/api/search.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import traceback
  3 | 
  4 | from flask_restx import Resource, fields
  5 | 
  6 | from ..core.genome_service import (API_STATE_EMPTY_RESULT, API_STATE_ERROR,
  7 |                                    API_STATE_RESULT_NOT_READY)
  8 | from ..defaults import *
  9 | from ..main import kgs
 10 | from .api import api, check_event_loop
 11 | 
 12 | logger = logging.getLogger("codegenome.rest")
 13 | 
 14 | ns = api.namespace("api/v1/search", description="Search for info")
 15 | 
 16 | gene_info_args = api.model(
 17 |     "gene_info_args",
 18 |     {
 19 |         "gene_id": fields.String(
 20 |             required=False,
 21 |             default="",
 22 |             description="The gene identifier (sha256 hash). If this is not passed, `file_id` and `function_name` must be passed ",
 23 |         ),
 24 |         "file_id": fields.String(
 25 |             required=False,
 26 |             default="",
 27 |             description="The file identifier if known (file sha256 hash)",
 28 |         ),
 29 |         "function_name": fields.String(
 30 |             required=False,
 31 |             default="",
 32 |             description="For searching by function name if known.",
 33 |         ),
 34 |         "include_llvm_ir": fields.Boolean(
 35 |             required=False, default=False, description="Include LLVM IR in output."
 36 |         ),
 37 |         "include_asm": fields.Boolean(
 38 |             required=False, default=False, description="Include disassembly output."
 39 |         ),
 40 |         "include_gene_value": fields.Boolean(
 41 |             required=False,
 42 |             default=False,
 43 |             description="Include raw gene value in output.",
 44 |         ),
 45 |         "include_function_names": fields.Boolean(
 46 |             required=False,
 47 |             default=False,
 48 |             description="Include all function names in output.",
 49 |         ),
 50 |     },
 51 | )
 52 | 
 53 | obj_info_args = api.model(
 54 |     "obj_info_args",
 55 |     {
 56 |         "obj_id": fields.String(
 57 |             required=False, default="", description="The identifier of gene or file"
 58 |         ),
 59 |         "output_detail": fields.String(
 60 |             required=False,
 61 |             default=DEFAULT_OUTPUT_DETAIL,
 62 |             description="Output format. \
 63 |     Supported values: ['simple','complete']",
 64 |         ),
 65 |     },
 66 | )
 67 | 
 68 | 
 69 | @ns.route("/gene")
 70 | @ns.response(200, "Final result")
 71 | @ns.response(202, "Request received. Result not ready. Must retry.")
 72 | @ns.response(404, "Object id not found")
 73 | class SearchGene(Resource):
 74 |     """Search by id"""
 75 | 
 76 |     @ns.expect(gene_info_args)
 77 |     def post(self):
 78 |         """Search either by `gene_id` or (`file_id` and `function_name`) combination."""
 79 |         try:
 80 |             args = api.payload
 81 |             ret = kgs.api_get_gene_info(**args)
 82 |             if ret.get("status") == API_STATE_RESULT_NOT_READY:
 83 |                 return ret, 202
 84 |             elif ret.get("status") == API_STATE_EMPTY_RESULT:
 85 |                 return ret, 404
 86 |             elif ret.get("status") == API_STATE_ERROR:
 87 |                 return ret, 500
 88 | 
 89 |             return ret
 90 |         except Exception as e:
 91 |             api.abort(500, f"Exception: {e}")
 92 | 
 93 | 
 94 | @ns.route("/by_id")
 95 | @ns.response(200, "Final result")
 96 | @ns.response(202, "Request received. Result not ready. Must retry.")
 97 | @ns.response(404, "Object id not found")
 98 | class SearchID(Resource):
 99 |     """Search by id"""
100 | 
101 |     @ns.expect(obj_info_args)
102 |     def post(self):
103 |         """Search either by `gene_id` or (`file_id` and `function_name`) combination."""
104 |         try:
105 |             args = dict(api.payload)
106 |             output = args.pop("output_detail")
107 |             flag = False
108 |             if output == "complete":
109 |                 flag = True
110 | 
111 |             args.update(
112 |                 {
113 |                     "include_genes": flag,
114 |                     "include_llvm_ir": flag,
115 |                     "include_asm": flag,
116 |                     "include_gene_value": flag,
117 |                     "include_function_names": flag,
118 |                 }
119 |             )
120 |             ret = kgs.api_get_node_info(**args)
121 |             if ret.get("status") == API_STATE_RESULT_NOT_READY:
122 |                 return ret, 202
123 |             elif ret.get("status") == API_STATE_EMPTY_RESULT:
124 |                 return ret, 404
125 |             elif ret.get("status") == API_STATE_ERROR:
126 |                 return ret, 500
127 | 
128 |             return ret
129 |         except Exception as e:
130 |             api.abort(500, f"Exception: {e}")
131 | 


--------------------------------------------------------------------------------
/scripts/bin2bc:
--------------------------------------------------------------------------------
  1 | #!/bin/sh
  2 | 
  3 | if [ -x /run/bin2bc ]; then
  4 |    TIMEOUT=
  5 |    [ -x /usr/bin/timeout ] && TIMEOUT='/usr/bin/timeout -k 1000 900'
  6 |    $TIMEOUT /opt/cg/retdec/bin/retdec-decompiler "$1"
  7 |    exit $?
  8 | fi
  9 | 
 10 | usage()
 11 | {
 12 |    {
 13 |      echo "Usage: $0 <options> <executable-files>"
 14 |      echo ""
 15 |      echo "     --output <dir>       Directory to store output files in. Defaults"
 16 |      echo "                          to the current directory."
 17 |      echo ""
 18 |      echo "     --retdec <dir>       Directory where retdec is installed.  Defaults"
 19 |      echo "                          to /opt/retdec."
 20 |      echo ""
 21 |      echo "     --config <file>      Specify the retdec-decompiler config file to"
 22 |      echo "                          use.  Default is retdec-decompiler default file."
 23 |      echo ""
 24 |      echo "     --force              Force rebuild even if binary is older than existing"
 25 |      echo "                          byte code file."
 26 |      echo ""
 27 |      echo "     --keep_filename      Keep output filename same as input binary. Output"
 28 |      echo "                          maybe overwritten."
 29 |      echo ""
 30 |      echo "     --verbose            Be verbose about what is happening."
 31 |      echo ""
 32 |    } 1>&2
 33 |    exit 1
 34 | }
 35 | 
 36 | RETDECDIR=/opt/retdec
 37 | OUTPUT=.
 38 | CF=
 39 | FORCE=0
 40 | VERBOSE=0
 41 | KEEP_FILENAME=0
 42 | KEEP_DSM=0
 43 | KEEP_LL=0
 44 | 
 45 | while [ $# -ne 0 ]
 46 | do
 47 |    case "$1" in
 48 |      -h) usage;;
 49 |      --help) usage;;
 50 |      --output) OUTPUT="$2"; shift; shift;;
 51 |      --retdec) RETDECDIR="$2"; shift; shift;;
 52 |      --config) CF="--config $2"; shift; shift;;
 53 |      --force) FORCE=1; shift;;
 54 |      --keep_filename) KEEP_FILENAME=1; shift;;
 55 |      --keep_dsm) KEEP_DSM=1; shift;;
 56 |      --keep_ll) KEEP_LL=1; shift;;     
 57 |      --verbose) VERBOSE=1; shift;;
 58 |      *) break;
 59 |    esac
 60 | done
 61 | 
 62 | USEDOCKER=0
 63 | 
 64 | [ -x $RETDECDIR/bin/retdec-decompiler ] && USEDOCKER=0
 65 | 
 66 | if [ $USEDOCKER -eq 1 ]; then
 67 |    SIGMAL="`docker images | awk '$1 == "sigmal" {print $1}'`"
 68 |    [ "x$SIGMAL" = 'x' ] && {
 69 |       echo "$0: Unable to find retdec-decompiler installation" 1>&2
 70 |       echo "              locally or from a sigmal docker image." 1>&2
 71 |       exit 1
 72 |    }
 73 | fi
 74 | 
 75 | TMPDIR=/tmp/bin2bc.$$
 76 | mkdir $TMPDIR
 77 | 
 78 | if [ $USEDOCKER -eq 1 ]; then
 79 |   BIN=`basename $0`
 80 |   cp $0 $TMPDIR/ && chmod 755 $TMPDIR/$BIN
 81 | else
 82 |   [ "x$CF" = 'x' -a -f $RETDECDIR/share/retdec/llvmir-only.json ] && {
 83 |      CF="--config $RETDECDIR/share/retdec/llvmir-only.json"
 84 |   }
 85 | fi
 86 | 
 87 | for bin
 88 | do
 89 |    FT=`file "$bin"`
 90 |    case "$FT" in
 91 |     *ELF*) ;;
 92 |     *)
 93 |       [ $VERBOSE -eq 1 ] && {
 94 |         echo "`date` Skipping $bin; is not an ELF executable." 1>&2
 95 |       }
 96 |       continue;;
 97 |    esac
 98 |    
 99 |    input="`echo \"$bin\" | tr / _ | sed -e 's/_\.\._/_/g' -e 's/^\.\._/_/' -e 's/^_//'`"
100 | 
101 |    [ $FORCE -eq 0 -a -f "$OUTPUT/$input.bc" -a $bin -ot "$OUTPUT/$input".bc ] && {
102 |      [ $VERBOSE -eq 1 ] && {
103 |         echo "`date` Skipping $bin; byte code is current." 1>&2
104 |      }
105 |      continue
106 |    }
107 | 
108 |    rm -f "$TMPDIR/$input*"
109 |    cp "$bin" "$TMPDIR/$input"
110 | 
111 |    LOG="$TMPDIR/retdec.log"
112 | 
113 |    if [ $USEDOCKER -eq 0 ]; then
114 |       [ $VERBOSE -eq 1 ] && {
115 |        echo "`date` Decompiling $bin to bytecode." 1>&1
116 |       }
117 |       $RETDECDIR/bin/retdec-decompiler $CF "$TMPDIR/$input" > $LOG 2>&1
118 |    else
119 |       [ $VERBOSE -eq 1 ] && {
120 |        echo "`date` Decompiling $bin to bytecode via docker." 1>&2
121 |       }
122 |       docker run --rm -tt --entrypoint /run/bin2bc -v "$TMPDIR":/run sigmal /run/"$input" > $LOG 2>&1
123 |    fi
124 | 
125 |    rm -f "$TMPDIR/$input"
126 |    rm -f "$TMPDIR/$input.config.json"
127 |    
128 |    if [ -f "$TMPDIR/$input.bc" ]; then
129 |       [ $VERBOSE -eq 1 ] && {
130 |        echo "`date` Finished decompiling $bin." 1>&2
131 |       }
132 |       output=$input
133 |       output="`basename \"$bin\"`"
134 |       if [ $KEEP_FILENAME -eq 1 ]; then
135 |          output="`basename \"$bin\"`"
136 |       fi
137 |       cp "$TMPDIR/$input.bc" "$OUTPUT/$output".bc
138 |       if [ $KEEP_DSM -eq 1 ]; then
139 |          cp "$TMPDIR/$input.dsm" "$OUTPUT/$output".dsm
140 |       fi
141 |       if [ $KEEP_LL -eq 1 ]; then
142 |          cp "$TMPDIR/$input.ll" "$OUTPUT/$output".ll
143 |       fi
144 |       
145 |       rm -f "$TMPDIR/$input.bc"
146 |    else
147 |      [ $VERBOSE -eq 1 ] && {
148 |        echo "`date` Error decompiling $bin." 1>&2
149 |      }
150 |      tail -5 $LOG | sed -e 's/^/    /'
151 |    fi
152 |    rm $LOG
153 | done
154 | 
155 | rm -rf $TMPDIR
156 | 


--------------------------------------------------------------------------------
/tests/test_kg.py:
--------------------------------------------------------------------------------
  1 | import hashlib
  2 | import json
  3 | import logging
  4 | import os
  5 | import shutil
  6 | import sys
  7 | import time
  8 | import unittest
  9 | 
 10 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../"))
 11 | 
 12 | os.environ.setdefault("RETDEC_PATH", "/opt/cg/retdec/")
 13 | 
 14 | from codegenome._defaults import (DEFAULT_GENE_VERSION,  # noqa
 15 |                                   UNIVERSAL_FUNC_NAME)
 16 | from codegenome.kg import GenomeKG  # noqa
 17 | 
 18 | TEST_D = "/tmp/cg_kg_test"
 19 | TEST_FN = "p/p.c"
 20 | FUNC = "f1"
 21 | KG_REPO = os.path.join(TEST_D, "testkg.gkg")
 22 | LOG_FN = os.path.join(TEST_D, "cg-test.log")
 23 | 
 24 | 
 25 | FN = os.path.splitext(os.path.basename(TEST_FN))[0]
 26 | DEST_FN = os.path.join(TEST_D, FN)
 27 | DEST_FN2 = DEST_FN + "_2"
 28 | 
 29 | 
 30 | def prepare():
 31 |     if os.path.exists(TEST_D):
 32 |         shutil.rmtree(TEST_D)
 33 |     os.makedirs(TEST_D)
 34 |     logging.basicConfig(
 35 |         filename=LOG_FN,
 36 |         level=logging.DEBUG,
 37 |         format="%(asctime)s, %(name)s, %(levelname)s, %(message)s",
 38 |         datefmt="%m/%d/%Y %H:%M:%S",
 39 |         force=True,
 40 |     )
 41 | 
 42 | 
 43 | def clear():
 44 |     print(f"clearing {TEST_D}")
 45 |     # shutil.rmtree(TEST_D)
 46 | 
 47 | 
 48 | class TestKG(unittest.TestCase):
 49 |     @classmethod
 50 |     def setUpClass(cls):
 51 |         prepare()
 52 | 
 53 |     @classmethod
 54 |     def tearDownClass(cls):
 55 |         clear()
 56 | 
 57 |     # test order is sorted test function names!
 58 | 
 59 |     def test_01_add_file(self):
 60 |         global bin_id, bin_id2
 61 |         cmd = "clang -O0 -o %s %s" % (DEST_FN, TEST_FN)
 62 |         os.system(cmd)
 63 |         cmd = "clang -O1 -o %s %s" % (DEST_FN2, TEST_FN)
 64 |         os.system(cmd)
 65 |         self.assertTrue(os.path.exists(DEST_FN))
 66 |         self.assertTrue(os.path.exists(DEST_FN2))
 67 |         with open(DEST_FN, "rb") as f:
 68 |             bin_id = hashlib.sha256(f.read()).hexdigest()
 69 | 
 70 |         with open(DEST_FN2, "rb") as f:
 71 |             bin_id2 = hashlib.sha256(f.read()).hexdigest()
 72 | 
 73 |         kg = GenomeKG(KG_REPO)
 74 |         kg.add_file(DEST_FN, keep_aux_files=False)
 75 |         self.assertTrue(os.path.exists(KG_REPO))
 76 |         self.assertFalse(os.path.exists(os.path.join(KG_REPO, ".auxs", bin_id + ".bc")))
 77 |         self.assertFalse(
 78 |             os.path.exists(os.path.join(KG_REPO, ".auxs", bin_id + ".canon"))
 79 |         )
 80 |         self.assertTrue(
 81 |             os.path.exists(
 82 |                 os.path.join(KG_REPO, "genes", DEFAULT_GENE_VERSION, bin_id + ".gene")
 83 |             )
 84 |         )
 85 | 
 86 |         shutil.rmtree(KG_REPO)
 87 |         kg = GenomeKG(KG_REPO)
 88 |         kg.add_file(DEST_FN)
 89 |         self.assertTrue(os.path.exists(KG_REPO))
 90 |         self.assertTrue(os.path.exists(os.path.join(KG_REPO, ".auxs", bin_id + ".bc")))
 91 |         self.assertTrue(
 92 |             os.path.exists(os.path.join(KG_REPO, ".auxs", bin_id + ".canon"))
 93 |         )
 94 |         self.assertTrue(
 95 |             os.path.exists(
 96 |                 os.path.join(KG_REPO, "genes", DEFAULT_GENE_VERSION, bin_id + ".gene")
 97 |             )
 98 |         )
 99 | 
100 |         self.assertEqual(len(kg.bins), 1)
101 |         self.assertTrue(len(kg.gene_ids) >= 14)
102 |         ll = kg.get_ll(kg.gene_ids[0])
103 |         # print(ll)
104 |         self.assertTrue(ll is not None)
105 | 
106 |         bg = kg.get_bin(bin_id)
107 |         self.assertTrue(bg is not None)
108 |         self.assertTrue(len(bg.gene_ids) >= 14)
109 | 
110 |         ll = bg.get_ll(bg.gene_ids[0])
111 |         # print(ll)
112 |         self.assertTrue(ll is not None)
113 | 
114 |         # re add
115 |         t1 = time.time()
116 |         kg.add_file(DEST_FN)
117 |         t2 = time.time()
118 |         self.assertTrue(t2 - t1 < 0.1)  # should be fast with no reprocessing
119 | 
120 |         kg.add_file(DEST_FN2)
121 | 
122 |         self.assertEqual(len(kg.bins), 2)
123 |         self.assertTrue(len(kg.gene_ids) >= 20)
124 | 
125 |     def test_02_load(self):
126 |         kg = GenomeKG(KG_REPO)
127 |         self.assertEqual(len(kg.bins), 0)
128 | 
129 |         kg.load()
130 |         self.assertEqual(len(kg.bins), 2)
131 |         self.assertTrue(len(kg.gene_ids) >= 20)
132 |         ll = kg.get_ll(kg.gene_ids[0])
133 |         # print(ll)
134 |         self.assertTrue(ll is not None)
135 | 
136 |         # import ipdb; ipdb.set_trace()
137 | 
138 |     def test_03_bindiff_old(self):
139 |         kg = GenomeKG(KG_REPO)
140 |         kg.load()
141 |         a, b = list(kg.bins.keys())[:2]
142 |         diff = kg.bindiff_old(a, b)
143 |         self.assertTrue(diff < 0.4)
144 | 
145 |     def test_04_bindiff(self):
146 |         kg = GenomeKG(KG_REPO)
147 |         kg.load()
148 |         a, b = list(kg.bins.keys())[:2]
149 |         ret, stat = kg.bindiff(a, b)
150 |         self.assertEqual(ret.get("similarity"), 100)
151 |         self.assertEqual(len(ret.get("diff_details")), 21)
152 | 
153 |     def test_05_bc(self):
154 |         kg = GenomeKG(KG_REPO)
155 |         kg.load()
156 |         gids = kg.get_gene_ids("main")
157 |         ll = kg.get_ll(gids[0])
158 |         self.assertTrue("@gv1" in ll)
159 |         self.assertTrue("main" not in ll)
160 | 
161 |     def test_05_local_apis(self):
162 |         kg = GenomeKG(KG_REPO)
163 |         kg.load()
164 |         bin = kg.get_bin(bin_id)
165 |         gid = bin.get_gene_id("main")
166 |         gid2 = kg.get_gene_ids("main", bin_id)
167 |         gid3 = kg.get_gene_ids("main", bin_id2)
168 | 
169 |         self.assertEqual(len(gid2), 1)
170 |         self.assertEqual(len(gid3), 1)
171 |         self.assertEqual(gid, gid2[0])
172 |         self.assertNotEqual(gid, gid3[0])
173 | 
174 |         gids = kg.get_gene_ids("main")
175 |         self.assertEqual(len(gids), 2)
176 | 
177 |         ir = kg.get_ll(gid)
178 |         # print(ir)
179 |         self.assertTrue(type(ir), str)
180 |         self.assertTrue(UNIVERSAL_FUNC_NAME in ir)
181 | 
182 |         ginfo = kg.get_gene_info(gid)
183 |         print(ginfo)
184 |         self.assertEqual("gene", ginfo.get("type"))
185 |         self.assertTrue("llvm_ir" in ginfo)
186 |         self.assertEqual(["main"], list(ginfo.get("function_names", {}).values())[0])
187 |         import ipdb
188 | 
189 |         ipdb.set_trace()  # noqa
190 | 
191 | 
192 | if __name__ == "__main__":
193 |     unittest.main(verbosity=2)
194 | 


--------------------------------------------------------------------------------
/scripts/cg:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | ##
  3 | ## This code is part of the Code Genome Framework.
  4 | ##
  5 | ## (C) Copyright IBM 2023.
  6 | ##
  7 | ## This code is licensed under the Apache License, Version 2.0. You may
  8 | ## obtain a copy of this license in the LICENSE.txt file in the root directory
  9 | ## of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
 10 | ##
 11 | ## Any modifications or derivative works of this code must retain this
 12 | ## copyright notice, and modified files need to carry a notice indicating
 13 | ## that they have been altered from the originals.
 14 | ##
 15 | import argparse
 16 | import json
 17 | import logging
 18 | import os
 19 | import shutil
 20 | import subprocess
 21 | import sys
 22 | 
 23 | logging.basicConfig(level=logging.ERROR)
 24 | 
 25 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "../"))
 26 | 
 27 | import codegenome._defaults as defaults  # noqa
 28 | 
 29 | CG_CACHE_DIR = os.path.expanduser(os.environ.get("CG_CACHE_DIR", "~/.cg/cache"))
 30 | CG_DOCKER_IMAGE_NAME = os.environ.get("CG_DOCKER_IMAGE_NAME", "cg-worker")
 31 | 
 32 | 
 33 | def genediff_docker(args):
 34 |     avars = vars(args)
 35 |     avars.pop("func")
 36 |     avars.pop("docker")
 37 |     cach_dir = avars.pop("cache_dir")
 38 |     file1, file2 = avars.pop("file1"), avars.pop("file2")
 39 |     file1, file2 = os.path.abspath(file1), os.path.abspath(file2)
 40 | 
 41 |     assert os.path.isfile(file1)
 42 |     assert os.path.isfile(file2)
 43 | 
 44 |     opts = []
 45 |     for k, v in avars.items():
 46 |         if v is not False:
 47 |             opts.append("--" + k)
 48 |             if v is not True:
 49 |                 opts.append(str(v))
 50 |     tmp_cache_dir = "/tmp/cache"
 51 |     tmp_file1 = os.path.join("/tmp/file1", os.path.basename(file1))
 52 |     tmp_file2 = os.path.join("/tmp/file2", os.path.basename(file2))
 53 | 
 54 |     cg_opts = ["--cache_dir", tmp_cache_dir]
 55 | 
 56 |     proc_args = (
 57 |         [
 58 |             "docker",
 59 |             "run",
 60 |             "--rm",
 61 |             "-v",
 62 |             os.path.abspath(cach_dir) + ":" + tmp_cache_dir,
 63 |             "-v",
 64 |             os.path.dirname(file1) + ":" + os.path.dirname(tmp_file1),
 65 |             "-v",
 66 |             os.path.dirname(file2) + ":" + os.path.dirname(tmp_file2),
 67 |             "-it",
 68 |             CG_DOCKER_IMAGE_NAME,
 69 |         ]
 70 |         + ["cg"]
 71 |         + cg_opts
 72 |         + ["genediff"]
 73 |         + opts
 74 |         + [tmp_file1, tmp_file2]
 75 |     )
 76 | 
 77 |     proc = subprocess.Popen(proc_args, stdout=subprocess.PIPE)
 78 |     while True:
 79 |         line = proc.stdout.readline()
 80 |         if not line:
 81 |             break
 82 |         sys.stdout.buffer.write(line)
 83 | 
 84 | 
 85 | def genediff(args):
 86 |     import codegenome as cg  # noqa
 87 | 
 88 |     logger = logging.getLogger("codegenome")
 89 |     logger.setLevel(logging.ERROR)
 90 |     repo_path = os.path.join(args.cache_dir, "local.kg")
 91 |     if not os.path.exists(repo_path):
 92 |         os.makedirs(repo_path)
 93 | 
 94 |     if args.docker:
 95 |         return genediff_docker(args)
 96 | 
 97 |     kg = cg.GenomeKG(repo_path)
 98 |     if args.verbose:
 99 |         ch = logging.StreamHandler()
100 |         logger.addHandler(ch)
101 |         logger.setLevel(logging.WARNING)
102 |         kg.logger = logger
103 | 
104 |     args.match_sim_thr /= 100
105 |     args.mismatch_sim_thr /= 100
106 | 
107 |     b1 = kg.add_file(args.file1, keep_aux_files=(not args.remove_aux_files))
108 |     b2 = kg.add_file(args.file2, keep_aux_files=(not args.remove_aux_files))
109 |     ret, stat = kg.bindiff(
110 |         b1,
111 |         b2,
112 |         match_sim_thr=args.match_sim_thr,
113 |         mismatch_sim_thr=args.mismatch_sim_thr,
114 |         method=args.method,
115 |         output_detail=args.output_detail,
116 |     )
117 |     if args.format == "json":
118 |         print(json.dumps(ret))
119 |     else:
120 |         print_output(ret, args.no_color)
121 | 
122 | 
123 | def print_output(r, no_color=False):
124 |     color_code = {
125 |         "=": "\033[;32m",
126 |         "!": "\033[;31m",
127 |         "~": "\033[;36m",
128 |         "+": "\033[;35m",
129 |         "-": "\033[;33m",
130 |     }
131 |     print(f"similarity:\t{r.get('similarity')}")
132 |     try:
133 |         for l in r.get("diff_details"):
134 |             if not no_color:
135 |                 sys.stdout.write(color_code.get(l["op"], ""))
136 | 
137 |             print(f"{l['op']}, {l['f1']}, \t{l['f2']}, \t{l['score']}")
138 | 
139 |             if not no_color:
140 |                 sys.stdout.write("\033[0m")
141 |     finally:
142 |         if not no_color:
143 |             sys.stdout.write("\033[0m")
144 | 
145 | 
146 | def clear_cache(args):
147 |     if os.path.isdir(args.cache_dir):
148 |         x = input(
149 |             f"Clearing Code Genome Cache at {args.cache_dir}?. Press [y] to continue. "
150 |         )
151 |         if x.lower().strip() == "y":
152 |             shutil.rmtree(args.cache_dir)
153 |     else:
154 |         sys.stderr.write(f"Invalid cache directory ({args.cache_dir}).\n")
155 |         exit(2)
156 | 
157 | 
158 | if __name__ == "__main__":
159 |     parser = argparse.ArgumentParser()
160 | 
161 |     parser.add_argument(
162 |         "--cache_dir",
163 |         type=str,
164 |         default=CG_CACHE_DIR,
165 |         help="Cache directory. Defaults to `~/.cg/cache`",
166 |     )
167 |     parser.add_argument(
168 |         "--clear_cache",
169 |         action="store_true",
170 |         default=False,
171 |         help="clear cache directory. Default dir is `~/.cg/cache`",
172 |     )
173 | 
174 |     subparsers = parser.add_subparsers(help="commands")
175 | 
176 |     diff_parser = subparsers.add_parser(
177 |         "genediff", help="Binary diff using function level genes."
178 |     )
179 |     diff_parser.add_argument(
180 |         "-v", "--verbose", action="store_true", default=False, help="Verbose output."
181 |     ),
182 |     diff_parser.add_argument(
183 |         "-d", "--docker", action="store_true", default=False, help="Use docker."
184 |     )
185 |     diff_parser.add_argument(
186 |         "--remove_aux_files",
187 |         action="store_true",
188 |         default=False,
189 |         help="If enabled, removes auxillary files to save storage. Function details such as machine-code and llvm-ir will be unavailable.",
190 |     )
191 |     diff_parser.add_argument(
192 |         "-f", "--format", default="default", help="Output format. Options: default|json"
193 |     )
194 |     diff_parser.add_argument(
195 |         "--no_color",
196 |         action="store_true",
197 |         default=False,
198 |         help="No color in default output.",
199 |     )
200 |     diff_parser.add_argument(
201 |         "-gv",
202 |         "--gene_version",
203 |         type=str,
204 |         default=defaults.DEFAULT_GENE_VERSION,
205 |         help="Code Genome version.",
206 |     )
207 |     diff_parser.add_argument(
208 |         "--match_sim_thr",
209 |         type=float,
210 |         default=defaults.FILE_COMPARE_FUNC_MATCH_SIM_THRESHOLD * 100,
211 |         help="Function match similarity threshold. Greater than or equal to this threshold will be considered as a match `~`",
212 |     )
213 |     diff_parser.add_argument(
214 |         "--mismatch_sim_thr",
215 |         type=float,
216 |         default=defaults.FILE_COMPARE_FUNC_MISMATCH_SIM_THRESHOLD * 100,
217 |         help="Function mismatch similarity threshold. Greater than or equal to this threshold will be considered as a mismatch `!`, smaller wil be considered delete `-`",
218 |     )
219 |     diff_parser.add_argument(
220 |         "-m",
221 |         "--method",
222 |         type=str,
223 |         default=defaults.DEFAULT_CALCULATION_METHOD,
224 |         help=f"Calculation method for file-level similarity. Valid values: {str(defaults.KNOWN_CALCULATION_METHODS)}",
225 |     )
226 |     diff_parser.add_argument(
227 |         "--output_detail",
228 |         type=str,
229 |         default=defaults.VALID_OUTPUT_DETAILS[0],
230 |         help=f"Output details. Valid values: {str(defaults.VALID_OUTPUT_DETAILS)}",
231 |     )
232 | 
233 |     diff_parser.add_argument("file1", type=str, help="First filepath")
234 |     diff_parser.add_argument("file2", type=str, help="Second filepath")
235 |     diff_parser.set_defaults(func=genediff)
236 | 
237 |     parser.set_defaults(func=lambda x: parser.print_help())
238 | 
239 |     try:
240 |         args = parser.parse_args()
241 |         if args.clear_cache:
242 |             exit(clear_cache(args))
243 | 
244 |     except IOError as msg:
245 |         parser.error(str(msg))
246 |         exit(2)
247 | 
248 |     args.func(args)
249 | 


--------------------------------------------------------------------------------
/codegenome/genes/sigmal.py:
--------------------------------------------------------------------------------
  1 | ##
  2 | ## This code is part of the Code Genome Framework.
  3 | ##
  4 | ## (C) Copyright IBM 2023.
  5 | ##
  6 | ## This code is licensed under the Apache License, Version 2.0. You may
  7 | ## obtain a copy of this license in the LICENSE.txt file in the root directory
  8 | ## of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
  9 | ##
 10 | ## Any modifications or derivative works of this code must retain this
 11 | ## copyright notice, and modified files need to carry a notice indicating
 12 | ## that they have been altered from the originals.
 13 | ##
 14 | 
 15 | import array
 16 | import hashlib
 17 | import logging
 18 | import os
 19 | import sys
 20 | from collections import deque
 21 | from datetime import datetime
 22 | from threading import Lock, Thread
 23 | 
 24 | import numpy as np
 25 | # import matplotlib.pylab as plt
 26 | import scipy
 27 | from PIL import Image
 28 | from sklearn.neighbors import BallTree
 29 | 
 30 | from .base import CGGeneBase
 31 | 
 32 | logger = logging.getLogger("codegenome.gene.sigmal")
 33 | 
 34 | MAX_SIZE_KB = 10000
 35 | FEATURE_UNIT = 128
 36 | FEATURE_SHAPE = (FEATURE_UNIT, FEATURE_UNIT)
 37 | FEATURE_SIZE = 320
 38 | COL_SIZE = 256
 39 | SIZE_MAP = [
 40 |     (10, 32),
 41 |     (30, 64),
 42 |     (60, 128),
 43 |     (100, 256),
 44 |     (500, 512),
 45 |     (1000, 768),
 46 |     (MAX_SIZE_KB, 1024),
 47 | ]
 48 | 
 49 | GENE_TYPE_CONFIG = {
 50 |     "sigmal2": {"resample": Image.Resampling.NEAREST, "weights": [0.8, 0.2]},
 51 |     "sigmal2b": {"resample": Image.Resampling.BICUBIC, "weights": [0.8, 0.2]},
 52 | }
 53 | 
 54 | 
 55 | def prep_data_sigmal2(bc):
 56 |     """
 57 |     Split IR to function and auxiliary data
 58 |     """
 59 |     import llvmlite.binding as llvm
 60 | 
 61 |     from codegenome._defaults import UNIVERSAL_FUNC_NAME
 62 | 
 63 |     obj = llvm.parse_bitcode(bc)
 64 |     fns = {f.name: f for f in obj.functions}
 65 |     func_str = str(fns[UNIVERSAL_FUNC_NAME])
 66 | 
 67 |     struc_str = [str(x) for x in obj.struct_types]
 68 |     gvs_str = [str(x) for x in obj.global_variables]
 69 |     other_funcdef_str = [str(v) for k, v in fns.items() if k != UNIVERSAL_FUNC_NAME]
 70 | 
 71 |     aux_str = "\n".join(struc_str + gvs_str + other_funcdef_str)
 72 |     if aux_str == "":
 73 |         aux_str = " "
 74 | 
 75 |     return func_str, aux_str
 76 | 
 77 | 
 78 | class SigmalGene(CGGeneBase):
 79 |     def from_data(self, data):
 80 |         return self.feats_from_binary(data)
 81 | 
 82 |     def from_bitcode(self, data, gene_type="sigmal"):
 83 |         """
 84 |         gene_type can be sigmal|sigmal2|sigmal2b|func_only
 85 | 
 86 |         """
 87 |         if gene_type == "sigmal":
 88 |             raw_gene = self.feats_from_binary(data)
 89 |         else:
 90 |             if gene_type in GENE_TYPE_CONFIG:
 91 |                 func, aux = prep_data_sigmal2(data)
 92 |                 raw_gene = self.feats_from_binary_list(
 93 |                     [func, aux],
 94 |                     weights=GENE_TYPE_CONFIG[gene_type]["weights"],
 95 |                     resample=GENE_TYPE_CONFIG[gene_type]["resample"],
 96 |                 )
 97 |             elif gene_type == "func_only":
 98 |                 func, aux = prep_data_sigmal2(data)
 99 |                 raw_gene = self.feats_from_binary_list([func], weights=[1.0])
100 |         return raw_gene
101 | 
102 |     def feats_from_file(self, fn, only_desc=False):
103 |         with open(fn, "rb") as f:
104 |             fdata = f.read()
105 |             md5 = hashlib.md5(fdata).hexdigest()
106 |             dsize = os.path.getsize(fn)
107 |             if only_desc:
108 |                 return md5, dsize, None
109 |             else:
110 |                 return md5, dsize, self.feats_from_binary(fdata)
111 |         return None
112 | 
113 |     def feats_from_buff(self, data, only_desc=False):
114 |         md5 = hashlib.md5(data).hexdigest()
115 |         dsize = len(data)
116 |         if only_desc:
117 |             return md5, dsize, None
118 |         else:
119 |             return md5, dsize, self.feats_from_binary(data)
120 | 
121 |     def binary_to_img_old(self, data):
122 |         dsize = len(data)
123 |         dsize_kb = dsize / 1024
124 |         col_size = 32
125 |         for fs, sz in SIZE_MAP:
126 |             if dsize_kb < fs:
127 |                 col_size = sz
128 | 
129 |         return self.array_to_img(np.frombuffer(data, dtype="B"), col_size)
130 | 
131 |     def array_to_img(
132 |         self, data, col_size=COL_SIZE, return_array=False, auto_resize_col_size=True
133 |     ):
134 |         dsize = len(data)
135 |         if auto_resize_col_size:
136 |             if dsize < (col_size * col_size):
137 |                 # resize col_size to form a square image
138 |                 col_size = int(np.sqrt(dsize))
139 | 
140 |         rows = int(dsize / col_size)
141 |         rem = dsize % col_size
142 |         # print((dsize, col_size, rem))
143 |         if rem != 0:
144 |             a = np.append(data, np.zeros(col_size - rem, dtype="B")).reshape(
145 |                 (rows + 1, col_size)
146 |             )
147 |         else:
148 |             a = data.reshape((rows, col_size))
149 | 
150 |         if return_array:
151 |             return a
152 | 
153 |         im = Image.fromarray(a)
154 |         return im
155 | 
156 |     def binary_to_img(
157 |         self, data, col_size=COL_SIZE, return_array=False, auto_resize_col_size=True
158 |     ):
159 |         return self.array_to_img(
160 |             np.frombuffer(data, dtype="B"), col_size, return_array, auto_resize_col_size
161 |         )
162 | 
163 |     def feats_from_binary(self, data):
164 |         import leargist  # lazy loading
165 | 
166 |         im = self.binary_to_img(data)
167 |         im = im.resize(FEATURE_SHAPE, resample=Image.BICUBIC)
168 |         des = leargist.color_gist(im)
169 |         des = des[0:FEATURE_SIZE]
170 |         return des
171 | 
172 |     def feats_from_binary_list(self, data_list, weights, resample=Image.NEAREST):
173 |         import leargist  # lazy loading
174 | 
175 |         N = len(data_list)
176 |         assert N == len(weights)
177 |         assert sum(weights) == 1.0
178 |         w, h = FEATURE_SHAPE
179 |         shapes = [(w, int(float(x) * h)) for x in weights]
180 |         # print(shapes)
181 | 
182 |         ims = []
183 |         for i, data in enumerate(data_list):
184 |             if type(data) == str:
185 |                 data = bytes(data, "utf8")
186 |             w, h = shapes[i]
187 |             # single pixel hight img
188 |             im = Image.frombytes("L", (len(data), 1), data)
189 |             im = im.resize((w * h, 1), resample=resample)
190 |             im = np.asarray(im).reshape((h, w))
191 |             ims.append(im)
192 | 
193 |             # plt.imshow(im,cmap='gray',vmin=0,vmax=255)
194 |             # plt.show()
195 | 
196 |         im = np.vstack(ims)
197 | 
198 |         # plt.imshow(im,cmap='gray',vmin=0,vmax=255)
199 |         # plt.show()
200 | 
201 |         des = leargist.bw_gist(im)
202 |         des = des[0:FEATURE_SIZE]
203 |         return des
204 | 
205 |     def show(self, img, dpi=72):
206 |         if type(img) == np.ndarray:
207 |             img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
208 |             h, w, c = img.shape
209 |         else:
210 |             h, w = img.size
211 | 
212 |         fh = h / dpi
213 |         fw = w / dpi
214 | 
215 |         if fh <= 0:
216 |             fh = 1
217 |         if fw <= 0:
218 |             fw = 1
219 | 
220 |         # plt.figure(figsize=(fh, fw))
221 |         # plt.imshow(img, 'viridis')
222 | 
223 |     def dist(self, fn1, fn2):
224 |         h1, l1, f1 = self.feats_from_file(fn1)
225 |         h2, l2, f2 = self.feats_from_file(fn2)
226 |         return np.linalg.norm(f1 - f2)
227 | 
228 |     def dist_buff(self, d1, d2):
229 |         h1, l1, f1 = self.feats_from_buff(d1)
230 |         h2, l2, f2 = self.feats_from_buff(d2)
231 |         return np.linalg.norm(f1 - f2)
232 | 
233 |     def _debug_feats_from_file(self, fn):
234 |         with open(fn, "rb") as f:
235 |             data = f.read()
236 |             self._debug_feats_from_buff(data, fn)
237 | 
238 |     def _debug_feats_from_buff(self, data, fn="<buffer>"):
239 |         import leargist  # lazy loading
240 | 
241 |         im = self.binary_to_img(data)
242 |         dpi = 30
243 | 
244 |         self.show(im, dpi)
245 |         # plt.title("binary data (%d bytes)\n%s"%(len(data),os.path.basename(fn)))
246 | 
247 |         im = im.resize(FEATURE_SHAPE)
248 | 
249 |         self.show(im, dpi)
250 |         # plt.title("resize (shape:%s)"%(str(FEATURE_SHAPE)))
251 | 
252 |         des = leargist.color_gist(im)[0:FEATURE_SIZE]
253 |         im = self.array_to_img(des, 32)
254 | 
255 |         self.show(im, 5)
256 |         # plt.title("features (len:%d)"%(FEATURE_SIZE))
257 |         # plt.imshow(im)
258 | 


--------------------------------------------------------------------------------
/codegenome/pipelines/retdecsigmal.py:
--------------------------------------------------------------------------------
  1 | ##
  2 | ## This code is part of the Code Genome Framework.
  3 | ##
  4 | ## (C) Copyright IBM 2023.
  5 | ##
  6 | ## This code is licensed under the Apache License, Version 2.0. You may
  7 | ## obtain a copy of this license in the LICENSE.txt file in the root directory
  8 | ## of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
  9 | ##
 10 | ## Any modifications or derivative works of this code must retain this
 11 | ## copyright notice, and modified files need to carry a notice indicating
 12 | ## that they have been altered from the originals.
 13 | ##
 14 | 
 15 | import hashlib
 16 | import logging
 17 | import os
 18 | import pickle
 19 | import tempfile
 20 | import time
 21 | import traceback
 22 | 
 23 | from .._file_format import *
 24 | from ..genes.sigmal import GENE_TYPE_CONFIG, SigmalGene, prep_data_sigmal2
 25 | from ..ir import IRBinary
 26 | from ..ir.canon import IRCanonPassBinary
 27 | from ..lifters.retdec import CGRetdec
 28 | from .base import CGPipeline
 29 | 
 30 | DB_GENE_DIR = "genes"
 31 | DB_AUX_DIR = ".auxs"
 32 | DB_LOG_DIR = ".logs"
 33 | DB_INDEX_NAME = "index.gkg"
 34 | DEFAULT_GENE_TYPE = "sigmal2"
 35 | 
 36 | _logger = logging.getLogger("codegenome.pipelines.RetdecSigmal")
 37 | 
 38 | 
 39 | def _retdec_bin_to_ir(
 40 |     file_path,
 41 |     output_dir=None,
 42 |     output_fname=None,
 43 |     keep_aux_files=False,
 44 |     overwrite=True,
 45 |     logger=None,
 46 | ):
 47 |     retdec = CGRetdec(logger=logger)
 48 |     bc_path = retdec.process_file(
 49 |         file_path,
 50 |         output_dir=output_dir,
 51 |         output_fname=output_fname,
 52 |         keep_aux_files=keep_aux_files,
 53 |         overwrite=overwrite,
 54 |     )
 55 |     with open(bc_path, "rb") as f:
 56 |         out = f.read()
 57 |     if not keep_aux_files:
 58 |         os.remove(bc_path)
 59 |     return out
 60 | 
 61 | 
 62 | def _ir_to_canon(
 63 |     ir_data, output_path=None, opt_level=None, bin_id=None, metadata=None, logger=None
 64 | ):
 65 |     logger = _logger if logger is None else logger
 66 |     logger.debug(f"Creating IRBinary")
 67 |     irb = IRBinary(ir_data, opt_level=opt_level, bin_id=bin_id)
 68 |     canon = prep_canon_file(irb, metadata)
 69 | 
 70 |     if output_path:
 71 |         with open(output_path, "wb") as cf:
 72 |             pickle.dump(canon, cf, protocol=pickle.HIGHEST_PROTOCOL)
 73 | 
 74 |     return canon
 75 | 
 76 | 
 77 | def _ir_to_canon_using_pass(
 78 |     ir_data, output_path=None, bin_id=None, metadata=None, logger=None
 79 | ):
 80 |     logger = _logger if logger is None else logger
 81 |     if output_path:
 82 |         jsonl_output = os.path.splitext(output_path)[0] + ".canon.jsonl"
 83 |     else:
 84 |         fd, jsonl_output = tempfile.mkstemp()
 85 |         os.close(fd)
 86 | 
 87 |     logger.debug(f"Creating IRCanonPassBinary")
 88 |     irb = IRCanonPassBinary(ir_data, output=jsonl_output, bin_id=bin_id)
 89 |     canon = prep_canon_file(irb, metadata)
 90 | 
 91 |     if output_path:
 92 |         with open(output_path, "wb") as cf:
 93 |             pickle.dump(canon, cf, protocol=pickle.HIGHEST_PROTOCOL)
 94 |     return canon
 95 | 
 96 | 
 97 | def _canon_to_sigmal_gene(
 98 |     canon, output_path=None, gene_type=DEFAULT_GENE_TYPE, logger=None
 99 | ):
100 |     logger = _logger if logger is None else logger
101 |     logger.debug(f"Creating Sigmal gene")
102 |     t = time.time()
103 |     sg = SigmalGene()
104 |     sg_genes = []
105 |     # find unique genes
106 |     gid_funcs = {}
107 |     for gid, func, bc, meta in canon["funcs"]:
108 |         if gid not in gid_funcs:
109 |             gid_funcs[gid] = [func]
110 |         else:
111 |             gid_funcs[gid].append(func)
112 | 
113 |     done = set()
114 | 
115 |     for gid, func, bc, meta in canon["funcs"]:
116 |         if gid not in done:
117 |             raw_gene = sg.from_bitcode(bc, gene_type)
118 |             # format
119 |             gene_data = (gid, gid_funcs[gid], raw_gene, meta)
120 |             sg_genes.append(gene_data)
121 |             done.add(gid)
122 |     t = time.time() - t
123 |     out = prep_gene_file(sg_genes, canon["binid"], canon["file_meta"])
124 |     logger.info("process_canon_to_gene time: %f" % (t))
125 | 
126 |     if output_path:
127 |         with open(output_path, "wb") as f:
128 |             pickle.dump(out, f, protocol=pickle.HIGHEST_PROTOCOL)
129 |     return out
130 | 
131 | 
132 | class RetdecSigmal(CGPipeline):
133 |     def __init__(self):
134 |         self.logger = logging.getLogger("codegenome.pipelines.RetdecSigmal")
135 | 
136 |     def process_file(
137 |         self,
138 |         file_path,
139 |         sigmal_gene_type=DEFAULT_GENE_TYPE,
140 |         output_dir=None,
141 |         output_fname=None,
142 |         keep_aux_files=True,
143 |         overwrite=True,
144 |         bin_id=None,
145 |         logger=None,
146 |         return_genes=False,
147 |         keep_gene_file=True,
148 |     ):
149 |         metadata = get_file_meta(file_path)
150 |         if bin_id is None:
151 |             with open(file_path, "rb") as f:
152 |                 bin_id = hashlib.sha256(f.read()).hexdigest()
153 |         output_dir = os.path.dirname(file_path) if output_dir is None else output_dir
154 |         output_fname = (
155 |             os.path.basename(file_path) if output_fname is None else output_fname
156 |         )
157 | 
158 |         logger = self.logger if logger is None else logger
159 | 
160 |         logger.debug("Lifting to IR.")
161 |         try:
162 |             ir_path = (
163 |                 None
164 |                 if keep_aux_files is False
165 |                 else os.path.join(output_dir, output_fname + ".bc")
166 |             )
167 |             ir_data = _retdec_bin_to_ir(
168 |                 file_path,
169 |                 output_dir=output_dir,
170 |                 output_fname=output_fname,
171 |                 keep_aux_files=keep_aux_files,
172 |                 overwrite=overwrite,
173 |                 logger=logger,
174 |             )
175 |             if not ir_data:
176 |                 logger.error("_retdec_bin_to_ir failed.")
177 |                 return False
178 |         except Exception as ex:
179 |             logger.error(f"Exception: {ex}. {repr(traceback.format_exc())}")
180 |             return False
181 | 
182 |         logger.debug("IR to canonical IR")
183 | 
184 |         try:
185 |             canon_path = (
186 |                 None
187 |                 if keep_aux_files is False
188 |                 else os.path.join(output_dir, output_fname + ".canon")
189 |             )
190 |             canon = _ir_to_canon_using_pass(
191 |                 ir_data,
192 |                 output_path=canon_path,
193 |                 bin_id=bin_id,
194 |                 metadata=metadata,
195 |                 logger=None,
196 |             )
197 |             if canon is None:
198 |                 logger.error("_ir_to_canon failed.")
199 |                 return False
200 |         except Exception as ex:
201 |             logger.error(f"Exception: {ex}. {repr(traceback.format_exc())}")
202 |             return False
203 | 
204 |         logger.debug("Canonical IR to Sigmal gene")
205 | 
206 |         try:
207 |             gene_path = os.path.join(output_dir, output_fname + ".gene")
208 |             if (keep_aux_files == False) and (keep_gene_file == False):
209 |                 gene_path = None
210 | 
211 |             genes = _canon_to_sigmal_gene(
212 |                 canon, output_path=gene_path, gene_type=sigmal_gene_type, logger=logger
213 |             )
214 |             if canon is None:
215 |                 logger.error("_ir_to_canon failed.")
216 |                 return False
217 |         except Exception as ex:
218 |             logger.error(f"Exception: {ex}. {repr(traceback.format_exc())}")
219 |             return False
220 |         if return_genes:
221 |             return genes
222 |         return True
223 | 
224 | 
225 | class RetdecSigmalV1(RetdecSigmal):
226 |     def __init__(self):
227 |         super().__init__()
228 |         self.gene_version = "genes_v0_0_1"
229 | 
230 |     def process_file(
231 |         self,
232 |         file_path,
233 |         sigmal_gene_type=DEFAULT_GENE_TYPE,
234 |         output_dir=None,
235 |         output_fname=None,
236 |         keep_aux_files=True,
237 |         overwrite=True,
238 |         bin_id=None,
239 |         logger=None,
240 |         return_genes=False,
241 |         keep_gene_file=True,
242 |     ):
243 | 
244 |         return super().process_file(
245 |             file_path,
246 |             "sigmal2",
247 |             output_dir=output_dir,
248 |             output_fname=output_fname,
249 |             keep_aux_files=keep_aux_files,
250 |             overwrite=overwrite,
251 |             bin_id=bin_id,
252 |             logger=logger,
253 |             return_genes=return_genes,
254 |             keep_gene_file=keep_gene_file,
255 |         )
256 | 


--------------------------------------------------------------------------------
/docker/decompiler-config.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "decompParams": {
  3 |     "verboseOut": true,
  4 |     "outputFormat": "plain",
  5 |     "keepAllFuncs": false,
  6 |     "selectedDecodeOnly": false,
  7 |     "detectStaticCode": true,
  8 |     "backendDisabledOpts": "",
  9 |     "backendEnabledOpts": "",
 10 |     "backendCallInfoObtainer": "optim",
 11 |     "backendVarRenamer": "readable",
 12 |     "backendNoOpts": false,
 13 |     "backendEmitCfg": false,
 14 |     "backendEmitCg": false,
 15 |     "backendAggressiveOpts": false,
 16 |     "backendKeepAllBrackets": false,
 17 |     "backendKeepLibraryFuncs": false,
 18 |     "backendNoTimeVaryingInfo": false,
 19 |     "backendNoVarRenaming": false,
 20 |     "backendNoCompoundOperators": false,
 21 |     "backendNoSymbolicNames": false,
 22 |     "timeout": 0,
 23 |     "maxMemoryLimit": 0,
 24 |     "maxMemoryLimitHalfRam": true,
 25 |     "ordinalNumDirectory": "./support/ordinals/",
 26 |     "staticSignPaths": ["./support/generic/yara_patterns/static-code/"],
 27 |     "libraryTypeInfoPaths": [
 28 |       "./support/generic/types/arm.json",
 29 |       "./support/generic/types/cstdlib.json",
 30 |       "./support/generic/types/linux.json",
 31 |       "./support/generic/types/windows.json",
 32 |       "./support/generic/types/windrivers.json"
 33 |     ],
 34 |     "cryptoPatternPaths": [
 35 |       "./support/generic/yara_patterns/signsrch/signsrch.yarac",
 36 |       "./support/generic/yara_patterns/signsrch/signsrch_regex.yarac"
 37 |     ],
 38 |     "llvmPasses": [
 39 |       "retdec-provider-init",
 40 |       "retdec-decoder",
 41 |       "verify",
 42 |       "retdec-x86-addr-spaces",
 43 |       "retdec-x87-fpu",
 44 |       "retdec-main-detection",
 45 |       "retdec-idioms-libgcc",
 46 |       "retdec-inst-opt",
 47 |       "retdec-cond-branch-opt",
 48 |       "retdec-syscalls",
 49 |       "retdec-stack",
 50 |       "retdec-constants",
 51 |       "retdec-param-return",
 52 |       "retdec-inst-opt-rda",
 53 |       "retdec-inst-opt",
 54 |       "retdec-simple-types",
 55 |       "retdec-write-dsm",
 56 |       "retdec-remove-asm-instrs",
 57 |       "retdec-class-hierarchy",
 58 |       "retdec-select-fncs",
 59 |       "retdec-inst-opt",
 60 |       "retdec-register-localization",
 61 |       "retdec-value-protect",
 62 |       "instcombine",
 63 |       "tbaa",
 64 |       "basicaa",
 65 |       "simplifycfg",
 66 |       "early-cse",
 67 |       "tbaa",
 68 |       "basicaa",
 69 |       "scoped-noalias",
 70 |       "assumption-cache-tracker",
 71 |       "profile-summary-info",
 72 |       "forceattrs",
 73 |       "inferattrs",
 74 |       "domtree",
 75 |       "callsite-splitting",
 76 |       "ipsccp",
 77 |       "called-value-propagation",
 78 |       "globalopt",
 79 |       "domtree",
 80 |       "mem2reg",
 81 |       "deadargelim",
 82 |       "domtree",
 83 |       "aa",
 84 |       "loops",
 85 |       "lazy-branch-prob",
 86 |       "lazy-block-freq",
 87 |       "opt-remark-emitter",
 88 |       "instcombine",
 89 |       "simplifycfg",
 90 |       "early-cse",
 91 |       "basiccg",
 92 |       "globals-aa",
 93 |       "prune-eh",
 94 |       "functionattrs",
 95 |       "argpromotion",
 96 |       "domtree",
 97 |       "sroa",
 98 |       "aa",
 99 |       "memoryssa",
100 |       "early-cse-memssa",
101 |       "speculative-execution",
102 |       "aa",
103 |       "lazy-value-info",
104 |       "jump-threading",
105 |       "correlated-propagation",
106 |       "simplifycfg",
107 |       "domtree",
108 |       "aa",
109 |       "loops",
110 |       "lazy-branch-prob",
111 |       "lazy-block-freq",
112 |       "opt-remark-emitter",
113 |       "instcombine",
114 |       "libcalls-shrinkwrap",
115 |       "loops",
116 |       "branch-prob",
117 |       "block-freq",
118 |       "lazy-branch-prob",
119 |       "lazy-block-freq",
120 |       "opt-remark-emitter",
121 |       "aa",
122 |       "loops",
123 |       "lazy-branch-prob",
124 |       "lazy-block-freq",
125 |       "opt-remark-emitter",
126 |       "tailcallelim",
127 |       "simplifycfg",
128 |       "reassociate",
129 |       "domtree",
130 |       "loops",
131 |       "loop-simplify",
132 |       "lcssa-verification",
133 |       "lcssa",
134 |       "aa",
135 |       "scalar-evolution",
136 |       "loop-rotate",
137 |       "memoryssa",
138 |       "licm",
139 |       "lcssa",
140 |       "loop-unswitch",
141 |       "simplifycfg",
142 |       "domtree",
143 |       "aa",
144 |       "loops",
145 |       "lazy-branch-prob",
146 |       "lazy-block-freq",
147 |       "opt-remark-emitter",
148 |       "instcombine",
149 |       "loop-simplifycfg",
150 |       "loop-simplify",
151 |       "lcssa-verification",
152 |       "aa",
153 |       "loop-accesses",
154 |       "loop-load-elim",
155 |       "lcssa",
156 |       "scalar-evolution",
157 |       "indvars",
158 |       "loop-idiom",
159 |       "loop-deletion",
160 |       "loop-unroll",
161 |       "mldst-motion",
162 |       "phi-values",
163 |       "aa",
164 |       "memdep",
165 |       "lazy-branch-prob",
166 |       "lazy-block-freq",
167 |       "opt-remark-emitter",
168 |       "gvn",
169 |       "phi-values",
170 |       "aa",
171 |       "memdep",
172 |       "memcpyopt",
173 |       "sccp",
174 |       "demanded-bits",
175 |       "bdce",
176 |       "aa",
177 |       "lazy-branch-prob",
178 |       "lazy-block-freq",
179 |       "opt-remark-emitter",
180 |       "instcombine",
181 |       "lazy-value-info",
182 |       "jump-threading",
183 |       "correlated-propagation",
184 |       "aa",
185 |       "phi-values",
186 |       "memdep",
187 |       "dse",
188 |       "bdce",
189 |       "aa",
190 |       "memoryssa",
191 |       "loops",
192 |       "loop-simplify",
193 |       "lcssa-verification",
194 |       "lcssa",
195 |       "scalar-evolution",
196 |       "licm",
197 |       "postdomtree",
198 |       "adce",
199 |       "simplifycfg",
200 |       "domtree",
201 |       "aa",
202 |       "loops",
203 |       "lazy-branch-prob",
204 |       "lazy-block-freq",
205 |       "opt-remark-emitter",
206 |       "instcombine",
207 |       "barrier",
208 |       "elim-avail-extern",
209 |       "basiccg",
210 |       "rpo-functionattrs",
211 |       "strip-dead-prototypes",
212 |       "globaldce",
213 |       "constmerge",
214 |       "constprop",
215 |       "instcombine",
216 |       "instcombine",
217 |       "tbaa",
218 |       "basicaa",
219 |       "simplifycfg",
220 |       "early-cse",
221 |       "tbaa",
222 |       "basicaa",
223 |       "globalopt",
224 |       "globaldce",
225 |       "basiccg",
226 |       "globals-aa",
227 |       "domtree",
228 |       "float2int",
229 |       "domtree",
230 |       "mem2reg",
231 |       "instcombine",
232 |       "simplifycfg",
233 |       "early-cse",
234 |       "lazy-value-info",
235 |       "jump-threading",
236 |       "correlated-propagation",
237 |       "simplifycfg",
238 |       "instcombine",
239 |       "simplifycfg",
240 |       "reassociate",
241 |       "loops",
242 |       "loop-simplify",
243 |       "lcssa-verification",
244 |       "lcssa",
245 |       "aa",
246 |       "scalar-evolution",
247 |       "loop-rotate",
248 |       "licm",
249 |       "lcssa",
250 |       "instcombine",
251 |       "loop-simplifycfg",
252 |       "loop-accesses",
253 |       "lazy-branch-prob",
254 |       "lazy-block-freq",
255 |       "opt-remark-emitter",
256 |       "loop-distribute",
257 |       "branch-prob",
258 |       "block-freq",
259 |       "scalar-evolution",
260 |       "aa",
261 |       "loop-accesses",
262 |       "demanded-bits",
263 |       "lazy-branch-prob",
264 |       "lazy-block-freq",
265 |       "opt-remark-emitter",
266 |       "loop-simplify",
267 |       "scalar-evolution",
268 |       "aa",
269 |       "loop-accesses",
270 |       "lazy-branch-prob",
271 |       "lazy-block-freq",
272 |       "loop-load-elim",
273 |       "aa",
274 |       "lazy-branch-prob",
275 |       "lazy-block-freq",
276 |       "opt-remark-emitter",
277 |       "lcssa",
278 |       "indvars",
279 |       "loop-idiom",
280 |       "loop-deletion",
281 |       "gvn",
282 |       "sccp",
283 |       "instcombine",
284 |       "lazy-value-info",
285 |       "jump-threading",
286 |       "correlated-propagation",
287 |       "dse",
288 |       "bdce",
289 |       "adce",
290 |       "simplifycfg",
291 |       "domtree",
292 |       "loops",
293 |       "scalar-evolution",
294 |       "aa",
295 |       "demanded-bits",
296 |       "lazy-branch-prob",
297 |       "lazy-block-freq",
298 |       "opt-remark-emitter",
299 |       "opt-remark-emitter",
300 |       "instcombine",
301 |       "loop-simplify",
302 |       "lcssa-verification",
303 |       "lcssa",
304 |       "scalar-evolution",
305 |       "loop-unroll",
306 |       "lazy-branch-prob",
307 |       "lazy-block-freq",
308 |       "opt-remark-emitter",
309 |       "instcombine",
310 |       "loop-simplify",
311 |       "lcssa-verification",
312 |       "lcssa",
313 |       "scalar-evolution",
314 |       "licm",
315 |       "lazy-branch-prob",
316 |       "lazy-block-freq",
317 |       "opt-remark-emitter",
318 |       "transform-warning",
319 |       "alignment-from-assumptions",
320 |       "strip-dead-prototypes",
321 |       "globaldce",
322 |       "constmerge",
323 |       "domtree",
324 |       "constprop",
325 |       "instcombine",
326 |       "retdec-inst-opt",
327 |       "retdec-simple-types",
328 |       "retdec-stack-ptr-op-remove",
329 |       "retdec-idioms",
330 |       "instcombine",
331 |       "retdec-inst-opt",
332 |       "retdec-idioms",
333 |       "retdec-remove-phi",
334 |       "loops",
335 |       "branch-prob",
336 |       "block-freq",
337 |       "loop-simplify",
338 |       "lcssa-verification",
339 |       "lcssa",
340 |       "aa",
341 |       "scalar-evolution",
342 |       "block-freq",
343 |       "loop-sink",
344 |       "lazy-branch-prob",
345 |       "lazy-block-freq",
346 |       "opt-remark-emitter",
347 |       "instsimplify",
348 |       "div-rem-pairs",
349 |       "simplifycfg",
350 |       "verify",
351 |       "retdec-value-protect",
352 |       "retdec-write-bc"
353 |     ]
354 |   }
355 | }
356 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/codegenome/ir/ir.py:
--------------------------------------------------------------------------------
  1 | ##
  2 | ## This code is part of the Code Genome Framework.
  3 | ##
  4 | ## (C) Copyright IBM 2023.
  5 | ##
  6 | ## This code is licensed under the Apache License, Version 2.0. You may
  7 | ## obtain a copy of this license in the LICENSE.txt file in the root directory
  8 | ## of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
  9 | ##
 10 | ## Any modifications or derivative works of this code must retain this
 11 | ## copyright notice, and modified files need to carry a notice indicating
 12 | ## that they have been altered from the originals.
 13 | ##
 14 | 
 15 | # Set environment variables SG_IR_OPTIMIZE_EXTERNAL and LLVM_OPT_PATH to run external IR optimizer.
 16 | # E.g. with:
 17 | # export SG_IR_OPTIMIZE_EXTERNAL=1
 18 | # export LLVM_OPT_PATH=/usr/bin/opt-8
 19 | 
 20 | import collections
 21 | import datetime
 22 | import hashlib
 23 | import json
 24 | import logging
 25 | import os
 26 | import random
 27 | import re
 28 | import subprocess
 29 | import sys
 30 | import tempfile
 31 | import time
 32 | 
 33 | import llvmlite.binding as llvm
 34 | 
 35 | from .._defaults import UNIVERSAL_FUNC_NAME
 36 | from .._file_format import _CANON_FILE_VERSION_
 37 | 
 38 | logger = logging.getLogger("codegenome.ir")
 39 | 
 40 | 
 41 | class SigmalEx(object):
 42 |     def normalize_func(self, f):
 43 |         pass
 44 | 
 45 | 
 46 | # challenges
 47 | # - type names are immutable (should not be part of genome as type-name can be arbitrary. Hope retdec has some consistencies  )
 48 | # - recursive identifier dependencies
 49 | # - registers can not be renamed (?)
 50 | 
 51 | 
 52 | class Function(object):
 53 |     def __init__(self, obj, parent, dbg=False):
 54 |         """
 55 |         Object for referencing an LLVM IR function.
 56 |         """
 57 | 
 58 |         self._obj = obj
 59 |         self._parent = parent
 60 |         self.name = str(obj.name)
 61 | 
 62 |         # explicitly preserver order of the items
 63 |         self.args = collections.OrderedDict()
 64 |         self.blocks = collections.OrderedDict()
 65 |         self.insts = collections.OrderedDict()
 66 |         self.types = collections.OrderedDict()
 67 |         self.attrs = []
 68 | 
 69 |         st = time.time()
 70 | 
 71 |         self.opnames = set()
 72 |         self.gvars = set()
 73 |         self.dbg = dbg
 74 |         self.done_set = set()
 75 | 
 76 |         # recursive call!
 77 |         self.add_types(obj.type)
 78 | 
 79 |         self._init(obj, parent, dbg)
 80 | 
 81 |     def _init(self, obj, parent, dbg):
 82 |         """
 83 |         Populate all the components of the function.
 84 |         """
 85 |         st = time.time()
 86 |         for a in obj.attributes:
 87 |             self.attrs.append(a)
 88 | 
 89 |         for a in obj.arguments:
 90 |             self.args[a.name] = a
 91 |             self.add_types(a.type)
 92 | 
 93 |         idx = 0
 94 |         for b in obj.blocks:
 95 |             self.blocks[b.name] = b
 96 | 
 97 |             for i in b.instructions:
 98 |                 if i.name != "":
 99 |                     self.insts[i.name] = i
100 |                 elif str(i.type) != "void":
101 |                     # not a function call
102 |                     i.name = "tv" + str(idx)
103 |                     idx += 1
104 |                     self.insts[i.name] = i
105 | 
106 |                 self.add_types(i.type)
107 | 
108 |                 # inspect instruction ops
109 |                 ops = [x for x in i.operands]
110 |                 for op in ops:
111 |                     if op.name != "":
112 |                         if not op.is_global:  # TODO does not always work
113 |                             self.opnames.add(op.name)
114 | 
115 |                     self.add_with_global_deps(op)
116 | 
117 |         self.externs = self.opnames - set(
118 |             list(self.args.keys())
119 |             + list(self.blocks.keys())
120 |             + list(self.insts.keys())
121 |             + [self.name]
122 |         )
123 |         self.externs = self.externs.union(self.gvars) - set([self.name])
124 |         if dbg:
125 |             print(self.externs)
126 | 
127 |         st = time.time() - st
128 |         self.stat = {"init": st}
129 | 
130 |     def __str__(self):
131 |         raise Exception("str Function not supported.")
132 | 
133 |     @property
134 |     def name(self):
135 |         return self._obj.name
136 | 
137 |     @name.setter
138 |     def name(self, x):
139 |         self._obj.name = x
140 | 
141 |     def add_with_global_deps(self, obj, depth=0):
142 |         """
143 |         Recursively add referenced variables. Only adds global variables or types.
144 |         """
145 |         if obj not in self.done_set:
146 |             self.done_set.add(obj)
147 |         else:
148 |             return
149 |         if isinstance(obj, llvm.ValueRef):
150 |             # value
151 |             ids = self._parent.get_identifiers(obj)
152 |             for x in ids:
153 |                 # only handle if it's a global variable or a type object
154 | 
155 |                 xn = x[1:]
156 |                 if x[:1] == "@":
157 |                     # global variable id
158 |                     if xn not in self.gvars:
159 |                         self.gvars.add(xn)
160 | 
161 |                         # try extracting additional references
162 |                         gv = self._parent.get_gv_by_name(xn)
163 |                         if gv is not None:
164 |                             if gv != obj:
165 |                                 self.add_with_global_deps(gv, depth + 1)
166 |                         else:
167 |                             # function may not have been parsed yet! so pass
168 |                             # print("Warning: undefined global var! %s, %s (%s)"%(gv, xn, str(x)))
169 |                             pass
170 | 
171 |                 else:
172 |                     xnt = self._parent.get_gtype_by_name(xn)
173 |                     if xnt is not None:
174 |                         # add type
175 |                         self.add_types(xnt)
176 | 
177 |     def add_types(self, tp):
178 |         """
179 |         Recursively add type definitions.
180 |         """
181 |         while tp.is_pointer:
182 |             tp = tp.element_type
183 | 
184 |         ids = [x[1:] for x in self._parent.get_identifiers(tp)]
185 |         for x in ids:
186 |             if x not in self.types:
187 |                 if x in self._parent.gtypes:
188 |                     _tp = self._parent.gtypes[x]
189 |                     self.types[x] = _tp
190 |                     if tp != _tp:
191 |                         self.add_types(_tp)
192 |                 else:
193 |                     raise Exception("undefined type! " + str(x))
194 | 
195 |     def get_ll(self):
196 |         """
197 |         Main method for generating the final text for ll file.
198 |         """
199 |         _name = self._obj.name
200 |         self._obj.name = UNIVERSAL_FUNC_NAME
201 |         i = 1
202 |         for a in self.args.values():
203 |             a.name = "a%d" % (i)
204 |             i += 1
205 |         i = 1
206 |         for b in self.blocks.values():
207 |             b.name = "b%d" % (i)
208 |             i += 1
209 | 
210 |         i = 1
211 |         for b in self.insts.values():
212 |             b.name = "v%d" % (i)
213 |             i += 1
214 | 
215 |         i = 1
216 |         for b in self.types.values():
217 |             b.name = "t%d" % (i)
218 |             i += 1
219 | 
220 |         # --- rename externs
221 | 
222 |         externs = []
223 |         # sort externs
224 |         for ex in self.externs:
225 |             obj = self._parent.get_gv_by_name(ex)
226 |             if obj is not None:
227 |                 if isinstance(obj, Function):
228 |                     ln = self._parent.str_external_funcs(obj._obj)
229 |                 else:
230 |                     # global var
231 |                     ln = self._parent.str_globals(obj)
232 |                     ln = ln.split("=")
233 |                     ln = "=".join(ln[1:]).strip()
234 |                 externs.append([ln, ex, obj])
235 | 
236 |         externs.sort()
237 | 
238 |         i = 1
239 |         j = 1
240 |         k = 1
241 |         for _, ex, obj in externs:
242 | 
243 |             if obj is not None:
244 |                 if isinstance(obj, Function):
245 |                     # TODO intelligent rename. e.g.; don't rename *printf*
246 |                     # TODO rename function arg names as well
247 |                     # print "processing "+ex+str(obj._obj)
248 |                     # do not rename external
249 |                     if not obj._obj.is_declaration:
250 |                         obj.name = "gf%d" % i
251 |                         i += 1
252 |                     else:
253 |                         pass
254 |                         # obj.name = 'gef%d'%k
255 |                         # k+=1
256 |                 else:
257 |                     # global var
258 |                     try:
259 |                         obj.name = "gv%d" % j
260 |                     except Exception as e:
261 |                         print(_name, "Error ", e, obj)
262 | 
263 |                     j += 1
264 |                 # externs[ex] = obj
265 | 
266 |         #             if self.dbg:
267 |         #                 print '-------', ex, ':', obj
268 |         #                 print str(self._obj)
269 |         #                 return
270 | 
271 |         # ------------------
272 |         # types
273 | 
274 |         tps = "\n".join([str(x) for x in self.types.values()])
275 | 
276 |         # -----------------
277 |         # globals
278 | 
279 |         # append externs (hack)
280 |         gefs = gfs = gvs = ""
281 | 
282 |         for _, ex, obj in externs:
283 |             if obj is not None:
284 |                 if isinstance(obj, Function):
285 |                     ln = self._parent.str_external_funcs(obj._obj)
286 |                     if not obj._obj.is_declaration:
287 |                         gfs += "\n" + ln
288 |                     else:
289 |                         gefs += "\n" + ln
290 |                 else:
291 |                     # global var
292 |                     ln = self._parent.str_globals(obj)
293 |                     gvs += "\n" + ln
294 | 
295 |         # combine
296 |         gvs = "\n".join([gefs, gfs, gvs])
297 | 
298 |         # ------------------
299 |         # generate main code
300 | 
301 |         body = str(self._obj)
302 |         body = self._parent.str_rm_meta(body)
303 |         # ------------------
304 | 
305 |         # reset externs
306 |         for _, ex, obj in externs:
307 |             if obj is not None:
308 |                 obj.name = ex
309 | 
310 |         for k, v in self.args.items():
311 |             v.name = k
312 |         for k, v in self.blocks.items():
313 |             v.name = k
314 |         for k, v in self.insts.items():
315 |             v.name = k
316 |         for k, v in self.types.items():
317 |             v.name = k
318 |         self._obj.name = _name
319 | 
320 |         return "\n".join([tps, gvs, body])
321 | 
322 |     def get_bc(self):
323 |         m = llvm.parse_assembly(self.get_ll())
324 |         return m.as_bitcode()
325 | 
326 | 
327 | class IRBinary(object):
328 |     def __init__(self, data, ll=False, opt_level=3, bin_id=""):
329 |         global logger
330 |         self.logger = logger
331 |         self._re_p = re.compile(r"[%@]\"?[-a-zA-Z$._0-9][-a-zA-Z$._0-9@]*\"?")
332 |         self._re_gv = re.compile(r"^[%@]\"?[-a-zA-Z$._0-9][-a-zA-Z$._0-9@]*\"?")
333 |         self._re_meta = re.compile(", *!.+\n")
334 |         self._bin_id = bin_id
335 |         self._opt_level = opt_level
336 | 
337 |         st = time.time()
338 |         if ll:
339 |             self._m = llvm.parse_assembly(data)
340 |         else:
341 |             self._m = llvm.parse_bitcode(data)
342 |         st = time.time() - st
343 |         self.stat = {"parse": st}
344 |         self.logger.info("stat:" + json.dumps(self.stat))
345 | 
346 |         self.fs = collections.OrderedDict()
347 |         self.fs_objs = {}
348 |         self.gv = {}
349 |         self.gtypes = {}
350 |         self._ids = {}
351 | 
352 |         self._init()
353 | 
354 |     def _init(self):
355 |         st = time.time()
356 |         if self._opt_level > 0:
357 |             if os.environ.get("SG_IR_OPTIMIZE_EXTERNAL") is not None:
358 |                 self._optimize_external(self._opt_level)
359 |             else:
360 |                 self._optimize(self._opt_level)
361 |         st = time.time() - st
362 |         self.stat["optimize"] = st
363 |         self.logger.info("stat:" + json.dumps(self.stat))
364 | 
365 |         st = time.time()
366 | 
367 |         for g in self._m.global_variables:
368 |             if g.name == "":
369 |                 gid = self.get_gv_identifier(g)
370 |                 for gn in gid:
371 |                     self.gv[gn[1:]] = g
372 |             else:
373 |                 self.gv[g.name] = g
374 | 
375 |         i = 0
376 |         for tp in self._m.struct_types:
377 |             if tp.name != "":
378 |                 self.gtypes[tp.name] = tp
379 |             else:
380 |                 tp.name = "_ANON_TYPE_%d" % (i)
381 |                 i += 1
382 |                 self.gtypes[tp.name] = tp
383 | 
384 |         self.stat["globals_init"] = time.time() - st
385 |         st = time.time()
386 | 
387 |         # function llvmobj_dict
388 |         for f in self._m.functions:
389 |             self.fs_objs[f.name] = f
390 | 
391 |         self.logger.info("Creating function objects.")
392 |         for f in self._m.functions:
393 |             t1 = time.time()
394 |             self.fs[f.name] = Function(f, self)
395 |             self.logger.info(f"{f.name} took {time.time()-t1}secs.")
396 |         st = time.time() - st
397 |         self.stat["func_init"] = st
398 | 
399 |         st = time.time()
400 |         self._collision_correction(self.fs)
401 |         self._collision_correction(self.gv)
402 |         st = time.time() - st
403 |         self.stat["collision_correction"] = st
404 | 
405 |         self.logger.info("stat:" + json.dumps(self.stat))
406 | 
407 |     def _optimize_external(self, opt_level):
408 |         opt_path = os.environ.get("LLVM_OPT_PATH", "opt-8")
409 | 
410 |         tmp = tempfile.NamedTemporaryFile("w+b", delete=True)
411 |         output_filename = tmp.name + ".bc.tmp"
412 | 
413 |         try:
414 |             tmp.write(self._m.as_bitcode())
415 |             tmp.flush()
416 |             self.logger.debug(f"created a tmp bc file{tmp.name}")
417 |             args = [opt_path, f"--O{opt_level}", "-o", output_filename, tmp.name]
418 |             self.logger.debug(f"Running {' '.join(args)}")
419 |             ret = subprocess.run(args)
420 |             if ret.returncode != 0:
421 |                 err = (
422 |                     f"optimization step failed while running command: {' '.join(args)} "
423 |                 )
424 |                 raise Exception(err)
425 | 
426 |             self._m = llvm.parse_bitcode(open(output_filename, "rb").read())
427 |             os.remove(output_filename)
428 |             self.logger.debug("optimization completed.")
429 |         except Exception as err:
430 |             self.logger.error(err)
431 |         finally:
432 |             tmp.close()
433 |             if os.path.exists(tmp.name):
434 |                 os.remove(tmp.name)
435 |             if os.path.exists(output_filename):
436 |                 os.remove(output_filename)
437 |         return self._m
438 | 
439 |     def _optimize(self, opt_level):
440 |         llvm.initialize()
441 |         llvm.initialize_native_target()
442 |         llvm.initialize_native_asmprinter()
443 | 
444 |         self._m.verify()
445 | 
446 |         with llvm.create_module_pass_manager() as pm:
447 |             with llvm.create_pass_manager_builder() as pmb:
448 |                 pmb.opt_level = opt_level
449 |                 pmb.populate(pm)
450 |             pm.run(self._m)
451 | 
452 |     def _collision_correction(self, d):
453 |         if UNIVERSAL_FUNC_NAME in d:
454 |             rname = "_x_"
455 |             while rname in d:
456 |                 rname = "".join(random.sample(string.letters, 8))
457 |             tfs = d.pop(UNIVERSAL_FUNC_NAME)
458 |             tfs.name = rname
459 |             d[rname] = tfs
460 | 
461 |     def get_identifiers(self, obj):
462 |         """
463 |         Returns a list of all object identifies (function names,
464 |         variable names, type names etc.) referenced by obj.
465 |         Builds self._ids{} map on demand to avoid redundant processing.
466 |         """
467 |         out = self._ids.get(obj)
468 |         if out is None:
469 |             s, obj_str = self.str_external_funcs(obj, return_obj_str=True)
470 |             if s == "":
471 |                 # not a function
472 |                 s = obj_str
473 |             out = [x.replace('"', "") for x in self._re_p.findall(s)]
474 |             self._ids[obj] = out
475 |         return out
476 | 
477 |     def get_gv_identifier(self, obj):
478 |         return [x.replace('"', "") for x in self._re_gv.findall(str(obj))]
479 | 
480 |     def str_globals(self, g):
481 |         return str(g)
482 | 
483 |     def str_external_funcs(self, obj, return_obj_str=False):
484 |         """
485 |         Returns function declaration line if the obj is a function
486 |         otherwise an empty string.
487 |         """
488 |         obj_str = str(obj)  # str(obj) is an expensive call!
489 |         out_ln = ""
490 |         for ln in obj_str.split("\n"):
491 |             if ln.startswith("declare"):
492 |                 out_ln = ln
493 |                 break
494 |             elif ln.startswith("define"):
495 |                 ln = ln.strip()
496 |                 ln = "declare" + ln[6:-1]
497 |                 out_ln = ln
498 |                 break
499 |         # ln = self.str_globals(ln)
500 |         if return_obj_str:
501 |             return out_ln, obj_str
502 |         return out_ln
503 | 
504 |     def str_rm_meta(self, s):
505 |         return self._re_meta.sub("\n", s)
506 | 
507 |     def get_gtype_by_name(self, name):
508 |         if name in self.gtypes:
509 |             return self.gtypes[name]
510 |         return None
511 | 
512 |     def get_gv_by_name(self, name):
513 |         out = self.gv.get(name)
514 |         if out is None:
515 |             out = self.fs.get(name)
516 |         return out
517 | 
518 |     def p_inst(self, x):
519 |         print(str(x).strip())
520 |         print("opcode:" + x.opcode)
521 |         print("operands:")
522 |         for op in x.operands:
523 |             print("type:%s, name:%s, value:%s" % (op.type, op.name, str(op)))
524 | 
525 |     def serialize(self, statf=None):
526 |         fns = []
527 |         i = 0
528 |         tot = 0
529 |         err = 0
530 |         st = time.time()
531 | 
532 |         for k, v in self.fs.items():
533 |             # skip declare
534 |             if v._obj.is_declaration:
535 |                 continue
536 |             i += 1
537 |             try:
538 |                 s = time.time()
539 |                 bc = v.get_bc()
540 |                 s = time.time() - s
541 |                 tot += 1
542 | 
543 |                 gid = hashlib.sha256(bc).hexdigest()
544 |                 # TODO get file_offset
545 |                 bc_size = len(bc)
546 |                 file_offset = 0
547 |                 meta = (bc_size, file_offset)
548 | 
549 |                 # format (gene_id, func_name, bitcode, meta)
550 |                 fns.append((gid, k, bc, meta))
551 | 
552 |                 if statf:
553 |                     txt = (
554 |                         '{"type": "OK", "i": %d, "ts": "%s", "func": "%s", "time": %f, "size": %d}'
555 |                         % (i, str(datetime.datetime.now()), k, s, len(bc))
556 |                     )
557 |                     statf.write(txt + "\n")
558 |             except Exception as e:
559 |                 err += 1
560 |                 txt = (
561 |                     '{"type": "ERR", "i": %d, "ts": "%s", "func": "%s", "e": "%s", "bin_id": "%s"}'
562 |                     % (i, str(datetime.datetime.now()), k, str(e), self._bin_id)
563 |                 )
564 |                 self.logger.error(txt)
565 |                 if statf:
566 |                     statf.write(txt + "\n")
567 |                 else:
568 |                     pass
569 |         st = time.time() - st
570 |         if statf:
571 |             stat = {
572 |                 "type": "stat",
573 |                 "bin_id": self._bin_id,
574 |                 "total": tot,
575 |                 "errors": err,
576 |                 "func_count": len(self.fs),
577 |                 "time": st,
578 |             }
579 |             for k, v in self.stat.items():
580 |                 stat[k] = v
581 |             statf.write(json.dumps(stat) + "\n")
582 | 
583 |         return fns
584 | 


--------------------------------------------------------------------------------
/utils/app/app/core/genome_service.py:
--------------------------------------------------------------------------------
  1 | import binascii
  2 | import datetime
  3 | import hashlib
  4 | import json
  5 | import logging
  6 | import os
  7 | import shutil
  8 | import sys
  9 | import threading
 10 | import time
 11 | import traceback
 12 | from textwrap import indent
 13 | 
 14 | import numpy as np
 15 | from sqlitedict import SqliteDict
 16 | 
 17 | import codegenome as cg
 18 | import codegenome._defaults as defaults
 19 | 
 20 | from ..defaults import *
 21 | from .schema import KGNodeID
 22 | 
 23 | CG_CACHE_DIR = defaults.CG_CACHE_DIR
 24 | 
 25 | if not os.path.exists(CG_CACHE_DIR):
 26 |     os.makedirs(CG_CACHE_DIR)
 27 | 
 28 | CG_GENE_DIR = os.path.expanduser(
 29 |     os.environ.get("CG_GENE_DIR", os.path.join(CG_CACHE_DIR, "local.kg"))
 30 | )
 31 | CG_DOCKER_IMAGE_NAME = os.environ.get("CG_DOCKER_IMAGE_NAME", "cg-worker")
 32 | 
 33 | 
 34 | DEFAULT_API_CACHE_TTL_SECS = -1  # negative value, never expire
 35 | # node age check logic for cache invalidation: always refresh cache for young nodes (default: 1hr old node).
 36 | DEFAULT_API_CACHE_NODE_AGE_THRESHOLD_SECS = 60 * 60
 37 | DEFAULT_RECORD_API_STATS = 1  # record api stats in cache db
 38 | DEFAULT_API_COMPUTE_TIMEOUT_SECS = 24 * 60 * 60  # 1 day
 39 | DEFAULT_KEEP_AUX_FILES = 0
 40 | 
 41 | API_STATE_SUCCESS = "Success"
 42 | API_STATE_RESULT_NOT_READY = "ResultNotReady"
 43 | API_STATE_EMPTY_RESULT = "ResultEmpty"
 44 | API_STATE_ERROR = "Error"
 45 | 
 46 | 
 47 | log = logging.getLogger("codegenome.rest.kg_service")
 48 | 
 49 | 
 50 | def crc32(obj):
 51 |     return str(
 52 |         binascii.crc32(json.dumps(obj, default=lambda x: str(x)).encode("utf-8"))
 53 |     )
 54 | 
 55 | 
 56 | def is_exec(obj):
 57 |     # TODO implement proper test
 58 |     return True
 59 | 
 60 | 
 61 | class JobDBDict(SqliteDict):
 62 |     def __init__(self, *args, **kwargs):
 63 |         self._lock = threading.Lock()
 64 |         super().__init__(*args, **kwargs)
 65 | 
 66 |     def __setitem__(self, key, value):
 67 |         with self._lock:
 68 |             super().__setitem__(key, value)
 69 |             super().commit()
 70 | 
 71 |     def __delitem__(self, key):
 72 |         with self._lock:
 73 |             super().__delitem__(key)
 74 |             super().commit()
 75 | 
 76 | 
 77 | class GenomeService(object):
 78 |     def __init__(self, config={}):
 79 |         self.start_ts = time.time()
 80 |         self.config = config
 81 |         self.config.setdefault("cache_dir", CG_CACHE_DIR)
 82 |         self.config.setdefault("gene_dir", CG_GENE_DIR)
 83 | 
 84 |         self.config.setdefault(
 85 |             "keep_aux_files",
 86 |             int(os.environ.get("CG_KEEP_AUX_FILES", DEFAULT_KEEP_AUX_FILES)),
 87 |         )
 88 |         self._disable_index_cache = int(config.get("disable_index_cache", 0))
 89 |         self.api_cache_ttl = self.config.get(
 90 |             "api_cache_ttl_secs",
 91 |             int(os.environ.get("API_CACHE_TTL_SECS", DEFAULT_API_CACHE_TTL_SECS)),
 92 |         )
 93 |         self.record_stats = self.config.get(
 94 |             "api_record_stats",
 95 |             int(os.environ.get("RECORD_API_STATS", DEFAULT_RECORD_API_STATS)),
 96 |         )
 97 |         self.api_compute_timeout = self.config.get(
 98 |             "api_compute_timeout_secs",
 99 |             int(
100 |                 os.environ.get(
101 |                     "API_COMPUTE_TIMEOUT_SECS", DEFAULT_API_COMPUTE_TIMEOUT_SECS
102 |                 )
103 |             ),
104 |         )
105 |         self.api_cache_node_age_threshold = self.config.get(
106 |             "api_cache_node_age_threshold",
107 |             int(
108 |                 os.environ.get(
109 |                     "API_CACHE_NODE_AGE_THRESHOLD_SECS",
110 |                     DEFAULT_API_CACHE_NODE_AGE_THRESHOLD_SECS,
111 |                 )
112 |             ),
113 |         )
114 | 
115 |         self.kg = cg.GenomeKG(db_dir=config.get("gene_dir"))
116 |         self._jobs = JobDBDict(
117 |             os.path.join(self.config.get("cache_dir"), "jobs.sqlite")
118 |         )
119 |         self._threads = {}
120 | 
121 |         self._update_status()
122 | 
123 |     def _update_status(self):
124 |         updates = {}
125 |         for k, v in self._jobs.items():
126 |             if (v.get("end_ts") is None) and (k not in self._threads):
127 |                 v["status"] = "error"
128 |                 updates[k] = v
129 |         for k, v in updates.items():
130 |             self._jobs[k] = v
131 | 
132 |     def status(self):
133 |         incomplete = []
134 |         for k, v in self._jobs.items():
135 |             if "end_ts" not in v:
136 |                 incomplete.append(
137 |                     {
138 |                         "job_id": str(k),
139 |                         "job": v,
140 |                         "duration_secs": int(time.time() - v.get("start_ts")),
141 |                     }
142 |                 )
143 | 
144 |         return {
145 |             "start_time": str(datetime.datetime.fromtimestamp(int(self.start_ts))),
146 |             "uptime_secs": int(time.time() - self.start_ts),
147 |             "total_genes": len(self.kg.gene_ids),
148 |             "total_binaries": len(self.kg.bins),
149 |             "gene_version": self.kg.gene_version,
150 |             "jobs": {"total": len(self._jobs), "incomplete": incomplete},
151 |         }
152 | 
153 |     def make_fileid(self, data):
154 |         h = hashlib.sha256()
155 |         h.update(data)
156 |         return h.digest().hex()
157 | 
158 |     def _add_file(self, file_id, file_path, cleanup=True):
159 |         log.debug(f"add_file(file_path={file_path})")
160 |         qkey = ["add_file", file_path, cleanup]
161 |         try:
162 |             fid = self.kg.add_file(
163 |                 file_path=file_path, keep_aux_files=self.config.get("keep_aux_files")
164 |             )
165 |             if fid is None:
166 |                 out = {
167 |                     "status": API_STATE_ERROR,
168 |                     "file_id": file_id,
169 |                     "status_msg": "File processing failed.",
170 |                 }
171 |             elif fid != file_id:
172 |                 out = {
173 |                     "status": API_STATE_ERROR,
174 |                     "file_id": file_id,
175 |                     "status_msg": f"file id mismatch {fid} != {file_id}",
176 |                 }
177 |             else:
178 |                 out = {
179 |                     "status": API_STATE_SUCCESS,
180 |                     "file_id": file_id,
181 |                     "ret_status": "new_file",
182 |                 }
183 |             self._api_thread_final(file_id, qkey, out)
184 |             if cleanup:
185 |                 tdir = os.path.dirname(file_path)
186 |                 if tdir.startswith(TMP_DIR_PREFIX):
187 |                     log.debug(f"removing directory {tdir}")
188 |                     shutil.rmtree(tdir)
189 | 
190 |             return out
191 |         except Exception as err:
192 |             log.error(
193 |                 f"Exception at add_file({file_path}). {err}. {repr(traceback.format_exc())}."
194 |             )
195 |             out = {"status": API_STATE_ERROR, "status_msg": str(err)}
196 |             self._api_thread_final(file_id, qkey, out)
197 |             return out
198 | 
199 |     def api_add_file(self, file_path):
200 |         log.debug(f"api_add_file({file_path})")
201 | 
202 |         with open(file_path, "rb") as f:
203 |             file_id = self.make_fileid(f.read())
204 | 
205 |         n = self.kg.get_node(file_id)
206 | 
207 |         if n:
208 |             return {
209 |                 "status": API_STATE_SUCCESS,
210 |                 "file_id": file_id,
211 |                 "ret_status": "existing_file",
212 |             }
213 | 
214 |         qkey = ["add_file", file_path, True]
215 | 
216 |         ret = self._api_thread_enter(file_id, qkey, target=self._add_file)
217 |         ret["file_id"] = file_id
218 |         return ret
219 | 
220 |     def _update_output(
221 |         self, fnode, results, fnode2=None, filename=True, filetypes=True
222 |     ):
223 |         # filename
224 |         def _update_node_filename(n):
225 |             if type(n) == dict:
226 |                 for k in ["name", "metadata.name"]:
227 |                     fn = n.get(k)
228 |                     if fn:
229 |                         n[k] = os.path.basename(fn)
230 | 
231 |         # filetypes
232 |         def _update_node_filetypes(n):
233 |             if type(n) == dict:
234 |                 ftypes = []
235 |                 dels = []
236 |                 for k in n.keys():
237 |                     if k.startswith("filetype."):
238 |                         ftypes.append(k.split(".")[1])
239 |                         dels.append(k)
240 |                 for k in dels:
241 |                     n.pop(k)
242 | 
243 |                 n["filetypes"] = ftypes
244 | 
245 |         if filename:
246 |             _update_node_filename(fnode)
247 |             _update_node_filename(fnode2)
248 |         if filetypes:
249 |             _update_node_filetypes(fnode)
250 |             _update_node_filetypes(fnode2)
251 | 
252 |     def _prep_output(self, output, output_detail):
253 |         if output_detail == "simple":
254 |             for r in output.get("results", []):
255 |                 if type(r) == dict:
256 |                     r.setdefault(
257 |                         "sha256",
258 |                         r.get("object", {}).get(
259 |                             "sha256", r.get("id", "").split(":")[-1]
260 |                         ),
261 |                     )
262 |                     if "object" in r:
263 |                         r.pop("object")
264 | 
265 |         return output
266 | 
267 |     def _cleanup_jobs(self, job_id):
268 |         # TODO implement
269 |         pass
270 | 
271 |     def check_job(self, job_id):
272 |         job = self._jobs.get(job_id)
273 |         if job:
274 |             ret = job.get("result")
275 |             if ret:
276 |                 # Clean job
277 |                 # self._jobs.pop(job_id)
278 |                 return ret
279 |             ret = {
280 |                 "status": API_STATE_RESULT_NOT_READY,
281 |                 "start_ts": job.get("start_ts"),
282 |                 "job_id": job_id,
283 |             }
284 |             file_id = job.get("file_id")
285 |             if file_id:
286 |                 ret["file_id"] = file_id
287 | 
288 |             return ret
289 | 
290 |         return {"status": API_STATE_ERROR, "status_msg": f"Job {job_id} not found"}
291 | 
292 |     def del_job(self, job_id):
293 |         if job_id in self._jobs:
294 |             log.warning(f"Deleting job({job_id}).")
295 |             self._jobs.pop(job_id)
296 |             return {"status": API_STATE_SUCCESS}
297 | 
298 |         return {"status": API_STATE_ERROR, "status_msg": f"Job {job_id} not found"}
299 | 
300 |     def delete_file(self, file_id):
301 |         log.warning(f"Deleting file({file_id}).")
302 |         # try removing jobs
303 |         dels = []
304 |         for k, v in self._jobs.items():
305 |             if file_id == v.get("result", {}).get("file_id"):
306 |                 dels.append(k)
307 |         for k in dels:
308 |             self._jobs.pop(k)
309 | 
310 |         ret = self.kg.delete_file(file_id=file_id)
311 |         if ret:
312 |             return {"status": API_STATE_SUCCESS}
313 | 
314 |         return {
315 |             "status": API_STATE_ERROR,
316 |             "status_msg": f"Error deleting file {file_id}.",
317 |         }
318 | 
319 |     def _create_job_id(self, obj_id, qkey):
320 |         if qkey[0] == "add_file":
321 |             # file path will be random
322 |             return crc32([obj_id])
323 |         return crc32([obj_id, qkey])
324 | 
325 |     def _api_thread_enter(self, obj_id, qkey, target):
326 |         if not callable(target):
327 |             raise Exception(f"target [{target}] argument must be callable.")
328 |         try:
329 |             # check if running
330 |             job_id = self._create_job_id(obj_id, qkey)
331 |             job = self._jobs.get(job_id)
332 |             prev_out = None
333 |             if job:
334 |                 ret = job.get("result")
335 |                 if ret:
336 |                     self._cleanup_jobs(job_id)
337 |                     dt = time.time() - job["end_ts"]
338 | 
339 |                     cache_ok = False
340 |                     if self.api_cache_ttl < 0:
341 |                         # always use cache
342 |                         cache_ok = True
343 |                     elif dt < self.api_cache_ttl:
344 |                         cache_ok = True
345 |                     # always return last result if available, but trigger new update if needed
346 |                     prev_out = ret
347 |                     if type(prev_out) == dict and prev_out.get("status") in [
348 |                         API_STATE_ERROR,
349 |                         API_STATE_EMPTY_RESULT,
350 |                     ]:
351 |                         cache_ok = False
352 | 
353 |                     if cache_ok:
354 |                         log.info(f"Returning cached result for {(obj_id,qkey)}")
355 |                         return ret
356 |                 else:
357 |                     dt = time.time() - job.get("start_ts")
358 |                     # is thread live?
359 |                     running = False
360 |                     try:
361 |                         if job_id in self._threads and self._threads[job_id].is_alive():
362 |                             running = True
363 |                     except:
364 |                         pass
365 |                     if running:
366 |                         if dt < self.api_compute_timeout:
367 |                             # still computing
368 |                             return {
369 |                                 "status": API_STATE_RESULT_NOT_READY,
370 |                                 "start_ts": job["start_ts"],
371 |                                 "job_id": job_id,
372 |                             }
373 |                         else:
374 |                             # you can not kill the thread
375 |                             # signal to stop, thread function need to cooperate
376 |                             job["stop"] = True
377 |                             job["status"] = "stopping"
378 |                             if job_id in self._threads:
379 |                                 self._threads.pop(job_id)
380 |                     else:
381 |                         # job crashed
382 |                         job["status"] = "error"
383 | 
384 |             try:
385 |                 args = [obj_id] + qkey[1:]
386 |                 th = threading.Thread(target=target, args=args)
387 |                 th.start()
388 |                 sts = int(time.time())
389 |                 self._threads[job_id] = th
390 |                 self._jobs[job_id] = {"start_ts": sts, "status": "running"}
391 |                 if prev_out:
392 |                     return prev_out
393 |                 else:
394 |                     return {
395 |                         "status": API_STATE_RESULT_NOT_READY,
396 |                         "start_ts": sts,
397 |                         "job_id": job_id,
398 |                     }
399 |             except Exception as err:
400 |                 log.error(
401 |                     f"Exception creating job thread. {err} Using blocking call for {target.__name__}{(obj_id, qkey)})"
402 |                 )
403 |                 return target(*args)
404 | 
405 |         except Exception as err:
406 |             log.error(
407 |                 f"Exception at _api_thread_enter({(obj_id, qkey, target.__name__)}). {err}. {repr(traceback.format_exc())}"
408 |             )
409 | 
410 |     def _api_thread_final(self, obj_id, qkey, out):
411 |         try:
412 |             job_id = self._create_job_id(obj_id, qkey)
413 |             job = self._jobs.get(job_id, {})
414 |             job["result"] = out
415 |             job["end_ts"] = time.time()
416 |             job["status"] = "completed"
417 |             self._jobs[job_id] = job
418 |             if job_id in self._threads:
419 |                 self._threads.pop(job_id)
420 |         except Exception as err:
421 |             log.error(
422 |                 f"Exception at _api_thread_final({(obj_id,qkey)}). {err}. {repr(traceback.format_exc())}"
423 |             )
424 | 
425 |     def api_files_compare_kg(
426 |         self,
427 |         file_id1,
428 |         file_id2,
429 |         method=DEFAULT_COMPARE_METHOD,
430 |         output_detail=DEFAULT_OUTPUT_DETAIL,
431 |     ):
432 |         """
433 |         Main api exposed to the external UI rest-api.
434 |         """
435 |         log.debug(
436 |             f"api_files_compare_kg(file_id1={file_id1}, file_id2={file_id2}, method={method}, output_detail={output_detail}"
437 |         )
438 |         file_id1 = KGNodeID.file_id(file_hash=file_id1)
439 |         file_id2 = KGNodeID.file_id(file_hash=file_id2)
440 | 
441 |         obj_id = file_id1
442 |         qkey = ["files_compare_kg", file_id2, method, output_detail]
443 | 
444 |         # test direct
445 |         return self._files_compare_kg(obj_id, file_id2, method, output_detail)
446 | 
447 |         return self._api_thread_enter(obj_id, qkey, target=self._files_compare_kg)
448 | 
449 |     def _files_compare_kg(
450 |         self, file_id1, file_id2, method="gene_v0", output_detail=DEFAULT_OUTPUT_DETAIL
451 |     ):
452 |         log.debug(
453 |             f"_files_compare_kg(file_id1={file_id1}, file_id2={file_id2}, method={method}, output_detail={output_detail}"
454 |         )
455 |         qkey = ["files_compare_kg", file_id2, method, output_detail]
456 |         try:
457 |             t1 = time.time()
458 |             fnode1 = self.kg.get_node(file_id1)
459 |             fnode2 = self.kg.get_node(file_id2)
460 |             t2 = time.time()
461 |             if fnode1 is None or fnode2 is None:
462 |                 msg = ""
463 |                 if fnode1 is None:
464 |                     msg = f"file_id:{file_id1} could not be found."
465 |                 if fnode2 is None:
466 |                     msg += f"file_id:{file_id2} could not be found."
467 | 
468 |                 out = {
469 |                     "status": API_STATE_EMPTY_RESULT,
470 |                     "results": [],
471 |                     "stats": {"init_prep_time": t2 - t1},
472 |                     "status_msg": msg,
473 |                 }
474 |                 self._api_thread_final(file_id1, qkey, out)
475 |                 log.warn(f"_files_compare_kg returning: {out}")
476 |                 return out
477 | 
478 |             if (not is_exec(fnode1)) or (not is_exec(fnode2)):
479 |                 # not a executable file, reset version to gene_v0
480 |                 log.info("not a executable file, reset version to gene_v0")
481 |                 method = "gene_v0"
482 | 
483 |             flags = method.split(".")
484 |             version = flags[0]
485 |             if len(flags) > 1:
486 |                 method = flags[1]
487 |             else:
488 |                 method = DEFAULT_CALCULATION_METHOD
489 | 
490 |             if version == "gene_v0":
491 |                 results, stats = self.file_compare_gene_v0(
492 |                     fnode1, fnode2, output_detail=output_detail
493 |                 )
494 |                 self._update_output(fnode1, results, fnode2)
495 |             elif version in ["genes_v1_3_0", "genes_v1_3_1"]:
496 |                 # TODO pass match/mimatch thrs
497 |                 results, stats = self.kg.bindiff(
498 |                     fnode1, fnode2, method=method, output_detail=output_detail
499 |                 )
500 |                 self._update_output(fnode1, results, fnode2)
501 |             else:
502 |                 log.error(
503 |                     f"_files_compare_kg error for fileids: {file_id1, file_id2}, version: {version}, method: {method}"
504 |                 )
505 |                 results, stats = {
506 |                     "error": f"version: {version}, method: {method} not supported."
507 |                 }, {}
508 | 
509 |             stats["init_prep_time"] = stats.get("init_prep_time", 0.0) + (t2 - t1)
510 |             out = {
511 |                 "query": [fnode1, fnode2],
512 |                 "results": results,
513 |                 "stats": stats,
514 |                 "status": API_STATE_SUCCESS,
515 |             }
516 |             if "error" in results:
517 |                 out["status"] = API_STATE_ERROR
518 |                 out["status_msg"] = results["error"]
519 |             elif len(results) == 0:
520 |                 out["status"] = API_STATE_EMPTY_RESULT
521 |             out = self._prep_output(out, output_detail)
522 |             self._api_thread_final(file_id1, qkey, out)
523 |             return out
524 |         except Exception as err:
525 |             log.error(
526 |                 f"Exception at _files_compare_kg(). {err}. {repr(traceback.format_exc())}."
527 |             )
528 |             out = {"status": API_STATE_ERROR, "status_msg": str(err)}
529 |             self._api_thread_final(file_id1, qkey, out)
530 |             return out
531 | 
532 |     def api_get_gene_info(
533 |         self,
534 |         gene_id=None,
535 |         file_id=None,
536 |         function_name=None,
537 |         include_llvm_ir=False,
538 |         include_asm=False,
539 |         include_gene_value=False,
540 |         include_function_names=False,
541 |     ):
542 |         log.debug(
543 |             f"api_get_gene_info({gene_id=}, {file_id=}, {function_name=}, {include_llvm_ir=}, {include_asm=}...)"
544 |         )
545 |         # not threaded
546 |         try:
547 |             out = {"status": API_STATE_ERROR, "status_msg": "Unknown error"}
548 |             if gene_id:
549 |                 data = self.kg.get_gene_info(
550 |                     gene_id,
551 |                     function_name=function_name,
552 |                     llvm_ir=include_llvm_ir,
553 |                     include_asm=include_asm,
554 |                     gene_value=include_gene_value,
555 |                     func_names=include_function_names,
556 |                 )
557 |                 if data:
558 |                     out = {"status": API_STATE_SUCCESS, "data": data}
559 |                 else:
560 |                     out = {
561 |                         "status": API_STATE_EMPTY_RESULT,
562 |                         "status_msg": "Can not get gene info",
563 |                     }
564 |             elif (file_id != "") and (function_name != ""):
565 |                 gids = self.kg.get_gene_ids(function_name, file_id, include_bin_id=True)
566 |                 # in case a binary may have multiple functions with same name, take last
567 |                 if len(gids) > 0:
568 |                     gene_id, file_id = gids[-1]
569 |                     data = self.kg.get_gene_info(
570 |                         gene_id,
571 |                         bin_id=file_id,
572 |                         function_name=function_name,
573 |                         llvm_ir=include_llvm_ir,
574 |                         include_asm=include_asm,
575 |                         gene_value=include_gene_value,
576 |                         func_names=include_function_names,
577 |                     )
578 |                     if data:
579 |                         out = {"status": API_STATE_SUCCESS, "data": data}
580 |                     else:
581 |                         out = {
582 |                             "status": API_STATE_EMPTY_RESULT,
583 |                             "status_msg": "Can not get gene info",
584 |                         }
585 |                 else:
586 |                     out = {
587 |                         "status": API_STATE_EMPTY_RESULT,
588 |                         "status_msg": f"Function gene not found for {file_id} -> {function_name}.",
589 |                     }
590 |             else:
591 |                 out = {
592 |                     "status": API_STATE_ERROR,
593 |                     "status_msg": "Not enough arguments. gene_id or (file_id and function_name) must be passed.",
594 |                 }
595 |             return out
596 |         except Exception as err:
597 |             log.error(
598 |                 f"Exception at api_get_ir(). {err}. {repr(traceback.format_exc())}."
599 |             )
600 |             out = {"status": API_STATE_ERROR, "status_msg": str(err)}
601 |             return out
602 | 
603 |     def api_get_node_info(
604 |         self,
605 |         obj_id=None,
606 |         include_genes=False,
607 |         include_llvm_ir=False,
608 |         include_asm=False,
609 |         include_gene_value=False,
610 |         include_function_names=False,
611 |     ):
612 |         # not threaded
613 |         try:
614 |             if obj_id in self.kg.bins:
615 |                 data = self.kg.get_node(obj_id)
616 |                 if data:
617 |                     if include_genes:
618 |                         data["genes"] = self.kg.bins.get(obj_id, {})
619 |                     out = {"status": API_STATE_SUCCESS, "data": data}
620 |                 else:
621 |                     out = {
622 |                         "status": API_STATE_EMPTY_RESULT,
623 |                         "status_msg": f"Object id {obj_id} not found",
624 |                     }
625 | 
626 |             else:
627 |                 data = self.kg.get_gene_info(
628 |                     obj_id,
629 |                     llvm_ir=include_llvm_ir,
630 |                     include_asm=include_asm,
631 |                     gene_value=include_gene_value,
632 |                     func_names=include_function_names,
633 |                 )
634 |                 if data:
635 |                     out = {"status": API_STATE_SUCCESS, "data": data}
636 |                 else:
637 |                     out = {
638 |                         "status": API_STATE_EMPTY_RESULT,
639 |                         "status_msg": f"Object id {obj_id} not found",
640 |                     }
641 |             return out
642 | 
643 |         except Exception as err:
644 |             log.error(
645 |                 f"Exception at api_get_ir(). {err}. {repr(traceback.format_exc())}."
646 |             )
647 |             out = {"status": API_STATE_ERROR, "status_msg": str(err)}
648 |             return out
649 | 
650 | 
651 | def read_config():
652 |     config = {
653 |         "gene_dir": CG_GENE_DIR,
654 |         "cache_dir": CG_CACHE_DIR,
655 |         "keep_aux_files": True,
656 |     }
657 |     return config
658 | 
659 | 
660 | def create_genome_service():
661 |     config = read_config()
662 |     log.debug(f"read_config(). {config}")
663 |     kgs = GenomeService(config)
664 |     log.debug("GenomeService object created.")
665 |     # do it from api to reduce service startup
666 |     log.debug("updating index.")
667 |     t1 = time.time()
668 |     kgs.kg.load()
669 |     t2 = time.time()
670 |     log.debug("updating index completed.")
671 |     return kgs
672 | 


--------------------------------------------------------------------------------