├── .gitattributes ├── .gitignore ├── .gitmodules ├── FuncParser-opt ├── FuncParser-opt.jar ├── build.sh ├── jar │ └── antlr-4.7.1-complete.jar ├── manifest.mf └── src │ ├── CoarseSimpleDecl.g4 │ ├── Common.g4 │ ├── Expressions.g4 │ ├── FineSimpleDecl.g4 │ ├── Function.g4 │ ├── FunctionDef.g4 │ ├── Main.java │ ├── Module.g4 │ ├── ModuleLex.g4 │ └── SimpleDecl.g4 ├── LICENSE.md ├── README.md ├── checker └── check_clones.py ├── config.py ├── dep.sh ├── docs ├── examples.md └── 취약점 데이터베이스 생성 솔루션 매뉴얼 V1.0.pdf ├── hmark ├── FuncParser-opt.jar ├── README.md ├── __init__.py ├── get_cpu_count.py ├── hmark.py ├── icon.gif ├── icon.ico ├── parseutility2.py ├── spec_generator.py └── version.py ├── initialize.py ├── paper └── SNP17.pdf ├── src ├── get_cvepatch_from_git.py ├── get_source_from_cvepatch.py ├── repo_updater.py ├── vul_dup_remover.py ├── vul_hidx_generator.py └── vul_verifier.py ├── testcode ├── async.c ├── configs.c ├── module.c └── wrong_case.c └── tools ├── FuncParser-opt.jar ├── __init__.py ├── cvedatagen ├── README.md ├── __init__.py ├── common.py ├── cveXmlDownloader.py ├── cveXmlParser.py └── cveXmlUpdater.py └── parseutility.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.pkl binary 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *test.py 3 | tmp_* 4 | 5 | diff/ 6 | vul/ 7 | diff*/ 8 | vul-*/ 9 | repolists/ 10 | hidx/ 11 | hidx*/ 12 | result-0901* 13 | *.tar.gz 14 | *.html 15 | *.spec 16 | *.pkl 17 | 18 | testcode/ 19 | misc/ 20 | experiments-related/ 21 | codesensor2python/build/ 22 | codesensor2python/ 23 | FuncParser/build/ 24 | kernel44/ 25 | FuncParser/FuncParser.jar 26 | 27 | hmark/build/ 28 | hmark/dist/ 29 | 30 | NVDCVEcrawler/*.xml 31 | 32 | # Pycharm 33 | .idea/ 34 | 35 | # CVE 36 | *.xml 37 | *.zip 38 | 39 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "vulnDBGen"] 2 | path = vulnDBGen 3 | url = https://github.com/squizz617/vulnDBGen 4 | -------------------------------------------------------------------------------- /FuncParser-opt/FuncParser-opt.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/squizz617/vuddy/33cdab1ad04a6dcc76011b92821dbeb055c6691e/FuncParser-opt/FuncParser-opt.jar -------------------------------------------------------------------------------- /FuncParser-opt/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o errexit 4 | 5 | SOURCE_DIR='src' 6 | BUILD_DIR='build' 7 | JAR_DIR='jar' 8 | ANTLR_JAR='antlr-4.7.1-complete.jar' 9 | RESULT_JAR='FuncParser-471.jar' 10 | 11 | rm -rf ${RESULT_JAR} 12 | rm -rf ${BUILD_DIR} 13 | mkdir ${BUILD_DIR} 14 | 15 | # Copy source files to build dir 16 | cp ./${SOURCE_DIR}/*.g4 ${BUILD_DIR} 17 | cp ./${SOURCE_DIR}/*.java ${BUILD_DIR} 18 | cp ./manifest.mf ${BUILD_DIR} 19 | cp ./${JAR_DIR}/$ANTLR_JAR ${BUILD_DIR} 20 | 21 | cd ${BUILD_DIR} 22 | 23 | # Generate Lexer and Parser from Grammar 24 | 25 | java -cp ./${ANTLR_JAR} org.antlr.v4.Tool Module.g4 Function.g4 26 | 27 | # Compile java-files 28 | #javac -cp ./${ANTLR_JAR_1} ./*.java -Xlint:unchecked 29 | javac -cp ./${ANTLR_JAR} ./*.java -Xlint:deprecation 30 | 31 | # unpack ANTLR-jar since 
we need some of the class files 32 | jar xf ./${ANTLR_JAR} 33 | 34 | # Create ${RESULT_JAR} 35 | #jar cvfm ${RESULT_JAR} manifest.mf *.class org > /dev/null 36 | jar cvfm ${RESULT_JAR} ../manifest.mf ./*.class org > out 37 | cp ${RESULT_JAR} ../ 38 | -------------------------------------------------------------------------------- /FuncParser-opt/jar/antlr-4.7.1-complete.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/squizz617/vuddy/33cdab1ad04a6dcc76011b92821dbeb055c6691e/FuncParser-opt/jar/antlr-4.7.1-complete.jar -------------------------------------------------------------------------------- /FuncParser-opt/manifest.mf: -------------------------------------------------------------------------------- 1 | Main-Class: Main 2 | -------------------------------------------------------------------------------- /FuncParser-opt/src/CoarseSimpleDecl.g4: -------------------------------------------------------------------------------- 1 | grammar CoarseSimpleDecl; 2 | 3 | import SimpleDecl; 4 | 5 | // The following two contain 'water'-rules for expressions 6 | 7 | init_declarator : declarator (('(' expr? ')') | ('=' assign_expr_w_))?; 8 | declarator: ptrs? identifier type_suffix?; 9 | 10 | 11 | type_suffix : ('[' constant_expr_w_ ']') | param_type_list; 12 | 13 | // water rules for expressions 14 | 15 | assign_expr_w_: assign_water* 16 | (('{' assign_expr_w__l2 '}' | '(' assign_expr_w__l2 ')' | '[' assign_expr_w__l2 ']') 17 | assign_water*)*; 18 | 19 | assign_expr_w__l2: assign_water_l2* (('{' assign_expr_w__l2 '}' | '(' assign_expr_w__l2 ')' | '[' assign_expr_w__l2 ']') 20 | assign_water_l2*)*; 21 | 22 | constant_expr_w_: no_squares* ('[' constant_expr_w_ ']' no_squares*)*; 23 | 24 | -------------------------------------------------------------------------------- /FuncParser-opt/src/Common.g4: -------------------------------------------------------------------------------- 1 | grammar Common; 2 | 3 | @header{ 4 | import java.util.Stack; 5 | } 6 | 7 | 8 | @parser::members 9 | { 10 | public boolean skipToEndOfObject() 11 | { 12 | Stack CurlyStack = new Stack(); 13 | Object o = new Object(); 14 | int t = _input.LA(1); 15 | 16 | while(t != EOF && !(CurlyStack.empty() && t == CLOSING_CURLY)){ 17 | 18 | if(t == PRE_ELSE){ 19 | Stack ifdefStack = new Stack(); 20 | consume(); 21 | t = _input.LA(1); 22 | 23 | while(t != EOF && !(ifdefStack.empty() && (t == PRE_ENDIF))){ 24 | if(t == PRE_IF) 25 | ifdefStack.push(o); 26 | else if(t == PRE_ENDIF) 27 | ifdefStack.pop(); 28 | consume(); 29 | t = _input.LA(1); 30 | } 31 | } 32 | 33 | if(t == OPENING_CURLY) 34 | CurlyStack.push(o); 35 | else if(t == CLOSING_CURLY) 36 | CurlyStack.pop(); 37 | 38 | consume(); 39 | t = _input.LA(1); 40 | } 41 | if(t != EOF) 42 | consume(); 43 | return true; 44 | } 45 | 46 | // this should go into FunctionGrammar but ANTLR fails 47 | // to join the parser::members-section on inclusion 48 | 49 | public boolean preProcSkipToEnd() 50 | { 51 | Stack CurlyStack = new Stack(); 52 | Object o = new Object(); 53 | int t = _input.LA(1); 54 | 55 | while(t != EOF && !(CurlyStack.empty() && t == PRE_ENDIF)){ 56 | 57 | if(t == PRE_IF) 58 | CurlyStack.push(o); 59 | else if(t == PRE_ENDIF) 60 | CurlyStack.pop(); 61 | 62 | consume(); 63 | t = _input.LA(1); 64 | } 65 | if(t != EOF) 66 | consume(); 67 | return true; 68 | } 69 | 70 | } 71 | 72 | unary_operator : '&' | '*' | '+'| '-' | '~' | '!'; 73 | relational_operator: ('<'|'>'|'<='|'>='); 74 | 75 | constant 76 | : HEX_LITERAL 77 
| | OCTAL_LITERAL 78 | | DECIMAL_LITERAL 79 | | STRING 80 | | CHAR 81 | | FLOATING_POINT_LITERAL 82 | ; 83 | 84 | // keywords & operators 85 | 86 | function_decl_specifiers: ('inline' | 'virtual' | 'explicit' | 'friend' | 'static'); 87 | ptr_operator: ('*' | '&'); 88 | 89 | access_specifier: ('public' | 'private' | 'protected'); 90 | 91 | operator: (('new' | 'delete' ) ('[' ']')?) 92 | | '+' | '-' | '*' | '/' | '%' |'^' | '&' | '|' | '~' 93 | | '!' | '=' | '<' | '>' | '+=' | '-=' | '*=' 94 | | '/=' | '%=' | '^=' | '&=' | '|=' | '>>' 95 | |'<<'| '>>=' | '<<=' | '==' | '!=' 96 | | '<=' | '>=' | '&&' | '||' | '++' | '--' 97 | | ',' | '->*' | '->' | '(' ')' | '[' ']' 98 | ; 99 | 100 | assignment_operator: '=' | '*=' | '/=' | '%=' | '+=' | '-=' | '<<=' | '>>=' | '&=' | '^=' | '|='; 101 | equality_operator: ('=='| '!='); 102 | 103 | template_decl_start : TEMPLATE '<' template_param_list '>'; 104 | 105 | 106 | // template water 107 | template_param_list : (('<' template_param_list '>') | 108 | ('(' template_param_list ')') | 109 | no_angle_brackets_or_brackets)+ 110 | ; 111 | 112 | // water 113 | 114 | no_brackets: ~('(' | ')'); 115 | no_brackets_curlies_or_squares: ~('(' | ')' | '{' | '}' | '[' | ']'); 116 | no_brackets_or_semicolon: ~('(' | ')' | ';'); 117 | no_angle_brackets_or_brackets : ~('<' | '>' | '(' | ')'); 118 | no_curlies: ~('{' | '}'); 119 | no_squares: ~('[' | ']'); 120 | no_squares_or_semicolon: ~('[' | ']' | ';'); 121 | no_comma_or_semicolon: ~(',' | ';'); 122 | 123 | assign_water: ~('(' | ')' | '{' | '}' | '[' | ']' | ';' | ','); 124 | assign_water_l2: ~('(' | ')' | '{' | '}' | '[' | ']'); 125 | 126 | water: .; 127 | -------------------------------------------------------------------------------- /FuncParser-opt/src/Expressions.g4: -------------------------------------------------------------------------------- 1 | grammar Expressions; 2 | 3 | expr: assign_expr (',' expr)?; 4 | 5 | assign_expr: conditional_expression (assignment_operator assign_expr)?; 6 | conditional_expression: or_expression #normOr 7 | | or_expression ('?' expr ':' conditional_expression) #cndExpr; 8 | 9 | 10 | or_expression : and_expression ('||' or_expression)?; 11 | and_expression : inclusive_or_expression ('&&' and_expression)?; 12 | inclusive_or_expression: exclusive_or_expression ('|' inclusive_or_expression)?; 13 | exclusive_or_expression: bit_and_expression ('^' exclusive_or_expression)?; 14 | bit_and_expression: equality_expression ('&' bit_and_expression)?; 15 | equality_expression: relational_expression (equality_operator equality_expression)?; 16 | relational_expression: shift_expression (relational_operator relational_expression)?; 17 | shift_expression: additive_expression ( ('<<'|'>>') shift_expression)?; 18 | additive_expression: multiplicative_expression (('+'| '-') additive_expression)?; 19 | multiplicative_expression: cast_expression ( ('*'| '/'| '%') multiplicative_expression)?; 20 | 21 | cast_expression: ('(' cast_target ')' cast_expression) 22 | | unary_expression 23 | ; 24 | 25 | cast_target: type_name ptr_operator*; 26 | 27 | // currently does not implement delete 28 | 29 | unary_expression: inc_dec cast_expression 30 | | unary_op_and_cast_expr 31 | | sizeof_expression 32 | | new_expression 33 | | postfix_expression 34 | ; 35 | 36 | new_expression: '::'? NEW type_name '[' conditional_expression? ']' 37 | | '::'? NEW type_name '(' expr? 
')' 38 | ; 39 | 40 | unary_op_and_cast_expr: unary_operator cast_expression; 41 | 42 | sizeof_expression: sizeof '(' sizeof_operand ')' 43 | | sizeof sizeof_operand2; 44 | 45 | sizeof: 'sizeof'; 46 | 47 | sizeof_operand: type_name ptr_operator *; 48 | sizeof_operand2: unary_expression; 49 | 50 | inc_dec: ('--' | '++'); 51 | 52 | // this is a bit misleading. We're just allowing access_specifiers 53 | // here because C programs can use 'public', 'protected' or 'private' 54 | // as variable names. 55 | 56 | postfix_expression: postfix_expression '[' expr ']' #arrayIndexing 57 | | postfix_expression '(' function_argument_list ')' #funcCall 58 | | postfix_expression '.' TEMPLATE? (identifier) #memberAccess 59 | | postfix_expression '->' TEMPLATE? (identifier) #ptrMemberAccess 60 | | postfix_expression inc_dec #incDecOp 61 | | primary_expression # primaryOnly 62 | ; 63 | 64 | function_argument_list: ( function_argument (',' function_argument)* )?; 65 | function_argument: assign_expr; 66 | 67 | 68 | primary_expression: identifier | constant | '(' expr ')'; 69 | 70 | -------------------------------------------------------------------------------- /FuncParser-opt/src/FineSimpleDecl.g4: -------------------------------------------------------------------------------- 1 | grammar FineSimpleDecl; 2 | 3 | import SimpleDecl; 4 | 5 | init_declarator: declarator '(' expr? ')' #initDeclWithCall 6 | | declarator '=' initializer #initDeclWithAssign 7 | | declarator #initDeclSimple 8 | ; 9 | 10 | declarator: ptrs? identifier type_suffix?; 11 | 12 | type_suffix : ('[' conditional_expression? ']') | param_type_list; 13 | 14 | -------------------------------------------------------------------------------- /FuncParser-opt/src/Function.g4: -------------------------------------------------------------------------------- 1 | grammar Function; 2 | import ModuleLex, Common, Expressions, FineSimpleDecl; 3 | /* 4 | @header{ 5 | package antlr.C; 6 | } 7 | */ 8 | 9 | statements: (pre_opener 10 | | pre_closer 11 | | pre_else {preProcSkipToEnd(); } 12 | | statement)*; 13 | 14 | statement: opening_curly 15 | | closing_curly 16 | | block_starter 17 | | jump_statement 18 | | label 19 | | simple_decl 20 | | expr_statement 21 | | water 22 | ; 23 | 24 | pre_opener: PRE_IF; 25 | pre_else: PRE_ELSE; 26 | pre_closer: PRE_ENDIF; 27 | opening_curly: '{'; 28 | closing_curly: '}'; 29 | 30 | block_starter: selection_or_iteration; 31 | 32 | selection_or_iteration: TRY #Try_statement 33 | | CATCH '(' param_type ')' #Catch_statement 34 | | IF '(' condition ')' #If_statement 35 | | ELSE #Else_statement 36 | | SWITCH '(' condition ')' #Switch_statement 37 | | FOR '(' (for_init_statement | ';') condition? ';' expr? ')' #For_statement 38 | | DO #Do_statement 39 | | WHILE '(' condition ')' #While_statement 40 | ; 41 | 42 | // Don't know why, but: introducing this unused rule results 43 | // in a performance boost. 44 | 45 | do_statement1: DO statement WHILE '(' expr ')'; 46 | 47 | for_init_statement : simple_decl 48 | | expr ';' 49 | ; 50 | 51 | jump_statement: BREAK ';' #breakStatement 52 | | CONTINUE ';' #continueStatement 53 | | GOTO identifier ';' #gotoStatement 54 | | RETURN expr? ';' #returnStatement 55 | ; 56 | 57 | label: CASE? (identifier | number | CHAR ) ':' ; 58 | 59 | expr_statement: expr? 
';'; 60 | 61 | condition: expr 62 | | type_name declarator '=' assign_expr; 63 | -------------------------------------------------------------------------------- /FuncParser-opt/src/FunctionDef.g4: -------------------------------------------------------------------------------- 1 | grammar FunctionDef; 2 | import ModuleLex, Function; 3 | 4 | function_def : template_decl_start? return_type? function_name 5 | function_param_list ctor_list? compound_statement; 6 | 7 | return_type : (function_decl_specifiers* type_name) ptr_operator*; 8 | 9 | function_param_list : '(' parameter_decl_clause? ')' CV_QUALIFIER* exception_specification?; 10 | 11 | parameter_decl_clause: (parameter_decl (',' parameter_decl)*) (',' '...')? 12 | | VOID; 13 | parameter_decl : param_decl_specifiers parameter_id | param_decl_specifiers | parameter_id; 14 | parameter_id: ptrs? ('(' parameter_id ')' | parameter_name) type_suffix?; 15 | 16 | compound_statement: OPENING_CURLY { skipToEndOfObject(); }; 17 | //compound_statement: statements; 18 | 19 | ctor_list: ':' ctor_initializer (',' ctor_initializer)*; 20 | ctor_initializer: initializer_id ctor_expr; 21 | initializer_id : '::'? identifier; 22 | ctor_expr: '(' expr? ')'; 23 | 24 | function_name: '(' function_name ')' | identifier | OPERATOR operator; 25 | 26 | exception_specification : THROW '(' type_id_list ')'; 27 | type_id_list: no_brackets* ('(' type_id_list ')' no_brackets*)*; 28 | -------------------------------------------------------------------------------- /FuncParser-opt/src/Main.java: -------------------------------------------------------------------------------- 1 | 2 | import org.antlr.v4.runtime.ANTLRFileStream; 3 | import org.antlr.v4.runtime.ANTLRInputStream; 4 | import org.antlr.v4.runtime.CommonTokenStream; 5 | import org.antlr.v4.runtime.tree.ParseTree; 6 | import org.antlr.v4.runtime.tree.ParseTreeListener; 7 | import org.antlr.v4.runtime.Parser; 8 | //import org.antlr.v4.runtime.tree.TerminalNode; 9 | import org.antlr.v4.runtime.tree.*; 10 | import org.antlr.v4.runtime.ParserRuleContext; 11 | import org.antlr.v4.runtime.RuleContext; 12 | import org.antlr.v4.runtime.misc.Utils; 13 | import org.antlr.v4.runtime.misc.ParseCancellationException; 14 | 15 | import org.antlr.v4.runtime.CharStream; 16 | import org.antlr.v4.runtime.misc.Interval; 17 | 18 | import org.antlr.v4.runtime.DefaultErrorStrategy; 19 | import org.antlr.v4.runtime.BailErrorStrategy; 20 | import org.antlr.v4.runtime.atn.PredictionMode; 21 | 22 | import org.antlr.v4.runtime.Token; 23 | 24 | import java.io.*; 25 | import java.util.List; 26 | import java.util.Arrays; 27 | import java.util.ArrayList; 28 | 29 | import java.util.concurrent.ExecutorService; 30 | import java.util.concurrent.Executors; 31 | import java.util.concurrent.Future; 32 | import java.util.concurrent.Callable; 33 | 34 | 35 | public class Main { 36 | public static void main(String[] args) throws IOException { 37 | List ret; 38 | long t1, t2, t3; 39 | try { 40 | String inputFilename = parseCommandLine(args); 41 | String bParseBody = "1"; // "1": with body parser, "0": without body parser 42 | if (args.length > 1) 43 | bParseBody = args[args.length - 1]; 44 | if (!bParseBody.equals("0") && !bParseBody.equals("1")) 45 | throw new Exception("argument bParseBody(last argument) required."); 46 | 47 | //System.out.println("processors: " + Runtime.getRuntime().availableProcessors()); 48 | t1 = System.currentTimeMillis(); 49 | if (bParseBody.equals("1")) { 50 | TreeParser tp = new TreeParser(); 51 | ret = 
tp.ParseFile(inputFilename); 52 | } 53 | else { 54 | TreeParser1 tp = new TreeParser1(); 55 | ret = tp.ParseFile(inputFilename); 56 | } 57 | t2 = System.currentTimeMillis(); 58 | } catch (Exception e) { 59 | e.printStackTrace(); 60 | return; 61 | } 62 | 63 | print_functions_all(ret); // print_functions() or print_functions_all() 64 | t3 = System.currentTimeMillis(); 65 | 66 | //System.out.println("parse " + (t2 - t1) / 1000.0); 67 | //System.out.println("print " + (t3 - t2) / 1000.0); 68 | } 69 | 70 | // Print all elements in function class seperated with CR, LF, TAB. 71 | // Please refer function.toString() method. 72 | private static void print_functions_all(List func_list) { 73 | for (function f : func_list) 74 | System.out.print(f); 75 | } 76 | 77 | // Print name, line, parameter, variable, datatype and funccallee in function class. 78 | private static void print_functions(List func_list) { 79 | System.out.println("func_list.size(): " + func_list.size()); 80 | int i = 0; 81 | for (function f : func_list) { 82 | System.out.println("" + (i++) + ": " + f.name + 83 | "(" + f.lineStart + ", " + f.lineStop + ")"); 84 | 85 | System.out.print(" PARAM\t["); 86 | for (String element : f.parameterList) 87 | System.out.print(element + ", "); 88 | System.out.println("]"); 89 | 90 | System.out.print(" LVARS\t["); 91 | for (String element : f.variableList) 92 | System.out.print(element + ", "); 93 | System.out.println("]"); 94 | 95 | System.out.print(" DTYPE\t["); 96 | for (String element : f.dataTypeList) 97 | System.out.print(element + ", "); 98 | System.out.println("]"); 99 | 100 | System.out.print(" CALLS\t["); 101 | for (String element : f.funcCalleeList) 102 | System.out.print(element + ", "); 103 | System.out.println("]\n"); 104 | } 105 | } 106 | 107 | private static String parseCommandLine(String[] args) throws Exception { 108 | if(args.length < 1) { 109 | throw new Exception("filename required."); 110 | } 111 | 112 | return args[0]; 113 | } 114 | } 115 | 116 | class function { 117 | public String parentFile; 118 | public int parentNumLoc = 0; 119 | public String name; 120 | public int lineStart = 0; 121 | public int lineStop = 0; 122 | public int funcId = 0; 123 | public String funcBody; 124 | 125 | public List parameterList; 126 | public List variableList; 127 | public List dataTypeList; 128 | public List funcCalleeList; 129 | 130 | function(String fileName) { 131 | this.parentFile = fileName; 132 | this.parameterList = new ArrayList(); 133 | this.variableList = new ArrayList(); 134 | this.dataTypeList = new ArrayList(); 135 | this.funcCalleeList = new ArrayList(); 136 | } 137 | 138 | public String toString() { 139 | StringBuilder ret = new StringBuilder(); 140 | 141 | ret.append("\r\0?\r?\0\r"); // function string start 142 | ret.append('\n'); 143 | 144 | ret.append(parentFile); 145 | ret.append('\n'); 146 | 147 | ret.append(String.valueOf(parentNumLoc)); 148 | ret.append('\n'); 149 | 150 | ret.append(name); 151 | ret.append('\n'); 152 | 153 | ret.append(String.valueOf(lineStart)); 154 | ret.append('\t'); 155 | ret.append(String.valueOf(lineStop)); 156 | ret.append('\n'); 157 | 158 | ret.append(String.valueOf(funcId)); 159 | ret.append('\n'); 160 | 161 | for (String s : this.parameterList) { 162 | ret.append(s); 163 | ret.append('\t'); 164 | } 165 | ret.append('\n'); 166 | for (String s : this.variableList) { 167 | ret.append(s); 168 | ret.append('\t'); 169 | } 170 | ret.append('\n'); 171 | for (String s : this.dataTypeList) { 172 | ret.append(s); 173 | ret.append('\t'); 174 | } 175 | 
ret.append('\n'); 176 | for (String s : this.funcCalleeList) { 177 | ret.append(s); 178 | ret.append('\t'); 179 | } 180 | ret.append('\n'); 181 | 182 | ret.append(this.funcBody); 183 | ret.append('\n'); 184 | 185 | return ret.toString(); 186 | } 187 | } 188 | 189 | class JobInstance implements Callable { 190 | public function functionInstance; 191 | public String string; 192 | public int line; 193 | public int enableSLL; 194 | 195 | public JobInstance(String s, function f, int l, int e) { 196 | this.functionInstance = f; 197 | this.string = s; 198 | this.line = l; 199 | this.enableSLL = e; 200 | } 201 | 202 | public function call() throws Exception { 203 | //System.err.println("call() called: " + Thread.currentThread().getName()); 204 | BodyParser p = new BodyParser(); 205 | p.ParseString(this.string, this.functionInstance, this.line, this.enableSLL); 206 | return this.functionInstance; 207 | } 208 | } 209 | 210 | class BodyParser implements ParseTreeListener { 211 | private static int IS_FIRST = 1; 212 | 213 | public final static int FUNCTION_DEF = 0; 214 | public final static int FUNCTION_NAME = 1; 215 | public final static int PARAMETER_NAME = 2; 216 | public final static int DECLARATOR = 3; 217 | public final static int TYPE_NAME = 4; 218 | public final static int FUNCTION_CALL = 5; 219 | public final static int COMPOUND_STMT = 6; 220 | 221 | private final static String[] table = {"function_def", "function_name", "parameter_name", 222 | "declarator", "type_name", "identifier", "compound_statement"}; 223 | private static int[] IDX = {0, 0, 0, 0, 0, 0, 0}; 224 | 225 | private static List ruleNames; 226 | 227 | private function functionInstance = null; 228 | 229 | // Function body's base line 230 | private int defaultLine = 0; 231 | 232 | // Local variable's name 233 | private int declaratorFlag = 0; 234 | private StringBuilder declaratorStr = new StringBuilder(); 235 | 236 | // type (return type, parameter type, local variable type) 237 | private int typeNameFlag = 0; 238 | private StringBuilder typeNameStr = new StringBuilder(); 239 | 240 | private int funcCallFlag = 0; 241 | private StringBuilder funcCallStr = new StringBuilder(); 242 | 243 | // set SLL option 244 | private int enableSLL = 0; 245 | 246 | public BodyParser() { 247 | this.functionInstance = null; 248 | 249 | this.defaultLine = 0; 250 | 251 | this.declaratorFlag = 0; 252 | this.declaratorStr = new StringBuilder(); 253 | 254 | // type (return type, parameter type, local variable type) 255 | this.typeNameFlag = 0; 256 | this.typeNameStr = new StringBuilder(); 257 | 258 | this.funcCallFlag = 0; 259 | this.funcCallStr = new StringBuilder(); 260 | 261 | // set SLL option 262 | this.enableSLL = 0; 263 | } 264 | 265 | private void _init(FunctionParser parser) { 266 | //this(); 267 | this.functionInstance = null; 268 | 269 | this.defaultLine = 0; 270 | 271 | this.declaratorFlag = 0; 272 | this.declaratorStr = new StringBuilder(); 273 | 274 | this.typeNameFlag = 0; 275 | this.typeNameStr = new StringBuilder(); 276 | 277 | this.funcCallFlag = 0; 278 | this.funcCallStr = new StringBuilder(); 279 | 280 | this.enableSLL = 0; 281 | 282 | if (BodyParser.IS_FIRST != 0) { 283 | this.ruleNames = Arrays.asList(parser.getRuleNames()); 284 | 285 | for (int i = 0; i < parser.ruleNames.length; i++) { 286 | for (int j = 0; j < BodyParser.table.length; j++) { 287 | if (parser.ruleNames[i].equals(BodyParser.table[j])) 288 | BodyParser.IDX[j] = i; 289 | } 290 | } 291 | BodyParser.IS_FIRST = 0; 292 | } 293 | } 294 | 295 | public void 
ParseString(String string, function functionInstance) { 296 | this.ParseString(string, functionInstance, 0); 297 | } 298 | public void ParseString(String string, function functionInstance, int line) { 299 | this.ParseString(string, functionInstance, line, 1); 300 | } 301 | public void ParseString(String string, function funcinstance, int line, int bSLL) { 302 | try { 303 | ANTLRInputStream input = new ANTLRInputStream(string); 304 | FunctionLexer lexer = new FunctionLexer(input); 305 | CommonTokenStream tokens = new CommonTokenStream(lexer); 306 | FunctionParser parser = new FunctionParser(tokens); 307 | parser.removeErrorListeners(); // remove error listener 308 | 309 | if (bSLL != 0) { 310 | //print "start parsing in BodyParser class with SLL mode" 311 | parser.getInterpreter().setPredictionMode(PredictionMode.SLL); 312 | parser.setErrorHandler(new BailErrorStrategy()); 313 | } 314 | 315 | ParseTree tree; 316 | try { 317 | tree = parser.statements(); 318 | } 319 | catch (ParseCancellationException e) { 320 | //print "Exception found in BodyParser class. set LL mode" 321 | parser.reset(); 322 | parser.getInterpreter().setPredictionMode(PredictionMode.LL); 323 | parser.setErrorHandler(new DefaultErrorStrategy()); 324 | tree = parser.statements(); 325 | } 326 | this._init(parser); // reset before traverse a parse tree 327 | this.enableSLL = bSLL; 328 | this.functionInstance = funcinstance; 329 | 330 | if (line != 0) // if line is zero, self.defaultLine is also zero 331 | this.defaultLine = (line - 1); 332 | 333 | //ParseTreeWalker ptw = new ParseTreeWalker(); 334 | //ptw.walk(this, tree); 335 | ParseTreeWalker.DEFAULT.walk(this, tree); 336 | } 337 | catch (Exception e) { 338 | e.printStackTrace(); 339 | } 340 | return; 341 | } 342 | 343 | @Override 344 | public void enterEveryRule(ParserRuleContext ctx) { 345 | int ruleIndex = ctx.getRuleIndex(); 346 | 347 | if (ruleIndex == BodyParser.IDX[BodyParser.DECLARATOR]) 348 | this.declaratorFlag = 1; 349 | else if (ruleIndex == BodyParser.IDX[BodyParser.TYPE_NAME]) 350 | this.typeNameFlag = 1; 351 | else if (ruleIndex == BodyParser.IDX[BodyParser.FUNCTION_CALL]) 352 | this.funcCallFlag = 1; 353 | } 354 | 355 | 356 | @Override 357 | public void exitEveryRule(ParserRuleContext ctx) { 358 | int ruleIndex = ctx.getRuleIndex(); 359 | 360 | if (ruleIndex == BodyParser.IDX[BodyParser.DECLARATOR] && this.declaratorFlag != 0) {// useless if-statement (because, enter declarator -> exit identifier) 361 | //print "LVAR" 362 | this.functionInstance.variableList.add(this.declaratorStr.toString().trim()); 363 | this.declaratorFlag = 0; 364 | this.declaratorStr.setLength(0); 365 | } 366 | else if (ruleIndex == BodyParser.IDX[BodyParser.TYPE_NAME] && this.typeNameFlag != 0) { 367 | //print "DTYPE" 368 | this.functionInstance.dataTypeList.add(this.typeNameStr.toString().trim()); 369 | this.typeNameFlag = 0; 370 | this.typeNameStr.setLength(0); 371 | } 372 | else if (ruleIndex == BodyParser.IDX[BodyParser.FUNCTION_CALL] && this.funcCallFlag != 0) { 373 | //print "CALL" 374 | if (this.funcCallFlag == 2) 375 | this.functionInstance.funcCalleeList.add(this.funcCallStr.toString().trim()); 376 | this.funcCallFlag = 0; 377 | this.funcCallStr.setLength(0); 378 | 379 | if (this.declaratorFlag != 0) {// [enter declarator -> exit identifier]: avoid "a [ 1 ]" in local variable name 380 | this.functionInstance.variableList.add(this.declaratorStr.toString().trim()); 381 | this.declaratorFlag = 0; 382 | this.declaratorStr.setLength(0); 383 | } 384 | } 385 | } 386 | 387 | 
@Override 388 | public void visitTerminal(TerminalNode node) { 389 | if (this.declaratorFlag != 0) { 390 | String tmpText = Trees.getNodeText(node, this.ruleNames); 391 | 392 | if (!tmpText.equals("*")) { 393 | this.declaratorStr.append(tmpText); 394 | this.declaratorStr.append(' '); 395 | } 396 | } 397 | else if (this.typeNameFlag != 0) { 398 | this.typeNameStr.append(Trees.getNodeText(node, this.ruleNames)); 399 | this.typeNameStr.append(' '); 400 | } 401 | else if (this.funcCallFlag != 0) { 402 | try { 403 | ParseTree p1 = node.getParent().getParent().getParent().getParent(); 404 | 405 | //System.out.println("-----funcCallFlag: " + p1.getClass()); 406 | if (p1 instanceof FunctionParser.FuncCallContext) { 407 | //System.out.println("found"); 408 | this.funcCallStr.append(Trees.getNodeText(node ,this.ruleNames)); 409 | this.funcCallStr.append(' '); 410 | this.funcCallFlag = 2; 411 | } 412 | } 413 | catch (Exception e) { // useless? 414 | //System.out.println("-----funcCallFlag: Exception found"); 415 | } 416 | } 417 | } 418 | 419 | @Override 420 | public void visitErrorNode(ErrorNode node) { } 421 | } 422 | 423 | class TreeParser implements ParseTreeListener { 424 | private static int IS_FIRST = 1; 425 | 426 | public final static int FUNCTION_DEF = 0; 427 | public final static int FUNCTION_NAME = 1; 428 | public final static int PARAMETER_NAME = 2; 429 | public final static int DECLARATOR = 3; 430 | public final static int TYPE_NAME = 4; 431 | public final static int FUNCTION_CALL = 5; 432 | public final static int COMPOUND_STMT = 6; 433 | 434 | private final static String[] table = {"function_def", "function_name", "parameter_name", 435 | "declarator", "type_name", "identifier", "compound_statement"}; 436 | private static int[] IDX = {0, 0, 0, 0, 0, 0, 0}; 437 | 438 | private static List ruleNames; 439 | 440 | private ExecutorService executorService; 441 | private List> future_list = new ArrayList>(); // for multithread 442 | //private List job_list = new ArrayList(); // for singlethread 443 | private function functionInstance = null; 444 | 445 | // Function's name 446 | private int funcNameFlag = 0; 447 | private StringBuilder funcNameStr = new StringBuilder(); 448 | 449 | // Function parameter's name 450 | private int paramNameFlag = 0; 451 | private StringBuilder paramNameStr = new StringBuilder(); //final? 
452 | 453 | // type (return type, parameter type, local variable type) 454 | private int typeNameFlag = 0; 455 | private StringBuilder typeNameStr = new StringBuilder(); 456 | 457 | // function definition 458 | private int funcDefFlag = 0; 459 | 460 | // function body (compund_statement) 461 | private int compoundStmtFlag = 0; 462 | 463 | private String srcFileName; 464 | private int numLines = 0; 465 | 466 | // set SLL option 467 | private int enableSLL = 0; 468 | 469 | 470 | public TreeParser() { 471 | this.functionInstance = null; 472 | 473 | this.funcNameFlag = 0; 474 | this.funcNameStr = new StringBuilder(); 475 | 476 | this.paramNameFlag = 0; 477 | this.paramNameStr = new StringBuilder(); 478 | 479 | this.typeNameFlag = 0; 480 | this.typeNameStr = new StringBuilder(); 481 | 482 | this.funcDefFlag = 0; 483 | 484 | this.compoundStmtFlag = 0; 485 | 486 | this.enableSLL = 0; 487 | } 488 | 489 | private void _init(ModuleParser parser) { 490 | //this(); 491 | this.executorService = Executors.newFixedThreadPool( 492 | Runtime.getRuntime().availableProcessors() 493 | ); 494 | 495 | this.functionInstance = null; 496 | 497 | this.funcNameFlag = 0; 498 | this.funcNameStr = new StringBuilder(); 499 | 500 | this.paramNameFlag = 0; 501 | this.paramNameStr = new StringBuilder(); 502 | 503 | this.typeNameFlag = 0; 504 | this.typeNameStr = new StringBuilder(); 505 | 506 | this.funcDefFlag = 0; 507 | 508 | this.compoundStmtFlag = 0; 509 | 510 | this.enableSLL = 0; 511 | 512 | 513 | if (TreeParser.IS_FIRST != 0) { 514 | this.ruleNames = Arrays.asList(parser.getRuleNames()); 515 | 516 | for (int i = 0; i < parser.ruleNames.length; i++) { 517 | for (int j = 0; j < TreeParser.table.length; j++) { 518 | if (parser.ruleNames[i].equals(TreeParser.table[j])) 519 | TreeParser.IDX[j] = i; 520 | } 521 | } 522 | } 523 | TreeParser.IS_FIRST = 0; 524 | } 525 | 526 | public List ParseFile(String srcFileName) { 527 | return this.ParseFile(srcFileName, 1); 528 | } 529 | public List ParseFile(String srcFileName, int bSLL) { 530 | List ret = new ArrayList(); 531 | try { 532 | ANTLRFileStream antlrFileStream = new ANTLRFileStream(srcFileName); 533 | ModuleLexer lexer = new ModuleLexer(antlrFileStream); 534 | CommonTokenStream tokens = new CommonTokenStream(lexer); 535 | ModuleParser parser = new ModuleParser(tokens); 536 | parser.removeErrorListeners(); // remove error listener 537 | 538 | if (bSLL != 0) { 539 | parser.getInterpreter().setPredictionMode(PredictionMode.SLL); 540 | parser.setErrorHandler(new BailErrorStrategy()); 541 | } 542 | 543 | //long t1 = System.currentTimeMillis(); 544 | ParseTree tree; 545 | try { 546 | tree = parser.code(); 547 | } 548 | catch (ParseCancellationException e) { 549 | parser.reset(); 550 | parser.getInterpreter().setPredictionMode(PredictionMode.LL); 551 | parser.setErrorHandler(new DefaultErrorStrategy()); 552 | tree = parser.code(); 553 | } 554 | //long t2 = System.currentTimeMillis(); 555 | //System.err.println("time: " + (t2 - t1) / 1000.0); 556 | this._init(parser); // reset before traverse a parse tree 557 | this.enableSLL = bSLL; 558 | 559 | LineNumberReader lnr = new LineNumberReader(new FileReader(new File(srcFileName))); 560 | while (lnr.skip(Long.MAX_VALUE) > 0); 561 | this.numLines = lnr.getLineNumber() + 1; 562 | lnr.close(); 563 | 564 | this.srcFileName = new String(srcFileName); 565 | 566 | ParseTreeWalker.DEFAULT.walk(this, tree); 567 | 568 | //System.err.println("before get()"); 569 | //long t3 = System.currentTimeMillis(); 570 | for (Future future : 
this.future_list) { 571 | ret.add(future.get()); 572 | } 573 | //long t4 = System.currentTimeMillis(); 574 | //System.err.println("time: " + (t4 - t3) / 1000.0); 575 | //System.err.println("after get()"); 576 | /* 577 | for (int i = 0; i < job_list.size(); i++) { // singlethread 578 | JobInstance j = job_list.get(i); 579 | BodyParser p = new BodyParser(); 580 | p.ParseString(j.string, j.functionInstance, j.line, j.enableSLL); 581 | ret.add(j.functionInstance); 582 | } 583 | */ 584 | 585 | } catch (Exception e) { 586 | e.printStackTrace(); 587 | this.executorService.shutdownNow(); 588 | return null; 589 | } 590 | this.executorService.shutdown(); 591 | return ret; 592 | } 593 | 594 | @Override 595 | public void enterEveryRule(ParserRuleContext ctx) { 596 | int ruleIndex = ctx.getRuleIndex(); 597 | 598 | if (ruleIndex == TreeParser.IDX[TreeParser.FUNCTION_DEF]) { 599 | this.funcDefFlag = 1; 600 | this.functionInstance = new function(this.srcFileName); 601 | this.functionInstance.parentNumLoc = this.numLines; 602 | this.functionInstance.funcId = this.future_list.size() + 1; 603 | this.functionInstance.lineStart = ctx.getStart().getLine(); 604 | this.functionInstance.lineStop = ctx.getStop().getLine(); 605 | } 606 | else if (this.funcDefFlag == 0) 607 | return; 608 | else if (ruleIndex == TreeParser.IDX[TreeParser.FUNCTION_NAME]) 609 | this.funcNameFlag = 1; 610 | else if (ruleIndex == TreeParser.IDX[TreeParser.PARAMETER_NAME]) 611 | this.paramNameFlag = 1; 612 | else if (ruleIndex == TreeParser.IDX[TreeParser.TYPE_NAME]) 613 | this.typeNameFlag = 1; 614 | else if (ruleIndex == TreeParser.IDX[TreeParser.COMPOUND_STMT]) 615 | this.compoundStmtFlag = 1; 616 | } 617 | 618 | @Override 619 | public void exitEveryRule(ParserRuleContext ctx) { 620 | int ruleIndex = ctx.getRuleIndex(); 621 | 622 | if (ruleIndex == TreeParser.IDX[TreeParser.FUNCTION_DEF] && this.funcDefFlag != 0) 623 | this.funcDefFlag = 0; 624 | else if (ruleIndex == TreeParser.IDX[TreeParser.FUNCTION_NAME] && this.funcNameFlag != 0) { 625 | this.functionInstance.name = this.funcNameStr.toString().trim(); 626 | this.funcNameFlag = 0; 627 | this.funcNameStr.setLength(0); 628 | } 629 | else if (ruleIndex == TreeParser.IDX[TreeParser.PARAMETER_NAME] && this.paramNameFlag != 0) { 630 | this.functionInstance.parameterList.add(this.paramNameStr.toString().trim()); 631 | this.paramNameFlag = 0; 632 | this.paramNameStr.setLength(0); 633 | } 634 | else if (ruleIndex == TreeParser.IDX[TreeParser.TYPE_NAME] && this.typeNameFlag != 0) { 635 | this.functionInstance.dataTypeList.add(this.typeNameStr.toString().trim()); 636 | this.typeNameFlag = 0; 637 | this.typeNameStr.setLength(0); 638 | } 639 | else if (ruleIndex == TreeParser.IDX[TreeParser.COMPOUND_STMT] && this.compoundStmtFlag != 0) { 640 | this.compoundStmtFlag = 0; 641 | 642 | CharStream inputStream = ctx.start.getInputStream(); 643 | int start_index = ctx.start.getStopIndex(); 644 | int stop_index = ctx.stop.getStopIndex(); 645 | String string = inputStream.getText(new Interval(start_index + 1, stop_index - 1)); 646 | int line = ctx.start.getLine(); 647 | 648 | // add function's body 649 | this.functionInstance.funcBody = string; 650 | 651 | //this.job_list.add(new JobInstance(string, this.functionInstance, line, this.enableSLL)); // for singlethread 652 | this.future_list.add( 653 | this.executorService.submit(new JobInstance(string, this.functionInstance, line, this.enableSLL)) 654 | ); // for multithread 655 | } 656 | } 657 | 658 | @Override 659 | public void visitTerminal(TerminalNode 
node) { 660 | if (this.compoundStmtFlag != 0 || this.funcDefFlag == 0) 661 | return; 662 | else if (this.funcNameFlag != 0) { 663 | this.funcNameStr.append(Trees.getNodeText(node, this.ruleNames)); 664 | this.funcNameStr.append(' '); 665 | } 666 | else if (this.paramNameFlag != 0) { 667 | this.paramNameStr.append(Trees.getNodeText(node, this.ruleNames)); 668 | this.paramNameStr.append(' '); 669 | } 670 | else if (this.typeNameFlag != 0) { 671 | this.typeNameStr.append(Trees.getNodeText(node, this.ruleNames)); 672 | this.typeNameStr.append(' '); 673 | } 674 | } 675 | 676 | @Override 677 | public void visitErrorNode(ErrorNode node) { } 678 | } 679 | 680 | class TreeParser1 implements ParseTreeListener { 681 | private static int IS_FIRST = 1; 682 | 683 | public final static int FUNCTION_DEF = 0; 684 | public final static int FUNCTION_NAME = 1; 685 | public final static int PARAMETER_NAME = 2; 686 | public final static int DECLARATOR = 3; 687 | public final static int TYPE_NAME = 4; 688 | public final static int FUNCTION_CALL = 5; 689 | public final static int COMPOUND_STMT = 6; 690 | 691 | private final static String[] table = {"function_def", "function_name", "parameter_name", 692 | "declarator", "type_name", "identifier", "compound_statement"}; 693 | private static int[] IDX = {0, 0, 0, 0, 0, 0, 0}; 694 | 695 | private static List ruleNames; 696 | 697 | //private ExecutorService executorService; 698 | //private List> future_list = new ArrayList>(); // for multithread 699 | //private List job_list = new ArrayList(); // for singlethread 700 | 701 | private List ret; 702 | 703 | private function functionInstance = null; 704 | 705 | // Function's name 706 | private int funcNameFlag = 0; 707 | private StringBuilder funcNameStr = new StringBuilder(); 708 | 709 | // Function parameter's name 710 | private int paramNameFlag = 0; 711 | private StringBuilder paramNameStr = new StringBuilder(); //final? 
712 | 713 | // type (return type, parameter type, local variable type) 714 | private int typeNameFlag = 0; 715 | private StringBuilder typeNameStr = new StringBuilder(); 716 | 717 | // function definition 718 | private int funcDefFlag = 0; 719 | 720 | // function body (compund_statement) 721 | private int compoundStmtFlag = 0; 722 | 723 | private String srcFileName; 724 | private int numLines = 0; 725 | 726 | // set SLL option 727 | private int enableSLL = 0; 728 | 729 | 730 | public TreeParser1() { 731 | this.ret = new ArrayList(); 732 | 733 | this.functionInstance = null; 734 | 735 | this.funcNameFlag = 0; 736 | this.funcNameStr = new StringBuilder(); 737 | 738 | this.paramNameFlag = 0; 739 | this.paramNameStr = new StringBuilder(); 740 | 741 | this.typeNameFlag = 0; 742 | this.typeNameStr = new StringBuilder(); 743 | 744 | this.funcDefFlag = 0; 745 | 746 | this.compoundStmtFlag = 0; 747 | 748 | this.enableSLL = 0; 749 | } 750 | 751 | private void _init(ModuleParser parser) { 752 | //this(); 753 | //this.executorService = Executors.newFixedThreadPool( 754 | // Runtime.getRuntime().availableProcessors() 755 | //); 756 | this.ret = new ArrayList(); 757 | 758 | this.functionInstance = null; 759 | 760 | this.funcNameFlag = 0; 761 | this.funcNameStr = new StringBuilder(); 762 | 763 | this.paramNameFlag = 0; 764 | this.paramNameStr = new StringBuilder(); 765 | 766 | this.typeNameFlag = 0; 767 | this.typeNameStr = new StringBuilder(); 768 | 769 | this.funcDefFlag = 0; 770 | 771 | this.compoundStmtFlag = 0; 772 | 773 | this.enableSLL = 0; 774 | 775 | 776 | if (TreeParser1.IS_FIRST != 0) { 777 | this.ruleNames = Arrays.asList(parser.getRuleNames()); 778 | 779 | for (int i = 0; i < parser.ruleNames.length; i++) { 780 | for (int j = 0; j < TreeParser1.table.length; j++) { 781 | if (parser.ruleNames[i].equals(TreeParser1.table[j])) 782 | TreeParser1.IDX[j] = i; 783 | } 784 | } 785 | } 786 | TreeParser1.IS_FIRST = 0; 787 | } 788 | 789 | public List ParseFile(String srcFileName) { 790 | return this.ParseFile(srcFileName, 1); 791 | } 792 | public List ParseFile(String srcFileName, int bSLL) { 793 | try { 794 | ANTLRFileStream antlrFileStream = new ANTLRFileStream(srcFileName); 795 | ModuleLexer lexer = new ModuleLexer(antlrFileStream); 796 | CommonTokenStream tokens = new CommonTokenStream(lexer); 797 | ModuleParser parser = new ModuleParser(tokens); 798 | parser.removeErrorListeners(); // remove error listener 799 | 800 | if (bSLL != 0) { 801 | parser.getInterpreter().setPredictionMode(PredictionMode.SLL); 802 | parser.setErrorHandler(new BailErrorStrategy()); 803 | } 804 | 805 | //long t1 = System.currentTimeMillis(); 806 | ParseTree tree; 807 | try { 808 | tree = parser.code(); 809 | } 810 | catch (ParseCancellationException e) { 811 | parser.reset(); 812 | parser.getInterpreter().setPredictionMode(PredictionMode.LL); 813 | parser.setErrorHandler(new DefaultErrorStrategy()); 814 | tree = parser.code(); 815 | } 816 | //long t2 = System.currentTimeMillis(); 817 | //System.err.println("time: " + (t2 - t1) / 1000.0); 818 | this._init(parser); // reset before traverse a parse tree 819 | this.enableSLL = bSLL; 820 | 821 | LineNumberReader lnr = new LineNumberReader(new FileReader(new File(srcFileName))); 822 | while (lnr.skip(Long.MAX_VALUE) > 0); 823 | this.numLines = lnr.getLineNumber() + 1; 824 | lnr.close(); 825 | 826 | this.srcFileName = new String(srcFileName); 827 | 828 | ParseTreeWalker.DEFAULT.walk(this, tree); 829 | 830 | //System.err.println("before get()"); 831 | //long t3 = 
System.currentTimeMillis(); 832 | //for (Future future : this.future_list) { 833 | // ret.add(future.get()); 834 | //} 835 | //long t4 = System.currentTimeMillis(); 836 | //System.err.println("time: " + (t4 - t3) / 1000.0); 837 | //System.err.println("after get()"); 838 | /* 839 | for (int i = 0; i < job_list.size(); i++) { // singlethread 840 | JobInstance j = job_list.get(i); 841 | BodyParser p = new BodyParser(); 842 | p.ParseString(j.string, j.functionInstance, j.line, j.enableSLL); 843 | ret.add(j.functionInstance); 844 | } 845 | */ 846 | 847 | } catch (Exception e) { 848 | e.printStackTrace(); 849 | //this.executorService.shutdownNow(); 850 | return null; 851 | } 852 | //this.executorService.shutdown(); 853 | return this.ret; 854 | } 855 | 856 | @Override 857 | public void enterEveryRule(ParserRuleContext ctx) { 858 | int ruleIndex = ctx.getRuleIndex(); 859 | 860 | if (ruleIndex == TreeParser1.IDX[TreeParser1.FUNCTION_DEF]) { 861 | this.funcDefFlag = 1; 862 | this.functionInstance = new function(this.srcFileName); 863 | this.functionInstance.parentNumLoc = this.numLines; 864 | this.functionInstance.funcId = this.ret.size() + 1; 865 | this.functionInstance.lineStart = ctx.getStart().getLine(); 866 | this.functionInstance.lineStop = ctx.getStop().getLine(); 867 | } 868 | else if (this.funcDefFlag == 0) 869 | return; 870 | else if (ruleIndex == TreeParser1.IDX[TreeParser1.FUNCTION_NAME]) 871 | this.funcNameFlag = 1; 872 | else if (ruleIndex == TreeParser1.IDX[TreeParser1.PARAMETER_NAME]) 873 | this.paramNameFlag = 1; 874 | else if (ruleIndex == TreeParser1.IDX[TreeParser1.TYPE_NAME]) 875 | this.typeNameFlag = 1; 876 | else if (ruleIndex == TreeParser1.IDX[TreeParser1.COMPOUND_STMT]) 877 | this.compoundStmtFlag = 1; 878 | } 879 | 880 | @Override 881 | public void exitEveryRule(ParserRuleContext ctx) { 882 | int ruleIndex = ctx.getRuleIndex(); 883 | 884 | if (ruleIndex == TreeParser1.IDX[TreeParser1.FUNCTION_DEF] && this.funcDefFlag != 0) { 885 | this.ret.add(this.functionInstance); 886 | this.funcDefFlag = 0; 887 | } 888 | else if (ruleIndex == TreeParser1.IDX[TreeParser1.FUNCTION_NAME] && this.funcNameFlag != 0) { 889 | this.functionInstance.name = this.funcNameStr.toString().trim(); 890 | this.funcNameFlag = 0; 891 | this.funcNameStr.setLength(0); 892 | } 893 | else if (ruleIndex == TreeParser1.IDX[TreeParser1.PARAMETER_NAME] && this.paramNameFlag != 0) { 894 | this.functionInstance.parameterList.add(this.paramNameStr.toString().trim()); 895 | this.paramNameFlag = 0; 896 | this.paramNameStr.setLength(0); 897 | } 898 | else if (ruleIndex == TreeParser1.IDX[TreeParser1.TYPE_NAME] && this.typeNameFlag != 0) { 899 | this.functionInstance.dataTypeList.add(this.typeNameStr.toString().trim()); 900 | this.typeNameFlag = 0; 901 | this.typeNameStr.setLength(0); 902 | } 903 | else if (ruleIndex == TreeParser1.IDX[TreeParser1.COMPOUND_STMT] && this.compoundStmtFlag != 0) { 904 | this.compoundStmtFlag = 0; 905 | 906 | CharStream inputStream = ctx.start.getInputStream(); 907 | int start_index = ctx.start.getStopIndex(); 908 | int stop_index = ctx.stop.getStopIndex(); 909 | String string = inputStream.getText(new Interval(start_index + 1, stop_index - 1)); 910 | int line = ctx.start.getLine(); 911 | 912 | // add function's body 913 | this.functionInstance.funcBody = string; 914 | 915 | //this.job_list.add(new JobInstance(string, this.functionInstance, line, this.enableSLL)); // for singlethread 916 | //this.future_list.add( 917 | // this.executorService.submit(new JobInstance(string, 
this.functionInstance, line, this.enableSLL)) 918 | //); // for multithread 919 | } 920 | } 921 | 922 | @Override 923 | public void visitTerminal(TerminalNode node) { 924 | if (this.compoundStmtFlag != 0 || this.funcDefFlag == 0) 925 | return; 926 | else if (this.funcNameFlag != 0) { 927 | this.funcNameStr.append(Trees.getNodeText(node, this.ruleNames)); 928 | this.funcNameStr.append(' '); 929 | } 930 | else if (this.paramNameFlag != 0) { 931 | this.paramNameStr.append(Trees.getNodeText(node, this.ruleNames)); 932 | this.paramNameStr.append(' '); 933 | } 934 | else if (this.typeNameFlag != 0) { 935 | this.typeNameStr.append(Trees.getNodeText(node, this.ruleNames)); 936 | this.typeNameStr.append(' '); 937 | } 938 | } 939 | 940 | @Override 941 | public void visitErrorNode(ErrorNode node) { } 942 | } 943 | -------------------------------------------------------------------------------- /FuncParser-opt/src/Module.g4: -------------------------------------------------------------------------------- 1 | grammar Module; 2 | 3 | import ModuleLex, Expressions, Common, FunctionDef, CoarseSimpleDecl; 4 | 5 | /* 6 | Copyright (C) 2013 Fabian 'fabs' Yamaguchi 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | 17 | You should have received a copy of the GNU General Public License 18 | along with this program. If not, see . 19 | */ 20 | /* 21 | @header{ 22 | package antlr.C; 23 | } 24 | */ 25 | code : (function_def | simple_decl | using_directive | water)*; 26 | 27 | using_directive: USING NAMESPACE identifier ';'; 28 | 29 | -------------------------------------------------------------------------------- /FuncParser-opt/src/ModuleLex.g4: -------------------------------------------------------------------------------- 1 | lexer grammar ModuleLex; 2 | 3 | // Keywords shared among C/C++/Java 4 | 5 | IF: 'if'; ELSE: 'else'; FOR: 'for'; WHILE: 'while'; 6 | 7 | BREAK: 'break'; CASE: 'case'; CONTINUE: 'continue'; 8 | SWITCH: 'switch'; DO: 'do'; 9 | 10 | GOTO: 'goto'; RETURN: 'return'; 11 | 12 | TYPEDEF: 'typedef'; 13 | VOID: 'void'; UNSIGNED: 'unsigned'; SIGNED: 'signed'; 14 | LONG: 'long'; CV_QUALIFIER : 'const' | 'volatile'; 15 | 16 | // Keywords shared among C++/Java 17 | 18 | VIRTUAL: 'virtual'; 19 | TRY: 'try'; CATCH: 'catch'; THROW: 'throw'; 20 | USING: 'using'; NAMESPACE: 'namespace'; 21 | 22 | // Keywords shared among C/C++ 23 | 24 | AUTO: 'auto'; REGISTER: 'register'; 25 | 26 | // C++ keywords 27 | 28 | OPERATOR: 'operator'; 29 | TEMPLATE: 'template'; 30 | NEW: 'new'; 31 | 32 | CLASS_KEY: ('struct' | 'class' | 'union' | 'enum'); 33 | 34 | ALPHA_NUMERIC: [a-zA-Z_~][a-zA-Z0-9_]*; 35 | 36 | OPENING_CURLY: '{'; 37 | CLOSING_CURLY: '}'; 38 | 39 | // pre-processor directives: C/C++ 40 | 41 | PRE_IF: ('#if' | '#ifdef' | '#ifndef') ~[\r\n]* '\r'? '\n'; 42 | PRE_ELSE: ('#else' | '#elif') ~[\r\n]* '\r'? '\n'; 43 | PRE_ENDIF: '#endif' ~[\r\n]* '\r'? '\n'; 44 | // PREPROC : '#' ~[\r\n]* '\r'? '\n' -> skip; 45 | 46 | 47 | HEX_LITERAL : '0' ('x'|'X') HexDigit+ IntegerTypeSuffix? ; 48 | DECIMAL_LITERAL : ('0' | '1'..'9' '0'..'9'*) IntegerTypeSuffix? 
; 49 | OCTAL_LITERAL : '0' ('0'..'7')+ IntegerTypeSuffix? ; 50 | 51 | FLOATING_POINT_LITERAL 52 | : ('0'..'9')+ '.' ('0'..'9')* Exponent? FloatTypeSuffix? 53 | | '.' ('0'..'9')+ Exponent? FloatTypeSuffix? 54 | | ('0'..'9')+ Exponent FloatTypeSuffix? 55 | | ('0'..'9')+ Exponent? FloatTypeSuffix 56 | ; 57 | 58 | CHAR 59 | : '\'' ( EscapeSequence | ~('\''|'\\') ) '\'' 60 | ; 61 | 62 | STRING 63 | : '"' ( EscapeSequence | ~('\\'|'"') )* '"' 64 | ; 65 | 66 | 67 | fragment 68 | IntegerTypeSuffix 69 | : ('u'|'U')? ('l'|'L') 70 | | ('u'|'U') ('l'|'L')? 71 | ; 72 | 73 | fragment 74 | Exponent : ('e'|'E') ('+'|'-')? ('0'..'9')+; 75 | 76 | fragment 77 | FloatTypeSuffix : ('f'|'F'|'d'|'D'); 78 | 79 | 80 | fragment 81 | EscapeSequence 82 | : '\\' . 83 | | UnicodeEscape 84 | | OctalEscape 85 | ; 86 | 87 | fragment 88 | OctalEscape 89 | : '\\' ('0'..'3') ('0'..'7') ('0'..'7') 90 | | '\\' ('0'..'7') ('0'..'7') 91 | | '\\' ('0'..'7') 92 | ; 93 | 94 | fragment 95 | UnicodeEscape 96 | : '\\' 'u' HexDigit HexDigit HexDigit HexDigit 97 | ; 98 | 99 | fragment 100 | HexDigit : ('0'..'9'|'a'..'f'|'A'..'F') ; 101 | 102 | COMMENT 103 | : '/*' .*? '*/' -> skip 104 | ; 105 | WHITESPACE : [ \r\t\u000C\n]+ -> skip 106 | ; 107 | 108 | CPPCOMMENT 109 | : '//' ~[\r\n]* '\r'? '\n' -> skip 110 | ; 111 | 112 | OTHER : . -> skip ; 113 | -------------------------------------------------------------------------------- /FuncParser-opt/src/SimpleDecl.g4: -------------------------------------------------------------------------------- 1 | grammar SimpleDecl; 2 | 3 | simple_decl : (TYPEDEF? template_decl_start?) var_decl; 4 | 5 | var_decl : class_def init_declarator_list? #declByClass 6 | | type_name init_declarator_list #declByType 7 | ; 8 | 9 | init_declarator_list: init_declarator (',' init_declarator)* ';'; 10 | 11 | initializer: assign_expr 12 | |'{' initializer_list '}' 13 | ; 14 | 15 | initializer_list: initializer (',' initializer)*; 16 | 17 | 18 | class_def: CLASS_KEY class_name? base_classes? OPENING_CURLY {skipToEndOfObject(); } ; 19 | class_name: identifier; 20 | base_classes: ':' base_class (',' base_class)*; 21 | base_class: VIRTUAL? access_specifier? identifier; 22 | 23 | type_name : (CV_QUALIFIER* (CLASS_KEY | UNSIGNED | SIGNED)? 24 | base_type ('<' template_param_list '>')? ('::' base_type ('<' template_param_list '>')? )*) CV_QUALIFIER? 25 | | UNSIGNED 26 | | SIGNED 27 | ; 28 | 29 | 30 | base_type: (ALPHA_NUMERIC | VOID | LONG | LONG)+; 31 | 32 | // Parameters 33 | 34 | param_decl_specifiers : (AUTO | REGISTER)? type_name; 35 | 36 | // this is a bit misleading. We're just allowing access_specifiers 37 | // here because C programs can use 'public', 'protected' or 'private' 38 | // as variable names. 39 | 40 | parameter_name: identifier; 41 | 42 | param_type_list: '(' VOID ')' 43 | | '(' (param_type (',' param_type)*)? ')'; 44 | 45 | param_type: param_decl_specifiers param_type_id; 46 | param_type_id: ptrs? ('(' param_type_id ')' | parameter_name?) 
type_suffix?; 48 | 49 | // operator-identifiers not implemented 50 | identifier : (ALPHA_NUMERIC ('::' ALPHA_NUMERIC)*) | access_specifier; 51 | number: HEX_LITERAL | DECIMAL_LITERAL | OCTAL_LITERAL; 52 | 53 | ptrs: (ptr_operator 'restrict'?)+; -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Seulbae Kim 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the "Software"), 7 | to deal in the Software without restriction, including without limitation 8 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | and/or sell copies of the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included 13 | in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 16 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 | IN THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VUDDY (a.k.a. `hmark`) 2 | VUDDY is an approach for **scalable** and **accurate** vulnerable code clone 3 | detection. This approach is specifically designed to accurately find 4 | vulnerabilities in massive code bases (e.g., Linux kernel, 25 MLoC). 5 | Its principles and results are discussed in our 6 | [paper](https://ccs.korea.ac.kr/pds/SNP17.pdf), which was published at the 38th 7 | IEEE Symposium on Security and Privacy (S&P'17). 8 | 9 | `hmark` is the implementation of VUDDY. It is the client-side preprocessing 10 | tool for "Vulnerable Code Clone Detection" testing provided by 11 | [IoTcube](https://iotcube.net), an automated vulnerability testing platform. 12 | Detailed instructions are available [here](https://iotcube.net/userguide/manual/hmark). 13 | 14 | This project was funded by IITP (Development of Vulnerability Discovery Technologies 15 | for IoT Software Security), and was conducted at [CSSA](https://cssa.korea.ac.kr) 16 | (Center for Software Security and Assurance) at Korea University. 17 | 18 | ## Getting Started with `hmark` 19 | 20 | ### Prerequisites 21 | - **Linux or OS X** - *hmark* is designed to work on any of these operating 22 | systems. Tested OS distributions include Ubuntu 14.04, 16.04, and 18.04, 23 | Fedora 25, and OS X. Let me know if your OS is not supported. 24 | - Confirmed in May 2024: VUDDY works seamlessly on Ubuntu 22.04 25 | - Confirmed in Jan 2025: VUDDY also works on Windows 10 26 | - **Python 3** - VUDDY is now fully compatible with Python 3 (Jan 2025 onwards) 27 | - **python-tk** package - (only required if you want the GUI) install it from your 28 | package manager 29 | - **Java Runtime Environment (JRE)** - We recommend openjdk-8-jre. A quick install sketch for Debian/Ubuntu-like systems is shown below.
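The following is a minimal sketch of installing and checking these prerequisites on a Debian/Ubuntu-like system. The package names `python3-tk` and `openjdk-8-jre` are assumptions inferred from the list above and from `dep.sh` (which installs `python-tk` via `apt-get`); adjust them for your distribution, and skip `python3-tk` if you only use the CLI mode.
```
# Assumed package names; the repo's dep.sh installs python-tk the same way.
sudo apt-get -y install python3 python3-tk openjdk-8-jre
python3 --version   # confirm Python 3 is available
java -version       # confirm a JRE is available
```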
30 | 31 | ### Running `hmark` and checking the result on IoTcube (our web service) 32 | 1. `cd hmark` 33 | 2. `python hmark.py [-h] [-c path ON/OFF] [-n] [-V]` 34 | 35 | You can see the help message below by passing an `-h` (or `--help`) argument. 36 | ``` 37 | usage: python hmark.py [-h] [-c path ON/OFF] [-n] [-V] 38 | 39 | - optional arguments: 40 | -h, --help show this help message and exit 41 | 42 | -c path ON/OFF, --cli-mode path ON/OFF 43 | run hmark without GUI by specifying the path to the 44 | target directory, and the abstraction mode 45 | -n, --no-updatecheck bypass update checking (not recommended) 46 | -V, --version print hmark version and exit 47 | ``` 48 | 3. Upload the resulting `hidx` file on IoTcube's [Vulnerable Code Clone 49 | Detection](https://iotcube.net/process/type/wf1) testing. 50 | 51 | ### Running `hmark` and checking the result locally 52 | Follow steps 1 and 2 above to generate the `hidx` of the target program. 53 | Skip step 3. 54 | 55 | 4. To build your own vulnerability database, checkout `vulnDBGen`, 56 | which is a subrepo of this repository and follow the guidelines 57 | to build a vulnerability database locally. 58 | ``` 59 | $ git submodule update --init 60 | $ cd vulnDBGen 61 | $ cat docs/examples.md 62 | ``` 63 | 64 | 5. After building your own vulnerability database, you can locally run the 65 | vulnerable clone checker: 66 | ``` 67 | $ cd .. 68 | $ python3 checker/check_clones.py --help 69 | $ python3 checker/check_clones.py --target path_to_target_hidx --database path_to_vulndb 70 | ``` 71 | 72 | ### Binary Release 73 | Instead of running `hmark` from source code, you can also download and execute 74 | prebuilt binaries. Binaries for Windows, Linux, and OS X are available 75 | [here](https://iotcube.net/downloads). 76 | 77 | ## Reporting Bugs 78 | For reporting bugs, you can [submit an 79 | issue](https://github.com/iotcube/hmark/issues) to the VUDDY GitHub, or send 80 | me an email. Feel free to send pull 81 | requests if you have suggestions or bugfixes! 
82 | 83 | ## About 84 | This program is authored and maintained by **Seulbae Kim** 85 | > GitHub [@seulbae-security](https://github.com/seulbae-security) / seulbae@postech.ac.kr 86 | 87 | ## TODOs 88 | Please feel free to submit pull requests for the following items: 89 | * Rewrite everything in Python3 90 | * Use a better parser 91 | * Replace all code that rely on stdin/stdout for IPC (e.g., git executions) with API calls 92 | 93 | -------------------------------------------------------------------------------- /checker/check_clones.py: -------------------------------------------------------------------------------- 1 | #/usr/bin/python3 2 | 3 | import os 4 | import argparse 5 | 6 | def load_hidx(hidx_file): 7 | len_hash_dict = dict() 8 | hash_file_dict = dict() 9 | 10 | with open(hidx_file, "r") as f: 11 | lines = f.readlines() 12 | 13 | init = True 14 | delim = False 15 | for line in lines: 16 | ls = line.strip() 17 | if init: 18 | init = False 19 | continue # skip first line (metadata) 20 | 21 | if len(ls) == 0: 22 | continue 23 | 24 | if "=====" in ls: 25 | delim = True 26 | continue 27 | 28 | tokens = ls.split("\t") 29 | 30 | if not delim: # before delimiter: len to hash list 31 | func_len = int(tokens[0]) 32 | hash_list = set(tokens[1:]) 33 | len_hash_dict[func_len] = hash_list 34 | 35 | else: # after delimiter: hash to file and line 36 | hash_val = tokens[0] 37 | file_name = tokens[1] 38 | line_num = tokens[2] 39 | hash_file_dict[hash_val] = [file_name, line_num] 40 | 41 | return hidx_file, len_hash_dict, hash_file_dict 42 | 43 | 44 | if __name__ == "__main__": 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument( 47 | "-t", 48 | "--target", 49 | type=str, 50 | required=True, 51 | help="hidx of the target program to find vulnerable clones" 52 | ) 53 | 54 | parser.add_argument( 55 | "-d", 56 | "--database", 57 | type=str, 58 | required=True, 59 | help="Path to the directory storing vulnerablility database hidx generated by vulnDBGen" 60 | ) 61 | 62 | args = parser.parse_args() 63 | 64 | if not os.path.exists(args.target): 65 | print(f"[-] {args.target} does not exist") 66 | exit(1) 67 | 68 | if not args.target.endswith(".hidx"): 69 | print(f"[-] {args.target} does not appear to be a hidx file") 70 | exit(1) 71 | 72 | if not os.path.exists(args.database): 73 | print(f"[-] {args.database} does not exist") 74 | exit(1) 75 | 76 | target, target_len_hash_dict, target_hash_file_dict = load_hidx(args.target) 77 | 78 | vdb_list = list() 79 | vdb_len_hash_dict_list = list() 80 | vdb_hash_file_dict_list = list() 81 | 82 | for hidx_file in os.listdir(args.database): 83 | file_ = os.path.join(args.database, hidx_file) 84 | 85 | vdb, vdb_len_hash_dict, vdb_hash_file_dict = load_hidx(file_) 86 | vdb_list.append(vdb) 87 | vdb_len_hash_dict_list.append(vdb_len_hash_dict) 88 | vdb_hash_file_dict_list.append(vdb_hash_file_dict) 89 | 90 | collision_set = set() 91 | 92 | for vdb_idx, vdb in enumerate(vdb_list): 93 | print(f"Target {target} vs VDB {vdb}") 94 | for func_len in target_len_hash_dict: 95 | if func_len not in vdb_len_hash_dict_list[vdb_idx]: 96 | continue 97 | 98 | target_hash_list = target_len_hash_dict[func_len] 99 | vdb_hash_list = vdb_len_hash_dict_list[vdb_idx][func_len] 100 | 101 | collision = target_hash_list.intersection(vdb_hash_list) 102 | 103 | if len(collision) == 0: 104 | continue 105 | 106 | collision_set.update(collision) 107 | 108 | for hash_ in collision_set: 109 | print(hash_) 110 | file_info = target_hash_file_dict[hash_] 111 | vuln_info = 
vdb_hash_file_dict_list[vdb_idx][hash_] 112 | print(f"[+] {file_info[1]}-th function in {file_info[0]}" 113 | f"is a clone of vulnerability at {vuln_info[0]}") 114 | 115 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import platform 2 | 3 | """ 4 | [Note for Windows] 5 | - Use '\\' or '/' in path 6 | Ex) gitStoragePath = "D:\\Source\\gitrepos" 7 | - Install 'Git for Windows' 8 | - Windows version of VUDDY use its own JRE 9 | 10 | [Note for POSIX] 11 | - Use '/' for path 12 | Ex) gitStoragePath = "/home/ubuntu/gitrepos/" 13 | - Java binary is only needed in POSIX 14 | """ 15 | 16 | gitStoragePath = "/home/ubuntu/gitrepos/" 17 | 18 | pf = platform.platform() 19 | if "Windows" in pf: # Windows 20 | gitBinary = "C:\\Program Files\\Git\\bin\\git.exe" 21 | diffBinary = "C:\\Program Files\\Git\\usr\\bin\\diff.exe" 22 | else: # POSIX 23 | gitBinary = "git" 24 | diffBinary = "diff" 25 | javaBinary = "java" 26 | -------------------------------------------------------------------------------- /dep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt-get -y install python-tk 4 | 5 | -------------------------------------------------------------------------------- /docs/examples.md: -------------------------------------------------------------------------------- 1 | Vulnerability Database Generator (vulnDBGen) - Use Examples 2 | ==== 3 | 4 | ## 1. Configuration Settings 5 | Set `gitStoragePath`, `gitBinary`, `diffBinary`, and `javaBinary` in `config.py`. 6 | ``` 7 | ~$ cd ~/vulnDBGen 8 | ~/vulnDBGen$ cat config.py 9 | ``` 10 | Result 11 | ``` 12 | import platform 13 | 14 | gitStoragePath = r"/home/squizz/gitrepos" 15 | version = "3.0.3" # for use in IoTcube. 16 | pf = platform.platform() 17 | if "Windows" in pf: # Windows 18 | gitBinary = r"C:\Program Files\Git\bin\git.exe" 19 | diffBinary = r"C:\Program Files\Git\usr\bin\diff.exe" 20 | else: # POSIX 21 | gitBinary = "git" 22 | diffBinary = "diff" 23 | javaBinary = "java" 24 | 25 | ``` 26 | 27 | ## 2. Cloning repositories and collecting vulnerabilities 28 | 29 | ### A. ChakraCore (Microsoft) 30 | ``` 31 | ~$ cd ~/gitrepos 32 | ~/gitrepos$ git clone https://github.com/Microsoft/ChakraCore.git 33 | 34 | ~$ cd ~/vulnDBGen 35 | ~/vulnDBGen$ python initialize.py 36 | ~/vulnDBGen$ python src/get_cvepatch_from_git.py ChakraCore 37 | ~/vulnDBGen$ python src/get_source_from_cvepatch.py ChakraCore 38 | ``` 39 | 40 | ### B. FreeBSD (FreeBSD Foundation) 41 | ``` 42 | ~$ cd ~/gitrepos 43 | ~/gitrepos$ git clone https://github.com/freebsd/freebsd.git 44 | 45 | ~$ cd ~/vulnDBGen 46 | ~/vulnDBGen$ python initialize.py 47 | ~/vulnDBGen$ python src/get_cvepatch_from_git.py freebsd 48 | ~/vulnDBGen$ python src/get_source_from_cvepatch.py freebsd 49 | ``` 50 | 51 | ### C. Gecko (Mozilla) 52 | ``` 53 | ~$ cd ~/gitrepos 54 | ~/gitrepos$ git clone https://github.com/mozilla/gecko-dev.git 55 | 56 | ~$ cd ~/vulnDBGen 57 | ~/vulnDBGen$ python initialize.py 58 | ~/vulnDBGen$ python src/get_cvepatch_from_git.py gecko-dev 59 | ~/vulnDBGen$ python src/get_source_from_cvepatch.py gecko-dev 60 | ``` 61 | 62 | ### D. 
glibc (GNU) 63 | ``` 64 | ~$ cd ~/gitrepos 65 | ~/gitrepos$ git clone git://sourceware.org/git/glibc.git 66 | 67 | ~$ cd ~/vulnDBGen 68 | ~/vulnDBGen$ python initialize.py 69 | ~/vulnDBGen$ python src/get_cvepatch_from_git.py glibc 70 | ~/vulnDBGen$ python src/get_source_from_cvepatch.py glibc 71 | ``` 72 | 73 | ### E. httpd (APACHE) 74 | ``` 75 | ~$ cd ~/gitrepos 76 | ~/gitrepos$ git clone https://github.com/apache/httpd.git 77 | 78 | ~$ cd ~/vulnDBGen 79 | ~/vulnDBGen$ python initialize.py 80 | ~/vulnDBGen$ python src/get_cvepatch_from_git.py httpd 81 | ~/vulnDBGen$ python src/get_source_from_cvepatch.py httpd 82 | ``` 83 | 84 | ### F. Kerberos Version 5 (MIT) 85 | ``` 86 | ~$ cd ~/gitrepos 87 | ~/gitrepos$ git clone https://github.com/krb5/krb5.git 88 | 89 | ~$ cd ~/vulnDBGen 90 | ~/vulnDBGen$ python initialize.py 91 | ~/vulnDBGen$ python src/get_cvepatch_from_git.py krb5 92 | ~/vulnDBGen$ python src/get_source_from_cvepatch.py krb5 93 | ``` 94 | 95 | ### G. Linux kernel (Linux Foundation) 96 | ``` 97 | ~$ cd ~/gitrepos 98 | ~/gitrepos$ git clone git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 99 | 100 | ~$ cd ~/vulnDBGen 101 | ~/vulnDBGen$ python initialize.py 102 | ~/vulnDBGen$ python src/get_cvepatch_from_git.py linux 103 | ~/vulnDBGen$ python src/get_source_from_cvepatch.py linux 104 | ``` 105 | 106 | ### H. OpenSSL (OpenSSL Software Foundation) 107 | ``` 108 | ~$ cd ~/gitrepos 109 | ~/gitrepos$ git clone https://github.com/openssl/openssl.git 110 | 111 | ~$ cd ~/vulnDBGen 112 | ~/vulnDBGen$ python initialize.py 113 | ~/vulnDBGen$ python src/get_cvepatch_from_git.py openssl 114 | ~/vulnDBGen$ python src/get_source_from_cvepatch.py openssl 115 | ``` 116 | 117 | ### I. PostgreSQL DBMS (The PostgreSQL Global Development Group) 118 | ``` 119 | ~$ cd ~/gitrepos 120 | ~/gitrepos$ git clone https://github.com/postgres/postgres.git 121 | 122 | ~$ cd ~/vulnDBGen 123 | ~/vulnDBGen$ python initialize.py 124 | ~/vulnDBGen$ python src/get_cvepatch_from_git.py postgres 125 | ~/vulnDBGen$ python src/get_source_from_cvepatch.py postgres 126 | ``` 127 | 128 | ### J. Ubuntu Trusty (Canonical Ltd.) 129 | ``` 130 | ~$ cd ~/gitrepos 131 | ~/gitrepos$ git clone git://kernel.ubuntu.com/ubuntu/ubuntu-trusty.git 132 | 133 | ~$ cd ~/vulnDBGen 134 | ~/vulnDBGen$ python initialize.py 135 | ~/vulnDBGen$ python src/get_cvepatch_from_git.py ubuntu-trusty 136 | ~/vulnDBGen$ python src/get_source_from_cvepatch.py ubuntu-trusty 137 | ``` 138 | 139 | ## 3. 
Filtering vulnerabilities and generating hash index 140 | 141 | ### From all repositories 142 | ``` 143 | ~$ cd ~/vulnDBGen 144 | ~/vulnDBGen$ python src/vul_dup_remover.py 145 | ~/vulnDBGen$ python src/vul_verifier.py 146 | 147 | ~/vulnDBGen$ python src/vul_hidx_generator.py -a 0 ChakraCore 148 | ~/vulnDBGen$ python src/vul_hidx_generator.py -a 4 ChakraCore 149 | 150 | ~/vulnDBGen$ python src/vul_hidx_generator.py -a 0 freebsd 151 | ~/vulnDBGen$ python src/vul_hidx_generator.py -a 4 freebsd 152 | 153 | ~/vulnDBGen$ python src/vul_hidx_generator.py -a 0 gecko-dev 154 | ~/vulnDBGen$ python src/vul_hidx_generator.py -a 4 gecko-dev 155 | 156 | ~/vulnDBGen$ python src/vul_hidx_generator.py -a 0 glibc 157 | ~/vulnDBGen$ python src/vul_hidx_generator.py -a 4 glibc 158 | 159 | ~/vulnDBGen$ python src/vul_hidx_generator.py -a 0 httpd 160 | ~/vulnDBGen$ python src/vul_hidx_generator.py -a 4 httpd 161 | 162 | ~/vulnDBGen$ python src/vul_hidx_generator.py -a 0 krb5 163 | ~/vulnDBGen$ python src/vul_hidx_generator.py -a 4 krb5 164 | 165 | ~/vulnDBGen$ python src/vul_hidx_generator.py -a 0 linux 166 | ~/vulnDBGen$ python src/vul_hidx_generator.py -a 4 linux 167 | 168 | ~/vulnDBGen$ python src/vul_hidx_generator.py -a 0 openssl 169 | ~/vulnDBGen$ python src/vul_hidx_generator.py -a 4 openssl 170 | 171 | ~/vulnDBGen$ python src/vul_hidx_generator.py -a 0 postgres 172 | ~/vulnDBGen$ python src/vul_hidx_generator.py -a 4 postgres 173 | 174 | ~/vulnDBGen$ python src/vul_hidx_generator.py -a 0 ubuntu-trusty 175 | ~/vulnDBGen$ python src/vul_hidx_generator.py -a 4 ubuntu-trusty 176 | ``` 177 | -------------------------------------------------------------------------------- /docs/취약점 데이터베이스 생성 솔루션 매뉴얼 V1.0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/squizz617/vuddy/33cdab1ad04a6dcc76011b92821dbeb055c6691e/docs/취약점 데이터베이스 생성 솔루션 매뉴얼 V1.0.pdf -------------------------------------------------------------------------------- /hmark/FuncParser-opt.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/squizz617/vuddy/33cdab1ad04a6dcc76011b92821dbeb055c6691e/hmark/FuncParser-opt.jar -------------------------------------------------------------------------------- /hmark/README.md: -------------------------------------------------------------------------------- 1 | # hmark - Hash index generator for IoTcube's vulnerable code clone detection. 2 | *hmark* is the preprocessor for the "vulnerable code clone detection" 3 | in IoTcube (https://iotcube.korea.ac.kr). 4 | 5 | ## How to run 6 | This documentation addresses how to run hmark on various platforms, 7 | on the basis of *hmark* version 3.0.3. 8 | *hmark* has no application-specific requirements. 9 | 10 | ### Running on Linux 11 | Ubuntu 14.04 and 16.04 (32 and 64-bits) are officially supported by *hmark*. 12 | 1. Change access permissions if necesary. 13 | - 32-bit system: $ sudo chmod a+x hmark_3.0.3_linux_x86 14 | - 64-bit system: $ sudo chmod a+x hmark_3.0.3_linux_x64 15 | 2. Run with or without optional arguments. 16 | - In terminal: 17 | - 32-bit system: $ ./hmark_3.0.3_linux_x86 [-h] [-c path ON/OFF] [-n] [-V] 18 | - 64-bit system: $ ./hmark_3.0.3_linux_x64 [-h] [-c path ON/OFF] [-n] [-V] 19 | - Graphic user interface: 20 | - You can launch app in GUI (e.g., in Nautilus), 21 | but you cannot pass command line arguments. 22 | 23 | ### Running on Mac OS X (macOS) 24 | *hmark* for MAC supports 64-bit architecture. 25 | 1. 
Change access permissions if necessary. 26 | - $ sudo chmod a+x hmark_3.0.3_osx 27 | 2. Run with or without optional arguments. 28 | - In terminal: 29 | - $ ./hmark_3.0.3_osx [-h] [-c path ON/OFF] [-n] [-V] 30 | 31 | ### Running on Windows 32 | *hmark* works on both 32-bit and 64-bit windows. 33 | The execution is tested on Windows 7, 8, and 10. 34 | 1. Execute the application. 35 | - In terminal: 36 | - 32-bit system: hmark_3.0.3_win_x86.exe [-h] [-c path ON/OFF] [-n] [-V] 37 | - 64-bit system: hmark_3.0.3_win_x64.exe [-h] [-c path ON/OFF] [-n] [-V] 38 | - Graphic user interface: 39 | - You can launch app in GUI (e.g., in Explorer), 40 | but you cannot pass command line arguments. 41 | 42 | ## Optional arguments 43 | You can see the help message below by passing an `-h` (or `--help`) argument. 44 | ``` 45 | usage: ./hmark_3.0.3_linux_x64 [-h] [-c path ON/OFF] [-n] [-V] 46 | 47 | - optional arguments: 48 | -h, --help show this help message and exit 49 | 50 | -c path ON/OFF, --cli-mode path ON/OFF 51 | run hmark without GUI by specifying the path to the 52 | target directory, and the abstraction mode 53 | -n, --no-updatecheck bypass update checking (not recommended) 54 | -V, --version print hmark version and exit 55 | ``` 56 | 57 | ## Troubleshooting 58 | 1. Cannot execute *hmark* in GUI mode. 59 | - Some systems might require you to install several packages. 60 | - Oftentimes, `sudo apt-get install python-tk` will do. 61 | - If not, please contact us (cssa@korea.ac.kr) with the error message. 62 | - You can still use the same functionality as GUI in cli-mode (option `-c`). 63 | 2. App does not run. 64 | - Check the path to *hmark* 65 | - The path should not have any non-ascii, unicode characters. -------------------------------------------------------------------------------- /hmark/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /hmark/get_cpu_count.py: -------------------------------------------------------------------------------- 1 | def get_cpu_count(): 2 | try: 3 | import multiprocessing 4 | return multiprocessing.cpu_count() 5 | except (ImportError, NotImplementedError): 6 | pass 7 | 8 | # http://code.google.com/p/psutil/ 9 | try: 10 | import psutil 11 | return psutil.cpu_count() # psutil.NUM_CPUS on old versions 12 | except (ImportError, AttributeError): 13 | pass 14 | 15 | return 1 16 | -------------------------------------------------------------------------------- /hmark/hmark.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | """ 3 | Version 3.0~ of Hashmarker (CSSA) 4 | Author: Seulbae Kim (seulbae@korea.ac.kr) 5 | http://github.com/squizz617/discovuler-advanced/hmark 6 | """ 7 | #import urllib2 8 | from urllib import request 9 | import platform 10 | import sys 11 | import os 12 | 13 | import time 14 | from re import compile, findall 15 | import webbrowser 16 | # import Tkinter 17 | # import ttk 18 | from hashlib import md5 19 | 20 | import multiprocessing 21 | import subprocess 22 | import parseutility2 as pu 23 | import version 24 | import get_cpu_count 25 | 26 | import argparse 27 | from distutils.version import LooseVersion 28 | 29 | """ GLOBALS """ 30 | localVersion = version.version 31 | osName = "" 32 | bits = "" 33 | urlBase = "http://iotcube.korea.ac.kr/" 34 | urlCheck = urlBase + "getbinaryversion/wf1/" 35 | urlDownload = urlBase + "downloads" 36 | 37 | 38 | def get_platform(): 39 | global osName 40 | global bits 41 | 42 | pf = platform.platform() 43 | bits, _ = platform.architecture() 44 | if "Windows" in pf: 45 | osName = "win" 46 | if "64" in bits: 47 | bits = "64" 48 | else: 49 | bits = "86" 50 | elif "Linux" in pf: 51 | osName = "linux" 52 | if "64" in bits: 53 | bits = "64" 54 | else: 55 | bits = "86" 56 | else: 57 | osName = "osx" 58 | bits = "" 59 | 60 | 61 | def check_update(): 62 | global localVersion 63 | global osName 64 | 65 | if len(localVersion.split('.')) < 3: 66 | localVersion += ".0" 67 | 68 | if osName == "win": 69 | url = urlCheck + osName[0] + bits # ~/w64, or ~/w86 70 | elif osName == "linux": 71 | url = urlCheck + osName[0] + bits # ~/l64, or ~/l86 72 | elif osName == "osx": 73 | url = urlCheck + osName # ~/osx 74 | try: 75 | #response = urllib2.urlopen(url) 76 | response = request.urlopen(url) 77 | except Exception: 78 | print("[-] Update server is not responding.") 79 | print(" Please check your network connection or firewall and try again.") 80 | print(" To bypass update checking, run with [--no-updatecheck] option.") 81 | #raw_input("Press Enter to continue...") 82 | input("Press Enter to continue...") 83 | sys.exit() 84 | latestVersion = "0.0.0" # for exception handling 85 | 86 | #html = response.read() 87 | html = response.read().decode('utf-8') 88 | latestVersion = html 89 | 90 | if latestVersion == "-1": 91 | print("[-] There's something wrong with the server.") 92 | print(" You can report this issue to cssa@korea.ac.kr, with your version info.") 93 | print(" To bypass update checking, run with [--no-updatecheck] option.") 94 | #raw_input("Press Enter to continue...") 95 | input("Press Enter to continue...") 96 | sys.exit() 97 | 98 | if len(latestVersion.split('.')) < 3: 99 | latestVersion += '.0' 100 | 101 | print("Latest server version: " + latestVersion) 102 | print("Current local version: " + localVersion), 103 | 104 | if LooseVersion(localVersion) < LooseVersion(latestVersion): 105 | print("(out-of-date)") 106 | print("[-] Your hmark is not up-to-date.") 107 | print(" Please download and run the latest version.") 108 | print(" Proceeding to the download page.") 109 | print(" To bypass update checking, run with [--no-updatecheck] option.") 110 | 111 | webbrowser.open(urlDownload) 112 | #raw_input("Press Enter to continue...") 113 | input("Press Enter to continue...") 114 | sys.exit() 115 | else: 116 | print("(up-to-date)") 117 | 118 | 119 | def parseFile_shallow_multi(f): 120 | functionInstanceList = pu.parseFile_shallow(f, "GUI") 121 | return (f, functionInstanceList) 122 | 123 | 124 | def parseFile_deep_multi(f): 125 | functionInstanceList 
= pu.parseFile_deep(f, "GUI") 126 | return (f, functionInstanceList) 127 | 128 | 129 | class App: 130 | def __init__(self, master): 131 | self.master = master 132 | self.defaultbg = master.cget('bg') 133 | 134 | self.mainWidth = 900 # width for the Tk root (root == master of this class) 135 | if osName == "osx": 136 | self.mainHeight = 700 # height for the Tk root 137 | else: 138 | self.mainHeight = 650 139 | 140 | self.screenWidth = master.winfo_screenwidth() # width of the screen 141 | self.screenHeight = master.winfo_screenheight() # height of the screen 142 | 143 | #self.x = (self.screenWidth / 2) - (self.mainWidth / 2) 144 | #self.y = (self.screenHeight / 2) - (self.mainHeight / 2) 145 | self.x = (self.screenWidth // 2) - (self.mainWidth // 2) 146 | self.y = (self.screenHeight // 2) - (self.mainHeight // 2) 147 | 148 | 149 | master.geometry("%dx%d+%d+%d" % (self.mainWidth, self.mainHeight, self.x, self.y)) 150 | master.resizable(width=False, height=False) 151 | 152 | """ MENU """ 153 | self.menubar = Tkinter.Menu(master, tearoff=1) 154 | self.menubar.add_command(label="HELP", command=self.show_help) 155 | self.menubar.add_command(label="ABOUT", command=self.show_about) 156 | master.config(menu=self.menubar) 157 | 158 | """ BROWSE DIRECTORY """ 159 | frmDirectory = Tkinter.Frame(master) 160 | frmDirectory.pack(fill=Tkinter.BOTH, padx=50, pady=(20, 0)) 161 | 162 | self.directory = Tkinter.StringVar() 163 | self.directory.set('Choose the root directory of your program.') 164 | self.btnDirectory = Tkinter.Button(frmDirectory, text="Browse directory", command=self.askDirectory) 165 | self.btnDirectory.pack(side=Tkinter.LEFT) 166 | 167 | self.lblSelected = Tkinter.Label(frmDirectory, fg=self.defaultbg, text="Selected: ") 168 | self.lblSelected.pack(side=Tkinter.LEFT, padx=(10, 0)) 169 | self.lblDirectory = Tkinter.Label(frmDirectory, fg="Red", textvariable=self.directory) 170 | self.lblDirectory.pack(side=Tkinter.LEFT) 171 | 172 | """ ABSTRACTION """ 173 | frmAbstraction = Tkinter.Frame(master) 174 | frmAbstraction.pack(fill=Tkinter.BOTH) 175 | 176 | lblfrmAbstraction = Tkinter.LabelFrame(frmAbstraction, text="Select abstraction mode") 177 | lblfrmAbstraction.pack(fill=Tkinter.BOTH, expand="yes", padx=50, pady=10) 178 | 179 | self.absLevel = Tkinter.IntVar() 180 | R1 = Tkinter.Radiobutton( 181 | lblfrmAbstraction, 182 | text="Abstraction OFF: Detect exact clones only", 183 | variable=self.absLevel, 184 | value=0, 185 | command=self.selectAbst 186 | ) 187 | R2 = Tkinter.Radiobutton( 188 | lblfrmAbstraction, 189 | text="Abstraction ON: Detect near-miss (similar) clones, as well as exact clones", 190 | variable=self.absLevel, 191 | value=4, 192 | command=self.selectAbst 193 | ) 194 | R1.pack(side=Tkinter.LEFT, anchor=Tkinter.W) 195 | R2.pack(side=Tkinter.RIGHT, anchor=Tkinter.W) 196 | 197 | """ GENERATE """ 198 | frmGenerate = Tkinter.Frame(master) 199 | frmGenerate.pack(fill=Tkinter.BOTH, padx=50, pady=5) 200 | self.btnGenerate = Tkinter.Button( 201 | frmGenerate, 202 | width=10000, 203 | text="----- Generate hashmark -----", 204 | state="disabled", 205 | # command=lambda:self.generate("GUI", "", "") 206 | command=self.generate 207 | ) 208 | self.btnGenerate.pack(side=Tkinter.BOTTOM) 209 | 210 | """ PROCESS """ 211 | frmProcess = Tkinter.Frame(master) 212 | frmProcess.pack(fill=Tkinter.X) 213 | 214 | scrollbar = Tkinter.Scrollbar(frmProcess) 215 | scrollbar.pack(side=Tkinter.RIGHT, fill=Tkinter.Y) 216 | self.listProcess = Tkinter.Listbox(frmProcess, state="disabled", width=600, height=26, 217 
| yscrollcommand=scrollbar.set, selectmode=Tkinter.SINGLE) 218 | # self.listProcess.insert(END, "") 219 | self.listProcess.pack(side=Tkinter.LEFT, fill=Tkinter.BOTH) 220 | scrollbar.config(command=self.listProcess.yview) 221 | 222 | """ PROGRESSBAR """ 223 | frmPgbar = ttk.Frame(master) 224 | frmPgbar.pack(expand=True, fill=Tkinter.BOTH, side=Tkinter.TOP) 225 | 226 | self.progress = 0 227 | self.progressbar = ttk.Progressbar( 228 | frmPgbar, 229 | orient="horizontal", 230 | mode="determinate", 231 | value=self.progress, 232 | maximum=1 233 | ) 234 | self.progressbar.pack(expand=True, fill=Tkinter.BOTH, side=Tkinter.TOP) 235 | 236 | """ QUIT """ 237 | frmBottom = Tkinter.Frame(master) # , bd=20) 238 | frmBottom.pack(fill=Tkinter.BOTH) 239 | 240 | self.btnOpenFolder = Tkinter.Button( 241 | frmBottom, 242 | width=15, 243 | text="Open hidx folder", 244 | state="disabled", 245 | command=self.openFolder 246 | ) 247 | self.btnQuit = Tkinter.Button( 248 | frmBottom, 249 | width=15, 250 | text="QUIT", 251 | command=frmBottom.quit 252 | ) 253 | self.btnOpenFolder.pack(side=Tkinter.LEFT, padx=50) 254 | self.btnQuit.pack(side=Tkinter.RIGHT, padx=50, pady=15) 255 | 256 | def openFolder(self): 257 | path = os.path.join(os.getcwd(), "hidx") 258 | if osName == "win": 259 | subprocess.Popen( 260 | ["explorer", "/select,", path], 261 | stdout=subprocess.PIPE, 262 | stderr=subprocess.PIPE 263 | ) 264 | elif osName == "linux": 265 | subprocess.Popen( 266 | ["xdg-open", path], 267 | stdout=subprocess.PIPE, 268 | stderr=subprocess.PIPE 269 | ) 270 | elif osName == "osx": 271 | subprocess.Popen( 272 | ["open", "-R", path], 273 | stdout=subprocess.PIPE, 274 | stderr=subprocess.PIPE 275 | ) 276 | 277 | def generate(self): 278 | directory = self.directory.get() 279 | absLevel = int(self.absLevel.get()) 280 | self.progress = 0 281 | 282 | proj = directory.replace('\\', '/').split('/')[-1] 283 | timeIn = time.time() 284 | numFile = 0 285 | numFunc = 0 286 | numLine = 0 287 | 288 | projDic = {} 289 | hashFileMap = {} 290 | 291 | self.listProcess.config(state="normal") 292 | self.listProcess.insert(Tkinter.END, 293 | "Loading source files... This may take a few minutes." 294 | ) 295 | self.listProcess.update() 296 | 297 | fileList = pu.loadSource(directory) 298 | numFile = len(fileList) 299 | 300 | if numFile == 0: 301 | self.listProcess.insert(Tkinter.END, 302 | "Error: Failed loading source files." 303 | ) 304 | self.listProcess.insert(Tkinter.END, 305 | "- Check if you selected proper directory, or if your project contains .c or .cpp files." 306 | ) 307 | else: 308 | # self.listProcess.insert(END, "") 309 | self.listProcess.insert(Tkinter.END, 310 | "Load complete. Generating hashmark..." 
311 | ) 312 | # self.listProcess.insert(END, "") 313 | # self.listProcess.insert(END, "") 314 | 315 | if absLevel == 0: 316 | func = parseFile_shallow_multi 317 | else: 318 | func = parseFile_deep_multi 319 | 320 | cpu_count = get_cpu_count.get_cpu_count() 321 | if cpu_count != 1: 322 | cpu_count -= 1 323 | 324 | pool = multiprocessing.Pool(processes=cpu_count) 325 | for idx, tup in enumerate(pool.imap_unordered(func, fileList)): 326 | f = tup[0] 327 | 328 | functionInstanceList = tup[1] 329 | pathOnly = f.split(proj, 1)[1][1:] 330 | progress = float(idx + 1) / numFile 331 | 332 | self.progressbar["value"] = progress 333 | self.progressbar.update() 334 | self.listProcess.insert(Tkinter.END, "[+] " + f) 335 | self.listProcess.see("end") 336 | 337 | numFunc += len(functionInstanceList) 338 | 339 | if len(functionInstanceList) > 0: 340 | numLine += functionInstanceList[0].parentNumLoc 341 | 342 | for f in functionInstanceList: 343 | f.removeListDup() 344 | path = f.parentFile 345 | absBody = pu.abstract(f, absLevel)[1] 346 | # self.listProcess.insert(Tkinter.END, absBody) 347 | absBody = pu.normalize(absBody) 348 | funcLen = len(absBody) 349 | 350 | if funcLen > 50: 351 | hashValue = md5(absBody).hexdigest() 352 | 353 | try: 354 | projDic[funcLen].append(hashValue) 355 | except KeyError: 356 | projDic[funcLen] = [hashValue] 357 | try: 358 | hashFileMap[hashValue].extend([pathOnly, f.funcId]) 359 | except KeyError: 360 | hashFileMap[hashValue] = [pathOnly, f.funcId] 361 | else: 362 | numFunc -= 1 # decrement numFunc by 1 if funclen is under threshold 363 | 364 | self.listProcess.insert(Tkinter.END, "") 365 | self.listProcess.insert(Tkinter.END, "Hash index successfully generated.") 366 | self.listProcess.see("end") 367 | self.listProcess.insert(Tkinter.END, "") 368 | self.listProcess.see("end") 369 | self.listProcess.insert(Tkinter.END, "Saving hash index to file...") 370 | self.listProcess.see("end") 371 | 372 | try: 373 | os.mkdir("hidx") 374 | except: 375 | pass 376 | packageInfo = str(localVersion) + ' ' + str(proj) + ' ' + str(numFile) + ' ' + str(numFunc) + ' ' + str( 377 | numLine) + '\n' 378 | with open("hidx/hashmark_" + str(absLevel) + "_" + proj + ".hidx", 'w') as fp: 379 | fp.write(packageInfo) 380 | 381 | for key in sorted(projDic): 382 | fp.write(str(key) + '\t') 383 | for h in list(set(projDic[key])): 384 | fp.write(h + '\t') 385 | fp.write('\n') 386 | 387 | fp.write('\n=====\n') 388 | 389 | for key in sorted(hashFileMap): 390 | fp.write(str(key) + '\t') 391 | for f in hashFileMap[key]: 392 | fp.write(str(f) + '\t') 393 | fp.write('\n') 394 | 395 | timeOut = time.time() 396 | 397 | self.listProcess.insert(Tkinter.END, "Done.") 398 | self.listProcess.see("end") 399 | self.listProcess.insert(Tkinter.END, "") 400 | self.listProcess.insert(Tkinter.END, "Elapsed time: %.02f sec." 
% (timeOut - timeIn)) 401 | self.listProcess.see("end") 402 | 403 | self.listProcess.insert(Tkinter.END, "Program statistics:") 404 | self.listProcess.insert(Tkinter.END, " - " + str(numFile) + ' files;') 405 | self.listProcess.insert(Tkinter.END, " - " + str(numFunc) + ' functions;') 406 | self.listProcess.insert(Tkinter.END, " - " + str(numLine) + ' lines of code.') 407 | self.listProcess.see("end") 408 | 409 | self.listProcess.insert(Tkinter.END, "") 410 | self.listProcess.insert(Tkinter.END, 411 | "Hash index saved to: " + os.getcwd().replace("\\", "/") + "/hidx/hashmark_" + str( 412 | absLevel) + "_" + proj + ".hidx") 413 | self.listProcess.see("end") 414 | self.btnOpenFolder.config(state="normal") 415 | 416 | return 0 417 | 418 | def selectAbst(self): 419 | selection = str(self.absLevel.get()) 420 | 421 | def askDirectory(self): 422 | selectedDirectory = tkFileDialog.askdirectory() 423 | if len(selectedDirectory) > 1: 424 | self.lblSelected.config(fg="Black") 425 | self.lblDirectory.config(fg="Black") 426 | self.directory.set(selectedDirectory) 427 | self.btnGenerate.config(state="normal") 428 | 429 | def show_about(self): 430 | top = Tkinter.Toplevel(padx=20, pady=10) 431 | if osName == "win": # this only works for windows. 432 | top.withdraw() # temporarily hide widget for better UI 433 | aboutMessage = """ 434 | hmark is an hash index generator for vulnerable code clone detection. 435 | 436 | Developed by Seulbae Kim @CSSA. 437 | https://iotcube.net 438 | cssa@korea.ac.kr 439 | 440 | """ 441 | msg = Tkinter.Message(top, text=aboutMessage) 442 | msg.pack() 443 | btnOkay = Tkinter.Button(top, text="Okay", command=top.destroy) 444 | btnOkay.pack() 445 | 446 | top.update() 447 | # self.master.update_idletasks() 448 | topw = top.winfo_reqwidth() # width of this widget 449 | toph = top.winfo_reqheight() # height of this widget 450 | parentGeo = self.master.geometry().split('+') 451 | parentX = int(parentGeo[1]) # X coordinate of parent (the main window) 452 | parentY = int(parentGeo[2]) # Y coordinate of parent 453 | 454 | #top.geometry("+%d+%d" % (parentX + self.mainWidth / 2 - topw / 2, parentY + self.mainHeight / 2 - toph / 2)) 455 | top.geometry("+%d+%d" % (parentX + self.mainWidth // 2 - topw // 2, parentY + self.mainHeight // 2 - toph / 2)) 456 | top.resizable(width=False, height=False) 457 | top.grab_set_global() 458 | top.title("About hmark...") 459 | if osName == "win": 460 | top.deiconify() # show widget, as its position is set 461 | 462 | def show_help(self): 463 | top = Tkinter.Toplevel(padx=20, pady=10) 464 | if osName == "win": # this only works for windows. 465 | top.withdraw() # temporarily hide widget 466 | 467 | helpMessage = """ 468 | 1. Select the root directory of your package under which source code is located.\n 469 | 2. Choose the abstraction mode. 470 | - OFF: hmark detects only exact clones. 471 | - ON: hmark detects near-miss clones along with exact clones, by tolerating changes in parameter, variable names, types, and names of the called functions.\n 472 | 3. Generate Hashmark. 
473 | """ 474 | msg = Tkinter.Message(top, text=helpMessage) 475 | btnOkay = Tkinter.Button(top, text="Okay", command=top.destroy) 476 | self.master.update_idletasks() 477 | 478 | msg.pack() 479 | btnOkay.pack() 480 | 481 | top.update() 482 | topw = top.winfo_reqwidth() # width of this widget 483 | toph = top.winfo_reqheight() # height of this widget 484 | 485 | parentGeo = self.master.geometry().split('+') 486 | parentX = int(parentGeo[1]) # width of parent (the main window) 487 | parentY = int(parentGeo[2]) # height of parent 488 | 489 | #top.geometry("+%d+%d" % (parentX + self.mainWidth / 2 - topw / 2, parentY + self.mainHeight / 2 - toph / 2)) 490 | top.geometry("+%d+%d" % (parentX + self.mainWidth // 2 - topw // 2, parentY + self.mainHeight // 2 - toph // 2)) 491 | top.resizable(width=False, height=False) 492 | top.grab_set_global() 493 | top.title("Help") 494 | if osName == "win": 495 | top.deiconify() # show widget, as its position is set 496 | 497 | 498 | def run_gui(): 499 | global localVersion 500 | global icon 501 | global Tkinter 502 | global tkFileDialog 503 | global ttk 504 | 505 | #import Tkinter 506 | #import tkFileDialog 507 | #import ttk 508 | import tkinter as Tkinter 509 | from tkinter import filedialog as tkFileDialog 510 | from tkinter import ttk 511 | 512 | 513 | root = Tkinter.Tk() 514 | app = App(root) 515 | root.title("hmark ver " + str(localVersion)) 516 | 517 | try: # if icon is available 518 | icon = resource_path("icon.gif") 519 | img = Tkinter.PhotoImage(file=icon) 520 | root.tk.call('wm', 'iconphoto', root._w, img) 521 | except Tkinter.TclError: # if, for some reason, icon isn't available 522 | pass 523 | 524 | root.mainloop() 525 | 526 | try: 527 | root.destroy() 528 | print("Farewell!") 529 | except Tkinter.TclError: 530 | print("GUI process terminated.") 531 | 532 | 533 | def generate_cli(targetPath, isAbstraction): 534 | import subprocess 535 | directory = targetPath.rstrip('/').rstrip("\\") 536 | 537 | if isAbstraction.lower() == "on": 538 | absLevel = 4 539 | else: 540 | absLevel = 0 541 | 542 | proj = directory.replace('\\', '/').split('/')[-1] 543 | print("PROJ:", proj) 544 | timeIn = time.time() 545 | numFile = 0 546 | numFunc = 0 547 | numLine = 0 548 | 549 | projDic = {} 550 | hashFileMap = {} 551 | 552 | print("[+] Loading source files... This may take a few minutes.") 553 | 554 | fileList = pu.loadSource(directory) 555 | numFile = len(fileList) 556 | 557 | if numFile == 0: 558 | print("[-] Error: Failed loading source files.") 559 | print(" Check if you selected proper directory, or if your project contains .c or .cpp files.") 560 | sys.exit() 561 | else: 562 | print ("[+] Load complete. 
Generating hashmark...") 563 | 564 | if absLevel == 0: 565 | func = parseFile_shallow_multi 566 | else: 567 | func = parseFile_deep_multi 568 | 569 | cpu_count = get_cpu_count.get_cpu_count() 570 | if cpu_count != 1: 571 | cpu_count -= 1 572 | 573 | pool = multiprocessing.Pool(processes=cpu_count) 574 | for idx, tup in enumerate(pool.imap_unordered(func, fileList)): 575 | f = tup[0] 576 | functionInstanceList = tup[1] 577 | 578 | fullName = proj + f.split(proj, 1)[1] 579 | pathOnly = f.split(proj, 1)[1][1:] 580 | 581 | if osName == "win": 582 | columns = 80 583 | else: 584 | try: 585 | # http://stackoverflow.com/questions/566746/how-to-get-console-window-width-in-python 586 | #rows, columns = subprocess.check_output(['stty', 'size']).split() 587 | rows, columns = subprocess.check_output(['stty', 'size']).decode().split() 588 | except: 589 | columns = 80 590 | 591 | #progress = 100 * float(idx + 1) / numFile 592 | progress = 100 * (idx + 1) / float(numFile) 593 | buf = "\r%.2f%% %s" % (progress, fullName) 594 | buf += " " * (int(columns) - len(buf)) 595 | sys.stdout.write(buf) 596 | sys.stdout.flush() 597 | 598 | numFunc += len(functionInstanceList) 599 | 600 | if len(functionInstanceList) > 0: 601 | numLine += functionInstanceList[0].parentNumLoc 602 | 603 | for f in functionInstanceList: 604 | f.removeListDup() 605 | path = f.parentFile 606 | # print "\nORIGINALLY:", f.funcBody 607 | absBody = pu.abstract(f, absLevel)[1] 608 | absBody = pu.normalize(absBody) 609 | funcLen = len(absBody) 610 | # print "\n", funcLen, absBody 611 | 612 | if funcLen > 50: 613 | #hashValue = md5(absBody).hexdigest() 614 | hashValue = md5(absBody.encode('utf-8')).hexdigest() 615 | 616 | try: 617 | projDic[funcLen].append(hashValue) 618 | except KeyError: 619 | projDic[funcLen] = [hashValue] 620 | try: 621 | hashFileMap[hashValue].extend([pathOnly, f.funcId]) 622 | except KeyError: 623 | hashFileMap[hashValue] = [pathOnly, f.funcId] 624 | else: 625 | numFunc -= 1 # decrement numFunc by 1 if funclen is under threshold 626 | 627 | print("") 628 | print("[+] Hash index successfully generated.") 629 | print("[+] Saving hash index to file..."), 630 | 631 | packageInfo = str(localVersion) + ' ' + str(proj) + ' ' + str(numFile) + ' ' + str(numFunc) + ' ' + str( 632 | numLine) + '\n' 633 | with open("hidx/hashmark_" + str(absLevel) + "_" + proj + ".hidx", 'w') as fp: 634 | fp.write(packageInfo) 635 | 636 | for key in sorted(projDic): 637 | fp.write(str(key) + '\t') 638 | for h in list(set(projDic[key])): 639 | fp.write(h + '\t') 640 | fp.write('\n') 641 | 642 | fp.write('\n=====\n') 643 | 644 | for key in sorted(hashFileMap): 645 | fp.write(str(key) + '\t') 646 | for f in hashFileMap[key]: 647 | fp.write(str(f) + '\t') 648 | fp.write('\n') 649 | 650 | timeOut = time.time() 651 | 652 | print("(Done)") 653 | print("") 654 | print("[+] Elapsed time: %.02f sec." 
% (timeOut - timeIn)) 655 | print("Program statistics:") 656 | print(" - " + str(numFile) + ' files;') 657 | print(" - " + str(numFunc) + ' functions;') 658 | print(" - " + str(numLine) + ' lines of code.') 659 | print("") 660 | print("[+] Hash index saved to: " + os.getcwd().replace("\\", "/") + "/hidx/hashmark_" + str( 661 | absLevel) + "_" + proj + ".hidx") 662 | 663 | 664 | def run_cli(targetPath, isAbstraction): 665 | generate_cli(targetPath, isAbstraction) 666 | print("Farewell!") 667 | 668 | 669 | def main(): 670 | try: 671 | os.mkdir("hidx") 672 | except: 673 | pass 674 | 675 | get_platform() 676 | 677 | progStr = "hmark_" + localVersion + "_" + osName 678 | if osName == "win": 679 | progStr += "_x" + bits + ".exe" 680 | elif osName == "linux": 681 | progStr = "./" + progStr + "_x" + bits 682 | elif osName == "osx": 683 | progStr = "./" + progStr 684 | 685 | ap = argparse.ArgumentParser( 686 | prog=progStr 687 | ) 688 | 689 | ap.add_argument( 690 | "-c", 691 | "--cli-mode", 692 | dest="cli_mode", 693 | nargs=2, 694 | metavar=("path", "ON/OFF"), 695 | required=False, 696 | help="run hmark without GUI by specifying the path to the target directory, and the abstraction mode" 697 | ) 698 | 699 | ap.add_argument( 700 | "-n", 701 | "--no-updatecheck", 702 | dest="no_update_check", 703 | action="store_true", 704 | required=False, 705 | help="bypass update checking (not recommended)" 706 | ) 707 | ap.add_argument( 708 | "-V", 709 | "--version", 710 | dest="version", 711 | action="store_true", 712 | required=False, 713 | help="print hmark version and exit" 714 | ) 715 | args = ap.parse_args() 716 | 717 | if args.version: 718 | versionString = "hmark" + localVersion + " for " + osName 719 | if osName == "linux" or osName == "win": 720 | versionString = versionString + " (x" + bits + ")" 721 | print(versionString) 722 | sys.exit() 723 | 724 | if args.no_update_check: 725 | print("Bypassed the update checker.") 726 | else: 727 | check_update() 728 | 729 | if osName == "linux" or osName == "osx": 730 | try: 731 | msg = subprocess.check_output("java -version", stderr=subprocess.STDOUT, shell=True) 732 | except subprocess.CalledProcessError as e: 733 | print("Java error:", e) 734 | print("Please try again after installing JDK.") 735 | sys.exit() 736 | 737 | if args.cli_mode: 738 | if os.path.isdir(args.cli_mode[0]) is False: 739 | print("[-] Directory does not exist:", args.cli_mode[0]) 740 | print(" Please specify the right directory to your target.") 741 | sys.exit() 742 | 743 | if args.cli_mode[1].isalpha(): 744 | if args.cli_mode[1].lower() == "on" or args.cli_mode[1].lower() == "off": 745 | print("Running in CLI mode") 746 | print("TARGET: " + args.cli_mode[0]) 747 | print("ABSTRACTION: " + args.cli_mode[1]) 748 | run_cli(args.cli_mode[0], args.cli_mode[1]) 749 | else: 750 | print("[-] Bad parameter: " + args.cli_mode[1]) 751 | print(" Accepted values are ON or OFF.") 752 | sys.exit() 753 | else: 754 | print("[-] Bad parameter: " + args.cli_mode[1]) 755 | print(" Accepted values are ON or OFF.") 756 | sys.exit() 757 | 758 | else: 759 | print("Running GUI") 760 | run_gui() 761 | 762 | 763 | def resource_path(relative_path): 764 | """ Get absolute path to resource, works for dev and for PyInstaller """ 765 | try: 766 | base_path = sys._MEIPASS 767 | except Exception: 768 | base_path = os.path.abspath(".") 769 | 770 | return os.path.join(base_path, relative_path) 771 | 772 | 773 | try: 774 | # Python 3.4+ 775 | if sys.platform.startswith("win"): 776 | import multiprocessing.popen_spawn_win32 as 
forking 777 | else: 778 | import multiprocessing.popen_fork as forking 779 | except ImportError: 780 | import multiprocessing.forking as forking 781 | 782 | if sys.platform.startswith("win"): 783 | # First define a modified version of Popen. 784 | class _Popen(forking.Popen): 785 | def __init__(self, *args, **kw): 786 | if hasattr(sys, 'frozen'): 787 | # We have to set original _MEIPASS2 value from sys._MEIPASS 788 | # to get --onefile mode working. 789 | os.putenv('_MEIPASS2', sys._MEIPASS) 790 | try: 791 | super(_Popen, self).__init__(*args, **kw) 792 | finally: 793 | if hasattr(sys, 'frozen'): 794 | # On some platforms (e.g. AIX) 'os.unsetenv()' is not 795 | # available. In those cases we cannot delete the variable 796 | # but only set it to the empty string. The bootloader 797 | # can handle this case. 798 | if hasattr(os, 'unsetenv'): 799 | os.unsetenv('_MEIPASS2') 800 | else: 801 | os.putenv('_MEIPASS2', '') 802 | 803 | 804 | # Second override 'Popen' class with our modified version. 805 | forking.Popen = _Popen 806 | 807 | """ EXECUTE """ 808 | if __name__ == "__main__": 809 | multiprocessing.freeze_support() 810 | main() 811 | -------------------------------------------------------------------------------- /hmark/icon.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/squizz617/vuddy/33cdab1ad04a6dcc76011b92821dbeb055c6691e/hmark/icon.gif -------------------------------------------------------------------------------- /hmark/icon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/squizz617/vuddy/33cdab1ad04a6dcc76011b92821dbeb055c6691e/hmark/icon.ico -------------------------------------------------------------------------------- /hmark/parseutility2.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Parser utility. 
4 | Author: Seulbae Kim 5 | Created: August 03, 2016 6 | """ 7 | 8 | import os 9 | import sys 10 | import subprocess 11 | import re 12 | import platform 13 | 14 | 15 | def get_platform(): 16 | global osName 17 | global bits 18 | 19 | pf = platform.platform() 20 | bits, _ = platform.architecture() 21 | if "Windows" in pf: 22 | osName = "win" 23 | bits = "" 24 | elif "Linux" in pf: 25 | osName = "linux" 26 | if "64" in bits: 27 | bits = "64" 28 | else: 29 | bits = "86" 30 | else: 31 | osName = "osx" 32 | bits = "" 33 | 34 | 35 | def setEnvironment(caller): 36 | get_platform() 37 | global javaCallCommand 38 | if caller == "GUI": 39 | # try: 40 | # base_path = sys._MEIPASS 41 | # except: 42 | # base_path = os.path.abspath(".") 43 | cwd = os.getcwd() 44 | if osName == "win": 45 | # full_path = os.path.join(base_path, "FuncParser.exe") 46 | # javaCallCommand = os.path.join(cwd, "FuncParser-opt.exe ") 47 | base_path = os.path.dirname(os.path.abspath(__file__)) # vuddy/hmark root directory 48 | javaCallCommand = "\"{0}\" -Xmx1024m -jar \"{1}\" ".format("java", os.path.join(base_path, "FuncParser-opt.jar")) 49 | 50 | elif osName == "linux" or osName == "osx": 51 | # full_path = os.path.join(base_path, "FuncParser.jar") 52 | # javaCallCommand = "java -Xmx1024m -jar " + full_path + " " 53 | javaCallCommand = "\"{0}\" -Xmx1024m -jar \"{1}\" ".format("java", os.path.join(cwd, "FuncParser-opt.jar")) 54 | 55 | else: 56 | if osName == "win": 57 | base_path = os.path.dirname(os.path.abspath(__file__)) # vuddy/hmark root directory 58 | # javaCallCommand = os.path.join(base_path, "FuncParser-opt.exe ") 59 | javaCallCommand = "\"{0}\" -Xmx1024m -jar \"{1}\" ".format("java", os.path.join(base_path, "FuncParser-opt.jar")) 60 | elif osName == "linux" or osName == "osx": 61 | base_path = os.path.dirname(os.path.abspath(__file__)) # vuddy/hmark root directory 62 | javaCallCommand = "\"{0}\" -Xmx1024m -jar \"{1}\" ".format("java", os.path.join(base_path, "FuncParser-opt.jar")) 63 | 64 | 65 | class function: 66 | parentFile = None # Absolute file which has the function 67 | parentNumLoc = None # Number of LoC of the parent file 68 | name = None # Name of the function 69 | lines = None # Tuple (lineFrom, lineTo) that indicates the LoC of function 70 | funcId = None # n, indicating n-th function in the file 71 | parameterList = [] # list of parameter variables 72 | variableList = [] # list of local variables 73 | dataTypeList = [] # list of data types, including user-defined types 74 | funcCalleeList = [] # list of called functions' names 75 | funcBody = None 76 | 77 | def __init__(self, fileName): 78 | self.parentFile = fileName 79 | self.parameterList = [] 80 | self.variableList = [] 81 | self.dataTypeList = [] 82 | self.funcCalleeList = [] 83 | 84 | def removeListDup(self): 85 | # for best performance, must execute this method 86 | # for every instance before applying the abstraction. 87 | self.parameterList = list(set(self.parameterList)) 88 | self.variableList = list(set(self.variableList)) 89 | self.dataTypeList = list(set(self.dataTypeList)) 90 | self.funcCalleeList = list(set(self.funcCalleeList)) 91 | 92 | # def getOriginalFunction(self): 93 | # # returns the original function back from the instance. 94 | # fp = open(self.parentFile, 'r') 95 | # srcFileRaw = fp.readlines() 96 | # fp.close() 97 | # return ''.join(srcFileRaw[self.lines[0]-1:self.lines[1]]) 98 | 99 | 100 | def loadSource(rootDirectory): 101 | # returns the list of .src files under the specified root directory. 
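    # Note: despite the comment above, what is actually collected below are C/C++
    # sources (.c, .cpp, .cc, .c++, .cxx); files larger than maxFileSizeInBytes
    # (2 MiB by default, set just below) are skipped when that limit is in effect.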
102 | maxFileSizeInBytes = None 103 | maxFileSizeInBytes = 2*1024*1024 # remove this line if you don't want to restrict 104 | # the maximum file size that you process. 105 | walkList = os.walk(rootDirectory) 106 | srcFileList = [] 107 | for path, dirs, files in walkList: 108 | for fileName in files: 109 | ext = fileName.lower() 110 | if ext.endswith('.c') or ext.endswith('.cpp') or ext.endswith('.cc') or ext.endswith('.c++') or ext.endswith('.cxx'): 111 | absPathWithFileName = path.replace('\\', '/') + '/' + fileName 112 | if maxFileSizeInBytes is not None: 113 | if os.path.getsize(absPathWithFileName) < maxFileSizeInBytes: 114 | srcFileList.append(absPathWithFileName) 115 | else: 116 | srcFileList.append(absPathWithFileName) 117 | return srcFileList 118 | 119 | 120 | def loadVul(rootDirectory): 121 | # returns the list of .vul files under the specified root directory. 122 | maxFileSizeInBytes = None 123 | # maxFileSizeInBytes = 2097152 # remove this line if you don't want to restrict 124 | # the maximum file size that you process. 125 | walkList = os.walk(rootDirectory) 126 | srcFileList = [] 127 | for path, dirs, files in walkList: 128 | for fileName in files: 129 | if fileName.endswith('OLD.vul'): 130 | absPathWithFileName = path.replace('\\', '/') + '/' + fileName 131 | if maxFileSizeInBytes is not None: 132 | if os.path.getsize(absPathWithFileName) < maxFileSizeInBytes: 133 | srcFileList.append(absPathWithFileName) 134 | else: 135 | srcFileList.append(absPathWithFileName) 136 | return srcFileList 137 | 138 | 139 | def removeComment(string): 140 | # Code for removing C/C++ style comments. (Imported from ReDeBug.) 141 | c_regex = re.compile( 142 | r'(?P//.*?$|[{}]+)|(?P/\*.*?\*/)|(?P\'(\\.|[^\\\'])*\'|"(\\.|[^\\"])*"|.[^/\'"]*)', 143 | re.DOTALL | re.MULTILINE) 144 | #return ''.join([c.group('noncomment') for c in c_regex.finditer(string) if c.group('noncomment')]) 145 | return ''.join([c.group('noncomment') for c in c_regex.finditer(string.decode('latin-1')) if c.group('noncomment')]) 146 | 147 | # def getBody(originalFunction): 148 | # # returns the function's body as a string. 149 | # return originalFunction[originalFunction.find('{')+1:originalFunction.rfind('}')] 150 | 151 | 152 | def normalize(string): 153 | # Code for normalizing the input string. 154 | # LF and TAB literals, curly braces, and spaces are removed, 155 | # and all characters are lowercased. 156 | return ''.join(string.replace('\n', '').replace('\r', '').replace('\t', '').replace('{', '').replace('}', '').split( 157 | ' ')).lower() 158 | 159 | 160 | def abstract(instance, level): 161 | # Applies abstraction on the function instance, 162 | # and then returns a tuple consisting of the original body and abstracted body. 163 | originalFunctionBody = instance.funcBody 164 | # print "===================" 165 | originalFunctionBody = removeComment(originalFunctionBody) 166 | # print originalFunctionBody 167 | # print '====================================================' 168 | if int(level) >= 0: # No abstraction. 
169 | abstractBody = originalFunctionBody 170 | 171 | if int(level) >= 1: # PARAM 172 | parameterList = instance.parameterList 173 | for param in parameterList: 174 | if len(param) == 0: 175 | continue 176 | try: 177 | paramPattern = re.compile("(^|\\W)" + param + "(\\W)") 178 | abstractBody = paramPattern.sub("\\g<1>FPARAM\\g<2>", abstractBody) 179 | except: 180 | pass 181 | 182 | if int(level) >= 2: # DTYPE 183 | dataTypeList = instance.dataTypeList 184 | for dtype in dataTypeList: 185 | if len(dtype) == 0: 186 | continue 187 | try: 188 | dtypePattern = re.compile("(^|\\W)" + dtype + "(\\W)") 189 | abstractBody = dtypePattern.sub("\\g<1>DTYPE\\g<2>", abstractBody) 190 | except: 191 | pass 192 | 193 | if int(level) >= 3: # LVAR 194 | variableList = instance.variableList 195 | for lvar in variableList: 196 | if len(lvar) == 0: 197 | continue 198 | try: 199 | lvarPattern = re.compile("(^|\\W)" + lvar + "(\\W)") 200 | abstractBody = lvarPattern.sub("\\g<1>LVAR\\g<2>", abstractBody) 201 | except: 202 | pass 203 | 204 | if int(level) >= 4: # FUNCCALL 205 | funcCalleeList = instance.funcCalleeList 206 | for fcall in funcCalleeList: 207 | if len(fcall) == 0: 208 | continue 209 | try: 210 | fcallPattern = re.compile("(^|\\W)" + fcall + "(\\W)") 211 | abstractBody = fcallPattern.sub("\\g<1>FUNCCALL\\g<2>", abstractBody) 212 | except: 213 | pass 214 | 215 | return (originalFunctionBody, abstractBody) 216 | 217 | 218 | delimiter = "\r\0?\r?\0\r" 219 | 220 | 221 | def parseFile_shallow(srcFileName, caller): 222 | # this does not parse body. 223 | global javaCallCommand 224 | global delimiter 225 | setEnvironment(caller) 226 | javaCallCommand += "\"" + srcFileName + "\" 0" 227 | functionInstanceList = [] 228 | try: 229 | astString = subprocess.check_output(javaCallCommand, stderr=subprocess.STDOUT, shell=True) 230 | except subprocess.CalledProcessError as e: 231 | #print "Parser Error:", e 232 | print("Parser Error:", e) 233 | astString = "" 234 | #funcList = astString.split(delimiter) 235 | funcList = astString.split(delimiter.encode('utf-8')) 236 | for func in funcList[1:]: 237 | functionInstance = function(srcFileName) 238 | #elemsList = func.split('\n')[1:-1] 239 | elemsList = func.split(b'\n')[1:-1] 240 | if len(elemsList) > 9: 241 | functionInstance.parentNumLoc = int(elemsList[1]) 242 | functionInstance.name = elemsList[2] 243 | #functionInstance.lines = (int(elemsList[3].split('\t')[0]), int(elemsList[3].split('\t')[1])) 244 | functionInstance.lines = (int(elemsList[3].split(b'\t')[0]), int(elemsList[3].split(b'\t')[1])) 245 | functionInstance.funcId = int(elemsList[4]) 246 | #functionInstance.funcBody = '\n'.join(elemsList[9:]) 247 | functionInstance.funcBody = b'\n'.join(elemsList[9:]) 248 | # print functionInstance.funcBody 249 | # print "-------------------" 250 | 251 | functionInstanceList.append(functionInstance) 252 | 253 | return functionInstanceList 254 | 255 | 256 | def parseFile_deep(srcFileName, caller): 257 | global javaCallCommand 258 | global delimiter 259 | setEnvironment(caller) 260 | # this parses function definition plus body. 
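    # Unlike parseFile_shallow (which passes mode 0 to FuncParser-opt and only fills
    # in the function body and location), this passes mode 1 so that the parameter,
    # local variable, data type, and callee lists are populated for use by abstract().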
261 | javaCallCommand += "\"" + srcFileName + "\" 1" 262 | functionInstanceList = [] 263 | 264 | try: 265 | astString = subprocess.check_output(javaCallCommand, stderr=subprocess.STDOUT, shell=True) 266 | except subprocess.CalledProcessError as e: 267 | print("Parser Error:", e) 268 | astString = "" 269 | 270 | funcList = astString.split(delimiter) 271 | for func in funcList[1:]: 272 | functionInstance = function(srcFileName) 273 | 274 | elemsList = func.split('\n')[1:-1] 275 | # print elemsList 276 | if len(elemsList) > 9: 277 | functionInstance.parentNumLoc = int(elemsList[1]) 278 | functionInstance.name = elemsList[2] 279 | functionInstance.lines = (int(elemsList[3].split('\t')[0]), int(elemsList[3].split('\t')[1])) 280 | functionInstance.funcId = int(elemsList[4]) 281 | functionInstance.parameterList = elemsList[5].rstrip().split('\t') 282 | functionInstance.variableList = elemsList[6].rstrip().split('\t') 283 | functionInstance.dataTypeList = elemsList[7].rstrip().split('\t') 284 | functionInstance.funcCalleeList = elemsList[8].rstrip().split('\t') 285 | functionInstance.funcBody = '\n'.join(elemsList[9:]) 286 | # print '\n'.join(elemsList[9:]) 287 | functionInstanceList.append(functionInstance) 288 | 289 | return functionInstanceList 290 | -------------------------------------------------------------------------------- /hmark/spec_generator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import version 4 | 5 | pf = platform.platform() 6 | bits, _ = platform.architecture() 7 | 8 | if 'Windows' in pf: 9 | osName = "win" 10 | if "64" in bits: 11 | bits = "_x64" 12 | else: 13 | bits = "_x86" 14 | elif 'Linux' in pf: 15 | osName = 'linux' 16 | if "64" in bits: 17 | bits = "_x64" 18 | else: 19 | bits = "_x86" 20 | else: 21 | osName = "osx" 22 | bits = "" 23 | 24 | # if '64' in bits: 25 | # bits = 'x64' 26 | # else: 27 | # bits = 'x86' 28 | 29 | # if osName == 'OSX': 30 | # bits = '' 31 | 32 | version = version.version 33 | 34 | fp = open("hmark_" + version + '_' + osName + bits + ".spec", "w") 35 | cwd = os.getcwd() 36 | if osName == "linux": 37 | fp.write("\ 38 | # -*- mode: python -*-\n\n\ 39 | block_cipher = None\n\n\n\ 40 | a = Analysis(['hmark.py'],\n\ 41 | pathex=[r'" + cwd + "'],\n\ 42 | binaries=None,\n\ 43 | datas=None,\n\ 44 | hiddenimports=[],\n\ 45 | hookspath=[],\n\ 46 | runtime_hooks=[],\n\ 47 | excludes=[],\n\ 48 | win_no_prefer_redirects=False,\n\ 49 | win_private_assemblies=False,\n\ 50 | cipher=block_cipher)\n\ 51 | a.datas += [('icon.gif', r'" + os.path.join(cwd, 'icon.gif') + "', 'DATA')]\n\ 52 | pyz = PYZ(a.pure, a.zipped_data,\n\ 53 | cipher=block_cipher)\n\ 54 | exe = EXE(pyz,\n\ 55 | a.scripts,\n\ 56 | a.binaries,\n\ 57 | a.zipfiles,\n\ 58 | a.datas,\n\ 59 | name='hmark_" + version + "_" + osName + bits + "',\n\ 60 | debug=False,\n\ 61 | strip=False,\n\ 62 | upx=True,\n\ 63 | console=True )\n\ 64 | """) 65 | 66 | elif osName == "osx": 67 | fp.write("\ 68 | # -*- mode: python -*-\n\n\ 69 | block_cipher = None\n\n\n\ 70 | a = Analysis(['hmark.py'],\n\ 71 | pathex=[r'" + cwd + "'],\n\ 72 | binaries=None,\n\ 73 | datas=None,\n\ 74 | hiddenimports=[],\n\ 75 | hookspath=[],\n\ 76 | runtime_hooks=[],\n\ 77 | excludes=[],\n\ 78 | win_no_prefer_redirects=False,\n\ 79 | win_private_assemblies=False,\n\ 80 | cipher=block_cipher)\n\ 81 | a.datas += [('icon.gif', r'" + os.path.join(cwd, 'icon.gif') + "', 'DATA')]\n\ 82 | pyz = PYZ(a.pure, a.zipped_data,\n\ 83 | cipher=block_cipher)\n\ 84 | exe = 
EXE(pyz,\n\ 85 | a.scripts,\n\ 86 | a.binaries,\n\ 87 | a.zipfiles,\n\ 88 | a.datas,\n\ 89 | name='hmark_" + version + "_" + osName + "',\n\ 90 | debug=False,\n\ 91 | strip=False,\n\ 92 | upx=True,\n\ 93 | console=True )\n\ 94 | """) 95 | 96 | elif osName == "win": 97 | fp.write("\ 98 | # -*- mode: python -*-\n\n\ 99 | block_cipher = None\n\n\n\ 100 | a = Analysis(['hmark.py'],\n\ 101 | pathex=[r'" + cwd + "'],\n\ 102 | binaries=None,\n\ 103 | datas=None,\n\ 104 | hiddenimports=[],\n\ 105 | hookspath=[],\n\ 106 | runtime_hooks=[],\n\ 107 | excludes=[],\n\ 108 | win_no_prefer_redirects=False,\n\ 109 | win_private_assemblies=False,\n\ 110 | cipher=block_cipher)\n\ 111 | a.datas += [('icon.gif', r'" + os.path.join(cwd, 'icon.gif') + "', 'DATA')]\n\ 112 | pyz = PYZ(a.pure, a.zipped_data,\n\ 113 | cipher=block_cipher)\n\ 114 | exe = EXE(pyz,\n\ 115 | a.scripts,\n\ 116 | a.binaries,\n\ 117 | a.zipfiles,\n\ 118 | a.datas,\n\ 119 | name='hmark_" + version + "_" + osName + bits + "',\n\ 120 | debug=False,\n\ 121 | strip=False,\n\ 122 | upx=True,\n\ 123 | console=True,\n\ 124 | icon='icon.ico')\ 125 | """) 126 | 127 | fp.close() 128 | print "Pyinstaller spec file generated: " + "hmark_" + version + '_' + osName + bits + ".spec" 129 | -------------------------------------------------------------------------------- /hmark/version.py: -------------------------------------------------------------------------------- 1 | version = "3.1.0" 2 | -------------------------------------------------------------------------------- /initialize.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import platform 4 | # cveXmlDownloader.py 파일이 있는 경로를 추가 5 | sys.path.append(r'E:\jnu\vuddy-demo\vulnDBGen\tools\cvedatagen') 6 | 7 | originalDir = os.path.dirname(os.path.abspath(__file__)) # vuddy root directory 8 | pf = platform.platform() 9 | 10 | try: 11 | import tools.cvedatagen.cveXmlDownloader as Downloader 12 | except ImportError: 13 | import cveXmlDownloader as Downloader 14 | try: 15 | import tools.cvedatagen.cveXmlParser as Parser 16 | except ImportError: 17 | import cveXmlParser as Parser 18 | try: 19 | import tools.cvedatagen.cveXmlUpdater as Updater 20 | except ImportError: 21 | import cveXmlUpdater as Updater 22 | 23 | import tools.cvedatagen.common as common 24 | 25 | 26 | def main(): 27 | print("Making directories...") 28 | dataDir = os.path.join(originalDir, "data", "repolists") 29 | if os.path.exists(dataDir) is False: 30 | os.makedirs(dataDir) 31 | diffDir = os.path.join(originalDir, "diff") 32 | if os.path.exists(diffDir) is False: 33 | os.makedirs(diffDir) 34 | vulDir = os.path.join(originalDir, "vul") 35 | if os.path.exists(vulDir) is False: 36 | os.makedirs(vulDir) 37 | hidxDir = os.path.join(originalDir, "hidx") 38 | if os.path.exists(hidxDir) is False: 39 | os.makedirs(hidxDir) 40 | 41 | 42 | print("Running CVE data generator...") 43 | 44 | os.chdir(os.path.join(originalDir, "data")) 45 | if "cvedata.pkl" not in os.listdir("./"): 46 | print("cvedata.pkl not found. Proceeding to download..") 47 | print("[+] cveXmlDownloader") 48 | Downloader.process() 49 | 50 | print("[+] cveXmlParser") 51 | Parser.process() 52 | else: 53 | print("cvedata.pkl found. 
Omitting download..") 54 | 55 | print("[+] cveXmlUpdater") 56 | Updater.process() 57 | 58 | os.chdir(originalDir) 59 | print("cvedata.pkl is now up-to-date.\n") 60 | 61 | 62 | if "Windows" in pf: # Windows 63 | if os.path.exists(os.path.join(originalDir, "tools", "FuncParser-opt.exe")) is False: 64 | print("Downloading function parser for Windows...") 65 | os.chdir(os.path.join(originalDir, "tools")) 66 | url = "https://github.com/squizz617/FuncParser-opt/raw/master/FuncParser-opt.zip" 67 | fileName = "FuncParser-opt.zip" 68 | common.download_url(url, fileName) 69 | common.unzip(fileName) 70 | os.remove(fileName) 71 | 72 | print("*** Please modify config.py before running scripts in src/ ***") 73 | 74 | if __name__ == '__main__': 75 | main() 76 | -------------------------------------------------------------------------------- /paper/SNP17.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/squizz617/vuddy/33cdab1ad04a6dcc76011b92821dbeb055c6691e/paper/SNP17.pdf -------------------------------------------------------------------------------- /src/get_cvepatch_from_git.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import subprocess 5 | import re 6 | import time 7 | import argparse 8 | import sys 9 | import platform 10 | import multiprocessing as mp 11 | from functools import partial 12 | 13 | try: 14 | import cPickle as pickle 15 | except ImportError: 16 | import pickle 17 | 18 | # Import from parent directory 19 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 20 | import config 21 | 22 | 23 | class InfoStruct: 24 | RepoName = '' # repository name 25 | OriginalDir = '' # vuddy root directory 26 | DiffDir = '' 27 | MultimodeFlag = 0 28 | MultiRepoList = [] 29 | GitBinary = config.gitBinary 30 | GitStoragePath = config.gitStoragePath 31 | CveDict = {} 32 | keyword = "CVE-20" 33 | cveID = None 34 | DebugMode = False 35 | 36 | def __init__(self, originalDir, CveDataPath): 37 | self.OriginalDir = originalDir 38 | self.DiffDir = os.path.join(originalDir, 'diff') 39 | with open(CveDataPath, "rb") as f: 40 | self.CveDict = pickle.load(f) 41 | 42 | 43 | """ GLOBALS """ 44 | originalDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # vuddy root directory 45 | cveDataPath = os.path.join(originalDir, "data", "cvedata.pkl") 46 | info = InfoStruct(originalDir, cveDataPath) # first three arg is dummy for now 47 | printLock = mp.Lock() 48 | 49 | 50 | """ FUNCTIONS """ 51 | def parse_argument(): 52 | global info 53 | 54 | parser = argparse.ArgumentParser(prog='get_cvepatch_from_git.py') 55 | parser.add_argument('REPO', 56 | help='''Repository name''') 57 | parser.add_argument('-m', '--multimode', action="store_true", 58 | help='''Turn on Multimode''') 59 | parser.add_argument('-k', '--keyword', 60 | help="Keyword to GREP, default: CVE-20", default="CVE-20") 61 | parser.add_argument('-c', '--cveid', help="CVE id to assign (Only when doing manual keyword search)") 62 | parser.add_argument('-d', '--debug', action="store_true", help=argparse.SUPPRESS) # Hidden Debug Mode 63 | 64 | args = parser.parse_args() 65 | 66 | info.RepoName = args.REPO 67 | info.keyword = args.keyword 68 | info.cveID = args.cveid 69 | info.MultimodeFlag = 0 70 | info.MultiRepoList = [] 71 | if args.multimode: 72 | info.MultimodeFlag = 1 73 | if "Windows" in platform.platform(): 74 | with open(os.path.join(originalDir, 'data', 'repolists', 
'list_' + info.RepoName)) as fp: 75 | for repoLine in fp.readlines(): 76 | if len(repoLine) > 2: 77 | info.MultiRepoList.append(repoLine.rstrip()) 78 | else: 79 | repoBaseDir = os.path.join(info.GitStoragePath, info.RepoName) 80 | command_find = "find " + repoBaseDir + " -type d -exec test -e '{}/.git' ';' -print -prune" 81 | findOutput = subprocess.check_output(command_find, shell=True) 82 | info.MultiRepoList = findOutput.replace(repoBaseDir + "/", "").rstrip().split("\n") 83 | if args.debug: 84 | info.DebugMode = True 85 | 86 | 87 | def init(): 88 | global info 89 | 90 | parse_argument() 91 | 92 | print("Retrieving CVE patch from", info.RepoName) 93 | print("Multi-repo mode:"), 94 | if info.MultimodeFlag: 95 | print("ON.") 96 | else: 97 | print("OFF.") 98 | 99 | print("Initializing..."), 100 | 101 | try: 102 | os.makedirs(os.path.join(info.DiffDir, info.RepoName)) 103 | except OSError: 104 | pass 105 | 106 | print("Done.") 107 | 108 | 109 | def callGitLog(gitDir): 110 | global info 111 | """ 112 | Collect CVE commit log from repository 113 | :param gitDir: repository path 114 | :return: 115 | """ 116 | # print "Calling git log...", 117 | commitsList = [] 118 | gitLogOutput = "" 119 | command_log = "\"{0}\" --no-pager log --all --pretty=fuller --grep=\"{1}\"".format(info.GitBinary, info.keyword) 120 | print(gitDir) 121 | os.chdir(gitDir) 122 | try: 123 | try: 124 | gitLogOutput = subprocess.check_output(command_log, shell=True) 125 | #commitsList = re.split('[\n](?=commit\s\w{40}\nAuthor:\s)|[\n](?=commit\s\w{40}\nMerge:\s)', gitLogOutput) 126 | commitsList = re.split(b'[\n](?=commit\s\w{40}\nAuthor:\s)|[\n](?=commit\s\w{40}\nMerge:\s)', gitLogOutput) 127 | except subprocess.CalledProcessError as e: 128 | print("[-] Git log error:", e) 129 | except UnicodeDecodeError as err: 130 | print("[-] Unicode error:", err) 131 | 132 | # print "Done." 133 | return commitsList 134 | 135 | 136 | def filterCommitMessage(commitMessage): 137 | #추가 138 | commitMessage = commitMessage.decode('utf-8') 139 | """ 140 | Filter false positive commits 141 | Will remove 'Merge', 'Revert', 'Upgrade' commit log 142 | :param commitMessage: commit message 143 | :return: 144 | """ 145 | filterKeywordList = ["merge", "revert", "upgrade"] 146 | matchCnt = 0 147 | for kwd in filterKeywordList: 148 | keywordPattern = r"\W" + kwd + r"\W|\W" + kwd + r"s\W" 149 | compiledKeyworddPattern = re.compile(keywordPattern) 150 | match = compiledKeyworddPattern.search(commitMessage.lower()) 151 | 152 | # bug fixed.. now revert and upgrade commits will be filtered out. 153 | if match: 154 | matchCnt += 1 155 | 156 | if matchCnt > 0: 157 | return 1 158 | else: 159 | return 0 160 | 161 | 162 | def callGitShow(gitBinary, commitHashValue): 163 | """ 164 | Grep data of git show 165 | :param commitHashValue: 166 | :return: 167 | """ 168 | # print "Calling git show...", 169 | command_show = "\"{0}\" show --pretty=fuller {1}".format(gitBinary, commitHashValue) 170 | 171 | gitShowOutput = '' 172 | try: 173 | gitShowOutput = subprocess.check_output(command_show, shell=True) 174 | except subprocess.CalledProcessError as e: 175 | print("error:", e) 176 | 177 | # print "Done." 
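    # Note: under Python 3, subprocess.check_output() returns bytes here;
    # the caller (parallel_process) decodes this output before writing it to a .diff file.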
178 |     return gitShowOutput
179 | 
180 | 
181 | def updateCveInfo(cveDict, cveId):
182 |     """
183 |     Get CVSS score and CWE id from CVE id
184 |     :param cveId:
185 |     :return:
186 |     """
187 |     # print "Updating CVE metadata...",
188 |     cvss = "0.0"
189 |     try:
190 |         cvss = str(cveDict[cveId][0])
191 |     except:
192 |         cvss = "0.0"
193 | 
194 |     cwe = "CWE-000"
195 |     try:
196 |         cwe = cveDict[cveId][1]
197 |     except:
198 |         cwe = "CWE-000"
199 |     if "NVD-CWE-noinfo" in cwe:
200 |         cwe = "CWE-000"
201 |     else:
202 |         cweNum = cwe.split('-')[1].zfill(3)
203 |         cwe = "CWE-" + str(cweNum)
204 | 
205 |     # print "Done."
206 |     return cveId + '_' + cvss + '_' + cwe + '_'
207 | 
208 | 
209 | def process(commitsList, subRepoName):
210 |     global info
211 | 
212 |     flag = 0
213 |     if len(commitsList) > 0 and commitsList[0] == '':
214 |         flag = 1
215 |         print("No commit in", info.RepoName),
216 |     else:
217 |         print(len(commitsList), "commits in", info.RepoName),
218 |         if subRepoName is None:
219 |             print("\n")
220 |         else:
221 |             print(subRepoName)
222 |             os.chdir(os.path.join(info.GitStoragePath, info.RepoName, subRepoName))
223 | 
224 |     if flag:
225 |         return
226 | 
227 |     if info.DebugMode or "Windows" in platform.platform():
228 |         # Windows - do not use multiprocessing
229 |         # Using multiprocessing will lower performance
230 |         for commitMessage in commitsList:
231 |             parallel_process(subRepoName, commitMessage)
232 |     else: # POSIX - use multiprocessing
233 |         pool = mp.Pool()
234 |         parallel_partial = partial(parallel_process, subRepoName)
235 |         pool.map(parallel_partial, commitsList)
236 |         pool.close()
237 |         pool.join()
238 | 
239 | 
240 | def parallel_process(subRepoName, commitMessage):
241 |     global info
242 |     global printLock
243 | 
244 |     if filterCommitMessage(commitMessage):
245 |         return
246 |     else:
247 |         commitHashValue = commitMessage[7:47]
248 |         # convert to UTF-8 if this is a byte string
249 |         if isinstance(commitHashValue, bytes):
250 |             commitHashValue = commitHashValue.decode('utf-8')
251 |         # added: decode the full commit message
252 |         commitMessage = commitMessage.decode('utf-8')
253 |         cvePattern = re.compile('CVE-20\d{2}-\d{4,7}') # note: CVE id can now be 7 digit numbers
254 |         cveIdList = list(set(cvePattern.findall(commitMessage)))
255 | 
256 |         """
257 |         Note: Aug 5, 2016
258 |         If multiple CVE ids are assigned to one commit,
259 |         store the dependency in a file which is named after
260 |         the repo, (e.g., ~/diff/dependency_ubuntu) and use
261 |         one representative CVE that has the smallest ID number
262 |         for filename. 
263 |         A sample:
264 |         CVE-2014-6416_2e9466c84e5beee964e1898dd1f37c3509fa8853 CVE-2014-6418_CVE-2014-6417_CVE-2014-6416_
265 |         """
266 | 
267 |         if len(cveIdList) > 1: # do this only if multiple CVEs are assigned to a commit
268 |             dependency = os.path.join(info.DiffDir, "dependency_" + info.RepoName)
269 |             with open(dependency, "a") as fp:
270 |                 cveIdFull = ""
271 |                 minCve = ""
272 |                 minimum = 9999999
273 |                 for cveId in cveIdList:
274 |                     idDigits = int(cveId.split('-')[2])
275 |                     cveIdFull += cveId + '_'
276 |                     if minimum > idDigits:
277 |                         minimum = idDigits
278 |                         minCve = cveId
279 |                 # three lines added: ensure str before writing
280 |                 minCve = minCve.decode('utf-8') if isinstance(minCve, bytes) else minCve
281 |                 cveIdFull = cveIdFull.decode('utf-8') if isinstance(cveIdFull, bytes) else cveIdFull
282 |                 fp.write(str(minCve + '_' + commitHashValue + '\t' + cveIdFull + '\n'))
283 |         elif len(cveIdList) == 0:
284 |             if info.cveID is None:
285 |                 return
286 |             else:
287 |                 minCve = info.cveID # when CVE ID is given manually through command line argument
288 |         else:
289 |             minCve = cveIdList[0]
290 | 
291 |         # added
292 |         #git_command = f"git show --pretty=fuller {commitHashValue}"
293 |         gitShowOutput = callGitShow(info.GitBinary, commitHashValue)
294 |         # convert gitShowOutput to str if it is a byte string
295 |         if isinstance(gitShowOutput, bytes):
296 |             gitShowOutput = gitShowOutput.decode('latin-1') # only latin-1 works here
297 | 
298 |         finalFileName = updateCveInfo(info.CveDict, minCve)
299 | 
300 |         diffFileName = "{0}{1}.diff".format(finalFileName, commitHashValue)
301 |         try:
302 |             #with open(os.path.join(info.DiffDir, info.RepoName, diffFileName), "w") as fp:
303 |             with open(os.path.join(info.DiffDir, info.RepoName, diffFileName), "w", encoding="utf-8") as fp:
304 |                 if subRepoName is None:
305 |                     fp.write(gitShowOutput)
306 |                 else: # multi-repo mode
307 |                     fp.write(subRepoName + '\n' + gitShowOutput)
308 |             with printLock:
309 |                 print("[+] Writing {0} Done.".format(diffFileName))
310 |         except IOError as e:
311 |             with printLock:
312 |                 print("[+] Writing {0} Error:".format(diffFileName), e)
313 | 
314 | 
315 | def main():
316 |     global info
317 | 
318 |     t1 = time.time()
319 |     init()
320 |     if info.MultimodeFlag:
321 |         for sidx, subRepoName in enumerate(info.MultiRepoList):
322 |             gitDir = os.path.join(info.GitStoragePath, info.RepoName, subRepoName) # where .git exists
323 |             commitsList = callGitLog(gitDir)
324 |             print(os.path.join(str(sidx + 1), str(len(info.MultiRepoList))))
325 |             if 0 < len(commitsList):
326 |                 process(commitsList, subRepoName)
327 |     else:
328 |         gitDir = os.path.join(info.GitStoragePath, info.RepoName) # where .git exists
329 |         commitsList = callGitLog(gitDir)
330 |         process(commitsList, None)
331 | 
332 |     repoDiffDir = os.path.join(info.DiffDir, info.RepoName)
333 |     print(str(len(os.listdir(repoDiffDir))) + " patches saved in", repoDiffDir)
334 |     print("Done. 
(" + str(time.time() - t1) + " sec)") 335 | 336 | 337 | if __name__ == '__main__': 338 | mp.freeze_support() 339 | main() 340 | -------------------------------------------------------------------------------- /src/get_source_from_cvepatch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import re 6 | import glob 7 | import argparse 8 | import multiprocessing as mp 9 | from functools import partial 10 | import platform 11 | import time 12 | # Import from parent directory 13 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 14 | try: # for backward-compatibility (in the main repository) 15 | import hmark.parseutility as parseutility 16 | except ImportError: # for subrepo 17 | import tools.parseutility as parseutility 18 | 19 | import config 20 | 21 | # GLOBALS 22 | originalDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # vuddy root directory 23 | diffDir = os.path.join(originalDir, "diff") 24 | # resultList = [] 25 | dummyFunction = parseutility.function(None) 26 | multimodeFlag = 0 27 | debugMode = False 28 | 29 | parseutility.setEnvironment("") 30 | 31 | t1 = time.time() 32 | 33 | """ re patterns """ 34 | pat_src = '[\n](?=diff --git a/)' 35 | pat_chunk = '[\n](?=@@\s[^a-zA-Z]*\s[^a-zA-Z]*\s@@)' 36 | pat_linenum = r"-(\d+,\d+) \+(\d+,\d+) " 37 | pat_linenum = re.compile(pat_linenum) 38 | 39 | 40 | def init(): 41 | # ARGUMENTS 42 | global repoName 43 | global multimodeFlag 44 | global total 45 | global debugMode 46 | 47 | parser = argparse.ArgumentParser() 48 | parser.add_argument('REPO', 49 | help='''Repository name''') 50 | parser.add_argument('-m', '--multimode', action="store_true", 51 | help='''Turn on Multimode''') 52 | parser.add_argument('-d', '--debug', action="store_true", help=argparse.SUPPRESS) # Hidden Debug Mode 53 | 54 | args = parser.parse_args() 55 | 56 | if args.REPO is None: 57 | parser.print_help() 58 | exit() 59 | repoName = args.REPO # name of the directory that holds DIFF patches 60 | if args.multimode: 61 | multimodeFlag = 1 62 | if args.debug: 63 | debugMode = True 64 | 65 | msg = "Retrieve vulnerable functions from {0}\nMulti-repo mode: ".format(repoName) 66 | if multimodeFlag: 67 | print(msg + "On") 68 | else: 69 | print(msg + "Off") 70 | 71 | # try making missing directories 72 | try: 73 | os.makedirs(os.path.join(originalDir, 'tmp')) 74 | except OSError as e: 75 | pass 76 | try: 77 | os.makedirs(os.path.join(originalDir, 'vul', repoName)) 78 | except OSError as e: 79 | pass 80 | 81 | total = len(os.listdir(os.path.join(diffDir, repoName))) 82 | 83 | 84 | def source_from_cvepatch(ctr, diffFileName): # diffFileName holds the filename of each DIFF patch 85 | # diffFileName looks like: CVE-2012-2372_7a9bc620049fed37a798f478c5699a11726b3d33.diff 86 | global repoName 87 | global debugMode 88 | global total 89 | global multimodeFlag 90 | global dummyFunction 91 | global diffDir 92 | global originalDir 93 | 94 | chunksCnt = 0 # number of DIFF patches 95 | currentCounter = 0 96 | 97 | with ctr.diffFileCntLock: 98 | currentCounter = ctr.diffFileCnt.value 99 | print(str(ctr.diffFileCnt.value + 1) + '/' + str(total)), 100 | ctr.diffFileCnt.value += 1 101 | 102 | if os.path.getsize(os.path.join(diffDir, repoName, diffFileName)) > 1000000: 103 | # don't do anything with big DIFFs (merges, upgrades, ...). 
104 | print("[-]", diffFileName, "\t(file too large)") 105 | else: 106 | diffFileNameSplitted = diffFileName.split('_') 107 | cveId = diffFileNameSplitted[0] # use only one CVEid 108 | commitHashValue = diffFileNameSplitted[-1].split('.')[0] 109 | 110 | print("[+]", diffFileName, "\t(proceed)") 111 | with open(os.path.join(diffDir, repoName, diffFileName), 'r') as fp: 112 | patchLines = ''.join(fp.readlines()) 113 | patchLinesSplitted = re.split(pat_src, patchLines) 114 | commitLog = patchLinesSplitted[0] 115 | affectedFilesList = patchLinesSplitted[1:] 116 | 117 | repoPath = '' 118 | if multimodeFlag: # multimode DIFFs have repoPath at the beginning. 119 | repoPath = commitLog.split('\n')[0].rstrip().lstrip("\xef\xbb\xbf") 120 | 121 | numAffectedFiles = len(affectedFilesList) 122 | for aidx, affectedFile in enumerate(affectedFilesList): 123 | if debugMode: 124 | print("\tFile # " + str(aidx + 1) + '/' + str(numAffectedFiles)), 125 | firstLine = affectedFile.split('\n')[0] # git --diff a/path/filename.ext b/path/filename.ext 126 | affectedFileName = firstLine.split("--git ")[1].split(" ")[0].split("/")[-1] 127 | codePath = firstLine.split(' b')[1].strip() # path/filename.ext 128 | 129 | if not codePath.endswith(".c") and not codePath.endswith(".cpp") and not codePath.endswith(".cc") and not codePath.endswith(".c++") and not codePath.endswith(".cxx"): 130 | if debugMode: 131 | print("\t[-]", codePath, "(wrong extension)") 132 | else: 133 | secondLine = affectedFile.split('\n')[1] 134 | 135 | if secondLine.startswith("index") == 0: # or secondLine.endswith("100644") == 0: 136 | if debugMode: 137 | print("\t[-]", codePath, "(invalid metadata)") # we are looking for "index" only. 138 | else: 139 | if debugMode: 140 | print("\t[+]", codePath) 141 | indexHashOld = secondLine.split(' ')[1].split('..')[0] 142 | indexHashNew = secondLine.split(' ')[1].split('..')[1] 143 | 144 | chunksList = re.split(pat_chunk, affectedFile)[1:] # diff file per chunk (in list) 145 | chunksCnt += len(chunksList) 146 | 147 | if multimodeFlag: 148 | os.chdir(os.path.join(config.gitStoragePath, repoName, repoPath)) 149 | else: 150 | os.chdir(os.path.join(config.gitStoragePath, repoName)) 151 | 152 | tmpOldFileName = os.path.join(originalDir, "tmp", "{0}_{1}_old".format(repoName, currentCounter)) 153 | command_show = "\"{0}\" show {1} > {2}".format(config.gitBinary, indexHashOld, tmpOldFileName) 154 | os.system(command_show) 155 | 156 | tmpNewFileName = os.path.join(originalDir, "tmp", "{0}_{1}_new".format(repoName, currentCounter)) 157 | command_show = "\"{0}\" show {1} > {2}".format(config.gitBinary, indexHashNew, tmpNewFileName) 158 | os.system(command_show) 159 | 160 | os.chdir(originalDir) 161 | oldFunctionInstanceList = parseutility.parseFile_shallow(tmpOldFileName, "") 162 | newFunctionInstanceList = parseutility.parseFile_shallow(tmpNewFileName, "") 163 | 164 | finalOldFunctionList = [] 165 | 166 | numChunks = len(chunksList) 167 | for ci, chunk in enumerate(chunksList): 168 | if debugMode: 169 | print("\t\tChunk # " + str(ci + 1) + "/" + str(numChunks)), 170 | 171 | chunkSplitted = chunk.split('\n') 172 | chunkFirstLine = chunkSplitted[0] 173 | chunkLines = chunkSplitted[1:] 174 | 175 | if debugMode: 176 | print(chunkFirstLine) 177 | lineNums = pat_linenum.search(chunkFirstLine) 178 | oldLines = lineNums.group(1).split(',') 179 | newLines = lineNums.group(2).split(',') 180 | 181 | offset = int(oldLines[0]) 182 | pmList = [] 183 | lnList = [] 184 | for chunkLine in chunkSplitted[1:]: 185 | if len(chunkLine) != 0: 
186 | pmList.append(chunkLine[0]) 187 | 188 | for i, pm in enumerate(pmList): 189 | if pm == ' ' or pm == '-': 190 | lnList.append(offset + i) 191 | elif pm == '+': 192 | lnList.append(offset + i - 1) 193 | offset -= 1 194 | 195 | """ HERE, ADD CHECK FOR NEW FUNCTIONS """ 196 | hitOldFunctionList = [] 197 | for f in oldFunctionInstanceList: 198 | # print f.lines[0], f.lines[1] 199 | 200 | for num in range(f.lines[0], f.lines[1] + 1): 201 | if num in lnList: 202 | # print "Hit at", num 203 | 204 | hitOldFunctionList.append(f) 205 | break # found the function to be patched 206 | 207 | # if f.lines[0] <= offset <= f.lines[1]: 208 | # print "\t\t\tOffset HIT!!", f.name 209 | # elif f.lines[0] <= bound <= f.lines[1]: 210 | # print "\t\t\tBound HIT!!", f.name 211 | 212 | for f in hitOldFunctionList: 213 | # print "Verify hitFunction", f.name 214 | # print "ln", 215 | for num in range(f.lines[0], f.lines[1] + 1): 216 | # print num, 217 | try: 218 | listIndex = lnList.index(num) 219 | except ValueError: 220 | pass 221 | else: 222 | if lnList.count(num) > 1: 223 | listIndex += 1 224 | # print "\nmatch:", num 225 | # print "value\t", chunkSplitted[1:][lnList.index(num)] 226 | # print "pm \t", pmList[lnList.index(num)] 227 | if pmList[listIndex] == '+' or pmList[listIndex] == '-': 228 | # print "Maybe meaningful", 229 | flag = 0 230 | for commentKeyword in ["/*", "*/", "//", "*"]: 231 | if chunkLines[listIndex][1:].lstrip().startswith(commentKeyword): 232 | flag = 1 233 | break 234 | if flag: 235 | pass 236 | # print "but not." 237 | else: 238 | # print "MEANINGFUL!!" 239 | finalOldFunctionList.append(f) 240 | break 241 | else: 242 | pass 243 | # print "Not meaningful" 244 | # print "============\n" 245 | 246 | finalOldFunctionList = list(set(finalOldFunctionList)) # sometimes list has dups 247 | 248 | finalNewFunctionList = [] 249 | for fold in finalOldFunctionList: 250 | flag = 0 251 | for fnew in newFunctionInstanceList: 252 | if fold.name == fnew.name: 253 | finalNewFunctionList.append(fnew) 254 | flag = 1 255 | break 256 | if not flag: 257 | finalNewFunctionList.append(dummyFunction) 258 | 259 | if debugMode: 260 | print("\t\t\t", len(finalNewFunctionList), "functions found.") 261 | vulFileNameBase = diffFileName.split('.diff')[0] + '_' + affectedFileName 262 | 263 | # os.chdir(os.path.join(originalDir, "vul", repoName)) 264 | 265 | for index, f in enumerate(finalOldFunctionList): 266 | os.chdir(originalDir) 267 | oldFuncInstance = finalOldFunctionList[index] 268 | 269 | fp = open(oldFuncInstance.parentFile, 'r') 270 | srcFileRaw = fp.readlines() 271 | fp.close() 272 | finalOldFunction = ''.join(srcFileRaw[oldFuncInstance.lines[0]-1:oldFuncInstance.lines[1]]) 273 | 274 | # oldFuncArgs = '' 275 | # for ai, funcArg in enumerate(oldFuncInstance.parameterList): 276 | # oldFuncArgs += "DTYPE " + funcArg 277 | # if ai + 1 != len(oldFuncInstance.parameterList): 278 | # oldFuncArgs += ', ' 279 | # finalOldFunction = "DTYPE {0} ({1})\n{{ {2}\n}}"\ 280 | # .format(oldFuncInstance.name, oldFuncArgs, oldFuncInstance.funcBody) 281 | 282 | finalOldFuncId = str(oldFuncInstance.funcId) 283 | 284 | newFuncInstance = finalNewFunctionList[index] 285 | 286 | if newFuncInstance.name is None: 287 | finalNewFunction = "" 288 | else: 289 | fp = open(newFuncInstance.parentFile, 'r') 290 | srcFileRaw = fp.readlines() 291 | fp.close() 292 | finalNewFunction = ''.join(srcFileRaw[newFuncInstance.lines[0]-1:newFuncInstance.lines[1]]) 293 | 294 | # finalNewFunction = finalNewFunctionList[index].funcBody 295 | 296 | 
finalOldBody = finalOldFunction[finalOldFunction.find('{')+1:finalOldFunction.rfind('}')] 297 | finalNewBody = finalNewFunction[finalNewFunction.find('{')+1:finalNewFunction.rfind('}')] 298 | tmpold = parseutility.normalize(parseutility.removeComment(finalOldBody)) 299 | tmpnew = parseutility.normalize(parseutility.removeComment(finalNewBody)) 300 | 301 | if tmpold != tmpnew and len(tmpnew) > 0: 302 | # if two are same, it means nothing but comment is patched. 303 | with ctr.functionCntLock: 304 | ctr.functionCnt.value += 1 305 | os.chdir(os.path.join(originalDir, "vul", repoName)) 306 | vulOldFileName = vulFileNameBase + '_' + finalOldFuncId + "_OLD.vul" 307 | vulNewFileName = vulFileNameBase + '_' + finalOldFuncId + "_NEW.vul" 308 | with open(vulOldFileName, 'w') as fp: 309 | fp.write(finalOldFunction) 310 | with open(vulNewFileName, 'w') as fp: 311 | if finalNewFunctionList[index].name is not None: 312 | fp.write(finalNewFunction) 313 | else: 314 | fp.write("") 315 | diffCommand = "\"{0}\" -u {1} {2} > {3}_{4}.patch".format(config.diffBinary, 316 | vulOldFileName, 317 | vulNewFileName, 318 | vulFileNameBase, 319 | finalOldFuncId) 320 | os.system(diffCommand) 321 | 322 | 323 | def main(): 324 | 325 | ctr = Counter() 326 | diffList = os.listdir(os.path.join(diffDir, repoName)) 327 | if debugMode or "Windows" in platform.platform(): 328 | # Windows - do not use multiprocessing 329 | # Using multiprocessing will lower performance 330 | for diffFile in diffList: 331 | source_from_cvepatch(ctr, diffFile) 332 | else: # POSIX - use multiprocessing 333 | pool = mp.Pool() 334 | parallel_partial = partial(source_from_cvepatch, ctr) 335 | pool.map(parallel_partial, diffList) 336 | pool.close() 337 | pool.join() 338 | 339 | # delete temp source files 340 | wildcard_temp = os.path.join(originalDir, "tmp", repoName + "_*") 341 | for f in glob.glob(wildcard_temp): 342 | os.remove(f) 343 | 344 | print("") 345 | print("Done getting vulnerable functions from", repoName) 346 | #print "Reconstructed", len( 347 | # os.listdir(os.path.join(originalDir, 'vul', repoName))), "vulnerable functions from", diffFileCnt.value, "patches." 
348 | print("Reconstructed", ctr.functionCnt.value, "vulnerable functions from", ctr.diffFileCnt.value, "patches.") 349 | print("Elapsed: %.2f sec" % (time.time()-t1)) 350 | 351 | 352 | if __name__ == "__main__": 353 | mp.freeze_support() 354 | class Counter: 355 | diffFileCnt = mp.Value('i', 0) 356 | diffFileCntLock = mp.Manager().Lock() 357 | functionCnt = mp.Value('i', 0) 358 | functionCntLock = mp.Manager().Lock() 359 | init() 360 | main() 361 | -------------------------------------------------------------------------------- /src/repo_updater.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import urllib2 5 | import git 6 | import json 7 | import logging 8 | 9 | 10 | def is_git_repo(path): 11 | try: 12 | _ = git.Repo(path).git_dir 13 | return True 14 | except git.exc.InvalidGitRepositoryError: 15 | return False 16 | 17 | logger = logging.getLogger(__name__) 18 | logger.setLevel(logging.DEBUG) 19 | formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") 20 | file_handler = logging.FileHandler("repo_updater.log") 21 | file_handler.setFormatter(formatter) 22 | stream_handler = logging.StreamHandler() 23 | logger.addHandler(file_handler) 24 | logger.addHandler(stream_handler) 25 | 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('REPO', help='''Repository name''') 28 | 29 | args = parser.parse_args() 30 | 31 | if args.REPO is None: 32 | parser.print_help() 33 | exit() 34 | 35 | cwd = os.getcwd() 36 | repo_name = args.REPO 37 | git_dir = "/home/whiteboxDB/gitrepos" 38 | 39 | if repo_name.lower() == "android": 40 | url_base = "https://android.googlesource.com/" 41 | url_list = url_base + "?format=TEXT" 42 | response = urllib2.urlopen(url_list) 43 | repo_list = response.read().rstrip().split("\n") 44 | repo_base = os.path.join(git_dir, repo_name) 45 | elif repo_name.lower() == "chromium": 46 | url_base = "https://chromium.googlesource.com/" 47 | url_list = url_base + "?format=TEXT" 48 | response = urllib2.urlopen(url_list) 49 | repo_list = response.read().rstrip().split("\n") 50 | repo_base = os.path.join(git_dir, repo_name) 51 | 52 | if not os.path.isdir(repo_base): 53 | os.mkdir(repo_base) 54 | 55 | for ri, repo in enumerate(repo_list): 56 | target_dir = os.path.join(repo_base, repo) 57 | infostr = str(ri+1) + "/" + str(len(repo_list)) + "\t" + repo 58 | if os.path.isdir(target_dir) and is_git_repo(target_dir): 59 | infostr += " EXISTS (PULL)" 60 | logger.info(infostr) 61 | os.chdir(target_dir) 62 | os.system("git pull") 63 | os.chdir(cwd) 64 | else: 65 | infostr += " DOESN'T EXIST (CLONE)" 66 | logger.info(infostr) 67 | os.system("git clone {0}{1} {2}".format(url_base, repo, target_dir)) 68 | 69 | 70 | -------------------------------------------------------------------------------- /src/vul_dup_remover.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import hashlib 6 | 7 | # Import from parent directory 8 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 9 | try: 10 | from hmark.parseutility import normalize 11 | except ImportError: 12 | from tools.parseutility import normalize 13 | 14 | hashdict = {} 15 | cntdict = {} 16 | vulcntlist = [] 17 | repolist = [] 18 | 19 | originalDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # vuddy root directory 20 | vulsDir = os.path.join(originalDir, "vul") 21 | dirs = os.listdir(vulsDir) 
22 | dirs.sort() 23 | os.chdir(vulsDir) 24 | for d in dirs: 25 | if os.path.isdir(d): 26 | repolist.append(d) 27 | cntdict[d] = 0 28 | # print d 29 | # print repolist 30 | vulcntlist.append(len(os.listdir(d))) 31 | # print vulcntlist 32 | for vul in os.listdir(d): 33 | if vul.endswith("OLD.vul"): 34 | with open(os.path.join(d, vul), "r") as fp: 35 | text = '\n'.join(fp.readlines()) 36 | #text = normalize(text) 37 | text = normalize(text).encode('utf-8') 38 | checksum = hashlib.md5(text).hexdigest() 39 | try: 40 | hashdict[checksum].append(d + ' ' + vul) 41 | except: 42 | hashdict[checksum] = [d + ' ' + vul] 43 | 44 | cnt = 0 45 | 46 | for key in hashdict: 47 | if len(hashdict[key]) > 1: 48 | for vul in hashdict[key][1:]: 49 | cnt += 1 50 | repo = vul.split(' ')[0] 51 | rest = vul.split(' ')[1] 52 | base = rest[:-8] 53 | cntdict[repo] += 1 54 | os.remove(os.path.join(repo, rest)) 55 | try: 56 | os.remove(os.path.join(repo, base + "_NEW.vul")) 57 | os.remove(os.path.join(repo, base + ".patch")) 58 | except: 59 | pass 60 | 61 | print("[RESULT]") 62 | for idx, r in enumerate(repolist): 63 | print('\t' + r + ":\tdeleted " + str(cntdict[r]) + " duplicate files from " + str(vulcntlist[idx]) + " files.") 64 | 65 | print("Total:", cnt, "duplicate files.") 66 | -------------------------------------------------------------------------------- /src/vul_hidx_generator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import hashlib 6 | import time 7 | import argparse 8 | import multiprocessing as mp 9 | from functools import partial 10 | # Import from parent directory 11 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 12 | try: 13 | import hmark.parseutility as parser 14 | except ImportError: 15 | import tools.parseutility as parser 16 | import config 17 | 18 | 19 | def parse_function(absLvl, srcFile): 20 | if absLvl == 0: 21 | functionInstanceList = parser.parseFile_shallow(srcFile, "") 22 | return (srcFile, functionInstanceList, None) 23 | elif absLvl == 4: 24 | functionInstanceList = parser.parseFile_deep(srcFile, "") 25 | # Some lines below are added by Squizz on Jan 16, for FP reduction! 
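        # Parse the patched (NEW.vul) counterpart as well; functions whose abstracted bodies are
        # identical in the OLD and NEW versions are skipped later (the hashValue == hashValueNew
        # check), which removes false-positive hash entries.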
26 | functionInstanceList_New = parser.parseFile_deep(srcFile.replace("OLD.vul", "NEW.vul"), "") 27 | return (srcFile, functionInstanceList, functionInstanceList_New) 28 | 29 | 30 | def main(): 31 | originalDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # vuddy root directory 32 | vulsDir = os.path.join(originalDir, "vul") 33 | 34 | arg_parser = argparse.ArgumentParser() 35 | arg_parser.add_argument('REPO', 36 | help='''Repository name''') 37 | arg_parser.add_argument('-a', '--abstract-level', required=True, type=int, nargs=1, choices=[0, 4], 38 | help='''Abstract Level''') 39 | 40 | args = arg_parser.parse_args() 41 | 42 | projName = args.REPO 43 | intendedAbsLvl = 4 44 | if args.abstract_level: 45 | intendedAbsLvl = args.abstract_level[0] 46 | 47 | projDictList = [] 48 | hashFileMapList = [] 49 | for i in range(0, 5): 50 | projDictList.append({}) 51 | hashFileMapList.append({}) 52 | 53 | print("loading source"), 54 | srcFileList = parser.loadVul(os.path.join(vulsDir, projName)) 55 | print("(done)") 56 | 57 | time0 = time.time() 58 | 59 | numFiles = len(srcFileList) 60 | numFuncs = 0 61 | numLines = 0 62 | 63 | pool = mp.Pool() 64 | func = partial(parse_function, intendedAbsLvl) 65 | for srcFileIdx, returnTuple in enumerate(pool.imap(func, srcFileList)): 66 | srcFile = returnTuple[0] 67 | functionInstanceList = returnTuple[1] 68 | functionInstanceList_New = returnTuple[2] 69 | 70 | print(srcFileIdx + 1, '/', len(srcFileList), srcFile) 71 | numFuncs += len(functionInstanceList) 72 | if len(functionInstanceList) > 0: 73 | numLines += functionInstanceList[0].parentNumLoc 74 | 75 | for fi, f in enumerate(functionInstanceList): 76 | f.removeListDup() 77 | path = f.parentFile 78 | path = "." + path[f.parentFile.find("/vul/"):] 79 | absBody = parser.abstract(f, intendedAbsLvl)[1] 80 | #absBody = parser.normalize(absBody).encode('utf-8') 81 | absBody = parser.normalize(absBody) 82 | # print absBody 83 | funcLen = len(absBody) 84 | # print funcLen, absBody 85 | # print len(absBody) 86 | hashValue = hashlib.md5(absBody).hexdigest() 87 | 88 | if intendedAbsLvl == 4 and len(functionInstanceList_New) > 0: 89 | fnew = functionInstanceList_New[fi] 90 | fnew.removeListDup() 91 | absBodyNew = parser.abstract(fnew, intendedAbsLvl)[1] 92 | absBodyNew = parser.normalize(absBodyNew) 93 | hashValueNew = hashlib.md5(absBodyNew).hexdigest() 94 | 95 | if hashValue == hashValueNew: 96 | # if abstract bodies of old and new func are identical, 97 | # don't create hash index 98 | continue 99 | 100 | try: 101 | projDictList[intendedAbsLvl][funcLen].append(hashValue) 102 | except KeyError: 103 | projDictList[intendedAbsLvl][funcLen] = [hashValue] 104 | 105 | try: 106 | hashFileMapList[intendedAbsLvl][hashValue].extend([path, f.funcId]) 107 | except KeyError: 108 | hashFileMapList[intendedAbsLvl][hashValue] = [path, f.funcId] 109 | 110 | pool.close() 111 | pool.join() 112 | 113 | packageInfo = config.version + ' ' + str(projName) + ' ' + str(numFiles) + ' ' + str(numFuncs) + ' ' + str( 114 | numLines) + '\n' 115 | hidxDir = os.path.join(originalDir, "hidx") 116 | if os.path.exists(hidxDir) is False: 117 | os.makedirs(hidxDir) 118 | hidxFile = os.path.join(hidxDir, "hashmark_{0}_{1}.hidx".format(intendedAbsLvl, projName)) 119 | with open(hidxFile, 'w') as fp: 120 | fp.write(packageInfo) 121 | for key in sorted(projDictList[intendedAbsLvl]): 122 | fp.write(str(key) + '\t') 123 | for h in list(set(projDictList[intendedAbsLvl][key])): 124 | fp.write(h + '\t') 125 | fp.write('\n') 126 | 127 | 
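        # A '=====' separator splits the .hidx file: the length -> hash table written above,
        # and the hash -> [path, funcId] map written below.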
fp.write('\n=====\n') 128 | 129 | for key in sorted(hashFileMapList[intendedAbsLvl]): 130 | fp.write(str(key) + '\t') 131 | for f in hashFileMapList[intendedAbsLvl][key]: 132 | fp.write(str(f) + '\t') 133 | fp.write('\n') 134 | 135 | print("Hash index saved to:", os.path.join(originalDir, "hidx", "hashmark_{0}_{1}.hidx".format(intendedAbsLvl, projName))) 136 | time1 = time.time() 137 | print("Elapsed time:", time1 - time0) 138 | 139 | 140 | if __name__ == "__main__": 141 | mp.freeze_support() 142 | main() 143 | -------------------------------------------------------------------------------- /src/vul_verifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | # Import from parent directory 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 7 | try: 8 | import hmark.parseutility as pu 9 | except ImportError: 10 | import tools.parseutility as pu 11 | 12 | 13 | def getBody(original): 14 | return original[original.find('{')+1:original.rfind('}')] 15 | 16 | originalDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # vuddy root directory 17 | vulsDir = os.path.join(originalDir, "vul") 18 | dirs = os.listdir(vulsDir) 19 | rmcntDict = {} 20 | for dir in dirs: 21 | # print dir 22 | if dir != "chromium": 23 | continue 24 | for vul in os.listdir(os.path.join(vulsDir, dir)): 25 | if vul.endswith("OLD.vul"): 26 | with open(os.path.join(vulsDir, dir, vul), "r") as fp: 27 | raw = ''.join(fp.readlines()) 28 | body = getBody(pu.removeComment(raw)) 29 | 30 | if body.count(";") == 1: 31 | kill = 1 # this function must be single-line 32 | else: 33 | kill = 0 34 | 35 | cnt = 0 36 | for line in body.split('\n'): 37 | if len(line.strip()) > 0: 38 | cnt += 1 # cnt will be 1 for single lined functions 39 | 40 | with open(os.path.join(vulsDir, dir, vul[:-8] + "_NEW.vul"), 'r') as fp: 41 | newraw = ''.join(fp.readlines()) 42 | newbody = getBody(pu.removeComment(newraw)) 43 | 44 | if kill == 1 or cnt == 1 or pu.normalize(body) == pu.normalize(newbody) or len(newraw) == 0: 45 | vulBase = vul[:-8] 46 | os.remove(os.path.join(vulsDir, dir, vulBase + "_OLD.vul")) 47 | os.remove(os.path.join(vulsDir, dir, vulBase + "_NEW.vul")) 48 | os.remove(os.path.join(vulsDir, dir, vulBase + ".patch")) 49 | try: 50 | rmcntDict[dir] += 1 51 | except: 52 | rmcntDict[dir] = 1 53 | 54 | for dir in rmcntDict: 55 | print("removed", rmcntDict[dir], "FP records from", dir) 56 | -------------------------------------------------------------------------------- /testcode/async.c: -------------------------------------------------------------------------------- 1 | /* 2 | * async.c: Asynchronous function calls for boot performance 3 | * 4 | * (C) Copyright 2009 Intel Corporation 5 | * Author: Arjan van de Ven 6 | * 7 | * This program is free software; you can redistribute it and/or 8 | * modify it under the terms of the GNU General Public License 9 | * as published by the Free Software Foundation; version 2 10 | * of the License. 11 | */ 12 | 13 | 14 | /* 15 | 16 | Goals and Theory of Operation 17 | 18 | The primary goal of this feature is to reduce the kernel boot time, 19 | by doing various independent hardware delays and discovery operations 20 | decoupled and not strictly serialized. 
21 | 22 | More specifically, the asynchronous function call concept allows 23 | certain operations (primarily during system boot) to happen 24 | asynchronously, out of order, while these operations still 25 | have their externally visible parts happen sequentially and in-order. 26 | (not unlike how out-of-order CPUs retire their instructions in order) 27 | 28 | Key to the asynchronous function call implementation is the concept of 29 | a "sequence cookie" (which, although it has an abstracted type, can be 30 | thought of as a monotonically incrementing number). 31 | 32 | The async core will assign each scheduled event such a sequence cookie and 33 | pass this to the called functions. 34 | 35 | The asynchronously called function should before doing a globally visible 36 | operation, such as registering device numbers, call the 37 | async_synchronize_cookie() function and pass in its own cookie. The 38 | async_synchronize_cookie() function will make sure that all asynchronous 39 | operations that were scheduled prior to the operation corresponding with the 40 | cookie have completed. 41 | 42 | Subsystem/driver initialization code that scheduled asynchronous probe 43 | functions, but which shares global resources with other drivers/subsystems 44 | that do not use the asynchronous call feature, need to do a full 45 | synchronization with the async_synchronize_full() function, before returning 46 | from their init function. This is to maintain strict ordering between the 47 | asynchronous and synchronous parts of the kernel. 48 | 49 | */ 50 | 51 | #include 52 | #include 53 | #include 54 | #include 55 | #include 56 | #include 57 | #include 58 | #include 59 | 60 | #include "workqueue_internal.h" 61 | 62 | static async_cookie_t next_cookie = 1; 63 | 64 | #define MAX_WORK 32768 65 | #define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */ 66 | 67 | static LIST_HEAD(async_global_pending); /* pending from all registered doms */ 68 | static ASYNC_DOMAIN(async_dfl_domain); 69 | static DEFINE_SPINLOCK(async_lock); 70 | 71 | struct async_entry { 72 | struct list_head domain_list; 73 | struct list_head global_list; 74 | struct work_struct work; 75 | async_cookie_t cookie; 76 | async_func_t func; 77 | void *data; 78 | struct async_domain *domain; 79 | }; 80 | 81 | static DECLARE_WAIT_QUEUE_HEAD(async_done); 82 | 83 | static atomic_t entry_count; 84 | 85 | static async_cookie_t lowest_in_progress(struct async_domain *domain) 86 | { 87 | struct list_head *pending; 88 | async_cookie_t ret = ASYNC_COOKIE_MAX; 89 | unsigned long flags; 90 | 91 | spin_lock_irqsave(&async_lock, flags); 92 | 93 | if (domain) 94 | pending = &domain->pending; 95 | else 96 | pending = &async_global_pending; 97 | 98 | if (!list_empty(pending)) 99 | ret = list_first_entry(pending, struct async_entry, 100 | domain_list)->cookie; 101 | 102 | spin_unlock_irqrestore(&async_lock, flags); 103 | return ret; 104 | } 105 | 106 | /* 107 | * pick the first pending entry and run it 108 | */ 109 | static void async_run_entry_fn(struct work_struct *work) 110 | { 111 | struct async_entry *entry = 112 | container_of(work, struct async_entry, work); 113 | unsigned long flags; 114 | ktime_t uninitialized_var(calltime), delta, rettime; 115 | 116 | /* 1) run (and print duration) */ 117 | if (initcall_debug && system_state == SYSTEM_BOOTING) { 118 | pr_debug("calling %lli_%pF @ %i\n", 119 | (long long)entry->cookie, 120 | entry->func, task_pid_nr(current)); 121 | calltime = ktime_get(); 122 | } 123 | entry->func(entry->data, entry->cookie); 124 | if 
(initcall_debug && system_state == SYSTEM_BOOTING) { 125 | rettime = ktime_get(); 126 | delta = ktime_sub(rettime, calltime); 127 | pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", 128 | (long long)entry->cookie, 129 | entry->func, 130 | (long long)ktime_to_ns(delta) >> 10); 131 | } 132 | 133 | /* 2) remove self from the pending queues */ 134 | spin_lock_irqsave(&async_lock, flags); 135 | list_del_init(&entry->domain_list); 136 | list_del_init(&entry->global_list); 137 | 138 | /* 3) free the entry */ 139 | kfree(entry); 140 | atomic_dec(&entry_count); 141 | 142 | spin_unlock_irqrestore(&async_lock, flags); 143 | 144 | /* 4) wake up any waiters */ 145 | wake_up(&async_done); 146 | } 147 | 148 | static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain) 149 | { 150 | struct async_entry *entry; 151 | unsigned long flags; 152 | async_cookie_t newcookie; 153 | 154 | /* allow irq-off callers */ 155 | entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); 156 | 157 | /* 158 | * If we're out of memory or if there's too much work 159 | * pending already, we execute synchronously. 160 | */ 161 | if (!entry || atomic_read(&entry_count) > MAX_WORK) { 162 | kfree(entry); 163 | spin_lock_irqsave(&async_lock, flags); 164 | newcookie = next_cookie++; 165 | spin_unlock_irqrestore(&async_lock, flags); 166 | 167 | /* low on memory.. run synchronously */ 168 | func(data, newcookie); 169 | return newcookie; 170 | } 171 | INIT_LIST_HEAD(&entry->domain_list); 172 | INIT_LIST_HEAD(&entry->global_list); 173 | INIT_WORK(&entry->work, async_run_entry_fn); 174 | entry->func = func; 175 | entry->data = data; 176 | entry->domain = domain; 177 | 178 | spin_lock_irqsave(&async_lock, flags); 179 | 180 | /* allocate cookie and queue */ 181 | newcookie = entry->cookie = next_cookie++; 182 | 183 | list_add_tail(&entry->domain_list, &domain->pending); 184 | if (domain->registered) 185 | list_add_tail(&entry->global_list, &async_global_pending); 186 | 187 | atomic_inc(&entry_count); 188 | spin_unlock_irqrestore(&async_lock, flags); 189 | 190 | /* mark that this task has queued an async job, used by module init */ 191 | current->flags |= PF_USED_ASYNC; 192 | 193 | /* schedule for execution */ 194 | queue_work(system_unbound_wq, &entry->work); 195 | 196 | return newcookie; 197 | } 198 | 199 | /** 200 | * async_schedule - schedule a function for asynchronous execution 201 | * @func: function to execute asynchronously 202 | * @data: data pointer to pass to the function 203 | * 204 | * Returns an async_cookie_t that may be used for checkpointing later. 205 | * Note: This function may be called from atomic or non-atomic contexts. 206 | */ 207 | async_cookie_t async_schedule(async_func_t func, void *data) 208 | { 209 | return __async_schedule(func, data, &async_dfl_domain); 210 | } 211 | EXPORT_SYMBOL_GPL(async_schedule); 212 | 213 | /** 214 | * async_schedule_domain - schedule a function for asynchronous execution within a certain domain 215 | * @func: function to execute asynchronously 216 | * @data: data pointer to pass to the function 217 | * @domain: the domain 218 | * 219 | * Returns an async_cookie_t that may be used for checkpointing later. 220 | * @domain may be used in the async_synchronize_*_domain() functions to 221 | * wait within a certain synchronization domain rather than globally. A 222 | * synchronization domain is specified via @domain. Note: This function 223 | * may be called from atomic or non-atomic contexts. 
224 | */ 225 | async_cookie_t async_schedule_domain(async_func_t func, void *data, 226 | struct async_domain *domain) 227 | { 228 | return __async_schedule(func, data, domain); 229 | } 230 | EXPORT_SYMBOL_GPL(async_schedule_domain); 231 | 232 | /** 233 | * async_synchronize_full - synchronize all asynchronous function calls 234 | * 235 | * This function waits until all asynchronous function calls have been done. 236 | */ 237 | void async_synchronize_full(void) 238 | { 239 | async_synchronize_full_domain(NULL); 240 | } 241 | EXPORT_SYMBOL_GPL(async_synchronize_full); 242 | 243 | /** 244 | * async_unregister_domain - ensure no more anonymous waiters on this domain 245 | * @domain: idle domain to flush out of any async_synchronize_full instances 246 | * 247 | * async_synchronize_{cookie|full}_domain() are not flushed since callers 248 | * of these routines should know the lifetime of @domain 249 | * 250 | * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing 251 | */ 252 | void async_unregister_domain(struct async_domain *domain) 253 | { 254 | spin_lock_irq(&async_lock); 255 | WARN_ON(!domain->registered || !list_empty(&domain->pending)); 256 | domain->registered = 0; 257 | spin_unlock_irq(&async_lock); 258 | } 259 | EXPORT_SYMBOL_GPL(async_unregister_domain); 260 | 261 | /** 262 | * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain 263 | * @domain: the domain to synchronize 264 | * 265 | * This function waits until all asynchronous function calls for the 266 | * synchronization domain specified by @domain have been done. 267 | */ 268 | void async_synchronize_full_domain(struct async_domain *domain) 269 | { 270 | async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain); 271 | } 272 | EXPORT_SYMBOL_GPL(async_synchronize_full_domain); 273 | 274 | /** 275 | * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing 276 | * @cookie: async_cookie_t to use as checkpoint 277 | * @domain: the domain to synchronize (%NULL for all registered domains) 278 | * 279 | * This function waits until all asynchronous function calls for the 280 | * synchronization domain specified by @domain submitted prior to @cookie 281 | * have been done. 282 | */ 283 | void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain) 284 | { 285 | ktime_t uninitialized_var(starttime), delta, endtime; 286 | 287 | if (initcall_debug && system_state == SYSTEM_BOOTING) { 288 | pr_debug("async_waiting @ %i\n", task_pid_nr(current)); 289 | starttime = ktime_get(); 290 | } 291 | 292 | wait_event(async_done, lowest_in_progress(domain) >= cookie); 293 | 294 | if (initcall_debug && system_state == SYSTEM_BOOTING) { 295 | endtime = ktime_get(); 296 | delta = ktime_sub(endtime, starttime); 297 | 298 | pr_debug("async_continuing @ %i after %lli usec\n", 299 | task_pid_nr(current), 300 | (long long)ktime_to_ns(delta) >> 10); 301 | } 302 | } 303 | EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain); 304 | 305 | /** 306 | * async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing 307 | * @cookie: async_cookie_t to use as checkpoint 308 | * 309 | * This function waits until all asynchronous function calls prior to @cookie 310 | * have been done. 
311 | */ 312 | void async_synchronize_cookie(async_cookie_t cookie) 313 | { 314 | async_synchronize_cookie_domain(cookie, &async_dfl_domain); 315 | } 316 | EXPORT_SYMBOL_GPL(async_synchronize_cookie); 317 | 318 | /** 319 | * current_is_async - is %current an async worker task? 320 | * 321 | * Returns %true if %current is an async worker task. 322 | */ 323 | bool current_is_async(void) 324 | { 325 | struct worker *worker = current_wq_worker(); 326 | 327 | return worker && worker->current_func == async_run_entry_fn; 328 | } 329 | -------------------------------------------------------------------------------- /testcode/configs.c: -------------------------------------------------------------------------------- 1 | /* 2 | * kernel/configs.c 3 | * Echo the kernel .config file used to build the kernel 4 | * 5 | * Copyright (C) 2002 Khalid Aziz 6 | * Copyright (C) 2002 Randy Dunlap 7 | * Copyright (C) 2002 Al Stone 8 | * Copyright (C) 2002 Hewlett-Packard Company 9 | * 10 | * This program is free software; you can redistribute it and/or modify 11 | * it under the terms of the GNU General Public License as published by 12 | * the Free Software Foundation; either version 2 of the License, or (at 13 | * your option) any later version. 14 | * 15 | * This program is distributed in the hope that it will be useful, but 16 | * WITHOUT ANY WARRANTY; without even the implied warranty of 17 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or 18 | * NON INFRINGEMENT. See the GNU General Public License for more 19 | * details. 20 | * 21 | * You should have received a copy of the GNU General Public License 22 | * along with this program; if not, write to the Free Software 23 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 24 | */ 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | /**************************************************/ 34 | /* the actual current config file */ 35 | 36 | /* 37 | * Define kernel_config_data and kernel_config_data_size, which contains the 38 | * wrapped and compressed configuration file. 
The file is first compressed 39 | * with gzip and then bounded by two eight byte magic numbers to allow 40 | * extraction from a binary kernel image: 41 | * 42 | * IKCFG_ST 43 | * 44 | * IKCFG_ED 45 | */ 46 | #define MAGIC_START "IKCFG_ST" 47 | #define MAGIC_END "IKCFG_ED" 48 | #include "config_data.h" 49 | 50 | 51 | #define MAGIC_SIZE (sizeof(MAGIC_START) - 1) 52 | #define kernel_config_data_size \ 53 | (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2) 54 | 55 | #ifdef CONFIG_IKCONFIG_PROC 56 | 57 | static ssize_t 58 | ikconfig_read_current(struct file *file, char __user *buf, 59 | size_t len, loff_t * offset) 60 | { 61 | return simple_read_from_buffer(buf, len, offset, 62 | kernel_config_data + MAGIC_SIZE, 63 | kernel_config_data_size); 64 | } 65 | 66 | static const struct file_operations ikconfig_file_ops = { 67 | .owner = THIS_MODULE, 68 | .read = ikconfig_read_current, 69 | .llseek = default_llseek, 70 | }; 71 | 72 | static int __init ikconfig_init(void) 73 | { 74 | struct proc_dir_entry *entry; 75 | 76 | /* create the current config file */ 77 | entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL, 78 | &ikconfig_file_ops); 79 | if (!entry) 80 | return -ENOMEM; 81 | 82 | proc_set_size(entry, kernel_config_data_size); 83 | 84 | return 0; 85 | } 86 | 87 | static void __exit ikconfig_cleanup(void) 88 | { 89 | remove_proc_entry("config.gz", NULL); 90 | } 91 | 92 | module_init(ikconfig_init); 93 | module_exit(ikconfig_cleanup); 94 | 95 | #endif /* CONFIG_IKCONFIG_PROC */ 96 | 97 | MODULE_LICENSE("GPL"); 98 | MODULE_AUTHOR("Randy Dunlap"); 99 | MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel"); 100 | -------------------------------------------------------------------------------- /testcode/wrong_case.c: -------------------------------------------------------------------------------- 1 | void module_layout(struct module *mod, 2 | struct modversion_info *ver, 3 | struct kernel_param *kp, 4 | struct kernel_symbol *ks, 5 | struct tracepoint * const *tp) // cannot identify "struct tracepoint * const *tp" as function argument 6 | { 7 | } 8 | 9 | void func(void) 10 | { 11 | int a0[10]; 12 | int a1[10][10]; // cannot identify two-dimensional array (multi-dimensional array?) 13 | return; 14 | } 15 | -------------------------------------------------------------------------------- /tools/FuncParser-opt.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/squizz617/vuddy/33cdab1ad04a6dcc76011b92821dbeb055c6691e/tools/FuncParser-opt.jar -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/squizz617/vuddy/33cdab1ad04a6dcc76011b92821dbeb055c6691e/tools/__init__.py -------------------------------------------------------------------------------- /tools/cvedatagen/README.md: -------------------------------------------------------------------------------- 1 | # NVD CVE crawler 2 | 3 | ## Modules 4 | File Name | Description 5 | --------------------|------------ 6 | cveXmlDownloader.py | Downloads XML files from NVD 7 | cveXmlParser.py | Parses and generates cvedata.pkl 8 | cveXmlUpdater.py | Downloads updated records from NVD and updates cvedata.pkl 9 | 10 | ## How to use 11 | 1. Running for the first time 12 | * Run `cveXmlDownloader.py`, `cveXmlParser.py`, and `cveXmlUpdater.py` in a row. 13 | 14 | 2. 
Later use 15 | * If you have already generated cvedata.pkl through past runs, run cveXmlUpdater.py for updates. 16 | 17 | -------------------------------------------------------------------------------- /tools/cvedatagen/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/squizz617/vuddy/33cdab1ad04a6dcc76011b92821dbeb055c6691e/tools/cvedatagen/__init__.py -------------------------------------------------------------------------------- /tools/cvedatagen/common.py: -------------------------------------------------------------------------------- 1 | #import urllib2 2 | from urllib import request 3 | import sys 4 | from zipfile import ZipFile 5 | from xml.etree.ElementTree import parse 6 | import json 7 | import os 8 | 9 | 10 | def download_url(url, fileName): 11 | #u = urllib2.urlopen(url) 12 | u = request.urlopen(url) 13 | f = open(fileName, "wb") 14 | meta = u.info() 15 | #fileSize = int(meta.getheaders("Content-Length")[0]) 16 | fileSize = int(meta.get_all("Content-Length")[0]) 17 | print("Downloading: %s (%s bytes)" % (fileName, fileSize)) 18 | 19 | downloadedSize = 0 20 | blockSize = 8192 21 | barSize = 30 22 | while True: 23 | buffer = u.read(blockSize) 24 | if not buffer: 25 | break 26 | 27 | downloadedSize += len(buffer) 28 | f.write(buffer) 29 | status = "\r" 30 | #status += "#" * (downloadedSize * barSize / fileSize) 31 | #status += " " * (barSize - downloadedSize * barSize / fileSize) 32 | #status += "%10d [%3.2f%%]" % (downloadedSize, downloadedSize * 100. / fileSize) 33 | status += "#" * (downloadedSize * barSize // fileSize) 34 | status += " " * (barSize - downloadedSize * barSize // fileSize) 35 | status += "%10d [%3.2f%%]" % (downloadedSize, downloadedSize * 100. // fileSize) 36 | # status += chr(8)*(len(status)+1) 37 | sys.stdout.write(status) 38 | sys.stdout.flush() 39 | 40 | sys.stdout.write("\n") 41 | f.close() 42 | 43 | 44 | def unzip(fileName): 45 | print("Extracting: " + fileName), 46 | zip = ZipFile(fileName) 47 | zip.extractall() 48 | zip.close() 49 | print(" [DONE]") 50 | 51 | 52 | def parse_xml(xmlFile): 53 | print("Processing: " + xmlFile), 54 | if not xmlFile.endswith(".json"): 55 | return {} 56 | 57 | update_count = 0 58 | new_count = 0 59 | subDict = {} 60 | cveid = "" 61 | cvss = "" 62 | cweid = "" 63 | reference = [] 64 | summary = "" 65 | 66 | #with open(xmlFile) as f: 67 | with open(xmlFile, 'r', encoding='utf-8') as f: 68 | json_obj = json.load(f) 69 | 70 | cve_dict = json_obj["CVE_Items"] 71 | for cve in cve_dict: 72 | cveid = cve["cve"]["CVE_data_meta"]["ID"] 73 | try: 74 | cweid = cve["cve"]["problemtype"]["problemtype_data"][0]["description"][0]["value"] 75 | except: 76 | cweid = "CWE-000" 77 | 78 | try: 79 | cvss = cve["impact"]["baseMetricV2"]["cvssV2"]["baseScore"] 80 | except: 81 | cvss = "0.0" 82 | 83 | if cveid in subDict: 84 | update_count += 1 85 | else: 86 | new_count += 1 87 | 88 | subDict[cveid] = [cvss, cweid, reference, summary] 89 | 90 | print("[Updated %s records, added %s new records]" % (update_count, new_count)) 91 | return subDict 92 | -------------------------------------------------------------------------------- /tools/cvedatagen/cveXmlDownloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Download and store NVD's CVE data in XML. 4 | See https://nvd.nist.gov/vuln/data-feeds#CVE_FEED for information. 
5 | """ 6 | 7 | import os 8 | import datetime 9 | import common 10 | originalDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 11 | 12 | 13 | def process(): 14 | DLDir = os.path.join(originalDir, "data", "CVEXML") 15 | 16 | try: 17 | os.makedirs(DLDir) 18 | except OSError: 19 | pass 20 | 21 | # NVD's XML Vulnerability Feeds have been deprecated. Use JSON instead.. 22 | # https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2002.json.zip 23 | urlBase = "https://nvd.nist.gov/feeds/json/cve/1.1/" 24 | 25 | os.chdir(DLDir) 26 | 27 | for year in range(2002, datetime.datetime.now().year + 1): 28 | fileName = "nvdcve-1.1-{0}.json.zip".format(year) 29 | url = urlBase + fileName 30 | 31 | common.download_url(url, fileName) 32 | common.unzip(fileName) 33 | os.remove(fileName) 34 | 35 | 36 | if __name__ == '__main__': 37 | process() 38 | -------------------------------------------------------------------------------- /tools/cvedatagen/cveXmlParser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | NVD's CVE xml data processor. 4 | xml data is downloaded from https://nvd.nist.gov/download.cfm 5 | This module should be run only once. 6 | or, if the pickle file has been corrupted, run this module again. 7 | Updates of the database is done in cvexmlupdater.py 8 | """ 9 | 10 | import os 11 | import common 12 | try: 13 | import cPickle as pickle 14 | except ImportError: 15 | import pickle 16 | originalDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 17 | 18 | 19 | def process(): 20 | DLDir = os.path.join(originalDir, "data", "CVEXML") 21 | cveDict = {} 22 | 23 | for xml in os.listdir(DLDir): 24 | subDict = common.parse_xml(os.path.join(DLDir, xml)) 25 | cveDict.update(subDict) 26 | 27 | pickle.dump(cveDict, open(os.path.join(originalDir, "data", "cvedata.pkl"), "wb")) 28 | 29 | print("Stored " + str(len(cveDict)) + " CVE data in file 'cvedata.pkl'.") 30 | 31 | 32 | if __name__ == '__main__': 33 | process() 34 | -------------------------------------------------------------------------------- /tools/cvedatagen/cveXmlUpdater.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | CVE data updater. 4 | Run cveXmlDownloader.py and cveXmlParser.py before running this module. 5 | This module downloads "modified" data from NVD, uncompress and update the database. 
6 | """ 7 | 8 | import os 9 | import pickle 10 | import common 11 | 12 | originalDir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 13 | 14 | 15 | def process(): 16 | # first download the modified cve data from NVD 17 | fileName = "nvdcve-1.1-modified.json.zip" 18 | url = "https://nvd.nist.gov/feeds/json/cve/1.1/" + fileName 19 | 20 | common.download_url(url, fileName) 21 | common.unzip(fileName) 22 | os.remove(fileName) 23 | 24 | # load the pickled cve data 25 | print("Reading pickled data..."), 26 | cveDict = pickle.load(open(os.path.join(originalDir, "data", "cvedata.pkl"), "rb")) 27 | print("[DONE]") 28 | 29 | subDict = common.parse_xml(fileName.replace(".zip", "")) 30 | cveDict.update(subDict) 31 | 32 | os.remove(fileName.replace(".zip", "")) 33 | 34 | print("Dumping updated pickle..."), 35 | pickle.dump(cveDict, open(os.path.join(originalDir, "data", "cvedata.pkl"), "wb")) 36 | print("[DONE]") 37 | 38 | 39 | if __name__ == '__main__': 40 | process() 41 | -------------------------------------------------------------------------------- /tools/parseutility.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | import re 5 | import platform 6 | 7 | # Import from parent directory 8 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 9 | import config 10 | 11 | 12 | def get_platform(): 13 | global osName 14 | global bits 15 | 16 | pf = platform.platform() 17 | bits, _ = platform.architecture() 18 | if "Windows" in pf: 19 | osName = "win" 20 | bits = "" 21 | elif "Linux" in pf: 22 | osName = "linux" 23 | if "64" in bits: 24 | bits = "64" 25 | else: 26 | bits = "86" 27 | else: 28 | osName = "osx" 29 | bits = "" 30 | 31 | 32 | def setEnvironment(caller): 33 | get_platform() 34 | global javaCallCommand 35 | if caller == "GUI": 36 | # try: 37 | # base_path = sys._MEIPASS 38 | # except: 39 | # base_path = os.path.abspath(".") 40 | cwd = os.getcwd() 41 | if osName == "win": 42 | # full_path = os.path.join(base_path, "FuncParser.exe") 43 | javaCallCommand = os.path.join(cwd, "FuncParser-opt.exe ") 44 | 45 | elif osName == "linux" or osName == "osx": 46 | # full_path = os.path.join(base_path, "FuncParser.jar") 47 | # javaCallCommand = "java -Xmx1024m -jar " + full_path + " " 48 | javaCallCommand = "\"{0}\" -Xmx1024m -jar \"{1}\" ".format(config.javaBinary, os.path.join(cwd, "FuncParser-opt.jar")) 49 | 50 | else: 51 | if osName == "win": 52 | base_path = os.path.dirname(os.path.abspath(__file__)) # vuddy/hmark root directory 53 | javaCallCommand = os.path.join(base_path, "FuncParser-opt.exe ") 54 | elif osName == "linux" or osName == "osx": 55 | base_path = os.path.dirname(os.path.abspath(__file__)) # vuddy/hmark root directory 56 | javaCallCommand = "\"{0}\" -Xmx1024m -jar \"{1}\" ".format(config.javaBinary, os.path.join(base_path, "FuncParser-opt.jar")) 57 | 58 | 59 | class function: 60 | parentFile = None # Absolute file which has the function 61 | parentNumLoc = None # Number of LoC of the parent file 62 | name = None # Name of the function 63 | lines = None # Tuple (lineFrom, lineTo) that indicates the LoC of function 64 | funcId = None # n, indicating n-th function in the file 65 | parameterList = [] # list of parameter variables 66 | variableList = [] # list of local variables 67 | dataTypeList = [] # list of data types, including user-defined types 68 | funcCalleeList = [] # list of called functions' names 69 | funcBody = None 70 | 71 | def 
__init__(self, fileName): 72 | self.parentFile = fileName 73 | self.parameterList = [] 74 | self.variableList = [] 75 | self.dataTypeList = [] 76 | self.funcCalleeList = [] 77 | 78 | def removeListDup(self): 79 | # for best performance, this method must be executed 80 | # for every instance before applying the abstraction. 81 | self.parameterList = list(set(self.parameterList)) 82 | self.variableList = list(set(self.variableList)) 83 | self.dataTypeList = list(set(self.dataTypeList)) 84 | self.funcCalleeList = list(set(self.funcCalleeList)) 85 | 86 | # def getOriginalFunction(self): 87 | # # returns the original function back from the instance. 88 | # fp = open(self.parentFile, 'r') 89 | # srcFileRaw = fp.readlines() 90 | # fp.close() 91 | # return ''.join(srcFileRaw[self.lines[0]-1:self.lines[1]]) 92 | 93 | 94 | def loadSource(rootDirectory): 95 | # returns the list of C/C++ source files under the specified root directory. 96 | maxFileSizeInBytes = None 97 | maxFileSizeInBytes = 2097152 # remove this line if you don't want to restrict 98 | # the maximum file size that you process. 99 | walkList = os.walk(rootDirectory) 100 | srcFileList = [] 101 | for path, dirs, files in walkList: 102 | for fileName in files: 103 | ext = fileName.lower() 104 | if ext.endswith('.c') or ext.endswith('.cpp') or ext.endswith('.cc') or ext.endswith('.c++') or ext.endswith('.cxx'): 105 | absPathWithFileName = path.replace('\\', '/') + '/' + fileName 106 | if maxFileSizeInBytes is not None: 107 | if os.path.getsize(absPathWithFileName) < maxFileSizeInBytes: 108 | srcFileList.append(absPathWithFileName) 109 | else: 110 | srcFileList.append(absPathWithFileName) 111 | return srcFileList 112 | 113 | 114 | def loadVul(rootDirectory): 115 | # returns the list of .vul files under the specified root directory. 116 | maxFileSizeInBytes = None 117 | # maxFileSizeInBytes = 2097152 # remove this line if you don't want to restrict 118 | # the maximum file size that you process. 119 | walkList = os.walk(rootDirectory) 120 | srcFileList = [] 121 | for path, dirs, files in walkList: 122 | for fileName in files: 123 | if fileName.endswith('OLD.vul'): 124 | absPathWithFileName = path.replace('\\', '/') + '/' + fileName 125 | if maxFileSizeInBytes is not None: 126 | if os.path.getsize(absPathWithFileName) < maxFileSizeInBytes: 127 | srcFileList.append(absPathWithFileName) 128 | else: 129 | srcFileList.append(absPathWithFileName) 130 | return srcFileList 131 | 132 | 133 | def removeComment(string): 134 | # Code for removing C/C++ style comments. (Imported from ReDeBug.) 135 | c_regex = re.compile( 136 | r'(?P<comment>//.*?$|[{}]+)|(?P<multilinecomment>/\*.*?\*/)|(?P<noncomment>\'(\\.|[^\\\'])*\'|"(\\.|[^\\"])*"|.[^/\'"]*)', 137 | re.DOTALL | re.MULTILINE) 138 | return ''.join([c.group('noncomment') for c in c_regex.finditer(string) if c.group('noncomment')]) 139 | 140 | 141 | # def getBody(originalFunction): 142 | # # returns the function's body as a string. 143 | # return originalFunction[originalFunction.find('{')+1:originalFunction.rfind('}')] 144 | 145 | 146 | def normalize(string): 147 | # Code for normalizing the input string. 148 | # LF and TAB literals, curly braces, and spaces are removed, 149 | # and all characters are lowercased. 150 | return ''.join(string.replace('\n', '').replace('\r', '').replace('\t', '').replace('{', '').replace('}', '').split( 151 | ' ')).lower() 152 | 153 | 154 | def abstract(instance, level): 155 | # Applies abstraction on the function instance, 156 | # and then returns a tuple consisting of the original body and abstracted body.
157 | originalFunctionBody = instance.funcBody 158 | # print "===================" 159 | originalFunctionBody = removeComment(originalFunctionBody) 160 | # print originalFunctionBody 161 | # print '====================================================' 162 | if int(level) >= 0: # No abstraction. 163 | abstractBody = originalFunctionBody 164 | 165 | if int(level) >= 1: # PARAM 166 | parameterList = instance.parameterList 167 | for param in parameterList: 168 | if len(param) == 0: 169 | continue 170 | try: 171 | paramPattern = re.compile("(^|\W)" + param + "(\W)") 172 | abstractBody = paramPattern.sub("\g<1>FPARAM\g<2>", abstractBody) 173 | except: 174 | pass 175 | 176 | if int(level) >= 2: # DTYPE 177 | dataTypeList = instance.dataTypeList 178 | for dtype in dataTypeList: 179 | if len(dtype) == 0: 180 | continue 181 | try: 182 | dtypePattern = re.compile("(^|\W)" + dtype + "(\W)") 183 | abstractBody = dtypePattern.sub("\g<1>DTYPE\g<2>", abstractBody) 184 | except: 185 | pass 186 | 187 | if int(level) >= 3: # LVAR 188 | variableList = instance.variableList 189 | for lvar in variableList: 190 | if len(lvar) == 0: 191 | continue 192 | try: 193 | lvarPattern = re.compile("(^|\W)" + lvar + "(\W)") 194 | abstractBody = lvarPattern.sub("\g<1>LVAR\g<2>", abstractBody) 195 | except: 196 | pass 197 | 198 | if int(level) >= 4: # FUNCCALL 199 | funcCalleeList = instance.funcCalleeList 200 | for fcall in funcCalleeList: 201 | if len(fcall) == 0: 202 | continue 203 | try: 204 | fcallPattern = re.compile("(^|\W)" + fcall + "(\W)") 205 | abstractBody = fcallPattern.sub("\g<1>FUNCCALL\g<2>", abstractBody) 206 | except: 207 | pass 208 | 209 | return (originalFunctionBody, abstractBody) 210 | 211 | 212 | delimiter = "\r\0?\r?\0\r" 213 | 214 | 215 | def parseFile_shallow(srcFileName, caller): 216 | # this does not parse body. 217 | global javaCallCommand 218 | global delimiter 219 | setEnvironment(caller) 220 | javaCallCommand += "\"" + srcFileName + "\" 0" 221 | functionInstanceList = [] 222 | try: 223 | astString = subprocess.check_output(javaCallCommand, stderr=subprocess.STDOUT, shell=True) 224 | except subprocess.CalledProcessError as e: 225 | print("Parser Error:", e) 226 | astString = b"" 227 | # decode the parser's byte output into a string 228 | astString = astString.decode('latin-1') 229 | funcList = astString.split(delimiter) 230 | for func in funcList[1:]: 231 | functionInstance = function(srcFileName) 232 | elemsList = func.split('\n')[1:-1] 233 | # print elemsList 234 | if len(elemsList) > 9: 235 | functionInstance.parentNumLoc = int(elemsList[1]) 236 | functionInstance.name = elemsList[2] 237 | functionInstance.lines = (int(elemsList[3].split('\t')[0]), int(elemsList[3].split('\t')[1])) 238 | functionInstance.funcId = int(elemsList[4]) 239 | functionInstance.funcBody = '\n'.join(elemsList[9:]) 240 | # print functionInstance.funcBody 241 | # print "-------------------" 242 | 243 | functionInstanceList.append(functionInstance) 244 | 245 | return functionInstanceList 246 | 247 | 248 | # def parseFile_semiDeep(srcFileName, caller): 249 | # # this does not parse body.
250 | # global javaCallCommand 251 | # global delimiter 252 | # setEnvironment(caller) 253 | # javaCallCommand += "\"" + srcFileName + "\" 0" 254 | # functionInstanceList = [] 255 | # try: 256 | # astString = subprocess.check_output(javaCallCommand, stderr=subprocess.STDOUT, shell=True) 257 | # except subprocess.CalledProcessError as e: 258 | # print "Parser Error:", e 259 | # astString = "" 260 | 261 | # funcList = astString.split(delimiter) 262 | # for func in funcList[1:]: 263 | # functionInstance = function(srcFileName) 264 | # elemsList = func.split('\n')[1:-1] 265 | # # print elemsList 266 | # if len(elemsList) > 9: 267 | # functionInstance.parentNumLoc = int(elemsList[1]) 268 | # functionInstance.name = elemsList[2] 269 | # functionInstance.lines = (int(elemsList[3].split('\t')[0]), int(elemsList[3].split('\t')[1])) 270 | # functionInstance.funcId = int(elemsList[4]) 271 | # functionInstance.parameterList = elemsList[5].rstrip().split('\t') 272 | # functionInstance.funcBody = '\n'.join(elemsList[9:]) 273 | # # print functionInstance.funcBody 274 | # # print "-------------------" 275 | 276 | # functionInstanceList.append(functionInstance) 277 | 278 | # return functionInstanceList 279 | 280 | 281 | def parseFile_deep(srcFileName, caller): 282 | global javaCallCommand 283 | global delimiter 284 | setEnvironment(caller) 285 | # this parses function definition plus body. 286 | javaCallCommand += "\"" + srcFileName + "\" 1" 287 | functionInstanceList = [] 288 | 289 | try: 290 | astString = subprocess.check_output(javaCallCommand, stderr=subprocess.STDOUT, shell=True).decode('latin-1') # decode the parser's byte output, as in parseFile_shallow 291 | except subprocess.CalledProcessError as e: 292 | print("Parser Error:", e) 293 | astString = "" 294 | 295 | funcList = astString.split(delimiter) 296 | for func in funcList[1:]: 297 | functionInstance = function(srcFileName) 298 | 299 | elemsList = func.split('\n')[1:-1] 300 | # print elemsList 301 | if len(elemsList) > 9: 302 | functionInstance.parentNumLoc = int(elemsList[1]) 303 | functionInstance.name = elemsList[2] 304 | functionInstance.lines = (int(elemsList[3].split('\t')[0]), int(elemsList[3].split('\t')[1])) 305 | functionInstance.funcId = int(elemsList[4]) 306 | functionInstance.parameterList = elemsList[5].rstrip().split('\t') 307 | functionInstance.variableList = elemsList[6].rstrip().split('\t') 308 | functionInstance.dataTypeList = elemsList[7].rstrip().split('\t') 309 | functionInstance.funcCalleeList = elemsList[8].rstrip().split('\t') 310 | functionInstance.funcBody = '\n'.join(elemsList[9:]) 311 | # print '\n'.join(elemsList[9:]) 312 | functionInstanceList.append(functionInstance) 313 | 314 | return functionInstanceList 315 | --------------------------------------------------------------------------------
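Usage note (illustrative, not part of the repository): the minimal sketch below shows one way the utilities in tools/parseutility.py can be combined: load a C/C++ tree, run the deep parser, abstract each function at level 4, normalize the body, and derive a fingerprint. It assumes config.javaBinary points to a working Java runtime and that FuncParser-opt.jar sits next to parseutility.py; the "CLI" caller string simply selects the non-GUI branch of setEnvironment(), and the MD5-over-normalized-body fingerprint at the end is an example choice, not an API of this module.

#!/usr/bin/env python
# sketch only -- assumes it is run from the tools/ directory so that
# "import parseutility" and its "import config" both resolve.
import hashlib
import parseutility

def fingerprint_tree(rootDirectory):
    fingerprints = []
    for srcFile in parseutility.loadSource(rootDirectory):
        # any caller string other than "GUI" takes the command-line path in setEnvironment()
        for func in parseutility.parseFile_deep(srcFile, "CLI"):
            func.removeListDup()  # required before abstraction
            _, abstractBody = parseutility.abstract(func, 4)  # level 4: FPARAM/DTYPE/LVAR/FUNCCALL
            body = parseutility.normalize(abstractBody)
            # (length, MD5 digest) is used here purely as an example fingerprint
            fingerprints.append((len(body), hashlib.md5(body.encode()).hexdigest(), func.name))
    return fingerprints

if __name__ == '__main__':
    for length, digest, name in fingerprint_tree("../testcode"):
        print(length, digest, name)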